├── .gradient └── settings.yaml ├── AutoYOLO.ipynb ├── AutoYOLO.py ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── README.md ├── assets ├── logo.png ├── masks1.png ├── masks2.jpg ├── model_diagram.png ├── notebook1.png └── notebook2.png ├── automatic_mask_generator.py ├── build_sam.py ├── dataset ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── catalog.cpython-39.pyc │ └── concat_dataset.cpython-39.pyc ├── base_dataset.py ├── catalog.py ├── cd_dataset.py ├── concat_dataset.py ├── grounding_dataset.py ├── layout_dataset.py ├── tsv.py ├── tsv_dataset.py └── utils.py ├── datasets └── .placeholder ├── gligen ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── distributed.cpython-39.pyc │ ├── evaluator.cpython-39.pyc │ ├── task_grounded_generation.cpython-39.pyc │ └── trainer.cpython-39.pyc ├── create_meta.py ├── distributed.py ├── evaluator.py ├── ldm │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── util.cpython-39.pyc │ ├── data │ │ ├── __init__.py │ │ ├── base.py │ │ ├── imagenet.py │ │ ├── imagenet_clsidx_to_label.txt │ │ ├── index_synset.yaml │ │ └── lsun.py │ ├── lr_scheduler.py │ ├── models │ │ ├── __pycache__ │ │ │ └── autoencoder.cpython-39.pyc │ │ ├── autoencoder.py │ │ └── diffusion │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── ddim.cpython-39.pyc │ │ │ ├── ddpm.cpython-39.pyc │ │ │ ├── ldm.cpython-39.pyc │ │ │ └── plms.cpython-39.pyc │ │ │ ├── classifier.py │ │ │ ├── ddim.py │ │ │ ├── ddpm.py │ │ │ ├── ldm.py │ │ │ └── plms.py │ ├── modules │ │ ├── __pycache__ │ │ │ ├── attention.cpython-39.pyc │ │ │ └── x_transformer.cpython-39.pyc │ │ ├── attention.py │ │ ├── diffusionmodules │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ ├── model.cpython-39.pyc │ │ │ │ ├── openaimodel.cpython-39.pyc │ │ │ │ ├── positionnet.cpython-39.pyc │ │ │ │ └── util.cpython-39.pyc │ │ │ ├── model.py │ │ │ ├── openaimodel.py │ │ │ ├── positionnet.py │ │ │ ├── positionnet_with_image.py │ │ │ └── util.py │ │ ├── distributions │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ └── distributions.cpython-39.pyc │ │ │ └── distributions.py │ │ ├── ema.py │ │ ├── encoders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ └── modules.cpython-39.pyc │ │ │ ├── modules.py │ │ │ └── modules_backup.py │ │ ├── image_degradation │ │ │ ├── __init__.py │ │ │ ├── bsrgan.py │ │ │ ├── bsrgan_light.py │ │ │ └── utils_image.py │ │ ├── losses │ │ │ ├── __init__.py │ │ │ ├── contperceptual.py │ │ │ └── vqperceptual.py │ │ └── x_transformer.py │ └── util.py ├── projection_matrix.pth ├── task_grounded_generation.py └── trainer.py ├── groundingdino ├── _C.cpython-39-x86_64-linux-gnu.so ├── __init__.py ├── __pycache__ │ └── __init__.cpython-39.pyc ├── config │ ├── GroundingDINO_SwinB.cfg.py │ └── GroundingDINO_SwinT_OGC.py ├── datasets │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── transforms.cpython-39.pyc │ └── transforms.py ├── models │ ├── GroundingDINO │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-39.pyc │ │ │ ├── bertwarper.cpython-39.pyc │ │ │ ├── fuse_modules.cpython-39.pyc │ │ │ ├── groundingdino.cpython-39.pyc │ │ │ ├── ms_deform_attn.cpython-39.pyc │ │ │ ├── transformer.cpython-39.pyc │ │ │ ├── transformer_vanilla.cpython-39.pyc │ │ │ └── utils.cpython-39.pyc │ │ ├── backbone │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-39.pyc │ │ │ │ 
├── backbone.cpython-39.pyc │ │ │ │ ├── position_encoding.cpython-39.pyc │ │ │ │ └── swin_transformer.cpython-39.pyc │ │ │ ├── backbone.py │ │ │ ├── position_encoding.py │ │ │ └── swin_transformer.py │ │ ├── bertwarper.py │ │ ├── csrc │ │ │ ├── MsDeformAttn │ │ │ │ ├── ms_deform_attn.h │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ ├── ms_deform_attn_cpu.h │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ ├── cuda_version.cu │ │ │ └── vision.cpp │ │ ├── fuse_modules.py │ │ ├── groundingdino.py │ │ ├── ms_deform_attn.py │ │ ├── transformer.py │ │ ├── transformer_vanilla.py │ │ └── utils.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ └── registry.cpython-39.pyc │ └── registry.py ├── util │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── box_ops.cpython-39.pyc │ │ ├── slconfig.cpython-39.pyc │ │ └── utils.cpython-39.pyc │ ├── box_ops.py │ ├── get_tokenlizer.py │ ├── inference.py │ ├── logger.py │ ├── misc.py │ ├── slconfig.py │ ├── slio.py │ ├── time_counter.py │ ├── utils.py │ ├── visualizer.py │ └── vl_utils.py └── version.py ├── linter.sh ├── modeling ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── common.cpython-39.pyc │ ├── image_encoder.cpython-39.pyc │ ├── mask_decoder.cpython-39.pyc │ ├── prompt_encoder.cpython-39.pyc │ ├── sam.cpython-39.pyc │ └── transformer.cpython-39.pyc ├── common.py ├── image_encoder.py ├── mask_decoder.py ├── prompt_encoder.py ├── sam.py └── transformer.py ├── notebooks ├── automatic_mask_generator_example.ipynb ├── images │ ├── dog.jpg │ ├── groceries.jpg │ └── truck.jpg ├── onnx_model_example.ipynb └── predictor_example.ipynb ├── outputs └── .placeholder ├── predictor.py ├── requirements.txt ├── runs └── detect │ └── .placeholder ├── scripts ├── amg.py └── export_onnx_model.py ├── segment_anything ├── .flake8 ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-39.pyc │ ├── automatic_mask_generator.cpython-39.pyc │ ├── build_sam.cpython-39.pyc │ └── predictor.cpython-39.pyc ├── assets │ ├── masks1.png │ ├── masks2.jpg │ ├── model_diagram.png │ ├── notebook1.png │ └── notebook2.png ├── automatic_mask_generator.py ├── build_sam.py ├── linter.sh ├── modeling │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── common.cpython-39.pyc │ │ ├── image_encoder.cpython-39.pyc │ │ ├── mask_decoder.cpython-39.pyc │ │ ├── prompt_encoder.cpython-39.pyc │ │ ├── sam.cpython-39.pyc │ │ └── transformer.cpython-39.pyc │ ├── common.py │ ├── image_encoder.py │ ├── mask_decoder.py │ ├── prompt_encoder.py │ ├── sam.py │ └── transformer.py ├── notebooks │ ├── automatic_mask_generator_example.ipynb │ ├── images │ │ ├── dog.jpg │ │ ├── groceries.jpg │ │ └── truck.jpg │ ├── onnx_model_example.ipynb │ └── predictor_example.ipynb ├── predictor.py ├── scripts │ ├── amg.py │ └── export_onnx_model.py ├── segment_anything.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt ├── segment_anything │ ├── __init__.py │ ├── automatic_mask_generator.py │ ├── build_sam.py │ ├── modeling │ │ ├── __init__.py │ │ ├── common.py │ │ ├── image_encoder.py │ │ ├── mask_decoder.py │ │ ├── prompt_encoder.py │ │ ├── sam.py │ │ └── transformer.py │ ├── predictor.py │ └── utils │ │ ├── __init__.py │ │ ├── amg.py │ │ ├── onnx.py │ │ └── transforms.py ├── setup.cfg ├── setup.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── 
__init__.cpython-39.pyc │ ├── amg.cpython-39.pyc │ └── transforms.cpython-39.pyc │ ├── amg.py │ ├── onnx.py │ └── transforms.py ├── setup.cfg ├── setup.py ├── setup.sh └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-39.pyc ├── amg.cpython-39.pyc └── transforms.cpython-39.pyc ├── amg.py ├── onnx.py └── transforms.py /.gradient/settings.yaml: -------------------------------------------------------------------------------- 1 | integrations: 2 | dolly-v2-12b: 3 | type: dataset 4 | ref: paperspace/dsi5inn7aonbmv3:latest 5 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 
58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to segment-anything 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints, using the `linter.sh` script in the project's root directory. Linting requires `black==23.*`, `isort==5.12.0`, `flake8`, and `mypy`. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to segment-anything, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoYOLO - Ultralytics YOLOv8 Web UI v2 2 | 3 | [![Gradient](https://assets.paperspace.io/img/gradient-badge.svg)](https://console.paperspace.com/github/gradient-ai/autoyolo?machine=Free-GPU) 4 | 5 | This Gradio application enables the end-to-end creation of a YOLOv8 object detection model, using Segment Anything, GroundingDINO, BLIP-2, and Dolly v2 to automatically label objects in images. Users can then train Ultralytics YOLOv8 models to generate predictions on submitted videos and images.
Optionally, users may also manually label images as desired. 6 | 7 | ## Capabilities 8 | 9 | - **AutoLabel**: The key contribution of this application is the AutoLabeler. Using the autolabel tab, users can automatically generate fully labeled images in the Ultralytics YOLOv8 format simply by submitting the images and the desired target object labels 10 | - **Manually Label Images**: this tab lets you upload images, either in bulk or one at a time, to be labeled. The bounding boxes are automatically detected, and the labels are assigned through a textbox. Entries are separated by semi-colons 11 | - **Image Gallery**: this tab lets you view your labeled images, separated by their assigned training split 12 | - **Train**: train any of the YOLOv8 models on the labeled images. Outputs the validation metrics and the best trained model from the run, `best.pt` 13 | - **Inference**: predict object labels on images and videos. Works for direct upload and URL submission of images and YouTube videos 14 | 15 | ## Next steps 16 | 17 | - Integrating with Roboflow to enable training on the application with existing projects and Universe datasets 18 | - Streaming video object detection for real-time viewing and interaction with the object detection model 19 | - Adding additional text models (GPT4All, OpenAssistant, Otter, etc.) to enable multimodal integration. This could potentially remove BLIP-2 from the pipeline and speed up processing 20 | 21 | ## Thanks and credits to: 22 | 23 | - This application was inspired by the work done by IDEA Research on their [Grounded Segment Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything) project. Thanks to them for releasing their awesome work, and for inspiring this project. 24 | - This application wouldn't have been feasible without the groundwork completed by the researchers for the [GLIGEN](https://github.com/gligen/GLIGEN) project. Their bounding box detector code was instrumental to making this work.
25 | - [Ultralytics](https://github.com/ultralytics/ultralytics) for their incredible work on YOLOv8 26 | -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/logo.png -------------------------------------------------------------------------------- /assets/masks1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/masks1.png -------------------------------------------------------------------------------- /assets/masks2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/masks2.jpg -------------------------------------------------------------------------------- /assets/model_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/model_diagram.png -------------------------------------------------------------------------------- /assets/notebook1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/notebook1.png -------------------------------------------------------------------------------- /assets/notebook2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/assets/notebook2.png -------------------------------------------------------------------------------- /build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
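# --- Added note (not part of the original file): this module constructs the three SAM
# variants (ViT-H, ViT-L, ViT-B) and exposes them through `sam_model_registry` below.
# A minimal, hypothetical usage sketch (the checkpoint path is an assumption):
#     sam = sam_model_registry["vit_b"](checkpoint="weights/sam_vit_b.pth")
#     sam.to("cuda")
# Passing checkpoint=None builds the architecture without loading pretrained weights.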
6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam = build_sam_vit_h 25 | 26 | 27 | def build_sam_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_model_registry = { 48 | "default": build_sam, 49 | "vit_h": build_sam, 50 | "vit_l": build_sam_vit_l, 51 | "vit_b": build_sam_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoder( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | ), 99 | pixel_mean=[123.675, 116.28, 103.53], 100 | pixel_std=[58.395, 57.12, 57.375], 101 | ) 102 | sam.eval() 103 | if checkpoint is not None: 104 | with open(checkpoint, "rb") as f: 105 | state_dict = torch.load(f) 106 | sam.load_state_dict(state_dict) 107 | return sam 108 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/dataset/__init__.py -------------------------------------------------------------------------------- /dataset/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/dataset/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/catalog.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/dataset/__pycache__/catalog.cpython-39.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/concat_dataset.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/dataset/__pycache__/concat_dataset.cpython-39.pyc -------------------------------------------------------------------------------- /dataset/catalog.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class DatasetCatalog: 4 | def __init__(self, ROOT, which_embedder): 5 | assert which_embedder in ['clip', 'bert'] 6 | 7 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 8 | 9 | 10 | self.VGGrounding = { 11 | "target": "dataset.tsv_dataset.TSVDataset", 12 | "train_params": dict( 13 | tsv_path=os.path.join(ROOT,'GROUNDING/gqa/tsv/train-00.tsv'), 14 | ) 15 | } 16 | 17 | 18 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 19 | 20 | 21 | self.FlickrGrounding = { 22 | "target": "dataset.tsv_dataset.TSVDataset", 23 | "train_params":dict( 24 | tsv_path=os.path.join(ROOT,'GROUNDING/flickr30k/tsv/train-00.tsv'), 25 | ) 26 | } 27 | 28 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 29 | 30 | self.SBUGrounding = { 31 | "target": "dataset.tsv_dataset.TSVDataset", 32 | "train_params":dict( 33 | tsv_path=os.path.join(ROOT,'GROUNDING/SBU/tsv/train-00.tsv'), 34 | ) 35 | } 36 | 37 | 38 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 39 | 40 | 41 | self.CC3MGrounding = { 42 | "target": "dataset.tsv_dataset.TSVDataset", 43 | "train_params":dict( 44 | tsv_path=os.path.join(ROOT,'GROUNDING/CC3M/tsv/train-00.tsv'), 45 | ) 46 | } 47 | 48 | 49 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 50 | 51 | 52 | self.CC12MGrounding = { 53 | "target": "dataset.tsv_dataset.TSVDataset", 54 | "train_params":dict( 55 | tsv_path=os.path.join(ROOT,'GROUNDING/CC12M/tsv/train-00.tsv'), 56 | ) 57 | } 58 | 59 | 60 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 61 | 62 | # temp = 'category_embedding_clip.pth' if which_embedder == 'clip' else 'category_embedding_bert.pth' 63 | # obj365_category_embedding_path = os.path.join(ROOT, 'OBJECTS365', temp) 64 | 65 | self.Obj365Detection = { 66 | "target": "dataset.tsv_dataset.TSVDataset", 67 | "train_params":dict( 68 | tsv_path=os.path.join(ROOT,'OBJECTS365/tsv/train-00.tsv'), 69 | ), 70 | } 71 | 72 | 73 | -------------------------------------------------------------------------------- /dataset/concat_dataset.py: -------------------------------------------------------------------------------- 1 | from .catalog import DatasetCatalog 2 | from ldm.util import instantiate_from_config 3 | import torch 4 | 5 | 6 | 7 | 8 | class ConCatDataset(): 9 | def __init__(self, dataset_name_list, ROOT, which_embedder, train=True, repeats=None): 10 | self.datasets = [] 11 | cul_previous_dataset_length = 0 12 | offset_map = [] 13 | which_dataset = [] 14 | 15 | if repeats is None: 16 | repeats = [1] * len(dataset_name_list) 17 | else: 18 | assert len(repeats) == len(dataset_name_list) 19 | 20 | 21 | Catalog = DatasetCatalog(ROOT, 
which_embedder) 22 | for dataset_idx, (dataset_name, yaml_params) in enumerate(dataset_name_list.items()): 23 | repeat = repeats[dataset_idx] 24 | 25 | dataset_dict = getattr(Catalog, dataset_name) 26 | 27 | target = dataset_dict['target'] 28 | params = dataset_dict['train_params'] if train else dataset_dict['val_params'] 29 | if yaml_params is not None: 30 | params.update(yaml_params) 31 | dataset = instantiate_from_config( dict(target=target, params=params) ) 32 | 33 | self.datasets.append(dataset) 34 | for _ in range(repeat): 35 | offset_map.append( torch.ones(len(dataset))*cul_previous_dataset_length ) 36 | which_dataset.append( torch.ones(len(dataset))*dataset_idx ) 37 | cul_previous_dataset_length += len(dataset) 38 | offset_map = torch.cat(offset_map, dim=0).long() 39 | self.total_length = cul_previous_dataset_length 40 | 41 | self.mapping = torch.arange(self.total_length) - offset_map 42 | self.which_dataset = torch.cat(which_dataset, dim=0).long() 43 | 44 | 45 | def total_images(self): 46 | count = 0 47 | for dataset in self.datasets: 48 | print(dataset.total_images()) 49 | count += dataset.total_images() 50 | return count 51 | 52 | 53 | 54 | def __getitem__(self, idx): 55 | dataset = self.datasets[ self.which_dataset[idx] ] 56 | return dataset[ self.mapping[idx] ] 57 | 58 | 59 | def __len__(self): 60 | return self.total_length 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /dataset/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2018 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
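# --- Added note (not part of the original file): this module collects the image-tensor
# helpers used by the dataset pipeline: ImageNet mean/std normalization and its inverse
# (imagenet_preprocess / imagenet_deprocess), imagenet_deprocess_batch which returns
# uint8 images in [0, 255], a PIL-backed Resize transform, and split_graph_batch, which
# splits batched scene-graph triples and per-object data back into per-image lists.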
16 | 17 | import PIL 18 | import torch 19 | import torchvision.transforms as T 20 | 21 | 22 | IMAGENET_MEAN = [0.485, 0.456, 0.406] 23 | IMAGENET_STD = [0.229, 0.224, 0.225] 24 | 25 | INV_IMAGENET_MEAN = [-m for m in IMAGENET_MEAN] 26 | INV_IMAGENET_STD = [1.0 / s for s in IMAGENET_STD] 27 | 28 | 29 | def imagenet_preprocess(): 30 | return T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) 31 | 32 | 33 | def rescale(x): 34 | lo, hi = x.min(), x.max() 35 | return x.sub(lo).div(hi - lo) 36 | 37 | 38 | def imagenet_deprocess(rescale_image=True): 39 | transforms = [ 40 | T.Normalize(mean=[0, 0, 0], std=INV_IMAGENET_STD), 41 | T.Normalize(mean=INV_IMAGENET_MEAN, std=[1.0, 1.0, 1.0]), 42 | ] 43 | if rescale_image: 44 | transforms.append(rescale) 45 | return T.Compose(transforms) 46 | 47 | 48 | def imagenet_deprocess_batch(imgs, rescale=True): 49 | """ 50 | Input: 51 | - imgs: FloatTensor of shape (N, C, H, W) giving preprocessed images 52 | 53 | Output: 54 | - imgs_de: ByteTensor of shape (N, C, H, W) giving deprocessed images 55 | in the range [0, 255] 56 | """ 57 | if isinstance(imgs, torch.autograd.Variable): 58 | imgs = imgs.data 59 | imgs = imgs.cpu().clone() 60 | deprocess_fn = imagenet_deprocess(rescale_image=rescale) 61 | imgs_de = [] 62 | for i in range(imgs.size(0)): 63 | img_de = deprocess_fn(imgs[i])[None] 64 | img_de = img_de.mul(255).clamp(0, 255).byte() 65 | imgs_de.append(img_de) 66 | imgs_de = torch.cat(imgs_de, dim=0) 67 | return imgs_de 68 | 69 | 70 | class Resize(object): 71 | def __init__(self, size, interp=PIL.Image.BILINEAR): 72 | if isinstance(size, tuple): 73 | H, W = size 74 | self.size = (W, H) 75 | else: 76 | self.size = (size, size) 77 | self.interp = interp 78 | 79 | def __call__(self, img): 80 | return img.resize(self.size, self.interp) 81 | 82 | 83 | def unpack_var(v): 84 | if isinstance(v, torch.autograd.Variable): 85 | return v.data 86 | return v 87 | 88 | 89 | def split_graph_batch(triples, obj_data, obj_to_img, triple_to_img): 90 | triples = unpack_var(triples) 91 | obj_data = [unpack_var(o) for o in obj_data] 92 | obj_to_img = unpack_var(obj_to_img) 93 | triple_to_img = unpack_var(triple_to_img) 94 | 95 | triples_out = [] 96 | obj_data_out = [[] for _ in obj_data] 97 | obj_offset = 0 98 | N = obj_to_img.max() + 1 99 | for i in range(N): 100 | o_idxs = (obj_to_img == i).nonzero().view(-1) 101 | t_idxs = (triple_to_img == i).nonzero().view(-1) 102 | 103 | cur_triples = triples[t_idxs].clone() 104 | cur_triples[:, 0] -= obj_offset 105 | cur_triples[:, 2] -= obj_offset 106 | triples_out.append(cur_triples) 107 | 108 | for j, o_data in enumerate(obj_data): 109 | cur_o_data = None 110 | if o_data is not None: 111 | cur_o_data = o_data[o_idxs] 112 | obj_data_out[j].append(cur_o_data) 113 | 114 | obj_offset += o_idxs.size(0) 115 | 116 | return triples_out, obj_data_out 117 | -------------------------------------------------------------------------------- /datasets/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/datasets/.placeholder -------------------------------------------------------------------------------- /gligen/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import os, sys 3 | sys.path.append(os.path.dirname(__file__)) 4 | sys.path.append(os.path.join(os.path.dirname(__file__), "ldm")) 5 | 6 | import gligen.evaluator as evaluator 7 | import gligen.trainer as trainer 
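# --- Added note (not part of the original file): the two sys.path.append calls above put
# this package directory and its "ldm" subfolder on the import path, which is why other
# files in the repo can use top-level imports such as
# `from ldm.util import instantiate_from_config` without installing ldm as a package.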
8 | 9 | 10 | # import gligen.ldm as ldm -------------------------------------------------------------------------------- /gligen/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/__pycache__/distributed.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/distributed.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/__pycache__/evaluator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/evaluator.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/__pycache__/task_grounded_generation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/task_grounded_generation.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/__pycache__/trainer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/__pycache__/trainer.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/create_meta.py: -------------------------------------------------------------------------------- 1 | CKPTS = [ 2 | 3 | dict( 4 | path="/home/chunyl/azure_mount/yuhengdb/fine_tune_ldm/version5_branch6_output/GoldG+SBU+CC3M+CC12M+O365/second_stage_drop_both/tag01/checkpoint_00450001.pth", 5 | feature_type=['before','after_reproject'], 6 | save_folder_name="v5b6_drop_both", 7 | ), 8 | 9 | 10 | # dict( 11 | # path="/home/v-yuhengli/blobfuse/output/fine_tune_ldm/version5_branch6_output/GoldG+SBU+CC3M+CC12M+O365/second_stage_drop_none/tag00/checkpoint_00165001.pth", 12 | # feature_type=['before','after_reproject'], 13 | # save_folder_name="v5b6_drop_none", 14 | # ), 15 | 16 | 17 | 18 | 19 | 20 | ] 21 | 22 | 23 | 24 | # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | # if meta["has_image_mask"] == 0: 34 | # image_embeddings = text_embeddings 35 | # if meta["has_text_mask"] == 0: 36 | # text_embeddings = image_embeddings 37 | 38 | # out = { 39 | # "boxes" : boxes.unsqueeze(0).repeat(batch,1,1), 40 | # "masks" : masks.unsqueeze(0).repeat(batch,1), 41 | # "text_masks" : masks.unsqueeze(0).repeat(batch,1), 42 | # "image_masks" : masks.unsqueeze(0).repeat(batch,1), 43 | # "text_embeddings" : text_embeddings.unsqueeze(0).repeat(batch,1,1), 44 | # "image_embeddings" : image_embeddings.unsqueeze(0).repeat(batch,1,1) 45 | # } 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | META = [ 54 | 55 | 56 | dict( 57 | prompt = "a teddy bear sitting next to a red bird", 58 | phrases = ['a teddy bear', 'a red bird'], 59 | images = ['images/teddy.jpg', 'images/red_bird.jpg'], 60 | locations = [ 
[0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 61 | alpha_type = [1.0, 0, 0.0], 62 | has_text_mask = 1, 63 | has_image_mask = 0, 64 | save_folder_name="teddy_bird_1_1" 65 | ), 66 | 67 | 68 | # dict( 69 | # prompt = "a teddy bear sitting next to a bird", 70 | # phrases = ['a teddy bear', 'a bird'], 71 | # images = ['images/teddy.jpg', 'images/red_bird.jpg'], 72 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 73 | # alpha_type = [1.0, 0, 0.0], 74 | # has_text_mask = 1, 75 | # has_image_mask = 1, 76 | # save_folder_name="teddy_bird_1_1" 77 | # ), 78 | 79 | 80 | # dict( 81 | # prompt = "a teddy bear sitting next to a bird", 82 | # phrases = ['a teddy bear', 'a bird'], 83 | # images = ['images/teddy.jpg', 'images/red_bird.jpg'], 84 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 85 | # alpha_type = [0.5, 0, 0.5], 86 | # has_text_mask = 1, 87 | # has_image_mask = 0, 88 | # save_folder_name="teddy_bird_1_0" 89 | # ), 90 | 91 | # dict( 92 | # prompt = "", 93 | # phrases = ['a teddy bear', 'an umbrella'], 94 | # images = ['images/teddy.jpg', 'images/umbrella.png'], 95 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 96 | # alpha_type = [1.0, 0, 0.0], 97 | # has_text_mask = 1, 98 | # has_image_mask = 1, 99 | # save_folder_name="empty_teddy_umbrella_1_1" 100 | # ), 101 | 102 | # dict( 103 | # prompt = "hello kitty and bird hybrid", 104 | # phrases = ['a hello kitty', 'a hello kitty'], 105 | # images = ['images/red_bird.jpg', 'images/red_bird.jpg'], 106 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 107 | # has_text_mask = 1, 108 | # has_image_mask = 1, 109 | # save_folder_name="hello+bird_1_1" 110 | # ), 111 | 112 | # dict( 113 | # prompt = "hello kitty and teddy bear hybrid", 114 | # phrases = ['a hello kitty', 'a hello kitty'], 115 | # images = ['images/teddy.jpg', 'images/teddy.jpg'], 116 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 117 | # has_text_mask = 1, 118 | # has_image_mask = 1, 119 | # save_folder_name="hello+teddy_1_1" 120 | # ), 121 | 122 | # dict( 123 | # prompt = "bird and hello kitty hybrid", 124 | # phrases = ['a bird', 'a bird'], 125 | # images = ['images/hello.jpg', 'images/hello.jpg'], 126 | # locations = [ [0.0,0.09,0.33,0.76], [0.55,0.11,1.0,0.8] ], 127 | # alpha_type = [1.0, 0, 0.0], 128 | # has_text_mask = 1, 129 | # has_image_mask = 0.5, 130 | # save_folder_name="bird+hello_1_1" 131 | # ), 132 | 133 | 134 | 135 | # dict( 136 | # prompt = "a deer standing in front of a brick house in the woods, anime, oil painting, high resolution, cottagecore, ghibli inspired, 4k", 137 | # phrases = ['a deer'], 138 | # images = ['images/sky.jpg'], 139 | # locations = [ [0.0,0.5,0.5,0.9] ], 140 | # alpha_type = [1, 0, 0], 141 | # has_text_mask = 1, 142 | # has_image_mask = 1, 143 | # save_folder_name="deer_sky" 144 | # ), 145 | 146 | 147 | # dict( 148 | # prompt = "A woman sitting in a restaurant with a slice of pizza in front of her", 149 | # phrases = ['dining table', 'pizza', 'person', 'wall', 'car', 'paper', 'chair', 'window', 'bottle', 'cup'], 150 | # images = ['images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg','images/hello.jpg'], 151 | # locations = [ [0.0030, 0.3589, 1.0000, 1.0000], 152 | # [0.0779, 0.6744, 0.9768, 1.0000], 153 | # [0.2236, 0.0000, 0.7809, 0.4352], 154 | # [0.0000, 0.0000, 0.4313, 0.4505], 155 | # [0.6275, 0.1050, 0.9444, 0.2497], 156 | # [0.0000, 0.3859, 0.1250, 0.6922], 
157 | # [0.7137, 0.2389, 0.8540, 0.4549], 158 | # [0.0000, 0.0000, 0.4667, 0.0630], 159 | # [0.3822, 0.4235, 0.4932, 0.6575], 160 | # [0.6616, 0.3617, 0.7880, 0.5165] ], 161 | # alpha_type = [0.0, 0, 1.0], 162 | # has_text_mask = 1, 163 | # has_image_mask = 0, 164 | # save_folder_name="pizza_1_0" 165 | # ), 166 | 167 | 168 | 169 | 170 | ] -------------------------------------------------------------------------------- /gligen/distributed.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pickle 3 | 4 | import torch 5 | from torch import distributed as dist 6 | from torch.utils.data.sampler import Sampler 7 | 8 | 9 | def get_rank(): 10 | if not dist.is_available(): 11 | return 0 12 | 13 | if not dist.is_initialized(): 14 | return 0 15 | 16 | return dist.get_rank() 17 | 18 | 19 | def synchronize(): 20 | if not dist.is_available(): 21 | return 22 | if not dist.is_initialized(): 23 | return 24 | 25 | world_size = dist.get_world_size() 26 | if world_size == 1: 27 | return 28 | 29 | dist.barrier() 30 | 31 | 32 | def get_world_size(): 33 | if not dist.is_available(): 34 | return 1 35 | if not dist.is_initialized(): 36 | return 1 37 | return dist.get_world_size() 38 | 39 | 40 | def reduce_sum(tensor): 41 | if not dist.is_available(): 42 | return tensor 43 | 44 | if not dist.is_initialized(): 45 | return tensor 46 | 47 | tensor = tensor.clone() 48 | dist.all_reduce(tensor, op=dist.ReduceOp.SUM) 49 | 50 | return tensor 51 | 52 | 53 | def gather_grad(params): 54 | world_size = get_world_size() 55 | 56 | if world_size == 1: 57 | return 58 | 59 | for param in params: 60 | if param.grad is not None: 61 | dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM) 62 | param.grad.data.div_(world_size) 63 | 64 | 65 | def all_gather(data): 66 | world_size = get_world_size() 67 | 68 | if world_size == 1: 69 | return [data] 70 | 71 | buffer = pickle.dumps(data) 72 | storage = torch.ByteStorage.from_buffer(buffer) 73 | tensor = torch.ByteTensor(storage).to('cuda') 74 | 75 | local_size = torch.IntTensor([tensor.numel()]).to('cuda') 76 | size_list = [torch.IntTensor([0]).to('cuda') for _ in range(world_size)] 77 | dist.all_gather(size_list, local_size) 78 | size_list = [int(size.item()) for size in size_list] 79 | max_size = max(size_list) 80 | 81 | tensor_list = [] 82 | for _ in size_list: 83 | tensor_list.append(torch.ByteTensor(size=(max_size,)).to('cuda')) 84 | 85 | if local_size != max_size: 86 | padding = torch.ByteTensor(size=(max_size - local_size,)).to('cuda') 87 | tensor = torch.cat((tensor, padding), 0) 88 | 89 | dist.all_gather(tensor_list, tensor) 90 | 91 | data_list = [] 92 | 93 | for size, tensor in zip(size_list, tensor_list): 94 | buffer = tensor.cpu().numpy().tobytes()[:size] 95 | data_list.append(pickle.loads(buffer)) 96 | 97 | return data_list 98 | 99 | 100 | def reduce_loss_dict(loss_dict): 101 | world_size = get_world_size() 102 | 103 | if world_size < 2: 104 | return loss_dict 105 | 106 | with torch.no_grad(): 107 | keys = [] 108 | losses = [] 109 | 110 | for k in sorted(loss_dict.keys()): 111 | keys.append(k) 112 | losses.append(loss_dict[k]) 113 | 114 | losses = torch.stack(losses, 0) 115 | dist.reduce(losses, dst=0) 116 | 117 | if dist.get_rank() == 0: 118 | losses /= world_size 119 | 120 | reduced_losses = {k: v for k, v in zip(keys, losses)} 121 | 122 | return reduced_losses 123 | -------------------------------------------------------------------------------- /gligen/ldm/__init__.py: 
-------------------------------------------------------------------------------- 1 | import gligen.evaluator as evaluator 2 | import gligen.trainer as trainer 3 | import gligen.ldm as ldm -------------------------------------------------------------------------------- /gligen/ldm/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/__pycache__/util.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/__pycache__/util.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/data/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/data/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset 3 | 4 | 5 | class Txt2ImgIterableBaseDataset(IterableDataset): 6 | ''' 7 | Define an interface to make the IterableDatasets for text2img data chainable 8 | ''' 9 | def __init__(self, num_records=0, valid_ids=None, size=256): 10 | super().__init__() 11 | self.num_records = num_records 12 | self.valid_ids = valid_ids 13 | self.sample_ids = valid_ids 14 | self.size = size 15 | 16 | print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.') 17 | 18 | def __len__(self): 19 | return self.num_records 20 | 21 | @abstractmethod 22 | def __iter__(self): 23 | pass -------------------------------------------------------------------------------- /gligen/ldm/data/lsun.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import PIL 4 | from PIL import Image 5 | from torch.utils.data import Dataset 6 | from torchvision import transforms 7 | 8 | 9 | class LSUNBase(Dataset): 10 | def __init__(self, 11 | txt_file, 12 | data_root, 13 | size=None, 14 | interpolation="bicubic", 15 | flip_p=0.5 16 | ): 17 | self.data_paths = txt_file 18 | self.data_root = data_root 19 | with open(self.data_paths, "r") as f: 20 | self.image_paths = f.read().splitlines() 21 | self._length = len(self.image_paths) 22 | self.labels = { 23 | "relative_file_path_": [l for l in self.image_paths], 24 | "file_path_": [os.path.join(self.data_root, l) 25 | for l in self.image_paths], 26 | } 27 | 28 | self.size = size 29 | self.interpolation = {"linear": PIL.Image.LINEAR, 30 | "bilinear": PIL.Image.BILINEAR, 31 | "bicubic": PIL.Image.BICUBIC, 32 | "lanczos": PIL.Image.LANCZOS, 33 | }[interpolation] 34 | self.flip = transforms.RandomHorizontalFlip(p=flip_p) 35 | 36 | def __len__(self): 37 | return self._length 38 | 39 | def __getitem__(self, i): 40 | example = dict((k, self.labels[k][i]) for k in self.labels) 41 | image = Image.open(example["file_path_"]) 42 | if not image.mode == "RGB": 43 | image = image.convert("RGB") 44 | 45 | # default to score-sde 
preprocessing 46 | img = np.array(image).astype(np.uint8) 47 | crop = min(img.shape[0], img.shape[1]) 48 | h, w, = img.shape[0], img.shape[1] 49 | img = img[(h - crop) // 2:(h + crop) // 2, 50 | (w - crop) // 2:(w + crop) // 2] 51 | 52 | image = Image.fromarray(img) 53 | if self.size is not None: 54 | image = image.resize((self.size, self.size), resample=self.interpolation) 55 | 56 | image = self.flip(image) 57 | image = np.array(image).astype(np.uint8) 58 | example["image"] = (image / 127.5 - 1.0).astype(np.float32) 59 | return example 60 | 61 | 62 | class LSUNChurchesTrain(LSUNBase): 63 | def __init__(self, **kwargs): 64 | super().__init__(txt_file="data/lsun/church_outdoor_train.txt", data_root="data/lsun/churches", **kwargs) 65 | 66 | 67 | class LSUNChurchesValidation(LSUNBase): 68 | def __init__(self, flip_p=0., **kwargs): 69 | super().__init__(txt_file="data/lsun/church_outdoor_val.txt", data_root="data/lsun/churches", 70 | flip_p=flip_p, **kwargs) 71 | 72 | 73 | class LSUNBedroomsTrain(LSUNBase): 74 | def __init__(self, **kwargs): 75 | super().__init__(txt_file="data/lsun/bedrooms_train.txt", data_root="data/lsun/bedrooms", **kwargs) 76 | 77 | 78 | class LSUNBedroomsValidation(LSUNBase): 79 | def __init__(self, flip_p=0.0, **kwargs): 80 | super().__init__(txt_file="data/lsun/bedrooms_val.txt", data_root="data/lsun/bedrooms", 81 | flip_p=flip_p, **kwargs) 82 | 83 | 84 | class LSUNCatsTrain(LSUNBase): 85 | def __init__(self, **kwargs): 86 | super().__init__(txt_file="data/lsun/cat_train.txt", data_root="data/lsun/cats", **kwargs) 87 | 88 | 89 | class LSUNCatsValidation(LSUNBase): 90 | def __init__(self, flip_p=0., **kwargs): 91 | super().__init__(txt_file="data/lsun/cat_val.txt", data_root="data/lsun/cats", 92 | flip_p=flip_p, **kwargs) 93 | -------------------------------------------------------------------------------- /gligen/ldm/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class LambdaWarmUpCosineScheduler: 5 | """ 6 | note: use with a base_lr of 1.0 7 | """ 8 | def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): 9 | self.lr_warm_up_steps = warm_up_steps 10 | self.lr_start = lr_start 11 | self.lr_min = lr_min 12 | self.lr_max = lr_max 13 | self.lr_max_decay_steps = max_decay_steps 14 | self.last_lr = 0. 15 | self.verbosity_interval = verbosity_interval 16 | 17 | def schedule(self, n, **kwargs): 18 | if self.verbosity_interval > 0: 19 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") 20 | if n < self.lr_warm_up_steps: 21 | lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start 22 | self.last_lr = lr 23 | return lr 24 | else: 25 | t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) 26 | t = min(t, 1.0) 27 | lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 28 | 1 + np.cos(t * np.pi)) 29 | self.last_lr = lr 30 | return lr 31 | 32 | def __call__(self, n, **kwargs): 33 | return self.schedule(n,**kwargs) 34 | 35 | 36 | class LambdaWarmUpCosineScheduler2: 37 | """ 38 | supports repeated iterations, configurable via lists 39 | note: use with a base_lr of 1.0. 
40 | """ 41 | def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0): 42 | assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths) 43 | self.lr_warm_up_steps = warm_up_steps 44 | self.f_start = f_start 45 | self.f_min = f_min 46 | self.f_max = f_max 47 | self.cycle_lengths = cycle_lengths 48 | self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths)) 49 | self.last_f = 0. 50 | self.verbosity_interval = verbosity_interval 51 | 52 | def find_in_interval(self, n): 53 | interval = 0 54 | for cl in self.cum_cycles[1:]: 55 | if n <= cl: 56 | return interval 57 | interval += 1 58 | 59 | def schedule(self, n, **kwargs): 60 | cycle = self.find_in_interval(n) 61 | n = n - self.cum_cycles[cycle] 62 | if self.verbosity_interval > 0: 63 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 64 | f"current cycle {cycle}") 65 | if n < self.lr_warm_up_steps[cycle]: 66 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 67 | self.last_f = f 68 | return f 69 | else: 70 | t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]) 71 | t = min(t, 1.0) 72 | f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * ( 73 | 1 + np.cos(t * np.pi)) 74 | self.last_f = f 75 | return f 76 | 77 | def __call__(self, n, **kwargs): 78 | return self.schedule(n, **kwargs) 79 | 80 | 81 | class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2): 82 | 83 | def schedule(self, n, **kwargs): 84 | cycle = self.find_in_interval(n) 85 | n = n - self.cum_cycles[cycle] 86 | if self.verbosity_interval > 0: 87 | if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " 88 | f"current cycle {cycle}") 89 | 90 | if n < self.lr_warm_up_steps[cycle]: 91 | f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] 92 | self.last_f = f 93 | return f 94 | else: 95 | f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle]) 96 | self.last_f = f 97 | return f 98 | 99 | -------------------------------------------------------------------------------- /gligen/ldm/models/__pycache__/autoencoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/__pycache__/autoencoder.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/autoencoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | #import pytorch_lightning as pl 4 | import torch.nn.functional as F 5 | from contextlib import contextmanager 6 | 7 | # from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer 8 | 9 | from ldm.modules.diffusionmodules.model import Encoder, Decoder 10 | from ldm.modules.distributions.distributions import DiagonalGaussianDistribution 11 | 12 | from ldm.util import instantiate_from_config 13 | 14 | 15 | 16 | 17 | class AutoencoderKL(nn.Module): 18 | def __init__(self, 19 | ddconfig, 20 | embed_dim, 21 | scale_factor=1 22 | ): 23 | super().__init__() 24 | self.encoder = Encoder(**ddconfig) 25 | self.decoder = Decoder(**ddconfig) 26 | assert ddconfig["double_z"] 27 | 
self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1) 28 | self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1) 29 | self.embed_dim = embed_dim 30 | self.scale_factor = scale_factor 31 | 32 | 33 | 34 | def encode(self, x): 35 | h = self.encoder(x) 36 | moments = self.quant_conv(h) 37 | posterior = DiagonalGaussianDistribution(moments) 38 | return posterior.sample() * self.scale_factor 39 | 40 | def decode(self, z): 41 | z = 1. / self.scale_factor * z 42 | z = self.post_quant_conv(z) 43 | dec = self.decoder(z) 44 | return dec 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/ddim.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/ddim.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/ddpm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/ddpm.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/ldm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/ldm.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/__pycache__/plms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/models/diffusion/__pycache__/plms.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/ddim.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from tqdm import tqdm 4 | from functools import partial 5 | 6 | from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like 7 | 8 | 9 | class DDIMSampler(object): 10 | def __init__(self, diffusion, model, schedule="linear", alpha_generator_func=None, set_alpha_scale=None): 11 | super().__init__() 12 | self.diffusion = diffusion 13 | self.model = model 14 | self.device = 
diffusion.betas.device 15 | self.ddpm_num_timesteps = diffusion.num_timesteps 16 | self.schedule = schedule 17 | self.alpha_generator_func = alpha_generator_func 18 | self.set_alpha_scale = set_alpha_scale 19 | 20 | 21 | def register_buffer(self, name, attr): 22 | if type(attr) == torch.Tensor: 23 | attr = attr.to(self.device) 24 | setattr(self, name, attr) 25 | 26 | 27 | def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.): 28 | self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps, 29 | num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=False) 30 | alphas_cumprod = self.diffusion.alphas_cumprod 31 | assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' 32 | to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.device) 33 | 34 | self.register_buffer('betas', to_torch(self.diffusion.betas)) 35 | self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) 36 | self.register_buffer('alphas_cumprod_prev', to_torch(self.diffusion.alphas_cumprod_prev)) 37 | 38 | # calculations for diffusion q(x_t | x_{t-1}) and others 39 | self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))) 40 | self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) 41 | self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu()))) 42 | self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) 43 | self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) 44 | 45 | # ddim sampling parameters 46 | ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(), 47 | ddim_timesteps=self.ddim_timesteps, 48 | eta=ddim_eta,verbose=False) 49 | self.register_buffer('ddim_sigmas', ddim_sigmas) 50 | self.register_buffer('ddim_alphas', ddim_alphas) 51 | self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) 52 | self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. 
- ddim_alphas)) 53 | sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( 54 | (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * ( 55 | 1 - self.alphas_cumprod / self.alphas_cumprod_prev)) 56 | self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps) 57 | 58 | 59 | @torch.no_grad() 60 | def sample(self, S, shape, input, uc=None, guidance_scale=1, mask=None, x0=None): 61 | self.make_schedule(ddim_num_steps=S) 62 | return self.ddim_sampling(shape, input, uc, guidance_scale, mask=mask, x0=x0) 63 | 64 | 65 | @torch.no_grad() 66 | def ddim_sampling(self, shape, input, uc, guidance_scale=1, mask=None, x0=None): 67 | b = shape[0] 68 | 69 | img = input["x"] 70 | if img == None: 71 | img = torch.randn(shape, device=self.device) 72 | input["x"] = img 73 | 74 | 75 | time_range = np.flip(self.ddim_timesteps) 76 | total_steps = self.ddim_timesteps.shape[0] 77 | 78 | #iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) 79 | iterator = time_range 80 | 81 | if self.alpha_generator_func != None: 82 | alphas = self.alpha_generator_func(len(iterator)) 83 | 84 | 85 | for i, step in enumerate(iterator): 86 | 87 | # set alpha 88 | if self.alpha_generator_func != None: 89 | self.set_alpha_scale(self.model, alphas[i]) 90 | 91 | # run 92 | index = total_steps - i - 1 93 | input["timesteps"] = torch.full((b,), step, device=self.device, dtype=torch.long) 94 | 95 | if mask is not None: 96 | assert x0 is not None 97 | img_orig = self.diffusion.q_sample( x0, input["timesteps"] ) 98 | img = img_orig * mask + (1. - mask) * img 99 | input["x"] = img 100 | 101 | img, pred_x0 = self.p_sample_ddim(input, index=index, uc=uc, guidance_scale=guidance_scale) 102 | input["x"] = img 103 | 104 | return img 105 | 106 | 107 | @torch.no_grad() 108 | def p_sample_ddim(self, input, index, uc=None, guidance_scale=1): 109 | 110 | 111 | e_t = self.model(input) 112 | if uc is not None and guidance_scale != 1: 113 | unconditional_input = dict(x=input["x"], timesteps=input["timesteps"], context=uc) 114 | if "inpainting_extra_input" in input: 115 | unconditional_input["inpainting_extra_input"] = input["inpainting_extra_input"] 116 | e_t_uncond = self.model( unconditional_input ) 117 | e_t = e_t_uncond + guidance_scale * (e_t - e_t_uncond) 118 | 119 | # select parameters corresponding to the currently considered timestep 120 | b = input["x"].shape[0] 121 | a_t = torch.full((b, 1, 1, 1), self.ddim_alphas[index], device=self.device) 122 | a_prev = torch.full((b, 1, 1, 1), self.ddim_alphas_prev[index], device=self.device) 123 | sigma_t = torch.full((b, 1, 1, 1), self.ddim_sigmas[index], device=self.device) 124 | sqrt_one_minus_at = torch.full((b, 1, 1, 1), self.ddim_sqrt_one_minus_alphas[index],device=self.device) 125 | 126 | # current prediction for x_0 127 | pred_x0 = (input["x"] - sqrt_one_minus_at * e_t) / a_t.sqrt() 128 | 129 | # direction pointing to x_t 130 | dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t 131 | noise = sigma_t * torch.randn_like( input["x"] ) 132 | x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise 133 | 134 | return x_prev, pred_x0 135 | -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/ddpm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from functools import partial 5 | from ldm.modules.diffusionmodules.util import make_beta_schedule 6 | 7 | 8 | 9 | 10 | 11 | class DDPM(nn.Module): 12 | def __init__(self, beta_schedule="linear", timesteps=1000, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): 13 | super().__init__() 14 | 15 | self.v_posterior = 0 16 | self.register_schedule(beta_schedule, timesteps, linear_start, linear_end, cosine_s) 17 | 18 | 19 | def register_schedule(self, beta_schedule="linear", timesteps=1000, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): 20 | 21 | betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s) 22 | alphas = 1. - betas 23 | alphas_cumprod = np.cumprod(alphas, axis=0) 24 | alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) 25 | 26 | timesteps, = betas.shape 27 | self.num_timesteps = int(timesteps) 28 | self.linear_start = linear_start 29 | self.linear_end = linear_end 30 | assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep' 31 | 32 | to_torch = partial(torch.tensor, dtype=torch.float32) 33 | 34 | self.register_buffer('betas', to_torch(betas)) 35 | self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) 36 | self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev)) 37 | 38 | # calculations for diffusion q(x_t | x_{t-1}) and others 39 | self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) 40 | self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) 41 | self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod))) 42 | self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod))) 43 | self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1))) 44 | 45 | # calculations for posterior q(x_{t-1} | x_t, x_0) 46 | posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / ( 1. - alphas_cumprod) + self.v_posterior * betas 47 | # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t) 48 | 49 | self.register_buffer('posterior_variance', to_torch(posterior_variance)) 50 | 51 | # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain 52 | self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20)))) 53 | self.register_buffer('posterior_mean_coef1', to_torch( betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))) 54 | self.register_buffer('posterior_mean_coef2', to_torch( (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. 
- alphas_cumprod))) 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /gligen/ldm/models/diffusion/ldm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from tqdm import tqdm 5 | from ldm.util import default 6 | from ldm.modules.diffusionmodules.util import extract_into_tensor 7 | from .ddpm import DDPM 8 | 9 | 10 | 11 | class LatentDiffusion(DDPM): 12 | def __init__(self, *args, **kwargs): 13 | super().__init__(*args, **kwargs) 14 | # hardcoded 15 | self.clip_denoised = False 16 | 17 | 18 | 19 | def q_sample(self, x_start, t, noise=None): 20 | noise = default(noise, lambda: torch.randn_like(x_start)) 21 | return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + 22 | extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise) 23 | 24 | 25 | "Does not support DDPM sampling anymore. Only do DDIM or PLMS" 26 | 27 | # = = = = = = = = = = = = Below is for sampling = = = = = = = = = = = = # 28 | 29 | # def predict_start_from_noise(self, x_t, t, noise): 30 | # return ( extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - 31 | # extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise ) 32 | 33 | # def q_posterior(self, x_start, x_t, t): 34 | # posterior_mean = ( 35 | # extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + 36 | # extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t 37 | # ) 38 | # posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape) 39 | # posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape) 40 | # return posterior_mean, posterior_variance, posterior_log_variance_clipped 41 | 42 | 43 | # def p_mean_variance(self, model, x, c, t): 44 | 45 | # model_out = model(x, t, c) 46 | # x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) 47 | 48 | # if self.clip_denoised: 49 | # x_recon.clamp_(-1., 1.) 
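# (the clamp would keep the reconstructed x_0 inside the [-1, 1] range that the autoencoder decodes from;
#  clip_denoised is hardcoded to False in __init__, and this whole DDPM sampling path is kept only as
#  commented-out reference code, since sampling goes through the DDIM or PLMS samplers instead)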
50 | 51 | # model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t) 52 | # return model_mean, posterior_variance, posterior_log_variance, x_recon 53 | 54 | 55 | # @torch.no_grad() 56 | # def p_sample(self, model, x, c, t): 57 | # b, *_, device = *x.shape, x.device 58 | # model_mean, _, model_log_variance, x0 = self.p_mean_variance(model, x=x, c=c, t=t, ) 59 | # noise = torch.randn_like(x) 60 | 61 | # # no noise when t == 0 62 | # nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1))) 63 | 64 | # return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, x0 65 | 66 | 67 | # @torch.no_grad() 68 | # def p_sample_loop(self, model, shape, c): 69 | # device = self.betas.device 70 | # b = shape[0] 71 | # img = torch.randn(shape, device=device) 72 | 73 | # iterator = tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps) 74 | # for i in iterator: 75 | # ts = torch.full((b,), i, device=device, dtype=torch.long) 76 | # img, x0 = self.p_sample(model, img, c, ts) 77 | 78 | # return img 79 | 80 | 81 | # @torch.no_grad() 82 | # def sample(self, model, shape, c, uc=None, guidance_scale=None): 83 | # return self.p_sample_loop(model, shape, c) 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /gligen/ldm/modules/__pycache__/attention.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/__pycache__/attention.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/__pycache__/x_transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/__pycache__/x_transformer.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/model.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/openaimodel.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/openaimodel.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/positionnet.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/positionnet.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/__pycache__/util.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/diffusionmodules/__pycache__/util.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/positionnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from ldm.modules.attention import BasicTransformerBlock 4 | from ldm.modules.diffusionmodules.util import checkpoint, FourierEmbedder 5 | import torch.nn.functional as F 6 | 7 | 8 | 9 | class PositionNet(nn.Module): 10 | def __init__(self, positive_len, out_dim, fourier_freqs=8): 11 | super().__init__() 12 | self.positive_len = positive_len 13 | self.out_dim = out_dim 14 | 15 | self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs) 16 | self.position_dim = fourier_freqs*2*4 # 2 is sin&cos, 4 is xyxy 17 | 18 | self.linears = nn.Sequential( 19 | nn.Linear( self.positive_len + self.position_dim, 512), 20 | nn.SiLU(), 21 | nn.Linear( 512, 512), 22 | nn.SiLU(), 23 | nn.Linear(512, out_dim), 24 | ) 25 | 26 | self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) 27 | self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim])) 28 | 29 | 30 | def forward(self, boxes, masks, positive_embeddings): 31 | B, N, _ = boxes.shape 32 | masks = masks.unsqueeze(-1) 33 | 34 | # embedding position (it may includes padding as placeholder) 35 | xyxy_embedding = self.fourier_embedder(boxes) # B*N*4 --> B*N*C 36 | 37 | # learnable null embedding 38 | positive_null = self.null_positive_feature.view(1,1,-1) 39 | xyxy_null = self.null_position_feature.view(1,1,-1) 40 | 41 | # replace padding with learnable null embedding 42 | positive_embeddings = positive_embeddings*masks + (1-masks)*positive_null 43 | xyxy_embedding = xyxy_embedding*masks + (1-masks)*xyxy_null 44 | 45 | objs = self.linears( torch.cat([positive_embeddings, xyxy_embedding], dim=-1) ) 46 | assert objs.shape == torch.Size([B,N,self.out_dim]) 47 | return objs 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /gligen/ldm/modules/diffusionmodules/positionnet_with_image.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from ldm.modules.attention import BasicTransformerBlock 4 | from ldm.modules.diffusionmodules.util import checkpoint, FourierEmbedder 5 | import torch.nn.functional as F 6 | 7 | 8 | 9 | class PositionNet(nn.Module): 10 | def __init__(self, positive_len, out_dim, fourier_freqs=8): 11 | super().__init__() 12 | self.positive_len = 
positive_len 13 | self.out_dim = out_dim 14 | 15 | self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs) 16 | self.position_dim = fourier_freqs*2*4 # 2 is sin&cos, 4 is xyxy 17 | 18 | # -------------------------------------------------------------- # 19 | self.linears_text = nn.Sequential( 20 | nn.Linear( self.positive_len + self.position_dim, 512), 21 | nn.SiLU(), 22 | nn.Linear( 512, 512), 23 | nn.SiLU(), 24 | nn.Linear(512, out_dim), 25 | ) 26 | 27 | self.linears_image = nn.Sequential( 28 | nn.Linear( self.positive_len + self.position_dim, 512), 29 | nn.SiLU(), 30 | nn.Linear( 512, 512), 31 | nn.SiLU(), 32 | nn.Linear(512, out_dim), 33 | ) 34 | 35 | # -------------------------------------------------------------- # 36 | self.null_text_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) 37 | self.null_image_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) 38 | self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim])) 39 | 40 | 41 | def forward(self, boxes, masks, text_masks, image_masks, text_embeddings, image_embeddings): 42 | B, N, _ = boxes.shape 43 | masks = masks.unsqueeze(-1) # B*N*1 44 | text_masks = text_masks.unsqueeze(-1) # B*N*1 45 | image_masks = image_masks.unsqueeze(-1) # B*N*1 46 | 47 | # embedding position (it may includes padding as placeholder) 48 | xyxy_embedding = self.fourier_embedder(boxes) # B*N*4 --> B*N*C 49 | 50 | # learnable null embedding 51 | text_null = self.null_text_feature.view(1,1,-1) # 1*1*C 52 | image_null = self.null_image_feature.view(1,1,-1) # 1*1*C 53 | xyxy_null = self.null_position_feature.view(1,1,-1) # 1*1*C 54 | 55 | # replace padding with learnable null embedding 56 | text_embeddings = text_embeddings*text_masks + (1-text_masks)*text_null 57 | image_embeddings = image_embeddings*image_masks + (1-image_masks)*image_null 58 | xyxy_embedding = xyxy_embedding*masks + (1-masks)*xyxy_null 59 | 60 | objs_text = self.linears_text( torch.cat([text_embeddings, xyxy_embedding], dim=-1) ) 61 | objs_image = self.linears_image( torch.cat([image_embeddings,xyxy_embedding], dim=-1) ) 62 | objs = torch.cat( [objs_text,objs_image], dim=1 ) 63 | 64 | assert objs.shape == torch.Size([B,N*2,self.out_dim]) 65 | return objs 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /gligen/ldm/modules/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/distributions/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/modules/distributions/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/distributions/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/distributions/__pycache__/distributions.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/distributions/__pycache__/distributions.cpython-39.pyc -------------------------------------------------------------------------------- 
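The two PositionNet variants above turn per-box coordinates plus text (and optionally image) features into the grounding tokens that GLIGEN's gated attention layers consume. Below is a minimal sketch of exercising the text-only variant in isolation; it assumes the gligen directory is on the Python path so the ldm package resolves, and the batch size, box count, and 768-dimensional feature width are illustrative choices, not values read from the training configs.

import torch
from ldm.modules.diffusionmodules.positionnet import PositionNet

net = PositionNet(positive_len=768, out_dim=768)   # 768 is an assumed text-feature width

B, N = 2, 30                                       # 2 images, up to 30 grounding boxes each
boxes = torch.rand(B, N, 4)                        # xyxy coordinates normalised to [0, 1]
masks = torch.zeros(B, N)                          # 1 = real box, 0 = padding
masks[:, :3] = 1.0                                 # pretend only the first 3 boxes are real
text_features = torch.randn(B, N, 768)             # per-box text embeddings; padded rows are ignored

objs = net(boxes, masks, text_features)            # padded slots fall back to the learned null embeddings
print(objs.shape)                                  # torch.Size([2, 30, 768]), i.e. (B, N, out_dim)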
/gligen/ldm/modules/distributions/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | class AbstractDistribution: 6 | def sample(self): 7 | raise NotImplementedError() 8 | 9 | def mode(self): 10 | raise NotImplementedError() 11 | 12 | 13 | class DiracDistribution(AbstractDistribution): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | def sample(self): 18 | return self.value 19 | 20 | def mode(self): 21 | return self.value 22 | 23 | 24 | class DiagonalGaussianDistribution(object): 25 | def __init__(self, parameters, deterministic=False): 26 | self.parameters = parameters 27 | self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) 28 | self.logvar = torch.clamp(self.logvar, -30.0, 20.0) 29 | self.deterministic = deterministic 30 | self.std = torch.exp(0.5 * self.logvar) 31 | self.var = torch.exp(self.logvar) 32 | if self.deterministic: 33 | self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) 34 | 35 | def sample(self): 36 | x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) 37 | return x 38 | 39 | def kl(self, other=None): 40 | if self.deterministic: 41 | return torch.Tensor([0.]) 42 | else: 43 | if other is None: 44 | return 0.5 * torch.sum(torch.pow(self.mean, 2) 45 | + self.var - 1.0 - self.logvar, 46 | dim=[1, 2, 3]) 47 | else: 48 | return 0.5 * torch.sum( 49 | torch.pow(self.mean - other.mean, 2) / other.var 50 | + self.var / other.var - 1.0 - self.logvar + other.logvar, 51 | dim=[1, 2, 3]) 52 | 53 | def nll(self, sample, dims=[1,2,3]): 54 | if self.deterministic: 55 | return torch.Tensor([0.]) 56 | logtwopi = np.log(2.0 * np.pi) 57 | return 0.5 * torch.sum( 58 | logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, 59 | dim=dims) 60 | 61 | def mode(self): 62 | return self.mean 63 | 64 | 65 | def normal_kl(mean1, logvar1, mean2, logvar2): 66 | """ 67 | source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 68 | Compute the KL divergence between two gaussians. 69 | Shapes are automatically broadcasted, so batches can be compared to 70 | scalars, among other use cases. 71 | """ 72 | tensor = None 73 | for obj in (mean1, logvar1, mean2, logvar2): 74 | if isinstance(obj, torch.Tensor): 75 | tensor = obj 76 | break 77 | assert tensor is not None, "at least one argument must be a Tensor" 78 | 79 | # Force variances to be Tensors. Broadcasting helps convert scalars to 80 | # Tensors, but it does not work for torch.exp(). 
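# (torch.tensor(x).to(tensor) below places any scalar argument on the same dtype and device as the
#  first Tensor found above, so the exp() calls and subtractions in the return expression broadcast cleanly)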
81 | logvar1, logvar2 = [ 82 | x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) 83 | for x in (logvar1, logvar2) 84 | ] 85 | 86 | return 0.5 * ( 87 | -1.0 88 | + logvar2 89 | - logvar1 90 | + torch.exp(logvar1 - logvar2) 91 | + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) 92 | ) 93 | -------------------------------------------------------------------------------- /gligen/ldm/modules/ema.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class LitEma(nn.Module): 6 | def __init__(self, model, decay=0.9999, use_num_upates=True): 7 | super().__init__() 8 | if decay < 0.0 or decay > 1.0: 9 | raise ValueError('Decay must be between 0 and 1') 10 | 11 | self.m_name2s_name = {} 12 | self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) 13 | self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates 14 | else torch.tensor(-1,dtype=torch.int)) 15 | 16 | for name, p in model.named_parameters(): 17 | if p.requires_grad: 18 | #remove as '.'-character is not allowed in buffers 19 | s_name = name.replace('.','') 20 | self.m_name2s_name.update({name:s_name}) 21 | self.register_buffer(s_name,p.clone().detach().data) 22 | 23 | self.collected_params = [] 24 | 25 | def forward(self,model): 26 | decay = self.decay 27 | 28 | if self.num_updates >= 0: 29 | self.num_updates += 1 30 | decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) 31 | 32 | one_minus_decay = 1.0 - decay 33 | 34 | with torch.no_grad(): 35 | m_param = dict(model.named_parameters()) 36 | shadow_params = dict(self.named_buffers()) 37 | 38 | for key in m_param: 39 | if m_param[key].requires_grad: 40 | sname = self.m_name2s_name[key] 41 | shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) 42 | shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) 43 | else: 44 | assert not key in self.m_name2s_name 45 | 46 | def copy_to(self, model): 47 | m_param = dict(model.named_parameters()) 48 | shadow_params = dict(self.named_buffers()) 49 | for key in m_param: 50 | if m_param[key].requires_grad: 51 | m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) 52 | else: 53 | assert not key in self.m_name2s_name 54 | 55 | def store(self, parameters): 56 | """ 57 | Save the current parameters for restoring later. 58 | Args: 59 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 60 | temporarily stored. 61 | """ 62 | self.collected_params = [param.clone() for param in parameters] 63 | 64 | def restore(self, parameters): 65 | """ 66 | Restore the parameters stored with the `store` method. 67 | Useful to validate the model with EMA parameters without affecting the 68 | original optimization process. Store the parameters before the 69 | `copy_to` method. After validation (or model saving), use this to 70 | restore the former parameters. 71 | Args: 72 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 73 | updated with the stored parameters. 
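A typical round trip is ema.store(model.parameters()), then ema.copy_to(model), run the EMA-weight evaluation, and finally ema.restore(model.parameters()).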
74 | """ 75 | for c_param, param in zip(self.collected_params, parameters): 76 | param.data.copy_(c_param.data) 77 | -------------------------------------------------------------------------------- /gligen/ldm/modules/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/encoders/__init__.py -------------------------------------------------------------------------------- /gligen/ldm/modules/encoders/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/encoders/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/encoders/__pycache__/modules.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/ldm/modules/encoders/__pycache__/modules.cpython-39.pyc -------------------------------------------------------------------------------- /gligen/ldm/modules/image_degradation/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr 2 | from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light 3 | -------------------------------------------------------------------------------- /gligen/ldm/modules/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator -------------------------------------------------------------------------------- /gligen/ldm/modules/losses/contperceptual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/no? 
5 | 6 | 7 | class LPIPSWithDiscriminator(nn.Module): 8 | def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0, 9 | disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0, 10 | perceptual_weight=1.0, use_actnorm=False, disc_conditional=False, 11 | disc_loss="hinge"): 12 | 13 | super().__init__() 14 | assert disc_loss in ["hinge", "vanilla"] 15 | self.kl_weight = kl_weight 16 | self.pixel_weight = pixelloss_weight 17 | self.perceptual_loss = LPIPS().eval() 18 | self.perceptual_weight = perceptual_weight 19 | # output log variance 20 | self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init) 21 | 22 | self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels, 23 | n_layers=disc_num_layers, 24 | use_actnorm=use_actnorm 25 | ).apply(weights_init) 26 | self.discriminator_iter_start = disc_start 27 | self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss 28 | self.disc_factor = disc_factor 29 | self.discriminator_weight = disc_weight 30 | self.disc_conditional = disc_conditional 31 | 32 | def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): 33 | if last_layer is not None: 34 | nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] 35 | g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] 36 | else: 37 | nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] 38 | g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] 39 | 40 | d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) 41 | d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() 42 | d_weight = d_weight * self.discriminator_weight 43 | return d_weight 44 | 45 | def forward(self, inputs, reconstructions, posteriors, optimizer_idx, 46 | global_step, last_layer=None, cond=None, split="train", 47 | weights=None): 48 | rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) 49 | if self.perceptual_weight > 0: 50 | p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) 51 | rec_loss = rec_loss + self.perceptual_weight * p_loss 52 | 53 | nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar 54 | weighted_nll_loss = nll_loss 55 | if weights is not None: 56 | weighted_nll_loss = weights*nll_loss 57 | weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] 58 | nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] 59 | kl_loss = posteriors.kl() 60 | kl_loss = torch.sum(kl_loss) / kl_loss.shape[0] 61 | 62 | # now the GAN part 63 | if optimizer_idx == 0: 64 | # generator update 65 | if cond is None: 66 | assert not self.disc_conditional 67 | logits_fake = self.discriminator(reconstructions.contiguous()) 68 | else: 69 | assert self.disc_conditional 70 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1)) 71 | g_loss = -torch.mean(logits_fake) 72 | 73 | if self.disc_factor > 0.0: 74 | try: 75 | d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) 76 | except RuntimeError: 77 | assert not self.training 78 | d_weight = torch.tensor(0.0) 79 | else: 80 | d_weight = torch.tensor(0.0) 81 | 82 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 83 | loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss 84 | 85 | log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(), 86 | 
"{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(), 87 | "{}/rec_loss".format(split): rec_loss.detach().mean(), 88 | "{}/d_weight".format(split): d_weight.detach(), 89 | "{}/disc_factor".format(split): torch.tensor(disc_factor), 90 | "{}/g_loss".format(split): g_loss.detach().mean(), 91 | } 92 | return loss, log 93 | 94 | if optimizer_idx == 1: 95 | # second pass for discriminator update 96 | if cond is None: 97 | logits_real = self.discriminator(inputs.contiguous().detach()) 98 | logits_fake = self.discriminator(reconstructions.contiguous().detach()) 99 | else: 100 | logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1)) 101 | logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1)) 102 | 103 | disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) 104 | d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) 105 | 106 | log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(), 107 | "{}/logits_real".format(split): logits_real.detach().mean(), 108 | "{}/logits_fake".format(split): logits_fake.detach().mean() 109 | } 110 | return d_loss, log 111 | 112 | -------------------------------------------------------------------------------- /gligen/ldm/util.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import torch 4 | import numpy as np 5 | 6 | from inspect import isfunction 7 | from PIL import Image, ImageDraw, ImageFont 8 | 9 | 10 | def log_txt_as_img(wh, xc, size=10): 11 | # wh a tuple of (width, height) 12 | # xc a list of captions to plot 13 | b = len(xc) 14 | txts = list() 15 | for bi in range(b): 16 | txt = Image.new("RGB", wh, color="white") 17 | draw = ImageDraw.Draw(txt) 18 | font = ImageFont.truetype('data/DejaVuSans.ttf', size=size) 19 | nc = int(40 * (wh[0] / 256)) 20 | lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc)) 21 | 22 | try: 23 | draw.text((0, 0), lines, fill="black", font=font) 24 | except UnicodeEncodeError: 25 | print("Cant encode string for logging. Skipping.") 26 | 27 | txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0 28 | txts.append(txt) 29 | txts = np.stack(txts) 30 | txts = torch.tensor(txts) 31 | return txts 32 | 33 | 34 | def ismap(x): 35 | if not isinstance(x, torch.Tensor): 36 | return False 37 | return (len(x.shape) == 4) and (x.shape[1] > 3) 38 | 39 | 40 | def isimage(x): 41 | if not isinstance(x,torch.Tensor): 42 | return False 43 | return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) 44 | 45 | 46 | def exists(x): 47 | return x is not None 48 | 49 | 50 | def default(val, d): 51 | if exists(val): 52 | return val 53 | return d() if isfunction(d) else d 54 | 55 | 56 | def mean_flat(tensor): 57 | """ 58 | https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86 59 | Take the mean over all non-batch dimensions. 
60 | """ 61 | return tensor.mean(dim=list(range(1, len(tensor.shape)))) 62 | 63 | 64 | def count_params(model, verbose=False): 65 | total_params = sum(p.numel() for p in model.parameters()) 66 | if verbose: 67 | print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") 68 | return total_params 69 | 70 | 71 | def instantiate_from_config(config): 72 | if not "target" in config: 73 | if config == '__is_first_stage__': 74 | return None 75 | elif config == "__is_unconditional__": 76 | return None 77 | raise KeyError("Expected key `target` to instantiate.") 78 | return get_obj_from_str(config["target"])(**config.get("params", dict())) 79 | 80 | 81 | def get_obj_from_str(string, reload=False): 82 | module, cls = string.rsplit(".", 1) 83 | if reload: 84 | module_imp = importlib.import_module(module) 85 | importlib.reload(module_imp) 86 | return getattr(importlib.import_module(module, package=None), cls) -------------------------------------------------------------------------------- /gligen/projection_matrix.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/gligen/projection_matrix.pth -------------------------------------------------------------------------------- /groundingdino/_C.cpython-39-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/_C.cpython-39-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /groundingdino/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/__init__.py -------------------------------------------------------------------------------- /groundingdino/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/config/GroundingDINO_SwinB.cfg.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_B_384_22k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 
| use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /groundingdino/config/GroundingDINO_SwinT_OGC.py: -------------------------------------------------------------------------------- 1 | batch_size = 1 2 | modelname = "groundingdino" 3 | backbone = "swin_T_224_1k" 4 | position_embedding = "sine" 5 | pe_temperatureH = 20 6 | pe_temperatureW = 20 7 | return_interm_indices = [1, 2, 3] 8 | backbone_freeze_keywords = None 9 | enc_layers = 6 10 | dec_layers = 6 11 | pre_norm = False 12 | dim_feedforward = 2048 13 | hidden_dim = 256 14 | dropout = 0.0 15 | nheads = 8 16 | num_queries = 900 17 | query_dim = 4 18 | num_patterns = 0 19 | num_feature_levels = 4 20 | enc_n_points = 4 21 | dec_n_points = 4 22 | two_stage_type = "standard" 23 | two_stage_bbox_embed_share = False 24 | two_stage_class_embed_share = False 25 | transformer_activation = "relu" 26 | dec_pred_bbox_embed_share = True 27 | dn_box_noise_scale = 1.0 28 | dn_label_noise_ratio = 0.5 29 | dn_label_coef = 1.0 30 | dn_bbox_coef = 1.0 31 | embed_init_tgt = True 32 | dn_labelbook_size = 2000 33 | max_text_len = 256 34 | text_encoder_type = "bert-base-uncased" 35 | use_text_enhancer = True 36 | use_fusion_layer = True 37 | use_checkpoint = True 38 | use_transformer_ckpt = True 39 | use_text_cross_attention = True 40 | text_dropout = 0.0 41 | fusion_dropout = 0.0 42 | fusion_droppath = 0.1 43 | sub_sentence_present = True 44 | -------------------------------------------------------------------------------- /groundingdino/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/datasets/__init__.py -------------------------------------------------------------------------------- /groundingdino/datasets/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/datasets/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/datasets/__pycache__/transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/datasets/__pycache__/transforms.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Conditional DETR 8 | # Copyright (c) 2021 Microsoft. All Rights Reserved. 
9 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 10 | # ------------------------------------------------------------------------ 11 | # Copied from DETR (https://github.com/facebookresearch/detr) 12 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 13 | # ------------------------------------------------------------------------ 14 | 15 | from .groundingdino import build_groundingdino 16 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import build_backbone 2 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | namespace groundingdino { 20 | 21 | at::Tensor 22 | ms_deform_attn_forward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const int im2col_step) 29 | { 30 | if (value.type().is_cuda()) 31 | { 32 | #ifdef WITH_CUDA 33 | return ms_deform_attn_cuda_forward( 34 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 35 | #else 36 | AT_ERROR("Not compiled with GPU support"); 37 | #endif 38 | } 39 | AT_ERROR("Not implemented on the CPU"); 40 | } 41 | 42 | std::vector 43 | ms_deform_attn_backward( 44 | const at::Tensor &value, 45 | const at::Tensor &spatial_shapes, 46 | const at::Tensor &level_start_index, 47 | const at::Tensor &sampling_loc, 48 | const at::Tensor &attn_weight, 49 | const at::Tensor &grad_output, 50 | const int im2col_step) 51 | { 52 | if (value.type().is_cuda()) 53 | { 54 | #ifdef WITH_CUDA 55 | return ms_deform_attn_cuda_backward( 56 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 57 | #else 58 | AT_ERROR("Not compiled with GPU support"); 59 | #endif 60 | } 61 | AT_ERROR("Not implemented on the CPU"); 62 | } 63 | 64 | } // namespace groundingdino -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | namespace groundingdino { 17 | 18 | at::Tensor 19 | ms_deform_attn_cpu_forward( 20 | const at::Tensor &value, 21 | const at::Tensor &spatial_shapes, 22 | const at::Tensor &level_start_index, 23 | const at::Tensor &sampling_loc, 24 | const at::Tensor &attn_weight, 25 | const int im2col_step) 26 | { 27 | AT_ERROR("Not implement on cpu"); 28 | } 29 | 30 | std::vector 31 | ms_deform_attn_cpu_backward( 32 | const at::Tensor &value, 33 | const at::Tensor &spatial_shapes, 34 | const at::Tensor &level_start_index, 35 | const at::Tensor &sampling_loc, 36 | const at::Tensor &attn_weight, 37 | const at::Tensor &grad_output, 38 | const int im2col_step) 39 | { 40 | AT_ERROR("Not implement on cpu"); 41 | } 42 | 43 | } // namespace groundingdino 44 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | namespace groundingdino { 15 | 16 | at::Tensor 17 | ms_deform_attn_cpu_forward( 18 | const at::Tensor &value, 19 | const at::Tensor &spatial_shapes, 20 | const at::Tensor &level_start_index, 21 | const at::Tensor &sampling_loc, 22 | const at::Tensor &attn_weight, 23 | const int im2col_step); 24 | 25 | std::vector 26 | ms_deform_attn_cpu_backward( 27 | const at::Tensor &value, 28 | const at::Tensor &spatial_shapes, 29 | const at::Tensor &level_start_index, 30 | const at::Tensor &sampling_loc, 31 | const at::Tensor &attn_weight, 32 | const at::Tensor &grad_output, 33 | const int im2col_step); 34 | 35 | } // namespace groundingdino 36 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/MsDeformAttn/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | namespace groundingdino { 15 | 16 | at::Tensor ms_deform_attn_cuda_forward( 17 | const at::Tensor &value, 18 | const at::Tensor &spatial_shapes, 19 | const at::Tensor &level_start_index, 20 | const at::Tensor &sampling_loc, 21 | const at::Tensor &attn_weight, 22 | const int im2col_step); 23 | 24 | std::vector ms_deform_attn_cuda_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | } // namespace groundingdino -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/cuda_version.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | namespace groundingdino { 4 | int get_cudart_version() { 5 | return CUDART_VERSION; 6 | } 7 | } // namespace groundingdino 8 | -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | 3 | #include "MsDeformAttn/ms_deform_attn.h" 4 | 5 | namespace groundingdino { 6 | 7 | #ifdef WITH_CUDA 8 | extern int get_cudart_version(); 9 | #endif 10 | 11 | std::string get_cuda_version() { 12 | #ifdef WITH_CUDA 13 | std::ostringstream oss; 14 | 15 | // copied from 16 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 17 | auto printCudaStyleVersion = [&](int v) { 18 | oss << (v / 1000) << "." << (v / 10 % 100); 19 | if (v % 10 != 0) { 20 | oss << "." << (v % 10); 21 | } 22 | }; 23 | printCudaStyleVersion(get_cudart_version()); 24 | return oss.str(); 25 | #else 26 | return std::string("not available"); 27 | #endif 28 | } 29 | 30 | // similar to 31 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp 32 | std::string get_compiler_version() { 33 | std::ostringstream ss; 34 | #if defined(__GNUC__) 35 | #ifndef __clang__ 36 | { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } 37 | #endif 38 | #endif 39 | 40 | #if defined(__clang_major__) 41 | { 42 | ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
43 | << __clang_patchlevel__; 44 | } 45 | #endif 46 | 47 | #if defined(_MSC_VER) 48 | { ss << "MSVC " << _MSC_FULL_VER; } 49 | #endif 50 | return ss.str(); 51 | } 52 | 53 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 54 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 55 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 56 | } 57 | 58 | } // namespace groundingdino -------------------------------------------------------------------------------- /groundingdino/models/GroundingDINO/transformer_vanilla.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved 8 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 9 | """ 10 | DETR Transformer class. 11 | 12 | Copy-paste from torch.nn.Transformer with modifications: 13 | * positional encodings are passed in MHattention 14 | * extra LN at the end of encoder is removed 15 | * decoder returns a stack of activations from all decoding layers 16 | """ 17 | from typing import Optional 18 | 19 | import torch 20 | import torch.nn.functional as F 21 | from torch import Tensor, nn 22 | 23 | from .utils import ( 24 | MLP, 25 | _get_activation_fn, 26 | _get_clones, 27 | gen_encoder_output_proposals, 28 | gen_sineembed_for_position, 29 | sigmoid_focal_loss, 30 | ) 31 | 32 | 33 | class TextTransformer(nn.Module): 34 | def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1): 35 | super().__init__() 36 | self.num_layers = num_layers 37 | self.d_model = d_model 38 | self.nheads = nheads 39 | self.dim_feedforward = dim_feedforward 40 | self.norm = None 41 | 42 | single_encoder_layer = TransformerEncoderLayer( 43 | d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout 44 | ) 45 | self.layers = _get_clones(single_encoder_layer, num_layers) 46 | 47 | def forward(self, memory_text: torch.Tensor, text_attention_mask: torch.Tensor): 48 | """ 49 | 50 | Args: 51 | text_attention_mask: bs, num_token 52 | memory_text: bs, num_token, d_model 53 | 54 | Raises: 55 | RuntimeError: _description_ 56 | 57 | Returns: 58 | output: bs, num_token, d_model 59 | """ 60 | 61 | output = memory_text.transpose(0, 1) 62 | 63 | for layer in self.layers: 64 | output = layer(output, src_key_padding_mask=text_attention_mask) 65 | 66 | if self.norm is not None: 67 | output = self.norm(output) 68 | 69 | return output.transpose(0, 1) 70 | 71 | 72 | class TransformerEncoderLayer(nn.Module): 73 | def __init__( 74 | self, 75 | d_model, 76 | nhead, 77 | dim_feedforward=2048, 78 | dropout=0.1, 79 | activation="relu", 80 | normalize_before=False, 81 | ): 82 | super().__init__() 83 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 84 | # Implementation of Feedforward model 85 | self.linear1 = nn.Linear(d_model, dim_feedforward) 86 | self.dropout = nn.Dropout(dropout) 87 | self.linear2 = nn.Linear(dim_feedforward, d_model) 88 | 89 | self.norm1 = nn.LayerNorm(d_model) 90 | self.norm2 = nn.LayerNorm(d_model) 91 | self.dropout1 = 
nn.Dropout(dropout) 92 | self.dropout2 = nn.Dropout(dropout) 93 | 94 | self.activation = _get_activation_fn(activation) 95 | self.normalize_before = normalize_before 96 | self.nhead = nhead 97 | 98 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 99 | return tensor if pos is None else tensor + pos 100 | 101 | def forward( 102 | self, 103 | src, 104 | src_mask: Optional[Tensor] = None, 105 | src_key_padding_mask: Optional[Tensor] = None, 106 | pos: Optional[Tensor] = None, 107 | ): 108 | # repeat attn mask 109 | if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]: 110 | # bs, num_q, num_k 111 | src_mask = src_mask.repeat(self.nhead, 1, 1) 112 | 113 | q = k = self.with_pos_embed(src, pos) 114 | 115 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0] 116 | 117 | # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] 118 | src = src + self.dropout1(src2) 119 | src = self.norm1(src) 120 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 121 | src = src + self.dropout2(src2) 122 | src = self.norm2(src) 123 | return src 124 | -------------------------------------------------------------------------------- /groundingdino/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | from .GroundingDINO import build_groundingdino 9 | 10 | 11 | def build_model(args): 12 | # we use register to maintain models from catdet6 on. 13 | from .registry import MODULE_BUILD_FUNCS 14 | 15 | assert args.modelname in MODULE_BUILD_FUNCS._module_dict 16 | build_func = MODULE_BUILD_FUNCS.get(args.modelname) 17 | model = build_func(args) 18 | return model 19 | -------------------------------------------------------------------------------- /groundingdino/models/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/__pycache__/registry.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/models/__pycache__/registry.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/models/registry.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Grounding DINO 3 | # url: https://github.com/IDEA-Research/GroundingDINO 4 | # Copyright (c) 2023 IDEA. All Rights Reserved. 
5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | # -*- coding: utf-8 -*- 8 | # @Author: Yihao Chen 9 | # @Date: 2021-08-16 16:03:17 10 | # @Last Modified by: Shilong Liu 11 | # @Last Modified time: 2022-01-23 15:26 12 | # modified from mmcv 13 | 14 | import inspect 15 | from functools import partial 16 | 17 | 18 | class Registry(object): 19 | def __init__(self, name): 20 | self._name = name 21 | self._module_dict = dict() 22 | 23 | def __repr__(self): 24 | format_str = self.__class__.__name__ + "(name={}, items={})".format( 25 | self._name, list(self._module_dict.keys()) 26 | ) 27 | return format_str 28 | 29 | def __len__(self): 30 | return len(self._module_dict) 31 | 32 | @property 33 | def name(self): 34 | return self._name 35 | 36 | @property 37 | def module_dict(self): 38 | return self._module_dict 39 | 40 | def get(self, key): 41 | return self._module_dict.get(key, None) 42 | 43 | def registe_with_name(self, module_name=None, force=False): 44 | return partial(self.register, module_name=module_name, force=force) 45 | 46 | def register(self, module_build_function, module_name=None, force=False): 47 | """Register a module build function. 48 | Args: 49 | module (:obj:`nn.Module`): Module to be registered. 50 | """ 51 | if not inspect.isfunction(module_build_function): 52 | raise TypeError( 53 | "module_build_function must be a function, but got {}".format( 54 | type(module_build_function) 55 | ) 56 | ) 57 | if module_name is None: 58 | module_name = module_build_function.__name__ 59 | if not force and module_name in self._module_dict: 60 | raise KeyError("{} is already registered in {}".format(module_name, self.name)) 61 | self._module_dict[module_name] = module_build_function 62 | 63 | return module_build_function 64 | 65 | 66 | MODULE_BUILD_FUNCS = Registry("model build functions") 67 | -------------------------------------------------------------------------------- /groundingdino/util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | -------------------------------------------------------------------------------- /groundingdino/util/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/util/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/util/__pycache__/box_ops.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/util/__pycache__/box_ops.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/util/__pycache__/slconfig.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/util/__pycache__/slconfig.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/util/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/groundingdino/util/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /groundingdino/util/box_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | Utilities for bounding box manipulation and GIoU. 
4 | """ 5 | import torch 6 | from torchvision.ops.boxes import box_area 7 | 8 | 9 | def box_cxcywh_to_xyxy(x): 10 | x_c, y_c, w, h = x.unbind(-1) 11 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] 12 | return torch.stack(b, dim=-1) 13 | 14 | 15 | def box_xyxy_to_cxcywh(x): 16 | x0, y0, x1, y1 = x.unbind(-1) 17 | b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] 18 | return torch.stack(b, dim=-1) 19 | 20 | 21 | # modified from torchvision to also return the union 22 | def box_iou(boxes1, boxes2): 23 | area1 = box_area(boxes1) 24 | area2 = box_area(boxes2) 25 | 26 | # import ipdb; ipdb.set_trace() 27 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 28 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 29 | 30 | wh = (rb - lt).clamp(min=0) # [N,M,2] 31 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 32 | 33 | union = area1[:, None] + area2 - inter 34 | 35 | iou = inter / (union + 1e-6) 36 | return iou, union 37 | 38 | 39 | def generalized_box_iou(boxes1, boxes2): 40 | """ 41 | Generalized IoU from https://giou.stanford.edu/ 42 | 43 | The boxes should be in [x0, y0, x1, y1] format 44 | 45 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 46 | and M = len(boxes2) 47 | """ 48 | # degenerate boxes gives inf / nan results 49 | # so do an early check 50 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 51 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 52 | # except: 53 | # import ipdb; ipdb.set_trace() 54 | iou, union = box_iou(boxes1, boxes2) 55 | 56 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 57 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 58 | 59 | wh = (rb - lt).clamp(min=0) # [N,M,2] 60 | area = wh[:, :, 0] * wh[:, :, 1] 61 | 62 | return iou - (area - union) / (area + 1e-6) 63 | 64 | 65 | # modified from torchvision to also return the union 66 | def box_iou_pairwise(boxes1, boxes2): 67 | area1 = box_area(boxes1) 68 | area2 = box_area(boxes2) 69 | 70 | lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] 71 | rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] 72 | 73 | wh = (rb - lt).clamp(min=0) # [N,2] 74 | inter = wh[:, 0] * wh[:, 1] # [N] 75 | 76 | union = area1 + area2 - inter 77 | 78 | iou = inter / union 79 | return iou, union 80 | 81 | 82 | def generalized_box_iou_pairwise(boxes1, boxes2): 83 | """ 84 | Generalized IoU from https://giou.stanford.edu/ 85 | 86 | Input: 87 | - boxes1, boxes2: N,4 88 | Output: 89 | - giou: N, 4 90 | """ 91 | # degenerate boxes gives inf / nan results 92 | # so do an early check 93 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 94 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 95 | assert boxes1.shape == boxes2.shape 96 | iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 97 | 98 | lt = torch.min(boxes1[:, :2], boxes2[:, :2]) 99 | rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) 100 | 101 | wh = (rb - lt).clamp(min=0) # [N,2] 102 | area = wh[:, 0] * wh[:, 1] 103 | 104 | return iou - (area - union) / area 105 | 106 | 107 | def masks_to_boxes(masks): 108 | """Compute the bounding boxes around the provided masks 109 | 110 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
111 | 112 | Returns a [N, 4] tensors, with the boxes in xyxy format 113 | """ 114 | if masks.numel() == 0: 115 | return torch.zeros((0, 4), device=masks.device) 116 | 117 | h, w = masks.shape[-2:] 118 | 119 | y = torch.arange(0, h, dtype=torch.float) 120 | x = torch.arange(0, w, dtype=torch.float) 121 | y, x = torch.meshgrid(y, x) 122 | 123 | x_mask = masks * x.unsqueeze(0) 124 | x_max = x_mask.flatten(1).max(-1)[0] 125 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 126 | 127 | y_mask = masks * y.unsqueeze(0) 128 | y_max = y_mask.flatten(1).max(-1)[0] 129 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 130 | 131 | return torch.stack([x_min, y_min, x_max, y_max], 1) 132 | 133 | 134 | if __name__ == "__main__": 135 | x = torch.rand(5, 4) 136 | y = torch.rand(3, 4) 137 | iou, union = box_iou(x, y) 138 | import ipdb 139 | 140 | ipdb.set_trace() 141 | -------------------------------------------------------------------------------- /groundingdino/util/get_tokenlizer.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast 2 | 3 | 4 | def get_tokenlizer(text_encoder_type): 5 | if not isinstance(text_encoder_type, str): 6 | # print("text_encoder_type is not a str") 7 | if hasattr(text_encoder_type, "text_encoder_type"): 8 | text_encoder_type = text_encoder_type.text_encoder_type 9 | elif text_encoder_type.get("text_encoder_type", False): 10 | text_encoder_type = text_encoder_type.get("text_encoder_type") 11 | else: 12 | raise ValueError( 13 | "Unknown type of text_encoder_type: {}".format(type(text_encoder_type)) 14 | ) 15 | print("final text_encoder_type: {}".format(text_encoder_type)) 16 | 17 | tokenizer = AutoTokenizer.from_pretrained(text_encoder_type) 18 | return tokenizer 19 | 20 | 21 | def get_pretrained_language_model(text_encoder_type): 22 | if text_encoder_type == "bert-base-uncased": 23 | return BertModel.from_pretrained(text_encoder_type) 24 | if text_encoder_type == "roberta-base": 25 | return RobertaModel.from_pretrained(text_encoder_type) 26 | raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type)) 27 | -------------------------------------------------------------------------------- /groundingdino/util/inference.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List 2 | 3 | import cv2 4 | import numpy as np 5 | import supervision as sv 6 | import torch 7 | from PIL import Image 8 | from torchvision.ops import box_convert 9 | 10 | import groundingdino.datasets.transforms as T 11 | from groundingdino.models import build_model 12 | from groundingdino.util.misc import clean_state_dict 13 | from groundingdino.util.slconfig import SLConfig 14 | from groundingdino.util.utils import get_phrases_from_posmap 15 | 16 | 17 | def preprocess_caption(caption: str) -> str: 18 | result = caption.lower().strip() 19 | if result.endswith("."): 20 | return result 21 | return result + "." 
22 | 23 | 24 | def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"): 25 | args = SLConfig.fromfile(model_config_path) 26 | args.device = device 27 | model = build_model(args) 28 | checkpoint = torch.load(model_checkpoint_path, map_location="cpu") 29 | model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False) 30 | model.eval() 31 | return model 32 | 33 | 34 | def load_image(image_path: str) -> Tuple[np.array, torch.Tensor]: 35 | transform = T.Compose( 36 | [ 37 | T.RandomResize([800], max_size=1333), 38 | T.ToTensor(), 39 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 40 | ] 41 | ) 42 | image_source = Image.open(image_path).convert("RGB") 43 | image = np.asarray(image_source) 44 | image_transformed, _ = transform(image_source, None) 45 | return image, image_transformed 46 | 47 | 48 | def predict( 49 | model, 50 | image: torch.Tensor, 51 | caption: str, 52 | box_threshold: float, 53 | text_threshold: float, 54 | device: str = "cuda" 55 | ) -> Tuple[torch.Tensor, torch.Tensor, List[str]]: 56 | caption = preprocess_caption(caption=caption) 57 | 58 | model = model.to(device) 59 | image = image.to(device) 60 | 61 | with torch.no_grad(): 62 | outputs = model(image[None], captions=[caption]) 63 | 64 | prediction_logits = outputs["pred_logits"].cpu().sigmoid()[0] # prediction_logits.shape = (nq, 256) 65 | prediction_boxes = outputs["pred_boxes"].cpu()[0] # prediction_boxes.shape = (nq, 4) 66 | 67 | mask = prediction_logits.max(dim=1)[0] > box_threshold 68 | logits = prediction_logits[mask] # logits.shape = (n, 256) 69 | boxes = prediction_boxes[mask] # boxes.shape = (n, 4) 70 | 71 | tokenizer = model.tokenizer 72 | tokenized = tokenizer(caption) 73 | 74 | phrases = [ 75 | get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '') 76 | for logit 77 | in logits 78 | ] 79 | 80 | return boxes, logits.max(dim=1)[0], phrases 81 | 82 | 83 | def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray: 84 | h, w, _ = image_source.shape 85 | boxes = boxes * torch.Tensor([w, h, w, h]) 86 | xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() 87 | detections = sv.Detections(xyxy=xyxy) 88 | 89 | labels = [ 90 | f"{phrase} {logit:.2f}" 91 | for phrase, logit 92 | in zip(phrases, logits) 93 | ] 94 | 95 | box_annotator = sv.BoxAnnotator() 96 | annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR) 97 | annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) 98 | return annotated_frame 99 | -------------------------------------------------------------------------------- /groundingdino/util/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import functools 3 | import logging 4 | import os 5 | import sys 6 | 7 | from termcolor import colored 8 | 9 | 10 | class _ColorfulFormatter(logging.Formatter): 11 | def __init__(self, *args, **kwargs): 12 | self._root_name = kwargs.pop("root_name") + "." 13 | self._abbrev_name = kwargs.pop("abbrev_name", "") 14 | if len(self._abbrev_name): 15 | self._abbrev_name = self._abbrev_name + "." 
16 | super(_ColorfulFormatter, self).__init__(*args, **kwargs) 17 | 18 | def formatMessage(self, record): 19 | record.name = record.name.replace(self._root_name, self._abbrev_name) 20 | log = super(_ColorfulFormatter, self).formatMessage(record) 21 | if record.levelno == logging.WARNING: 22 | prefix = colored("WARNING", "red", attrs=["blink"]) 23 | elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: 24 | prefix = colored("ERROR", "red", attrs=["blink", "underline"]) 25 | else: 26 | return log 27 | return prefix + " " + log 28 | 29 | 30 | # so that calling setup_logger multiple times won't add many handlers 31 | @functools.lru_cache() 32 | def setup_logger(output=None, distributed_rank=0, *, color=True, name="imagenet", abbrev_name=None): 33 | """ 34 | Initialize the detectron2 logger and set its verbosity level to "INFO". 35 | 36 | Args: 37 | output (str): a file name or a directory to save log. If None, will not save log file. 38 | If ends with ".txt" or ".log", assumed to be a file name. 39 | Otherwise, logs will be saved to `output/log.txt`. 40 | name (str): the root module name of this logger 41 | 42 | Returns: 43 | logging.Logger: a logger 44 | """ 45 | logger = logging.getLogger(name) 46 | logger.setLevel(logging.DEBUG) 47 | logger.propagate = False 48 | 49 | if abbrev_name is None: 50 | abbrev_name = name 51 | 52 | plain_formatter = logging.Formatter( 53 | "[%(asctime)s.%(msecs)03d]: %(message)s", datefmt="%m/%d %H:%M:%S" 54 | ) 55 | # stdout logging: master only 56 | if distributed_rank == 0: 57 | ch = logging.StreamHandler(stream=sys.stdout) 58 | ch.setLevel(logging.DEBUG) 59 | if color: 60 | formatter = _ColorfulFormatter( 61 | colored("[%(asctime)s.%(msecs)03d]: ", "green") + "%(message)s", 62 | datefmt="%m/%d %H:%M:%S", 63 | root_name=name, 64 | abbrev_name=str(abbrev_name), 65 | ) 66 | else: 67 | formatter = plain_formatter 68 | ch.setFormatter(formatter) 69 | logger.addHandler(ch) 70 | 71 | # file logging: all workers 72 | if output is not None: 73 | if output.endswith(".txt") or output.endswith(".log"): 74 | filename = output 75 | else: 76 | filename = os.path.join(output, "log.txt") 77 | if distributed_rank > 0: 78 | filename = filename + f".rank{distributed_rank}" 79 | os.makedirs(os.path.dirname(filename), exist_ok=True) 80 | 81 | fh = logging.StreamHandler(_cached_log_stream(filename)) 82 | fh.setLevel(logging.DEBUG) 83 | fh.setFormatter(plain_formatter) 84 | logger.addHandler(fh) 85 | 86 | return logger 87 | 88 | 89 | # cache the opened file object, so that different calls to `setup_logger` 90 | # with the same file name can safely write to the same file. 
91 | @functools.lru_cache(maxsize=None) 92 | def _cached_log_stream(filename): 93 | return open(filename, "a") 94 | -------------------------------------------------------------------------------- /groundingdino/util/slio.py: -------------------------------------------------------------------------------- 1 | # ========================================================== 2 | # Modified from mmcv 3 | # ========================================================== 4 | 5 | import json 6 | import pickle 7 | from abc import ABCMeta, abstractmethod 8 | from pathlib import Path 9 | 10 | import yaml 11 | 12 | try: 13 | from yaml import CLoader as Loader, CDumper as Dumper 14 | except ImportError: 15 | from yaml import Loader, Dumper 16 | 17 | 18 | # =========================== 19 | # Rigister handler 20 | # =========================== 21 | 22 | 23 | class BaseFileHandler(metaclass=ABCMeta): 24 | @abstractmethod 25 | def load_from_fileobj(self, file, **kwargs): 26 | pass 27 | 28 | @abstractmethod 29 | def dump_to_fileobj(self, obj, file, **kwargs): 30 | pass 31 | 32 | @abstractmethod 33 | def dump_to_str(self, obj, **kwargs): 34 | pass 35 | 36 | def load_from_path(self, filepath, mode="r", **kwargs): 37 | with open(filepath, mode) as f: 38 | return self.load_from_fileobj(f, **kwargs) 39 | 40 | def dump_to_path(self, obj, filepath, mode="w", **kwargs): 41 | with open(filepath, mode) as f: 42 | self.dump_to_fileobj(obj, f, **kwargs) 43 | 44 | 45 | class JsonHandler(BaseFileHandler): 46 | def load_from_fileobj(self, file): 47 | return json.load(file) 48 | 49 | def dump_to_fileobj(self, obj, file, **kwargs): 50 | json.dump(obj, file, **kwargs) 51 | 52 | def dump_to_str(self, obj, **kwargs): 53 | return json.dumps(obj, **kwargs) 54 | 55 | 56 | class PickleHandler(BaseFileHandler): 57 | def load_from_fileobj(self, file, **kwargs): 58 | return pickle.load(file, **kwargs) 59 | 60 | def load_from_path(self, filepath, **kwargs): 61 | return super(PickleHandler, self).load_from_path(filepath, mode="rb", **kwargs) 62 | 63 | def dump_to_str(self, obj, **kwargs): 64 | kwargs.setdefault("protocol", 2) 65 | return pickle.dumps(obj, **kwargs) 66 | 67 | def dump_to_fileobj(self, obj, file, **kwargs): 68 | kwargs.setdefault("protocol", 2) 69 | pickle.dump(obj, file, **kwargs) 70 | 71 | def dump_to_path(self, obj, filepath, **kwargs): 72 | super(PickleHandler, self).dump_to_path(obj, filepath, mode="wb", **kwargs) 73 | 74 | 75 | class YamlHandler(BaseFileHandler): 76 | def load_from_fileobj(self, file, **kwargs): 77 | kwargs.setdefault("Loader", Loader) 78 | return yaml.load(file, **kwargs) 79 | 80 | def dump_to_fileobj(self, obj, file, **kwargs): 81 | kwargs.setdefault("Dumper", Dumper) 82 | yaml.dump(obj, file, **kwargs) 83 | 84 | def dump_to_str(self, obj, **kwargs): 85 | kwargs.setdefault("Dumper", Dumper) 86 | return yaml.dump(obj, **kwargs) 87 | 88 | 89 | file_handlers = { 90 | "json": JsonHandler(), 91 | "yaml": YamlHandler(), 92 | "yml": YamlHandler(), 93 | "pickle": PickleHandler(), 94 | "pkl": PickleHandler(), 95 | } 96 | 97 | # =========================== 98 | # load and dump 99 | # =========================== 100 | 101 | 102 | def is_str(x): 103 | """Whether the input is an string instance. 104 | 105 | Note: This method is deprecated since python 2 is no longer supported. 106 | """ 107 | return isinstance(x, str) 108 | 109 | 110 | def slload(file, file_format=None, **kwargs): 111 | """Load data from json/yaml/pickle files. 112 | 113 | This method provides a unified api for loading data from serialized files. 
114 | 115 | Args: 116 | file (str or :obj:`Path` or file-like object): Filename or a file-like 117 | object. 118 | file_format (str, optional): If not specified, the file format will be 119 | inferred from the file extension, otherwise use the specified one. 120 | Currently supported formats include "json", "yaml/yml" and 121 | "pickle/pkl". 122 | 123 | Returns: 124 | The content from the file. 125 | """ 126 | if isinstance(file, Path): 127 | file = str(file) 128 | if file_format is None and is_str(file): 129 | file_format = file.split(".")[-1] 130 | if file_format not in file_handlers: 131 | raise TypeError(f"Unsupported format: {file_format}") 132 | 133 | handler = file_handlers[file_format] 134 | if is_str(file): 135 | obj = handler.load_from_path(file, **kwargs) 136 | elif hasattr(file, "read"): 137 | obj = handler.load_from_fileobj(file, **kwargs) 138 | else: 139 | raise TypeError('"file" must be a filepath str or a file-object') 140 | return obj 141 | 142 | 143 | def sldump(obj, file=None, file_format=None, **kwargs): 144 | """Dump data to json/yaml/pickle strings or files. 145 | 146 | This method provides a unified api for dumping data as strings or to files, 147 | and also supports custom arguments for each file format. 148 | 149 | Args: 150 | obj (any): The python object to be dumped. 151 | file (str or :obj:`Path` or file-like object, optional): If not 152 | specified, then the object is dump to a str, otherwise to a file 153 | specified by the filename or file-like object. 154 | file_format (str, optional): Same as :func:`load`. 155 | 156 | Returns: 157 | bool: True for success, False otherwise. 158 | """ 159 | if isinstance(file, Path): 160 | file = str(file) 161 | if file_format is None: 162 | if is_str(file): 163 | file_format = file.split(".")[-1] 164 | elif file is None: 165 | raise ValueError("file_format must be specified since file is None") 166 | if file_format not in file_handlers: 167 | raise TypeError(f"Unsupported format: {file_format}") 168 | 169 | handler = file_handlers[file_format] 170 | if file is None: 171 | return handler.dump_to_str(obj, **kwargs) 172 | elif is_str(file): 173 | handler.dump_to_path(obj, file, **kwargs) 174 | elif hasattr(file, "write"): 175 | handler.dump_to_fileobj(obj, file, **kwargs) 176 | else: 177 | raise TypeError('"file" must be a filename str or a file-object') 178 | -------------------------------------------------------------------------------- /groundingdino/util/time_counter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | 5 | class TimeCounter: 6 | def __init__(self) -> None: 7 | pass 8 | 9 | def clear(self): 10 | self.timedict = {} 11 | self.basetime = time.perf_counter() 12 | 13 | def timeit(self, name): 14 | nowtime = time.perf_counter() - self.basetime 15 | self.timedict[name] = nowtime 16 | self.basetime = time.perf_counter() 17 | 18 | 19 | class TimeHolder: 20 | def __init__(self) -> None: 21 | self.timedict = {} 22 | 23 | def update(self, _timedict: dict): 24 | for k, v in _timedict.items(): 25 | if k not in self.timedict: 26 | self.timedict[k] = AverageMeter(name=k, val_only=True) 27 | self.timedict[k].update(val=v) 28 | 29 | def final_res(self): 30 | return {k: v.avg for k, v in self.timedict.items()} 31 | 32 | def __str__(self): 33 | return json.dumps(self.final_res(), indent=2) 34 | 35 | 36 | class AverageMeter(object): 37 | """Computes and stores the average and current value""" 38 | 39 | def __init__(self, name, fmt=":f", val_only=False): 40 
| self.name = name 41 | self.fmt = fmt 42 | self.val_only = val_only 43 | self.reset() 44 | 45 | def reset(self): 46 | self.val = 0 47 | self.avg = 0 48 | self.sum = 0 49 | self.count = 0 50 | 51 | def update(self, val, n=1): 52 | self.val = val 53 | self.sum += val * n 54 | self.count += n 55 | self.avg = self.sum / self.count 56 | 57 | def __str__(self): 58 | if self.val_only: 59 | fmtstr = "{name} {val" + self.fmt + "}" 60 | else: 61 | fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" 62 | return fmtstr.format(**self.__dict__) 63 | -------------------------------------------------------------------------------- /groundingdino/util/vl_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from typing import List 4 | 5 | import torch 6 | 7 | 8 | def create_positive_map_from_span(tokenized, token_span, max_text_len=256): 9 | """construct a map such that positive_map[i,j] = True iff box i is associated to token j 10 | Input: 11 | - tokenized: 12 | - input_ids: Tensor[1, ntokens] 13 | - attention_mask: Tensor[1, ntokens] 14 | - token_span: list with length num_boxes. 15 | - each item: [start_idx, end_idx] 16 | """ 17 | positive_map = torch.zeros((len(token_span), max_text_len), dtype=torch.float) 18 | for j, tok_list in enumerate(token_span): 19 | for (beg, end) in tok_list: 20 | beg_pos = tokenized.char_to_token(beg) 21 | end_pos = tokenized.char_to_token(end - 1) 22 | if beg_pos is None: 23 | try: 24 | beg_pos = tokenized.char_to_token(beg + 1) 25 | if beg_pos is None: 26 | beg_pos = tokenized.char_to_token(beg + 2) 27 | except: 28 | beg_pos = None 29 | if end_pos is None: 30 | try: 31 | end_pos = tokenized.char_to_token(end - 2) 32 | if end_pos is None: 33 | end_pos = tokenized.char_to_token(end - 3) 34 | except: 35 | end_pos = None 36 | if beg_pos is None or end_pos is None: 37 | continue 38 | 39 | assert beg_pos is not None and end_pos is not None 40 | if os.environ.get("SHILONG_DEBUG_ONLY_ONE_POS", None) == "TRUE": 41 | positive_map[j, beg_pos] = 1 42 | break 43 | else: 44 | positive_map[j, beg_pos : end_pos + 1].fill_(1) 45 | 46 | return positive_map / (positive_map.sum(-1)[:, None] + 1e-6) 47 | 48 | 49 | def build_captions_and_token_span(cat_list, force_lowercase): 50 | """ 51 | Return: 52 | captions: str 53 | cat2tokenspan: dict 54 | { 55 | 'dog': [[0, 2]], 56 | ... 57 | } 58 | """ 59 | 60 | cat2tokenspan = {} 61 | captions = "" 62 | for catname in cat_list: 63 | class_name = catname 64 | if force_lowercase: 65 | class_name = class_name.lower() 66 | if "/" in class_name: 67 | class_name_list: List = class_name.strip().split("/") 68 | class_name_list.append(class_name) 69 | class_name: str = random.choice(class_name_list) 70 | 71 | tokens_positive_i = [] 72 | subnamelist = [i.strip() for i in class_name.strip().split(" ")] 73 | for subname in subnamelist: 74 | if len(subname) == 0: 75 | continue 76 | if len(captions) > 0: 77 | captions = captions + " " 78 | strat_idx = len(captions) 79 | end_idx = strat_idx + len(subname) 80 | tokens_positive_i.append([strat_idx, end_idx]) 81 | captions = captions + subname 82 | 83 | if len(tokens_positive_i) > 0: 84 | captions = captions + " ." 
85 | cat2tokenspan[class_name] = tokens_positive_i 86 | 87 | return captions, cat2tokenspan 88 | 89 | 90 | def build_id2posspan_and_caption(category_dict: dict): 91 | """Build id2pos_span and caption from category_dict 92 | 93 | Args: 94 | category_dict (dict): category_dict 95 | """ 96 | cat_list = [item["name"].lower() for item in category_dict] 97 | id2catname = {item["id"]: item["name"].lower() for item in category_dict} 98 | caption, cat2posspan = build_captions_and_token_span(cat_list, force_lowercase=True) 99 | id2posspan = {catid: cat2posspan[catname] for catid, catname in id2catname.items()} 100 | return id2posspan, caption 101 | -------------------------------------------------------------------------------- /groundingdino/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | { 5 | black --version | grep -E "23\." > /dev/null 6 | } || { 7 | echo "Linter requires 'black==23.*' !" 8 | exit 1 9 | } 10 | 11 | ISORT_VERSION=$(isort --version-number) 12 | if [[ "$ISORT_VERSION" != 5.12* ]]; then 13 | echo "Linter requires isort==5.12.0 !" 14 | exit 1 15 | fi 16 | 17 | echo "Running isort ..." 18 | isort . --atomic 19 | 20 | echo "Running black ..." 21 | black -l 100 . 22 | 23 | echo "Running flake8 ..." 24 | if [ -x "$(command -v flake8)" ]; then 25 | flake8 . 26 | else 27 | python3 -m flake8 . 28 | fi 29 | 30 | echo "Running mypy..." 31 | 32 | mypy --exclude 'setup.py|notebooks' . 33 | -------------------------------------------------------------------------------- /modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder import MaskDecoder 10 | from .prompt_encoder import PromptEncoder 11 | from .transformer import TwoWayTransformer 12 | -------------------------------------------------------------------------------- /modeling/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/image_encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/image_encoder.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/mask_decoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/mask_decoder.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/prompt_encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/prompt_encoder.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/sam.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/sam.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/modeling/__pycache__/transformer.cpython-39.pyc -------------------------------------------------------------------------------- /modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /notebooks/images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/notebooks/images/dog.jpg -------------------------------------------------------------------------------- /notebooks/images/groceries.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/notebooks/images/groceries.jpg -------------------------------------------------------------------------------- /notebooks/images/truck.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/notebooks/images/truck.jpg -------------------------------------------------------------------------------- /outputs/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/outputs/.placeholder -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | addict 2 | diffusers 3 | gradio 4 | huggingface_hub 5 | matplotlib 6 | numpy 7 | onnxruntime 8 | opencv_python 9 | Pillow 10 | pycocotools 11 | PyYAML 12 | requests 13 | setuptools 14 | supervision 15 | termcolor 16 | timm 17 | torch 18 | torchvision 19 | transformers==4.28.1 20 | yapf 21 | accelerate 22 | exif 23 | textblob 24 | einops 25 | omegaconf 26 | ultralytics==8.0.95 27 | clip 28 | inflect -------------------------------------------------------------------------------- /runs/detect/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/runs/detect/.placeholder -------------------------------------------------------------------------------- /segment_anything/.flake8: 
-------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = W503, E203, E221, C901, C408, E741, C407, B017, F811, C101, EXE001, EXE002 3 | max-line-length = 100 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | per-file-ignores = 7 | **/__init__.py:F401,F403,E402 8 | -------------------------------------------------------------------------------- /segment_anything/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . 
All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /segment_anything/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to segment-anything 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints, using the `linter.sh` script in the project's root directory. Linting requires `black==23.*`, `isort==5.12.0`, `flake8`, and `mypy`. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to segment-anything, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 
32 | -------------------------------------------------------------------------------- /segment_anything/README.md: -------------------------------------------------------------------------------- 1 | # Segment Anything 2 | 3 | **[Meta AI Research, FAIR](https://ai.facebook.com/research/)** 4 | 5 | [Alexander Kirillov](https://alexander-kirillov.github.io/), [Eric Mintun](https://ericmintun.github.io/), [Nikhila Ravi](https://nikhilaravi.com/), [Hanzi Mao](https://hanzimao.me/), Chloe Rolland, Laura Gustafson, [Tete Xiao](https://tetexiao.com), [Spencer Whitehead](https://www.spencerwhitehead.com/), Alex Berg, Wan-Yen Lo, [Piotr Dollar](https://pdollar.github.io/), [Ross Girshick](https://www.rossgirshick.info/) 6 | 7 | [[`Paper`](https://ai.facebook.com/research/publications/segment-anything/)] [[`Project`](https://segment-anything.com/)] [[`Demo`](https://segment-anything.com/demo)] [[`Dataset`](https://segment-anything.com/dataset/index.html)] [[`Blog`](https://ai.facebook.com/blog/segment-anything-foundation-model-image-segmentation/)] 8 | 9 | ![SAM design](assets/model_diagram.png?raw=true) 10 | 11 | The **Segment Anything Model (SAM)** produces high quality object masks from input prompts such as points or boxes, and it can be used to generate masks for all objects in an image. It has been trained on a [dataset](https://segment-anything.com/dataset/index.html) of 11 million images and 1.1 billion masks, and has strong zero-shot performance on a variety of segmentation tasks. 12 | 13 |

14 | ![Example masks](assets/masks1.png?raw=true) 15 | ![Example masks](assets/masks2.jpg?raw=true) 16 |

17 | 18 | ## Installation 19 | 20 | The code requires `python>=3.8`, as well as `pytorch>=1.7` and `torchvision>=0.8`. Please follow the instructions [here](https://pytorch.org/get-started/locally/) to install both PyTorch and TorchVision dependencies. Installing both PyTorch and TorchVision with CUDA support is strongly recommended. 21 | 22 | Install Segment Anything: 23 | 24 | ``` 25 | pip install git+https://github.com/facebookresearch/segment-anything.git 26 | ``` 27 | 28 | or clone the repository locally and install with 29 | 30 | ``` 31 | git clone git@github.com:facebookresearch/segment-anything.git 32 | cd segment-anything; pip install -e . 33 | ``` 34 | 35 | The following optional dependencies are necessary for mask post-processing, saving masks in COCO format, the example notebooks, and exporting the model in ONNX format. `jupyter` is also required to run the example notebooks. 36 | ``` 37 | pip install opencv-python pycocotools matplotlib onnxruntime onnx 38 | ``` 39 | 40 | 41 | ## Getting Started 42 | 43 | First download a [model checkpoint](#model-checkpoints). Then the model can be used in just a few lines to get masks from a given prompt: 44 | 45 | ``` 46 | from segment_anything import build_sam, SamPredictor 47 | predictor = SamPredictor(build_sam(checkpoint="<path/to/checkpoint>")) 48 | predictor.set_image(<your_image>) 49 | masks, _, _ = predictor.predict(<input_prompts>) 50 | ``` 51 | 52 | or generate masks for an entire image: 53 | 54 | ``` 55 | from segment_anything import build_sam, SamAutomaticMaskGenerator 56 | mask_generator = SamAutomaticMaskGenerator(build_sam(checkpoint="<path/to/checkpoint>")) 57 | masks = mask_generator.generate(<your_image>) 58 | ``` 59 | 60 | Additionally, masks can be generated for images from the command line: 61 | 62 | ``` 63 | python scripts/amg.py --checkpoint <path/to/checkpoint> --input <image_or_folder> --output <output_directory> 64 | ``` 65 | 66 | See the example notebooks on [using SAM with prompts](/notebooks/predictor_example.ipynb) and [automatically generating masks](/notebooks/automatic_mask_generator_example.ipynb) for more details. 67 | 68 |
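For readers who want a runnable version of the prompted workflow above, here is a minimal sketch. It is an illustration rather than part of the upstream README: the checkpoint filename, sample image path, and point coordinates are assumptions to replace with your own, and the `.to("cuda")` call presumes a CUDA-capable GPU.

```
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamPredictor

# Assumed local files -- substitute your own checkpoint and image.
checkpoint_path = "sam_vit_h_4b8939.pth"
image_path = "notebooks/images/truck.jpg"

# Build the default ViT-H model and move it to the GPU.
sam = sam_model_registry["vit_h"](checkpoint=checkpoint_path)
sam.to("cuda")

predictor = SamPredictor(sam)

# SamPredictor expects an HxWx3 uint8 RGB image.
image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
predictor.set_image(image)

# One foreground point prompt: (x, y) coordinates with label 1.
point_coords = np.array([[500, 375]])
point_labels = np.array([1])

masks, scores, logits = predictor.predict(
    point_coords=point_coords,
    point_labels=point_labels,
    multimask_output=True,  # return several candidate masks
)
best_mask = masks[np.argmax(scores)]  # boolean array of shape (H, W)
```

With a single ambiguous point prompt, `multimask_output=True` and picking the highest-scoring candidate, as above, is usually the safer choice.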

69 | ![Notebook: prompting SAM](assets/notebook1.png?raw=true) 70 | ![Notebook: automatic mask generation](assets/notebook2.png?raw=true) 71 |
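The list returned by `SamAutomaticMaskGenerator.generate` contains one dictionary per mask, which makes downstream filtering straightforward. The sketch below is likewise an illustration rather than upstream documentation; the checkpoint and image paths are assumptions, and it runs on CPU.

```
import cv2
import numpy as np
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator

# Assumed local checkpoint for the smaller ViT-B backbone.
sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
mask_generator = SamAutomaticMaskGenerator(sam)

image = cv2.cvtColor(cv2.imread("notebooks/images/dog.jpg"), cv2.COLOR_BGR2RGB)
masks = mask_generator.generate(image)  # list of dicts, one per mask

# Each record carries a boolean 'segmentation' plus metadata such as
# 'area', 'bbox' (XYWH), 'predicted_iou' and 'stability_score'.
large = [m for m in masks if m["area"] > 5000]
label_map = np.zeros(image.shape[:2], dtype=np.uint16)  # 0 = background
for i, m in enumerate(sorted(large, key=lambda m: m["area"], reverse=True), start=1):
    label_map[m["segmentation"]] = i

print(f"{len(masks)} masks generated, {len(large)} kept after the area filter")
```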

72 | 73 | ## ONNX Export 74 | 75 | SAM's lightweight mask decoder can be exported to ONNX format so that it can be run in any environment that supports ONNX runtime, such as in-browser as showcased in the [demo](https://segment-anything.com/demo). Export the model with 76 | 77 | ``` 78 | python scripts/export_onnx_model.py --checkpoint <path/to/checkpoint> --output <path/to/output> 79 | ``` 80 | 81 | See the [example notebook](https://github.com/facebookresearch/segment-anything/blob/main/notebooks/onnx_model_example.ipynb) for details on how to combine image preprocessing via SAM's backbone with mask prediction using the ONNX model. It is recommended to use the latest stable version of PyTorch for ONNX export. 82 | 83 | ## Model Checkpoints 84 | 85 | Three versions of the model are available, each with a different backbone size. These models can be instantiated by running 86 | ``` 87 | from segment_anything import sam_model_registry 88 | sam = sam_model_registry["<model_type>"](checkpoint="<path/to/checkpoint>") 89 | ``` 90 | Click the links below to download the checkpoint for the corresponding model name. The default model in bold can also be instantiated with `build_sam`, as in the examples in [Getting Started](#getting-started). 91 | 92 | * **`default` or `vit_h`: [ViT-H SAM model.](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth)** 93 | * `vit_l`: [ViT-L SAM model.](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth) 94 | * `vit_b`: [ViT-B SAM model.](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth) 95 | 96 | ## License 97 | The model is licensed under the [Apache 2.0 license](LICENSE). 98 | 99 | ## Contributing 100 | 101 | See [contributing](CONTRIBUTING.md) and the [code of conduct](CODE_OF_CONDUCT.md). 102 | 103 | ## Contributors 104 | 105 | The Segment Anything project was made possible with the help of many contributors (alphabetical): 106 | 107 | Aaron Adcock, Vaibhav Aggarwal, Morteza Behrooz, Cheng-Yang Fu, Ashley Gabriel, Ahuva Goldstand, Allen Goodman, Sumanth Gurram, Jiabo Hu, Somya Jain, Devansh Kukreja, Robert Kuo, Joshua Lane, Yanghao Li, Lilian Luong, Jitendra Malik, Mallika Malhotra, William Ngan, Omkar Parkhi, Nikhil Raina, Dirk Rowe, Neil Sejoor, Vanessa Stark, Bala Varadarajan, Bram Wasti, Zachary Winstrom 108 | -------------------------------------------------------------------------------- /segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | 7 | from .build_sam import ( 8 | build_sam, 9 | build_sam_vit_h, 10 | build_sam_vit_l, 11 | build_sam_vit_b, 12 | sam_model_registry, 13 | ) 14 | from .predictor import SamPredictor 15 | from .automatic_mask_generator import SamAutomaticMaskGenerator 16 | -------------------------------------------------------------------------------- /segment_anything/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/__pycache__/automatic_mask_generator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/__pycache__/automatic_mask_generator.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/__pycache__/build_sam.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/__pycache__/build_sam.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/__pycache__/predictor.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/__pycache__/predictor.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/assets/masks1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/masks1.png -------------------------------------------------------------------------------- /segment_anything/assets/masks2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/masks2.jpg -------------------------------------------------------------------------------- /segment_anything/assets/model_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/model_diagram.png -------------------------------------------------------------------------------- /segment_anything/assets/notebook1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/notebook1.png -------------------------------------------------------------------------------- /segment_anything/assets/notebook2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/assets/notebook2.png -------------------------------------------------------------------------------- /segment_anything/build_sam.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam = build_sam_vit_h 25 | 26 | 27 | def build_sam_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_model_registry = { 48 | "default": build_sam, 49 | "vit_h": build_sam, 50 | "vit_l": build_sam_vit_l, 51 | "vit_b": build_sam_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoder( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | ), 99 | pixel_mean=[123.675, 116.28, 103.53], 100 | pixel_std=[58.395, 57.12, 57.375], 101 | ) 102 | sam.eval() 103 | if checkpoint is not None: 104 | with open(checkpoint, "rb") as f: 105 | state_dict = torch.load(f) 106 | sam.load_state_dict(state_dict) 107 | return sam 108 | -------------------------------------------------------------------------------- /segment_anything/linter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | { 5 | black --version | grep -E "23\." > /dev/null 6 | } || { 7 | echo "Linter requires 'black==23.*' !" 8 | exit 1 9 | } 10 | 11 | ISORT_VERSION=$(isort --version-number) 12 | if [[ "$ISORT_VERSION" != 5.12* ]]; then 13 | echo "Linter requires isort==5.12.0 !" 14 | exit 1 15 | fi 16 | 17 | echo "Running isort ..." 18 | isort . --atomic 19 | 20 | echo "Running black ..." 
21 | black -l 100 . 22 | 23 | echo "Running flake8 ..." 24 | if [ -x "$(command -v flake8)" ]; then 25 | flake8 . 26 | else 27 | python3 -m flake8 . 28 | fi 29 | 30 | echo "Running mypy..." 31 | 32 | mypy --exclude 'setup.py|notebooks' . 33 | -------------------------------------------------------------------------------- /segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder import MaskDecoder 10 | from .prompt_encoder import PromptEncoder 11 | from .transformer import TwoWayTransformer 12 | -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/common.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/common.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/image_encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/image_encoder.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/mask_decoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/mask_decoder.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/prompt_encoder.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/prompt_encoder.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/sam.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/sam.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/__pycache__/transformer.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/modeling/__pycache__/transformer.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /segment_anything/notebooks/images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/notebooks/images/dog.jpg -------------------------------------------------------------------------------- /segment_anything/notebooks/images/groceries.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/notebooks/images/groceries.jpg -------------------------------------------------------------------------------- /segment_anything/notebooks/images/truck.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/notebooks/images/truck.jpg -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: segment-anything 3 | Version: 1.0 4 | Provides-Extra: all 5 | Provides-Extra: dev 6 | License-File: LICENSE 7 | -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | setup.cfg 4 | 
setup.py 5 | modeling/__init__.py 6 | modeling/common.py 7 | modeling/image_encoder.py 8 | modeling/mask_decoder.py 9 | modeling/prompt_encoder.py 10 | modeling/sam.py 11 | modeling/transformer.py 12 | segment_anything/__init__.py 13 | segment_anything/automatic_mask_generator.py 14 | segment_anything/build_sam.py 15 | segment_anything/predictor.py 16 | segment_anything.egg-info/PKG-INFO 17 | segment_anything.egg-info/SOURCES.txt 18 | segment_anything.egg-info/dependency_links.txt 19 | segment_anything.egg-info/requires.txt 20 | segment_anything.egg-info/top_level.txt 21 | segment_anything/modeling/__init__.py 22 | segment_anything/modeling/common.py 23 | segment_anything/modeling/image_encoder.py 24 | segment_anything/modeling/mask_decoder.py 25 | segment_anything/modeling/prompt_encoder.py 26 | segment_anything/modeling/sam.py 27 | segment_anything/modeling/transformer.py 28 | segment_anything/utils/__init__.py 29 | segment_anything/utils/amg.py 30 | segment_anything/utils/onnx.py 31 | segment_anything/utils/transforms.py 32 | utils/__init__.py 33 | utils/amg.py 34 | utils/onnx.py 35 | utils/transforms.py -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | 2 | [all] 3 | matplotlib 4 | pycocotools 5 | opencv-python 6 | onnx 7 | onnxruntime 8 | 9 | [dev] 10 | flake8 11 | isort 12 | black 13 | mypy 14 | -------------------------------------------------------------------------------- /segment_anything/segment_anything.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | modeling 2 | segment_anything 3 | utils 4 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .build_sam import ( 8 | build_sam, 9 | build_sam_vit_h, 10 | build_sam_vit_l, 11 | build_sam_vit_b, 12 | sam_model_registry, 13 | ) 14 | from .predictor import SamPredictor 15 | from .automatic_mask_generator import SamAutomaticMaskGenerator 16 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/build_sam.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | 9 | from functools import partial 10 | 11 | from .modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, Sam, TwoWayTransformer 12 | 13 | 14 | def build_sam_vit_h(checkpoint=None): 15 | return _build_sam( 16 | encoder_embed_dim=1280, 17 | encoder_depth=32, 18 | encoder_num_heads=16, 19 | encoder_global_attn_indexes=[7, 15, 23, 31], 20 | checkpoint=checkpoint, 21 | ) 22 | 23 | 24 | build_sam = build_sam_vit_h 25 | 26 | 27 | def build_sam_vit_l(checkpoint=None): 28 | return _build_sam( 29 | encoder_embed_dim=1024, 30 | encoder_depth=24, 31 | encoder_num_heads=16, 32 | encoder_global_attn_indexes=[5, 11, 17, 23], 33 | checkpoint=checkpoint, 34 | ) 35 | 36 | 37 | def build_sam_vit_b(checkpoint=None): 38 | return _build_sam( 39 | encoder_embed_dim=768, 40 | encoder_depth=12, 41 | encoder_num_heads=12, 42 | encoder_global_attn_indexes=[2, 5, 8, 11], 43 | checkpoint=checkpoint, 44 | ) 45 | 46 | 47 | sam_model_registry = { 48 | "default": build_sam, 49 | "vit_h": build_sam, 50 | "vit_l": build_sam_vit_l, 51 | "vit_b": build_sam_vit_b, 52 | } 53 | 54 | 55 | def _build_sam( 56 | encoder_embed_dim, 57 | encoder_depth, 58 | encoder_num_heads, 59 | encoder_global_attn_indexes, 60 | checkpoint=None, 61 | ): 62 | prompt_embed_dim = 256 63 | image_size = 1024 64 | vit_patch_size = 16 65 | image_embedding_size = image_size // vit_patch_size 66 | sam = Sam( 67 | image_encoder=ImageEncoderViT( 68 | depth=encoder_depth, 69 | embed_dim=encoder_embed_dim, 70 | img_size=image_size, 71 | mlp_ratio=4, 72 | norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), 73 | num_heads=encoder_num_heads, 74 | patch_size=vit_patch_size, 75 | qkv_bias=True, 76 | use_rel_pos=True, 77 | global_attn_indexes=encoder_global_attn_indexes, 78 | window_size=14, 79 | out_chans=prompt_embed_dim, 80 | ), 81 | prompt_encoder=PromptEncoder( 82 | embed_dim=prompt_embed_dim, 83 | image_embedding_size=(image_embedding_size, image_embedding_size), 84 | input_image_size=(image_size, image_size), 85 | mask_in_chans=16, 86 | ), 87 | mask_decoder=MaskDecoder( 88 | num_multimask_outputs=3, 89 | transformer=TwoWayTransformer( 90 | depth=2, 91 | embedding_dim=prompt_embed_dim, 92 | mlp_dim=2048, 93 | num_heads=8, 94 | ), 95 | transformer_dim=prompt_embed_dim, 96 | iou_head_depth=3, 97 | iou_head_hidden_dim=256, 98 | ), 99 | pixel_mean=[123.675, 116.28, 103.53], 100 | pixel_std=[58.395, 57.12, 57.375], 101 | ) 102 | sam.eval() 103 | if checkpoint is not None: 104 | with open(checkpoint, "rb") as f: 105 | state_dict = torch.load(f) 106 | sam.load_state_dict(state_dict) 107 | return sam 108 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .sam import Sam 8 | from .image_encoder import ImageEncoderViT 9 | from .mask_decoder import MaskDecoder 10 | from .prompt_encoder import PromptEncoder 11 | from .transformer import TwoWayTransformer 12 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/modeling/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from typing import Type 11 | 12 | 13 | class MLPBlock(nn.Module): 14 | def __init__( 15 | self, 16 | embedding_dim: int, 17 | mlp_dim: int, 18 | act: Type[nn.Module] = nn.GELU, 19 | ) -> None: 20 | super().__init__() 21 | self.lin1 = nn.Linear(embedding_dim, mlp_dim) 22 | self.lin2 = nn.Linear(mlp_dim, embedding_dim) 23 | self.act = act() 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | return self.lin2(self.act(self.lin1(x))) 27 | 28 | 29 | # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa 30 | # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa 31 | class LayerNorm2d(nn.Module): 32 | def __init__(self, num_channels: int, eps: float = 1e-6) -> None: 33 | super().__init__() 34 | self.weight = nn.Parameter(torch.ones(num_channels)) 35 | self.bias = nn.Parameter(torch.zeros(num_channels)) 36 | self.eps = eps 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | u = x.mean(1, keepdim=True) 40 | s = (x - u).pow(2).mean(1, keepdim=True) 41 | x = (x - u) / torch.sqrt(s + self.eps) 42 | x = self.weight[:, None, None] * x + self.bias[:, None, None] 43 | return x 44 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/utils/onnx.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | 11 | from typing import Tuple 12 | 13 | from ..modeling import Sam 14 | from .amg import calculate_stability_score 15 | 16 | 17 | class SamOnnxModel(nn.Module): 18 | """ 19 | This model should not be called directly, but is used in ONNX export. 20 | It combines the prompt encoder, mask decoder, and mask postprocessing of Sam, 21 | with some functions modified to enable model tracing. Also supports extra 22 | options controlling what information. See the ONNX export script for details. 
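(Concretely, return_single_mask, use_stability_score, and return_extra_metrics control which masks and scores are returned.)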
23 | """ 24 | 25 | def __init__( 26 | self, 27 | model: Sam, 28 | return_single_mask: bool, 29 | use_stability_score: bool = False, 30 | return_extra_metrics: bool = False, 31 | ) -> None: 32 | super().__init__() 33 | self.mask_decoder = model.mask_decoder 34 | self.model = model 35 | self.img_size = model.image_encoder.img_size 36 | self.return_single_mask = return_single_mask 37 | self.use_stability_score = use_stability_score 38 | self.stability_score_offset = 1.0 39 | self.return_extra_metrics = return_extra_metrics 40 | 41 | @staticmethod 42 | def resize_longest_image_size( 43 | input_image_size: torch.Tensor, longest_side: int 44 | ) -> torch.Tensor: 45 | input_image_size = input_image_size.to(torch.float32) 46 | scale = longest_side / torch.max(input_image_size) 47 | transformed_size = scale * input_image_size 48 | transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64) 49 | return transformed_size 50 | 51 | def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: 52 | point_coords = point_coords + 0.5 53 | point_coords = point_coords / self.img_size 54 | point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) 55 | point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) 56 | 57 | point_embedding = point_embedding * (point_labels != -1) 58 | point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * ( 59 | point_labels == -1 60 | ) 61 | 62 | for i in range(self.model.prompt_encoder.num_point_embeddings): 63 | point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[ 64 | i 65 | ].weight * (point_labels == i) 66 | 67 | return point_embedding 68 | 69 | def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor: 70 | mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask) 71 | mask_embedding = mask_embedding + ( 72 | 1 - has_mask_input 73 | ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1) 74 | return mask_embedding 75 | 76 | def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor: 77 | masks = F.interpolate( 78 | masks, 79 | size=(self.img_size, self.img_size), 80 | mode="bilinear", 81 | align_corners=False, 82 | ) 83 | 84 | prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size) 85 | masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])] 86 | 87 | orig_im_size = orig_im_size.to(torch.int64) 88 | h, w = orig_im_size[0], orig_im_size[1] 89 | masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False) 90 | return masks 91 | 92 | def select_masks( 93 | self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int 94 | ) -> Tuple[torch.Tensor, torch.Tensor]: 95 | # Determine if we should return the multiclick mask or not from the number of points. 96 | # The reweighting is used to avoid control flow. 
97 | score_reweight = torch.tensor( 98 | [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)] 99 | ).to(iou_preds.device) 100 | score = iou_preds + (num_points - 2.5) * score_reweight 101 | best_idx = torch.argmax(score, dim=1) 102 | masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1) 103 | iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1) 104 | 105 | return masks, iou_preds 106 | 107 | @torch.no_grad() 108 | def forward( 109 | self, 110 | image_embeddings: torch.Tensor, 111 | point_coords: torch.Tensor, 112 | point_labels: torch.Tensor, 113 | mask_input: torch.Tensor, 114 | has_mask_input: torch.Tensor, 115 | orig_im_size: torch.Tensor, 116 | ): 117 | sparse_embedding = self._embed_points(point_coords, point_labels) 118 | dense_embedding = self._embed_masks(mask_input, has_mask_input) 119 | 120 | masks, scores = self.model.mask_decoder.predict_masks( 121 | image_embeddings=image_embeddings, 122 | image_pe=self.model.prompt_encoder.get_dense_pe(), 123 | sparse_prompt_embeddings=sparse_embedding, 124 | dense_prompt_embeddings=dense_embedding, 125 | ) 126 | 127 | if self.use_stability_score: 128 | scores = calculate_stability_score( 129 | masks, self.model.mask_threshold, self.stability_score_offset 130 | ) 131 | 132 | if self.return_single_mask: 133 | masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) 134 | 135 | upscaled_masks = self.mask_postprocessing(masks, orig_im_size) 136 | 137 | if self.return_extra_metrics: 138 | stability_scores = calculate_stability_score( 139 | upscaled_masks, self.model.mask_threshold, self.stability_score_offset 140 | ) 141 | areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1) 142 | return upscaled_masks, scores, stability_scores, areas, masks 143 | 144 | return upscaled_masks, scores, masks 145 | -------------------------------------------------------------------------------- /segment_anything/segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 
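For example, with original_size = (500, 800) and target_length = 1024, get_preprocess_shape returns (640, 1024), so a point (x, y) = (200, 100) maps to (200 * 1024/800, 100 * 640/500) = (256.0, 128.0).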
37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | -------------------------------------------------------------------------------- /segment_anything/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools 6 | skip_glob=*/__init__.py 7 | known_myself=segment_anything 8 | known_third_party=matplotlib,cv2,torch,torchvision,pycocotools,onnx,black,isort 9 | no_lines_before=STDLIB,THIRDPARTY 10 | sections=FUTURE,STDLIB,THIRDPARTY,MYSELF,FIRSTPARTY,LOCALFOLDER 11 | default_section=FIRSTPARTY 12 | -------------------------------------------------------------------------------- /segment_anything/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from setuptools import find_packages, setup 8 | 9 | setup( 10 | name="segment_anything", 11 | version="1.0", 12 | install_requires=[], 13 | packages=find_packages(exclude="notebooks"), 14 | extras_require={ 15 | "all": ["matplotlib", "pycocotools", "opencv-python", "onnx", "onnxruntime"], 16 | "dev": ["flake8", "isort", "black", "mypy"], 17 | }, 18 | ) 19 | -------------------------------------------------------------------------------- /segment_anything/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /segment_anything/utils/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/utils/__pycache__/amg.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/utils/__pycache__/amg.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/utils/__pycache__/transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/segment_anything/utils/__pycache__/transforms.cpython-39.pyc -------------------------------------------------------------------------------- /segment_anything/utils/onnx.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | 11 | from typing import Tuple 12 | 13 | from ..modeling import Sam 14 | from .amg import calculate_stability_score 15 | 16 | 17 | class SamOnnxModel(nn.Module): 18 | """ 19 | This model should not be called directly, but is used in ONNX export. 20 | It combines the prompt encoder, mask decoder, and mask postprocessing of Sam, 21 | with some functions modified to enable model tracing. Also supports extra 22 | options controlling what information. See the ONNX export script for details. 
23 | """ 24 | 25 | def __init__( 26 | self, 27 | model: Sam, 28 | return_single_mask: bool, 29 | use_stability_score: bool = False, 30 | return_extra_metrics: bool = False, 31 | ) -> None: 32 | super().__init__() 33 | self.mask_decoder = model.mask_decoder 34 | self.model = model 35 | self.img_size = model.image_encoder.img_size 36 | self.return_single_mask = return_single_mask 37 | self.use_stability_score = use_stability_score 38 | self.stability_score_offset = 1.0 39 | self.return_extra_metrics = return_extra_metrics 40 | 41 | @staticmethod 42 | def resize_longest_image_size( 43 | input_image_size: torch.Tensor, longest_side: int 44 | ) -> torch.Tensor: 45 | input_image_size = input_image_size.to(torch.float32) 46 | scale = longest_side / torch.max(input_image_size) 47 | transformed_size = scale * input_image_size 48 | transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64) 49 | return transformed_size 50 | 51 | def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: 52 | point_coords = point_coords + 0.5 53 | point_coords = point_coords / self.img_size 54 | point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) 55 | point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) 56 | 57 | point_embedding = point_embedding * (point_labels != -1) 58 | point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * ( 59 | point_labels == -1 60 | ) 61 | 62 | for i in range(self.model.prompt_encoder.num_point_embeddings): 63 | point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[ 64 | i 65 | ].weight * (point_labels == i) 66 | 67 | return point_embedding 68 | 69 | def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor: 70 | mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask) 71 | mask_embedding = mask_embedding + ( 72 | 1 - has_mask_input 73 | ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1) 74 | return mask_embedding 75 | 76 | def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor: 77 | masks = F.interpolate( 78 | masks, 79 | size=(self.img_size, self.img_size), 80 | mode="bilinear", 81 | align_corners=False, 82 | ) 83 | 84 | prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size) 85 | masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])] 86 | 87 | orig_im_size = orig_im_size.to(torch.int64) 88 | h, w = orig_im_size[0], orig_im_size[1] 89 | masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False) 90 | return masks 91 | 92 | def select_masks( 93 | self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int 94 | ) -> Tuple[torch.Tensor, torch.Tensor]: 95 | # Determine if we should return the multiclick mask or not from the number of points. 96 | # The reweighting is used to avoid control flow. 
97 | score_reweight = torch.tensor( 98 | [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)] 99 | ).to(iou_preds.device) 100 | score = iou_preds + (num_points - 2.5) * score_reweight 101 | best_idx = torch.argmax(score, dim=1) 102 | masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1) 103 | iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1) 104 | 105 | return masks, iou_preds 106 | 107 | @torch.no_grad() 108 | def forward( 109 | self, 110 | image_embeddings: torch.Tensor, 111 | point_coords: torch.Tensor, 112 | point_labels: torch.Tensor, 113 | mask_input: torch.Tensor, 114 | has_mask_input: torch.Tensor, 115 | orig_im_size: torch.Tensor, 116 | ): 117 | sparse_embedding = self._embed_points(point_coords, point_labels) 118 | dense_embedding = self._embed_masks(mask_input, has_mask_input) 119 | 120 | masks, scores = self.model.mask_decoder.predict_masks( 121 | image_embeddings=image_embeddings, 122 | image_pe=self.model.prompt_encoder.get_dense_pe(), 123 | sparse_prompt_embeddings=sparse_embedding, 124 | dense_prompt_embeddings=dense_embedding, 125 | ) 126 | 127 | if self.use_stability_score: 128 | scores = calculate_stability_score( 129 | masks, self.model.mask_threshold, self.stability_score_offset 130 | ) 131 | 132 | if self.return_single_mask: 133 | masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) 134 | 135 | upscaled_masks = self.mask_postprocessing(masks, orig_im_size) 136 | 137 | if self.return_extra_metrics: 138 | stability_scores = calculate_stability_score( 139 | upscaled_masks, self.model.mask_threshold, self.stability_score_offset 140 | ) 141 | areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1) 142 | return upscaled_masks, scores, stability_scores, areas, masks 143 | 144 | return upscaled_masks, scores, masks 145 | -------------------------------------------------------------------------------- /segment_anything/utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 
37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | line_length=100 3 | multi_line_output=3 4 | include_trailing_comma=True 5 | known_standard_library=numpy,setuptools 6 | skip_glob=*/__init__.py 7 | known_myself=segment_anything 8 | known_third_party=matplotlib,cv2,torch,torchvision,pycocotools,onnx,black,isort 9 | no_lines_before=STDLIB,THIRDPARTY 10 | sections=FUTURE,STDLIB,THIRDPARTY,MYSELF,FIRSTPARTY,LOCALFOLDER 11 | default_section=FIRSTPARTY 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from setuptools import find_packages, setup 8 | 9 | setup( 10 | name="segment_anything", 11 | version="1.0", 12 | install_requires=[], 13 | packages=find_packages(exclude="notebooks"), 14 | extras_require={ 15 | "all": ["matplotlib", "pycocotools", "opencv-python", "onnx", "onnxruntime"], 16 | "dev": ["flake8", "isort", "black", "mypy"], 17 | }, 18 | ) 19 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | pip install -r requirements.txt 2 | 3 | 4 | wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth 5 | wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth 6 | 7 | wandb login --anonymously -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/amg.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/utils/__pycache__/amg.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gradient-ai/AutoYOLO/e4e8c60d10687fb218f69d0a34eddff2bc92ce0b/utils/__pycache__/transforms.cpython-39.pyc -------------------------------------------------------------------------------- /utils/onnx.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn import functional as F 10 | 11 | from typing import Tuple 12 | 13 | from ..modeling import Sam 14 | from .amg import calculate_stability_score 15 | 16 | 17 | class SamOnnxModel(nn.Module): 18 | """ 19 | This model should not be called directly, but is used in ONNX export. 20 | It combines the prompt encoder, mask decoder, and mask postprocessing of Sam, 21 | with some functions modified to enable model tracing. Also supports extra 22 | options controlling what information. See the ONNX export script for details. 
23 | """ 24 | 25 | def __init__( 26 | self, 27 | model: Sam, 28 | return_single_mask: bool, 29 | use_stability_score: bool = False, 30 | return_extra_metrics: bool = False, 31 | ) -> None: 32 | super().__init__() 33 | self.mask_decoder = model.mask_decoder 34 | self.model = model 35 | self.img_size = model.image_encoder.img_size 36 | self.return_single_mask = return_single_mask 37 | self.use_stability_score = use_stability_score 38 | self.stability_score_offset = 1.0 39 | self.return_extra_metrics = return_extra_metrics 40 | 41 | @staticmethod 42 | def resize_longest_image_size( 43 | input_image_size: torch.Tensor, longest_side: int 44 | ) -> torch.Tensor: 45 | input_image_size = input_image_size.to(torch.float32) 46 | scale = longest_side / torch.max(input_image_size) 47 | transformed_size = scale * input_image_size 48 | transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64) 49 | return transformed_size 50 | 51 | def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor: 52 | point_coords = point_coords + 0.5 53 | point_coords = point_coords / self.img_size 54 | point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords) 55 | point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding) 56 | 57 | point_embedding = point_embedding * (point_labels != -1) 58 | point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * ( 59 | point_labels == -1 60 | ) 61 | 62 | for i in range(self.model.prompt_encoder.num_point_embeddings): 63 | point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[ 64 | i 65 | ].weight * (point_labels == i) 66 | 67 | return point_embedding 68 | 69 | def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor: 70 | mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask) 71 | mask_embedding = mask_embedding + ( 72 | 1 - has_mask_input 73 | ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1) 74 | return mask_embedding 75 | 76 | def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor: 77 | masks = F.interpolate( 78 | masks, 79 | size=(self.img_size, self.img_size), 80 | mode="bilinear", 81 | align_corners=False, 82 | ) 83 | 84 | prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size) 85 | masks = masks[..., : int(prepadded_size[0]), : int(prepadded_size[1])] 86 | 87 | orig_im_size = orig_im_size.to(torch.int64) 88 | h, w = orig_im_size[0], orig_im_size[1] 89 | masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False) 90 | return masks 91 | 92 | def select_masks( 93 | self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int 94 | ) -> Tuple[torch.Tensor, torch.Tensor]: 95 | # Determine if we should return the multiclick mask or not from the number of points. 96 | # The reweighting is used to avoid control flow. 
97 | score_reweight = torch.tensor( 98 | [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)] 99 | ).to(iou_preds.device) 100 | score = iou_preds + (num_points - 2.5) * score_reweight 101 | best_idx = torch.argmax(score, dim=1) 102 | masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1) 103 | iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1) 104 | 105 | return masks, iou_preds 106 | 107 | @torch.no_grad() 108 | def forward( 109 | self, 110 | image_embeddings: torch.Tensor, 111 | point_coords: torch.Tensor, 112 | point_labels: torch.Tensor, 113 | mask_input: torch.Tensor, 114 | has_mask_input: torch.Tensor, 115 | orig_im_size: torch.Tensor, 116 | ): 117 | sparse_embedding = self._embed_points(point_coords, point_labels) 118 | dense_embedding = self._embed_masks(mask_input, has_mask_input) 119 | 120 | masks, scores = self.model.mask_decoder.predict_masks( 121 | image_embeddings=image_embeddings, 122 | image_pe=self.model.prompt_encoder.get_dense_pe(), 123 | sparse_prompt_embeddings=sparse_embedding, 124 | dense_prompt_embeddings=dense_embedding, 125 | ) 126 | 127 | if self.use_stability_score: 128 | scores = calculate_stability_score( 129 | masks, self.model.mask_threshold, self.stability_score_offset 130 | ) 131 | 132 | if self.return_single_mask: 133 | masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) 134 | 135 | upscaled_masks = self.mask_postprocessing(masks, orig_im_size) 136 | 137 | if self.return_extra_metrics: 138 | stability_scores = calculate_stability_score( 139 | upscaled_masks, self.model.mask_threshold, self.stability_score_offset 140 | ) 141 | areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1) 142 | return upscaled_masks, scores, stability_scores, areas, masks 143 | 144 | return upscaled_masks, scores, masks 145 | -------------------------------------------------------------------------------- /utils/transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | from torch.nn import functional as F 10 | from torchvision.transforms.functional import resize, to_pil_image # type: ignore 11 | 12 | from copy import deepcopy 13 | from typing import Tuple 14 | 15 | 16 | class ResizeLongestSide: 17 | """ 18 | Resizes images to longest side 'target_length', as well as provides 19 | methods for resizing coordinates and boxes. Provides methods for 20 | transforming both numpy array and batched torch tensors. 21 | """ 22 | 23 | def __init__(self, target_length: int) -> None: 24 | self.target_length = target_length 25 | 26 | def apply_image(self, image: np.ndarray) -> np.ndarray: 27 | """ 28 | Expects a numpy array with shape HxWxC in uint8 format. 29 | """ 30 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 31 | return np.array(resize(to_pil_image(image), target_size)) 32 | 33 | def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 34 | """ 35 | Expects a numpy array of length 2 in the final dimension. Requires the 36 | original image size in (H, W) format. 
37 | """ 38 | old_h, old_w = original_size 39 | new_h, new_w = self.get_preprocess_shape( 40 | original_size[0], original_size[1], self.target_length 41 | ) 42 | coords = deepcopy(coords).astype(float) 43 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 44 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 45 | return coords 46 | 47 | def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: 48 | """ 49 | Expects a numpy array shape Bx4. Requires the original image size 50 | in (H, W) format. 51 | """ 52 | boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) 53 | return boxes.reshape(-1, 4) 54 | 55 | def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: 56 | """ 57 | Expects batched images with shape BxCxHxW and float format. This 58 | transformation may not exactly match apply_image. apply_image is 59 | the transformation expected by the model. 60 | """ 61 | # Expects an image in BCHW format. May not exactly match apply_image. 62 | target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length) 63 | return F.interpolate( 64 | image, target_size, mode="bilinear", align_corners=False, antialias=True 65 | ) 66 | 67 | def apply_coords_torch( 68 | self, coords: torch.Tensor, original_size: Tuple[int, ...] 69 | ) -> torch.Tensor: 70 | """ 71 | Expects a torch tensor with length 2 in the last dimension. Requires the 72 | original image size in (H, W) format. 73 | """ 74 | old_h, old_w = original_size 75 | new_h, new_w = self.get_preprocess_shape( 76 | original_size[0], original_size[1], self.target_length 77 | ) 78 | coords = deepcopy(coords).to(torch.float) 79 | coords[..., 0] = coords[..., 0] * (new_w / old_w) 80 | coords[..., 1] = coords[..., 1] * (new_h / old_h) 81 | return coords 82 | 83 | def apply_boxes_torch( 84 | self, boxes: torch.Tensor, original_size: Tuple[int, ...] 85 | ) -> torch.Tensor: 86 | """ 87 | Expects a torch tensor with shape Bx4. Requires the original image 88 | size in (H, W) format. 89 | """ 90 | boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) 91 | return boxes.reshape(-1, 4) 92 | 93 | @staticmethod 94 | def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: 95 | """ 96 | Compute the output size given input size and target long side length. 97 | """ 98 | scale = long_side_length * 1.0 / max(oldh, oldw) 99 | newh, neww = oldh * scale, oldw * scale 100 | neww = int(neww + 0.5) 101 | newh = int(newh + 0.5) 102 | return (newh, neww) 103 | --------------------------------------------------------------------------------