├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.PE ├── LICENSE.PLM ├── README.md ├── apps ├── detection │ ├── DETA_pe │ │ ├── README.md │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── coco.py │ │ │ ├── coco_eval.py │ │ │ ├── coco_panoptic.py │ │ │ ├── data_prefetcher.py │ │ │ ├── objects365.py │ │ │ ├── panoptic_eval.py │ │ │ ├── samplers.py │ │ │ ├── torchvision_datasets │ │ │ │ ├── __init__.py │ │ │ │ └── coco.py │ │ │ └── transforms.py │ │ ├── engine.py │ │ ├── engine_tta.py │ │ ├── main.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── assigner.py │ │ │ ├── backbone.py │ │ │ ├── deformable_detr.py │ │ │ ├── deformable_transformer.py │ │ │ ├── matcher.py │ │ │ ├── ops │ │ │ │ ├── functions │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── make.sh │ │ │ │ ├── modules │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── setup.py │ │ │ │ ├── src │ │ │ │ │ ├── cpu │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ │ ├── cuda │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ │ ├── ms_deform_attn.h │ │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ │ ├── pev1.py │ │ │ ├── position_encoding.py │ │ │ ├── segmentation.py │ │ │ ├── swin.py │ │ │ ├── utils_d2.py │ │ │ ├── utils_fed_loss.py │ │ │ └── utils_softnms.py │ │ ├── scripts │ │ │ ├── eval.sh │ │ │ ├── eval_1824pix.sh │ │ │ ├── eval_tta_slurm.sh │ │ │ ├── eval_tta_slurm_1824pix.sh │ │ │ ├── finetune_further_spatial_Gwin384_cocoep3_1824pix_8node.sh │ │ │ ├── finetune_spatial_Gwin384_cocoep12_1728pix_8node.sh │ │ │ ├── pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node.sh │ │ │ └── pretrain_spatial_Gwin384_o365ep12_1024pix_16node.sh │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── box_ops.py │ │ │ ├── ema.py │ │ │ ├── misc.py │ │ │ └── plot_utils.py │ ├── INSTALL.md │ ├── README.md │ ├── detectron2_pe │ │ ├── __init__.py │ │ ├── checkpoint │ │ │ ├── __init__.py │ │ │ └── detection_checkpoint.py │ │ └── modeling │ │ │ ├── __init__.py │ │ │ └── backbone │ │ │ ├── __init__.py │ │ │ └── pev1_det.py │ ├── projects │ │ └── ViTDet │ │ │ └── configs │ │ │ ├── COCO │ │ │ ├── mask_rcnn_PEcore_G_coco75ep.py │ │ │ ├── mask_rcnn_PEspatial_G_coco36ep.py │ │ │ └── mask_rcnn_vitdet_b_100ep.py │ │ │ ├── LVIS │ │ │ ├── mask_rcnn_PEcore_G_lvis75ep.py │ │ │ └── mask_rcnn_PEspatial_G_lvis75ep.py │ │ │ └── common │ │ │ └── coco_loader_lsj.py │ ├── scripts │ │ ├── coco │ │ │ ├── train_mask_rcnn_PEcore_G_coco75ep.sh │ │ │ └── train_mask_rcnn_PEspatial_G_coco36ep.sh │ │ ├── evaluate_local.sh │ │ └── lvis │ │ │ ├── train_mask_rcnn_PEcore_G_lvis75ep.sh │ │ │ └── train_mask_rcnn_PEspatial_G_lvis75ep.sh │ └── tools │ │ ├── convert_d2.py │ │ ├── lazyconfig_train_net_pe.py │ │ └── lazyconfig_train_net_pe_slurm.py ├── pe │ ├── README.md │ ├── clip_benchmark │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── babel_imagenet.py │ │ │ ├── builder.py │ │ │ ├── caltech101.py │ │ │ ├── crossmodal3600.py │ │ │ ├── en_classnames.json │ │ │ ├── en_zeroshot_classification_templates.json │ │ │ ├── flickr.py │ │ │ ├── flickr30k_200.py │ │ │ ├── flores_langs.py │ │ │ ├── imagenetv2.py │ │ │ ├── kitti.py │ │ │ ├── multilingual_mscoco.py │ │ │ ├── objectnet.py │ │ │ ├── pos_neg_caption_dataset.py │ │ │ ├── tfds.py │ │ │ ├── video_classification_dataset.py │ │ │ ├── video_retrieval_dataset.py │ │ │ ├── voc2007.py │ │ │ ├── winoground.py │ │ │ └── xtd200.py │ │ ├── metrics │ │ │ ├── 
__captioning.py │ │ │ ├── __init__.py │ │ │ ├── image_caption_selection.py │ │ │ ├── linear_probe.py │ │ │ ├── multiclass_retrieval.py │ │ │ ├── visualization.py │ │ │ ├── zeroshot_classification.py │ │ │ └── zeroshot_retrieval.py │ │ ├── model_collection.py │ │ ├── tasks │ │ │ └── wds_benchmarks.txt │ │ └── webdataset_builder.py │ └── docs │ │ ├── assets │ │ ├── cat.png │ │ ├── dog.mp4 │ │ ├── dog.png │ │ ├── spatial_correspondence.png │ │ ├── spatial_features.png │ │ └── teaser.png │ │ ├── evaluation.md │ │ └── pe_demo.ipynb └── plm │ ├── README.md │ ├── configs │ ├── datasets.yaml │ ├── stage_1 │ │ ├── plm_1b.yaml │ │ ├── plm_3b.yaml │ │ └── plm_8b.yaml │ ├── stage_2 │ │ ├── plm_1b.yaml │ │ ├── plm_3b.yaml │ │ └── plm_8b.yaml │ └── stage_3 │ │ ├── plm_1b.yaml │ │ ├── plm_3b.yaml │ │ └── plm_8b.yaml │ ├── consolidate.py │ ├── dataset_conf.py │ ├── docs │ ├── evaluation.md │ ├── finetune_example.md │ ├── plm_main_fig.png │ ├── plm_videobench.md │ └── training.md │ ├── generate.py │ ├── interpolate_PE_pos_embed.py │ ├── notebook_demos │ ├── image_and_video_captioning.ipynb │ ├── image_grounding.ipynb │ └── multi_image_understanding.ipynb │ ├── tokenizer.py │ ├── train.py │ └── transformer.py ├── core ├── args.py ├── checkpoint.py ├── data │ ├── conversation.py │ ├── data.py │ ├── data_collators.py │ ├── data_mixer.py │ ├── dataloader.py │ └── preprocessor.py ├── distributed.py ├── logger.py ├── metrics.py ├── optim.py ├── probe.py ├── profiling.py ├── stool.py ├── tests │ ├── Rock-climbing-Canada-1920x1147.jpg │ ├── dataloader_test.py │ ├── llama3_tokenizer_test.py │ ├── ocrbench_centre.jpg │ └── selfie_cathedral_peak.jpg ├── tokenizer.py ├── transformer.py ├── transforms │ ├── image_transform.py │ ├── region_transform.py │ └── video_transform.py ├── utils.py ├── vision_encoder │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── config.py │ ├── pe.py │ ├── rope.py │ ├── tokenizer.py │ └── transforms.py └── vision_projector │ ├── base.py │ └── mlp.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .vscode 3 | *.ipynb 4 | slurm-*.out 5 | wandb 6 | data/* 7 | data-gym-cache/* 8 | torchinductor_*/* 9 | tmp*/* 10 | apps/plm/dummy_datasets 11 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Perception Models 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to Perception Models, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/README.md: -------------------------------------------------------------------------------- 1 | # SOTA COCO Object Detection with PE 2 | 3 | ## Getting started 4 | 5 | Please refer to [INSTALL.md](../INSTALL.md) for installation and dataset preparation instructions. 6 | 7 | Also install [Deformable Attention](models/ops/make.sh) ops. 8 | 9 | ## Results and Fine-tuned Models 10 |
| detector | vision encoder | box AP | box (TTA) AP | download |
| :------: | :------------: | :----: | :----------: | :------: |
| DETA     | PE spatial G   | 65.2   | 66.0         | model    |
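The Deformable Attention extension referenced under Getting started has to be compiled before any of the training or evaluation commands below will run. A minimal sketch of the build, following `models/ops/make.sh` and the optional check in `models/ops/test.py` (the `cd` assumes you are inside `apps/detection/DETA_pe` with a CUDA-enabled PyTorch install):

```
# Build and install the MultiScaleDeformableAttention extension
# (this is what models/ops/make.sh runs via models/ops/setup.py).
cd models/ops
python setup.py build install

# Optional sanity check: compares the CUDA kernels against the pure-PyTorch
# reference (ms_deform_attn_core_pytorch) and runs numerical gradient checks.
python test.py
```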
28 | 29 | 30 | ## Training 31 | We apply a four-stage training, Objects365(12ep, 1024pix), Objects365(6ep, 1536pix), COCO(12ep, 1728pix), COCO(3ep, 1824pix) 32 | 33 | ``` 34 | sbatch scripts/pretrain_spatial_Gwin384_o365ep12_1024pix_16node.sh 35 | 36 | sbatch scripts/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node.sh 37 | 38 | sbatch scripts/finetune_spatial_Gwin384_cocoep12_1728pix_8node.sh 39 | 40 | sbatch scripts/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node.sh 41 | 42 | ``` 43 | 44 | ## Evaluation 45 | ``` 46 | bash scripts/eval_1824pix.sh --resume deta_coco_1824pix.pth 47 | ``` 48 | 49 | ## Evaluation with TTA (Test-Time Augmentation) 50 | ``` 51 | sbatch scripts/eval_tta_slurm_1824pix.sh --resume deta_coco_1824pix.pth 52 | ``` 53 | Note: If you get 65.9 AP, it is probably caused by different package versions, trying different hyperparameters like `--quad_scale 0.4` will give 66.0 AP. 54 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import torch.utils.data 11 | 12 | from .coco import build as build_coco 13 | from .objects365 import build as build_objects365 14 | from .torchvision_datasets import CocoDetection 15 | 16 | 17 | def get_coco_api_from_dataset(dataset): 18 | for _ in range(10): 19 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 20 | # break 21 | if isinstance(dataset, torch.utils.data.Subset): 22 | dataset = dataset.dataset 23 | if isinstance(dataset, CocoDetection): 24 | return dataset.coco 25 | 26 | 27 | def build_dataset(image_set, args): 28 | if args.dataset_file == "objects365": 29 | return build_objects365(image_set, args) 30 | if args.dataset_file == "coco": 31 | return build_coco(image_set, args) 32 | if args.dataset_file == "coco_panoptic": 33 | # to avoid making panopticapi required for coco 34 | from .coco_panoptic import build as build_coco_panoptic 35 | 36 | return build_coco_panoptic(image_set, args) 37 | raise ValueError(f"dataset {args.dataset_file} not supported") 38 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/coco_panoptic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import torch 15 | from PIL import Image 16 | 17 | from panopticapi.utils import rgb2id 18 | from util.box_ops import masks_to_boxes 19 | 20 | from .coco import make_coco_transforms 21 | 22 | 23 | class CocoPanoptic: 24 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): 25 | with open(ann_file, 'r') as f: 26 | self.coco = json.load(f) 27 | 28 | # sort 'images' field so that they are aligned with 'annotations' 29 | # i.e., in alphabetical order 30 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) 31 | # sanity check 32 | if "annotations" in self.coco: 33 | for img, ann in zip(self.coco['images'], self.coco['annotations']): 34 | assert img['file_name'][:-4] == ann['file_name'][:-4] 35 | 36 | self.img_folder = img_folder 37 | self.ann_folder = ann_folder 38 | self.ann_file = ann_file 39 | self.transforms = transforms 40 | self.return_masks = return_masks 41 | 42 | def __getitem__(self, idx): 43 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] 44 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') 45 | ann_path = Path(self.ann_folder) / ann_info['file_name'] 46 | 47 | img = Image.open(img_path).convert('RGB') 48 | w, h = img.size 49 | if "segments_info" in ann_info: 50 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32) 51 | masks = rgb2id(masks) 52 | 53 | ids = np.array([ann['id'] for ann in ann_info['segments_info']]) 54 | masks = masks == ids[:, None, None] 55 | 56 | masks = torch.as_tensor(masks, dtype=torch.uint8) 57 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) 58 | 59 | target = {} 60 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) 61 | if self.return_masks: 62 | target['masks'] = masks 63 | target['labels'] = labels 64 | 65 | target["boxes"] = masks_to_boxes(masks) 66 | 67 | target['size'] = torch.as_tensor([int(h), int(w)]) 68 | target['orig_size'] = torch.as_tensor([int(h), int(w)]) 69 | if "segments_info" in ann_info: 70 | for name in ['iscrowd', 'area']: 71 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) 72 | 73 | if self.transforms is not None: 74 | img, target = self.transforms(img, target) 75 | 76 | return img, target 77 | 78 | def __len__(self): 79 | return len(self.coco['images']) 80 | 81 | def get_height_and_width(self, idx): 82 | img_info = self.coco['images'][idx] 83 | height = img_info['height'] 84 | width = img_info['width'] 85 | return height, width 86 | 87 | 88 | def build(image_set, args): 89 | img_folder_root = Path(args.coco_path) 90 | ann_folder_root = Path(args.coco_panoptic_path) 91 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' 92 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' 93 | mode = 'panoptic' 94 | PATHS = { 95 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), 96 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), 97 | } 98 | 99 | img_folder, ann_file = PATHS[image_set] 100 | img_folder_path = img_folder_root / img_folder 101 | ann_folder = ann_folder_root / f'{mode}_{img_folder}' 102 | ann_file = ann_folder_root / ann_file 103 | 104 | dataset = 
CocoPanoptic(img_folder_path, ann_folder, ann_file, 105 | transforms=make_coco_transforms(image_set), return_masks=args.masks) 106 | 107 | return dataset 108 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | 9 | def to_cuda(samples, targets, device): 10 | samples = samples.to(device, non_blocking=True) 11 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 12 | return samples, targets 13 | 14 | class data_prefetcher(): 15 | def __init__(self, loader, device, prefetch=True): 16 | self.loader = iter(loader) 17 | self.prefetch = prefetch 18 | self.device = device 19 | if prefetch: 20 | self.stream = torch.cuda.Stream() 21 | self.preload() 22 | 23 | def preload(self): 24 | try: 25 | self.next_samples, self.next_targets = next(self.loader) 26 | except StopIteration: 27 | self.next_samples = None 28 | self.next_targets = None 29 | return 30 | # if record_stream() doesn't work, another option is to make sure device inputs are created 31 | # on the main stream. 32 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 33 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 34 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 35 | # at the time we start copying to next_*: 36 | # self.stream.wait_stream(torch.cuda.current_stream()) 37 | with torch.cuda.stream(self.stream): 38 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 39 | # more code for the alternative if record_stream() doesn't work: 40 | # copy_ will record the use of the pinned source tensor in this side stream. 41 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 42 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 43 | # self.next_input = self.next_input_gpu 44 | # self.next_target = self.next_target_gpu 45 | 46 | # With Amp, it isn't necessary to manually convert data to half. 47 | # if args.fp16: 48 | # self.next_input = self.next_input.half() 49 | # else: 50 | 51 | def next(self): 52 | if self.prefetch: 53 | torch.cuda.current_stream().wait_stream(self.stream) 54 | samples = self.next_samples 55 | targets = self.next_targets 56 | if samples is not None: 57 | samples.record_stream(torch.cuda.current_stream()) 58 | if targets is not None: 59 | for t in targets: 60 | for k, v in t.items(): 61 | v.record_stream(torch.cuda.current_stream()) 62 | self.preload() 63 | else: 64 | try: 65 | samples, targets = next(self.loader) 66 | samples, targets = to_cuda(samples, targets, self.device) 67 | except StopIteration: 68 | samples = None 69 | targets = None 70 | return samples, targets 71 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/objects365.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. 
All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | COCO dataset which returns image_id for evaluation. 12 | 13 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 14 | """ 15 | from pathlib import Path 16 | 17 | import datasets.transforms as T 18 | 19 | import torch 20 | import torch.utils.data 21 | from pycocotools import mask as coco_mask 22 | from util.misc import get_local_rank, get_local_size 23 | 24 | from .coco import CocoDetection, make_coco_transforms, make_coco_transforms_lsj 25 | from .torchvision_datasets import CocoDetection as TvCocoDetection 26 | 27 | 28 | def build(image_set, args): 29 | root = Path(args.coco_path) 30 | assert root.exists(), f"provided Objects365 path {root} does not exist" 31 | mode = "instances" 32 | PATHS = { 33 | "train": ( 34 | root / "train", 35 | root / "annotations" / "zhiyuan_objv2_train_fixmiss.json", 36 | ), 37 | "val": (root / "val", root / "annotations" / "zhiyuan_objv2_val.json"), 38 | } 39 | 40 | img_folder, ann_file = PATHS[image_set] 41 | if args.lsj: 42 | coco_transform = make_coco_transforms_lsj(image_set, args.lsj_img_size) 43 | else: 44 | coco_transform = make_coco_transforms(image_set, args.bigger) 45 | dataset = CocoDetection( 46 | img_folder, 47 | ann_file, 48 | transforms=coco_transform, 49 | return_masks=args.masks, 50 | cache_mode=args.cache_mode, 51 | local_rank=get_local_rank(), 52 | local_size=get_local_size(), 53 | ) 54 | return dataset 55 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/panoptic_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | import os 12 | 13 | import util.misc as utils 14 | 15 | try: 16 | from panopticapi.evaluation import pq_compute 17 | except ImportError: 18 | pass 19 | 20 | 21 | class PanopticEvaluator(object): 22 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 23 | self.gt_json = ann_file 24 | self.gt_folder = ann_folder 25 | if utils.is_main_process(): 26 | if not os.path.exists(output_dir): 27 | os.mkdir(output_dir) 28 | self.output_dir = output_dir 29 | self.predictions = [] 30 | 31 | def update(self, predictions): 32 | for p in predictions: 33 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 34 | f.write(p.pop("png_string")) 35 | 36 | self.predictions += predictions 37 | 38 | def synchronize_between_processes(self): 39 | all_predictions = utils.all_gather(self.predictions) 40 | merged_predictions = [] 41 | for p in all_predictions: 42 | merged_predictions += p 43 | self.predictions = merged_predictions 44 | 45 | def summarize(self): 46 | if utils.is_main_process(): 47 | json_data = {"annotations": self.predictions} 48 | predictions_json = os.path.join(self.output_dir, "predictions.json") 49 | with open(predictions_json, "w") as f: 50 | f.write(json.dumps(json_data)) 51 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 52 | return None 53 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/torchvision_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | from .coco import CocoDetection 8 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/torchvision_datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from torchvision 7 | # ------------------------------------------------------------------------ 8 | 9 | """ 10 | Copy-Paste from torchvision, but add utility of caching images on memory 11 | """ 12 | from torchvision.datasets.vision import VisionDataset 13 | from PIL import Image 14 | import os 15 | import os.path 16 | import tqdm 17 | from io import BytesIO 18 | 19 | 20 | class CocoDetection(VisionDataset): 21 | """`MS Coco Detection `_ Dataset. 22 | Args: 23 | root (string): Root directory where images are downloaded to. 24 | annFile (string): Path to json annotation file. 25 | transform (callable, optional): A function/transform that takes in an PIL image 26 | and returns a transformed version. E.g, ``transforms.ToTensor`` 27 | target_transform (callable, optional): A function/transform that takes in the 28 | target and transforms it. 
29 | transforms (callable, optional): A function/transform that takes input sample and its target as entry 30 | and returns a transformed version. 31 | """ 32 | 33 | def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None, 34 | cache_mode=False, local_rank=0, local_size=1): 35 | super(CocoDetection, self).__init__(root, transforms, transform, target_transform) 36 | from pycocotools.coco import COCO 37 | self.coco = COCO(annFile) 38 | self.ids = list(sorted(self.coco.imgs.keys())) 39 | self.cache_mode = cache_mode 40 | self.local_rank = local_rank 41 | self.local_size = local_size 42 | if cache_mode: 43 | self.cache = {} 44 | self.cache_images() 45 | 46 | def cache_images(self): 47 | self.cache = {} 48 | for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids): 49 | if index % self.local_size != self.local_rank: 50 | continue 51 | path = self.coco.loadImgs(img_id)[0]['file_name'] 52 | with open(os.path.join(self.root, path), 'rb') as f: 53 | self.cache[path] = f.read() 54 | 55 | def get_image(self, path): 56 | if self.cache_mode: 57 | if path not in self.cache.keys(): 58 | with open(os.path.join(self.root, path), 'rb') as f: 59 | self.cache[path] = f.read() 60 | return Image.open(BytesIO(self.cache[path])).convert('RGB') 61 | return Image.open(os.path.join(self.root, path)).convert('RGB') 62 | 63 | def __getitem__(self, index): 64 | """ 65 | Args: 66 | index (int): Index 67 | Returns: 68 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 69 | """ 70 | coco = self.coco 71 | img_id = self.ids[index] 72 | ann_ids = coco.getAnnIds(imgIds=img_id) 73 | target = coco.loadAnns(ann_ids) 74 | 75 | path = coco.loadImgs(img_id)[0]['file_name'] 76 | 77 | img = self.get_image(path) 78 | if self.transforms is not None: 79 | img, target = self.transforms(img, target) 80 | 81 | return img, target 82 | 83 | def __len__(self): 84 | return len(self.ids) 85 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | from .deformable_detr import build 11 | 12 | 13 | def build_model(args): 14 | return build(args) 15 | 16 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import ms_deform_attn_core_pytorch, MSDeformAttnFunction 10 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import, division, print_function 10 | 11 | import MultiScaleDeformableAttention as MSDA 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | 19 | class MSDeformAttnFunction(Function): 20 | @staticmethod 21 | def forward( 22 | ctx, 23 | value, 24 | value_spatial_shapes, 25 | value_level_start_index, 26 | sampling_locations, 27 | attention_weights, 28 | im2col_step, 29 | ): 30 | ctx.im2col_step = im2col_step 31 | output = MSDA.ms_deform_attn_forward( 32 | value, 33 | value_spatial_shapes, 34 | value_level_start_index, 35 | sampling_locations, 36 | attention_weights, 37 | ctx.im2col_step, 38 | ) 39 | ctx.save_for_backward( 40 | value, 41 | value_spatial_shapes, 42 | value_level_start_index, 43 | sampling_locations, 44 | attention_weights, 45 | ) 46 | return output 47 | 48 | @staticmethod 49 | @once_differentiable 50 | def backward(ctx, grad_output): 51 | ( 52 | value, 53 | value_spatial_shapes, 54 | value_level_start_index, 55 | sampling_locations, 56 | attention_weights, 57 | ) = ctx.saved_tensors 58 | grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward( 59 | value, 60 | value_spatial_shapes, 61 | value_level_start_index, 62 | sampling_locations, 63 | attention_weights, 64 | grad_output, 65 | ctx.im2col_step, 66 | ) 67 | 68 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 69 | 70 | 71 | def ms_deform_attn_core_pytorch( 72 | value, value_spatial_shapes, sampling_locations, attention_weights 73 | ): 74 | # for debug and test only, 75 | # need to use cuda version instead 76 | N_, S_, M_, D_ = value.shape 77 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 78 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 79 | sampling_grids = 2 * sampling_locations - 1 80 | sampling_value_list = [] 81 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 82 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 83 | value_l_ = ( 84 | value_list[lid_].flatten(2).transpose(1, 2).reshape(N_ * M_, D_, H_, W_) 85 | ) 86 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 
2 87 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 88 | # N_*M_, D_, Lq_, P_ 89 | sampling_value_l_ = F.grid_sample( 90 | value_l_, 91 | sampling_grid_l_, 92 | mode="bilinear", 93 | padding_mode="zeros", 94 | align_corners=False, 95 | ) 96 | sampling_value_list.append(sampling_value_l_) 97 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 98 | attention_weights = attention_weights.transpose(1, 2).reshape( 99 | N_ * M_, 1, Lq_, L_ * P_ 100 | ) 101 | output = ( 102 | (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) 103 | .sum(-1) 104 | .view(N_, M_ * D_, Lq_) 105 | ) 106 | return output.transpose(1, 2).contiguous() 107 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python setup.py build install 11 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, 
L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Various positional encodings for the transformer. 12 | """ 13 | import math 14 | import torch 15 | from torch import nn 16 | 17 | from util.misc import NestedTensor 18 | 19 | 20 | class PositionEmbeddingSine(nn.Module): 21 | """ 22 | This is a more standard version of the position embedding, very similar to the one 23 | used by the Attention is all you need paper, generalized to work on images. 
24 | """ 25 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 26 | super().__init__() 27 | self.num_pos_feats = num_pos_feats 28 | self.temperature = temperature 29 | self.normalize = normalize 30 | if scale is not None and normalize is False: 31 | raise ValueError("normalize should be True if scale is passed") 32 | if scale is None: 33 | scale = 2 * math.pi 34 | self.scale = scale 35 | 36 | def forward(self, tensor_list: NestedTensor): 37 | x = tensor_list.tensors 38 | mask = tensor_list.mask 39 | assert mask is not None 40 | not_mask = ~mask 41 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 42 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 43 | if self.normalize: 44 | eps = 1e-6 45 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 46 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 47 | 48 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 49 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 50 | 51 | pos_x = x_embed[:, :, :, None] / dim_t 52 | pos_y = y_embed[:, :, :, None] / dim_t 53 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 54 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 55 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 56 | return pos 57 | 58 | 59 | class PositionEmbeddingLearned(nn.Module): 60 | """ 61 | Absolute pos embedding, learned. 62 | """ 63 | def __init__(self, num_pos_feats=256): 64 | super().__init__() 65 | self.row_embed = nn.Embedding(50, num_pos_feats) 66 | self.col_embed = nn.Embedding(50, num_pos_feats) 67 | self.reset_parameters() 68 | 69 | def reset_parameters(self): 70 | nn.init.uniform_(self.row_embed.weight) 71 | nn.init.uniform_(self.col_embed.weight) 72 | 73 | def forward(self, tensor_list: NestedTensor): 74 | x = tensor_list.tensors 75 | h, w = x.shape[-2:] 76 | i = torch.arange(w, device=x.device) 77 | j = torch.arange(h, device=x.device) 78 | x_emb = self.col_embed(i) 79 | y_emb = self.row_embed(j) 80 | pos = torch.cat([ 81 | x_emb.unsqueeze(0).repeat(h, 1, 1), 82 | y_emb.unsqueeze(1).repeat(1, w, 1), 83 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 84 | return pos 85 | 86 | 87 | def build_position_encoding(args): 88 | N_steps = args.hidden_dim // 2 89 | if args.position_embedding in ('v2', 'sine'): 90 | # TODO find a better way of exposing other arguments 91 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 92 | elif args.position_embedding in ('v3', 'learned'): 93 | position_embedding = PositionEmbeddingLearned(N_steps) 94 | else: 95 | raise ValueError(f"not supported {args.position_embedding}") 96 | 97 | return position_embedding 98 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/eval.sh: -------------------------------------------------------------------------------- 1 | 2 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/eval" 3 | 4 | 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | --master_port=12345 --use_env main.py \ 7 | --output_dir ${EXP_DIR} \ 8 | --with_box_refine --two_stage \ 9 | --num_feature_levels 5 --num_queries 900 \ 10 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 11 | --assign_first_stage --assign_second_stage \ 12 | --epochs 24 --lr_drop 20 \ 13 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 14 | --backbone pev1 \ 15 | 
--backbone_size Gwin384 \ 16 | --backbone_init_values 0.1 \ 17 | --backbone_tile_posemb True \ 18 | --backbone_lrd 0.9 --backbone_layers 50 \ 19 | --num_workers 4 \ 20 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 21 | --lsj --lsj_img_size 1728 \ 22 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 23 | --eval \ 24 | --resume /checkpoint/vision_encoder/d2_output/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/checkpoint.pth \ 25 | "$@" 26 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/eval_1824pix.sh: -------------------------------------------------------------------------------- 1 | 2 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/eval" 3 | 4 | 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | --master_port=12345 --use_env main.py \ 7 | --output_dir ${EXP_DIR} \ 8 | --with_box_refine --two_stage \ 9 | --num_feature_levels 5 --num_queries 900 \ 10 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 11 | --assign_first_stage --assign_second_stage \ 12 | --epochs 24 --lr_drop 20 \ 13 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 14 | --backbone pev1 \ 15 | --backbone_size Gwin384 \ 16 | --backbone_init_values 0.1 \ 17 | --backbone_tile_posemb True \ 18 | --backbone_lrd 0.9 --backbone_layers 50 \ 19 | --num_workers 4 \ 20 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 21 | --lsj --lsj_img_size 1824 \ 22 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 23 | --eval \ 24 | --resume /checkpoint/vision_encoder/d2_output/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node/checkpoint.pth \ 25 | "$@" 26 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/eval_tta_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder_high 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/eval_tta_slurm/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/eval_tta_slurm/%j.err 13 | #SBATCH --time=23:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/eval_tta_slurm" 28 | 29 | 30 | # srun \ 31 | # torchrun \ 32 | srun \ 33 | python -m torch.distributed.run \ 34 | --nnodes 8 \ 35 | --nproc_per_node 8 \ 36 | --rdzv_id $RANDOM \ 37 | --rdzv_endpoint "${my_array[0]}:29500" \ 38 | --rdzv_backend c10d \ 39 | main.py \ 40 | --output_dir ${EXP_DIR} \ 41 | --with_box_refine --two_stage \ 42 | --num_feature_levels 5 --num_queries 2000 \ 43 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 44 | --assign_first_stage --assign_second_stage \ 45 | --epochs 12 --lr_drop 10 \ 46 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 47 | --backbone pev1 \ 48 | --backbone_size Gwin384 \ 49 | --backbone_init_values 0.1 \ 50 | 
--backbone_tile_posemb True \ 51 | --backbone_lrd 0.9 --backbone_layers 50 \ 52 | --num_workers 4 \ 53 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 54 | --lsj --lsj_img_size 1728 \ 55 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 56 | --eval \ 57 | --resume /checkpoint/vision_encoder/d2_output/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/checkpoint.pth \ 58 | --soft_nms \ 59 | --tta \ 60 | "$@" 61 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/eval_tta_slurm_1824pix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder_high 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/eval_tta_slurm/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/eval_tta_slurm/%j.err 13 | #SBATCH --time=23:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/eval_tta_slurm" 28 | 29 | 30 | # srun \ 31 | # torchrun \ 32 | srun \ 33 | python -m torch.distributed.run \ 34 | --nnodes 8 \ 35 | --nproc_per_node 8 \ 36 | --rdzv_id $RANDOM \ 37 | --rdzv_endpoint "${my_array[0]}:29500" \ 38 | --rdzv_backend c10d \ 39 | main.py \ 40 | --output_dir ${EXP_DIR} \ 41 | --with_box_refine --two_stage \ 42 | --num_feature_levels 5 --num_queries 2000 \ 43 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 44 | --assign_first_stage --assign_second_stage \ 45 | --epochs 12 --lr_drop 10 \ 46 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 47 | --backbone pev1 \ 48 | --backbone_size Gwin384 \ 49 | --backbone_init_values 0.1 \ 50 | --backbone_tile_posemb True \ 51 | --backbone_lrd 0.9 --backbone_layers 50 \ 52 | --num_workers 4 \ 53 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 54 | --lsj --lsj_img_size 1824 \ 55 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 56 | --eval \ 57 | --resume /checkpoint/vision_encoder/d2_output/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node/checkpoint.pth \ 58 | --soft_nms \ 59 | --tta \ 60 | "$@" 61 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node/%j.err 13 | #SBATCH --time=96:00:00 14 | 
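# Resolve the head node and its IP address from the Slurm allocation; every rank
# launched below joins the same torchrun c10d rendezvous endpoint at ${head_node_ip}:29500.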
15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node" 28 | 29 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True 30 | 31 | srun \ 32 | torchrun \ 33 | --nnodes 8 \ 34 | --nproc_per_node 8 \ 35 | --rdzv_id $RANDOM \ 36 | --rdzv_endpoint "${my_array[0]}:29500" \ 37 | --rdzv_backend c10d \ 38 | main.py \ 39 | --output_dir ${EXP_DIR} \ 40 | --with_box_refine --two_stage \ 41 | --num_feature_levels 5 --num_queries 900 \ 42 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 43 | --assign_first_stage --assign_second_stage \ 44 | --epochs 3 --lr_drop 2 \ 45 | --batch_size 1 \ 46 | --backbone pev1 \ 47 | --backbone_size Gwin384 \ 48 | --backbone_init_values 0.1 \ 49 | --backbone_tile_posemb True \ 50 | --backbone_lrd 0.9 --backbone_layers 50 \ 51 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 52 | --finetune /checkpoint/vision_encoder/d2_output/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/checkpoint.pth \ 53 | --lsj --lsj_img_size 1824 \ 54 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 55 | --eval_per_epochs 1 \ 56 | --save_per_epochs 1 \ 57 | --auto_resume \ 58 | --keep_class_embed \ 59 | --bf16 \ 60 | --backbone_dp 0.0 \ 61 | --sgd \ 62 | --lr 5e-5 --lr_backbone 5e-5 \ 63 | "$@" 64 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/finetune_spatial_Gwin384_cocoep12_1728pix_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node" 28 | 29 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True 30 | 31 | srun \ 32 | torchrun \ 33 | --nnodes 8 \ 34 | --nproc_per_node 8 \ 35 | --rdzv_id $RANDOM \ 36 | --rdzv_endpoint "${my_array[0]}:29500" \ 37 | --rdzv_backend c10d \ 38 | main.py \ 39 | --output_dir ${EXP_DIR} \ 40 | --with_box_refine --two_stage \ 41 | --num_feature_levels 5 --num_queries 900 \ 42 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 43 | --assign_first_stage --assign_second_stage \ 44 | --epochs 12 --lr_drop 10 \ 45 | --lr 5e-5 
--lr_backbone 5e-5 --batch_size 1 \ 46 | --backbone pev1 \ 47 | --backbone_size Gwin384 \ 48 | --backbone_init_values 0.1 \ 49 | --backbone_tile_posemb True \ 50 | --backbone_lrd 0.9 --backbone_layers 50 \ 51 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 52 | --finetune /checkpoint/vision_encoder/d2_output/coco_sota/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node/checkpoint.pth \ 53 | --lsj --lsj_img_size 1728 \ 54 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 55 | --eval_per_epochs 1 \ 56 | --save_per_epochs 1 \ 57 | --auto_resume \ 58 | --bf16 \ 59 | --backbone_dp 0.4 \ 60 | "$@" 61 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=16 7 | #SBATCH --ntasks=16 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node" 28 | 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 16 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | main.py \ 38 | --output_dir ${EXP_DIR} \ 39 | --with_box_refine --two_stage \ 40 | --num_feature_levels 5 --num_queries 900 \ 41 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 42 | --assign_first_stage --assign_second_stage \ 43 | --epochs 6 --lr_drop 4 \ 44 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 45 | --backbone pev1 \ 46 | --backbone_size Gwin384 \ 47 | --backbone_init_values 0.1 \ 48 | --backbone_tile_posemb True \ 49 | --backbone_lrd 0.9 --backbone_layers 50 \ 50 | --dataset_file objects365 \ 51 | --coco_path /checkpoint/vision_encoder/public_data/objects365_v2 \ 52 | --finetune /checkpoint/vision_encoder/d2_output/coco_sota/pretrain_spatial_Gwin384_o365ep12_1024pix_16node/checkpoint.pth \ 53 | --lsj --lsj_img_size 1536 \ 54 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 55 | --eval_per_epochs 1 \ 56 | --save_per_epochs 1 \ 57 | --auto_resume \ 58 | "$@" 59 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/pretrain_spatial_Gwin384_o365ep12_1024pix_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=16 7 | #SBATCH --ntasks=16 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | 
#SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/pretrain_spatial_Gwin384_o365ep12_1024pix_16node/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/pretrain_spatial_Gwin384_o365ep12_1024pix_16node/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | 28 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/pretrain_spatial_Gwin384_o365ep12_1024pix_16node" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 16 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | main.py \ 38 | --output_dir ${EXP_DIR} \ 39 | --with_box_refine --two_stage \ 40 | --num_feature_levels 5 --num_queries 900 \ 41 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 42 | --assign_first_stage --assign_second_stage \ 43 | --epochs 12 --lr_drop 10 \ 44 | --lr_backbone 2e-4 \ 45 | --backbone pev1 \ 46 | --backbone_size Gwin384 \ 47 | --backbone_path /checkpoint/vision_encoder/pev1/pe_spatial_G14_448_16patch384pix.pth \ 48 | --backbone_init_values 0.1 \ 49 | --backbone_tile_posemb True \ 50 | --backbone_lrd 0.9 --backbone_layers 50 \ 51 | --dataset_file objects365 \ 52 | --coco_path /checkpoint/vision_encoder/public_data/objects365_v2 \ 53 | --lsj --lsj_img_size 1024 \ 54 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 55 | --eval_per_epochs 2 \ 56 | --save_per_epochs 1 \ 57 | --auto_resume \ 58 | "$@" 59 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Utilities for bounding box manipulation and GIoU. 
12 | """ 13 | import torch 14 | from torchvision.ops.boxes import box_area 15 | 16 | 17 | def box_cxcywh_to_xyxy(x): 18 | x_c, y_c, w, h = x.unbind(-1) 19 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 20 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 21 | return torch.stack(b, dim=-1) 22 | 23 | 24 | def box_xyxy_to_cxcywh(x): 25 | x0, y0, x1, y1 = x.unbind(-1) 26 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 27 | (x1 - x0), (y1 - y0)] 28 | return torch.stack(b, dim=-1) 29 | 30 | 31 | # modified from torchvision to also return the union 32 | def box_iou(boxes1, boxes2): 33 | area1 = box_area(boxes1) 34 | area2 = box_area(boxes2) 35 | 36 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 37 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 38 | 39 | wh = (rb - lt).clamp(min=0) # [N,M,2] 40 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 41 | 42 | union = area1[:, None] + area2 - inter 43 | 44 | iou = inter / union 45 | return iou, union 46 | 47 | 48 | def generalized_box_iou(boxes1, boxes2): 49 | """ 50 | Generalized IoU from https://giou.stanford.edu/ 51 | 52 | The boxes should be in [x0, y0, x1, y1] format 53 | 54 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 55 | and M = len(boxes2) 56 | """ 57 | # degenerate boxes gives inf / nan results 58 | # so do an early check 59 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 60 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 61 | iou, union = box_iou(boxes1, boxes2) 62 | 63 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 64 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 65 | 66 | wh = (rb - lt).clamp(min=0) # [N,M,2] 67 | area = wh[:, :, 0] * wh[:, :, 1] 68 | 69 | return iou - (area - union) / area 70 | 71 | 72 | def masks_to_boxes(masks): 73 | """Compute the bounding boxes around the provided masks 74 | 75 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 76 | 77 | Returns a [N, 4] tensors, with the boxes in xyxy format 78 | """ 79 | if masks.numel() == 0: 80 | return torch.zeros((0, 4), device=masks.device) 81 | 82 | h, w = masks.shape[-2:] 83 | 84 | y = torch.arange(0, h, dtype=torch.float) 85 | x = torch.arange(0, w, dtype=torch.float) 86 | y, x = torch.meshgrid(y, x) 87 | 88 | x_mask = (masks * x.unsqueeze(0)) 89 | x_max = x_mask.flatten(1).max(-1)[0] 90 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 91 | 92 | y_mask = (masks * y.unsqueeze(0)) 93 | y_max = y_mask.flatten(1).max(-1)[0] 94 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 95 | 96 | return torch.stack([x_min, y_min, x_max, y_max], 1) 97 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/util/ema.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | 5 | 6 | @torch.no_grad() 7 | def update_ema(ema_model, model, decay=0.9999): 8 | """ 9 | Step the EMA model towards the current model. 10 | """ 11 | ema_params = OrderedDict(ema_model.named_parameters()) 12 | model_params = OrderedDict(model.named_parameters()) 13 | 14 | for name, param in model_params.items(): 15 | # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed 16 | ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay) 17 | 18 | 19 | def requires_grad(model, flag=True): 20 | """ 21 | Set requires_grad flag for all parameters in a model. 
22 | """ 23 | for p in model.parameters(): 24 | p.requires_grad = flag 25 | -------------------------------------------------------------------------------- /apps/detection/INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | Follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html) 3 | 4 | ## Dataset 5 | Prepare COCO and LVIS datasets 6 | 7 | ``` 8 | $DETECTRON2_DATASETS/ 9 | coco/ 10 | train2017/ 11 | val2017/ 12 | annotations/ 13 | instances_train2017.json 14 | instances_val2017.json 15 | lvis/ 16 | lvis_v1_train.json 17 | lvis_v1_val.json 18 | ``` 19 | 20 | -------------------------------------------------------------------------------- /apps/detection/README.md: -------------------------------------------------------------------------------- 1 | # Object Detection with PE 2 | 3 | ## Getting started 4 | 5 | Please refer to [INSTALL.md](INSTALL.md) for installation and dataset preparation instructions. 6 | 7 | ## Results and Fine-tuned Models 8 | 9 | 10 | ### LVIS 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 |
| detector | vision encoder | box AP | mask AP | download |
|:---|:---|:---:|:---:|:---:|
| Mask R-CNN | PE core G | 51.9 | 47.9 | model |
| Mask R-CNN | PE spatial G | 54.2 | 49.3 | model |
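The LVIS checkpoints are evaluated the same way as the COCO ones (see the Evaluation section below); a minimal sketch, with placeholder checkpoint and output paths:

```
bash scripts/evaluate_local.sh --config-file projects/ViTDet/configs/LVIS/mask_rcnn_PEspatial_G_lvis75ep.py train.output_dir="/path/to/output_dir" train.init_checkpoint="/path/to/mask_rcnn_PEspatial_G_lvis75ep.pth"
```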
36 | 37 | 38 | ### COCO 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 |
| detector | vision encoder | box AP | mask AP | download |
|:---|:---|:---:|:---:|:---:|
| Mask R-CNN | PE core G | 57.0 | 49.8 | model |
| Mask R-CNN | PE spatial G | 57.8 | 50.3 | model |
64 | 65 | 66 | ### Training 67 | By default, we use 64 GPUs in slurm training, for example 68 | 69 | ``` 70 | sbatch scripts/coco/train_mask_rcnn_PEspatial_G_coco36ep.sh 71 | ``` 72 | 73 | ### Evaluation 74 | Evaluation is running locally 75 | ``` 76 | bash scripts/evaluate_local.sh --config-file projects/ViTDet/configs/COCO/mask_rcnn_PEspatial_G_coco36ep.py train.output_dir="/path/to/output_dir" train.init_checkpoint="/path/to/mask_rcnn_PEspatial_G_coco36ep.pth" 77 | ``` 78 | 79 | 80 | ## SOTA COCO Object Detection 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
| detector | vision encoder | box AP | box AP (TTA) | download |
|:---|:---|:---:|:---:|:---:|
| DETA | PE spatial G | 65.2 | 66.0 | model |
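The TTA column comes from the soft-NMS + test-time-augmentation evaluation scripts shipped with [DETA_pe](DETA_pe); a minimal sketch of running them from the `DETA_pe` directory (point the `--resume` path inside the scripts at your own checkpoint):

```
# standard 1824px evaluation
bash scripts/eval_1824pix.sh
# soft-NMS + TTA evaluation on Slurm (8 nodes)
sbatch scripts/eval_tta_slurm_1824pix.sh
```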
100 | 101 | More details are in [DETA_pe](DETA_pe) 102 | 103 | 104 | ## Acknowledgment 105 | 106 | This code is built using [detectron2](https://github.com/facebookresearch/detectron2) and [DETA](https://github.com/jozhang97/DETA). 107 | -------------------------------------------------------------------------------- /apps/detection/detectron2_pe/__init__.py: -------------------------------------------------------------------------------- 1 | from . import modeling 2 | -------------------------------------------------------------------------------- /apps/detection/detectron2_pe/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection_checkpoint import DetectionCheckpointer 2 | 3 | __all__ = ["DetectionCheckpointer"] 4 | -------------------------------------------------------------------------------- /apps/detection/detectron2_pe/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import PEv1_det, get_vit_lr_decay_rate_pev1 2 | -------------------------------------------------------------------------------- /apps/detection/detectron2_pe/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .pev1_det import PEv1_det, get_vit_lr_decay_rate_pev1 2 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/COCO/mask_rcnn_PEcore_G_coco75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2 import model_zoo 5 | from detectron2.config import LazyCall as L 6 | from detectron2.modeling import SimpleFeaturePyramid, ViT 7 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 8 | from detectron2.solver import WarmupParamScheduler 9 | from detectron2_pe.modeling import PEv1_det, get_vit_lr_decay_rate_pev1 10 | from fvcore.common.param_scheduler import MultiStepParamScheduler 11 | 12 | from ..COCO.mask_rcnn_vitdet_b_100ep import ( # dataloader,; model,; get_vit_lr_decay_rate, 13 | lr_multiplier, optimizer, train) 14 | from ..common.coco_loader_lsj import dataloader 15 | 16 | train.init_checkpoint = "/checkpoint/vision_encoder/pev1/pev1_rc2_d2.pt" 17 | train.output_dir = ( 18 | "/checkpoint/vision_encoder/d2_output/coco/mask_rcnn_PEcore_G_coco36ep" 19 | ) 20 | 21 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 22 | 23 | model.pixel_mean = [127, 127, 127] 24 | model.pixel_std = [127, 127, 127] 25 | model.input_format = "RGB" 26 | 27 | 28 | img_size = 1024 29 | embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5 30 | pretrain_img_size, patch_size, window_size = 512, 16, 32 31 | # 12, 24, 36, 49 for global attention 32 | window_block_indexes = ( 33 | list(range(0, 12)) + list(range(13, 24)) + list(range(25, 36)) + list(range(37, 49)) 34 | ) 35 | # Creates Simple Feature Pyramid from ViT backbone 36 | model.backbone = L(SimpleFeaturePyramid)( 37 | net=L(PEv1_det)( # Single-scale ViT backbone 38 | pretrain_img_size=pretrain_img_size, 39 | img_size=img_size, 40 | patch_size=patch_size, 41 | embed_dim=embed_dim, 42 | depth=depth, 43 | num_heads=num_heads, 44 | drop_path_rate=dp, 45 | window_size=window_size, 46 | pt_hw_seq_len=32, 47 | mlp_ratio=mlp_ratio, 48 | qkv_bias=True, 49 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 50 | window_block_indexes=window_block_indexes, 51 
| residual_block_indexes=[], 52 | use_rel_pos=True, 53 | out_feature="last_feat", 54 | tile_posemb=True, 55 | use_abs_pos=True, 56 | pretrain_use_cls_token=False, 57 | use_act_checkpoint=True, 58 | ), 59 | in_feature="${.net.out_feature}", 60 | out_channels=256, 61 | scale_factors=(4.0, 2.0, 1.0, 0.5), 62 | top_block=L(LastLevelMaxPool)(), 63 | norm="LN", 64 | square_pad=img_size, 65 | ) 66 | 67 | optimizer.params.lr_factor_func = partial( 68 | get_vit_lr_decay_rate_pev1, lr_decay_rate=0.9, num_layers=50 69 | ) 70 | 71 | dataloader.train.total_batch_size = 64 72 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 73 | train.max_iter = 184375 74 | 75 | 76 | lr_multiplier = L(WarmupParamScheduler)( 77 | scheduler=L(MultiStepParamScheduler)( 78 | values=[1.0, 0.1, 0.01], 79 | milestones=[163889, 177546], 80 | num_updates=train.max_iter, 81 | ), 82 | warmup_length=250 / train.max_iter, 83 | warmup_factor=0.001, 84 | ) 85 | 86 | optimizer.params.overrides = {} 87 | optimizer.params.weight_decay_norm = None 88 | optimizer.lr = 5e-5 89 | 90 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 91 | lr_multiplier.scheduler.milestones = [ 92 | milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones 93 | ] 94 | lr_multiplier.scheduler.num_updates = train.max_iter 95 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/COCO/mask_rcnn_PEspatial_G_coco36ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2 import model_zoo 5 | from detectron2.config import LazyCall as L 6 | from detectron2.modeling import SimpleFeaturePyramid, ViT 7 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 8 | from detectron2.solver import WarmupParamScheduler 9 | from detectron2_pe.modeling import PEv1_det, get_vit_lr_decay_rate_pev1 10 | from fvcore.common.param_scheduler import MultiStepParamScheduler 11 | 12 | from ..COCO.mask_rcnn_vitdet_b_100ep import ( # dataloader,; model,; get_vit_lr_decay_rate, 13 | lr_multiplier, optimizer, train) 14 | from ..common.coco_loader_lsj import dataloader 15 | 16 | train.init_checkpoint = "/checkpoint/vision_encoder/pev1/pev1_rc2_spatial_d2.pt" 17 | train.output_dir = ( 18 | "/checkpoint/vision_encoder/d2_output/coco/mask_rcnn_PEspatial_G_coco36ep" 19 | ) 20 | 21 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 22 | 23 | model.pixel_mean = [127, 127, 127] 24 | model.pixel_std = [127, 127, 127] 25 | model.input_format = "RGB" 26 | 27 | 28 | img_size = 1024 29 | embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5 30 | pretrain_img_size, patch_size, window_size = 512, 16, 32 31 | # 12, 24, 36, 49 for global attention 32 | window_block_indexes = ( 33 | list(range(0, 12)) + list(range(13, 24)) + list(range(25, 36)) + list(range(37, 49)) 34 | ) 35 | # Creates Simple Feature Pyramid from ViT backbone 36 | model.backbone = L(SimpleFeaturePyramid)( 37 | net=L(PEv1_det)( # Single-scale ViT backbone 38 | pretrain_img_size=pretrain_img_size, 39 | img_size=img_size, 40 | patch_size=patch_size, 41 | embed_dim=embed_dim, 42 | depth=depth, 43 | num_heads=num_heads, 44 | drop_path_rate=dp, 45 | window_size=window_size, 46 | pt_hw_seq_len=32, 47 | mlp_ratio=mlp_ratio, 48 | qkv_bias=True, 49 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 50 | window_block_indexes=window_block_indexes, 51 | residual_block_indexes=[], 52 | 
use_rel_pos=True, 53 | out_feature="last_feat", 54 | tile_posemb=True, 55 | use_abs_pos=True, 56 | pretrain_use_cls_token=False, 57 | use_act_checkpoint=True, 58 | init_values=0.1, 59 | ), 60 | in_feature="${.net.out_feature}", 61 | out_channels=256, 62 | scale_factors=(4.0, 2.0, 1.0, 0.5), 63 | top_block=L(LastLevelMaxPool)(), 64 | norm="LN", 65 | square_pad=img_size, 66 | ) 67 | 68 | optimizer.params.lr_factor_func = partial( 69 | get_vit_lr_decay_rate_pev1, lr_decay_rate=0.9, num_layers=50 70 | ) 71 | 72 | dataloader.train.total_batch_size = 64 73 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 74 | train.max_iter = 184375 75 | 76 | 77 | lr_multiplier = L(WarmupParamScheduler)( 78 | scheduler=L(MultiStepParamScheduler)( 79 | values=[1.0, 0.1, 0.01], 80 | milestones=[163889, 177546], 81 | num_updates=train.max_iter, 82 | ), 83 | warmup_length=250 / train.max_iter, 84 | warmup_factor=0.001, 85 | ) 86 | 87 | optimizer.params.overrides = {} 88 | optimizer.params.weight_decay_norm = None 89 | optimizer.lr = 5e-5 90 | 91 | train.max_iter = train.max_iter * 36 // 100 # 100ep -> 36ep 92 | lr_multiplier.scheduler.milestones = [ 93 | milestone * 36 // 100 for milestone in lr_multiplier.scheduler.milestones 94 | ] 95 | lr_multiplier.scheduler.num_updates = train.max_iter 96 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_b_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2 import model_zoo 4 | from detectron2.config import LazyCall as L 5 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 6 | from detectron2.solver import WarmupParamScheduler 7 | from fvcore.common.param_scheduler import MultiStepParamScheduler 8 | 9 | from ..common.coco_loader_lsj import dataloader 10 | 11 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 12 | 13 | # Initialization and trainer settings 14 | train = model_zoo.get_config("common/train.py").train 15 | train.amp.enabled = True 16 | train.ddp.fp16_compression = True 17 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth?matching_heuristics=True" 18 | 19 | 20 | # Schedule 21 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 22 | train.max_iter = 184375 23 | 24 | lr_multiplier = L(WarmupParamScheduler)( 25 | scheduler=L(MultiStepParamScheduler)( 26 | values=[1.0, 0.1, 0.01], 27 | milestones=[163889, 177546], 28 | num_updates=train.max_iter, 29 | ), 30 | warmup_length=250 / train.max_iter, 31 | warmup_factor=0.001, 32 | ) 33 | 34 | # Optimizer 35 | optimizer = model_zoo.get_config("common/optim.py").AdamW 36 | optimizer.params.lr_factor_func = partial( 37 | get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7 38 | ) 39 | optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} 40 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/LVIS/mask_rcnn_PEcore_G_lvis75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2 import model_zoo 5 | from detectron2.config import LazyCall as L 6 | from detectron2.data.detection_utils import get_fed_loss_cls_weights 7 | from detectron2.data.samplers import RepeatFactorTrainingSampler 8 | from detectron2.evaluation.lvis_evaluation import 
LVISEvaluator 9 | from detectron2.modeling import SimpleFeaturePyramid, ViT 10 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 11 | from detectron2.solver import WarmupParamScheduler 12 | from detectron2_pe.modeling import PEv1_det, get_vit_lr_decay_rate_pev1 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..COCO.mask_rcnn_vitdet_b_100ep import ( # dataloader,; model,; get_vit_lr_decay_rate, 16 | lr_multiplier, optimizer, train) 17 | from ..common.coco_loader_lsj import dataloader 18 | 19 | train.init_checkpoint = "/checkpoint/vision_encoder/pev1/pev1_rc2_d2.pt" 20 | train.output_dir = ( 21 | "/checkpoint/vision_encoder/d2_output/lvis/mask_rcnn_PEcore_G_lvis75ep" 22 | ) 23 | 24 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 25 | 26 | model.pixel_mean = [127, 127, 127] 27 | model.pixel_std = [127, 127, 127] 28 | model.input_format = "RGB" 29 | 30 | img_size = 1024 31 | embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5 32 | pretrain_img_size, patch_size, window_size = 512, 16, 32 33 | # 12, 24, 36, 49 for global attention 34 | window_block_indexes = ( 35 | list(range(0, 12)) + list(range(13, 24)) + list(range(25, 36)) + list(range(37, 49)) 36 | ) 37 | # Creates Simple Feature Pyramid from ViT backbone 38 | model.backbone = L(SimpleFeaturePyramid)( 39 | net=L(PEv1_det)( # Single-scale ViT backbone 40 | pretrain_img_size=pretrain_img_size, 41 | img_size=img_size, 42 | patch_size=patch_size, 43 | embed_dim=embed_dim, 44 | depth=depth, 45 | num_heads=num_heads, 46 | drop_path_rate=dp, 47 | window_size=window_size, 48 | pt_hw_seq_len=32, 49 | mlp_ratio=mlp_ratio, 50 | qkv_bias=True, 51 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 52 | window_block_indexes=window_block_indexes, 53 | residual_block_indexes=[], 54 | use_rel_pos=True, 55 | out_feature="last_feat", 56 | tile_posemb=True, 57 | use_abs_pos=True, 58 | pretrain_use_cls_token=False, 59 | use_act_checkpoint=True, 60 | ), 61 | in_feature="${.net.out_feature}", 62 | out_channels=256, 63 | scale_factors=(4.0, 2.0, 1.0, 0.5), 64 | top_block=L(LastLevelMaxPool)(), 65 | norm="LN", 66 | square_pad=img_size, 67 | ) 68 | 69 | model.roi_heads.num_classes = 1203 70 | model.roi_heads.box_predictor.test_score_thresh = 0.02 71 | model.roi_heads.box_predictor.test_topk_per_image = 300 72 | model.roi_heads.box_predictor.use_sigmoid_ce = True 73 | model.roi_heads.box_predictor.use_fed_loss = True 74 | model.roi_heads.box_predictor.get_fed_loss_cls_weights = ( 75 | lambda: get_fed_loss_cls_weights(dataloader.train.dataset.names, 0.5) 76 | ) 77 | 78 | train.eval_period = 30000 79 | 80 | optimizer.params.lr_factor_func = partial( 81 | get_vit_lr_decay_rate_pev1, lr_decay_rate=0.9, num_layers=50 82 | ) 83 | 84 | 85 | dataloader.train.dataset.names = "lvis_v1_train" 86 | dataloader.train.sampler = L(RepeatFactorTrainingSampler)( 87 | repeat_factors=L( 88 | RepeatFactorTrainingSampler.repeat_factors_from_category_frequency 89 | )(dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001) 90 | ) 91 | dataloader.test.dataset.names = "lvis_v1_val" 92 | dataloader.evaluator = L(LVISEvaluator)( 93 | dataset_name="${..test.dataset.names}", 94 | max_dets_per_image=300, 95 | output_dir="${train.output_dir}", 96 | ) 97 | 98 | dataloader.train.total_batch_size = 64 99 | 100 | train.max_iter = 184375 101 | 102 | 103 | lr_multiplier = L(WarmupParamScheduler)( 104 | scheduler=L(MultiStepParamScheduler)( 105 | values=[1.0, 0.1, 0.01], 106 | milestones=[163889, 
177546], 107 | num_updates=train.max_iter, 108 | ), 109 | warmup_length=250 / train.max_iter, 110 | warmup_factor=0.001, 111 | ) 112 | 113 | optimizer.params.overrides = {} 114 | optimizer.params.weight_decay_norm = None 115 | optimizer.lr = 5e-5 116 | 117 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 118 | lr_multiplier.scheduler.milestones = [ 119 | milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones 120 | ] 121 | lr_multiplier.scheduler.num_updates = train.max_iter 122 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/LVIS/mask_rcnn_PEspatial_G_lvis75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2 import model_zoo 5 | from detectron2.config import LazyCall as L 6 | from detectron2.data.detection_utils import get_fed_loss_cls_weights 7 | from detectron2.data.samplers import RepeatFactorTrainingSampler 8 | from detectron2.evaluation.lvis_evaluation import LVISEvaluator 9 | from detectron2.modeling import SimpleFeaturePyramid, ViT 10 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 11 | from detectron2.solver import WarmupParamScheduler 12 | from detectron2_pe.modeling import PEv1_det, get_vit_lr_decay_rate_pev1 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..COCO.mask_rcnn_vitdet_b_100ep import ( # dataloader,; model,; get_vit_lr_decay_rate, 16 | lr_multiplier, optimizer, train) 17 | from ..common.coco_loader_lsj import dataloader 18 | 19 | train.init_checkpoint = "/checkpoint/vision_encoder/pev1/pev1_rc2_spatial_d2.pt" 20 | train.output_dir = ( 21 | "/checkpoint/vision_encoder/d2_output/lvis/mask_rcnn_PEspatial_G_lvis75ep" 22 | ) 23 | 24 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 25 | 26 | model.pixel_mean = [127, 127, 127] 27 | model.pixel_std = [127, 127, 127] 28 | model.input_format = "RGB" 29 | 30 | img_size = 1024 31 | embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5 32 | pretrain_img_size, patch_size, window_size = 512, 16, 32 33 | # 12, 24, 36, 49 for global attention 34 | window_block_indexes = ( 35 | list(range(0, 12)) + list(range(13, 24)) + list(range(25, 36)) + list(range(37, 49)) 36 | ) 37 | # Creates Simple Feature Pyramid from ViT backbone 38 | model.backbone = L(SimpleFeaturePyramid)( 39 | net=L(PEv1_det)( # Single-scale ViT backbone 40 | pretrain_img_size=pretrain_img_size, 41 | img_size=img_size, 42 | patch_size=patch_size, 43 | embed_dim=embed_dim, 44 | depth=depth, 45 | num_heads=num_heads, 46 | drop_path_rate=dp, 47 | window_size=window_size, 48 | pt_hw_seq_len=32, 49 | mlp_ratio=mlp_ratio, 50 | qkv_bias=True, 51 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 52 | window_block_indexes=window_block_indexes, 53 | residual_block_indexes=[], 54 | use_rel_pos=True, 55 | out_feature="last_feat", 56 | tile_posemb=True, 57 | use_abs_pos=True, 58 | pretrain_use_cls_token=False, 59 | use_act_checkpoint=True, 60 | init_values=0.1, 61 | ), 62 | in_feature="${.net.out_feature}", 63 | out_channels=256, 64 | scale_factors=(4.0, 2.0, 1.0, 0.5), 65 | top_block=L(LastLevelMaxPool)(), 66 | norm="LN", 67 | square_pad=img_size, 68 | ) 69 | 70 | model.roi_heads.num_classes = 1203 71 | model.roi_heads.box_predictor.test_score_thresh = 0.02 72 | model.roi_heads.box_predictor.test_topk_per_image = 300 73 | model.roi_heads.box_predictor.use_sigmoid_ce = True 
74 | model.roi_heads.box_predictor.use_fed_loss = True 75 | model.roi_heads.box_predictor.get_fed_loss_cls_weights = ( 76 | lambda: get_fed_loss_cls_weights(dataloader.train.dataset.names, 0.5) 77 | ) 78 | 79 | train.eval_period = 30000 80 | 81 | optimizer.params.lr_factor_func = partial( 82 | get_vit_lr_decay_rate_pev1, lr_decay_rate=0.9, num_layers=50 83 | ) 84 | 85 | 86 | dataloader.train.dataset.names = "lvis_v1_train" 87 | dataloader.train.sampler = L(RepeatFactorTrainingSampler)( 88 | repeat_factors=L( 89 | RepeatFactorTrainingSampler.repeat_factors_from_category_frequency 90 | )(dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001) 91 | ) 92 | dataloader.test.dataset.names = "lvis_v1_val" 93 | dataloader.evaluator = L(LVISEvaluator)( 94 | dataset_name="${..test.dataset.names}", 95 | max_dets_per_image=300, 96 | output_dir="${train.output_dir}", 97 | ) 98 | 99 | dataloader.train.total_batch_size = 64 100 | 101 | train.max_iter = 184375 102 | 103 | 104 | lr_multiplier = L(WarmupParamScheduler)( 105 | scheduler=L(MultiStepParamScheduler)( 106 | values=[1.0, 0.1, 0.01], 107 | milestones=[163889, 177546], 108 | num_updates=train.max_iter, 109 | ), 110 | warmup_length=250 / train.max_iter, 111 | warmup_factor=0.001, 112 | ) 113 | 114 | optimizer.params.overrides = {} 115 | optimizer.params.weight_decay_norm = None 116 | optimizer.lr = 5e-5 117 | 118 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 119 | lr_multiplier.scheduler.milestones = [ 120 | milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones 121 | ] 122 | lr_multiplier.scheduler.num_updates = train.max_iter 123 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/common/coco_loader_lsj.py: -------------------------------------------------------------------------------- 1 | import detectron2.data.transforms as T 2 | from detectron2 import model_zoo 3 | from detectron2.config import LazyCall as L 4 | 5 | # Data using LSJ 6 | image_size = 1024 7 | dataloader = model_zoo.get_config("common/data/coco.py").dataloader 8 | dataloader.train.mapper.augmentations = [ 9 | L(T.RandomFlip)(horizontal=True), # flip first 10 | L(T.ResizeScale)( 11 | min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size 12 | ), 13 | L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), 14 | ] 15 | dataloader.train.mapper.image_format = "RGB" 16 | dataloader.train.total_batch_size = 64 17 | # recompute boxes due to cropping 18 | dataloader.train.mapper.recompute_boxes = True 19 | 20 | dataloader.test.mapper.augmentations = [ 21 | L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), 22 | ] 23 | -------------------------------------------------------------------------------- /apps/detection/scripts/coco/train_mask_rcnn_PEcore_G_coco75ep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco/train_mask_rcnn_PEcore_G_coco75ep/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco/train_mask_rcnn_PEcore_G_coco75ep/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames 
$SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 28 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 8 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | tools/lazyconfig_train_net_pe_slurm.py \ 38 | --resume \ 39 | --config-file projects/ViTDet/configs/COCO/mask_rcnn_PEcore_G_coco75ep.py \ 40 | optimizer.lr=5e-5 \ 41 | train.init_checkpoint="/checkpoint/vision_encoder/pev1/pe_core_G14_448_16patch.pt" \ 42 | train.output_dir="/checkpoint/vision_encoder/d2_output/coco/train_mask_rcnn_PEcore_G_coco75ep" \ 43 | model.backbone.net.use_act_checkpoint=True \ 44 | "$@" 45 | -------------------------------------------------------------------------------- /apps/detection/scripts/coco/train_mask_rcnn_PEspatial_G_coco36ep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco/train_mask_rcnn_PEspatial_G_coco36ep/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco/train_mask_rcnn_PEspatial_G_coco36ep/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 28 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 8 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | tools/lazyconfig_train_net_pe_slurm.py \ 38 | --resume \ 39 | --config-file projects/ViTDet/configs/COCO/mask_rcnn_PEspatial_G_coco36ep.py \ 40 | optimizer.lr=5e-5 \ 41 | train.init_checkpoint="/checkpoint/vision_encoder/pev1/pe_spatial_G14_16patch.pth" \ 42 | train.output_dir="/checkpoint/vision_encoder/d2_output/coco/train_mask_rcnn_PEspatial_G_coco36ep" \ 43 | model.backbone.net.init_values=0.1 \ 44 | model.backbone.net.use_act_checkpoint=True \ 45 | "$@" 46 | -------------------------------------------------------------------------------- /apps/detection/scripts/evaluate_local.sh: -------------------------------------------------------------------------------- 1 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 2 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 3 | 4 | python3 tools/lazyconfig_train_net_pe.py \ 5 | --num-gpus 8 \ 6 | --eval-only \ 7 | "$@" -------------------------------------------------------------------------------- /apps/detection/scripts/lvis/train_mask_rcnn_PEcore_G_lvis75ep.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/lvis/train_mask_rcnn_PEcore_G_lvis75ep/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/lvis/train_mask_rcnn_PEcore_G_lvis75ep/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 28 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 8 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | tools/lazyconfig_train_net_pe_slurm.py \ 38 | --resume \ 39 | --config-file projects/ViTDet/configs/LVIS/mask_rcnn_PEcore_G_lvis75ep.py \ 40 | optimizer.lr=5e-5 \ 41 | train.init_checkpoint="/checkpoint/vision_encoder/pev1/pe_core_G14_448_16patch.pt" \ 42 | train.output_dir="/checkpoint/vision_encoder/d2_output/lvis/train_mask_rcnn_PEcore_G_lvis75ep" \ 43 | model.backbone.net.use_act_checkpoint=True \ 44 | "$@" 45 | -------------------------------------------------------------------------------- /apps/detection/scripts/lvis/train_mask_rcnn_PEspatial_G_lvis75ep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/lvis/train_mask_rcnn_PEspatial_G_lvis75ep/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/lvis/train_mask_rcnn_PEspatial_G_lvis75ep/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 28 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 8 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | tools/lazyconfig_train_net_pe_slurm.py \ 38 | --resume \ 39 | --config-file projects/ViTDet/configs/LVIS/mask_rcnn_PEspatial_G_lvis75ep.py \ 40 | optimizer.lr=5e-5 \ 41 | train.init_checkpoint="/checkpoint/vision_encoder/pev1/pe_spatial_G14_16patch.pth" \ 42 | train.output_dir="/checkpoint/vision_encoder/d2_output/lvis/train_mask_rcnn_PEspatial_G_lvis75ep" \ 43 | model.backbone.net.init_values=0.1 \ 44 | 
model.backbone.net.use_act_checkpoint=True \ 45 | "$@" 46 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for CLIP Benchmark.""" 2 | 3 | __author__ = """Mehdi Cherti""" 4 | __email__ = "mehdicherti@gmail.com" 5 | __version__ = "0.1.0" 6 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/clip_benchmark/datasets/__init__.py -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/babel_imagenet.py: -------------------------------------------------------------------------------- 1 | import torchvision 2 | 3 | """ 4 | BabelImageNet from https://arxiv.org/pdf/2306.08658.pdf 5 | Adapted from https://github.com/gregor-ge/Babel-ImageNet, thanks to the authors 6 | """ 7 | 8 | 9 | class BabelImageNet(torchvision.datasets.ImageNet): 10 | def __init__( 11 | self, root: str, idxs, split: str = "val", download=None, **kwargs 12 | ) -> None: 13 | super().__init__(root, split, **kwargs) 14 | examples_per_class = len(self.targets) // 1000 15 | select_idxs = [ 16 | idx * examples_per_class + i 17 | for idx in idxs 18 | for i in range(examples_per_class) 19 | ] 20 | self.targets = [i for i in range(len(idxs)) for _ in range(examples_per_class)] 21 | self.imgs = [self.imgs[i] for i in select_idxs] 22 | self.samples = [self.samples[i] for i in select_idxs] 23 | self.idxs = idxs 24 | 25 | def __getitem__(self, i): 26 | img, target = super().__getitem__(i) 27 | target = self.idxs.index(target) 28 | return img, target 29 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/flickr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from https://github.com/pytorch/vision/blob/main/torchvision/datasets/flickr.py 3 | Thanks to the authors of torchvision 4 | """ 5 | 6 | import glob 7 | import os 8 | from collections import defaultdict 9 | from html.parser import HTMLParser 10 | from typing import Any, Callable, Dict, List, Optional, Tuple 11 | 12 | from PIL import Image 13 | from torchvision.datasets import VisionDataset 14 | 15 | 16 | class Flickr(VisionDataset): 17 | 18 | def __init__( 19 | self, 20 | root: str, 21 | ann_file: str, 22 | transform: Optional[Callable] = None, 23 | target_transform: Optional[Callable] = None, 24 | ) -> None: 25 | super().__init__(root, transform=transform, target_transform=target_transform) 26 | self.ann_file = os.path.expanduser(ann_file) 27 | data = defaultdict(list) 28 | with open(ann_file) as fd: 29 | fd.readline() 30 | for line in fd: 31 | line = line.strip() 32 | if line: 33 | # some lines have comma in the caption, se we make sure we do the split correctly 34 | img, caption = line.strip().split(".jpg,") 35 | img = img + ".jpg" 36 | data[img].append(caption) 37 | self.data = list(data.items()) 38 | 39 | def __getitem__(self, index: int) -> Tuple[Any, Any]: 40 | """ 41 | Args: 42 | index (int): Index 43 | 44 | Returns: 45 | tuple: Tuple (image, target). target is a list of captions for the image. 
46 | """ 47 | img, captions = self.data[index] 48 | 49 | # Image 50 | img = Image.open(os.path.join(self.root, img)).convert("RGB") 51 | if self.transform is not None: 52 | img = self.transform(img) 53 | 54 | # Captions 55 | target = captions 56 | if self.target_transform is not None: 57 | target = self.target_transform(target) 58 | 59 | return img, target 60 | 61 | def __len__(self) -> int: 62 | return len(self.data) 63 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/flickr30k_200.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import os 4 | from subprocess import call 5 | 6 | import requests 7 | from PIL import Image 8 | from torchvision.datasets import VisionDataset 9 | 10 | from .flores_langs import flores_languages 11 | 12 | GITHUB_DATA_PATH = ( 13 | "https://raw.githubusercontent.com/visheratin/nllb-clip/main/data/flickr30k-200/" 14 | ) 15 | SUPPORTED_LANGUAGES = flores_languages 16 | 17 | IMAGE_INDEX_FILENAME = "filenames.txt" 18 | 19 | CAPTIONS_FILENAME_TEMPLATE = "{}.txt" 20 | OUTPUT_FILENAME_TEMPLATE = "flickr30k_200-{}.json" 21 | 22 | IMAGES_DOWNLOAD_URL = "https://nllb-data.com/test/flickr30k/images.tar.gz" 23 | 24 | 25 | class Flickr30k_200(VisionDataset): 26 | def __init__(self, root, ann_file, transform=None, target_transform=None): 27 | super().__init__(root, transform=transform, target_transform=target_transform) 28 | self.ann_file = os.path.expanduser(ann_file) 29 | with codecs.open(ann_file, "r", encoding="utf-8") as fp: 30 | data = json.load(fp) 31 | self.data = [ 32 | (img_path, txt) 33 | for img_path, txt in zip(data["image_paths"], data["annotations"]) 34 | ] 35 | 36 | def __getitem__(self, index): 37 | img, captions = self.data[index] 38 | 39 | # Image 40 | img = Image.open(img).convert("RGB") 41 | if self.transform is not None: 42 | img = self.transform(img) 43 | 44 | # Captions 45 | target = [ 46 | captions, 47 | ] 48 | if self.target_transform is not None: 49 | target = self.target_transform(target) 50 | 51 | return img, target 52 | 53 | def __len__(self) -> int: 54 | return len(self.data) 55 | 56 | 57 | def _get_lines(url): 58 | response = requests.get(url, timeout=30) 59 | return response.text.splitlines() 60 | 61 | 62 | def _download_images(out_path): 63 | os.makedirs(out_path, exist_ok=True) 64 | print("Downloading images") 65 | call(f"wget {IMAGES_DOWNLOAD_URL} -O images.tar.gz", shell=True) 66 | call(f"tar -xzf images.tar.gz -C {out_path}", shell=True) 67 | call("rm images.tar.gz", shell=True) 68 | 69 | 70 | def create_annotation_file(root, lang_code): 71 | if lang_code not in SUPPORTED_LANGUAGES: 72 | raise ValueError( 73 | f"Language code {lang_code} not supported. 
Supported languages are {SUPPORTED_LANGUAGES}" 74 | ) 75 | data_dir = os.path.join(root, "flickr30k-200") 76 | if not os.path.exists(data_dir): 77 | _download_images(data_dir) 78 | images_dir = os.path.join(root, "flickr30k-200", "images") 79 | print("Downloading flickr30k-200 index file") 80 | download_path = os.path.join(GITHUB_DATA_PATH, IMAGE_INDEX_FILENAME) 81 | target_images = _get_lines(download_path) 82 | 83 | print("Downloading flickr30k-200 captions:", lang_code) 84 | captions_path = GITHUB_DATA_PATH 85 | download_path = os.path.join( 86 | captions_path, CAPTIONS_FILENAME_TEMPLATE.format(lang_code) 87 | ) 88 | target_captions = _get_lines(download_path) 89 | 90 | number_of_missing_images = 0 91 | valid_images, valid_annotations, valid_indicies = [], [], [] 92 | for i, (img, txt) in enumerate(zip(target_images, target_captions)): 93 | image_path = os.path.join(images_dir, img) 94 | if not os.path.exists(image_path): 95 | print("Missing image file", img) 96 | number_of_missing_images += 1 97 | continue 98 | 99 | valid_images.append(image_path) 100 | valid_annotations.append(txt) 101 | valid_indicies.append(i) 102 | 103 | if number_of_missing_images > 0: 104 | print(f"*** WARNING *** missing {number_of_missing_images} files.") 105 | 106 | with codecs.open( 107 | os.path.join(root, OUTPUT_FILENAME_TEMPLATE.format(lang_code)), 108 | "w", 109 | encoding="utf-8", 110 | ) as fp: 111 | json.dump( 112 | { 113 | "image_paths": valid_images, 114 | "annotations": valid_annotations, 115 | "indicies": valid_indicies, 116 | }, 117 | fp, 118 | ensure_ascii=False, 119 | ) 120 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/flores_langs.py: -------------------------------------------------------------------------------- 1 | flores_languages = [ 2 | "ace_Arab", 3 | "ace_Latn", 4 | "acm_Arab", 5 | "acq_Arab", 6 | "aeb_Arab", 7 | "afr_Latn", 8 | "ajp_Arab", 9 | "aka_Latn", 10 | "amh_Ethi", 11 | "apc_Arab", 12 | "arb_Arab", 13 | "ars_Arab", 14 | "ary_Arab", 15 | "arz_Arab", 16 | "asm_Beng", 17 | "ast_Latn", 18 | "awa_Deva", 19 | "ayr_Latn", 20 | "azb_Arab", 21 | "azj_Latn", 22 | "bak_Cyrl", 23 | "bam_Latn", 24 | "ban_Latn", 25 | "bel_Cyrl", 26 | "bem_Latn", 27 | "ben_Beng", 28 | "bho_Deva", 29 | "bjn_Arab", 30 | "bjn_Latn", 31 | "bod_Tibt", 32 | "bos_Latn", 33 | "bug_Latn", 34 | "bul_Cyrl", 35 | "cat_Latn", 36 | "ceb_Latn", 37 | "ces_Latn", 38 | "cjk_Latn", 39 | "ckb_Arab", 40 | "crh_Latn", 41 | "cym_Latn", 42 | "dan_Latn", 43 | "deu_Latn", 44 | "dik_Latn", 45 | "dyu_Latn", 46 | "dzo_Tibt", 47 | "eng_Latn", 48 | "ell_Grek", 49 | "epo_Latn", 50 | "est_Latn", 51 | "eus_Latn", 52 | "ewe_Latn", 53 | "fao_Latn", 54 | "fij_Latn", 55 | "fin_Latn", 56 | "fon_Latn", 57 | "fra_Latn", 58 | "fur_Latn", 59 | "fuv_Latn", 60 | "gla_Latn", 61 | "gle_Latn", 62 | "glg_Latn", 63 | "grn_Latn", 64 | "guj_Gujr", 65 | "hat_Latn", 66 | "hau_Latn", 67 | "heb_Hebr", 68 | "hin_Deva", 69 | "hne_Deva", 70 | "hrv_Latn", 71 | "hun_Latn", 72 | "hye_Armn", 73 | "ibo_Latn", 74 | "ilo_Latn", 75 | "ind_Latn", 76 | "isl_Latn", 77 | "ita_Latn", 78 | "jav_Latn", 79 | "jpn_Jpan", 80 | "kab_Latn", 81 | "kac_Latn", 82 | "kam_Latn", 83 | "kan_Knda", 84 | "kas_Arab", 85 | "kas_Deva", 86 | "kat_Geor", 87 | "knc_Arab", 88 | "knc_Latn", 89 | "kaz_Cyrl", 90 | "kbp_Latn", 91 | "kea_Latn", 92 | "khm_Khmr", 93 | "kik_Latn", 94 | "kin_Latn", 95 | "kir_Cyrl", 96 | "kmb_Latn", 97 | "kmr_Latn", 98 | "kon_Latn", 99 | "kor_Hang", 100 | "lao_Laoo", 101 | "lij_Latn", 102 | "lim_Latn", 103 | 
"lin_Latn", 104 | "lit_Latn", 105 | "lmo_Latn", 106 | "ltg_Latn", 107 | "ltz_Latn", 108 | "lua_Latn", 109 | "lug_Latn", 110 | "luo_Latn", 111 | "lus_Latn", 112 | "lvs_Latn", 113 | "mag_Deva", 114 | "mai_Deva", 115 | "mal_Mlym", 116 | "mar_Deva", 117 | "min_Latn", 118 | "mkd_Cyrl", 119 | "plt_Latn", 120 | "mlt_Latn", 121 | "mni_Beng", 122 | "khk_Cyrl", 123 | "mos_Latn", 124 | "mri_Latn", 125 | "mya_Mymr", 126 | "nld_Latn", 127 | "nno_Latn", 128 | "nob_Latn", 129 | "npi_Deva", 130 | "nso_Latn", 131 | "nus_Latn", 132 | "nya_Latn", 133 | "oci_Latn", 134 | "gaz_Latn", 135 | "ory_Orya", 136 | "pag_Latn", 137 | "pan_Guru", 138 | "pap_Latn", 139 | "pes_Arab", 140 | "pol_Latn", 141 | "por_Latn", 142 | "prs_Arab", 143 | "pbt_Arab", 144 | "quy_Latn", 145 | "ron_Latn", 146 | "run_Latn", 147 | "rus_Cyrl", 148 | "sag_Latn", 149 | "san_Deva", 150 | "scn_Latn", 151 | "shn_Mymr", 152 | "sin_Sinh", 153 | "slk_Latn", 154 | "slv_Latn", 155 | "smo_Latn", 156 | "sna_Latn", 157 | "snd_Arab", 158 | "som_Latn", 159 | "sot_Latn", 160 | "spa_Latn", 161 | "als_Latn", 162 | "srd_Latn", 163 | "srp_Cyrl", 164 | "ssw_Latn", 165 | "sun_Latn", 166 | "swe_Latn", 167 | "swh_Latn", 168 | "szl_Latn", 169 | "tam_Taml", 170 | "tat_Cyrl", 171 | "tel_Telu", 172 | "tgk_Cyrl", 173 | "tgl_Latn", 174 | "tha_Thai", 175 | "tir_Ethi", 176 | "taq_Latn", 177 | "taq_Tfng", 178 | "tpi_Latn", 179 | "tsn_Latn", 180 | "tso_Latn", 181 | "tuk_Latn", 182 | "tum_Latn", 183 | "tur_Latn", 184 | "twi_Latn", 185 | "tzm_Tfng", 186 | "uig_Arab", 187 | "ukr_Cyrl", 188 | "umb_Latn", 189 | "urd_Arab", 190 | "uzn_Latn", 191 | "vec_Latn", 192 | "vie_Latn", 193 | "war_Latn", 194 | "wol_Latn", 195 | "xho_Latn", 196 | "ydd_Hebr", 197 | "yor_Latn", 198 | "yue_Hant", 199 | "zho_Hans", 200 | "zho_Hant", 201 | "zsm_Latn", 202 | "zul_Latn", 203 | ] 204 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/multilingual_mscoco.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import os 4 | from subprocess import call 5 | 6 | import requests 7 | from PIL import Image 8 | from torchvision.datasets import VisionDataset 9 | 10 | GITHUB_DATA_PATH = "https://raw.githubusercontent.com/adobe-research/Cross-lingual-Test-Dataset-XTD10/main/XTD10/" 11 | GITHUB_DATA_PATH_DE_FR = "https://raw.githubusercontent.com/adobe-research/Cross-lingual-Test-Dataset-XTD10/main/MIC/" 12 | GITHUB_DATA_PATH_JP = "https://raw.githubusercontent.com/adobe-research/Cross-lingual-Test-Dataset-XTD10/main/STAIR/" 13 | SUPPORTED_LANGUAGES = ["es", "it", "ko", "pl", "ru", "tr", "zh", "en", "de", "fr", "jp"] 14 | 15 | IMAGE_INDEX_FILENAME = "test_image_names.txt" 16 | 17 | CAPTIONS_FILENAME_TEMPLATE = "test_1kcaptions_{}.txt" 18 | OUTPUT_FILENAME_TEMPLATE = "multilingual_mscoco_captions-{}.json" 19 | 20 | IMAGES_DOWNLOAD_URL = "https://nllb-data.com/test/xtd10/images.tar.gz" 21 | 22 | 23 | class Multilingual_MSCOCO(VisionDataset): 24 | def __init__(self, root, ann_file, transform=None, target_transform=None): 25 | super().__init__(root, transform=transform, target_transform=target_transform) 26 | self.ann_file = os.path.expanduser(ann_file) 27 | with codecs.open(ann_file, "r", encoding="utf-8") as fp: 28 | data = json.load(fp) 29 | self.data = [ 30 | (img_path, txt) 31 | for img_path, txt in zip(data["image_paths"], data["annotations"]) 32 | ] 33 | 34 | def __getitem__(self, index): 35 | img, captions = self.data[index] 36 | 37 | # Image 38 | img = 
Image.open(img).convert("RGB") 39 | if self.transform is not None: 40 | img = self.transform(img) 41 | 42 | # Captions 43 | target = [ 44 | captions, 45 | ] 46 | if self.target_transform is not None: 47 | target = self.target_transform(target) 48 | 49 | return img, target 50 | 51 | def __len__(self) -> int: 52 | return len(self.data) 53 | 54 | 55 | def _get_lines(url): 56 | response = requests.get(url, timeout=30) 57 | return response.text.splitlines() 58 | 59 | 60 | def _download_images(out_path): 61 | os.makedirs(out_path, exist_ok=True) 62 | print("Downloading images") 63 | call(f"wget {IMAGES_DOWNLOAD_URL} -O images.tar.gz", shell=True) 64 | call(f"tar -xzf images.tar.gz -C {out_path}", shell=True) 65 | call("rm images.tar.gz", shell=True) 66 | 67 | 68 | def create_annotation_file(root, lang_code): 69 | if lang_code not in SUPPORTED_LANGUAGES: 70 | raise ValueError( 71 | f"Language code {lang_code} not supported. Supported languages are {SUPPORTED_LANGUAGES}" 72 | ) 73 | data_dir = os.path.join(root, "multilingual_mscoco") 74 | if not os.path.exists(data_dir): 75 | _download_images(data_dir) 76 | images_dir = os.path.join(data_dir, "images") 77 | print("Downloading multilingual_ms_coco index file") 78 | download_path = os.path.join(GITHUB_DATA_PATH, IMAGE_INDEX_FILENAME) 79 | target_images = _get_lines(download_path) 80 | 81 | print("Downloading multilingual_ms_coco captions:", lang_code) 82 | captions_path = GITHUB_DATA_PATH 83 | if lang_code in ["de", "fr"]: 84 | captions_path = GITHUB_DATA_PATH_DE_FR 85 | elif lang_code == "jp": 86 | captions_path = GITHUB_DATA_PATH_JP 87 | download_path = os.path.join( 88 | captions_path, CAPTIONS_FILENAME_TEMPLATE.format(lang_code) 89 | ) 90 | target_captions = _get_lines(download_path) 91 | 92 | number_of_missing_images = 0 93 | valid_images, valid_annotations, valid_indicies = [], [], [] 94 | for i, (img, txt) in enumerate(zip(target_images, target_captions)): 95 | image_path = os.path.join(images_dir, img) 96 | if not os.path.exists(image_path): 97 | print("Missing image file", img) 98 | number_of_missing_images += 1 99 | continue 100 | 101 | valid_images.append(image_path) 102 | valid_annotations.append(txt) 103 | valid_indicies.append(i) 104 | 105 | if number_of_missing_images > 0: 106 | print(f"*** WARNING *** missing {number_of_missing_images} files.") 107 | 108 | with codecs.open( 109 | os.path.join(root, OUTPUT_FILENAME_TEMPLATE.format(lang_code)), 110 | "w", 111 | encoding="utf-8", 112 | ) as fp: 113 | json.dump( 114 | { 115 | "image_paths": valid_images, 116 | "annotations": valid_annotations, 117 | "indicies": valid_indicies, 118 | }, 119 | fp, 120 | ensure_ascii=False, 121 | ) 122 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/objectnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code adapted from https://github.com/mlfoundations/wise-ft/blob/master/src/datasets/objectnet.py 3 | Thanks to the authors of wise-ft 4 | """ 5 | 6 | import json 7 | import os 8 | from pathlib import Path 9 | 10 | import numpy as np 11 | import PIL 12 | import torch 13 | from torchvision import datasets 14 | from torchvision.transforms import Compose 15 | 16 | 17 | def get_metadata(folder): 18 | metadata = Path(folder) 19 | 20 | with open(metadata / "folder_to_objectnet_label.json", "r") as f: 21 | folder_map = json.load(f) 22 | folder_map = {v: k for k, v in folder_map.items()} 23 | with open(metadata / "objectnet_to_imagenet_1k.json", 
"r") as f: 24 | objectnet_map = json.load(f) 25 | 26 | with open(metadata / "pytorch_to_imagenet_2012_id.json", "r") as f: 27 | pytorch_map = json.load(f) 28 | pytorch_map = {v: k for k, v in pytorch_map.items()} 29 | 30 | with open(metadata / "imagenet_to_label_2012_v2", "r") as f: 31 | imagenet_map = {v.strip(): str(pytorch_map[i]) for i, v in enumerate(f)} 32 | 33 | folder_to_ids, class_sublist = {}, [] 34 | classnames = [] 35 | for objectnet_name, imagenet_names in objectnet_map.items(): 36 | imagenet_names = imagenet_names.split("; ") 37 | imagenet_ids = [ 38 | int(imagenet_map[imagenet_name]) for imagenet_name in imagenet_names 39 | ] 40 | class_sublist.extend(imagenet_ids) 41 | folder_to_ids[folder_map[objectnet_name]] = imagenet_ids 42 | 43 | class_sublist = sorted(class_sublist) 44 | class_sublist_mask = [(i in class_sublist) for i in range(1000)] 45 | classname_map = {v: k for k, v in folder_map.items()} 46 | return class_sublist, class_sublist_mask, folder_to_ids, classname_map 47 | 48 | 49 | class ObjectNetDataset(datasets.ImageFolder): 50 | 51 | def __init__(self, root, transform): 52 | ( 53 | self._class_sublist, 54 | self.class_sublist_mask, 55 | self.folders_to_ids, 56 | self.classname_map, 57 | ) = get_metadata(root) 58 | subdir = os.path.join(root, "objectnet-1.0", "images") 59 | label_map = { 60 | name: idx 61 | for idx, name in enumerate(sorted(list(self.folders_to_ids.keys()))) 62 | } 63 | self.label_map = label_map 64 | super().__init__(subdir, transform=transform) 65 | self.samples = [ 66 | d 67 | for d in self.samples 68 | if os.path.basename(os.path.dirname(d[0])) in self.label_map 69 | ] 70 | self.imgs = self.samples 71 | self.classes = sorted(list(self.folders_to_ids.keys())) 72 | self.classes = [self.classname_map[c].lower() for c in self.classes] 73 | 74 | def __len__(self): 75 | return len(self.samples) 76 | 77 | def __getitem__(self, index): 78 | path, target = self.samples[index] 79 | sample = self.loader(path) 80 | if self.transform is not None: 81 | sample = self.transform(sample) 82 | label = os.path.basename(os.path.dirname(path)) 83 | return sample, self.label_map[label] 84 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/pos_neg_caption_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from PIL import Image 5 | from torch.utils.data import Dataset 6 | 7 | 8 | class PosNegCaptionDataset(Dataset): 9 | 10 | def __init__(self, root, ann_file, transform=None, crop_images=False): 11 | self.root = root 12 | self.ann = json.load(open(ann_file)) 13 | self.transform = transform 14 | self.crop_images = crop_images 15 | self.idx_strings = list(self.ann.keys()) # NOTE : indices may be non-contiguous 16 | 17 | def __getitem__(self, idx): 18 | idx_str = self.idx_strings[idx] 19 | data = self.ann[idx_str] 20 | img = Image.open(os.path.join(self.root, data["filename"])) 21 | if self.crop_images: 22 | img = img.crop( 23 | ( 24 | data["bbox_x"], 25 | data["bbox_y"], 26 | data["bbox_x"] + data["bbox_width"], 27 | data["bbox_y"] + data["bbox_height"], 28 | ) 29 | ) 30 | if self.transform is not None: 31 | img = self.transform(img) 32 | caption = data["caption"] 33 | negative_caption = data["negative_caption"] 34 | 35 | return img, [caption, negative_caption] 36 | 37 | def __len__(self): 38 | return len(self.ann) 39 | -------------------------------------------------------------------------------- 
/apps/pe/clip_benchmark/datasets/tfds.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | 4 | 5 | def download_tfds_dataset(name, data_dir=None): 6 | import tensorflow_datasets as tfds 7 | import timm 8 | 9 | builder = tfds.builder(name, data_dir=data_dir) 10 | builder.download_and_prepare() 11 | 12 | 13 | def disable_gpus_on_tensorflow(): 14 | import tensorflow as tf 15 | 16 | tf.config.set_visible_devices([], "GPU") 17 | 18 | 19 | class VTABIterableDataset(torch.utils.data.IterableDataset): 20 | 21 | def __init__( 22 | self, 23 | tfds_dataset, 24 | split="test", 25 | input_name="image", 26 | label_name="label", 27 | input_mode="RGB", 28 | transform=None, 29 | target_transform=None, 30 | classes=None, 31 | ): 32 | self.tfds_dataset = tfds_dataset 33 | self.input_name = input_name 34 | self.label_name = label_name 35 | self.transform = transform 36 | self.target_transform = target_transform 37 | self.input_mode = input_mode 38 | self.num_examples = tfds_dataset.get_num_samples(split) 39 | self.split = split 40 | if classes is None: 41 | self.classes = tfds_dataset._dataset_builder.info.features["label"].names 42 | else: 43 | self.classes = classes 44 | 45 | def __iter__(self): 46 | worker_info = torch.utils.data.get_worker_info() 47 | iterator = self.tfds_dataset.get_tf_data( 48 | self.split, batch_size=1, epochs=1, for_eval=True 49 | ) 50 | if worker_info is not None: 51 | iterator = iterator.shard( 52 | index=worker_info.id, num_shards=worker_info.num_workers 53 | ) 54 | nb = 0 55 | for data in iterator: 56 | inputs = data[self.input_name].numpy() 57 | labels = data[self.label_name].numpy() 58 | for input, label in zip(inputs, labels): 59 | input = Image.fromarray(input, mode=self.input_mode) 60 | if self.transform is not None: 61 | input = self.transform(input) 62 | if self.target_transform is not None: 63 | label = self.target_transform(label) 64 | yield input, label 65 | 66 | def __len__(self): 67 | return self.num_examples 68 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/video_classification_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import cv2 5 | import decord 6 | import torch 7 | from PIL import Image 8 | from torch.utils.data import Dataset 9 | 10 | 11 | class VideoClassificationDataset(Dataset): 12 | def __init__(self, dataset_dir_path, task_config, preprocessor, num_frames=8): 13 | self.dataset_dir_path = dataset_dir_path 14 | self.labels_txt = task_config["labels"] 15 | self.media_dir_path = os.path.join(dataset_dir_path, task_config["media"]) 16 | self.class_ids, self.classes = self.get_class_info() 17 | 18 | self.media_paths = [] 19 | self.labels = [] 20 | self.label_ids = [] 21 | 22 | for j, (class_id, class_name) in enumerate(zip(self.class_ids, self.classes)): 23 | class_dir_path = os.path.join(self.media_dir_path, class_id) 24 | for i, video_file_name in enumerate(os.listdir(class_dir_path)): 25 | video_path = os.path.join(class_dir_path, video_file_name) 26 | self.media_paths.append(video_path) 27 | self.labels.append(class_name) 28 | self.label_ids.append(j) 29 | 30 | self.preprocessor = preprocessor 31 | self.num_frames = num_frames 32 | 33 | def get_class_info(self): 34 | class_ids = [ 35 | dir_name 36 | for dir_name in os.listdir(self.media_dir_path) 37 | if os.path.isdir(os.path.join(self.media_dir_path, dir_name)) 38 | ] 39 | 40 
| if self.labels_txt: 41 | labels_txt_path = os.path.join(self.dataset_dir_path, self.labels_txt) 42 | id_to_class_name = {} 43 | with open(labels_txt_path, "r") as f: 44 | for line in f: 45 | id, class_name = line.strip().split(",") 46 | id_to_class_name[id] = class_name 47 | class_names = [id_to_class_name[id] for id in class_ids] 48 | else: 49 | class_names = class_ids 50 | 51 | def clean_label(label: str) -> str: 52 | """ 53 | Return a label without spaces or parenthesis 54 | """ 55 | for c in "()": 56 | label = label.replace(c, "") 57 | return label.strip("_") 58 | 59 | class_names = [clean_label(label) for label in class_names] 60 | 61 | return class_ids, class_names 62 | 63 | def __len__(self): 64 | return len(self.media_paths) 65 | 66 | def __getitem__(self, index): 67 | while True: 68 | media_path = self.media_paths[index] 69 | class_name = self.labels[index] 70 | class_id = self.label_ids[index] 71 | 72 | try: 73 | images = self._load_video(media_path) 74 | 75 | images = [ 76 | ( 77 | self.preprocessor(image.convert("RGB")) 78 | if image.mode == "L" 79 | else self.preprocessor(image) 80 | ) 81 | for image in images 82 | ] 83 | break 84 | except Exception as e: 85 | print(f"{e}, skipping {media_path}.") 86 | index = random.randint(0, len(self.media_paths) - 1) 87 | 88 | # Returns a list of images and one class_id. The model will need to aggregate across the list of images to make a prediction. 89 | return images, class_id 90 | 91 | def _load_video(self, media_path): 92 | vr = decord.VideoReader(media_path) 93 | total_frames = len(vr) 94 | if self.num_frames == 1: 95 | frame_indices = [total_frames // 2] 96 | else: 97 | frame_indices = [ 98 | int(i * (total_frames - 1) / (self.num_frames - 1)) 99 | for i in range(self.num_frames) 100 | ] 101 | 102 | try: 103 | images = vr.get_batch(frame_indices).asnumpy() 104 | except Exception as e: 105 | cap = cv2.VideoCapture(media_path) 106 | images = [] 107 | for pos in frame_indices: 108 | cap.set(cv2.CAP_PROP_POS_FRAMES, pos) 109 | ret, frame = cap.read() 110 | if ret: 111 | # Convert the frame from BGR to RGB 112 | rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 113 | images.append(rgb_frame) 114 | else: 115 | break 116 | 117 | images = [Image.fromarray(image) for image in images] 118 | 119 | return images 120 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/video_retrieval_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import cv2 5 | import decord 6 | import pandas as pd 7 | import torch 8 | from PIL import Image 9 | from torch.utils.data import Dataset 10 | 11 | 12 | class VideoRetrievalDataset(Dataset): 13 | def __init__( 14 | self, 15 | csv_path, 16 | dataset_dir, 17 | preprocessor, 18 | video_ext="mp4", 19 | num_frames=8, 20 | multi_sent=False, 21 | ): 22 | self.data = pd.read_csv(csv_path) 23 | self.dataset_dir = dataset_dir 24 | self.video_ext = video_ext 25 | 26 | self.preprocessor = preprocessor 27 | self.num_frames = num_frames 28 | self.multi_sent = multi_sent 29 | 30 | def __len__(self): 31 | return len(self.data) 32 | 33 | def __getitem__(self, index): 34 | video_id = self.data["video_id"].values[index] 35 | sentences = self.data["sentence"].values[index] 36 | if self.multi_sent: 37 | sentences = sentences.split("@") 38 | else: 39 | sentences = [sentences] 40 | video_path = os.path.join( 41 | self.dataset_dir, "{}.{}".format(video_id, self.video_ext) 42 | ) 43 | 44 | images 
= self._load_video(video_path) 45 | 46 | images = [ 47 | ( 48 | self.preprocessor(image.convert("RGB")) 49 | if image.mode == "L" 50 | else self.preprocessor(image) 51 | ) 52 | for image in images 53 | ] 54 | 55 | return images, sentences 56 | 57 | def _load_video(self, media_path): 58 | vr = decord.VideoReader(media_path) 59 | total_frames = len(vr) 60 | if self.num_frames == 1: 61 | frame_indices = [total_frames // 2] 62 | else: 63 | frame_indices = [ 64 | int(i * (total_frames - 1) / (self.num_frames - 1)) 65 | for i in range(self.num_frames) 66 | ] 67 | try: 68 | images = vr.get_batch(frame_indices).asnumpy() 69 | except Exception as e: 70 | cap = cv2.VideoCapture(media_path) 71 | images = [] 72 | for pos in frame_indices: 73 | cap.set(cv2.CAP_PROP_POS_FRAMES, pos) 74 | ret, frame = cap.read() 75 | if ret: 76 | # Convert the frame from BGR to RGB 77 | rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 78 | images.append(rgb_frame) 79 | else: 80 | break 81 | 82 | images = [Image.fromarray(image) for image in images] 83 | 84 | return images 85 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/winoground.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import torch 5 | from PIL import Image 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class WinoGround(Dataset): 10 | 11 | def __init__(self, root=".", transform=None): 12 | from datasets import load_dataset 13 | 14 | self.ds = load_dataset("facebook/winoground", cache_dir=root)["test"] 15 | self.transform = transform 16 | 17 | def __getitem__(self, idx): 18 | data = self.ds[idx] 19 | img0 = data["image_0"] 20 | img1 = data["image_1"] 21 | cap0 = data["caption_0"] 22 | cap1 = data["caption_1"] 23 | if self.transform is not None: 24 | img0 = self.transform(img0) 25 | img1 = self.transform(img1) 26 | imgs = torch.stack([img0, img1]) 27 | else: 28 | imgs = [img0, img1] 29 | caps = [cap0, cap1] 30 | return imgs, caps 31 | 32 | def __len__(self): 33 | return len(self.ds) 34 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/xtd200.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import os 4 | from subprocess import call 5 | 6 | import requests 7 | from PIL import Image 8 | from torchvision.datasets import VisionDataset 9 | 10 | from .flores_langs import flores_languages 11 | 12 | GITHUB_DATA_PATH = ( 13 | "https://raw.githubusercontent.com/visheratin/nllb-clip/main/data/xtd200/" 14 | ) 15 | SUPPORTED_LANGUAGES = flores_languages 16 | 17 | IMAGE_INDEX_FILENAME = "test_image_names.txt" 18 | 19 | CAPTIONS_FILENAME_TEMPLATE = "{}.txt" 20 | OUTPUT_FILENAME_TEMPLATE = "xtd200-{}.json" 21 | 22 | IMAGES_DOWNLOAD_URL = "https://nllb-data.com/test/xtd10/images.tar.gz" 23 | 24 | 25 | class XTD200(VisionDataset): 26 | def __init__(self, root, ann_file, transform=None, target_transform=None): 27 | super().__init__(root, transform=transform, target_transform=target_transform) 28 | self.ann_file = os.path.expanduser(ann_file) 29 | with codecs.open(ann_file, "r", encoding="utf-8") as fp: 30 | data = json.load(fp) 31 | self.data = [ 32 | (img_path, txt) 33 | for img_path, txt in zip(data["image_paths"], data["annotations"]) 34 | ] 35 | 36 | def __getitem__(self, index): 37 | img, captions = self.data[index] 38 | 39 | # Image 40 | img = Image.open(img).convert("RGB") 41 | if 
self.transform is not None: 42 | img = self.transform(img) 43 | 44 | # Captions 45 | target = [ 46 | captions, 47 | ] 48 | if self.target_transform is not None: 49 | target = self.target_transform(target) 50 | 51 | return img, target 52 | 53 | def __len__(self) -> int: 54 | return len(self.data) 55 | 56 | 57 | def _get_lines(url): 58 | response = requests.get(url, timeout=30) 59 | return response.text.splitlines() 60 | 61 | 62 | def _download_images(out_path): 63 | os.makedirs(out_path, exist_ok=True) 64 | print("Downloading images") 65 | call(f"wget {IMAGES_DOWNLOAD_URL} -O images.tar.gz", shell=True) 66 | call(f"tar -xzf images.tar.gz -C {out_path}", shell=True) 67 | call("rm images.tar.gz", shell=True) 68 | 69 | 70 | def create_annotation_file(root, lang_code): 71 | if lang_code not in SUPPORTED_LANGUAGES: 72 | raise ValueError( 73 | f"Language code {lang_code} not supported. Supported languages are {SUPPORTED_LANGUAGES}" 74 | ) 75 | data_dir = os.path.join(root, "xtd200") 76 | if not os.path.exists(data_dir): 77 | _download_images(data_dir) 78 | images_dir = os.path.join(data_dir, "images") 79 | print("Downloading xtd200 index file") 80 | download_path = os.path.join(GITHUB_DATA_PATH, IMAGE_INDEX_FILENAME) 81 | target_images = _get_lines(download_path) 82 | 83 | print("Downloading xtd200 captions:", lang_code) 84 | captions_path = GITHUB_DATA_PATH 85 | download_path = os.path.join( 86 | captions_path, CAPTIONS_FILENAME_TEMPLATE.format(lang_code) 87 | ) 88 | target_captions = _get_lines(download_path) 89 | 90 | number_of_missing_images = 0 91 | valid_images, valid_annotations, valid_indicies = [], [], [] 92 | for i, (img, txt) in enumerate(zip(target_images, target_captions)): 93 | image_path = os.path.join(images_dir, img) 94 | if not os.path.exists(image_path): 95 | print("Missing image file", img) 96 | number_of_missing_images += 1 97 | continue 98 | 99 | valid_images.append(image_path) 100 | valid_annotations.append(txt) 101 | valid_indicies.append(i) 102 | 103 | if number_of_missing_images > 0: 104 | print(f"*** WARNING *** missing {number_of_missing_images} files.") 105 | 106 | with codecs.open( 107 | os.path.join(root, OUTPUT_FILENAME_TEMPLATE.format(lang_code)), 108 | "w", 109 | encoding="utf-8", 110 | ) as fp: 111 | json.dump( 112 | { 113 | "image_paths": valid_images, 114 | "annotations": valid_annotations, 115 | "indicies": valid_indicies, 116 | }, 117 | fp, 118 | ensure_ascii=False, 119 | ) 120 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/metrics/__captioning.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from pycocoevalcap.bleu.bleu import Bleu 4 | from pycocoevalcap.cider.cider import Cider 5 | from pycocoevalcap.meteor.meteor import Meteor 6 | from pycocoevalcap.rouge.rouge import Rouge 7 | from pycocoevalcap.spice.spice import Spice 8 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 9 | # from open_clip import tokenize 10 | from tqdm.auto import tqdm 11 | 12 | # from open_clip.tokenizer import _tokenizer 13 | from core.vision_encoder.tokenizer import _tokenizer, tokenize 14 | 15 | """ 16 | Code adapted from https://github.com/salaniz/pycocoevalcap/blob/master/eval.py 17 | Thanks to @salaniz for the code! 
18 | """ 19 | 20 | 21 | class COCOEvalCap: 22 | def __init__(self, results): 23 | self.evalImgs = [] 24 | self.eval = {} 25 | self.imgToEval = {} 26 | self.results = results 27 | 28 | def evaluate(self): 29 | gts = {} 30 | res = {} 31 | for imgId, r in enumerate(self.results): 32 | gts[imgId] = r["true"] 33 | res[imgId] = r["gen"] 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print("tokenization...") 38 | tokenizer = PTBTokenizer() 39 | gts = tokenizer.tokenize(gts) 40 | res = tokenizer.tokenize(res) 41 | 42 | # ================================================= 43 | # Set up scorers 44 | # ================================================= 45 | print("setting up scorers...") 46 | scorers = [ 47 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 48 | (Meteor(), "METEOR"), 49 | (Rouge(), "ROUGE_L"), 50 | (Cider(), "CIDEr"), 51 | (Spice(), "SPICE"), 52 | ] 53 | 54 | # ================================================= 55 | # Compute scores 56 | # ================================================= 57 | for scorer, method in scorers: 58 | print("computing %s score..." % (scorer.method())) 59 | score, scores = scorer.compute_score(gts, res) 60 | if type(method) == list: 61 | for sc, scs, m in zip(score, scores, method): 62 | self.setEval(sc, m) 63 | self.setImgToEvalImgs(scs, gts.keys(), m) 64 | print("%s: %0.3f" % (m, sc)) 65 | else: 66 | self.setEval(score, method) 67 | self.setImgToEvalImgs(scores, gts.keys(), method) 68 | print("%s: %0.3f" % (method, score)) 69 | self.setEvalImgs() 70 | 71 | def setEval(self, score, method): 72 | self.eval[method] = score 73 | 74 | def setImgToEvalImgs(self, scores, imgIds, method): 75 | for imgId, score in zip(imgIds, scores): 76 | if not imgId in self.imgToEval: 77 | self.imgToEval[imgId] = {} 78 | self.imgToEval[imgId]["image_id"] = imgId 79 | self.imgToEval[imgId][method] = score 80 | 81 | def setEvalImgs(self): 82 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] 83 | 84 | 85 | def evaluate( 86 | model, 87 | dataloader, 88 | batch_size, 89 | device, 90 | transform, 91 | train_dataloader=None, 92 | num_workers=None, 93 | amp=True, 94 | verbose=False, 95 | ): 96 | results = [] 97 | image_id = 0 98 | gt = [] 99 | for idx, (img, captions) in enumerate(tqdm(dataloader)): 100 | out = model.generate(img.to(device)) 101 | decoded = [ 102 | _tokenizer.decode(i) 103 | .split("<end_of_text>")[0] 104 | .replace("<start_of_text>", "") 105 | .strip() 106 | for i in out.cpu().numpy() 107 | ] 108 | for pred, true in zip(decoded, captions): 109 | true = [{"caption": t} for t in true] 110 | pred = [{"caption": pred}] 111 | results.append({"image_id": image_id, "gen": pred, "true": true}) 112 | image_id += 1 113 | coco_eval = COCOEvalCap(results) 114 | coco_eval.evaluate() 115 | metrics = coco_eval.eval 116 | # print output evaluation scores 117 | for metric, score in metrics.items(): 118 | print(f"{metric}: {score:.3f}") 119 | return metrics 120 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/clip_benchmark/metrics/__init__.py -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/metrics/image_caption_selection.py:
-------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import suppress 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from open_clip import image_to_device 7 | from tqdm import tqdm 8 | 9 | 10 | def evaluate(model, dataloader, tokenizer, device, amp=True, args=None): 11 | """ 12 | Evaluate the model on the given dataset. 13 | The task has N instances, each instance has I images and C captions. 14 | For each instance, the goal is to find the correct image for each caption and the correct caption for each image. 15 | This is done by computing the similarities between each image and each caption. 16 | This procedure is used to evaluate the models on Winoground and SugarCrepe. 17 | 18 | Parameters 19 | ---------- 20 | 21 | model: torch.nn,Module 22 | CLIP-like model with `encode_image` and `encode_text` 23 | 24 | dataloader: torch.utils.data.Dataloader 25 | dataloader to use for evaluation 26 | 27 | tokenizer: 28 | text tokenizer, i.e. convert list of strings to torch.Tensor of integers 29 | 30 | device: cpu/cuda 31 | 32 | amp: whether to use automatic mixed precision 33 | 34 | Returns 35 | ------- 36 | 37 | dict of accuracy metrics 38 | """ 39 | autocast = torch.cuda.amp.autocast if amp else suppress 40 | image_score = [] 41 | text_score = [] 42 | score = [] 43 | for batch_images, batch_texts in tqdm(dataloader): 44 | # assert(len(batch_images.shape) == 4) 45 | batch_images = image_to_device( 46 | batch_images, 47 | device, 48 | torch.float32, 49 | mean=args.image_mean, 50 | std=args.image_std, 51 | ) 52 | # Because of the packing collate function we cannot support multi-image to caption selection 53 | nim = 1 54 | 55 | # tokenize all texts in the batch 56 | nt = len(batch_texts[0]) 57 | batch_texts_tok_ = tokenizer( 58 | [text for i, texts in enumerate(batch_texts) for text in texts] 59 | ).to(device) 60 | 61 | # compute the embedding of images and texts 62 | with torch.no_grad(), autocast(): 63 | batch_images_emb = F.normalize( 64 | model.encode_image(batch_images), dim=-1 65 | ).unsqueeze(1) 66 | B, _, emb_dim = batch_images_emb.shape 67 | batch_texts_emb = F.normalize( 68 | model.encode_text(batch_texts_tok_), dim=-1 69 | ).view(B, nt, -1) 70 | 71 | gt = torch.arange(min(nim, nt)).to(device) 72 | for i in range(B): 73 | # iteratve over instances 74 | 75 | # compute similarities between each image and each text 76 | images_emb = batch_images_emb[i] 77 | texts_emb = batch_texts_emb[i] 78 | scores = images_emb @ texts_emb.t() 79 | 80 | # i-th image should be matched to the i-th text 81 | image_closest_text = scores.argmax(dim=1)[: len(gt)] 82 | text_closest_image = scores.argmax(dim=0)[: len(gt)] 83 | pred_text_is_correct = (image_closest_text == gt).all().item() 84 | pred_image_is_correct = (text_closest_image == gt).all().item() 85 | all_correct = pred_text_is_correct and pred_image_is_correct 86 | image_score.append(pred_image_is_correct) 87 | text_score.append(pred_text_is_correct) 88 | score.append(all_correct) 89 | metrics = {} 90 | metrics["image_acc"] = torch.Tensor(image_score).float().mean().item() 91 | metrics["text_acc"] = torch.Tensor(text_score).float().mean().item() 92 | metrics["acc"] = torch.Tensor(score).float().mean().item() 93 | return metrics 94 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/metrics/multiclass_retrieval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import 
logging 3 | from contextlib import suppress 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | from clip_benchmark.metrics.zeroshot_retrieval import (dataloader_with_indices, 9 | recall_at_k) 10 | from tqdm import tqdm 11 | 12 | 13 | def evaluate( 14 | model, 15 | dataloader, 16 | tokenizer, 17 | device, 18 | amp=True, 19 | recall_k_list=[1], 20 | args=None, 21 | retrieval_template=None, 22 | ): 23 | """ 24 | Evaluate the model on the given dataset 25 | 26 | Parameters 27 | ---------- 28 | 29 | model: torch.nn,Module 30 | CLIP-like model with `encode_image` and `encode_text` 31 | 32 | dataloader: torch.utils.data.Dataloader 33 | dataloader to use for evaluation 34 | 35 | tokenizer: 36 | text tokenizer, i.e. convert list of strings to torch.Tensor of integers 37 | 38 | device: cpu/cuda 39 | 40 | amp: whether to use automatic mixed precision 41 | 42 | recall_k_list: list of int 43 | recall@k k's to use 44 | 45 | retrieval_template: 46 | dict of retrieval templates for each class. Retrieval templates should contain lists of image/text indexes. The model will performed retrieval accross the examples in each list. 47 | 48 | Returns 49 | ------- 50 | 51 | dict of retrieval metrics 52 | """ 53 | # list of batch of images embedding 54 | batch_images_emb_list = [] 55 | # list of batch of text embedding 56 | batch_texts_emb_list = [] 57 | # for each text, we collect the corresponding image index, as each image can have multiple corresponding texts 58 | texts_image_index = [] 59 | dataloader = dataloader_with_indices(dataloader) 60 | autocast = torch.cuda.amp.autocast if amp else suppress 61 | 62 | for batch_images, batch_texts, inds in tqdm(dataloader): 63 | # move the batch to the device 64 | batch_images = image_to_device( 65 | batch_images, 66 | device, 67 | torch.float32, 68 | mean=args.image_mean, 69 | std=args.image_std, 70 | ) 71 | 72 | # tokenize all texts in the batch 73 | batch_texts_tok = tokenizer( 74 | [text for i, texts in enumerate(batch_texts) for text in texts] 75 | ).to(device) 76 | 77 | # compute the embedding of images and texts 78 | with torch.no_grad(), autocast(): 79 | batch_images_emb = F.normalize(model.encode_image(batch_images), dim=-1) 80 | batch_texts_emb = F.normalize(model.encode_text(batch_texts_tok), dim=-1) 81 | 82 | batch_images_emb_list.append(batch_images_emb.cpu()) 83 | batch_texts_emb_list.append(batch_texts_emb.cpu()) 84 | 85 | batch_size = len(batch_images_emb_list[0]) 86 | 87 | # concatenate all embeddings 88 | images_emb = torch.cat(batch_images_emb_list) 89 | texts_emb = torch.cat(batch_texts_emb_list) 90 | 91 | assert images_emb.shape[0] == texts_emb.shape[0] 92 | 93 | # get the score for each text and image pair 94 | scores = texts_emb @ images_emb.t() 95 | 96 | metrics = {} 97 | multiclass_image_retrieval = [] 98 | multiclass_text_retrieval = [] 99 | for c in retrieval_template.keys(): 100 | 101 | image_retrieval = [] 102 | text_retrieval = [] 103 | for indexes in retrieval_template[c]: 104 | retrieved = scores[np.ix_(indexes, indexes)] 105 | positive_pairs = torch.zeros_like(retrieved, dtype=bool) 106 | positive_pairs[ 107 | torch.arange(len(retrieved)), torch.arange(len(retrieved)) 108 | ] = True 109 | 110 | image_retrieval.append(recall_at_k(retrieved, positive_pairs, k=1)) 111 | text_retrieval.append(recall_at_k(retrieved.T, positive_pairs, k=1)) 112 | 113 | average_image_retrieval = torch.cat(image_retrieval).float().mean().item() 114 | average_text_retrieval = torch.cat(text_retrieval).float().mean().item() 115 | 
116 | metrics[f"image_retrieval_recall@1_{c}"] = average_image_retrieval 117 | metrics[f"text_retrieval_recall@1_{c}"] = average_text_retrieval 118 | 119 | multiclass_image_retrieval.append(average_image_retrieval) 120 | multiclass_text_retrieval.append(average_text_retrieval) 121 | 122 | metrics["image_retrieval_recall@1_multiclass"] = ( 123 | torch.tensor(multiclass_image_retrieval).float().mean().item() 124 | ) 125 | metrics["text_retrieval_recall@1_multiclass"] = ( 126 | torch.tensor(multiclass_text_retrieval).float().mean().item() 127 | ) 128 | 129 | return metrics 130 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/model_collection.py: -------------------------------------------------------------------------------- 1 | # import open_clip 2 | 3 | 4 | def get_model_collection_from_file(path): 5 | return [l.strip().split(",") for l in open(path).readlines()] 6 | 7 | 8 | model_collection = { 9 | } 10 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/tasks/wds_benchmarks.txt: -------------------------------------------------------------------------------- 1 | # image classification 2 | wds/wds_imagenet1k 3 | wds/wds_imagenetv2 4 | wds/wds_imagenet-a 5 | wds/wds_imagenet-r 6 | wds/wds_imagenet_sketch 7 | 8 | # image retrieval 9 | wds/wds_mscoco_captions 10 | wds/wds_flickr30k 11 | 12 | # video classification 13 | k400_val 14 | 15 | # video retrieval 16 | msrvtt 17 | -------------------------------------------------------------------------------- /apps/pe/docs/assets/cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/cat.png -------------------------------------------------------------------------------- /apps/pe/docs/assets/dog.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/dog.mp4 -------------------------------------------------------------------------------- /apps/pe/docs/assets/dog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/dog.png -------------------------------------------------------------------------------- /apps/pe/docs/assets/spatial_correspondence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/spatial_correspondence.png -------------------------------------------------------------------------------- /apps/pe/docs/assets/spatial_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/spatial_features.png -------------------------------------------------------------------------------- /apps/pe/docs/assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/teaser.png 
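The `retrieval_template` argument documented in `multiclass_retrieval.evaluate` above groups example indexes per class; recall@1 is computed within each group (image i is the positive match for text i) and then averaged per class and across classes. A minimal sketch of such a template, with hypothetical class names and indexes:

```python
# Hypothetical retrieval template for multiclass_retrieval.evaluate: one entry per class,
# each entry a list of index groups. Within a group, the code above scores the
# len(indexes) x len(indexes) similarity sub-matrix against an identity positive-pairs matrix.
retrieval_template = {
    "animals": [[0, 3, 7], [12, 15, 21]],
    "vehicles": [[1, 4, 9, 16]],
}

# metrics = evaluate(model, dataloader, tokenizer, device,
#                    recall_k_list=[1], args=args, retrieval_template=retrieval_template)
# -> per-class keys such as "image_retrieval_recall@1_animals", plus the
#    "image_retrieval_recall@1_multiclass" / "text_retrieval_recall@1_multiclass" averages
```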
-------------------------------------------------------------------------------- /apps/pe/docs/evaluation.md: -------------------------------------------------------------------------------- 1 | # Zero-Shot CLIP Benchmark Evaluation 2 | Please download the supported datasets directly from their respective hosts and update the paths in clip_benchmark/datasets/builder.py, then run: 3 | ```bash 4 | model='PE-Core-G14-448' 5 | DATASETS=./clip_benchmark/tasks/wds_benchmarks.txt 6 | DATA_ROOT=DATA_ROOT/ 7 | 8 | python -m clip_benchmark.cli eval \ 9 | --model $model \ 10 | --pretrained $CHECKPOINT \ 11 | --dataset "$DATASETS" \ 12 | --dataset_root $DATA_ROOT \ 13 | --output "./benchmark_{pretrained}_{dataset}_{num_frames}_{model}_{language}_{task}.json" \ 14 | --force-preprocess-cfg resize_mode=squash 15 | 16 | ``` 17 | This script runs the zero-shot classification and retrieval benchmarks defined in clip_benchmark/tasks/wds_benchmarks.txt. The example above includes the following tasks: 18 | - ImageNet 1K classification 19 | - ImageNet v2 classification 20 | - ImageNet Adversarial classification 21 | - MS-COCO retrieval 22 | - Flickr30K retrieval 23 | - Kinetics 400 video classification 24 | - MSR-VTT video retrieval 25 | 26 | -------------------------------------------------------------------------------- /apps/plm/configs/datasets.yaml: -------------------------------------------------------------------------------- 1 | dummy_image: 2 | annotation: apps/plm/dummy_datasets/image/annotations.jsonl 3 | root_dir: apps/plm/dummy_datasets/image/images 4 | 5 | dummy_multi_image: 6 | annotation: apps/plm/dummy_datasets/multi_image/annotations.jsonl 7 | root_dir: apps/plm/dummy_datasets/multi_image/images 8 | 9 | dummy_image_region: 10 | annotation: apps/plm/dummy_datasets/image_region/annotations.jsonl 11 | root_dir: apps/plm/dummy_datasets/image_region/images 12 | 13 | dummy_video: 14 | annotation: apps/plm/dummy_datasets/video/annotations.jsonl 15 | root_dir: apps/plm/dummy_datasets/video/videos 16 | 17 | dummy_text: 18 | annotation: apps/plm/dummy_datasets/text/annotations.jsonl 19 | 20 | dummy_stc_RDCap: 21 | annotation: apps/plm/dummy_datasets/plm_stc/RDCap.jsonl 22 | root_dir: apps/plm/dummy_datasets/plm_stc/videos 23 | 24 | dummy_stc_RCap: 25 | annotation: apps/plm/dummy_datasets/plm_stc/RCap.jsonl 26 | root_dir: apps/plm/dummy_datasets/plm_stc/videos 27 | 28 | dummy_stc_RTLoc: 29 | annotation: apps/plm/dummy_datasets/plm_stc/RTLoc.jsonl 30 | root_dir: apps/plm/dummy_datasets/plm_stc/videos 31 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_1/plm_1b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 512 in stage # 1 for PLM-1B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=16,nodes=4,gpus_per_node=8 = 16*4*8 = 512 global batch size.
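# Any combination with batch_size * nodes * gpus_per_node = 512 keeps the same effective
# global batch size, e.g. batch_size=8, nodes=8, gpus_per_node=8 (8*8*8 = 512).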
3 | 4 | name: "plm_1b_stage1" 5 | dump_dir: ./plm_1b_stage1 6 | steps: 8000 7 | seed: 777 8 | optim: 9 | lr: 1e-4 10 | warmup: 20 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 2048 26 | n_layers: 16 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.5 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 1280 38 | freeze_language_model: true 39 | freeze_vision_model: true 40 | pooling_ratio: 1 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | use_ln_post: false 51 | pool_type: "none" 52 | mlp_init: 53 | use_gaussian: true 54 | 55 | data: 56 | datamix: 57 | num_workers: 8 58 | batch_size: 16 59 | image_res: 448 60 | max_num_tiles: 1 61 | max_video_frames: 8 62 | vision_input_type: vanilla 63 | tokenizer_path: facebook/Perception-LM-1B/tokenizer.model 64 | tokenizer_name: plmchat 65 | conversation_format: warmup 66 | 67 | profiling: 68 | run: false 69 | 70 | checkpoint: 71 | dump: 72 | every: 500 73 | keep: 1 74 | init_ckpt_path: meta-llama/Llama-3.2-1B-Instruct/original 75 | # Please use the script at apps/plm/interpolate_PE_pos_embed.py to interpolate PE-Core-L14-336 (https://huggingface.co/facebook/PE-Core-L14-336) checkpoints to 448 resolution. 76 | vision_model_path: facebook/PE-Core-L14-336-interpolated-to-448/model.pt 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_1/plm_3b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 512 in stage # 1 for PLM-3B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=16,nodes=4,gpus_per_node=8 = 16*4*8 = 512 global batch size. 
3 | 4 | name: "plm_3b_stage1" 5 | dump_dir: ./plm_3b_stage1 6 | steps: 8000 7 | seed: 777 8 | optim: 9 | lr: 1e-4 10 | warmup: 20 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 3072 26 | n_layers: 28 27 | n_heads: 24 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.0 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 1280 38 | freeze_language_model: true 39 | freeze_vision_model: true 40 | pooling_ratio: 1 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | use_ln_post: false 51 | pool_type: "none" 52 | mlp_init: 53 | use_gaussian: true 54 | 55 | data: 56 | datamix: 57 | num_workers: 8 58 | batch_size: 16 59 | image_res: 448 60 | max_num_tiles: 1 61 | max_video_frames: 8 62 | vision_input_type: vanilla 63 | tokenizer_path: facebook/Perception-LM-3B/tokenizer.model 64 | tokenizer_name: plmchat 65 | conversation_format: warmup 66 | 67 | profiling: 68 | run: false 69 | 70 | checkpoint: 71 | dump: 72 | every: 500 73 | keep: 1 74 | init_ckpt_path: meta-llama/Llama-3.2-3B-Instruct/original 75 | # Please use the script at apps/plm/interpolate_PE_pos_embed.py to interpolate PE-Core-L14-336 (https://huggingface.co/facebook/PE-Core-L14-336) checkpoints to 448 resolution. 76 | vision_model_path: facebook/PE-Core-L14-336-interpolated-to-448/model.pt 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_1/plm_8b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 512 in stage # 1 for PLM-8B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=16,nodes=4,gpus_per_node=8 = 16*4*8 = 512 global batch size. 
3 | 4 | name: "plm_8b_stage1" 5 | dump_dir: ./plm_8b_stage1 6 | steps: 8000 7 | seed: 777 8 | optim: 9 | lr: 1e-4 10 | warmup: 20 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.05 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 4096 26 | n_layers: 32 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.3 31 | multiple_of: 1024 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: false 35 | max_seqlen: 1280 36 | freeze_language_model: true 37 | freeze_vision_model: true 38 | pooling_ratio: 1 39 | vision_model: 40 | image_size: 448 41 | patch_size: 14 42 | width: 1536 43 | layers: 47 44 | heads: 16 45 | use_cls_token: false 46 | use_abs_posemb: true 47 | mlp_ratio: 5.833333334 48 | use_ln_post: false 49 | pool_type: "none" 50 | mlp_init: 51 | use_gaussian: true 52 | 53 | data: 54 | datamix: 55 | num_workers: 8 56 | batch_size: 16 57 | image_res: 448 58 | max_num_tiles: 1 59 | max_video_frames: 8 60 | vision_input_type: vanilla 61 | tokenizer_path: facebook/Perception-LM-8B/tokenizer.model 62 | tokenizer_name: plmchat 63 | conversation_format: warmup 64 | 65 | profiling: 66 | run: false 67 | 68 | checkpoint: 69 | dump: 70 | every: 500 71 | keep: 1 72 | init_ckpt_path: meta-llama/Llama-3.1-8B-Instruct/original 73 | vision_model_path: facebook/PE-Core-G14-448/model.pt 74 | is_consolidated_model: True 75 | 76 | logging: 77 | freq: 10 78 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 79 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_2/plm_1b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 2048 in stage # 2 for PLM-1B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=8,nodes=32,gpus_per_node=8 = 8*32*8 = 2048 global batch size.
3 | 4 | name: "plm_1b_stage2" 5 | dump_dir: ./plm_1b_stage2 6 | steps: 35000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 2048 26 | n_layers: 16 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.5 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 6144 38 | freeze_language_model: false 39 | freeze_vision_model: false 40 | pooling_ratio: 2 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | ls_init_value: 0.1 51 | drop_path: 0.1 52 | use_ln_post: false 53 | pool_type: "none" 54 | mlp_init: 55 | use_gaussian: true 56 | 57 | data: 58 | datamix: 59 | num_workers: 8 60 | batch_size: 4 61 | image_res: 448 62 | max_num_tiles: 16 63 | max_video_frames: 16 64 | vision_input_type: thumb+tile 65 | tokenizer_path: facebook/Perception-LM-1B/tokenizer.model 66 | tokenizer_name: plmchat 67 | conversation_format: plm_sft 68 | 69 | profiling: 70 | run: false 71 | 72 | checkpoint: 73 | dump: 74 | every: 500 75 | keep: 1 76 | init_ckpt_path: 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_2/plm_3b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 2048 in stage # 2 for PLM-3B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=8,nodes=32,gpus_per_node=8 = 8*32*8 = 2048 global batch size.
3 | 4 | name: "plm_3b_stage2" 5 | dump_dir: ./plm_3b_stage2 6 | steps: 35000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 3072 26 | n_layers: 28 27 | n_heads: 24 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.0 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 6144 38 | freeze_language_model: false 39 | freeze_vision_model: false 40 | pooling_ratio: 2 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | ls_init_value: 0.1 51 | drop_path: 0.1 52 | use_ln_post: false 53 | pool_type: "none" 54 | mlp_init: 55 | use_gaussian: true 56 | 57 | data: 58 | datamix: 59 | num_workers: 8 60 | batch_size: 4 61 | image_res: 448 62 | max_num_tiles: 16 63 | max_video_frames: 16 64 | vision_input_type: thumb+tile 65 | tokenizer_path: facebook/Perception-LM-3B/tokenizer.model 66 | tokenizer_name: plmchat 67 | conversation_format: plm_sft 68 | 69 | profiling: 70 | run: false 71 | 72 | checkpoint: 73 | dump: 74 | every: 500 75 | keep: 1 76 | init_ckpt_path: 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_2/plm_8b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 2048 in stage # 2 for PLM-8B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=4,nodes=64,gpus_per_node=8 = 4*64*8 = 2048 global batch size.
3 | 4 | name: "plm_8b_stage2" 5 | dump_dir: ./plm_8b_stage2 6 | steps: 35000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.05 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 4096 26 | n_layers: 32 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.3 31 | multiple_of: 1024 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: false 35 | max_seqlen: 6144 36 | freeze_language_model: false 37 | freeze_vision_model: false 38 | pooling_ratio: 2 39 | vision_model: 40 | image_size: 448 41 | patch_size: 14 42 | width: 1536 43 | layers: 47 44 | heads: 16 45 | use_cls_token: false 46 | use_abs_posemb: true 47 | mlp_ratio: 5.833333334 48 | ls_init_value: 0.1 49 | drop_path: 0.1 50 | use_ln_post: false 51 | pool_type: "none" 52 | mlp_init: 53 | use_gaussian: true 54 | 55 | data: 56 | datamix: 57 | num_workers: 4 58 | batch_size: 2 59 | image_res: 448 60 | max_num_tiles: 16 61 | max_video_frames: 16 62 | vision_input_type: thumb+tile 63 | tokenizer_path: facebook/Perception-LM-8B/tokenizer.model 64 | tokenizer_name: plmchat 65 | conversation_format: plm_sft 66 | 67 | profiling: 68 | run: false 69 | 70 | checkpoint: 71 | dump: 72 | every: 500 73 | keep: 1 74 | init_ckpt_path: 75 | is_consolidated_model: True 76 | 77 | logging: 78 | freq: 10 79 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 80 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_3/plm_1b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 1024 in stage # 3 for PLM-1B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=4,nodes=32,gpus_per_node=8 = 4*32*8 = 1024 global batch size. 
3 | 4 | name: "plm_1b_stage3" 5 | dump_dir: ./plm_1b_stage3 6 | steps: 21000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 2048 26 | n_layers: 16 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.5 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 11520 38 | freeze_language_model: false 39 | freeze_vision_model: false 40 | pooling_ratio: 2 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | ls_init_value: 0.1 51 | drop_path: 0.1 52 | use_ln_post: false 53 | pool_type: "none" 54 | mlp_init: 55 | use_gaussian: true 56 | 57 | data: 58 | datamix: 59 | num_workers: 4 60 | batch_size: 2 61 | image_res: 448 62 | max_num_tiles: 36 63 | max_video_frames: 32 64 | vision_input_type: thumb+tile 65 | tokenizer_path: facebook/Perception-LM-1B/tokenizer.model 66 | tokenizer_name: plmchat 67 | conversation_format: plm_sft 68 | 69 | profiling: 70 | run: false 71 | 72 | checkpoint: 73 | dump: 74 | every: 500 75 | keep: 1 76 | init_ckpt_path: 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_3/plm_3b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 1024 in stage # 3 for PLM-3B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=4,nodes=32,gpus_per_node=8 = 4*32*8 = 1024 global batch size. 
3 | 4 | name: "plm_3b_stage3" 5 | dump_dir: ./plm_3b_stage3 6 | steps: 21000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 3072 26 | n_layers: 28 27 | n_heads: 24 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.0 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 11520 38 | freeze_language_model: false 39 | freeze_vision_model: false 40 | pooling_ratio: 2 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | ls_init_value: 0.1 51 | drop_path: 0.1 52 | use_ln_post: false 53 | pool_type: "none" 54 | mlp_init: 55 | use_gaussian: true 56 | 57 | data: 58 | datamix: 59 | num_workers: 4 60 | batch_size: 2 61 | image_res: 448 62 | max_num_tiles: 36 63 | max_video_frames: 32 64 | vision_input_type: thumb+tile 65 | tokenizer_path: facebook/Perception-LM-3B/tokenizer.model 66 | tokenizer_name: plmchat 67 | conversation_format: plm_sft 68 | 69 | profiling: 70 | run: false 71 | 72 | checkpoint: 73 | dump: 74 | every: 500 75 | keep: 1 76 | init_ckpt_path: 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_3/plm_8b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 1024 in stage # 3 for PLM-8B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=2,nodes=64,gpus_per_node=8 = 2*64*8 = 1024 global batch size. 
3 | 4 | name: "plm_8b_stage3" 5 | dump_dir: ./plm_8b_stage3 6 | steps: 21000 7 | seed: 777 8 | optim: 9 | lr: 1e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.05 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 4096 26 | n_layers: 32 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.3 31 | multiple_of: 1024 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: false 35 | max_seqlen: 11520 36 | freeze_language_model: false 37 | freeze_vision_model: false 38 | pooling_ratio: 2 39 | vision_model: 40 | image_size: 448 41 | patch_size: 14 42 | width: 1536 43 | layers: 47 44 | heads: 16 45 | use_cls_token: false 46 | use_abs_posemb: true 47 | mlp_ratio: 5.833333334 48 | ls_init_value: 0.1 49 | drop_path: 0.1 50 | use_ln_post: false 51 | pool_type: "none" 52 | mlp_init: 53 | use_gaussian: true 54 | 55 | data: 56 | datamix: 57 | num_workers: 4 58 | batch_size: 2 59 | image_res: 448 60 | max_num_tiles: 36 61 | max_video_frames: 32 62 | vision_input_type: thumb+tile 63 | tokenizer_path: facebook/Perception-LM-8B/tokenizer.model 64 | tokenizer_name: plmchat 65 | conversation_format: plm_sft 66 | 67 | profiling: 68 | run: false 69 | 70 | checkpoint: 71 | dump: 72 | every: 500 73 | keep: 1 74 | init_ckpt_path: 75 | is_consolidated_model: True 76 | 77 | logging: 78 | freq: 10 79 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 80 | -------------------------------------------------------------------------------- /apps/plm/consolidate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | 5 | import torch 6 | from omegaconf import OmegaConf 7 | 8 | from apps.plm.transformer import LMTransformer, LMTransformerArgs 9 | from core.args import dataclass_from_dict 10 | from core.checkpoint import load_from_checkpoint 11 | 12 | 13 | def build_model( 14 | ref_model_path: str, 15 | model_cls=LMTransformer, 16 | model_args_cls=LMTransformerArgs, 17 | ): 18 | ckpt_path = Path(ref_model_path) 19 | config = ckpt_path / "params.json" 20 | config = OmegaConf.load(config) 21 | 22 | model_args = dataclass_from_dict(model_args_cls, config.model, strict=False) 23 | model = model_cls(model_args) 24 | return model 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser(description="Consolidate PLM checkpoints") 29 | parser.add_argument( 30 | "--ckpt", 31 | type=str, 32 | required=True, 33 | help="Path to the checkpoint directory to consolidate", 34 | ) 35 | args = parser.parse_args() 36 | 37 | model = build_model(ref_model_path=args.ckpt) 38 | load_from_checkpoint( 39 | ckpt_dir=args.ckpt, 40 | model=model, 41 | optimizer=None, 42 | model_key="model", 43 | ) 44 | 45 | consolidated_model_state_dict = model.state_dict() 46 | output_file = os.path.join(args.ckpt, "consolidated.pth") 47 | 48 | # Save the consolidated model state_dict using torch.save 49 | print(f"Saving consolidated model state_dict to: {output_file}") 50 | torch.save(consolidated_model_state_dict, output_file) 51 | print("Consolidated checkpoint saved successfully.") 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- 
/apps/plm/dataset_conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | 3 | import os 4 | from dataclasses import dataclass 5 | from typing import Optional 6 | 7 | import yaml 8 | 9 | 10 | @dataclass 11 | class DatasetConf: 12 | name: str = "" 13 | annotation: str = "" 14 | root_dir: Optional[str] = None 15 | 16 | 17 | def read_yaml_to_configs(yaml_file_path: str) -> dict: 18 | with open(yaml_file_path, "r", encoding="utf-8") as file: 19 | yaml_data = yaml.safe_load(file) 20 | 21 | dataset_config = {} 22 | for dataset_name, dataset_info in yaml_data.items(): 23 | dataset_config[dataset_name] = DatasetConf( 24 | name=dataset_name, 25 | annotation=dataset_info["annotation"], 26 | root_dir=dataset_info.get("root_dir"), 27 | ) 28 | 29 | return dataset_config 30 | 31 | 32 | # Determine the directory of the current script 33 | current_directory = os.path.dirname(os.path.abspath(__file__)) 34 | # Construct the path to the datasets.yaml file 35 | yaml_file_path = os.path.join(current_directory, "configs", "datasets.yaml") 36 | # Read the YAML file 37 | dataset_config = read_yaml_to_configs(yaml_file_path) 38 | -------------------------------------------------------------------------------- /apps/plm/docs/evaluation.md: -------------------------------------------------------------------------------- 1 | # Evaluating Perception Language Model (PLM) 2 | 3 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM 1B-Model-blue)](https://huggingface.co/facebook/Perception-LM-1B) 4 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM 3B-Model-blue)](https://huggingface.co/facebook/Perception-LM-3B) 5 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM 8B-Model-blue)](https://huggingface.co/facebook/Perception-LM-8B) 6 | 7 | We have added our model and benchmarks to [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/lmms_eval/models/plm.py) to support reproducing our reported results on multiple image and video benchmarks. 8 | 9 | --- 10 | 11 | ## Getting Started 12 | 1. Install perception_models following the instructions in the [`Main README`](../../../README.md). 13 | 2. Install `lmms-eval`: 14 | ``` 15 | git clone https://github.com/EvolvingLMMs-Lab/lmms-eval.git 16 | cd lmms-eval 17 | pip install -e . 18 | ``` 19 | 20 | ## Run Evaluation on Standard Image and Video Tasks 21 | You can use the following command to run the evaluation. 22 | 23 | ```shell 24 | 25 | # Use facebook/Perception-LM-1B for the 1B-parameter model and facebook/Perception-LM-8B for the 8B-parameter model. 26 | CHECKPOINTS_PATH=facebook/Perception-LM-3B 27 | 28 | # Define the tasks you want to evaluate PLM on. We support all tasks available in lmms-eval; however, we have tested the following tasks with our models. 29 | 30 | ALL_TASKS=( 31 | "docvqa" "chartqa" "textvqa" "infovqa" "ai2d_no_mask" "ok_vqa" "vizwiz_vqa" "mme" 32 | "realworldqa" "pope" "mmmu" "ocrbench" "coco_karpathy_val" "nocaps" "vqav2_val" 33 | "mvbench" "videomme" "vatex_test" "egoschema" "egoschema_subset" "mlvu_dev" 34 | "tempcompass_multi_choice" "perceptiontest_val_mc" "perceptiontest_test_mc" 35 | ) 36 | 37 | # After specifying the task/tasks to evaluate, run the following command to start the evaluation. 
38 | SELECTED_TASK="textvqa,videomme" 39 | accelerate launch --num_processes=8 \ 40 | -m lmms_eval \ 41 | --model plm \ 42 | --model_args pretrained=$CHECKPOINTS_PATH \ 43 | --tasks $SELECTED_TASK \ 44 | --batch_size 1 \ 45 | --log_samples \ 46 | --log_samples_suffix plm \ 47 | --output_path $OUTPUT_PATH 48 | ``` 49 | -------------------------------------------------------------------------------- /apps/plm/docs/plm_main_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/plm/docs/plm_main_fig.png -------------------------------------------------------------------------------- /apps/plm/docs/plm_videobench.md: -------------------------------------------------------------------------------- 1 | # PLM-VideoBench 2 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM‑VideoBench-BenchMark-blue)](https://huggingface.co/datasets/facebook/PLM-VideoBench) 3 | 4 | As part of the PLM release, we provide a comprehensive set of video benchmarks (grouped as `PLM-VideoBench`) for detailed video understanding. PLM-VideoBench includes the following sub-benchmarks: 5 | 1. **Fine-Grained Question Answering (FGQA):** In this task, a model must answer a multiple-choice question (MCQ) 6 | that probes fine-grained activity understanding. 7 | 2. **Smart Glasses Question Answering (SGQA):** In this task, a model must answer open-ended questions about 8 | activities and objects visible in an egocentric video stream recorded by Meta VR Glasses. 9 | 3. **Video Region Captioning (RCap):** In this task, the model must generate a detailed description of an event 10 | involving a subject of interest in the video. 11 | 4. **Region Temporal Localization (RTLoc):** In this task, the model must identify the precise time interval within the video when the specified event takes place for the given subject. 12 | 5. **Region Dense Video Captioning (RDCap):** In this task, a model must generate a detailed description of all events involving a specific subject of interest in a video. 13 | 14 | > [!TIP] 15 | > We have added all `PLM-VideoBench` tasks to [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks/plm_videobench). This makes it easy to reproduce PLM results and also allows other models to be tested on the benchmarks. 16 | 17 | You can use the following command to evaluate PLM on PLM-VideoBench. 18 | 19 | ```shell 20 | 21 | # Use facebook/Perception-LM-1B for the 1B-parameter model and facebook/Perception-LM-8B for the 8B-parameter model. 22 | CHECKPOINTS_PATH=facebook/Perception-LM-3B 23 | 24 | # PLM-VideoBench Tasks 25 | SELECTED_TASK=fgqa_test,sgqa_test,rtloc_test,rcap_test,rdcap_test 26 | OUTPUT_PATH="plm_videobench_evaluation" 27 | 28 | accelerate launch --num_processes=8 \ 29 | -m lmms_eval \ 30 | --model plm \ 31 | --model_args pretrained=$CHECKPOINTS_PATH \ 32 | --tasks $SELECTED_TASK \ 33 | --batch_size 1 \ 34 | --log_samples \ 35 | --log_samples_suffix plm \ 36 | --output_path $OUTPUT_PATH 37 | ``` 38 | 39 | ## Results 40 | 41 | We evaluate PLM against baselines on PLM-VideoBench and 42 | report per-task breakdowns. Human performance is reported in the first row. 43 | | Model | FGQA (MBacc) | SGQA (Acc) | RDCap (SODA) | RCap (Score) | RTLoc (meanR) | Avg. | 44 | |------------------|------|------|------------|------------|-------------|------| 45 | | Human perf. 
| 90.9 | 67.9 | 66.6 | 53.9 | 67.8 | 73.9 | 46 | | GPT-4o | 61.2 | **63.7** | 20.9 | 35.7 | 33.1 | 51.6 | 47 | | Gemini 1.5 Pro | 57.1 | 49.9 | 14.4 | 33.1 | 27.6 | 44.0 | 48 | | Gemini 2.0 Flash | 58.7 | 44.8 | 13.2 | 30.9 | 27.6 | 42.5 | 49 | | LLaVA-OV-7B | 40.2 | 41.5 | 4.7 | 24.4 | 13.9 | 32.0 | 50 | | Qwen2VL-7B | 49.2 | 44.5 | 4.1 | 17.6 | 15.1 | 35.3 | 51 | | Qwen2.5VL-7B | 49.8 | 43.0 | 2.5 | 21.5 | 10.7 | 34.8 | 52 | | InternVL2-8B | 47.7 | 45.9 | 1.2 | 21.5 | 11.6 | 35.0 | 53 | | InternVL2.5-8B | 53.7 | 48.3 | 5.7 | 26.1 | 8.8 | 38.5 | 54 | | PLM-8B | **67.7** | 46.2 | **52.8** | **46.6** | **59.1** | **55.6** | 55 | -------------------------------------------------------------------------------- /apps/plm/docs/training.md: -------------------------------------------------------------------------------- 1 | # Training Perception Language Model (PLM) 2 | 3 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM Synthetic-Image-blue)](https://huggingface.co/datasets/facebook/PLM-Image-Auto) 4 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM Synthetic-Video-blue)](https://huggingface.co/datasets/facebook/PLM-Video-Auto) 5 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM Human-Video-blue)](https://huggingface.co/datasets/facebook/PLM-Video-Human) 6 | 7 | We provide instructions to train or finetune PLM on a custom dataset. 8 | 9 | --- 10 | 11 | > [!TIP] 12 | > We provide configurations for each training stage ([`stage_1`](../configs/stage_1/), [`stage_2`](../configs/stage_2/), and [`stage_3`](../configs/stage_3/)) to facilitate reproducibility of PLM training. 13 | 14 | 15 | ## Data Format :open_file_folder: 16 | 17 | We support both image and video conversation datasets in `jsonl` format. Each line of the `jsonl` file should follow the format below: 18 | 19 | ### For Image Conversation Dataset 20 | ```json 21 | { 22 | "image": "", 23 | "conversations": [ 24 | { 25 | "from": "human", 26 | "value": "human instruction" 27 | }, 28 | { 29 | "from": "assistant", 30 | "value": "model response" 31 | } 32 | ] 33 | } 34 | ``` 35 | 36 | ### For Video Conversation Dataset 37 | ```json 38 | { 39 | "video": "