├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.PE ├── LICENSE.PLM ├── README.md ├── apps ├── detection │ ├── DETA_pe │ │ ├── README.md │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── coco.py │ │ │ ├── coco_eval.py │ │ │ ├── coco_panoptic.py │ │ │ ├── data_prefetcher.py │ │ │ ├── objects365.py │ │ │ ├── panoptic_eval.py │ │ │ ├── samplers.py │ │ │ ├── torchvision_datasets │ │ │ │ ├── __init__.py │ │ │ │ └── coco.py │ │ │ └── transforms.py │ │ ├── engine.py │ │ ├── engine_tta.py │ │ ├── main.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── assigner.py │ │ │ ├── backbone.py │ │ │ ├── deformable_detr.py │ │ │ ├── deformable_transformer.py │ │ │ ├── matcher.py │ │ │ ├── ops │ │ │ │ ├── functions │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn_func.py │ │ │ │ ├── make.sh │ │ │ │ ├── modules │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── ms_deform_attn.py │ │ │ │ ├── setup.py │ │ │ │ ├── src │ │ │ │ │ ├── cpu │ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp │ │ │ │ │ │ └── ms_deform_attn_cpu.h │ │ │ │ │ ├── cuda │ │ │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ │ │ ├── ms_deform_attn_cuda.h │ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh │ │ │ │ │ ├── ms_deform_attn.h │ │ │ │ │ └── vision.cpp │ │ │ │ └── test.py │ │ │ ├── pev1.py │ │ │ ├── position_encoding.py │ │ │ ├── segmentation.py │ │ │ ├── swin.py │ │ │ ├── utils_d2.py │ │ │ ├── utils_fed_loss.py │ │ │ └── utils_softnms.py │ │ ├── scripts │ │ │ ├── eval.sh │ │ │ ├── eval_1824pix.sh │ │ │ ├── eval_tta_slurm.sh │ │ │ ├── eval_tta_slurm_1824pix.sh │ │ │ ├── finetune_further_spatial_Gwin384_cocoep3_1824pix_8node.sh │ │ │ ├── finetune_spatial_Gwin384_cocoep12_1728pix_8node.sh │ │ │ ├── pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node.sh │ │ │ └── pretrain_spatial_Gwin384_o365ep12_1024pix_16node.sh │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── box_ops.py │ │ │ ├── ema.py │ │ │ ├── misc.py │ │ │ └── plot_utils.py │ ├── INSTALL.md │ ├── README.md │ ├── detectron2_pe │ │ ├── __init__.py │ │ ├── checkpoint │ │ │ ├── __init__.py │ │ │ └── detection_checkpoint.py │ │ └── modeling │ │ │ ├── __init__.py │ │ │ └── backbone │ │ │ ├── __init__.py │ │ │ └── pev1_det.py │ ├── projects │ │ └── ViTDet │ │ │ └── configs │ │ │ ├── COCO │ │ │ ├── mask_rcnn_PEcore_G_coco75ep.py │ │ │ ├── mask_rcnn_PEspatial_G_coco36ep.py │ │ │ └── mask_rcnn_vitdet_b_100ep.py │ │ │ ├── LVIS │ │ │ ├── mask_rcnn_PEcore_G_lvis75ep.py │ │ │ └── mask_rcnn_PEspatial_G_lvis75ep.py │ │ │ └── common │ │ │ └── coco_loader_lsj.py │ ├── scripts │ │ ├── coco │ │ │ ├── train_mask_rcnn_PEcore_G_coco75ep.sh │ │ │ └── train_mask_rcnn_PEspatial_G_coco36ep.sh │ │ ├── evaluate_local.sh │ │ └── lvis │ │ │ ├── train_mask_rcnn_PEcore_G_lvis75ep.sh │ │ │ └── train_mask_rcnn_PEspatial_G_lvis75ep.sh │ └── tools │ │ ├── convert_d2.py │ │ ├── lazyconfig_train_net_pe.py │ │ └── lazyconfig_train_net_pe_slurm.py ├── pe │ ├── README.md │ ├── clip_benchmark │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── babel_imagenet.py │ │ │ ├── builder.py │ │ │ ├── caltech101.py │ │ │ ├── crossmodal3600.py │ │ │ ├── en_classnames.json │ │ │ ├── en_zeroshot_classification_templates.json │ │ │ ├── flickr.py │ │ │ ├── flickr30k_200.py │ │ │ ├── flores_langs.py │ │ │ ├── imagenetv2.py │ │ │ ├── kitti.py │ │ │ ├── multilingual_mscoco.py │ │ │ ├── objectnet.py │ │ │ ├── pos_neg_caption_dataset.py │ │ │ ├── tfds.py │ │ │ ├── video_classification_dataset.py │ │ │ ├── video_retrieval_dataset.py │ │ │ ├── voc2007.py │ │ │ ├── winoground.py │ │ │ └── xtd200.py │ │ ├── metrics │ │ │ ├── 
__captioning.py │ │ │ ├── __init__.py │ │ │ ├── image_caption_selection.py │ │ │ ├── linear_probe.py │ │ │ ├── multiclass_retrieval.py │ │ │ ├── visualization.py │ │ │ ├── zeroshot_classification.py │ │ │ └── zeroshot_retrieval.py │ │ ├── model_collection.py │ │ ├── tasks │ │ │ └── wds_benchmarks.txt │ │ └── webdataset_builder.py │ └── docs │ │ ├── assets │ │ ├── cat.png │ │ ├── dog.mp4 │ │ ├── dog.png │ │ ├── spatial_correspondence.png │ │ ├── spatial_features.png │ │ └── teaser.png │ │ ├── evaluation.md │ │ └── pe_demo.ipynb └── plm │ ├── README.md │ ├── configs │ ├── datasets.yaml │ ├── stage_1 │ │ ├── plm_1b.yaml │ │ ├── plm_3b.yaml │ │ └── plm_8b.yaml │ ├── stage_2 │ │ ├── plm_1b.yaml │ │ ├── plm_3b.yaml │ │ └── plm_8b.yaml │ └── stage_3 │ │ ├── plm_1b.yaml │ │ ├── plm_3b.yaml │ │ └── plm_8b.yaml │ ├── consolidate.py │ ├── dataset_conf.py │ ├── docs │ ├── evaluation.md │ ├── finetune_example.md │ ├── plm_main_fig.png │ ├── plm_videobench.md │ └── training.md │ ├── generate.py │ ├── interpolate_PE_pos_embed.py │ ├── notebook_demos │ ├── image_and_video_captioning.ipynb │ ├── image_grounding.ipynb │ └── multi_image_understanding.ipynb │ ├── tokenizer.py │ ├── train.py │ └── transformer.py ├── core ├── args.py ├── checkpoint.py ├── data │ ├── conversation.py │ ├── data.py │ ├── data_collators.py │ ├── data_mixer.py │ ├── dataloader.py │ └── preprocessor.py ├── distributed.py ├── logger.py ├── metrics.py ├── optim.py ├── probe.py ├── profiling.py ├── stool.py ├── tests │ ├── Rock-climbing-Canada-1920x1147.jpg │ ├── dataloader_test.py │ ├── llama3_tokenizer_test.py │ ├── ocrbench_centre.jpg │ └── selfie_cathedral_peak.jpg ├── tokenizer.py ├── transformer.py ├── transforms │ ├── image_transform.py │ ├── region_transform.py │ └── video_transform.py ├── utils.py ├── vision_encoder │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── config.py │ ├── pe.py │ ├── rope.py │ ├── tokenizer.py │ └── transforms.py └── vision_projector │ ├── base.py │ └── mlp.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .vscode 3 | *.ipynb 4 | slurm-*.out 5 | wandb 6 | data/* 7 | data-gym-cache/* 8 | torchinductor_*/* 9 | tmp*/* 10 | apps/plm/dummy_datasets 11 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | This Code of Conduct also applies outside the project spaces when there is a 56 | reasonable belief that an individual's behavior may have a negative impact on 57 | the project or its community. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported by contacting the project team at . All 63 | complaints will be reviewed and investigated and will result in a response that 64 | is deemed necessary and appropriate to the circumstances. The project team is 65 | obligated to maintain confidentiality with regard to the reporter of an incident. 66 | Further details of specific enforcement policies may be posted separately. 67 | 68 | Project maintainers who do not follow or enforce the Code of Conduct in good 69 | faith may face temporary or permanent repercussions as determined by other 70 | members of the project's leadership. 
71 | 72 | ## Attribution 73 | 74 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 75 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 76 | 77 | [homepage]: https://www.contributor-covenant.org 78 | 79 | For answers to common questions about this code of conduct, see 80 | https://www.contributor-covenant.org/faq 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Perception Models 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Pull Requests 6 | We actively welcome your pull requests. 7 | 8 | 1. Fork the repo and create your branch from `main`. 9 | 2. If you've added code that should be tested, add tests. 10 | 3. If you've changed APIs, update the documentation. 11 | 4. Ensure the test suite passes. 12 | 5. Make sure your code lints. 13 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 14 | 15 | ## Contributor License Agreement ("CLA") 16 | In order to accept your pull request, we need you to submit a CLA. You only need 17 | to do this once to work on any of Facebook's open source projects. 18 | 19 | Complete your CLA here: 20 | 21 | ## Issues 22 | We use GitHub issues to track public bugs. Please ensure your description is 23 | clear and has sufficient instructions to be able to reproduce the issue. 24 | 25 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 26 | disclosure of security bugs. In those cases, please go through the process 27 | outlined on that page and do not file a public issue. 28 | 29 | ## License 30 | By contributing to Perception Models, you agree that your contributions will be licensed 31 | under the LICENSE file in the root directory of this source tree. 32 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/README.md: -------------------------------------------------------------------------------- 1 | # SOTA COCO Object Detection with PE 2 | 3 | ## Getting started 4 | 5 | Please refer to [INSTALL.md](../INSTALL.md) for installation and dataset preparation instructions. 6 | 7 | Also install [Deformable Attention](models/ops/make.sh) ops. 8 | 9 | ## Results and Fine-tuned Models 10 |
| detector | vision encoder | box AP | box (TTA) AP | download |
| :------: | :------------: | :----: | :----------: | :------: |
| DETA     | PE spatial G   | 65.2   | 66.0         | model    |
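The Deformable Attention extension referenced under Getting started has to be compiled before any of the training or evaluation commands below will run. A minimal sketch of the build, following `models/ops/make.sh` and the optional check in `models/ops/test.py` (the `cd` assumes you are inside `apps/detection/DETA_pe` with a CUDA-enabled PyTorch install):

```
# Build and install the MultiScaleDeformableAttention extension
# (this is what models/ops/make.sh runs via models/ops/setup.py).
cd models/ops
python setup.py build install

# Optional sanity check: compares the CUDA kernels against the pure-PyTorch
# reference (ms_deform_attn_core_pytorch) and runs numerical gradient checks.
python test.py
```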
28 | 29 | 30 | ## Training 31 | We apply a four-stage training, Objects365(12ep, 1024pix), Objects365(6ep, 1536pix), COCO(12ep, 1728pix), COCO(3ep, 1824pix) 32 | 33 | ``` 34 | sbatch scripts/pretrain_spatial_Gwin384_o365ep12_1024pix_16node.sh 35 | 36 | sbatch scripts/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node.sh 37 | 38 | sbatch scripts/finetune_spatial_Gwin384_cocoep12_1728pix_8node.sh 39 | 40 | sbatch scripts/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node.sh 41 | 42 | ``` 43 | 44 | ## Evaluation 45 | ``` 46 | bash scripts/eval_1824pix.sh --resume deta_coco_1824pix.pth 47 | ``` 48 | 49 | ## Evaluation with TTA (Test-Time Augmentation) 50 | ``` 51 | sbatch scripts/eval_tta_slurm_1824pix.sh --resume deta_coco_1824pix.pth 52 | ``` 53 | Note: If you get 65.9 AP, it is probably caused by different package versions, trying different hyperparameters like `--quad_scale 0.4` will give 66.0 AP. 54 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import torch.utils.data 11 | 12 | from .coco import build as build_coco 13 | from .objects365 import build as build_objects365 14 | from .torchvision_datasets import CocoDetection 15 | 16 | 17 | def get_coco_api_from_dataset(dataset): 18 | for _ in range(10): 19 | # if isinstance(dataset, torchvision.datasets.CocoDetection): 20 | # break 21 | if isinstance(dataset, torch.utils.data.Subset): 22 | dataset = dataset.dataset 23 | if isinstance(dataset, CocoDetection): 24 | return dataset.coco 25 | 26 | 27 | def build_dataset(image_set, args): 28 | if args.dataset_file == "objects365": 29 | return build_objects365(image_set, args) 30 | if args.dataset_file == "coco": 31 | return build_coco(image_set, args) 32 | if args.dataset_file == "coco_panoptic": 33 | # to avoid making panopticapi required for coco 34 | from .coco_panoptic import build as build_coco_panoptic 35 | 36 | return build_coco_panoptic(image_set, args) 37 | raise ValueError(f"dataset {args.dataset_file} not supported") 38 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/coco_panoptic.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import torch 15 | from PIL import Image 16 | 17 | from panopticapi.utils import rgb2id 18 | from util.box_ops import masks_to_boxes 19 | 20 | from .coco import make_coco_transforms 21 | 22 | 23 | class CocoPanoptic: 24 | def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): 25 | with open(ann_file, 'r') as f: 26 | self.coco = json.load(f) 27 | 28 | # sort 'images' field so that they are aligned with 'annotations' 29 | # i.e., in alphabetical order 30 | self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) 31 | # sanity check 32 | if "annotations" in self.coco: 33 | for img, ann in zip(self.coco['images'], self.coco['annotations']): 34 | assert img['file_name'][:-4] == ann['file_name'][:-4] 35 | 36 | self.img_folder = img_folder 37 | self.ann_folder = ann_folder 38 | self.ann_file = ann_file 39 | self.transforms = transforms 40 | self.return_masks = return_masks 41 | 42 | def __getitem__(self, idx): 43 | ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] 44 | img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') 45 | ann_path = Path(self.ann_folder) / ann_info['file_name'] 46 | 47 | img = Image.open(img_path).convert('RGB') 48 | w, h = img.size 49 | if "segments_info" in ann_info: 50 | masks = np.asarray(Image.open(ann_path), dtype=np.uint32) 51 | masks = rgb2id(masks) 52 | 53 | ids = np.array([ann['id'] for ann in ann_info['segments_info']]) 54 | masks = masks == ids[:, None, None] 55 | 56 | masks = torch.as_tensor(masks, dtype=torch.uint8) 57 | labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) 58 | 59 | target = {} 60 | target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) 61 | if self.return_masks: 62 | target['masks'] = masks 63 | target['labels'] = labels 64 | 65 | target["boxes"] = masks_to_boxes(masks) 66 | 67 | target['size'] = torch.as_tensor([int(h), int(w)]) 68 | target['orig_size'] = torch.as_tensor([int(h), int(w)]) 69 | if "segments_info" in ann_info: 70 | for name in ['iscrowd', 'area']: 71 | target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) 72 | 73 | if self.transforms is not None: 74 | img, target = self.transforms(img, target) 75 | 76 | return img, target 77 | 78 | def __len__(self): 79 | return len(self.coco['images']) 80 | 81 | def get_height_and_width(self, idx): 82 | img_info = self.coco['images'][idx] 83 | height = img_info['height'] 84 | width = img_info['width'] 85 | return height, width 86 | 87 | 88 | def build(image_set, args): 89 | img_folder_root = Path(args.coco_path) 90 | ann_folder_root = Path(args.coco_panoptic_path) 91 | assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' 92 | assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' 93 | mode = 'panoptic' 94 | PATHS = { 95 | "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), 96 | "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), 97 | } 98 | 99 | img_folder, ann_file = PATHS[image_set] 100 | img_folder_path = img_folder_root / img_folder 101 | ann_folder = ann_folder_root / f'{mode}_{img_folder}' 102 | ann_file = ann_folder_root / ann_file 103 | 104 | dataset = 
CocoPanoptic(img_folder_path, ann_folder, ann_file, 105 | transforms=make_coco_transforms(image_set), return_masks=args.masks) 106 | 107 | return dataset 108 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | import torch 8 | 9 | def to_cuda(samples, targets, device): 10 | samples = samples.to(device, non_blocking=True) 11 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 12 | return samples, targets 13 | 14 | class data_prefetcher(): 15 | def __init__(self, loader, device, prefetch=True): 16 | self.loader = iter(loader) 17 | self.prefetch = prefetch 18 | self.device = device 19 | if prefetch: 20 | self.stream = torch.cuda.Stream() 21 | self.preload() 22 | 23 | def preload(self): 24 | try: 25 | self.next_samples, self.next_targets = next(self.loader) 26 | except StopIteration: 27 | self.next_samples = None 28 | self.next_targets = None 29 | return 30 | # if record_stream() doesn't work, another option is to make sure device inputs are created 31 | # on the main stream. 32 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 33 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 34 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 35 | # at the time we start copying to next_*: 36 | # self.stream.wait_stream(torch.cuda.current_stream()) 37 | with torch.cuda.stream(self.stream): 38 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 39 | # more code for the alternative if record_stream() doesn't work: 40 | # copy_ will record the use of the pinned source tensor in this side stream. 41 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 42 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 43 | # self.next_input = self.next_input_gpu 44 | # self.next_target = self.next_target_gpu 45 | 46 | # With Amp, it isn't necessary to manually convert data to half. 47 | # if args.fp16: 48 | # self.next_input = self.next_input.half() 49 | # else: 50 | 51 | def next(self): 52 | if self.prefetch: 53 | torch.cuda.current_stream().wait_stream(self.stream) 54 | samples = self.next_samples 55 | targets = self.next_targets 56 | if samples is not None: 57 | samples.record_stream(torch.cuda.current_stream()) 58 | if targets is not None: 59 | for t in targets: 60 | for k, v in t.items(): 61 | v.record_stream(torch.cuda.current_stream()) 62 | self.preload() 63 | else: 64 | try: 65 | samples, targets = next(self.loader) 66 | samples, targets = to_cuda(samples, targets, self.device) 67 | except StopIteration: 68 | samples = None 69 | targets = None 70 | return samples, targets 71 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/objects365.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. 
All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | COCO dataset which returns image_id for evaluation. 12 | 13 | Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py 14 | """ 15 | from pathlib import Path 16 | 17 | import datasets.transforms as T 18 | 19 | import torch 20 | import torch.utils.data 21 | from pycocotools import mask as coco_mask 22 | from util.misc import get_local_rank, get_local_size 23 | 24 | from .coco import CocoDetection, make_coco_transforms, make_coco_transforms_lsj 25 | from .torchvision_datasets import CocoDetection as TvCocoDetection 26 | 27 | 28 | def build(image_set, args): 29 | root = Path(args.coco_path) 30 | assert root.exists(), f"provided Objects365 path {root} does not exist" 31 | mode = "instances" 32 | PATHS = { 33 | "train": ( 34 | root / "train", 35 | root / "annotations" / "zhiyuan_objv2_train_fixmiss.json", 36 | ), 37 | "val": (root / "val", root / "annotations" / "zhiyuan_objv2_val.json"), 38 | } 39 | 40 | img_folder, ann_file = PATHS[image_set] 41 | if args.lsj: 42 | coco_transform = make_coco_transforms_lsj(image_set, args.lsj_img_size) 43 | else: 44 | coco_transform = make_coco_transforms(image_set, args.bigger) 45 | dataset = CocoDetection( 46 | img_folder, 47 | ann_file, 48 | transforms=coco_transform, 49 | return_masks=args.masks, 50 | cache_mode=args.cache_mode, 51 | local_rank=get_local_rank(), 52 | local_size=get_local_size(), 53 | ) 54 | return dataset 55 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/panoptic_eval.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import json 11 | import os 12 | 13 | import util.misc as utils 14 | 15 | try: 16 | from panopticapi.evaluation import pq_compute 17 | except ImportError: 18 | pass 19 | 20 | 21 | class PanopticEvaluator(object): 22 | def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): 23 | self.gt_json = ann_file 24 | self.gt_folder = ann_folder 25 | if utils.is_main_process(): 26 | if not os.path.exists(output_dir): 27 | os.mkdir(output_dir) 28 | self.output_dir = output_dir 29 | self.predictions = [] 30 | 31 | def update(self, predictions): 32 | for p in predictions: 33 | with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: 34 | f.write(p.pop("png_string")) 35 | 36 | self.predictions += predictions 37 | 38 | def synchronize_between_processes(self): 39 | all_predictions = utils.all_gather(self.predictions) 40 | merged_predictions = [] 41 | for p in all_predictions: 42 | merged_predictions += p 43 | self.predictions = merged_predictions 44 | 45 | def summarize(self): 46 | if utils.is_main_process(): 47 | json_data = {"annotations": self.predictions} 48 | predictions_json = os.path.join(self.output_dir, "predictions.json") 49 | with open(predictions_json, "w") as f: 50 | f.write(json.dumps(json_data)) 51 | return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) 52 | return None 53 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/torchvision_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | 7 | from .coco import CocoDetection 8 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/datasets/torchvision_datasets/coco.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from torchvision 7 | # ------------------------------------------------------------------------ 8 | 9 | """ 10 | Copy-Paste from torchvision, but add utility of caching images on memory 11 | """ 12 | from torchvision.datasets.vision import VisionDataset 13 | from PIL import Image 14 | import os 15 | import os.path 16 | import tqdm 17 | from io import BytesIO 18 | 19 | 20 | class CocoDetection(VisionDataset): 21 | """`MS Coco Detection `_ Dataset. 22 | Args: 23 | root (string): Root directory where images are downloaded to. 24 | annFile (string): Path to json annotation file. 25 | transform (callable, optional): A function/transform that takes in an PIL image 26 | and returns a transformed version. E.g, ``transforms.ToTensor`` 27 | target_transform (callable, optional): A function/transform that takes in the 28 | target and transforms it. 
29 | transforms (callable, optional): A function/transform that takes input sample and its target as entry 30 | and returns a transformed version. 31 | """ 32 | 33 | def __init__(self, root, annFile, transform=None, target_transform=None, transforms=None, 34 | cache_mode=False, local_rank=0, local_size=1): 35 | super(CocoDetection, self).__init__(root, transforms, transform, target_transform) 36 | from pycocotools.coco import COCO 37 | self.coco = COCO(annFile) 38 | self.ids = list(sorted(self.coco.imgs.keys())) 39 | self.cache_mode = cache_mode 40 | self.local_rank = local_rank 41 | self.local_size = local_size 42 | if cache_mode: 43 | self.cache = {} 44 | self.cache_images() 45 | 46 | def cache_images(self): 47 | self.cache = {} 48 | for index, img_id in zip(tqdm.trange(len(self.ids)), self.ids): 49 | if index % self.local_size != self.local_rank: 50 | continue 51 | path = self.coco.loadImgs(img_id)[0]['file_name'] 52 | with open(os.path.join(self.root, path), 'rb') as f: 53 | self.cache[path] = f.read() 54 | 55 | def get_image(self, path): 56 | if self.cache_mode: 57 | if path not in self.cache.keys(): 58 | with open(os.path.join(self.root, path), 'rb') as f: 59 | self.cache[path] = f.read() 60 | return Image.open(BytesIO(self.cache[path])).convert('RGB') 61 | return Image.open(os.path.join(self.root, path)).convert('RGB') 62 | 63 | def __getitem__(self, index): 64 | """ 65 | Args: 66 | index (int): Index 67 | Returns: 68 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 69 | """ 70 | coco = self.coco 71 | img_id = self.ids[index] 72 | ann_ids = coco.getAnnIds(imgIds=img_id) 73 | target = coco.loadAnns(ann_ids) 74 | 75 | path = coco.loadImgs(img_id)[0]['file_name'] 76 | 77 | img = self.get_image(path) 78 | if self.transforms is not None: 79 | img, target = self.transforms(img, target) 80 | 81 | return img, target 82 | 83 | def __len__(self): 84 | return len(self.ids) 85 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | from .deformable_detr import build 11 | 12 | 13 | def build_model(args): 14 | return build(args) 15 | 16 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/functions/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn_func import ms_deform_attn_core_pytorch, MSDeformAttnFunction 10 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/functions/ms_deform_attn_func.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import, division, print_function 10 | 11 | import MultiScaleDeformableAttention as MSDA 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Function 16 | from torch.autograd.function import once_differentiable 17 | 18 | 19 | class MSDeformAttnFunction(Function): 20 | @staticmethod 21 | def forward( 22 | ctx, 23 | value, 24 | value_spatial_shapes, 25 | value_level_start_index, 26 | sampling_locations, 27 | attention_weights, 28 | im2col_step, 29 | ): 30 | ctx.im2col_step = im2col_step 31 | output = MSDA.ms_deform_attn_forward( 32 | value, 33 | value_spatial_shapes, 34 | value_level_start_index, 35 | sampling_locations, 36 | attention_weights, 37 | ctx.im2col_step, 38 | ) 39 | ctx.save_for_backward( 40 | value, 41 | value_spatial_shapes, 42 | value_level_start_index, 43 | sampling_locations, 44 | attention_weights, 45 | ) 46 | return output 47 | 48 | @staticmethod 49 | @once_differentiable 50 | def backward(ctx, grad_output): 51 | ( 52 | value, 53 | value_spatial_shapes, 54 | value_level_start_index, 55 | sampling_locations, 56 | attention_weights, 57 | ) = ctx.saved_tensors 58 | grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward( 59 | value, 60 | value_spatial_shapes, 61 | value_level_start_index, 62 | sampling_locations, 63 | attention_weights, 64 | grad_output, 65 | ctx.im2col_step, 66 | ) 67 | 68 | return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None 69 | 70 | 71 | def ms_deform_attn_core_pytorch( 72 | value, value_spatial_shapes, sampling_locations, attention_weights 73 | ): 74 | # for debug and test only, 75 | # need to use cuda version instead 76 | N_, S_, M_, D_ = value.shape 77 | _, Lq_, M_, L_, P_, _ = sampling_locations.shape 78 | value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) 79 | sampling_grids = 2 * sampling_locations - 1 80 | sampling_value_list = [] 81 | for lid_, (H_, W_) in enumerate(value_spatial_shapes): 82 | # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ 83 | value_l_ = ( 84 | value_list[lid_].flatten(2).transpose(1, 2).reshape(N_ * M_, D_, H_, W_) 85 | ) 86 | # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 
2 87 | sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) 88 | # N_*M_, D_, Lq_, P_ 89 | sampling_value_l_ = F.grid_sample( 90 | value_l_, 91 | sampling_grid_l_, 92 | mode="bilinear", 93 | padding_mode="zeros", 94 | align_corners=False, 95 | ) 96 | sampling_value_list.append(sampling_value_l_) 97 | # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) 98 | attention_weights = attention_weights.transpose(1, 2).reshape( 99 | N_ * M_, 1, Lq_, L_ * P_ 100 | ) 101 | output = ( 102 | (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) 103 | .sum(-1) 104 | .view(N_, M_ * D_, Lq_) 105 | ) 106 | return output.transpose(1, 2).contiguous() 107 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------------------------------ 7 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | # ------------------------------------------------------------------------------------------------ 9 | 10 | python setup.py build install 11 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from .ms_deform_attn import MSDeformAttn 10 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/setup.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 
4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | import os 10 | import glob 11 | 12 | import torch 13 | 14 | from torch.utils.cpp_extension import CUDA_HOME 15 | from torch.utils.cpp_extension import CppExtension 16 | from torch.utils.cpp_extension import CUDAExtension 17 | 18 | from setuptools import find_packages 19 | from setuptools import setup 20 | 21 | requirements = ["torch", "torchvision"] 22 | 23 | def get_extensions(): 24 | this_dir = os.path.dirname(os.path.abspath(__file__)) 25 | extensions_dir = os.path.join(this_dir, "src") 26 | 27 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 28 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 29 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 30 | 31 | sources = main_file + source_cpu 32 | extension = CppExtension 33 | extra_compile_args = {"cxx": []} 34 | define_macros = [] 35 | 36 | if torch.cuda.is_available() and CUDA_HOME is not None: 37 | extension = CUDAExtension 38 | sources += source_cuda 39 | define_macros += [("WITH_CUDA", None)] 40 | extra_compile_args["nvcc"] = [ 41 | "-DCUDA_HAS_FP16=1", 42 | "-D__CUDA_NO_HALF_OPERATORS__", 43 | "-D__CUDA_NO_HALF_CONVERSIONS__", 44 | "-D__CUDA_NO_HALF2_OPERATORS__", 45 | ] 46 | else: 47 | raise NotImplementedError('Cuda is not availabel') 48 | 49 | sources = [os.path.join(extensions_dir, s) for s in sources] 50 | include_dirs = [extensions_dir] 51 | ext_modules = [ 52 | extension( 53 | "MultiScaleDeformableAttention", 54 | sources, 55 | include_dirs=include_dirs, 56 | define_macros=define_macros, 57 | extra_compile_args=extra_compile_args, 58 | ) 59 | ] 60 | return ext_modules 61 | 62 | setup( 63 | name="MultiScaleDeformableAttention", 64 | version="1.0", 65 | author="Weijie Su", 66 | url="https://github.com/fundamentalvision/Deformable-DETR", 67 | description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", 68 | packages=find_packages(exclude=("configs", "tests",)), 69 | ext_modules=get_extensions(), 70 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 71 | ) 72 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/cpu/ms_deform_attn_cpu.cpp: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | 17 | at::Tensor 18 | ms_deform_attn_cpu_forward( 19 | const at::Tensor &value, 20 | const at::Tensor &spatial_shapes, 21 | const at::Tensor &level_start_index, 22 | const at::Tensor &sampling_loc, 23 | const at::Tensor &attn_weight, 24 | const int im2col_step) 25 | { 26 | AT_ERROR("Not implement on cpu"); 27 | } 28 | 29 | std::vector 30 | ms_deform_attn_cpu_backward( 31 | const at::Tensor &value, 32 | const at::Tensor &spatial_shapes, 33 | const at::Tensor &level_start_index, 34 | const at::Tensor &sampling_loc, 35 | const at::Tensor &attn_weight, 36 | const at::Tensor &grad_output, 37 | const int im2col_step) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/cpu/ms_deform_attn_cpu.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor 15 | ms_deform_attn_cpu_forward( 16 | const at::Tensor &value, 17 | const at::Tensor &spatial_shapes, 18 | const at::Tensor &level_start_index, 19 | const at::Tensor &sampling_loc, 20 | const at::Tensor &attn_weight, 21 | const int im2col_step); 22 | 23 | std::vector 24 | ms_deform_attn_cpu_backward( 25 | const at::Tensor &value, 26 | const at::Tensor &spatial_shapes, 27 | const at::Tensor &level_start_index, 28 | const at::Tensor &sampling_loc, 29 | const at::Tensor &attn_weight, 30 | const at::Tensor &grad_output, 31 | const int im2col_step); 32 | 33 | 34 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/cuda/ms_deform_attn_cuda.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 
5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | #include 13 | 14 | at::Tensor ms_deform_attn_cuda_forward( 15 | const at::Tensor &value, 16 | const at::Tensor &spatial_shapes, 17 | const at::Tensor &level_start_index, 18 | const at::Tensor &sampling_loc, 19 | const at::Tensor &attn_weight, 20 | const int im2col_step); 21 | 22 | std::vector ms_deform_attn_cuda_backward( 23 | const at::Tensor &value, 24 | const at::Tensor &spatial_shapes, 25 | const at::Tensor &level_start_index, 26 | const at::Tensor &sampling_loc, 27 | const at::Tensor &attn_weight, 28 | const at::Tensor &grad_output, 29 | const int im2col_step); 30 | 31 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/ms_deform_attn.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #pragma once 12 | 13 | #include "cpu/ms_deform_attn_cpu.h" 14 | 15 | #ifdef WITH_CUDA 16 | #include "cuda/ms_deform_attn_cuda.h" 17 | #endif 18 | 19 | 20 | at::Tensor 21 | ms_deform_attn_forward( 22 | const at::Tensor &value, 23 | const at::Tensor &spatial_shapes, 24 | const at::Tensor &level_start_index, 25 | const at::Tensor &sampling_loc, 26 | const at::Tensor &attn_weight, 27 | const int im2col_step) 28 | { 29 | if (value.type().is_cuda()) 30 | { 31 | #ifdef WITH_CUDA 32 | return ms_deform_attn_cuda_forward( 33 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | ms_deform_attn_backward( 43 | const at::Tensor &value, 44 | const at::Tensor &spatial_shapes, 45 | const at::Tensor &level_start_index, 46 | const at::Tensor &sampling_loc, 47 | const at::Tensor &attn_weight, 48 | const at::Tensor &grad_output, 49 | const int im2col_step) 50 | { 51 | if (value.type().is_cuda()) 52 | { 53 | #ifdef WITH_CUDA 54 | return ms_deform_attn_cuda_backward( 55 | value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); 56 | #else 57 | AT_ERROR("Not compiled with GPU support"); 58 | #endif 59 | } 60 | AT_ERROR("Not implemented on the CPU"); 61 | } 62 | 63 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/src/vision.cpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | ************************************************************************************************** 3 | * Deformable DETR 4 | * Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | * Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | ************************************************************************************************** 7 | * Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 8 | ************************************************************************************************** 9 | */ 10 | 11 | #include "ms_deform_attn.h" 12 | 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); 15 | m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); 16 | } 17 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/ops/test.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------------------------------ 6 | # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 7 | # ------------------------------------------------------------------------------------------------ 8 | 9 | from __future__ import absolute_import 10 | from __future__ import print_function 11 | from __future__ import division 12 | 13 | import time 14 | import torch 15 | import torch.nn as nn 16 | from torch.autograd import gradcheck 17 | 18 | from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch 19 | 20 | 21 | N, M, D = 1, 2, 2 22 | Lq, L, P = 2, 2, 2 23 | shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() 24 | level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) 25 | S = sum([(H*W).item() for H, W in shapes]) 26 | 27 | 28 | torch.manual_seed(3) 29 | 30 | 31 | @torch.no_grad() 32 | def check_forward_equal_with_pytorch_double(): 33 | value = torch.rand(N, S, M, D).cuda() * 0.01 34 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 35 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 36 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 37 | im2col_step = 2 38 | output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() 39 | output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() 40 | fwdok = torch.allclose(output_cuda, output_pytorch) 41 | max_abs_err = (output_cuda - output_pytorch).abs().max() 42 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 43 | 44 | print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 45 | 46 | 47 | @torch.no_grad() 48 | def check_forward_equal_with_pytorch_float(): 49 | value = torch.rand(N, S, M, D).cuda() * 0.01 50 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 51 | attention_weights = torch.rand(N, Lq, M, 
L, P).cuda() + 1e-5 52 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 53 | im2col_step = 2 54 | output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() 55 | output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() 56 | fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) 57 | max_abs_err = (output_cuda - output_pytorch).abs().max() 58 | max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() 59 | 60 | print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') 61 | 62 | 63 | def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): 64 | 65 | value = torch.rand(N, S, M, channels).cuda() * 0.01 66 | sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() 67 | attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 68 | attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) 69 | im2col_step = 2 70 | func = MSDeformAttnFunction.apply 71 | 72 | value.requires_grad = grad_value 73 | sampling_locations.requires_grad = grad_sampling_loc 74 | attention_weights.requires_grad = grad_attn_weight 75 | 76 | gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) 77 | 78 | print(f'* {gradok} check_gradient_numerical(D={channels})') 79 | 80 | 81 | if __name__ == '__main__': 82 | check_forward_equal_with_pytorch_double() 83 | check_forward_equal_with_pytorch_float() 84 | 85 | for channels in [30, 32, 64, 71, 1025, 2048, 3096]: 86 | check_gradient_numerical(channels, True, True, True) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Various positional encodings for the transformer. 12 | """ 13 | import math 14 | import torch 15 | from torch import nn 16 | 17 | from util.misc import NestedTensor 18 | 19 | 20 | class PositionEmbeddingSine(nn.Module): 21 | """ 22 | This is a more standard version of the position embedding, very similar to the one 23 | used by the Attention is all you need paper, generalized to work on images. 
24 | """ 25 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 26 | super().__init__() 27 | self.num_pos_feats = num_pos_feats 28 | self.temperature = temperature 29 | self.normalize = normalize 30 | if scale is not None and normalize is False: 31 | raise ValueError("normalize should be True if scale is passed") 32 | if scale is None: 33 | scale = 2 * math.pi 34 | self.scale = scale 35 | 36 | def forward(self, tensor_list: NestedTensor): 37 | x = tensor_list.tensors 38 | mask = tensor_list.mask 39 | assert mask is not None 40 | not_mask = ~mask 41 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 42 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 43 | if self.normalize: 44 | eps = 1e-6 45 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 46 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 47 | 48 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 49 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 50 | 51 | pos_x = x_embed[:, :, :, None] / dim_t 52 | pos_y = y_embed[:, :, :, None] / dim_t 53 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 54 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 55 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 56 | return pos 57 | 58 | 59 | class PositionEmbeddingLearned(nn.Module): 60 | """ 61 | Absolute pos embedding, learned. 62 | """ 63 | def __init__(self, num_pos_feats=256): 64 | super().__init__() 65 | self.row_embed = nn.Embedding(50, num_pos_feats) 66 | self.col_embed = nn.Embedding(50, num_pos_feats) 67 | self.reset_parameters() 68 | 69 | def reset_parameters(self): 70 | nn.init.uniform_(self.row_embed.weight) 71 | nn.init.uniform_(self.col_embed.weight) 72 | 73 | def forward(self, tensor_list: NestedTensor): 74 | x = tensor_list.tensors 75 | h, w = x.shape[-2:] 76 | i = torch.arange(w, device=x.device) 77 | j = torch.arange(h, device=x.device) 78 | x_emb = self.col_embed(i) 79 | y_emb = self.row_embed(j) 80 | pos = torch.cat([ 81 | x_emb.unsqueeze(0).repeat(h, 1, 1), 82 | y_emb.unsqueeze(1).repeat(1, w, 1), 83 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 84 | return pos 85 | 86 | 87 | def build_position_encoding(args): 88 | N_steps = args.hidden_dim // 2 89 | if args.position_embedding in ('v2', 'sine'): 90 | # TODO find a better way of exposing other arguments 91 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 92 | elif args.position_embedding in ('v3', 'learned'): 93 | position_embedding = PositionEmbeddingLearned(N_steps) 94 | else: 95 | raise ValueError(f"not supported {args.position_embedding}") 96 | 97 | return position_embedding 98 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/eval.sh: -------------------------------------------------------------------------------- 1 | 2 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/eval" 3 | 4 | 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | --master_port=12345 --use_env main.py \ 7 | --output_dir ${EXP_DIR} \ 8 | --with_box_refine --two_stage \ 9 | --num_feature_levels 5 --num_queries 900 \ 10 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 11 | --assign_first_stage --assign_second_stage \ 12 | --epochs 24 --lr_drop 20 \ 13 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 14 | --backbone pev1 \ 15 | 
--backbone_size Gwin384 \ 16 | --backbone_init_values 0.1 \ 17 | --backbone_tile_posemb True \ 18 | --backbone_lrd 0.9 --backbone_layers 50 \ 19 | --num_workers 4 \ 20 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 21 | --lsj --lsj_img_size 1728 \ 22 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 23 | --eval \ 24 | --resume /checkpoint/vision_encoder/d2_output/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/checkpoint.pth \ 25 | "$@" 26 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/eval_1824pix.sh: -------------------------------------------------------------------------------- 1 | 2 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/eval" 3 | 4 | 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | --master_port=12345 --use_env main.py \ 7 | --output_dir ${EXP_DIR} \ 8 | --with_box_refine --two_stage \ 9 | --num_feature_levels 5 --num_queries 900 \ 10 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 11 | --assign_first_stage --assign_second_stage \ 12 | --epochs 24 --lr_drop 20 \ 13 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 14 | --backbone pev1 \ 15 | --backbone_size Gwin384 \ 16 | --backbone_init_values 0.1 \ 17 | --backbone_tile_posemb True \ 18 | --backbone_lrd 0.9 --backbone_layers 50 \ 19 | --num_workers 4 \ 20 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 21 | --lsj --lsj_img_size 1824 \ 22 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 23 | --eval \ 24 | --resume /checkpoint/vision_encoder/d2_output/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node/checkpoint.pth \ 25 | "$@" 26 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/eval_tta_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder_high 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/eval_tta_slurm/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/eval_tta_slurm/%j.err 13 | #SBATCH --time=23:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/eval_tta_slurm" 28 | 29 | 30 | # srun \ 31 | # torchrun \ 32 | srun \ 33 | python -m torch.distributed.run \ 34 | --nnodes 8 \ 35 | --nproc_per_node 8 \ 36 | --rdzv_id $RANDOM \ 37 | --rdzv_endpoint "${my_array[0]}:29500" \ 38 | --rdzv_backend c10d \ 39 | main.py \ 40 | --output_dir ${EXP_DIR} \ 41 | --with_box_refine --two_stage \ 42 | --num_feature_levels 5 --num_queries 2000 \ 43 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 44 | --assign_first_stage --assign_second_stage \ 45 | --epochs 12 --lr_drop 10 \ 46 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 47 | --backbone pev1 \ 48 | --backbone_size Gwin384 \ 49 | --backbone_init_values 0.1 \ 50 | 
--backbone_tile_posemb True \ 51 | --backbone_lrd 0.9 --backbone_layers 50 \ 52 | --num_workers 4 \ 53 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 54 | --lsj --lsj_img_size 1728 \ 55 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 56 | --eval \ 57 | --resume /checkpoint/vision_encoder/d2_output/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/checkpoint.pth \ 58 | --soft_nms \ 59 | --tta \ 60 | "$@" 61 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/eval_tta_slurm_1824pix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder_high 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/eval_tta_slurm/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/eval_tta_slurm/%j.err 13 | #SBATCH --time=23:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/eval_tta_slurm" 28 | 29 | 30 | # srun \ 31 | # torchrun \ 32 | srun \ 33 | python -m torch.distributed.run \ 34 | --nnodes 8 \ 35 | --nproc_per_node 8 \ 36 | --rdzv_id $RANDOM \ 37 | --rdzv_endpoint "${my_array[0]}:29500" \ 38 | --rdzv_backend c10d \ 39 | main.py \ 40 | --output_dir ${EXP_DIR} \ 41 | --with_box_refine --two_stage \ 42 | --num_feature_levels 5 --num_queries 2000 \ 43 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 44 | --assign_first_stage --assign_second_stage \ 45 | --epochs 12 --lr_drop 10 \ 46 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 47 | --backbone pev1 \ 48 | --backbone_size Gwin384 \ 49 | --backbone_init_values 0.1 \ 50 | --backbone_tile_posemb True \ 51 | --backbone_lrd 0.9 --backbone_layers 50 \ 52 | --num_workers 4 \ 53 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 54 | --lsj --lsj_img_size 1824 \ 55 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 56 | --eval \ 57 | --resume /checkpoint/vision_encoder/d2_output/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node/checkpoint.pth \ 58 | --soft_nms \ 59 | --tta \ 60 | "$@" 61 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node/%j.err 13 | #SBATCH --time=96:00:00 14 | 
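# Resolve the head node and its IP address from the Slurm allocation; every rank
# launched below joins the same torchrun c10d rendezvous endpoint at ${head_node_ip}:29500.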
15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/finetune_further_spatial_Gwin384_cocoep3_1824pix_8node" 28 | 29 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True 30 | 31 | srun \ 32 | torchrun \ 33 | --nnodes 8 \ 34 | --nproc_per_node 8 \ 35 | --rdzv_id $RANDOM \ 36 | --rdzv_endpoint "${my_array[0]}:29500" \ 37 | --rdzv_backend c10d \ 38 | main.py \ 39 | --output_dir ${EXP_DIR} \ 40 | --with_box_refine --two_stage \ 41 | --num_feature_levels 5 --num_queries 900 \ 42 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 43 | --assign_first_stage --assign_second_stage \ 44 | --epochs 3 --lr_drop 2 \ 45 | --batch_size 1 \ 46 | --backbone pev1 \ 47 | --backbone_size Gwin384 \ 48 | --backbone_init_values 0.1 \ 49 | --backbone_tile_posemb True \ 50 | --backbone_lrd 0.9 --backbone_layers 50 \ 51 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 52 | --finetune /checkpoint/vision_encoder/d2_output/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/checkpoint.pth \ 53 | --lsj --lsj_img_size 1824 \ 54 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 55 | --eval_per_epochs 1 \ 56 | --save_per_epochs 1 \ 57 | --auto_resume \ 58 | --keep_class_embed \ 59 | --bf16 \ 60 | --backbone_dp 0.0 \ 61 | --sgd \ 62 | --lr 5e-5 --lr_backbone 5e-5 \ 63 | "$@" 64 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/finetune_spatial_Gwin384_cocoep12_1728pix_8node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/finetune_spatial_Gwin384_cocoep12_1728pix_8node" 28 | 29 | export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True 30 | 31 | srun \ 32 | torchrun \ 33 | --nnodes 8 \ 34 | --nproc_per_node 8 \ 35 | --rdzv_id $RANDOM \ 36 | --rdzv_endpoint "${my_array[0]}:29500" \ 37 | --rdzv_backend c10d \ 38 | main.py \ 39 | --output_dir ${EXP_DIR} \ 40 | --with_box_refine --two_stage \ 41 | --num_feature_levels 5 --num_queries 900 \ 42 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 43 | --assign_first_stage --assign_second_stage \ 44 | --epochs 12 --lr_drop 10 \ 45 | --lr 5e-5 
--lr_backbone 5e-5 --batch_size 1 \ 46 | --backbone pev1 \ 47 | --backbone_size Gwin384 \ 48 | --backbone_init_values 0.1 \ 49 | --backbone_tile_posemb True \ 50 | --backbone_lrd 0.9 --backbone_layers 50 \ 51 | --coco_path /checkpoint/vision_encoder/public_data/coco \ 52 | --finetune /checkpoint/vision_encoder/d2_output/coco_sota/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node/checkpoint.pth \ 53 | --lsj --lsj_img_size 1728 \ 54 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 55 | --eval_per_epochs 1 \ 56 | --save_per_epochs 1 \ 57 | --auto_resume \ 58 | --bf16 \ 59 | --backbone_dp 0.4 \ 60 | "$@" 61 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=16 7 | #SBATCH --ntasks=16 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/pretrain_continue_spatial_Gwin384_o365ep6_1536pix_16node" 28 | 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 16 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | main.py \ 38 | --output_dir ${EXP_DIR} \ 39 | --with_box_refine --two_stage \ 40 | --num_feature_levels 5 --num_queries 900 \ 41 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 42 | --assign_first_stage --assign_second_stage \ 43 | --epochs 6 --lr_drop 4 \ 44 | --lr 5e-5 --lr_backbone 5e-5 --batch_size 1 \ 45 | --backbone pev1 \ 46 | --backbone_size Gwin384 \ 47 | --backbone_init_values 0.1 \ 48 | --backbone_tile_posemb True \ 49 | --backbone_lrd 0.9 --backbone_layers 50 \ 50 | --dataset_file objects365 \ 51 | --coco_path /checkpoint/vision_encoder/public_data/objects365_v2 \ 52 | --finetune /checkpoint/vision_encoder/d2_output/coco_sota/pretrain_spatial_Gwin384_o365ep12_1024pix_16node/checkpoint.pth \ 53 | --lsj --lsj_img_size 1536 \ 54 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 55 | --eval_per_epochs 1 \ 56 | --save_per_epochs 1 \ 57 | --auto_resume \ 58 | "$@" 59 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/scripts/pretrain_spatial_Gwin384_o365ep12_1024pix_16node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=16 7 | #SBATCH --ntasks=16 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | 
#SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/pretrain_spatial_Gwin384_o365ep12_1024pix_16node/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco_sota/pretrain_spatial_Gwin384_o365ep12_1024pix_16node/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | 28 | EXP_DIR="/checkpoint/vision_encoder/d2_output/coco_sota/pretrain_spatial_Gwin384_o365ep12_1024pix_16node" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 16 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | main.py \ 38 | --output_dir ${EXP_DIR} \ 39 | --with_box_refine --two_stage \ 40 | --num_feature_levels 5 --num_queries 900 \ 41 | --dim_feedforward 2048 --dropout 0.0 --cls_loss_coef 1.0 \ 42 | --assign_first_stage --assign_second_stage \ 43 | --epochs 12 --lr_drop 10 \ 44 | --lr_backbone 2e-4 \ 45 | --backbone pev1 \ 46 | --backbone_size Gwin384 \ 47 | --backbone_path /checkpoint/vision_encoder/pev1/pe_spatial_G14_448_16patch384pix.pth \ 48 | --backbone_init_values 0.1 \ 49 | --backbone_tile_posemb True \ 50 | --backbone_lrd 0.9 --backbone_layers 50 \ 51 | --dataset_file objects365 \ 52 | --coco_path /checkpoint/vision_encoder/public_data/objects365_v2 \ 53 | --lsj --lsj_img_size 1024 \ 54 | --backbone_use_act_checkpoint --backbone_act_checkpoint_ratio 1.0 \ 55 | --eval_per_epochs 2 \ 56 | --save_per_epochs 1 \ 57 | --auto_resume \ 58 | "$@" 59 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Utilities for bounding box manipulation and GIoU. 
12 | """ 13 | import torch 14 | from torchvision.ops.boxes import box_area 15 | 16 | 17 | def box_cxcywh_to_xyxy(x): 18 | x_c, y_c, w, h = x.unbind(-1) 19 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 20 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 21 | return torch.stack(b, dim=-1) 22 | 23 | 24 | def box_xyxy_to_cxcywh(x): 25 | x0, y0, x1, y1 = x.unbind(-1) 26 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 27 | (x1 - x0), (y1 - y0)] 28 | return torch.stack(b, dim=-1) 29 | 30 | 31 | # modified from torchvision to also return the union 32 | def box_iou(boxes1, boxes2): 33 | area1 = box_area(boxes1) 34 | area2 = box_area(boxes2) 35 | 36 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 37 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 38 | 39 | wh = (rb - lt).clamp(min=0) # [N,M,2] 40 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 41 | 42 | union = area1[:, None] + area2 - inter 43 | 44 | iou = inter / union 45 | return iou, union 46 | 47 | 48 | def generalized_box_iou(boxes1, boxes2): 49 | """ 50 | Generalized IoU from https://giou.stanford.edu/ 51 | 52 | The boxes should be in [x0, y0, x1, y1] format 53 | 54 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 55 | and M = len(boxes2) 56 | """ 57 | # degenerate boxes gives inf / nan results 58 | # so do an early check 59 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 60 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 61 | iou, union = box_iou(boxes1, boxes2) 62 | 63 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 64 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 65 | 66 | wh = (rb - lt).clamp(min=0) # [N,M,2] 67 | area = wh[:, :, 0] * wh[:, :, 1] 68 | 69 | return iou - (area - union) / area 70 | 71 | 72 | def masks_to_boxes(masks): 73 | """Compute the bounding boxes around the provided masks 74 | 75 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 76 | 77 | Returns a [N, 4] tensors, with the boxes in xyxy format 78 | """ 79 | if masks.numel() == 0: 80 | return torch.zeros((0, 4), device=masks.device) 81 | 82 | h, w = masks.shape[-2:] 83 | 84 | y = torch.arange(0, h, dtype=torch.float) 85 | x = torch.arange(0, w, dtype=torch.float) 86 | y, x = torch.meshgrid(y, x) 87 | 88 | x_mask = (masks * x.unsqueeze(0)) 89 | x_max = x_mask.flatten(1).max(-1)[0] 90 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 91 | 92 | y_mask = (masks * y.unsqueeze(0)) 93 | y_max = y_mask.flatten(1).max(-1)[0] 94 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 95 | 96 | return torch.stack([x_min, y_min, x_max, y_max], 1) 97 | -------------------------------------------------------------------------------- /apps/detection/DETA_pe/util/ema.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | 5 | 6 | @torch.no_grad() 7 | def update_ema(ema_model, model, decay=0.9999): 8 | """ 9 | Step the EMA model towards the current model. 10 | """ 11 | ema_params = OrderedDict(ema_model.named_parameters()) 12 | model_params = OrderedDict(model.named_parameters()) 13 | 14 | for name, param in model_params.items(): 15 | # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed 16 | ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay) 17 | 18 | 19 | def requires_grad(model, flag=True): 20 | """ 21 | Set requires_grad flag for all parameters in a model. 
22 | """ 23 | for p in model.parameters(): 24 | p.requires_grad = flag 25 | -------------------------------------------------------------------------------- /apps/detection/INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | Follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html) 3 | 4 | ## Dataset 5 | Prepare COCO and LVIS datasets 6 | 7 | ``` 8 | $DETECTRON2_DATASETS/ 9 | coco/ 10 | train2017/ 11 | val2017/ 12 | annotations/ 13 | instances_train2017.json 14 | instances_val2017.json 15 | lvis/ 16 | lvis_v1_train.json 17 | lvis_v1_val.json 18 | ``` 19 | 20 | -------------------------------------------------------------------------------- /apps/detection/README.md: -------------------------------------------------------------------------------- 1 | # Object Detection with PE 2 | 3 | ## Getting started 4 | 5 | Please refer to [INSTALL.md](INSTALL.md) for installation and dataset preparation instructions. 6 | 7 | ## Results and Fine-tuned Models 8 | 9 | 10 | ### LVIS 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 |
| detector | vision encoder | box AP | mask AP | download |
|:---|:---|:---:|:---:|:---:|
| Mask R-CNN | PE core G | 51.9 | 47.9 | model |
| Mask R-CNN | PE spatial G | 54.2 | 49.3 | model |
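The LVIS checkpoints are evaluated the same way as the COCO ones (see the Evaluation section below); a minimal sketch, with placeholder checkpoint and output paths:

```
bash scripts/evaluate_local.sh --config-file projects/ViTDet/configs/LVIS/mask_rcnn_PEspatial_G_lvis75ep.py train.output_dir="/path/to/output_dir" train.init_checkpoint="/path/to/mask_rcnn_PEspatial_G_lvis75ep.pth"
```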
36 | 37 | 38 | ### COCO 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 |
| detector | vision encoder | box AP | mask AP | download |
|:---|:---|:---:|:---:|:---:|
| Mask R-CNN | PE core G | 57.0 | 49.8 | model |
| Mask R-CNN | PE spatial G | 57.8 | 50.3 | model |
64 | 65 | 66 | ### Training 67 | By default, we use 64 GPUs in slurm training, for example 68 | 69 | ``` 70 | sbatch scripts/coco/train_mask_rcnn_PEspatial_G_coco36ep.sh 71 | ``` 72 | 73 | ### Evaluation 74 | Evaluation is running locally 75 | ``` 76 | bash scripts/evaluate_local.sh --config-file projects/ViTDet/configs/COCO/mask_rcnn_PEspatial_G_coco36ep.py train.output_dir="/path/to/output_dir" train.init_checkpoint="/path/to/mask_rcnn_PEspatial_G_coco36ep.pth" 77 | ``` 78 | 79 | 80 | ## SOTA COCO Object Detection 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
| detector | vision encoder | box AP | box AP (TTA) | download |
|:---|:---|:---:|:---:|:---:|
| DETA | PE spatial G | 65.2 | 66.0 | model |
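The TTA column comes from the soft-NMS + test-time-augmentation evaluation scripts shipped with [DETA_pe](DETA_pe); a minimal sketch of running them from the `DETA_pe` directory (point the `--resume` path inside the scripts at your own checkpoint):

```
# standard 1824px evaluation
bash scripts/eval_1824pix.sh
# soft-NMS + TTA evaluation on Slurm (8 nodes)
sbatch scripts/eval_tta_slurm_1824pix.sh
```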
100 | 101 | More details are in [DETA_pe](DETA_pe) 102 | 103 | 104 | ## Acknowledgment 105 | 106 | This code is built using [detectron2](https://github.com/facebookresearch/detectron2) and [DETA](https://github.com/jozhang97/DETA). 107 | -------------------------------------------------------------------------------- /apps/detection/detectron2_pe/__init__.py: -------------------------------------------------------------------------------- 1 | from . import modeling 2 | -------------------------------------------------------------------------------- /apps/detection/detectron2_pe/checkpoint/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection_checkpoint import DetectionCheckpointer 2 | 3 | __all__ = ["DetectionCheckpointer"] 4 | -------------------------------------------------------------------------------- /apps/detection/detectron2_pe/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import PEv1_det, get_vit_lr_decay_rate_pev1 2 | -------------------------------------------------------------------------------- /apps/detection/detectron2_pe/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .pev1_det import PEv1_det, get_vit_lr_decay_rate_pev1 2 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/COCO/mask_rcnn_PEcore_G_coco75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2 import model_zoo 5 | from detectron2.config import LazyCall as L 6 | from detectron2.modeling import SimpleFeaturePyramid, ViT 7 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 8 | from detectron2.solver import WarmupParamScheduler 9 | from detectron2_pe.modeling import PEv1_det, get_vit_lr_decay_rate_pev1 10 | from fvcore.common.param_scheduler import MultiStepParamScheduler 11 | 12 | from ..COCO.mask_rcnn_vitdet_b_100ep import ( # dataloader,; model,; get_vit_lr_decay_rate, 13 | lr_multiplier, optimizer, train) 14 | from ..common.coco_loader_lsj import dataloader 15 | 16 | train.init_checkpoint = "/checkpoint/vision_encoder/pev1/pev1_rc2_d2.pt" 17 | train.output_dir = ( 18 | "/checkpoint/vision_encoder/d2_output/coco/mask_rcnn_PEcore_G_coco36ep" 19 | ) 20 | 21 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 22 | 23 | model.pixel_mean = [127, 127, 127] 24 | model.pixel_std = [127, 127, 127] 25 | model.input_format = "RGB" 26 | 27 | 28 | img_size = 1024 29 | embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5 30 | pretrain_img_size, patch_size, window_size = 512, 16, 32 31 | # 12, 24, 36, 49 for global attention 32 | window_block_indexes = ( 33 | list(range(0, 12)) + list(range(13, 24)) + list(range(25, 36)) + list(range(37, 49)) 34 | ) 35 | # Creates Simple Feature Pyramid from ViT backbone 36 | model.backbone = L(SimpleFeaturePyramid)( 37 | net=L(PEv1_det)( # Single-scale ViT backbone 38 | pretrain_img_size=pretrain_img_size, 39 | img_size=img_size, 40 | patch_size=patch_size, 41 | embed_dim=embed_dim, 42 | depth=depth, 43 | num_heads=num_heads, 44 | drop_path_rate=dp, 45 | window_size=window_size, 46 | pt_hw_seq_len=32, 47 | mlp_ratio=mlp_ratio, 48 | qkv_bias=True, 49 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 50 | window_block_indexes=window_block_indexes, 51 
| residual_block_indexes=[], 52 | use_rel_pos=True, 53 | out_feature="last_feat", 54 | tile_posemb=True, 55 | use_abs_pos=True, 56 | pretrain_use_cls_token=False, 57 | use_act_checkpoint=True, 58 | ), 59 | in_feature="${.net.out_feature}", 60 | out_channels=256, 61 | scale_factors=(4.0, 2.0, 1.0, 0.5), 62 | top_block=L(LastLevelMaxPool)(), 63 | norm="LN", 64 | square_pad=img_size, 65 | ) 66 | 67 | optimizer.params.lr_factor_func = partial( 68 | get_vit_lr_decay_rate_pev1, lr_decay_rate=0.9, num_layers=50 69 | ) 70 | 71 | dataloader.train.total_batch_size = 64 72 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 73 | train.max_iter = 184375 74 | 75 | 76 | lr_multiplier = L(WarmupParamScheduler)( 77 | scheduler=L(MultiStepParamScheduler)( 78 | values=[1.0, 0.1, 0.01], 79 | milestones=[163889, 177546], 80 | num_updates=train.max_iter, 81 | ), 82 | warmup_length=250 / train.max_iter, 83 | warmup_factor=0.001, 84 | ) 85 | 86 | optimizer.params.overrides = {} 87 | optimizer.params.weight_decay_norm = None 88 | optimizer.lr = 5e-5 89 | 90 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 91 | lr_multiplier.scheduler.milestones = [ 92 | milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones 93 | ] 94 | lr_multiplier.scheduler.num_updates = train.max_iter 95 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/COCO/mask_rcnn_PEspatial_G_coco36ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2 import model_zoo 5 | from detectron2.config import LazyCall as L 6 | from detectron2.modeling import SimpleFeaturePyramid, ViT 7 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 8 | from detectron2.solver import WarmupParamScheduler 9 | from detectron2_pe.modeling import PEv1_det, get_vit_lr_decay_rate_pev1 10 | from fvcore.common.param_scheduler import MultiStepParamScheduler 11 | 12 | from ..COCO.mask_rcnn_vitdet_b_100ep import ( # dataloader,; model,; get_vit_lr_decay_rate, 13 | lr_multiplier, optimizer, train) 14 | from ..common.coco_loader_lsj import dataloader 15 | 16 | train.init_checkpoint = "/checkpoint/vision_encoder/pev1/pev1_rc2_spatial_d2.pt" 17 | train.output_dir = ( 18 | "/checkpoint/vision_encoder/d2_output/coco/mask_rcnn_PEspatial_G_coco36ep" 19 | ) 20 | 21 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 22 | 23 | model.pixel_mean = [127, 127, 127] 24 | model.pixel_std = [127, 127, 127] 25 | model.input_format = "RGB" 26 | 27 | 28 | img_size = 1024 29 | embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5 30 | pretrain_img_size, patch_size, window_size = 512, 16, 32 31 | # 12, 24, 36, 49 for global attention 32 | window_block_indexes = ( 33 | list(range(0, 12)) + list(range(13, 24)) + list(range(25, 36)) + list(range(37, 49)) 34 | ) 35 | # Creates Simple Feature Pyramid from ViT backbone 36 | model.backbone = L(SimpleFeaturePyramid)( 37 | net=L(PEv1_det)( # Single-scale ViT backbone 38 | pretrain_img_size=pretrain_img_size, 39 | img_size=img_size, 40 | patch_size=patch_size, 41 | embed_dim=embed_dim, 42 | depth=depth, 43 | num_heads=num_heads, 44 | drop_path_rate=dp, 45 | window_size=window_size, 46 | pt_hw_seq_len=32, 47 | mlp_ratio=mlp_ratio, 48 | qkv_bias=True, 49 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 50 | window_block_indexes=window_block_indexes, 51 | residual_block_indexes=[], 52 | 
use_rel_pos=True, 53 | out_feature="last_feat", 54 | tile_posemb=True, 55 | use_abs_pos=True, 56 | pretrain_use_cls_token=False, 57 | use_act_checkpoint=True, 58 | init_values=0.1, 59 | ), 60 | in_feature="${.net.out_feature}", 61 | out_channels=256, 62 | scale_factors=(4.0, 2.0, 1.0, 0.5), 63 | top_block=L(LastLevelMaxPool)(), 64 | norm="LN", 65 | square_pad=img_size, 66 | ) 67 | 68 | optimizer.params.lr_factor_func = partial( 69 | get_vit_lr_decay_rate_pev1, lr_decay_rate=0.9, num_layers=50 70 | ) 71 | 72 | dataloader.train.total_batch_size = 64 73 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 74 | train.max_iter = 184375 75 | 76 | 77 | lr_multiplier = L(WarmupParamScheduler)( 78 | scheduler=L(MultiStepParamScheduler)( 79 | values=[1.0, 0.1, 0.01], 80 | milestones=[163889, 177546], 81 | num_updates=train.max_iter, 82 | ), 83 | warmup_length=250 / train.max_iter, 84 | warmup_factor=0.001, 85 | ) 86 | 87 | optimizer.params.overrides = {} 88 | optimizer.params.weight_decay_norm = None 89 | optimizer.lr = 5e-5 90 | 91 | train.max_iter = train.max_iter * 36 // 100 # 100ep -> 36ep 92 | lr_multiplier.scheduler.milestones = [ 93 | milestone * 36 // 100 for milestone in lr_multiplier.scheduler.milestones 94 | ] 95 | lr_multiplier.scheduler.num_updates = train.max_iter 96 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/COCO/mask_rcnn_vitdet_b_100ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from detectron2 import model_zoo 4 | from detectron2.config import LazyCall as L 5 | from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate 6 | from detectron2.solver import WarmupParamScheduler 7 | from fvcore.common.param_scheduler import MultiStepParamScheduler 8 | 9 | from ..common.coco_loader_lsj import dataloader 10 | 11 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 12 | 13 | # Initialization and trainer settings 14 | train = model_zoo.get_config("common/train.py").train 15 | train.amp.enabled = True 16 | train.ddp.fp16_compression = True 17 | train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth?matching_heuristics=True" 18 | 19 | 20 | # Schedule 21 | # 100 ep = 184375 iters * 64 images/iter / 118000 images/ep 22 | train.max_iter = 184375 23 | 24 | lr_multiplier = L(WarmupParamScheduler)( 25 | scheduler=L(MultiStepParamScheduler)( 26 | values=[1.0, 0.1, 0.01], 27 | milestones=[163889, 177546], 28 | num_updates=train.max_iter, 29 | ), 30 | warmup_length=250 / train.max_iter, 31 | warmup_factor=0.001, 32 | ) 33 | 34 | # Optimizer 35 | optimizer = model_zoo.get_config("common/optim.py").AdamW 36 | optimizer.params.lr_factor_func = partial( 37 | get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7 38 | ) 39 | optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}} 40 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/LVIS/mask_rcnn_PEcore_G_lvis75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2 import model_zoo 5 | from detectron2.config import LazyCall as L 6 | from detectron2.data.detection_utils import get_fed_loss_cls_weights 7 | from detectron2.data.samplers import RepeatFactorTrainingSampler 8 | from detectron2.evaluation.lvis_evaluation import 
LVISEvaluator 9 | from detectron2.modeling import SimpleFeaturePyramid, ViT 10 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 11 | from detectron2.solver import WarmupParamScheduler 12 | from detectron2_pe.modeling import PEv1_det, get_vit_lr_decay_rate_pev1 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..COCO.mask_rcnn_vitdet_b_100ep import ( # dataloader,; model,; get_vit_lr_decay_rate, 16 | lr_multiplier, optimizer, train) 17 | from ..common.coco_loader_lsj import dataloader 18 | 19 | train.init_checkpoint = "/checkpoint/vision_encoder/pev1/pev1_rc2_d2.pt" 20 | train.output_dir = ( 21 | "/checkpoint/vision_encoder/d2_output/lvis/mask_rcnn_PEcore_G_lvis75ep" 22 | ) 23 | 24 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 25 | 26 | model.pixel_mean = [127, 127, 127] 27 | model.pixel_std = [127, 127, 127] 28 | model.input_format = "RGB" 29 | 30 | img_size = 1024 31 | embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5 32 | pretrain_img_size, patch_size, window_size = 512, 16, 32 33 | # 12, 24, 36, 49 for global attention 34 | window_block_indexes = ( 35 | list(range(0, 12)) + list(range(13, 24)) + list(range(25, 36)) + list(range(37, 49)) 36 | ) 37 | # Creates Simple Feature Pyramid from ViT backbone 38 | model.backbone = L(SimpleFeaturePyramid)( 39 | net=L(PEv1_det)( # Single-scale ViT backbone 40 | pretrain_img_size=pretrain_img_size, 41 | img_size=img_size, 42 | patch_size=patch_size, 43 | embed_dim=embed_dim, 44 | depth=depth, 45 | num_heads=num_heads, 46 | drop_path_rate=dp, 47 | window_size=window_size, 48 | pt_hw_seq_len=32, 49 | mlp_ratio=mlp_ratio, 50 | qkv_bias=True, 51 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 52 | window_block_indexes=window_block_indexes, 53 | residual_block_indexes=[], 54 | use_rel_pos=True, 55 | out_feature="last_feat", 56 | tile_posemb=True, 57 | use_abs_pos=True, 58 | pretrain_use_cls_token=False, 59 | use_act_checkpoint=True, 60 | ), 61 | in_feature="${.net.out_feature}", 62 | out_channels=256, 63 | scale_factors=(4.0, 2.0, 1.0, 0.5), 64 | top_block=L(LastLevelMaxPool)(), 65 | norm="LN", 66 | square_pad=img_size, 67 | ) 68 | 69 | model.roi_heads.num_classes = 1203 70 | model.roi_heads.box_predictor.test_score_thresh = 0.02 71 | model.roi_heads.box_predictor.test_topk_per_image = 300 72 | model.roi_heads.box_predictor.use_sigmoid_ce = True 73 | model.roi_heads.box_predictor.use_fed_loss = True 74 | model.roi_heads.box_predictor.get_fed_loss_cls_weights = ( 75 | lambda: get_fed_loss_cls_weights(dataloader.train.dataset.names, 0.5) 76 | ) 77 | 78 | train.eval_period = 30000 79 | 80 | optimizer.params.lr_factor_func = partial( 81 | get_vit_lr_decay_rate_pev1, lr_decay_rate=0.9, num_layers=50 82 | ) 83 | 84 | 85 | dataloader.train.dataset.names = "lvis_v1_train" 86 | dataloader.train.sampler = L(RepeatFactorTrainingSampler)( 87 | repeat_factors=L( 88 | RepeatFactorTrainingSampler.repeat_factors_from_category_frequency 89 | )(dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001) 90 | ) 91 | dataloader.test.dataset.names = "lvis_v1_val" 92 | dataloader.evaluator = L(LVISEvaluator)( 93 | dataset_name="${..test.dataset.names}", 94 | max_dets_per_image=300, 95 | output_dir="${train.output_dir}", 96 | ) 97 | 98 | dataloader.train.total_batch_size = 64 99 | 100 | train.max_iter = 184375 101 | 102 | 103 | lr_multiplier = L(WarmupParamScheduler)( 104 | scheduler=L(MultiStepParamScheduler)( 105 | values=[1.0, 0.1, 0.01], 106 | milestones=[163889, 
177546], 107 | num_updates=train.max_iter, 108 | ), 109 | warmup_length=250 / train.max_iter, 110 | warmup_factor=0.001, 111 | ) 112 | 113 | optimizer.params.overrides = {} 114 | optimizer.params.weight_decay_norm = None 115 | optimizer.lr = 5e-5 116 | 117 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 118 | lr_multiplier.scheduler.milestones = [ 119 | milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones 120 | ] 121 | lr_multiplier.scheduler.num_updates = train.max_iter 122 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/LVIS/mask_rcnn_PEspatial_G_lvis75ep.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import torch.nn as nn 4 | from detectron2 import model_zoo 5 | from detectron2.config import LazyCall as L 6 | from detectron2.data.detection_utils import get_fed_loss_cls_weights 7 | from detectron2.data.samplers import RepeatFactorTrainingSampler 8 | from detectron2.evaluation.lvis_evaluation import LVISEvaluator 9 | from detectron2.modeling import SimpleFeaturePyramid, ViT 10 | from detectron2.modeling.backbone.fpn import LastLevelMaxPool 11 | from detectron2.solver import WarmupParamScheduler 12 | from detectron2_pe.modeling import PEv1_det, get_vit_lr_decay_rate_pev1 13 | from fvcore.common.param_scheduler import MultiStepParamScheduler 14 | 15 | from ..COCO.mask_rcnn_vitdet_b_100ep import ( # dataloader,; model,; get_vit_lr_decay_rate, 16 | lr_multiplier, optimizer, train) 17 | from ..common.coco_loader_lsj import dataloader 18 | 19 | train.init_checkpoint = "/checkpoint/vision_encoder/pev1/pev1_rc2_spatial_d2.pt" 20 | train.output_dir = ( 21 | "/checkpoint/vision_encoder/d2_output/lvis/mask_rcnn_PEspatial_G_lvis75ep" 22 | ) 23 | 24 | model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model 25 | 26 | model.pixel_mean = [127, 127, 127] 27 | model.pixel_std = [127, 127, 127] 28 | model.input_format = "RGB" 29 | 30 | img_size = 1024 31 | embed_dim, depth, num_heads, mlp_ratio, dp = 1536, 50, 16, 8960 / 1536, 0.5 32 | pretrain_img_size, patch_size, window_size = 512, 16, 32 33 | # 12, 24, 36, 49 for global attention 34 | window_block_indexes = ( 35 | list(range(0, 12)) + list(range(13, 24)) + list(range(25, 36)) + list(range(37, 49)) 36 | ) 37 | # Creates Simple Feature Pyramid from ViT backbone 38 | model.backbone = L(SimpleFeaturePyramid)( 39 | net=L(PEv1_det)( # Single-scale ViT backbone 40 | pretrain_img_size=pretrain_img_size, 41 | img_size=img_size, 42 | patch_size=patch_size, 43 | embed_dim=embed_dim, 44 | depth=depth, 45 | num_heads=num_heads, 46 | drop_path_rate=dp, 47 | window_size=window_size, 48 | pt_hw_seq_len=32, 49 | mlp_ratio=mlp_ratio, 50 | qkv_bias=True, 51 | norm_layer=partial(nn.LayerNorm, eps=1e-6), 52 | window_block_indexes=window_block_indexes, 53 | residual_block_indexes=[], 54 | use_rel_pos=True, 55 | out_feature="last_feat", 56 | tile_posemb=True, 57 | use_abs_pos=True, 58 | pretrain_use_cls_token=False, 59 | use_act_checkpoint=True, 60 | init_values=0.1, 61 | ), 62 | in_feature="${.net.out_feature}", 63 | out_channels=256, 64 | scale_factors=(4.0, 2.0, 1.0, 0.5), 65 | top_block=L(LastLevelMaxPool)(), 66 | norm="LN", 67 | square_pad=img_size, 68 | ) 69 | 70 | model.roi_heads.num_classes = 1203 71 | model.roi_heads.box_predictor.test_score_thresh = 0.02 72 | model.roi_heads.box_predictor.test_topk_per_image = 300 73 | model.roi_heads.box_predictor.use_sigmoid_ce = True 
74 | model.roi_heads.box_predictor.use_fed_loss = True 75 | model.roi_heads.box_predictor.get_fed_loss_cls_weights = ( 76 | lambda: get_fed_loss_cls_weights(dataloader.train.dataset.names, 0.5) 77 | ) 78 | 79 | train.eval_period = 30000 80 | 81 | optimizer.params.lr_factor_func = partial( 82 | get_vit_lr_decay_rate_pev1, lr_decay_rate=0.9, num_layers=50 83 | ) 84 | 85 | 86 | dataloader.train.dataset.names = "lvis_v1_train" 87 | dataloader.train.sampler = L(RepeatFactorTrainingSampler)( 88 | repeat_factors=L( 89 | RepeatFactorTrainingSampler.repeat_factors_from_category_frequency 90 | )(dataset_dicts="${dataloader.train.dataset}", repeat_thresh=0.001) 91 | ) 92 | dataloader.test.dataset.names = "lvis_v1_val" 93 | dataloader.evaluator = L(LVISEvaluator)( 94 | dataset_name="${..test.dataset.names}", 95 | max_dets_per_image=300, 96 | output_dir="${train.output_dir}", 97 | ) 98 | 99 | dataloader.train.total_batch_size = 64 100 | 101 | train.max_iter = 184375 102 | 103 | 104 | lr_multiplier = L(WarmupParamScheduler)( 105 | scheduler=L(MultiStepParamScheduler)( 106 | values=[1.0, 0.1, 0.01], 107 | milestones=[163889, 177546], 108 | num_updates=train.max_iter, 109 | ), 110 | warmup_length=250 / train.max_iter, 111 | warmup_factor=0.001, 112 | ) 113 | 114 | optimizer.params.overrides = {} 115 | optimizer.params.weight_decay_norm = None 116 | optimizer.lr = 5e-5 117 | 118 | train.max_iter = train.max_iter * 3 // 4 # 100ep -> 75ep 119 | lr_multiplier.scheduler.milestones = [ 120 | milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones 121 | ] 122 | lr_multiplier.scheduler.num_updates = train.max_iter 123 | -------------------------------------------------------------------------------- /apps/detection/projects/ViTDet/configs/common/coco_loader_lsj.py: -------------------------------------------------------------------------------- 1 | import detectron2.data.transforms as T 2 | from detectron2 import model_zoo 3 | from detectron2.config import LazyCall as L 4 | 5 | # Data using LSJ 6 | image_size = 1024 7 | dataloader = model_zoo.get_config("common/data/coco.py").dataloader 8 | dataloader.train.mapper.augmentations = [ 9 | L(T.RandomFlip)(horizontal=True), # flip first 10 | L(T.ResizeScale)( 11 | min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size 12 | ), 13 | L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False), 14 | ] 15 | dataloader.train.mapper.image_format = "RGB" 16 | dataloader.train.total_batch_size = 64 17 | # recompute boxes due to cropping 18 | dataloader.train.mapper.recompute_boxes = True 19 | 20 | dataloader.test.mapper.augmentations = [ 21 | L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size), 22 | ] 23 | -------------------------------------------------------------------------------- /apps/detection/scripts/coco/train_mask_rcnn_PEcore_G_coco75ep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco/train_mask_rcnn_PEcore_G_coco75ep/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco/train_mask_rcnn_PEcore_G_coco75ep/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames 
$SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 28 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 8 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | tools/lazyconfig_train_net_pe_slurm.py \ 38 | --resume \ 39 | --config-file projects/ViTDet/configs/COCO/mask_rcnn_PEcore_G_coco75ep.py \ 40 | optimizer.lr=5e-5 \ 41 | train.init_checkpoint="/checkpoint/vision_encoder/pev1/pe_core_G14_448_16patch.pt" \ 42 | train.output_dir="/checkpoint/vision_encoder/d2_output/coco/train_mask_rcnn_PEcore_G_coco75ep" \ 43 | model.backbone.net.use_act_checkpoint=True \ 44 | "$@" 45 | -------------------------------------------------------------------------------- /apps/detection/scripts/coco/train_mask_rcnn_PEspatial_G_coco36ep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/coco/train_mask_rcnn_PEspatial_G_coco36ep/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/coco/train_mask_rcnn_PEspatial_G_coco36ep/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 28 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 8 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | tools/lazyconfig_train_net_pe_slurm.py \ 38 | --resume \ 39 | --config-file projects/ViTDet/configs/COCO/mask_rcnn_PEspatial_G_coco36ep.py \ 40 | optimizer.lr=5e-5 \ 41 | train.init_checkpoint="/checkpoint/vision_encoder/pev1/pe_spatial_G14_16patch.pth" \ 42 | train.output_dir="/checkpoint/vision_encoder/d2_output/coco/train_mask_rcnn_PEspatial_G_coco36ep" \ 43 | model.backbone.net.init_values=0.1 \ 44 | model.backbone.net.use_act_checkpoint=True \ 45 | "$@" 46 | -------------------------------------------------------------------------------- /apps/detection/scripts/evaluate_local.sh: -------------------------------------------------------------------------------- 1 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 2 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 3 | 4 | python3 tools/lazyconfig_train_net_pe.py \ 5 | --num-gpus 8 \ 6 | --eval-only \ 7 | "$@" -------------------------------------------------------------------------------- /apps/detection/scripts/lvis/train_mask_rcnn_PEcore_G_lvis75ep.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/lvis/train_mask_rcnn_PEcore_G_lvis75ep/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/lvis/train_mask_rcnn_PEcore_G_lvis75ep/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 28 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 8 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | tools/lazyconfig_train_net_pe_slurm.py \ 38 | --resume \ 39 | --config-file projects/ViTDet/configs/LVIS/mask_rcnn_PEcore_G_lvis75ep.py \ 40 | optimizer.lr=5e-5 \ 41 | train.init_checkpoint="/checkpoint/vision_encoder/pev1/pe_core_G14_448_16patch.pt" \ 42 | train.output_dir="/checkpoint/vision_encoder/d2_output/lvis/train_mask_rcnn_PEcore_G_lvis75ep" \ 43 | model.backbone.net.use_act_checkpoint=True \ 44 | "$@" 45 | -------------------------------------------------------------------------------- /apps/detection/scripts/lvis/train_mask_rcnn_PEspatial_G_lvis75ep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --qos=vision_encoder 4 | #SBATCH --account=vision_encoder 5 | #SBATCH --job-name=det 6 | #SBATCH --nodes=8 7 | #SBATCH --ntasks=8 8 | #SBATCH --gres=gpu:8 9 | #SBATCH --cpus-per-task=32 10 | #SBATCH --mem=0 11 | #SBATCH --output=/checkpoint/vision_encoder/d2_output/slurm_logs/lvis/train_mask_rcnn_PEspatial_G_lvis75ep/%j.out 12 | #SBATCH --error=/checkpoint/vision_encoder/d2_output/slurm_logs/lvis/train_mask_rcnn_PEspatial_G_lvis75ep/%j.err 13 | #SBATCH --time=96:00:00 14 | 15 | module load cuda/12.1 16 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 17 | nodes_array=($nodes) 18 | head_node=${nodes_array[0]} 19 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 20 | 21 | read -ra my_array <<< $head_node_ip 22 | export LOGLEVEL=INFO 23 | 24 | echo head_node_ip $head_node_ip 25 | echo endpoint "${head_node_ip}:29500" 26 | 27 | export DETECTRON2_DATASETS="/path/to/detectron2_data" 28 | export PYTHONPATH="$HOME/occhi/apps/detection:$PYTHONPATH" 29 | 30 | srun \ 31 | torchrun \ 32 | --nnodes 8 \ 33 | --nproc_per_node 8 \ 34 | --rdzv_id $RANDOM \ 35 | --rdzv_endpoint "${my_array[0]}:29500" \ 36 | --rdzv_backend c10d \ 37 | tools/lazyconfig_train_net_pe_slurm.py \ 38 | --resume \ 39 | --config-file projects/ViTDet/configs/LVIS/mask_rcnn_PEspatial_G_lvis75ep.py \ 40 | optimizer.lr=5e-5 \ 41 | train.init_checkpoint="/checkpoint/vision_encoder/pev1/pe_spatial_G14_16patch.pth" \ 42 | train.output_dir="/checkpoint/vision_encoder/d2_output/lvis/train_mask_rcnn_PEspatial_G_lvis75ep" \ 43 | model.backbone.net.init_values=0.1 \ 44 | 
model.backbone.net.use_act_checkpoint=True \ 45 | "$@" 46 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for CLIP Benchmark.""" 2 | 3 | __author__ = """Mehdi Cherti""" 4 | __email__ = "mehdicherti@gmail.com" 5 | __version__ = "0.1.0" 6 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/clip_benchmark/datasets/__init__.py -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/babel_imagenet.py: -------------------------------------------------------------------------------- 1 | import torchvision 2 | 3 | """ 4 | BabelImageNet from https://arxiv.org/pdf/2306.08658.pdf 5 | Adapted from https://github.com/gregor-ge/Babel-ImageNet, thanks to the authors 6 | """ 7 | 8 | 9 | class BabelImageNet(torchvision.datasets.ImageNet): 10 | def __init__( 11 | self, root: str, idxs, split: str = "val", download=None, **kwargs 12 | ) -> None: 13 | super().__init__(root, split, **kwargs) 14 | examples_per_class = len(self.targets) // 1000 15 | select_idxs = [ 16 | idx * examples_per_class + i 17 | for idx in idxs 18 | for i in range(examples_per_class) 19 | ] 20 | self.targets = [i for i in range(len(idxs)) for _ in range(examples_per_class)] 21 | self.imgs = [self.imgs[i] for i in select_idxs] 22 | self.samples = [self.samples[i] for i in select_idxs] 23 | self.idxs = idxs 24 | 25 | def __getitem__(self, i): 26 | img, target = super().__getitem__(i) 27 | target = self.idxs.index(target) 28 | return img, target 29 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/flickr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adapted from https://github.com/pytorch/vision/blob/main/torchvision/datasets/flickr.py 3 | Thanks to the authors of torchvision 4 | """ 5 | 6 | import glob 7 | import os 8 | from collections import defaultdict 9 | from html.parser import HTMLParser 10 | from typing import Any, Callable, Dict, List, Optional, Tuple 11 | 12 | from PIL import Image 13 | from torchvision.datasets import VisionDataset 14 | 15 | 16 | class Flickr(VisionDataset): 17 | 18 | def __init__( 19 | self, 20 | root: str, 21 | ann_file: str, 22 | transform: Optional[Callable] = None, 23 | target_transform: Optional[Callable] = None, 24 | ) -> None: 25 | super().__init__(root, transform=transform, target_transform=target_transform) 26 | self.ann_file = os.path.expanduser(ann_file) 27 | data = defaultdict(list) 28 | with open(ann_file) as fd: 29 | fd.readline() 30 | for line in fd: 31 | line = line.strip() 32 | if line: 33 | # some lines have comma in the caption, se we make sure we do the split correctly 34 | img, caption = line.strip().split(".jpg,") 35 | img = img + ".jpg" 36 | data[img].append(caption) 37 | self.data = list(data.items()) 38 | 39 | def __getitem__(self, index: int) -> Tuple[Any, Any]: 40 | """ 41 | Args: 42 | index (int): Index 43 | 44 | Returns: 45 | tuple: Tuple (image, target). target is a list of captions for the image. 
46 | """ 47 | img, captions = self.data[index] 48 | 49 | # Image 50 | img = Image.open(os.path.join(self.root, img)).convert("RGB") 51 | if self.transform is not None: 52 | img = self.transform(img) 53 | 54 | # Captions 55 | target = captions 56 | if self.target_transform is not None: 57 | target = self.target_transform(target) 58 | 59 | return img, target 60 | 61 | def __len__(self) -> int: 62 | return len(self.data) 63 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/flickr30k_200.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import os 4 | from subprocess import call 5 | 6 | import requests 7 | from PIL import Image 8 | from torchvision.datasets import VisionDataset 9 | 10 | from .flores_langs import flores_languages 11 | 12 | GITHUB_DATA_PATH = ( 13 | "https://raw.githubusercontent.com/visheratin/nllb-clip/main/data/flickr30k-200/" 14 | ) 15 | SUPPORTED_LANGUAGES = flores_languages 16 | 17 | IMAGE_INDEX_FILENAME = "filenames.txt" 18 | 19 | CAPTIONS_FILENAME_TEMPLATE = "{}.txt" 20 | OUTPUT_FILENAME_TEMPLATE = "flickr30k_200-{}.json" 21 | 22 | IMAGES_DOWNLOAD_URL = "https://nllb-data.com/test/flickr30k/images.tar.gz" 23 | 24 | 25 | class Flickr30k_200(VisionDataset): 26 | def __init__(self, root, ann_file, transform=None, target_transform=None): 27 | super().__init__(root, transform=transform, target_transform=target_transform) 28 | self.ann_file = os.path.expanduser(ann_file) 29 | with codecs.open(ann_file, "r", encoding="utf-8") as fp: 30 | data = json.load(fp) 31 | self.data = [ 32 | (img_path, txt) 33 | for img_path, txt in zip(data["image_paths"], data["annotations"]) 34 | ] 35 | 36 | def __getitem__(self, index): 37 | img, captions = self.data[index] 38 | 39 | # Image 40 | img = Image.open(img).convert("RGB") 41 | if self.transform is not None: 42 | img = self.transform(img) 43 | 44 | # Captions 45 | target = [ 46 | captions, 47 | ] 48 | if self.target_transform is not None: 49 | target = self.target_transform(target) 50 | 51 | return img, target 52 | 53 | def __len__(self) -> int: 54 | return len(self.data) 55 | 56 | 57 | def _get_lines(url): 58 | response = requests.get(url, timeout=30) 59 | return response.text.splitlines() 60 | 61 | 62 | def _download_images(out_path): 63 | os.makedirs(out_path, exist_ok=True) 64 | print("Downloading images") 65 | call(f"wget {IMAGES_DOWNLOAD_URL} -O images.tar.gz", shell=True) 66 | call(f"tar -xzf images.tar.gz -C {out_path}", shell=True) 67 | call("rm images.tar.gz", shell=True) 68 | 69 | 70 | def create_annotation_file(root, lang_code): 71 | if lang_code not in SUPPORTED_LANGUAGES: 72 | raise ValueError( 73 | f"Language code {lang_code} not supported. 
Supported languages are {SUPPORTED_LANGUAGES}" 74 | ) 75 | data_dir = os.path.join(root, "flickr30k-200") 76 | if not os.path.exists(data_dir): 77 | _download_images(data_dir) 78 | images_dir = os.path.join(root, "flickr30k-200", "images") 79 | print("Downloading flickr30k-200 index file") 80 | download_path = os.path.join(GITHUB_DATA_PATH, IMAGE_INDEX_FILENAME) 81 | target_images = _get_lines(download_path) 82 | 83 | print("Downloading flickr30k-200 captions:", lang_code) 84 | captions_path = GITHUB_DATA_PATH 85 | download_path = os.path.join( 86 | captions_path, CAPTIONS_FILENAME_TEMPLATE.format(lang_code) 87 | ) 88 | target_captions = _get_lines(download_path) 89 | 90 | number_of_missing_images = 0 91 | valid_images, valid_annotations, valid_indicies = [], [], [] 92 | for i, (img, txt) in enumerate(zip(target_images, target_captions)): 93 | image_path = os.path.join(images_dir, img) 94 | if not os.path.exists(image_path): 95 | print("Missing image file", img) 96 | number_of_missing_images += 1 97 | continue 98 | 99 | valid_images.append(image_path) 100 | valid_annotations.append(txt) 101 | valid_indicies.append(i) 102 | 103 | if number_of_missing_images > 0: 104 | print(f"*** WARNING *** missing {number_of_missing_images} files.") 105 | 106 | with codecs.open( 107 | os.path.join(root, OUTPUT_FILENAME_TEMPLATE.format(lang_code)), 108 | "w", 109 | encoding="utf-8", 110 | ) as fp: 111 | json.dump( 112 | { 113 | "image_paths": valid_images, 114 | "annotations": valid_annotations, 115 | "indicies": valid_indicies, 116 | }, 117 | fp, 118 | ensure_ascii=False, 119 | ) 120 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/flores_langs.py: -------------------------------------------------------------------------------- 1 | flores_languages = [ 2 | "ace_Arab", 3 | "ace_Latn", 4 | "acm_Arab", 5 | "acq_Arab", 6 | "aeb_Arab", 7 | "afr_Latn", 8 | "ajp_Arab", 9 | "aka_Latn", 10 | "amh_Ethi", 11 | "apc_Arab", 12 | "arb_Arab", 13 | "ars_Arab", 14 | "ary_Arab", 15 | "arz_Arab", 16 | "asm_Beng", 17 | "ast_Latn", 18 | "awa_Deva", 19 | "ayr_Latn", 20 | "azb_Arab", 21 | "azj_Latn", 22 | "bak_Cyrl", 23 | "bam_Latn", 24 | "ban_Latn", 25 | "bel_Cyrl", 26 | "bem_Latn", 27 | "ben_Beng", 28 | "bho_Deva", 29 | "bjn_Arab", 30 | "bjn_Latn", 31 | "bod_Tibt", 32 | "bos_Latn", 33 | "bug_Latn", 34 | "bul_Cyrl", 35 | "cat_Latn", 36 | "ceb_Latn", 37 | "ces_Latn", 38 | "cjk_Latn", 39 | "ckb_Arab", 40 | "crh_Latn", 41 | "cym_Latn", 42 | "dan_Latn", 43 | "deu_Latn", 44 | "dik_Latn", 45 | "dyu_Latn", 46 | "dzo_Tibt", 47 | "eng_Latn", 48 | "ell_Grek", 49 | "epo_Latn", 50 | "est_Latn", 51 | "eus_Latn", 52 | "ewe_Latn", 53 | "fao_Latn", 54 | "fij_Latn", 55 | "fin_Latn", 56 | "fon_Latn", 57 | "fra_Latn", 58 | "fur_Latn", 59 | "fuv_Latn", 60 | "gla_Latn", 61 | "gle_Latn", 62 | "glg_Latn", 63 | "grn_Latn", 64 | "guj_Gujr", 65 | "hat_Latn", 66 | "hau_Latn", 67 | "heb_Hebr", 68 | "hin_Deva", 69 | "hne_Deva", 70 | "hrv_Latn", 71 | "hun_Latn", 72 | "hye_Armn", 73 | "ibo_Latn", 74 | "ilo_Latn", 75 | "ind_Latn", 76 | "isl_Latn", 77 | "ita_Latn", 78 | "jav_Latn", 79 | "jpn_Jpan", 80 | "kab_Latn", 81 | "kac_Latn", 82 | "kam_Latn", 83 | "kan_Knda", 84 | "kas_Arab", 85 | "kas_Deva", 86 | "kat_Geor", 87 | "knc_Arab", 88 | "knc_Latn", 89 | "kaz_Cyrl", 90 | "kbp_Latn", 91 | "kea_Latn", 92 | "khm_Khmr", 93 | "kik_Latn", 94 | "kin_Latn", 95 | "kir_Cyrl", 96 | "kmb_Latn", 97 | "kmr_Latn", 98 | "kon_Latn", 99 | "kor_Hang", 100 | "lao_Laoo", 101 | "lij_Latn", 102 | "lim_Latn", 103 | 
"lin_Latn", 104 | "lit_Latn", 105 | "lmo_Latn", 106 | "ltg_Latn", 107 | "ltz_Latn", 108 | "lua_Latn", 109 | "lug_Latn", 110 | "luo_Latn", 111 | "lus_Latn", 112 | "lvs_Latn", 113 | "mag_Deva", 114 | "mai_Deva", 115 | "mal_Mlym", 116 | "mar_Deva", 117 | "min_Latn", 118 | "mkd_Cyrl", 119 | "plt_Latn", 120 | "mlt_Latn", 121 | "mni_Beng", 122 | "khk_Cyrl", 123 | "mos_Latn", 124 | "mri_Latn", 125 | "mya_Mymr", 126 | "nld_Latn", 127 | "nno_Latn", 128 | "nob_Latn", 129 | "npi_Deva", 130 | "nso_Latn", 131 | "nus_Latn", 132 | "nya_Latn", 133 | "oci_Latn", 134 | "gaz_Latn", 135 | "ory_Orya", 136 | "pag_Latn", 137 | "pan_Guru", 138 | "pap_Latn", 139 | "pes_Arab", 140 | "pol_Latn", 141 | "por_Latn", 142 | "prs_Arab", 143 | "pbt_Arab", 144 | "quy_Latn", 145 | "ron_Latn", 146 | "run_Latn", 147 | "rus_Cyrl", 148 | "sag_Latn", 149 | "san_Deva", 150 | "scn_Latn", 151 | "shn_Mymr", 152 | "sin_Sinh", 153 | "slk_Latn", 154 | "slv_Latn", 155 | "smo_Latn", 156 | "sna_Latn", 157 | "snd_Arab", 158 | "som_Latn", 159 | "sot_Latn", 160 | "spa_Latn", 161 | "als_Latn", 162 | "srd_Latn", 163 | "srp_Cyrl", 164 | "ssw_Latn", 165 | "sun_Latn", 166 | "swe_Latn", 167 | "swh_Latn", 168 | "szl_Latn", 169 | "tam_Taml", 170 | "tat_Cyrl", 171 | "tel_Telu", 172 | "tgk_Cyrl", 173 | "tgl_Latn", 174 | "tha_Thai", 175 | "tir_Ethi", 176 | "taq_Latn", 177 | "taq_Tfng", 178 | "tpi_Latn", 179 | "tsn_Latn", 180 | "tso_Latn", 181 | "tuk_Latn", 182 | "tum_Latn", 183 | "tur_Latn", 184 | "twi_Latn", 185 | "tzm_Tfng", 186 | "uig_Arab", 187 | "ukr_Cyrl", 188 | "umb_Latn", 189 | "urd_Arab", 190 | "uzn_Latn", 191 | "vec_Latn", 192 | "vie_Latn", 193 | "war_Latn", 194 | "wol_Latn", 195 | "xho_Latn", 196 | "ydd_Hebr", 197 | "yor_Latn", 198 | "yue_Hant", 199 | "zho_Hans", 200 | "zho_Hant", 201 | "zsm_Latn", 202 | "zul_Latn", 203 | ] 204 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/multilingual_mscoco.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import os 4 | from subprocess import call 5 | 6 | import requests 7 | from PIL import Image 8 | from torchvision.datasets import VisionDataset 9 | 10 | GITHUB_DATA_PATH = "https://raw.githubusercontent.com/adobe-research/Cross-lingual-Test-Dataset-XTD10/main/XTD10/" 11 | GITHUB_DATA_PATH_DE_FR = "https://raw.githubusercontent.com/adobe-research/Cross-lingual-Test-Dataset-XTD10/main/MIC/" 12 | GITHUB_DATA_PATH_JP = "https://raw.githubusercontent.com/adobe-research/Cross-lingual-Test-Dataset-XTD10/main/STAIR/" 13 | SUPPORTED_LANGUAGES = ["es", "it", "ko", "pl", "ru", "tr", "zh", "en", "de", "fr", "jp"] 14 | 15 | IMAGE_INDEX_FILENAME = "test_image_names.txt" 16 | 17 | CAPTIONS_FILENAME_TEMPLATE = "test_1kcaptions_{}.txt" 18 | OUTPUT_FILENAME_TEMPLATE = "multilingual_mscoco_captions-{}.json" 19 | 20 | IMAGES_DOWNLOAD_URL = "https://nllb-data.com/test/xtd10/images.tar.gz" 21 | 22 | 23 | class Multilingual_MSCOCO(VisionDataset): 24 | def __init__(self, root, ann_file, transform=None, target_transform=None): 25 | super().__init__(root, transform=transform, target_transform=target_transform) 26 | self.ann_file = os.path.expanduser(ann_file) 27 | with codecs.open(ann_file, "r", encoding="utf-8") as fp: 28 | data = json.load(fp) 29 | self.data = [ 30 | (img_path, txt) 31 | for img_path, txt in zip(data["image_paths"], data["annotations"]) 32 | ] 33 | 34 | def __getitem__(self, index): 35 | img, captions = self.data[index] 36 | 37 | # Image 38 | img = 
Image.open(img).convert("RGB") 39 | if self.transform is not None: 40 | img = self.transform(img) 41 | 42 | # Captions 43 | target = [ 44 | captions, 45 | ] 46 | if self.target_transform is not None: 47 | target = self.target_transform(target) 48 | 49 | return img, target 50 | 51 | def __len__(self) -> int: 52 | return len(self.data) 53 | 54 | 55 | def _get_lines(url): 56 | response = requests.get(url, timeout=30) 57 | return response.text.splitlines() 58 | 59 | 60 | def _download_images(out_path): 61 | os.makedirs(out_path, exist_ok=True) 62 | print("Downloading images") 63 | call(f"wget {IMAGES_DOWNLOAD_URL} -O images.tar.gz", shell=True) 64 | call(f"tar -xzf images.tar.gz -C {out_path}", shell=True) 65 | call("rm images.tar.gz", shell=True) 66 | 67 | 68 | def create_annotation_file(root, lang_code): 69 | if lang_code not in SUPPORTED_LANGUAGES: 70 | raise ValueError( 71 | f"Language code {lang_code} not supported. Supported languages are {SUPPORTED_LANGUAGES}" 72 | ) 73 | data_dir = os.path.join(root, "multilingual_mscoco") 74 | if not os.path.exists(data_dir): 75 | _download_images(data_dir) 76 | images_dir = os.path.join(data_dir, "images") 77 | print("Downloading multilingual_ms_coco index file") 78 | download_path = os.path.join(GITHUB_DATA_PATH, IMAGE_INDEX_FILENAME) 79 | target_images = _get_lines(download_path) 80 | 81 | print("Downloading multilingual_ms_coco captions:", lang_code) 82 | captions_path = GITHUB_DATA_PATH 83 | if lang_code in ["de", "fr"]: 84 | captions_path = GITHUB_DATA_PATH_DE_FR 85 | elif lang_code == "jp": 86 | captions_path = GITHUB_DATA_PATH_JP 87 | download_path = os.path.join( 88 | captions_path, CAPTIONS_FILENAME_TEMPLATE.format(lang_code) 89 | ) 90 | target_captions = _get_lines(download_path) 91 | 92 | number_of_missing_images = 0 93 | valid_images, valid_annotations, valid_indicies = [], [], [] 94 | for i, (img, txt) in enumerate(zip(target_images, target_captions)): 95 | image_path = os.path.join(images_dir, img) 96 | if not os.path.exists(image_path): 97 | print("Missing image file", img) 98 | number_of_missing_images += 1 99 | continue 100 | 101 | valid_images.append(image_path) 102 | valid_annotations.append(txt) 103 | valid_indicies.append(i) 104 | 105 | if number_of_missing_images > 0: 106 | print(f"*** WARNING *** missing {number_of_missing_images} files.") 107 | 108 | with codecs.open( 109 | os.path.join(root, OUTPUT_FILENAME_TEMPLATE.format(lang_code)), 110 | "w", 111 | encoding="utf-8", 112 | ) as fp: 113 | json.dump( 114 | { 115 | "image_paths": valid_images, 116 | "annotations": valid_annotations, 117 | "indicies": valid_indicies, 118 | }, 119 | fp, 120 | ensure_ascii=False, 121 | ) 122 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/objectnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code adapted from https://github.com/mlfoundations/wise-ft/blob/master/src/datasets/objectnet.py 3 | Thanks to the authors of wise-ft 4 | """ 5 | 6 | import json 7 | import os 8 | from pathlib import Path 9 | 10 | import numpy as np 11 | import PIL 12 | import torch 13 | from torchvision import datasets 14 | from torchvision.transforms import Compose 15 | 16 | 17 | def get_metadata(folder): 18 | metadata = Path(folder) 19 | 20 | with open(metadata / "folder_to_objectnet_label.json", "r") as f: 21 | folder_map = json.load(f) 22 | folder_map = {v: k for k, v in folder_map.items()} 23 | with open(metadata / "objectnet_to_imagenet_1k.json", 
"r") as f: 24 | objectnet_map = json.load(f) 25 | 26 | with open(metadata / "pytorch_to_imagenet_2012_id.json", "r") as f: 27 | pytorch_map = json.load(f) 28 | pytorch_map = {v: k for k, v in pytorch_map.items()} 29 | 30 | with open(metadata / "imagenet_to_label_2012_v2", "r") as f: 31 | imagenet_map = {v.strip(): str(pytorch_map[i]) for i, v in enumerate(f)} 32 | 33 | folder_to_ids, class_sublist = {}, [] 34 | classnames = [] 35 | for objectnet_name, imagenet_names in objectnet_map.items(): 36 | imagenet_names = imagenet_names.split("; ") 37 | imagenet_ids = [ 38 | int(imagenet_map[imagenet_name]) for imagenet_name in imagenet_names 39 | ] 40 | class_sublist.extend(imagenet_ids) 41 | folder_to_ids[folder_map[objectnet_name]] = imagenet_ids 42 | 43 | class_sublist = sorted(class_sublist) 44 | class_sublist_mask = [(i in class_sublist) for i in range(1000)] 45 | classname_map = {v: k for k, v in folder_map.items()} 46 | return class_sublist, class_sublist_mask, folder_to_ids, classname_map 47 | 48 | 49 | class ObjectNetDataset(datasets.ImageFolder): 50 | 51 | def __init__(self, root, transform): 52 | ( 53 | self._class_sublist, 54 | self.class_sublist_mask, 55 | self.folders_to_ids, 56 | self.classname_map, 57 | ) = get_metadata(root) 58 | subdir = os.path.join(root, "objectnet-1.0", "images") 59 | label_map = { 60 | name: idx 61 | for idx, name in enumerate(sorted(list(self.folders_to_ids.keys()))) 62 | } 63 | self.label_map = label_map 64 | super().__init__(subdir, transform=transform) 65 | self.samples = [ 66 | d 67 | for d in self.samples 68 | if os.path.basename(os.path.dirname(d[0])) in self.label_map 69 | ] 70 | self.imgs = self.samples 71 | self.classes = sorted(list(self.folders_to_ids.keys())) 72 | self.classes = [self.classname_map[c].lower() for c in self.classes] 73 | 74 | def __len__(self): 75 | return len(self.samples) 76 | 77 | def __getitem__(self, index): 78 | path, target = self.samples[index] 79 | sample = self.loader(path) 80 | if self.transform is not None: 81 | sample = self.transform(sample) 82 | label = os.path.basename(os.path.dirname(path)) 83 | return sample, self.label_map[label] 84 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/pos_neg_caption_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from PIL import Image 5 | from torch.utils.data import Dataset 6 | 7 | 8 | class PosNegCaptionDataset(Dataset): 9 | 10 | def __init__(self, root, ann_file, transform=None, crop_images=False): 11 | self.root = root 12 | self.ann = json.load(open(ann_file)) 13 | self.transform = transform 14 | self.crop_images = crop_images 15 | self.idx_strings = list(self.ann.keys()) # NOTE : indices may be non-contiguous 16 | 17 | def __getitem__(self, idx): 18 | idx_str = self.idx_strings[idx] 19 | data = self.ann[idx_str] 20 | img = Image.open(os.path.join(self.root, data["filename"])) 21 | if self.crop_images: 22 | img = img.crop( 23 | ( 24 | data["bbox_x"], 25 | data["bbox_y"], 26 | data["bbox_x"] + data["bbox_width"], 27 | data["bbox_y"] + data["bbox_height"], 28 | ) 29 | ) 30 | if self.transform is not None: 31 | img = self.transform(img) 32 | caption = data["caption"] 33 | negative_caption = data["negative_caption"] 34 | 35 | return img, [caption, negative_caption] 36 | 37 | def __len__(self): 38 | return len(self.ann) 39 | -------------------------------------------------------------------------------- 
/apps/pe/clip_benchmark/datasets/tfds.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | 4 | 5 | def download_tfds_dataset(name, data_dir=None): 6 | import tensorflow_datasets as tfds 7 | import timm 8 | 9 | builder = tfds.builder(name, data_dir=data_dir) 10 | builder.download_and_prepare() 11 | 12 | 13 | def disable_gpus_on_tensorflow(): 14 | import tensorflow as tf 15 | 16 | tf.config.set_visible_devices([], "GPU") 17 | 18 | 19 | class VTABIterableDataset(torch.utils.data.IterableDataset): 20 | 21 | def __init__( 22 | self, 23 | tfds_dataset, 24 | split="test", 25 | input_name="image", 26 | label_name="label", 27 | input_mode="RGB", 28 | transform=None, 29 | target_transform=None, 30 | classes=None, 31 | ): 32 | self.tfds_dataset = tfds_dataset 33 | self.input_name = input_name 34 | self.label_name = label_name 35 | self.transform = transform 36 | self.target_transform = target_transform 37 | self.input_mode = input_mode 38 | self.num_examples = tfds_dataset.get_num_samples(split) 39 | self.split = split 40 | if classes is None: 41 | self.classes = tfds_dataset._dataset_builder.info.features["label"].names 42 | else: 43 | self.classes = classes 44 | 45 | def __iter__(self): 46 | worker_info = torch.utils.data.get_worker_info() 47 | iterator = self.tfds_dataset.get_tf_data( 48 | self.split, batch_size=1, epochs=1, for_eval=True 49 | ) 50 | if worker_info is not None: 51 | iterator = iterator.shard( 52 | index=worker_info.id, num_shards=worker_info.num_workers 53 | ) 54 | nb = 0 55 | for data in iterator: 56 | inputs = data[self.input_name].numpy() 57 | labels = data[self.label_name].numpy() 58 | for input, label in zip(inputs, labels): 59 | input = Image.fromarray(input, mode=self.input_mode) 60 | if self.transform is not None: 61 | input = self.transform(input) 62 | if self.target_transform is not None: 63 | label = self.target_transform(label) 64 | yield input, label 65 | 66 | def __len__(self): 67 | return self.num_examples 68 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/video_classification_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import cv2 5 | import decord 6 | import torch 7 | from PIL import Image 8 | from torch.utils.data import Dataset 9 | 10 | 11 | class VideoClassificationDataset(Dataset): 12 | def __init__(self, dataset_dir_path, task_config, preprocessor, num_frames=8): 13 | self.dataset_dir_path = dataset_dir_path 14 | self.labels_txt = task_config["labels"] 15 | self.media_dir_path = os.path.join(dataset_dir_path, task_config["media"]) 16 | self.class_ids, self.classes = self.get_class_info() 17 | 18 | self.media_paths = [] 19 | self.labels = [] 20 | self.label_ids = [] 21 | 22 | for j, (class_id, class_name) in enumerate(zip(self.class_ids, self.classes)): 23 | class_dir_path = os.path.join(self.media_dir_path, class_id) 24 | for i, video_file_name in enumerate(os.listdir(class_dir_path)): 25 | video_path = os.path.join(class_dir_path, video_file_name) 26 | self.media_paths.append(video_path) 27 | self.labels.append(class_name) 28 | self.label_ids.append(j) 29 | 30 | self.preprocessor = preprocessor 31 | self.num_frames = num_frames 32 | 33 | def get_class_info(self): 34 | class_ids = [ 35 | dir_name 36 | for dir_name in os.listdir(self.media_dir_path) 37 | if os.path.isdir(os.path.join(self.media_dir_path, dir_name)) 38 | ] 39 | 40 
| if self.labels_txt: 41 | labels_txt_path = os.path.join(self.dataset_dir_path, self.labels_txt) 42 | id_to_class_name = {} 43 | with open(labels_txt_path, "r") as f: 44 | for line in f: 45 | id, class_name = line.strip().split(",") 46 | id_to_class_name[id] = class_name 47 | class_names = [id_to_class_name[id] for id in class_ids] 48 | else: 49 | class_names = class_ids 50 | 51 | def clean_label(label: str) -> str: 52 | """ 53 | Return a label without spaces or parenthesis 54 | """ 55 | for c in "()": 56 | label = label.replace(c, "") 57 | return label.strip("_") 58 | 59 | class_names = [clean_label(label) for label in class_names] 60 | 61 | return class_ids, class_names 62 | 63 | def __len__(self): 64 | return len(self.media_paths) 65 | 66 | def __getitem__(self, index): 67 | while True: 68 | media_path = self.media_paths[index] 69 | class_name = self.labels[index] 70 | class_id = self.label_ids[index] 71 | 72 | try: 73 | images = self._load_video(media_path) 74 | 75 | images = [ 76 | ( 77 | self.preprocessor(image.convert("RGB")) 78 | if image.mode == "L" 79 | else self.preprocessor(image) 80 | ) 81 | for image in images 82 | ] 83 | break 84 | except Exception as e: 85 | print(f"{e}, skipping {media_path}.") 86 | index = random.randint(0, len(self.media_paths) - 1) 87 | 88 | # Returns a list of images and one class_id. The model will need to aggregate across the list of images to make a prediction. 89 | return images, class_id 90 | 91 | def _load_video(self, media_path): 92 | vr = decord.VideoReader(media_path) 93 | total_frames = len(vr) 94 | if self.num_frames == 1: 95 | frame_indices = [total_frames // 2] 96 | else: 97 | frame_indices = [ 98 | int(i * (total_frames - 1) / (self.num_frames - 1)) 99 | for i in range(self.num_frames) 100 | ] 101 | 102 | try: 103 | images = vr.get_batch(frame_indices).asnumpy() 104 | except Exception as e: 105 | cap = cv2.VideoCapture(media_path) 106 | images = [] 107 | for pos in frame_indices: 108 | cap.set(cv2.CAP_PROP_POS_FRAMES, pos) 109 | ret, frame = cap.read() 110 | if ret: 111 | # Convert the frame from BGR to RGB 112 | rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 113 | images.append(rgb_frame) 114 | else: 115 | break 116 | 117 | images = [Image.fromarray(image) for image in images] 118 | 119 | return images 120 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/video_retrieval_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import cv2 5 | import decord 6 | import pandas as pd 7 | import torch 8 | from PIL import Image 9 | from torch.utils.data import Dataset 10 | 11 | 12 | class VideoRetrievalDataset(Dataset): 13 | def __init__( 14 | self, 15 | csv_path, 16 | dataset_dir, 17 | preprocessor, 18 | video_ext="mp4", 19 | num_frames=8, 20 | multi_sent=False, 21 | ): 22 | self.data = pd.read_csv(csv_path) 23 | self.dataset_dir = dataset_dir 24 | self.video_ext = video_ext 25 | 26 | self.preprocessor = preprocessor 27 | self.num_frames = num_frames 28 | self.multi_sent = multi_sent 29 | 30 | def __len__(self): 31 | return len(self.data) 32 | 33 | def __getitem__(self, index): 34 | video_id = self.data["video_id"].values[index] 35 | sentences = self.data["sentence"].values[index] 36 | if self.multi_sent: 37 | sentences = sentences.split("@") 38 | else: 39 | sentences = [sentences] 40 | video_path = os.path.join( 41 | self.dataset_dir, "{}.{}".format(video_id, self.video_ext) 42 | ) 43 | 44 | images 
= self._load_video(video_path) 45 | 46 | images = [ 47 | ( 48 | self.preprocessor(image.convert("RGB")) 49 | if image.mode == "L" 50 | else self.preprocessor(image) 51 | ) 52 | for image in images 53 | ] 54 | 55 | return images, sentences 56 | 57 | def _load_video(self, media_path): 58 | vr = decord.VideoReader(media_path) 59 | total_frames = len(vr) 60 | if self.num_frames == 1: 61 | frame_indices = [total_frames // 2] 62 | else: 63 | frame_indices = [ 64 | int(i * (total_frames - 1) / (self.num_frames - 1)) 65 | for i in range(self.num_frames) 66 | ] 67 | try: 68 | images = vr.get_batch(frame_indices).asnumpy() 69 | except Exception as e: 70 | cap = cv2.VideoCapture(media_path) 71 | images = [] 72 | for pos in frame_indices: 73 | cap.set(cv2.CAP_PROP_POS_FRAMES, pos) 74 | ret, frame = cap.read() 75 | if ret: 76 | # Convert the frame from BGR to RGB 77 | rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 78 | images.append(rgb_frame) 79 | else: 80 | break 81 | 82 | images = [Image.fromarray(image) for image in images] 83 | 84 | return images 85 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/winoground.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import torch 5 | from PIL import Image 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class WinoGround(Dataset): 10 | 11 | def __init__(self, root=".", transform=None): 12 | from datasets import load_dataset 13 | 14 | self.ds = load_dataset("facebook/winoground", cache_dir=root)["test"] 15 | self.transform = transform 16 | 17 | def __getitem__(self, idx): 18 | data = self.ds[idx] 19 | img0 = data["image_0"] 20 | img1 = data["image_1"] 21 | cap0 = data["caption_0"] 22 | cap1 = data["caption_1"] 23 | if self.transform is not None: 24 | img0 = self.transform(img0) 25 | img1 = self.transform(img1) 26 | imgs = torch.stack([img0, img1]) 27 | else: 28 | imgs = [img0, img1] 29 | caps = [cap0, cap1] 30 | return imgs, caps 31 | 32 | def __len__(self): 33 | return len(self.ds) 34 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/datasets/xtd200.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import os 4 | from subprocess import call 5 | 6 | import requests 7 | from PIL import Image 8 | from torchvision.datasets import VisionDataset 9 | 10 | from .flores_langs import flores_languages 11 | 12 | GITHUB_DATA_PATH = ( 13 | "https://raw.githubusercontent.com/visheratin/nllb-clip/main/data/xtd200/" 14 | ) 15 | SUPPORTED_LANGUAGES = flores_languages 16 | 17 | IMAGE_INDEX_FILENAME = "test_image_names.txt" 18 | 19 | CAPTIONS_FILENAME_TEMPLATE = "{}.txt" 20 | OUTPUT_FILENAME_TEMPLATE = "xtd200-{}.json" 21 | 22 | IMAGES_DOWNLOAD_URL = "https://nllb-data.com/test/xtd10/images.tar.gz" 23 | 24 | 25 | class XTD200(VisionDataset): 26 | def __init__(self, root, ann_file, transform=None, target_transform=None): 27 | super().__init__(root, transform=transform, target_transform=target_transform) 28 | self.ann_file = os.path.expanduser(ann_file) 29 | with codecs.open(ann_file, "r", encoding="utf-8") as fp: 30 | data = json.load(fp) 31 | self.data = [ 32 | (img_path, txt) 33 | for img_path, txt in zip(data["image_paths"], data["annotations"]) 34 | ] 35 | 36 | def __getitem__(self, index): 37 | img, captions = self.data[index] 38 | 39 | # Image 40 | img = Image.open(img).convert("RGB") 41 | if 
self.transform is not None: 42 | img = self.transform(img) 43 | 44 | # Captions 45 | target = [ 46 | captions, 47 | ] 48 | if self.target_transform is not None: 49 | target = self.target_transform(target) 50 | 51 | return img, target 52 | 53 | def __len__(self) -> int: 54 | return len(self.data) 55 | 56 | 57 | def _get_lines(url): 58 | response = requests.get(url, timeout=30) 59 | return response.text.splitlines() 60 | 61 | 62 | def _download_images(out_path): 63 | os.makedirs(out_path, exist_ok=True) 64 | print("Downloading images") 65 | call(f"wget {IMAGES_DOWNLOAD_URL} -O images.tar.gz", shell=True) 66 | call(f"tar -xzf images.tar.gz -C {out_path}", shell=True) 67 | call("rm images.tar.gz", shell=True) 68 | 69 | 70 | def create_annotation_file(root, lang_code): 71 | if lang_code not in SUPPORTED_LANGUAGES: 72 | raise ValueError( 73 | f"Language code {lang_code} not supported. Supported languages are {SUPPORTED_LANGUAGES}" 74 | ) 75 | data_dir = os.path.join(root, "xtd200") 76 | if not os.path.exists(data_dir): 77 | _download_images(data_dir) 78 | images_dir = os.path.join(data_dir, "images") 79 | print("Downloading xtd200 index file") 80 | download_path = os.path.join(GITHUB_DATA_PATH, IMAGE_INDEX_FILENAME) 81 | target_images = _get_lines(download_path) 82 | 83 | print("Downloading xtd200 captions:", lang_code) 84 | captions_path = GITHUB_DATA_PATH 85 | download_path = os.path.join( 86 | captions_path, CAPTIONS_FILENAME_TEMPLATE.format(lang_code) 87 | ) 88 | target_captions = _get_lines(download_path) 89 | 90 | number_of_missing_images = 0 91 | valid_images, valid_annotations, valid_indicies = [], [], [] 92 | for i, (img, txt) in enumerate(zip(target_images, target_captions)): 93 | image_path = os.path.join(images_dir, img) 94 | if not os.path.exists(image_path): 95 | print("Missing image file", img) 96 | number_of_missing_images += 1 97 | continue 98 | 99 | valid_images.append(image_path) 100 | valid_annotations.append(txt) 101 | valid_indicies.append(i) 102 | 103 | if number_of_missing_images > 0: 104 | print(f"*** WARNING *** missing {number_of_missing_images} files.") 105 | 106 | with codecs.open( 107 | os.path.join(root, OUTPUT_FILENAME_TEMPLATE.format(lang_code)), 108 | "w", 109 | encoding="utf-8", 110 | ) as fp: 111 | json.dump( 112 | { 113 | "image_paths": valid_images, 114 | "annotations": valid_annotations, 115 | "indicies": valid_indicies, 116 | }, 117 | fp, 118 | ensure_ascii=False, 119 | ) 120 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/metrics/__captioning.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from pycocoevalcap.bleu.bleu import Bleu 4 | from pycocoevalcap.cider.cider import Cider 5 | from pycocoevalcap.meteor.meteor import Meteor 6 | from pycocoevalcap.rouge.rouge import Rouge 7 | from pycocoevalcap.spice.spice import Spice 8 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 9 | # from open_clip import tokenize 10 | from tqdm.auto import tqdm 11 | 12 | # from open_clip.tokenizer import _tokenizer 13 | from core.vision_encoder.tokenizer import _tokenizer, tokenize 14 | 15 | """ 16 | Code adapted from https://github.com/salaniz/pycocoevalcap/blob/master/eval.py 17 | Thanks to @salaniz for the code! 
18 | """ 19 | 20 | 21 | class COCOEvalCap: 22 | def __init__(self, results): 23 | self.evalImgs = [] 24 | self.eval = {} 25 | self.imgToEval = {} 26 | self.results = results 27 | 28 | def evaluate(self): 29 | gts = {} 30 | res = {} 31 | for imgId, r in enumerate(self.results): 32 | gts[imgId] = r["true"] 33 | res[imgId] = r["gen"] 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print("tokenization...") 38 | tokenizer = PTBTokenizer() 39 | gts = tokenizer.tokenize(gts) 40 | res = tokenizer.tokenize(res) 41 | 42 | # ================================================= 43 | # Set up scorers 44 | # ================================================= 45 | print("setting up scorers...") 46 | scorers = [ 47 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 48 | (Meteor(), "METEOR"), 49 | (Rouge(), "ROUGE_L"), 50 | (Cider(), "CIDEr"), 51 | (Spice(), "SPICE"), 52 | ] 53 | 54 | # ================================================= 55 | # Compute scores 56 | # ================================================= 57 | for scorer, method in scorers: 58 | print("computing %s score..." % (scorer.method())) 59 | score, scores = scorer.compute_score(gts, res) 60 | if type(method) == list: 61 | for sc, scs, m in zip(score, scores, method): 62 | self.setEval(sc, m) 63 | self.setImgToEvalImgs(scs, gts.keys(), m) 64 | print("%s: %0.3f" % (m, sc)) 65 | else: 66 | self.setEval(score, method) 67 | self.setImgToEvalImgs(scores, gts.keys(), method) 68 | print("%s: %0.3f" % (method, score)) 69 | self.setEvalImgs() 70 | 71 | def setEval(self, score, method): 72 | self.eval[method] = score 73 | 74 | def setImgToEvalImgs(self, scores, imgIds, method): 75 | for imgId, score in zip(imgIds, scores): 76 | if not imgId in self.imgToEval: 77 | self.imgToEval[imgId] = {} 78 | self.imgToEval[imgId]["image_id"] = imgId 79 | self.imgToEval[imgId][method] = score 80 | 81 | def setEvalImgs(self): 82 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] 83 | 84 | 85 | def evaluate( 86 | model, 87 | dataloader, 88 | batch_size, 89 | device, 90 | transform, 91 | train_dataloader=None, 92 | num_workers=None, 93 | amp=True, 94 | verbose=False, 95 | ): 96 | results = [] 97 | image_id = 0 98 | gt = [] 99 | for idx, (img, captions) in enumerate(tqdm(dataloader)): 100 | out = model.generate(img.to(device)) 101 | decoded = [ 102 | _tokenizer.decode(i) 103 | .split("<end_of_text>")[0] 104 | .replace("<start_of_text>", "") 105 | .strip() 106 | for i in out.cpu().numpy() 107 | ] 108 | for pred, true in zip(decoded, captions): 109 | true = [{"caption": t} for t in true] 110 | pred = [{"caption": pred}] 111 | results.append({"image_id": image_id, "gen": pred, "true": true}) 112 | image_id += 1 113 | coco_eval = COCOEvalCap(results) 114 | coco_eval.evaluate() 115 | metrics = coco_eval.eval 116 | # print output evaluation scores 117 | for metric, score in metrics.items(): 118 | print(f"{metric}: {score:.3f}") 119 | return metrics 120 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/clip_benchmark/metrics/__init__.py -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/metrics/image_caption_selection.py:
-------------------------------------------------------------------------------- 1 | import logging 2 | from contextlib import suppress 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from open_clip import image_to_device 7 | from tqdm import tqdm 8 | 9 | 10 | def evaluate(model, dataloader, tokenizer, device, amp=True, args=None): 11 | """ 12 | Evaluate the model on the given dataset. 13 | The task has N instances, each instance has I images and C captions. 14 | For each instance, the goal is to find the correct image for each caption and the correct caption for each image. 15 | This is done by computing the similarities between each image and each caption. 16 | This procedure is used to evaluate the models on Winoground and SugarCrepe. 17 | 18 | Parameters 19 | ---------- 20 | 21 | model: torch.nn,Module 22 | CLIP-like model with `encode_image` and `encode_text` 23 | 24 | dataloader: torch.utils.data.Dataloader 25 | dataloader to use for evaluation 26 | 27 | tokenizer: 28 | text tokenizer, i.e. convert list of strings to torch.Tensor of integers 29 | 30 | device: cpu/cuda 31 | 32 | amp: whether to use automatic mixed precision 33 | 34 | Returns 35 | ------- 36 | 37 | dict of accuracy metrics 38 | """ 39 | autocast = torch.cuda.amp.autocast if amp else suppress 40 | image_score = [] 41 | text_score = [] 42 | score = [] 43 | for batch_images, batch_texts in tqdm(dataloader): 44 | # assert(len(batch_images.shape) == 4) 45 | batch_images = image_to_device( 46 | batch_images, 47 | device, 48 | torch.float32, 49 | mean=args.image_mean, 50 | std=args.image_std, 51 | ) 52 | # Because of the packing collate function we cannot support multi-image to caption selection 53 | nim = 1 54 | 55 | # tokenize all texts in the batch 56 | nt = len(batch_texts[0]) 57 | batch_texts_tok_ = tokenizer( 58 | [text for i, texts in enumerate(batch_texts) for text in texts] 59 | ).to(device) 60 | 61 | # compute the embedding of images and texts 62 | with torch.no_grad(), autocast(): 63 | batch_images_emb = F.normalize( 64 | model.encode_image(batch_images), dim=-1 65 | ).unsqueeze(1) 66 | B, _, emb_dim = batch_images_emb.shape 67 | batch_texts_emb = F.normalize( 68 | model.encode_text(batch_texts_tok_), dim=-1 69 | ).view(B, nt, -1) 70 | 71 | gt = torch.arange(min(nim, nt)).to(device) 72 | for i in range(B): 73 | # iteratve over instances 74 | 75 | # compute similarities between each image and each text 76 | images_emb = batch_images_emb[i] 77 | texts_emb = batch_texts_emb[i] 78 | scores = images_emb @ texts_emb.t() 79 | 80 | # i-th image should be matched to the i-th text 81 | image_closest_text = scores.argmax(dim=1)[: len(gt)] 82 | text_closest_image = scores.argmax(dim=0)[: len(gt)] 83 | pred_text_is_correct = (image_closest_text == gt).all().item() 84 | pred_image_is_correct = (text_closest_image == gt).all().item() 85 | all_correct = pred_text_is_correct and pred_image_is_correct 86 | image_score.append(pred_image_is_correct) 87 | text_score.append(pred_text_is_correct) 88 | score.append(all_correct) 89 | metrics = {} 90 | metrics["image_acc"] = torch.Tensor(image_score).float().mean().item() 91 | metrics["text_acc"] = torch.Tensor(text_score).float().mean().item() 92 | metrics["acc"] = torch.Tensor(score).float().mean().item() 93 | return metrics 94 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/metrics/multiclass_retrieval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import 
logging 3 | from contextlib import suppress 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn.functional as F 8 | from clip_benchmark.metrics.zeroshot_retrieval import (dataloader_with_indices, 9 | recall_at_k) 10 | from tqdm import tqdm 11 | 12 | 13 | def evaluate( 14 | model, 15 | dataloader, 16 | tokenizer, 17 | device, 18 | amp=True, 19 | recall_k_list=[1], 20 | args=None, 21 | retrieval_template=None, 22 | ): 23 | """ 24 | Evaluate the model on the given dataset 25 | 26 | Parameters 27 | ---------- 28 | 29 | model: torch.nn,Module 30 | CLIP-like model with `encode_image` and `encode_text` 31 | 32 | dataloader: torch.utils.data.Dataloader 33 | dataloader to use for evaluation 34 | 35 | tokenizer: 36 | text tokenizer, i.e. convert list of strings to torch.Tensor of integers 37 | 38 | device: cpu/cuda 39 | 40 | amp: whether to use automatic mixed precision 41 | 42 | recall_k_list: list of int 43 | recall@k k's to use 44 | 45 | retrieval_template: 46 | dict of retrieval templates for each class. Retrieval templates should contain lists of image/text indexes. The model will performed retrieval accross the examples in each list. 47 | 48 | Returns 49 | ------- 50 | 51 | dict of retrieval metrics 52 | """ 53 | # list of batch of images embedding 54 | batch_images_emb_list = [] 55 | # list of batch of text embedding 56 | batch_texts_emb_list = [] 57 | # for each text, we collect the corresponding image index, as each image can have multiple corresponding texts 58 | texts_image_index = [] 59 | dataloader = dataloader_with_indices(dataloader) 60 | autocast = torch.cuda.amp.autocast if amp else suppress 61 | 62 | for batch_images, batch_texts, inds in tqdm(dataloader): 63 | # move the batch to the device 64 | batch_images = image_to_device( 65 | batch_images, 66 | device, 67 | torch.float32, 68 | mean=args.image_mean, 69 | std=args.image_std, 70 | ) 71 | 72 | # tokenize all texts in the batch 73 | batch_texts_tok = tokenizer( 74 | [text for i, texts in enumerate(batch_texts) for text in texts] 75 | ).to(device) 76 | 77 | # compute the embedding of images and texts 78 | with torch.no_grad(), autocast(): 79 | batch_images_emb = F.normalize(model.encode_image(batch_images), dim=-1) 80 | batch_texts_emb = F.normalize(model.encode_text(batch_texts_tok), dim=-1) 81 | 82 | batch_images_emb_list.append(batch_images_emb.cpu()) 83 | batch_texts_emb_list.append(batch_texts_emb.cpu()) 84 | 85 | batch_size = len(batch_images_emb_list[0]) 86 | 87 | # concatenate all embeddings 88 | images_emb = torch.cat(batch_images_emb_list) 89 | texts_emb = torch.cat(batch_texts_emb_list) 90 | 91 | assert images_emb.shape[0] == texts_emb.shape[0] 92 | 93 | # get the score for each text and image pair 94 | scores = texts_emb @ images_emb.t() 95 | 96 | metrics = {} 97 | multiclass_image_retrieval = [] 98 | multiclass_text_retrieval = [] 99 | for c in retrieval_template.keys(): 100 | 101 | image_retrieval = [] 102 | text_retrieval = [] 103 | for indexes in retrieval_template[c]: 104 | retrieved = scores[np.ix_(indexes, indexes)] 105 | positive_pairs = torch.zeros_like(retrieved, dtype=bool) 106 | positive_pairs[ 107 | torch.arange(len(retrieved)), torch.arange(len(retrieved)) 108 | ] = True 109 | 110 | image_retrieval.append(recall_at_k(retrieved, positive_pairs, k=1)) 111 | text_retrieval.append(recall_at_k(retrieved.T, positive_pairs, k=1)) 112 | 113 | average_image_retrieval = torch.cat(image_retrieval).float().mean().item() 114 | average_text_retrieval = torch.cat(text_retrieval).float().mean().item() 115 | 
116 | metrics[f"image_retrieval_recall@1_{c}"] = average_image_retrieval 117 | metrics[f"text_retrieval_recall@1_{c}"] = average_text_retrieval 118 | 119 | multiclass_image_retrieval.append(average_image_retrieval) 120 | multiclass_text_retrieval.append(average_text_retrieval) 121 | 122 | metrics["image_retrieval_recall@1_multiclass"] = ( 123 | torch.tensor(multiclass_image_retrieval).float().mean().item() 124 | ) 125 | metrics["text_retrieval_recall@1_multiclass"] = ( 126 | torch.tensor(multiclass_text_retrieval).float().mean().item() 127 | ) 128 | 129 | return metrics 130 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/model_collection.py: -------------------------------------------------------------------------------- 1 | # import open_clip 2 | 3 | 4 | def get_model_collection_from_file(path): 5 | return [l.strip().split(",") for l in open(path).readlines()] 6 | 7 | 8 | model_collection = { 9 | } 10 | -------------------------------------------------------------------------------- /apps/pe/clip_benchmark/tasks/wds_benchmarks.txt: -------------------------------------------------------------------------------- 1 | # image classification 2 | wds/wds_imagenet1k 3 | wds/wds_imagenetv2 4 | wds/wds_imagenet-a 5 | wds/wds_imagenet-r 6 | wds/wds_imagenet_sketch 7 | 8 | # image retrieval 9 | wds/wds_mscoco_captions 10 | wds/wds_flickr30k 11 | 12 | # video classification 13 | k400_val 14 | 15 | # video retrieval 16 | msrvtt 17 | -------------------------------------------------------------------------------- /apps/pe/docs/assets/cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/cat.png -------------------------------------------------------------------------------- /apps/pe/docs/assets/dog.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/dog.mp4 -------------------------------------------------------------------------------- /apps/pe/docs/assets/dog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/dog.png -------------------------------------------------------------------------------- /apps/pe/docs/assets/spatial_correspondence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/spatial_correspondence.png -------------------------------------------------------------------------------- /apps/pe/docs/assets/spatial_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/spatial_features.png -------------------------------------------------------------------------------- /apps/pe/docs/assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/pe/docs/assets/teaser.png 
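The `retrieval_template` argument documented in `multiclass_retrieval.evaluate` above groups example indexes per class; recall@1 is computed within each group (image i is the positive match for text i) and then averaged per class and across classes. A minimal sketch of such a template, with hypothetical class names and indexes:

```python
# Hypothetical retrieval template for multiclass_retrieval.evaluate: one entry per class,
# each entry a list of index groups. Within a group, the code above scores the
# len(indexes) x len(indexes) similarity sub-matrix against an identity positive-pairs matrix.
retrieval_template = {
    "animals": [[0, 3, 7], [12, 15, 21]],
    "vehicles": [[1, 4, 9, 16]],
}

# metrics = evaluate(model, dataloader, tokenizer, device,
#                    recall_k_list=[1], args=args, retrieval_template=retrieval_template)
# -> per-class keys such as "image_retrieval_recall@1_animals", plus the
#    "image_retrieval_recall@1_multiclass" / "text_retrieval_recall@1_multiclass" averages
```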
-------------------------------------------------------------------------------- /apps/pe/docs/evaluation.md: -------------------------------------------------------------------------------- 1 | # Zero-Shot CLIP Benchmark Evaluation 2 | Please download the supported datasets directly from their respective hosts and update the paths in clip_benchmark/datasets/builder.py, then run: 3 | ```bash 4 | model='PE-Core-G14-448' 5 | DATASETS=./clip_benchmark/tasks/wds_benchmarks.txt 6 | DATA_ROOT=DATA_ROOT/ 7 | 8 | python -m clip_benchmark.cli eval \ 9 | --model $model \ 10 | --pretrained $CHECKPOINT \ 11 | --dataset "$DATASETS" \ 12 | --dataset_root $DATA_ROOT \ 13 | --output "./benchmark_{pretrained}_{dataset}_{num_frames}_{model}_{language}_{task}.json" \ 14 | --force-preprocess-cfg resize_mode=squash 15 | 16 | ``` 17 | This script runs the zero-shot classification and retrieval benchmarks defined in clip_benchmark/tasks/wds_benchmarks.txt. The example above includes the following tasks: 18 | - ImageNet 1K classification 19 | - ImageNet v2 classification 20 | - ImageNet Adversarial classification 21 | - MS-COCO retrieval 22 | - Flickr30K retrieval 23 | - Kinetics 400 video classification 24 | - MSR-VTT video retrieval 25 | 26 | -------------------------------------------------------------------------------- /apps/plm/configs/datasets.yaml: -------------------------------------------------------------------------------- 1 | dummy_image: 2 | annotation: apps/plm/dummy_datasets/image/annotations.jsonl 3 | root_dir: apps/plm/dummy_datasets/image/images 4 | 5 | dummy_multi_image: 6 | annotation: apps/plm/dummy_datasets/multi_image/annotations.jsonl 7 | root_dir: apps/plm/dummy_datasets/multi_image/images 8 | 9 | dummy_image_region: 10 | annotation: apps/plm/dummy_datasets/image_region/annotations.jsonl 11 | root_dir: apps/plm/dummy_datasets/image_region/images 12 | 13 | dummy_video: 14 | annotation: apps/plm/dummy_datasets/video/annotations.jsonl 15 | root_dir: apps/plm/dummy_datasets/video/videos 16 | 17 | dummy_text: 18 | annotation: apps/plm/dummy_datasets/text/annotations.jsonl 19 | 20 | dummy_stc_RDCap: 21 | annotation: apps/plm/dummy_datasets/plm_stc/RDCap.jsonl 22 | root_dir: apps/plm/dummy_datasets/plm_stc/videos 23 | 24 | dummy_stc_RCap: 25 | annotation: apps/plm/dummy_datasets/plm_stc/RCap.jsonl 26 | root_dir: apps/plm/dummy_datasets/plm_stc/videos 27 | 28 | dummy_stc_RTLoc: 29 | annotation: apps/plm/dummy_datasets/plm_stc/RTLoc.jsonl 30 | root_dir: apps/plm/dummy_datasets/plm_stc/videos 31 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_1/plm_1b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 512 in stage # 1 for PLM-1B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=16,nodes=4,gpus_per_node=8 = 16*4*8 = 512 global batch size.
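# Any combination with batch_size * nodes * gpus_per_node = 512 keeps the same effective
# global batch size, e.g. batch_size=8, nodes=8, gpus_per_node=8 (8*8*8 = 512).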
3 | 4 | name: "plm_1b_stage1" 5 | dump_dir: ./plm_1b_stage1 6 | steps: 8000 7 | seed: 777 8 | optim: 9 | lr: 1e-4 10 | warmup: 20 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 2048 26 | n_layers: 16 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.5 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 1280 38 | freeze_language_model: true 39 | freeze_vision_model: true 40 | pooling_ratio: 1 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | use_ln_post: false 51 | pool_type: "none" 52 | mlp_init: 53 | use_gaussian: true 54 | 55 | data: 56 | datamix: 57 | num_workers: 8 58 | batch_size: 16 59 | image_res: 448 60 | max_num_tiles: 1 61 | max_video_frames: 8 62 | vision_input_type: vanilla 63 | tokenizer_path: facebook/Perception-LM-1B/tokenizer.model 64 | tokenizer_name: plmchat 65 | conversation_format: warmup 66 | 67 | profiling: 68 | run: false 69 | 70 | checkpoint: 71 | dump: 72 | every: 500 73 | keep: 1 74 | init_ckpt_path: meta-llama/Llama-3.2-1B-Instruct/original 75 | # Please use the script at apps/plm/interpolate_PE_pos_embed.py to interpolate PE-Core-L14-336 (https://huggingface.co/facebook/PE-Core-L14-336) checkpoints to 448 resolution. 76 | vision_model_path: facebook/PE-Core-L14-336-interpolated-to-448/model.pt 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_1/plm_3b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 512 in stage # 1 for PLM-3B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=16,nodes=4,gpus_per_node=8 = 16*4*8 = 512 global batch size. 
3 | 4 | name: "plm_3b_stage1" 5 | dump_dir: ./plm_3b_stage1 6 | steps: 8000 7 | seed: 777 8 | optim: 9 | lr: 1e-4 10 | warmup: 20 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 3072 26 | n_layers: 28 27 | n_heads: 24 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.0 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 1280 38 | freeze_language_model: true 39 | freeze_vision_model: true 40 | pooling_ratio: 1 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | use_ln_post: false 51 | pool_type: "none" 52 | mlp_init: 53 | use_gaussian: true 54 | 55 | data: 56 | datamix: 57 | num_workers: 8 58 | batch_size: 16 59 | image_res: 448 60 | max_num_tiles: 1 61 | max_video_frames: 8 62 | vision_input_type: vanilla 63 | tokenizer_path: facebook/Perception-LM-3B/tokenizer.model 64 | tokenizer_name: plmchat 65 | conversation_format: warmup 66 | 67 | profiling: 68 | run: false 69 | 70 | checkpoint: 71 | dump: 72 | every: 500 73 | keep: 1 74 | init_ckpt_path: meta-llama/Llama-3.2-3B-Instruct/original 75 | # Please use the script at apps/plm/interpolate_PE_pos_embed.py to interpolate PE-Core-L14-336 (https://huggingface.co/facebook/PE-Core-L14-336) checkpoints to 448 resolution. 76 | vision_model_path: facebook/PE-Core-L14-336-interpolated-to-448/model.pt 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_1/plm_8b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 512 in stage # 1 for PLM-8B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=16,nodes=4,gpus_per_node=8 = 16*4*8 = 512 global batch size. 
3 | 4 | name: "plm_8b_stage1" 5 | dump_dir: ./plm_8b_stage1 6 | steps: 8000 7 | seed: 777 8 | optim: 9 | lr: 1e-4 10 | warmup: 20 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.05 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 4096 26 | n_layers: 32 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.3 31 | multiple_of: 1024 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: false 35 | max_seqlen: 1280 36 | freeze_language_model: true 37 | freeze_vision_model: true 38 | pooling_ratio: 1 39 | vision_model: 40 | image_size: 448 41 | patch_size: 14 42 | width: 1536 43 | layers: 47 44 | heads: 16 45 | use_cls_token: false 46 | use_abs_posemb: true 47 | mlp_ratio: 5.833333334 48 | use_ln_post: false 49 | pool_type: "none" 50 | mlp_init: 51 | use_gaussian: true 52 | 53 | data: 54 | datamix: 55 | num_workers: 8 56 | batch_size: 16 57 | image_res: 448 58 | max_num_tiles: 1 59 | max_video_frames: 8 60 | vision_input_type: vanilla 61 | tokenizer_path: facebook/Perception-LM-8B/tokenizer.model 62 | tokenizer_name: plmchat 63 | conversation_format: warmup 64 | 65 | profiling: 66 | run: false 67 | 68 | checkpoint: 69 | dump: 70 | every: 500 71 | keep: 1 72 | init_ckpt_path: meta-llama/Llama-3.1-8B-Instruct/original 73 | vision_model_path: facebook/PE-Core-G14-448/model.pt 74 | is_consolidated_model: True 75 | 76 | logging: 77 | freq: 10 78 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 79 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_2/plm_1b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 2048 in stage # 2 for PLM-1B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=8,nodes=32,gpus_per_node=8 = 8*32*8 = 2048 global batch size.
3 | 4 | name: "plm_1b_stage2" 5 | dump_dir: ./plm_1b_stage2 6 | steps: 35000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 2048 26 | n_layers: 16 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.5 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 6144 38 | freeze_language_model: false 39 | freeze_vision_model: false 40 | pooling_ratio: 2 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | ls_init_value: 0.1 51 | drop_path: 0.1 52 | use_ln_post: false 53 | pool_type: "none" 54 | mlp_init: 55 | use_gaussian: true 56 | 57 | data: 58 | datamix: 59 | num_workers: 8 60 | batch_size: 4 61 | image_res: 448 62 | max_num_tiles: 16 63 | max_video_frames: 16 64 | vision_input_type: thumb+tile 65 | tokenizer_path: facebook/Perception-LM-1B/tokenizer.model 66 | tokenizer_name: plmchat 67 | conversation_format: plm_sft 68 | 69 | profiling: 70 | run: false 71 | 72 | checkpoint: 73 | dump: 74 | every: 500 75 | keep: 1 76 | init_ckpt_path: 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_2/plm_3b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 2048 in stage # 2 for PLM-3B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=8,nodes=32,gpus_per_node=8 = 8*32*8 = 2048 global batch size.
3 | 4 | name: "plm_3b_stage2" 5 | dump_dir: ./plm_3b_stage2 6 | steps: 35000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 3072 26 | n_layers: 28 27 | n_heads: 24 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.0 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 6144 38 | freeze_language_model: false 39 | freeze_vision_model: false 40 | pooling_ratio: 2 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | ls_init_value: 0.1 51 | drop_path: 0.1 52 | use_ln_post: false 53 | pool_type: "none" 54 | mlp_init: 55 | use_gaussian: true 56 | 57 | data: 58 | datamix: 59 | num_workers: 8 60 | batch_size: 4 61 | image_res: 448 62 | max_num_tiles: 16 63 | max_video_frames: 16 64 | vision_input_type: thumb+tile 65 | tokenizer_path: facebook/Perception-LM-3B/tokenizer.model 66 | tokenizer_name: plmchat 67 | conversation_format: plm_sft 68 | 69 | profiling: 70 | run: false 71 | 72 | checkpoint: 73 | dump: 74 | every: 500 75 | keep: 1 76 | init_ckpt_path: 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_2/plm_8b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 2048 in stage # 2 for PLM-8B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=4,nodes=64,gpus_per_node=8 = 4*64*8 = 2048 global batch size.
3 | 4 | name: "plm_8b_stage2" 5 | dump_dir: ./plm_8b_stage2 6 | steps: 35000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.05 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 4096 26 | n_layers: 32 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.3 31 | multiple_of: 1024 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: false 35 | max_seqlen: 6144 36 | freeze_language_model: false 37 | freeze_vision_model: false 38 | pooling_ratio: 2 39 | vision_model: 40 | image_size: 448 41 | patch_size: 14 42 | width: 1536 43 | layers: 47 44 | heads: 16 45 | use_cls_token: false 46 | use_abs_posemb: true 47 | mlp_ratio: 5.833333334 48 | ls_init_value: 0.1 49 | drop_path: 0.1 50 | use_ln_post: false 51 | pool_type: "none" 52 | mlp_init: 53 | use_gaussian: true 54 | 55 | data: 56 | datamix: 57 | num_workers: 4 58 | batch_size: 2 59 | image_res: 448 60 | max_num_tiles: 16 61 | max_video_frames: 16 62 | vision_input_type: thumb+tile 63 | tokenizer_path: facebook/Perception-LM-8B/tokenizer.model 64 | tokenizer_name: plmchat 65 | conversation_format: plm_sft 66 | 67 | profiling: 68 | run: false 69 | 70 | checkpoint: 71 | dump: 72 | every: 500 73 | keep: 1 74 | init_ckpt_path: 75 | is_consolidated_model: True 76 | 77 | logging: 78 | freq: 10 79 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 80 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_3/plm_1b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 1024 in stage # 3 for PLM-1B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=4,nodes=32,gpus_per_node=8 = 4*32*8 = 1024 global batch size. 
3 | 4 | name: "plm_1b_stage3" 5 | dump_dir: ./plm_1b_stage3 6 | steps: 21000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 2048 26 | n_layers: 16 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.5 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 11520 38 | freeze_language_model: false 39 | freeze_vision_model: false 40 | pooling_ratio: 2 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | ls_init_value: 0.1 51 | drop_path: 0.1 52 | use_ln_post: false 53 | pool_type: "none" 54 | mlp_init: 55 | use_gaussian: true 56 | 57 | data: 58 | datamix: 59 | num_workers: 4 60 | batch_size: 2 61 | image_res: 448 62 | max_num_tiles: 36 63 | max_video_frames: 32 64 | vision_input_type: thumb+tile 65 | tokenizer_path: facebook/Perception-LM-1B/tokenizer.model 66 | tokenizer_name: plmchat 67 | conversation_format: plm_sft 68 | 69 | profiling: 70 | run: false 71 | 72 | checkpoint: 73 | dump: 74 | every: 500 75 | keep: 1 76 | init_ckpt_path: 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_3/plm_3b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 1024 in stage # 3 for PLM-3B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=4,nodes=32,gpus_per_node=8 = 4*32*8 = 1024 global batch size. 
3 | 4 | name: "plm_3b_stage3" 5 | dump_dir: ./plm_3b_stage3 6 | steps: 21000 7 | seed: 777 8 | optim: 9 | lr: 4e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.01 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 3072 26 | n_layers: 28 27 | n_heads: 24 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.0 31 | multiple_of: 256 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: true 35 | rope_scale_factor: 32 36 | high_freq_factor: 4 37 | max_seqlen: 11520 38 | freeze_language_model: false 39 | freeze_vision_model: false 40 | pooling_ratio: 2 41 | vision_model: 42 | image_size: 448 43 | patch_size: 14 44 | width: 1024 45 | layers: 23 46 | heads: 16 47 | use_cls_token: true 48 | use_abs_posemb: true 49 | mlp_ratio: 4.0 50 | ls_init_value: 0.1 51 | drop_path: 0.1 52 | use_ln_post: false 53 | pool_type: "none" 54 | mlp_init: 55 | use_gaussian: true 56 | 57 | data: 58 | datamix: 59 | num_workers: 4 60 | batch_size: 2 61 | image_res: 448 62 | max_num_tiles: 36 63 | max_video_frames: 32 64 | vision_input_type: thumb+tile 65 | tokenizer_path: facebook/Perception-LM-3B/tokenizer.model 66 | tokenizer_name: plmchat 67 | conversation_format: plm_sft 68 | 69 | profiling: 70 | run: false 71 | 72 | checkpoint: 73 | dump: 74 | every: 500 75 | keep: 1 76 | init_ckpt_path: 77 | is_consolidated_model: True 78 | 79 | logging: 80 | freq: 10 81 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 82 | -------------------------------------------------------------------------------- /apps/plm/configs/stage_3/plm_8b.yaml: -------------------------------------------------------------------------------- 1 | # We use a global batch size of 1024 in stage # 3 for PLM-8B model. Please adjust batch_size as per your training setup. 2 | # For example, one possible configuration is batch_size=2,nodes=64,gpus_per_node=8 = 2*64*8 = 1024 global batch size. 
3 | 4 | name: "plm_8b_stage3" 5 | dump_dir: ./plm_8b_stage3 6 | steps: 21000 7 | seed: 777 8 | optim: 9 | lr: 1e-5 10 | warmup: 120 11 | lr_min_ratio: 0.01 12 | clip: 1.0 13 | weight_decay: 0.05 14 | 15 | distributed: 16 | fsdp_type: full_shard 17 | compile: false 18 | model_dtype: bf16 19 | matmul_allow_tf32: false 20 | selective_activation_checkpointing: false 21 | full_activation_checkpointing: true 22 | tp_size: 1 23 | 24 | model: 25 | dim: 4096 26 | n_layers: 32 27 | n_heads: 32 28 | n_kv_heads: 8 29 | vocab_size: 128256 30 | ffn_dim_multiplier: 1.3 31 | multiple_of: 1024 32 | norm_eps: 1e-05 33 | rope_theta: 500000.0 34 | weight_tying: false 35 | max_seqlen: 11520 36 | freeze_language_model: false 37 | freeze_vision_model: false 38 | pooling_ratio: 2 39 | vision_model: 40 | image_size: 448 41 | patch_size: 14 42 | width: 1536 43 | layers: 47 44 | heads: 16 45 | use_cls_token: false 46 | use_abs_posemb: true 47 | mlp_ratio: 5.833333334 48 | ls_init_value: 0.1 49 | drop_path: 0.1 50 | use_ln_post: false 51 | pool_type: "none" 52 | mlp_init: 53 | use_gaussian: true 54 | 55 | data: 56 | datamix: 57 | num_workers: 4 58 | batch_size: 2 59 | image_res: 448 60 | max_num_tiles: 36 61 | max_video_frames: 32 62 | vision_input_type: thumb+tile 63 | tokenizer_path: facebook/Perception-LM-8B/tokenizer.model 64 | tokenizer_name: plmchat 65 | conversation_format: plm_sft 66 | 67 | profiling: 68 | run: false 69 | 70 | checkpoint: 71 | dump: 72 | every: 500 73 | keep: 1 74 | init_ckpt_path: 75 | is_consolidated_model: True 76 | 77 | logging: 78 | freq: 10 79 | level: INFO # Available choices for logging level are: [NOTSET, DEBUG, INFO, WARN, ERROR, FATAL, CRITICAL] 80 | -------------------------------------------------------------------------------- /apps/plm/consolidate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | 5 | import torch 6 | from omegaconf import OmegaConf 7 | 8 | from apps.plm.transformer import LMTransformer, LMTransformerArgs 9 | from core.args import dataclass_from_dict 10 | from core.checkpoint import load_from_checkpoint 11 | 12 | 13 | def build_model( 14 | ref_model_path: str, 15 | model_cls=LMTransformer, 16 | model_args_cls=LMTransformerArgs, 17 | ): 18 | ckpt_path = Path(ref_model_path) 19 | config = ckpt_path / "params.json" 20 | config = OmegaConf.load(config) 21 | 22 | model_args = dataclass_from_dict(model_args_cls, config.model, strict=False) 23 | model = model_cls(model_args) 24 | return model 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser(description="Consolidate PLM checkpoints") 29 | parser.add_argument( 30 | "--ckpt", 31 | type=str, 32 | required=True, 33 | help="Path to the checkpoint directory to consolidate", 34 | ) 35 | args = parser.parse_args() 36 | 37 | model = build_model(ref_model_path=args.ckpt) 38 | load_from_checkpoint( 39 | ckpt_dir=args.ckpt, 40 | model=model, 41 | optimizer=None, 42 | model_key="model", 43 | ) 44 | 45 | consolidated_model_state_dict = model.state_dict() 46 | output_file = os.path.join(args.ckpt, "consolidated.pth") 47 | 48 | # Save the consolidated model state_dict using torch.save 49 | print(f"Saving consolidated model state_dict to: {output_file}") 50 | torch.save(consolidated_model_state_dict, output_file) 51 | print("Consolidated checkpoint saved successfully.") 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- 
/apps/plm/dataset_conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | 3 | import os 4 | from dataclasses import dataclass 5 | from typing import Optional 6 | 7 | import yaml 8 | 9 | 10 | @dataclass 11 | class DatasetConf: 12 | name: str = "" 13 | annotation: str = "" 14 | root_dir: Optional[str] = None 15 | 16 | 17 | def read_yaml_to_configs(yaml_file_path: str) -> dict: 18 | with open(yaml_file_path, "r", encoding="utf-8") as file: 19 | yaml_data = yaml.safe_load(file) 20 | 21 | dataset_config = {} 22 | for dataset_name, dataset_info in yaml_data.items(): 23 | dataset_config[dataset_name] = DatasetConf( 24 | name=dataset_name, 25 | annotation=dataset_info["annotation"], 26 | root_dir=dataset_info.get("root_dir"), 27 | ) 28 | 29 | return dataset_config 30 | 31 | 32 | # Determine the directory of the current script 33 | current_directory = os.path.dirname(os.path.abspath(__file__)) 34 | # Construct the path to the datasets.yaml file 35 | yaml_file_path = os.path.join(current_directory, "configs", "datasets.yaml") 36 | # Read the YAML file 37 | dataset_config = read_yaml_to_configs(yaml_file_path) 38 | -------------------------------------------------------------------------------- /apps/plm/docs/evaluation.md: -------------------------------------------------------------------------------- 1 | # Evaluating Perception Language Model (PLM) 2 | 3 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM 1B-Model-blue)](https://huggingface.co/facebook/Perception-LM-1B) 4 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM 3B-Model-blue)](https://huggingface.co/facebook/Perception-LM-3B) 5 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM 8B-Model-blue)](https://huggingface.co/facebook/Perception-LM-8B) 6 | 7 | We have added our model and benchmarks to [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/lmms_eval/models/plm.py) to support reproducing our reported results on multiple image and video benchmarks. 8 | 9 | --- 10 | 11 | ## Getting Started 12 | 1. Install perception_models following the instructions in the [`Main README`](../../../README.md). 13 | 2. Install `lmms-eval`: 14 | ``` 15 | git clone https://github.com/EvolvingLMMs-Lab/lmms-eval.git 16 | cd lmms-eval 17 | pip install -e . 18 | ``` 19 | 20 | ## Run Evaluation on Standard Image and Video Tasks 21 | You can use the following command to run the evaluation. 22 | 23 | ```shell 24 | 25 | # Use facebook/Perception-LM-1B for the 1B-parameter model and facebook/Perception-LM-8B for the 8B-parameter model. 26 | CHECKPOINTS_PATH=facebook/Perception-LM-3B 27 | 28 | # Define the tasks you want to evaluate PLM on. We support all tasks available in lmms-eval; however, we have tested the following tasks with our models. 29 | 30 | ALL_TASKS=( 31 | "docvqa" "chartqa" "textvqa" "infovqa" "ai2d_no_mask" "ok_vqa" "vizwiz_vqa" "mme" 32 | "realworldqa" "pope" "mmmu" "ocrbench" "coco_karpathy_val" "nocaps" "vqav2_val" 33 | "mvbench" "videomme" "vatex_test" "egoschema" "egoschema_subset" "mlvu_dev" 34 | "tempcompass_multi_choice" "perceptiontest_val_mc" "perceptiontest_test_mc" 35 | ) 36 | 37 | # After specifying the task/tasks to evaluate, run the following command to start the evaluation. 
38 | SELECTED_TASK="textvqa,videomme" 39 | accelerate launch --num_processes=8 \ 40 | -m lmms_eval \ 41 | --model plm \ 42 | --model_args pretrained=$CHECKPOINTS_PATH \ 43 | --tasks $SELECTED_TASK \ 44 | --batch_size 1 \ 45 | --log_samples \ 46 | --log_samples_suffix plm \ 47 | --output_path $OUTPUT_PATH 48 | ``` 49 | -------------------------------------------------------------------------------- /apps/plm/docs/plm_main_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/perception_models/54e09d1fee2e65ed99b344489d785fbd7c174e8d/apps/plm/docs/plm_main_fig.png -------------------------------------------------------------------------------- /apps/plm/docs/plm_videobench.md: -------------------------------------------------------------------------------- 1 | # PLM-VideoBench 2 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM‑VideoBench-BenchMark-blue)](https://huggingface.co/datasets/facebook/PLM-VideoBench) 3 | 4 | As part of the PLM release, we provide a comprehensive set of video benchmarks (grouped as `PLM-VideoBench`) for detailed video understanding. PLM-VideoBench includes the following sub-benchmarks: 5 | 1. **Fine-Grained Question Answering (FGQA):** In this task, a model must answer a multiple-choice question (MCQ) 6 | that probes fine-grained activity understanding. 7 | 2. **Smart Glasses Question Answering (SGQA):** In this task, a model must answer open-ended questions about 8 | activities and objects visible in an egocentric video stream recorded by Meta VR Glasses. 9 | 3. **Video Region Captioning (RCap):** In this task, the model must generate a detailed description of an event 10 | involving a subject of interest in the video. 11 | 4. **Region Temporal Localization (RTLoc):** In this task, the model must identify the precise time interval within the video when the specified event takes place for the given subject. 12 | 5. **Region Dense Video Captioning (RDCap):** In this task, a model must generate a detailed description of all events involving a specific subject of interest in a video. 13 | 14 | > [!TIP] 15 | > We have added all `PLM-VideoBench` tasks to [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/tasks/plm_videobench). This makes it easy to reproduce PLM results and also allows other models to be tested on the benchmarks. 16 | 17 | You can use the following command to evaluate PLM on PLM-VideoBench. 18 | 19 | ```shell 20 | 21 | # Use facebook/Perception-LM-1B for the 1B-parameter model and facebook/Perception-LM-8B for the 8B-parameter model. 22 | CHECKPOINTS_PATH=facebook/Perception-LM-3B 23 | 24 | # PLM-VideoBench Tasks 25 | SELECTED_TASK=fgqa_test,sgqa_test,rtloc_test,rcap_test,rdcap_test 26 | OUTPUT_PATH="plm_videobench_evaluation" 27 | 28 | accelerate launch --num_processes=8 \ 29 | -m lmms_eval \ 30 | --model plm \ 31 | --model_args pretrained=$CHECKPOINTS_PATH \ 32 | --tasks $SELECTED_TASK \ 33 | --batch_size 1 \ 34 | --log_samples \ 35 | --log_samples_suffix plm \ 36 | --output_path $OUTPUT_PATH 37 | ``` 38 | 39 | ## Results 40 | 41 | We evaluate PLM against baselines on PLM-VideoBench and 42 | report per-task breakdowns. Human performance is reported in the first row. 43 | | Model | FGQA (MBacc) | SGQA (Acc) | RDCap (SODA) | RCap (Score) | RTLoc (meanR) | Avg. | 44 | |------------------|------|------|------------|------------|-------------|------| 45 | | Human perf. 
| 90.9 | 67.9 | 66.6 | 53.9 | 67.8 | 73.9 | 46 | | GPT-4o | 61.2 | **63.7** | 20.9 | 35.7 | 33.1 | 51.6 | 47 | | Gemini 1.5 Pro | 57.1 | 49.9 | 14.4 | 33.1 | 27.6 | 44.0 | 48 | | Gemini 2.0 Flash | 58.7 | 44.8 | 13.2 | 30.9 | 27.6 | 42.5 | 49 | | LLaVA-OV-7B | 40.2 | 41.5 | 4.7 | 24.4 | 13.9 | 32.0 | 50 | | Qwen2VL-7B | 49.2 | 44.5 | 4.1 | 17.6 | 15.1 | 35.3 | 51 | | Qwen2.5VL-7B | 49.8 | 43.0 | 2.5 | 21.5 | 10.7 | 34.8 | 52 | | InternVL2-8B | 47.7 | 45.9 | 1.2 | 21.5 | 11.6 | 35.0 | 53 | | InternVL2.5-8B | 53.7 | 48.3 | 5.7 | 26.1 | 8.8 | 38.5 | 54 | | PLM-8B | **67.7** | 46.2 | **52.8** | **46.6** | **59.1** | **55.6** | 55 | -------------------------------------------------------------------------------- /apps/plm/docs/training.md: -------------------------------------------------------------------------------- 1 | # Training Perception Language Model (PLM) 2 | 3 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM Synthetic-Image-blue)](https://huggingface.co/datasets/facebook/PLM-Image-Auto) 4 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM Synthetic-Video-blue)](https://huggingface.co/datasets/facebook/PLM-Video-Auto) 5 | [![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20PLM Human-Video-blue)](https://huggingface.co/datasets/facebook/PLM-Video-Human) 6 | 7 | We provide instructions to train or finetune PLM on a custom dataset. 8 | 9 | --- 10 | 11 | > [!TIP] 12 | > We provide configurations for each training stage ([`stage_1`](../configs/stage_1/), [`stage_2`](../configs/stage_2/), and [`stage_3`](../configs/stage_3/)) to facilitate reproducibility of PLM training. 13 | 14 | 15 | ## Data Format :open_file_folder: 16 | 17 | We support both image and video conversation datasets in `jsonl` format. Each line of the `jsonl` file should follow the format below: 18 | 19 | ### For Image Conversation Dataset 20 | ```json 21 | { 22 | "image": "", 23 | "conversations": [ 24 | { 25 | "from": "human", 26 | "value": "human instruction" 27 | }, 28 | { 29 | "from": "assistant", 30 | "value": "model response" 31 | } 32 | ] 33 | } 34 | ``` 35 | 36 | ### For Video Conversation Dataset 37 | ```json 38 | { 39 | "video": "