├── README.md ├── assets └── ov_parts.jpg ├── baselines ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── cat_seg.cpython-38.pyc │ ├── clipseg.cpython-38.pyc │ ├── config.cpython-38.pyc │ ├── mask_former_model.cpython-38.pyc │ ├── test_time_augmentation.cpython-38.pyc │ └── zero_shot_obj_part_mask_former_model.cpython-38.pyc ├── cat_seg.py ├── clipseg.py ├── config.py ├── data │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── build.cpython-38.pyc │ ├── augmentations.py │ ├── build.py │ ├── dataset_mappers │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── object_part_mapper.cpython-38.pyc │ │ │ └── oracle_dataset_mapper.cpython-38.pyc │ │ ├── object_part_mapper.py │ │ └── oracle_dataset_mapper.py │ ├── datasets │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── coco.cpython-38.pyc │ │ │ ├── register_ade_part_234.cpython-38.pyc │ │ │ ├── register_pascal_part_116.cpython-38.pyc │ │ │ └── utils.cpython-38.pyc │ │ ├── ade20kpart234_mapping.json │ │ ├── coco.py │ │ ├── mask_cls_collect.py │ │ ├── register_ade_part_234.py │ │ ├── register_pascal_part_116.py │ │ └── utils.py │ └── transforms │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── augmentation.cpython-38.pyc │ │ ├── augmentation_impl.cpython-38.pyc │ │ └── transform.cpython-38.pyc │ │ ├── augmentation.py │ │ ├── augmentation_impl.py │ │ └── transform.py ├── evaluation │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── classification_evaluation.cpython-38.pyc │ │ ├── generalized_sem_seg_evaluation.cpython-38.pyc │ │ └── pseudo_sem_seg_evaluation.cpython-38.pyc │ └── generalized_sem_seg_evaluation.py ├── mask_former_model.py ├── modeling │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── criterion.cpython-38.pyc │ │ └── matcher.cpython-38.pyc │ ├── backbone │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── clip_resnet.cpython-38.pyc │ │ │ └── swin.cpython-38.pyc │ │ ├── clip_resnet.py │ │ └── swin.py │ ├── clip_adapter │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── adapter.cpython-38.pyc │ │ │ ├── text_prompt.cpython-38.pyc │ │ │ └── utils.cpython-38.pyc │ │ ├── adapter.py │ │ ├── text_prompt.py │ │ └── utils.py │ ├── criterion.py │ ├── heads │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── cat_seg_head.cpython-38.pyc │ │ │ ├── mask_former_head.cpython-38.pyc │ │ │ ├── pixel_decoder.cpython-38.pyc │ │ │ ├── zero_shot_mask_former_head.cpython-38.pyc │ │ │ └── zero_shot_obj_part_mask_former_head.cpython-38.pyc │ │ ├── cat_seg_head.py │ │ ├── mask_former_head.py │ │ ├── pixel_decoder.py │ │ ├── zero_shot_mask_former_head.py │ │ └── zero_shot_obj_part_mask_former_head.py │ ├── matcher.py │ └── transformer │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── cat_seg_predictor.cpython-38.pyc │ │ ├── model.cpython-38.pyc │ │ ├── position_encoding.cpython-38.pyc │ │ ├── transformer.cpython-38.pyc │ │ ├── transformer_predictor.cpython-38.pyc │ │ ├── zero_shot_obj_part_transformer_predictor.cpython-38.pyc │ │ └── zero_shot_transformer_predictor.cpython-38.pyc │ │ ├── cat_seg_predictor.py │ │ ├── model.py │ │ ├── position_encoding.py │ │ ├── transformer.py │ │ ├── transformer_predictor.py │ │ ├── zero_shot_obj_part_transformer_predictor.py │ │ └── zero_shot_transformer_predictor.py ├── test_time_augmentation.py ├── third_party │ ├── 
__init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── clip.cpython-37.pyc │ │ ├── clip.cpython-38.pyc │ │ ├── imagenet_templates.cpython-38.pyc │ │ ├── model_vpt.cpython-37.pyc │ │ ├── model_vpt.cpython-38.pyc │ │ ├── simple_tokenizer.cpython-37.pyc │ │ └── simple_tokenizer.cpython-38.pyc │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── clip.py │ ├── imagenet_templates.py │ ├── model.py │ ├── model_vpt.py │ └── simple_tokenizer.py ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── events.cpython-38.pyc │ │ ├── misc.cpython-38.pyc │ │ └── post_process_utils.cpython-38.pyc │ ├── events.py │ ├── misc.py │ ├── post_process_utils.py │ └── selective_search.py └── zero_shot_obj_part_mask_former_model.py ├── configs ├── Base-VOC11K-20.yaml ├── base_catseg_config.yaml ├── cross_dataset │ └── clipseg_ade.yaml ├── few_shot │ ├── catseg_ade.yaml │ ├── catseg_voc.yaml │ ├── clipseg_ade.yaml │ └── clipseg_voc.yaml ├── maskformer_R50_bs16_20k.yaml └── zero_shot │ ├── catseg_ade.yaml │ ├── catseg_voc.yaml │ ├── clipseg_ade.yaml │ ├── clipseg_voc.yaml │ ├── zsseg+_R50_coop_ade.yaml │ └── zsseg+_R50_coop_voc.yaml ├── open_clip ├── CITATION.cff ├── HISTORY.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── pytest.ini ├── requirements-test.txt ├── requirements-training.txt ├── requirements.txt ├── setup.py ├── src │ ├── open_clip │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── constants.cpython-38.pyc │ │ │ ├── factory.cpython-38.pyc │ │ │ ├── hf_configs.cpython-38.pyc │ │ │ ├── hf_model.cpython-38.pyc │ │ │ ├── loss.cpython-38.pyc │ │ │ ├── model.cpython-38.pyc │ │ │ ├── modified_resnet.cpython-38.pyc │ │ │ ├── openai.cpython-38.pyc │ │ │ ├── pretrained.cpython-38.pyc │ │ │ ├── timm_model.cpython-38.pyc │ │ │ ├── tokenizer.cpython-38.pyc │ │ │ ├── transform.cpython-38.pyc │ │ │ ├── transformer.cpython-38.pyc │ │ │ ├── utils.cpython-38.pyc │ │ │ └── version.cpython-38.pyc │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── constants.py │ │ ├── factory.py │ │ ├── hf_configs.py │ │ ├── hf_model.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── model_configs │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── RN50x64.json │ │ │ ├── ViT-B-16-plus-240.json │ │ │ ├── ViT-B-16-plus.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-plus-256.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ ├── ViT-H-14.json │ │ │ ├── ViT-H-16.json │ │ │ ├── ViT-L-14-280.json │ │ │ ├── ViT-L-14-336.json │ │ │ ├── ViT-L-14.json │ │ │ ├── ViT-L-16-320.json │ │ │ ├── ViT-L-16.json │ │ │ ├── ViT-M-16-alt.json │ │ │ ├── ViT-M-16.json │ │ │ ├── ViT-M-32-alt.json │ │ │ ├── ViT-M-32.json │ │ │ ├── ViT-S-16-alt.json │ │ │ ├── ViT-S-16.json │ │ │ ├── ViT-S-32-alt.json │ │ │ ├── ViT-S-32.json │ │ │ ├── ViT-bigG-14.json │ │ │ ├── ViT-e-14.json │ │ │ ├── ViT-g-14.json │ │ │ ├── convnext_base.json │ │ │ ├── convnext_base_w.json │ │ │ ├── convnext_base_w_320.json │ │ │ ├── convnext_large.json │ │ │ ├── convnext_large_d.json │ │ │ ├── convnext_small.json │ │ │ ├── convnext_tiny.json │ │ │ ├── convnext_xlarge.json │ │ │ ├── convnext_xxlarge.json │ │ │ ├── convnext_xxlarge_320.json │ │ │ ├── mt5-base-ViT-B-32.json │ │ │ ├── mt5-xl-ViT-H-14.json │ │ │ ├── roberta-ViT-B-32.json │ │ │ ├── swin_base_patch4_window7_224.json │ │ │ ├── vit_medium_patch16_gap_256.json │ │ │ ├── vit_relpos_medium_patch16_cls_224.json │ │ │ ├── xlm-roberta-base-ViT-B-32.json │ │ │ 
└── xlm-roberta-large-ViT-H-14.json │ │ ├── modified_resnet.py │ │ ├── openai.py │ │ ├── pretrained.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ ├── transformer.py │ │ ├── utils.py │ │ └── version.py │ ├── open_clip_torch.egg-info │ │ ├── PKG-INFO │ │ ├── SOURCES.txt │ │ ├── dependency_links.txt │ │ ├── requires.txt │ │ └── top_level.txt │ └── training │ │ ├── __init__.py │ │ ├── data.py │ │ ├── distributed.py │ │ ├── file_utils.py │ │ ├── imagenet_zeroshot_data.py │ │ ├── logger.py │ │ ├── main.py │ │ ├── params.py │ │ ├── precision.py │ │ ├── profile.py │ │ ├── scheduler.py │ │ ├── train.py │ │ └── zero_shot.py └── tests │ ├── test_download_pretrained.py │ ├── test_hf_model.py │ ├── test_inference.py │ ├── test_inference_simple.py │ ├── test_num_shards.py │ ├── test_training_simple.py │ └── util_test.py ├── requirements.txt ├── train_net.py └── transformers ├── __init__.py ├── __pycache__ ├── __init__.cpython-38.pyc ├── activations.cpython-38.pyc ├── configuration_utils.cpython-38.pyc ├── convert_slow_tokenizer.cpython-38.pyc ├── deepspeed.cpython-38.pyc ├── dependency_versions_check.cpython-38.pyc ├── dependency_versions_table.cpython-38.pyc ├── dynamic_module_utils.cpython-38.pyc ├── feature_extraction_utils.cpython-38.pyc ├── file_utils.cpython-38.pyc ├── image_processing_utils.cpython-38.pyc ├── image_transforms.cpython-38.pyc ├── image_utils.cpython-38.pyc ├── modeling_outputs.cpython-38.pyc ├── modeling_utils.cpython-38.pyc ├── processing_utils.cpython-38.pyc ├── pytorch_utils.cpython-38.pyc ├── tokenization_utils.cpython-38.pyc ├── tokenization_utils_base.cpython-38.pyc └── tokenization_utils_fast.cpython-38.pyc ├── activations.py ├── activations_tf.py ├── audio_utils.py ├── benchmark ├── __init__.py ├── benchmark.py ├── benchmark_args.py ├── benchmark_args_tf.py ├── benchmark_args_utils.py ├── benchmark_tf.py └── benchmark_utils.py ├── commands ├── __init__.py ├── add_new_model.py ├── add_new_model_like.py ├── convert.py ├── download.py ├── env.py ├── lfs.py ├── pt_to_tf.py ├── run.py ├── serving.py ├── train.py ├── transformers_cli.py └── user.py ├── configuration_utils.py ├── convert_graph_to_onnx.py ├── convert_pytorch_checkpoint_to_tf2.py ├── convert_slow_tokenizer.py ├── convert_slow_tokenizers_checkpoints_to_fast.py ├── convert_tf_hub_seq_to_seq_bert_to_pytorch.py ├── data ├── __init__.py ├── data_collator.py ├── datasets │ ├── __init__.py │ ├── glue.py │ ├── language_modeling.py │ └── squad.py ├── metrics │ ├── __init__.py │ └── squad_metrics.py ├── processors │ ├── __init__.py │ ├── glue.py │ ├── squad.py │ ├── utils.py │ └── xnli.py └── test_generation_utils.py ├── debug_utils.py ├── deepspeed.py ├── dependency_versions_check.py ├── dependency_versions_table.py ├── dynamic_module_utils.py ├── feature_extraction_sequence_utils.py ├── feature_extraction_utils.py ├── file_utils.py ├── generation ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── beam_constraints.cpython-38.pyc │ ├── beam_search.cpython-38.pyc │ ├── configuration_utils.cpython-38.pyc │ ├── logits_process.cpython-38.pyc │ ├── stopping_criteria.cpython-38.pyc │ └── utils.cpython-38.pyc ├── beam_constraints.py ├── beam_search.py ├── configuration_utils.py ├── flax_logits_process.py ├── flax_utils.py ├── logits_process.py ├── stopping_criteria.py ├── streamers.py ├── tf_logits_process.py ├── tf_utils.py └── utils.py ├── generation_flax_utils.py ├── generation_tf_utils.py ├── generation_utils.py ├── hf_argparser.py ├── image_processing_utils.py ├── image_transforms.py 
├── image_utils.py ├── integrations.py ├── keras_callbacks.py ├── modelcard.py ├── modeling_flax_outputs.py ├── modeling_flax_pytorch_utils.py ├── modeling_flax_utils.py ├── modeling_outputs.py ├── modeling_tf_outputs.py ├── modeling_tf_pytorch_utils.py ├── modeling_tf_utils.py ├── modeling_utils.py ├── models ├── auto │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── auto_factory.cpython-38.pyc │ │ ├── configuration_auto.cpython-38.pyc │ │ └── modeling_auto.cpython-38.pyc │ ├── auto_factory.py │ ├── configuration_auto.py │ ├── feature_extraction_auto.py │ ├── image_processing_auto.py │ ├── modeling_auto.py │ ├── modeling_flax_auto.py │ ├── modeling_tf_auto.py │ ├── processing_auto.py │ └── tokenization_auto.py ├── clip │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── tokenization_clip.cpython-38.pyc │ │ └── tokenization_clip_fast.cpython-38.pyc │ ├── configuration_clip.py │ ├── convert_clip_original_pytorch_to_hf.py │ ├── feature_extraction_clip.py │ ├── image_processing_clip.py │ ├── modeling_clip.py │ ├── modeling_flax_clip.py │ ├── modeling_tf_clip.py │ ├── processing_clip.py │ ├── tokenization_clip.py │ └── tokenization_clip_fast.py ├── clipseg │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── configuration_clipseg.cpython-38.pyc │ │ ├── modeling_clipseg.cpython-38.pyc │ │ └── processing_clipseg.cpython-38.pyc │ ├── configuration_clipseg.py │ ├── convert_clipseg_original_pytorch_to_hf.py │ ├── modeling_clipseg.py │ └── processing_clipseg.py └── vit │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── image_processing_vit.cpython-38.pyc │ ├── configuration_vit.py │ ├── convert_dino_to_pytorch.py │ ├── convert_vit_timm_to_pytorch.py │ ├── feature_extraction_vit.py │ ├── image_processing_vit.py │ ├── modeling_flax_vit.py │ ├── modeling_tf_vit.py │ └── modeling_vit.py ├── onnx ├── __init__.py ├── __main__.py ├── config.py ├── convert.py ├── features.py └── utils.py ├── optimization.py ├── optimization_tf.py ├── pipelines ├── __init__.py ├── audio_classification.py ├── audio_utils.py ├── automatic_speech_recognition.py ├── base.py ├── conversational.py ├── depth_estimation.py ├── document_question_answering.py ├── feature_extraction.py ├── fill_mask.py ├── image_classification.py ├── image_segmentation.py ├── image_to_text.py ├── mask_generation.py ├── object_detection.py ├── pt_utils.py ├── question_answering.py ├── table_question_answering.py ├── text2text_generation.py ├── text_classification.py ├── text_generation.py ├── token_classification.py ├── video_classification.py ├── visual_question_answering.py ├── zero_shot_audio_classification.py ├── zero_shot_classification.py ├── zero_shot_image_classification.py └── zero_shot_object_detection.py ├── processing_utils.py ├── pytorch_utils.py ├── sagemaker ├── __init__.py ├── trainer_sm.py └── training_args_sm.py ├── testing_utils.py ├── tf_utils.py ├── time_series_utils.py ├── tokenization_utils.py ├── tokenization_utils_base.py ├── tokenization_utils_fast.py ├── trainer.py ├── trainer_callback.py ├── trainer_pt_utils.py ├── trainer_seq2seq.py ├── trainer_tf.py ├── trainer_utils.py ├── training_args.py ├── training_args_seq2seq.py ├── training_args_tf.py └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-38.pyc ├── constants.cpython-38.pyc ├── doc.cpython-38.pyc ├── dummy_flax_objects.cpython-38.pyc ├── dummy_keras_nlp_objects.cpython-38.pyc ├── dummy_sentencepiece_and_tokenizers_objects.cpython-38.pyc ├── 
dummy_speech_objects.cpython-38.pyc ├── dummy_tensorflow_text_objects.cpython-38.pyc ├── dummy_tf_objects.cpython-38.pyc ├── dummy_tokenizers_objects.cpython-38.pyc ├── generic.cpython-38.pyc ├── hub.cpython-38.pyc ├── import_utils.cpython-38.pyc ├── logging.cpython-38.pyc ├── quantization_config.cpython-38.pyc └── versions.cpython-38.pyc ├── backbone_utils.py ├── bitsandbytes.py ├── constants.py ├── doc.py ├── dummy_detectron2_objects.py ├── dummy_flax_objects.py ├── dummy_keras_nlp_objects.py ├── dummy_pt_objects.py ├── dummy_sentencepiece_and_tokenizers_objects.py ├── dummy_sentencepiece_objects.py ├── dummy_speech_objects.py ├── dummy_tensorflow_text_objects.py ├── dummy_tf_objects.py ├── dummy_tokenizers_objects.py ├── dummy_vision_objects.py ├── fx.py ├── generic.py ├── hp_naming.py ├── hub.py ├── import_utils.py ├── logging.py ├── model_parallel_utils.py ├── notebook.py ├── quantization_config.py ├── sentencepiece_model_pb2.py └── versions.py /assets/ov_parts.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/assets/ov_parts.jpg -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from . import data 2 | from . import modeling 3 | from .config import add_mask_former_config 4 | 5 | from .test_time_augmentation import SemanticSegmentorWithTTA 6 | from .mask_former_model import MaskFormer 7 | from .zero_shot_obj_part_mask_former_model import ZeroShotObjPartMaskFormer 8 | from .clipseg import CLIPSeg 9 | from .cat_seg import CATSeg -------------------------------------------------------------------------------- /baselines/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/__pycache__/cat_seg.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/__pycache__/cat_seg.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/__pycache__/clipseg.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/__pycache__/clipseg.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/__pycache__/config.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/__pycache__/config.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/__pycache__/mask_former_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/__pycache__/mask_former_model.cpython-38.pyc 
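A minimal usage sketch for the package laid out in `baselines/__init__.py` above (illustrative, not taken from the repository sources; it assumes `add_mask_former_config` mutates a detectron2 `CfgNode` in place and that `train_net.py` remains the real entry point):

    from detectron2.config import get_cfg
    from detectron2.modeling import build_model

    from baselines import add_mask_former_config

    cfg = get_cfg()
    add_mask_former_config(cfg)  # assumed to add the MaskFormer / zero-shot config keys in place
    cfg.merge_from_file("configs/zero_shot/zsseg+_R50_coop_voc.yaml")
    model = build_model(cfg)     # cfg.MODEL.META_ARCHITECTURE selects one of the classes imported above

The imports of MaskFormer, ZeroShotObjPartMaskFormer, CLIPSeg and CATSeg in `baselines/__init__.py` are presumably there for this registration side effect, so that detectron2's registries can resolve the meta-architecture named in the yaml configs.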
-------------------------------------------------------------------------------- /baselines/__pycache__/test_time_augmentation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/__pycache__/test_time_augmentation.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/__pycache__/zero_shot_obj_part_mask_former_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/__pycache__/zero_shot_obj_part_mask_former_model.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset_mappers import * 2 | from . import datasets 3 | from .build import ( 4 | build_detection_train_loader, 5 | build_detection_test_loader, 6 | ) 7 | -------------------------------------------------------------------------------- /baselines/data/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/__pycache__/build.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/__pycache__/build.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/dataset_mappers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
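The `baselines.data` package above re-exports custom `build_detection_train_loader` / `build_detection_test_loader` builders from `build.py`. Assuming they keep detectron2's calling convention, and with an illustrative dataset name and mapper signature (both assumptions, not taken from the repository), a test loader would be wired roughly like this, using the `SemanticObjPartDatasetMapper` imported just below:

    from detectron2.config import get_cfg

    from baselines import add_mask_former_config
    from baselines.data import build_detection_test_loader
    from baselines.data.dataset_mappers import SemanticObjPartDatasetMapper

    cfg = get_cfg()
    add_mask_former_config(cfg)
    cfg.merge_from_file("configs/zero_shot/zsseg+_R50_coop_voc.yaml")

    # "voc_obj_part_sem_seg_val" and the (cfg, is_train) mapper signature are illustrative guesses.
    loader = build_detection_test_loader(
        cfg,
        "voc_obj_part_sem_seg_val",
        mapper=SemanticObjPartDatasetMapper(cfg, is_train=False),
    )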
2 | from .object_part_mapper import SemanticObjPartDatasetMapper 3 | from .oracle_dataset_mapper import OracleDatasetMapper -------------------------------------------------------------------------------- /baselines/data/dataset_mappers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/dataset_mappers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/dataset_mappers/__pycache__/object_part_mapper.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/dataset_mappers/__pycache__/object_part_mapper.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/dataset_mappers/__pycache__/oracle_dataset_mapper.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/dataset_mappers/__pycache__/oracle_dataset_mapper.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/dataset_mappers/oracle_dataset_mapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | 8 | from detectron2.data import detection_utils as utils 9 | from detectron2.data import transforms as T 10 | from detectron2.data import DatasetMapper 11 | 12 | 13 | class OracleDatasetMapper(DatasetMapper): 14 | def __call__(self, dataset_dict): 15 | """ 16 | Args: 17 | dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 18 | 19 | Returns: 20 | dict: a format that builtin models in detectron2 accept 21 | """ 22 | dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below 23 | # USER: Write your own image loading if it's not from a file 24 | image = utils.read_image(dataset_dict["file_name"], format=self.image_format) 25 | utils.check_image_size(dataset_dict, image) 26 | 27 | # USER: Remove if you don't do semantic/panoptic segmentation. 28 | if "sem_seg_file_name" in dataset_dict: 29 | sem_seg_gt = utils.read_image( 30 | dataset_dict.pop("sem_seg_file_name"), "L" 31 | ).squeeze(2) 32 | else: 33 | sem_seg_gt = None 34 | 35 | aug_input = T.AugInput(image, sem_seg=sem_seg_gt) 36 | transforms = self.augmentations(aug_input) 37 | image, sem_seg_gt = aug_input.image, aug_input.sem_seg 38 | 39 | image_shape = image.shape[:2] # h, w 40 | # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, 41 | # but not efficient on large generic data structures due to the use of pickle & mp.Queue. 42 | # Therefore it's important to use torch.Tensor. 43 | dataset_dict["image"] = torch.as_tensor( 44 | np.ascontiguousarray(image.transpose(2, 0, 1)) 45 | ) 46 | if sem_seg_gt is not None: 47 | dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) 48 | 49 | # USER: Remove if you don't use pre-computed proposals. 50 | # Most users would not need this feature. 
51 | if self.proposal_topk is not None: 52 | utils.transform_proposals( 53 | dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk 54 | ) 55 | 56 | if "annotations" in dataset_dict: 57 | self._transform_annotations(dataset_dict, transforms, image_shape) 58 | 59 | return dataset_dict 60 | -------------------------------------------------------------------------------- /baselines/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .register_pascal_part_116 import register_pascal_part_116 3 | from .register_ade_part_234 import register_ade20k_part_234 -------------------------------------------------------------------------------- /baselines/data/datasets/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/datasets/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/datasets/__pycache__/coco.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/datasets/__pycache__/coco.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/datasets/__pycache__/register_ade_part_234.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/datasets/__pycache__/register_ade_part_234.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/datasets/__pycache__/register_pascal_part_116.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/datasets/__pycache__/register_pascal_part_116.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/datasets/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/datasets/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/datasets/mask_cls_collect.py: -------------------------------------------------------------------------------- 1 | # From Repository: https://github.com/MendelXu/zsseg.baseline/blob/master/tools/mask_cls_collect.py 2 | 3 | import os 4 | import glob 5 | import functools 6 | from mmcv.utils import track_parallel_progress 7 | import numpy as np 8 | from PIL import Image 9 | import json 10 | import warnings 11 | import fire 12 | from itertools import chain 13 | 14 | 15 | def count_cls(file_path, ignore_index=[255], depth=1): 16 | cls_label = np.unique(np.asarray(Image.open(file_path))).tolist() 17 | cls_label = [l for l in cls_label if l not in ignore_index] 18 | return [os.path.join(*file_path.split(os.sep)[-depth:]), cls_label] 19 | 20 | 21 | def main(gt_dir, map_file_save_path, ignore_index=[255], 
ext=".png", recursive=False): 22 | if not os.path.isdir(gt_dir): 23 | warnings.warn(f"{gt_dir} is not a valid directory") 24 | return 25 | gt_file_list = glob.glob(os.path.join(gt_dir, "*" + ext), recursive=recursive) 26 | print(f"Find {len(gt_file_list)}") 27 | _func = functools.partial(count_cls, ignore_index=ignore_index) 28 | results = track_parallel_progress(_func, gt_file_list, nproc=16) 29 | results = {r[0]: r[1] for r in results} 30 | with open(map_file_save_path, "w") as f: 31 | json.dump(results, f) 32 | 33 | 34 | def main_ctyscapes( 35 | gt_dir, map_file_save_path, ignore_index=[255], ext=".png", recursive=False 36 | ): 37 | if not os.path.isdir(gt_dir): 38 | warnings.warn(f"{gt_dir} is not a valid directory") 39 | return 40 | cities = os.listdir(gt_dir) 41 | gt_file_list = list( 42 | chain.from_iterable( 43 | [ 44 | glob.glob( 45 | os.path.join(gt_dir, city, "*" + ext), 46 | ) 47 | for city in cities 48 | ] 49 | ) 50 | ) 51 | print(gt_file_list[0]) 52 | print(f"Find {len(gt_file_list)}") 53 | _func = functools.partial(count_cls, ignore_index=ignore_index, depth=2) 54 | results = track_parallel_progress(_func, gt_file_list, nproc=16) 55 | results = {r[0]: r[1] for r in results} 56 | with open(map_file_save_path, "w") as f: 57 | json.dump(results, f) 58 | 59 | 60 | if __name__ == "__main__": 61 | fire.Fire(main) 62 | -------------------------------------------------------------------------------- /baselines/data/datasets/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | from detectron2.data.datasets.coco import load_sem_seg 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def load_obj_part_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg", data_list=None): 10 | data_dicts = load_sem_seg(gt_root, image_root, gt_ext, image_ext) 11 | if data_list is not None: 12 | img_list = json.load(open(data_list,'r')) 13 | img_list = [item["file_name"] for item in img_list] 14 | new_data_dicts = [] 15 | for i,data in enumerate(data_dicts): 16 | if data_list is not None: 17 | if data["file_name"] not in img_list: 18 | continue 19 | data_dicts[i]["obj_sem_seg_file_name"] = data["sem_seg_file_name"].replace('part','obj') 20 | new_data_dicts.append(data_dicts[i]) 21 | return new_data_dicts 22 | 23 | 24 | def load_binary_mask(gt_root, image_root, gt_ext="png", image_ext="jpg", label_count="_part_label_count.json", base_classes=None): 25 | """ 26 | Flatten the results of `load_sem_seg` to annotations for binary mask. 
27 | 28 | `label_count_file` contains a dictionary like: 29 | ``` 30 | { 31 | "xxx.png":[0,3,5], 32 | "xxxx.png":[3,4,7], 33 | } 34 | ``` 35 | """ 36 | label_count_file = gt_root + label_count 37 | with open(label_count_file) as f: 38 | label_count_dict = json.load(f) 39 | 40 | data_dicts = load_sem_seg(gt_root, image_root, gt_ext, image_ext) 41 | flattened_data_dicts = [] 42 | for data in data_dicts: 43 | data['obj_sem_seg_file_name'] = data["sem_seg_file_name"].replace('_part','_obj') 44 | category_per_image = label_count_dict[ 45 | os.path.basename(data["sem_seg_file_name"]) 46 | ] 47 | if base_classes is not None: 48 | category_per_image = [i for i in category_per_image if i in base_classes] 49 | flattened_data = [ 50 | dict(**{"category_id": cat}, **data) for cat in category_per_image 51 | ] 52 | flattened_data_dicts.extend(flattened_data) 53 | logger.info( 54 | "Loaded {} images with flattened semantic segmentation from {}".format( 55 | len(flattened_data_dicts), image_root 56 | ) 57 | ) 58 | return flattened_data_dicts 59 | -------------------------------------------------------------------------------- /baselines/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from fvcore.transforms.transform import Transform, TransformList # order them first 3 | from fvcore.transforms.transform import * 4 | from .transform import * 5 | from .augmentation import * 6 | from .augmentation_impl import * 7 | 8 | __all__ = [k for k in globals().keys() if not k.startswith("_")] 9 | 10 | 11 | from detectron2.utils.env import fixup_module_metadata 12 | 13 | fixup_module_metadata(__name__, globals(), __all__) 14 | del fixup_module_metadata 15 | -------------------------------------------------------------------------------- /baselines/data/transforms/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/transforms/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/transforms/__pycache__/augmentation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/transforms/__pycache__/augmentation.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/transforms/__pycache__/augmentation_impl.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/transforms/__pycache__/augmentation_impl.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/data/transforms/__pycache__/transform.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/data/transforms/__pycache__/transform.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.generalized_sem_seg_evaluation import GeneralizedSemSegEvaluator 2 | -------------------------------------------------------------------------------- /baselines/evaluation/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/evaluation/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/evaluation/__pycache__/classification_evaluation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/evaluation/__pycache__/classification_evaluation.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/evaluation/__pycache__/generalized_sem_seg_evaluation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/evaluation/__pycache__/generalized_sem_seg_evaluation.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/evaluation/__pycache__/pseudo_sem_seg_evaluation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/evaluation/__pycache__/pseudo_sem_seg_evaluation.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .heads.mask_former_head import MaskFormerHead 3 | from .heads.zero_shot_obj_part_mask_former_head import ZeroShotObjPartMaskFormerHead 4 | from .heads.cat_seg_head import CATSegHead -------------------------------------------------------------------------------- /baselines/modeling/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/__pycache__/criterion.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/__pycache__/criterion.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/__pycache__/matcher.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/__pycache__/matcher.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /baselines/modeling/backbone/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/backbone/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/backbone/__pycache__/clip_resnet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/backbone/__pycache__/clip_resnet.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/backbone/__pycache__/swin.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/backbone/__pycache__/swin.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/clip_adapter/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/clip_adapter/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/clip_adapter/__pycache__/adapter.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/clip_adapter/__pycache__/adapter.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/clip_adapter/__pycache__/text_prompt.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/clip_adapter/__pycache__/text_prompt.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/clip_adapter/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/clip_adapter/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/heads/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /baselines/modeling/heads/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/heads/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/heads/__pycache__/cat_seg_head.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/heads/__pycache__/cat_seg_head.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/heads/__pycache__/mask_former_head.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/heads/__pycache__/mask_former_head.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/heads/__pycache__/pixel_decoder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/heads/__pycache__/pixel_decoder.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/heads/__pycache__/zero_shot_mask_former_head.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/heads/__pycache__/zero_shot_mask_former_head.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/heads/__pycache__/zero_shot_obj_part_mask_former_head.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/heads/__pycache__/zero_shot_obj_part_mask_former_head.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/heads/cat_seg_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | from einops import rearrange 6 | 7 | import fvcore.nn.weight_init as weight_init 8 | from torch import nn 9 | from torch.nn import functional as F 10 | 11 | from detectron2.config import configurable 12 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 13 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 14 | 15 | from ..transformer.cat_seg_predictor import CATSegPredictor 16 | 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class CATSegHead(nn.Module): 20 | 21 | @configurable 22 | def __init__( 23 | self, 24 | input_shape: Dict[str, ShapeSpec], 25 | *, 26 | num_classes: int, 27 | ignore_value: int = -1, 28 | # extra parameters 29 | feature_resolution: list, 30 | transformer_predictor: nn.Module, 31 | ): 32 | """ 33 | NOTE: this interface is experimental. 34 | Args: 35 | input_shape: shapes (channels and stride) of the input features 36 | num_classes: number of classes to predict 37 | pixel_decoder: the pixel decoder module 38 | loss_weight: loss weight 39 | ignore_value: category id to be ignored during training. 40 | transformer_predictor: the transformer decoder that makes prediction 41 | transformer_in_feature: input feature name to the transformer_predictor 42 | """ 43 | super().__init__() 44 | input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) 45 | self.in_features = [k for k, v in input_shape] 46 | self.ignore_value = ignore_value 47 | self.predictor = transformer_predictor 48 | self.num_classes = num_classes 49 | self.feature_resolution = feature_resolution 50 | 51 | @classmethod 52 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 53 | return { 54 | "input_shape": { 55 | k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 56 | }, 57 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 58 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 59 | "feature_resolution": cfg.MODEL.SEM_SEG_HEAD.FEATURE_RESOLUTION, 60 | "transformer_predictor": CATSegPredictor( 61 | cfg, 62 | ), 63 | } 64 | 65 | def forward(self, features, guidance_features, test_text=None): 66 | """ 67 | Arguments: 68 | img_feats: (B, C, HW) 69 | affinity_features: (B, C, ) 70 | """ 71 | img_feat = rearrange(features[:, 1:, :], "b (h w) c->b c h w", h=self.feature_resolution[0], w=self.feature_resolution[1]) 72 | return self.predictor(img_feat, guidance_features, test_text) -------------------------------------------------------------------------------- /baselines/modeling/heads/zero_shot_obj_part_mask_former_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
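To make the tensor bookkeeping in `CATSegHead.forward` above concrete: `features` is a sequence of CLIP tokens of shape (B, 1 + H*W, C), the leading CLS token is dropped, and the remainder is unflattened into an image-like map for the predictor. A standalone check (the 24x24 resolution and C=768 are illustrative values, not taken from the configs):

    import torch
    from einops import rearrange

    feats = torch.randn(2, 1 + 24 * 24, 768)  # (B, 1 + H*W, C), CLS token first
    img_feat = rearrange(feats[:, 1:, :], "b (h w) c -> b c h w", h=24, w=24)  # drop CLS, unflatten
    assert img_feat.shape == (2, 768, 24, 24)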
2 | import logging 3 | from copy import deepcopy 4 | from typing import Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import fvcore.nn.weight_init as weight_init 7 | from torch import nn 8 | from torch.nn import functional as F 9 | 10 | from detectron2.config import configurable 11 | from detectron2.layers import Conv2d, ShapeSpec, get_norm 12 | from detectron2.modeling import SEM_SEG_HEADS_REGISTRY 13 | 14 | from ..transformer.zero_shot_obj_part_transformer_predictor import ZeroShotTransformerObjPartPredictor 15 | from .pixel_decoder import build_pixel_decoder 16 | from .zero_shot_mask_former_head import ZeroShotMaskFormerHead 17 | 18 | @SEM_SEG_HEADS_REGISTRY.register() 19 | class ZeroShotObjPartMaskFormerHead(ZeroShotMaskFormerHead): 20 | @configurable 21 | def __init__( 22 | self, 23 | input_shape: Dict[str, ShapeSpec], 24 | *, 25 | num_classes: int, 26 | pixel_decoder: nn.Module, 27 | loss_weight: float = 1.0, 28 | ignore_value: int = -1, 29 | # extra parameters 30 | transformer_predictor: nn.Module, 31 | transformer_in_feature: str, 32 | ): 33 | super().__init__( 34 | input_shape=input_shape, 35 | num_classes=num_classes, 36 | pixel_decoder=pixel_decoder, 37 | loss_weight=loss_weight, 38 | ignore_value=ignore_value, 39 | transformer_predictor=transformer_predictor, 40 | transformer_in_feature=transformer_in_feature 41 | ) 42 | @classmethod 43 | def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): 44 | return { 45 | "input_shape": { 46 | k: v 47 | for k, v in input_shape.items() 48 | if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES 49 | }, 50 | "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, 51 | "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, 52 | "pixel_decoder": build_pixel_decoder(cfg, input_shape), 53 | "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, 54 | "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, 55 | "transformer_predictor": ZeroShotTransformerObjPartPredictor( 56 | cfg, 57 | cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM 58 | if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder" 59 | else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels, 60 | mask_classification=True, 61 | ), 62 | } 63 | 64 | def forward(self, features, obj_masks): 65 | return self.layers(features, obj_masks) 66 | 67 | def layers(self, features, obj_masks): 68 | ( 69 | mask_features, 70 | transformer_encoder_features, 71 | ) = self.pixel_decoder.forward_features(features) 72 | if self.transformer_in_feature == "transformer_encoder": 73 | assert ( 74 | transformer_encoder_features is not None 75 | ), "Please use the TransformerEncoderPixelDecoder." 76 | predictions = self.predictor(transformer_encoder_features, mask_features, obj_masks) 77 | else: 78 | predictions = self.predictor( 79 | features[self.transformer_in_feature], mask_features, obj_masks 80 | ) 81 | return predictions -------------------------------------------------------------------------------- /baselines/modeling/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /baselines/modeling/transformer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/transformer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/transformer/__pycache__/cat_seg_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/transformer/__pycache__/cat_seg_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/transformer/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/transformer/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/transformer/__pycache__/position_encoding.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/transformer/__pycache__/position_encoding.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/transformer/__pycache__/transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/transformer/__pycache__/transformer.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/transformer/__pycache__/transformer_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/transformer/__pycache__/transformer_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/transformer/__pycache__/zero_shot_obj_part_transformer_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/transformer/__pycache__/zero_shot_obj_part_transformer_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/transformer/__pycache__/zero_shot_transformer_predictor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/modeling/transformer/__pycache__/zero_shot_transformer_predictor.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/modeling/transformer/position_encoding.py: -------------------------------------------------------------------------------- 1 
| # Copyright (c) Facebook, Inc. and its affiliates. 2 | # # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py 3 | """ 4 | Various positional encodings for the transformer. 5 | """ 6 | import math 7 | 8 | import torch 9 | from torch import nn 10 | 11 | 12 | class PositionEmbeddingSine(nn.Module): 13 | """ 14 | This is a more standard version of the position embedding, very similar to the one 15 | used by the Attention is all you need paper, generalized to work on images. 16 | """ 17 | 18 | def __init__( 19 | self, num_pos_feats=64, temperature=10000, normalize=False, scale=None 20 | ): 21 | super().__init__() 22 | self.num_pos_feats = num_pos_feats 23 | self.temperature = temperature 24 | self.normalize = normalize 25 | if scale is not None and normalize is False: 26 | raise ValueError("normalize should be True if scale is passed") 27 | if scale is None: 28 | scale = 2 * math.pi 29 | self.scale = scale 30 | 31 | def forward(self, x, mask=None): 32 | if mask is None: 33 | mask = torch.zeros( 34 | (x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool 35 | ) 36 | not_mask = ~mask 37 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 38 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 39 | if self.normalize: 40 | eps = 1e-6 41 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 42 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 43 | 44 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 45 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 46 | 47 | pos_x = x_embed[:, :, :, None] / dim_t 48 | pos_y = y_embed[:, :, :, None] / dim_t 49 | pos_x = torch.stack( 50 | (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 51 | ).flatten(3) 52 | pos_y = torch.stack( 53 | (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 54 | ).flatten(3) 55 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 56 | return pos 57 | -------------------------------------------------------------------------------- /baselines/modeling/transformer/zero_shot_transformer_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
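A quick shape check for the `PositionEmbeddingSine` module above (illustrative; only the batch size, spatial size and device of `x` are used, since the encoding is built from cumulative-sum coordinate grids rather than from the feature values):

    import torch
    from baselines.modeling.transformer.position_encoding import PositionEmbeddingSine

    pos_enc = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
    x = torch.randn(2, 256, 32, 32)
    pos = pos_enc(x)
    assert pos.shape == (2, 256, 32, 32)  # 2 * num_pos_feats channels: y encodings, then x encodings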
2 | # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py 3 | from torch import nn 4 | from detectron2.config import configurable 5 | from .transformer_predictor import TransformerPredictor, MLP 6 | 7 | 8 | class ZeroShotTransformerPredictor(TransformerPredictor): 9 | @configurable 10 | def __init__( 11 | self, 12 | in_channels, 13 | mask_classification=True, 14 | *, 15 | embedding_dim: int, 16 | embed_hidden_dim: int, 17 | embed_layers: int, 18 | hidden_dim: int, 19 | num_queries: int, 20 | nheads: int, 21 | dropout: float, 22 | dim_feedforward: int, 23 | enc_layers: int, 24 | dec_layers: int, 25 | pre_norm: bool, 26 | deep_supervision: bool, 27 | mask_dim: int, 28 | enforce_input_project: bool, 29 | ): 30 | super().__init__( 31 | in_channels, 32 | False, 33 | num_classes=embedding_dim, 34 | hidden_dim=hidden_dim, 35 | num_queries=num_queries, 36 | nheads=nheads, 37 | dropout=dropout, 38 | dim_feedforward=dim_feedforward, 39 | enc_layers=enc_layers, 40 | dec_layers=dec_layers, 41 | pre_norm=pre_norm, 42 | deep_supervision=deep_supervision, 43 | mask_dim=mask_dim, 44 | enforce_input_project=enforce_input_project, 45 | ) 46 | self.mask_classification = mask_classification 47 | # output FFNs 48 | if self.mask_classification: 49 | self.class_embed = MLP( 50 | hidden_dim, embed_hidden_dim, embedding_dim, embed_layers 51 | ) 52 | 53 | def freeze_pretrained(self): 54 | for name, module in self.named_children(): 55 | if name not in ["class_embed"]: 56 | for param in module.parameters(): 57 | param.requires_grad = False 58 | 59 | @classmethod 60 | def from_config(cls, cfg, in_channels, mask_classification): 61 | ret = {} 62 | ret["in_channels"] = in_channels 63 | ret["mask_classification"] = mask_classification 64 | 65 | ret["embedding_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM 66 | ret["embed_hidden_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM 67 | ret["embed_layers"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS 68 | ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM 69 | ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES 70 | # Transformer parameters: 71 | ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS 72 | ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT 73 | ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD 74 | ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS 75 | ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS 76 | ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM 77 | ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION 78 | ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ 79 | 80 | ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM 81 | 82 | return ret 83 | -------------------------------------------------------------------------------- /baselines/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/__init__.py -------------------------------------------------------------------------------- /baselines/third_party/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/third_party/__pycache__/clip.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/__pycache__/clip.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/third_party/__pycache__/clip.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/__pycache__/clip.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/third_party/__pycache__/imagenet_templates.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/__pycache__/imagenet_templates.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/third_party/__pycache__/model_vpt.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/__pycache__/model_vpt.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/third_party/__pycache__/model_vpt.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/__pycache__/model_vpt.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/third_party/__pycache__/simple_tokenizer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/__pycache__/simple_tokenizer.cpython-37.pyc -------------------------------------------------------------------------------- /baselines/third_party/__pycache__/simple_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/__pycache__/simple_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/third_party/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/third_party/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /baselines/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # from .events import setup_wandb, WandbWriter 3 | -------------------------------------------------------------------------------- /baselines/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/utils/__pycache__/events.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/utils/__pycache__/events.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/utils/__pycache__/misc.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/utils/__pycache__/misc.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/utils/__pycache__/post_process_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/baselines/utils/__pycache__/post_process_utils.cpython-38.pyc -------------------------------------------------------------------------------- /baselines/utils/post_process_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | import numpy as np 4 | 5 | try: 6 | import pydensecrf.densecrf as dcrf 7 | from pydensecrf.utils import ( 8 | unary_from_softmax, 9 | unary_from_labels, 10 | create_pairwise_bilateral, 11 | create_pairwise_gaussian, 12 | ) 13 | except ImportError: 14 | dcrf = None 15 | 16 | 17 | def dense_crf_post_process( 18 | logits, 19 | image, 20 | n_labels=None, 21 | max_iters=5, 22 | pos_xy_std=(3, 3), 23 | pos_w=3, 24 | bi_xy_std=(80, 80), 25 | bi_rgb_std=(13, 13, 13), 26 | bi_w=10, 27 | ): 28 | """ 29 | logits : [C,H,W] logits or probabilities, or an [H,W] integer label map (requires n_labels) 30 | image : [H,W,3] uint8 RGB image 31 | """ 32 | if dcrf is None: 33 | raise ImportError( 34 | "pydensecrf is required to perform dense crf inference." 35 | ) 36 | if isinstance(logits, torch.Tensor): 37 | logits = F.softmax(logits, dim=0).detach().cpu().numpy() 38 | U = unary_from_softmax(logits) 39 | n_labels = logits.shape[0] 40 | elif logits.ndim == 3: 41 | U = unary_from_softmax(logits) 42 | n_labels = logits.shape[0] 43 | else: 44 | assert n_labels is not None 45 | U = unary_from_labels(logits, n_labels, zero_unsure=False) 46 | 47 | d = dcrf.DenseCRF2D(image.shape[1], image.shape[0], n_labels) 48 | 49 | d.setUnaryEnergy(U) 50 | 51 | # This adds the color-independent term, features are the locations only. 52 | d.addPairwiseGaussian( 53 | sxy=pos_xy_std, 54 | compat=pos_w, 55 | kernel=dcrf.DIAG_KERNEL, 56 | normalization=dcrf.NORMALIZE_SYMMETRIC, 57 | ) 58 | 59 | # This adds the color-dependent term, i.e. features are (x,y,r,g,b). 60 | d.addPairwiseBilateral( 61 | sxy=bi_xy_std, 62 | srgb=bi_rgb_std, 63 | rgbim=image, 64 | compat=bi_w, 65 | kernel=dcrf.DIAG_KERNEL, 66 | normalization=dcrf.NORMALIZE_SYMMETRIC, 67 | ) 68 | # Run CRF inference for max_iters steps.
69 | logits = d.inference(max_iters) 70 | logits = np.asarray(logits).reshape((n_labels, image.shape[0], image.shape[1])) 71 | return torch.from_numpy(logits) 72 | -------------------------------------------------------------------------------- /configs/Base-VOC11K-20.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | BACKBONE: 3 | FREEZE_AT: 0 4 | NAME: "build_resnet_backbone" 5 | WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" 6 | PIXEL_MEAN: [123.675, 116.280, 103.530] 7 | PIXEL_STD: [58.395, 57.120, 57.375] 8 | RESNETS: 9 | DEPTH: 50 10 | STEM_TYPE: "basic" # not used 11 | STEM_OUT_CHANNELS: 64 12 | STRIDE_IN_1X1: False 13 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 14 | # NORM: "SyncBN" 15 | RES5_MULTI_GRID: [1, 1, 1] # not used 16 | DATASETS: 17 | TRAIN: ("voc_sem_seg_train",) 18 | TEST: ("voc_sem_seg_test",) 19 | SOLVER: 20 | IMS_PER_BATCH: 16 21 | BASE_LR: 0.0001 22 | MAX_ITER: 20000 23 | WARMUP_FACTOR: 1.0 24 | WARMUP_ITERS: 0 25 | WEIGHT_DECAY: 0.0001 26 | OPTIMIZER: "ADAMW" 27 | LR_SCHEDULER_NAME: "WarmupPolyLR" 28 | BACKBONE_MULTIPLIER: 0.1 29 | CLIP_GRADIENTS: 30 | ENABLED: True 31 | CLIP_TYPE: "full_model" 32 | CLIP_VALUE: 0.01 33 | NORM_TYPE: 2.0 34 | INPUT: 35 | MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 16)]"] 36 | MIN_SIZE_TRAIN_SAMPLING: "choice" 37 | MIN_SIZE_TEST: 512 38 | MAX_SIZE_TRAIN: 2048 39 | MAX_SIZE_TEST: 2048 40 | CROP: 41 | ENABLED: True 42 | TYPE: "absolute" 43 | SIZE: (512, 512) 44 | SINGLE_CATEGORY_MAX_AREA: 1.0 45 | COLOR_AUG_SSD: True 46 | SIZE_DIVISIBILITY: 512 # used in dataset mapper 47 | FORMAT: "RGB" 48 | DATASET_MAPPER_NAME: "mask_former_semantic" 49 | TEST: 50 | EVAL_PERIOD: 5000 51 | AUG: 52 | ENABLED: False 53 | MIN_SIZES: [256, 384, 512, 640, 768, 896] 54 | MAX_SIZE: 3584 55 | FLIP: True 56 | DATALOADER: 57 | FILTER_EMPTY_ANNOTATIONS: True 58 | NUM_WORKERS: 4 59 | VERSION: 2 60 | -------------------------------------------------------------------------------- /configs/base_catseg_config.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "CATSeg" 3 | BACKBONE: 4 | FREEZE_AT: 0 5 | NAME: "D2SwinTransformer" 6 | SWIN: 7 | EMBED_DIM: 192 8 | DEPTHS: [2, 2, 18, 2] 9 | NUM_HEADS: [4, 8, 16, 32] 10 | WINDOW_SIZE: 12 11 | APE: False 12 | DROP_PATH_RATE: 0.3 13 | PATCH_NORM: True 14 | PRETRAIN_IMG_SIZE: 384 15 | OUT_FEATURES: ["res2", "res3", "res4"] 16 | WEIGHTS: "swin_large_patch4_window12_384_22k.pkl" 17 | PIXEL_MEAN: [123.675, 116.280, 103.530] 18 | PIXEL_STD: [58.395, 57.120, 57.375] 19 | SEM_SEG_HEAD: 20 | NAME: "OpenVocabHead" 21 | IN_FEATURES: ["res2", "res3", "res4"] 22 | IGNORE_VALUE: 255 23 | NUM_CLASSES: 171 24 | CLIP_PRETRAINED: "ViT-L/14@336px" 25 | PROMPT_DEPTH: 0 26 | PROMPT_LENGTH: 0 27 | TEXT_GUIDANCE_DIM: 768 28 | TEXT_GUIDANCE_PROJ_DIM: 128 29 | APPEARANCE_GUIDANCE_DIM: 768 30 | APPEARANCE_GUIDANCE_PROJ_DIM: 128 31 | DECODER_DIMS: [64, 32] 32 | DECODER_GUIDANCE_DIMS: [256, 128] 33 | DECODER_GUIDANCE_PROJ_DIMS: [32, 16] 34 | NUM_LAYERS: 4 35 | NUM_HEADS: 4 36 | HIDDEN_DIMS: 128 37 | POOLING_SIZES: [6, 6] 38 | FEATURE_RESOLUTION: [24, 24] 39 | WINDOW_SIZES: 12 40 | ATTENTION_TYPE: "linear" 41 | CLIP_FINETUNE: "attention" 42 | PROMPT_ENSEMBLE_TYPE: "imagenet" 43 | DATASETS: 44 | TRAIN: ("coco_2017_train_stuff_all_sem_seg",) 45 | TEST: ("coco_2017_test_stuff_all_sem_seg",) 46 | SOLVER: 47 | IMS_PER_BATCH: 4 48 | BASE_LR: 0.0002 49 | MAX_ITER: 80000 50 | 
WARMUP_FACTOR: 1.0 51 | WARMUP_ITERS: 0 52 | WEIGHT_DECAY: 0.0001 53 | OPTIMIZER: "ADAMW" 54 | LR_SCHEDULER_NAME: "WarmupCosineLR" 55 | BACKBONE_MULTIPLIER: 0.01 56 | CLIP_MULTIPLIER: 0.01 57 | CLIP_GRADIENTS: 58 | ENABLED: True 59 | CLIP_TYPE: "full_model" 60 | CLIP_VALUE: 0.01 61 | NORM_TYPE: 2.0 62 | INPUT: 63 | MIN_SIZE_TRAIN: (384, ) 64 | MIN_SIZE_TRAIN_SAMPLING: "choice" 65 | MIN_SIZE_TEST: 640 66 | MAX_SIZE_TEST: 2560 67 | CROP: 68 | ENABLED: True 69 | TYPE: "absolute" 70 | SIZE: (384, 384) 71 | SINGLE_CATEGORY_MAX_AREA: 1.0 72 | COLOR_AUG_SSD: True 73 | SIZE_DIVISIBILITY: 384 74 | FORMAT: "RGB" 75 | DATASET_MAPPER_NAME: "mask_former_semantic" 76 | TEST: 77 | EVAL_PERIOD: 5000 78 | SLIDING_WINDOW: False 79 | DATALOADER: 80 | FILTER_EMPTY_ANNOTATIONS: True 81 | NUM_WORKERS: 8 82 | VERSION: 2 83 | CUDNN_BENCHMARK: True 84 | -------------------------------------------------------------------------------- /configs/cross_dataset/clipseg_ade.yaml: -------------------------------------------------------------------------------- 1 | ORACLE: True 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPSeg" 4 | INPUT: 5 | DATASET_MAPPER_NAME: "obj_part_semantic" 6 | DATASETS: 7 | TRAIN: ("ade_obj_part_sem_seg_train",) 8 | TEST: ("voc_obj_part_sem_seg_val_obj_condition",) 9 | DATALOADER: 10 | FILTER_EMPTY_ANNOTATIONS: True 11 | NUM_WORKERS: 8 12 | SOLVER: 13 | IMS_PER_BATCH: 8 14 | BASE_LR: 0.0001 15 | MAX_ITER: 20000 16 | WARMUP_FACTOR: 1.0 17 | WARMUP_ITERS: 0 18 | WEIGHT_DECAY: 0.0001 19 | OPTIMIZER: "ADAMW" 20 | LR_SCHEDULER_NAME: "WarmupPolyLR" 21 | CLIP_GRADIENTS: 22 | ENABLED: True 23 | CLIP_TYPE: "full_model" 24 | CLIP_VALUE: 0.01 25 | NORM_TYPE: 2.0 26 | TEST: 27 | EVAL_PERIOD: 5000 28 | -------------------------------------------------------------------------------- /configs/few_shot/catseg_ade.yaml: -------------------------------------------------------------------------------- 1 | ORACLE: True 2 | _BASE_: base_catseg_config.yaml 3 | MODEL: 4 | META_ARCHITECTURE: "CATSeg" 5 | BACKBONE: 6 | FREEZE_AT: 0 7 | NAME: "build_resnet_backbone" 8 | WEIGHTS: "pretrain_weights/model_final_base.pth" 9 | RESNETS: 10 | DEPTH: 101 11 | STEM_TYPE: "basic" 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4"] 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SEM_SEG_HEAD: 18 | NAME: "CATSegHead" 19 | IN_FEATURES: ["res2", "res3", "res4"] 20 | IGNORE_VALUE: 255 21 | NUM_CLASSES: 74 22 | BG_ON: True 23 | CLIP_PRETRAINED: "ViT-B/16" 24 | PROMPT_DEPTH: 0 25 | PROMPT_LENGTH: 0 26 | TEXT_GUIDANCE_DIM: 512 27 | TEXT_GUIDANCE_PROJ_DIM: 128 28 | APPEARANCE_GUIDANCE_DIM: 1024 29 | APPEARANCE_GUIDANCE_PROJ_DIM: 128 30 | DECODER_DIMS: [64, 32] 31 | DECODER_GUIDANCE_DIMS: [512, 256] 32 | DECODER_GUIDANCE_PROJ_DIMS: [32, 16] 33 | NUM_LAYERS: 2 34 | NUM_HEADS: 4 35 | HIDDEN_DIMS: 128 36 | POOLING_SIZES: [2, 2] 37 | FEATURE_RESOLUTION: [24, 24] 38 | WINDOW_SIZES: 12 39 | ATTENTION_TYPE: "linear" 40 | CLIP_FINETUNE: "" 41 | PROMPT_ENSEMBLE_TYPE: "imagenet" 42 | INPUT: 43 | DATASET_MAPPER_NAME: "obj_part_semantic" 44 | MAX_SIZE_TRAIN: 768 45 | MAX_SIZE_TEST: 768 46 | DATASETS: 47 | TRAIN: ("ade_obj_part_sem_seg_train_few_shot",) 48 | TEST: ("ade_obj_part_sem_seg_val_obj_few_shot",) 49 | DATALOADER: 50 | FILTER_EMPTY_ANNOTATIONS: True 51 | NUM_WORKERS: 4 52 | SOLVER: 53 | BACKBONE_MULTIPLIER: 0.01 54 | BASE_LR: 0.0002 55 | IMS_PER_BATCH: 8 56 | MAX_ITER: 80000 57 | TEST: 58 | EVAL_PERIOD: 5000 59 | 
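A minimal usage sketch for the `dense_crf_post_process` helper in `baselines/utils/post_process_utils.py` above, assuming `pydensecrf` is installed and the snippet is run from the repository root; the shapes and inputs are illustrative only (note the helper indexes the image as H×W×3 uint8):

```python
import numpy as np
import torch

from baselines.utils.post_process_utils import dense_crf_post_process

# Illustrative inputs: 16 classes on a 512x512 image.
logits = torch.randn(16, 512, 512)   # [C, H, W] raw scores; softmaxed inside the helper
image = np.ascontiguousarray(        # [H, W, 3] uint8 RGB, the layout the helper expects
    np.random.randint(0, 256, size=(512, 512, 3), dtype=np.uint8)
)

refined = dense_crf_post_process(logits, image, max_iters=5)  # -> [C, H, W] torch.Tensor
pred = refined.argmax(dim=0)         # [H, W] per-pixel class indices after CRF refinement
```

The pairwise std/weight defaults are the ones in the function signature; pass `n_labels` explicitly only when feeding an [H, W] label map instead of logits.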
-------------------------------------------------------------------------------- /configs/few_shot/catseg_voc.yaml: -------------------------------------------------------------------------------- 1 | ORACLE: True 2 | _BASE_: base_catseg_config.yaml 3 | MODEL: 4 | META_ARCHITECTURE: "CATSeg" 5 | BACKBONE: 6 | FREEZE_AT: 0 7 | NAME: "build_resnet_backbone" 8 | WEIGHTS: "pretrain_weights/model_final_base.pth" 9 | RESNETS: 10 | DEPTH: 101 11 | STEM_TYPE: "basic" 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4"] 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SEM_SEG_HEAD: 18 | NAME: "CATSegHead" 19 | IN_FEATURES: ["res2", "res3", "res4"] 20 | IGNORE_VALUE: 255 21 | NUM_CLASSES: 74 22 | BG_ON: True 23 | CLIP_PRETRAINED: "ViT-B/16" 24 | PROMPT_DEPTH: 0 25 | PROMPT_LENGTH: 0 26 | TEXT_GUIDANCE_DIM: 512 27 | TEXT_GUIDANCE_PROJ_DIM: 128 28 | APPEARANCE_GUIDANCE_DIM: 1024 29 | APPEARANCE_GUIDANCE_PROJ_DIM: 128 30 | DECODER_DIMS: [64, 32] 31 | DECODER_GUIDANCE_DIMS: [512, 256] 32 | DECODER_GUIDANCE_PROJ_DIMS: [32, 16] 33 | NUM_LAYERS: 2 34 | NUM_HEADS: 4 35 | HIDDEN_DIMS: 128 36 | POOLING_SIZES: [2, 2] 37 | FEATURE_RESOLUTION: [24, 24] 38 | WINDOW_SIZES: 12 39 | ATTENTION_TYPE: "linear" 40 | CLIP_FINETUNE: "" 41 | PROMPT_ENSEMBLE_TYPE: "imagenet" 42 | INPUT: 43 | DATASET_MAPPER_NAME: "obj_part_semantic" 44 | MAX_SIZE_TRAIN: 768 45 | MAX_SIZE_TEST: 768 46 | DATASETS: 47 | TRAIN: ("voc_obj_part_sem_seg_train_few_shot",) 48 | TEST: ("voc_obj_part_sem_seg_val_obj_condition",) 49 | DATALOADER: 50 | FILTER_EMPTY_ANNOTATIONS: True 51 | NUM_WORKERS: 4 52 | SOLVER: 53 | BACKBONE_MULTIPLIER: 0.01 54 | BASE_LR: 0.0002 55 | IMS_PER_BATCH: 8 56 | MAX_ITER: 80000 57 | TEST: 58 | EVAL_PERIOD: 5000 59 | -------------------------------------------------------------------------------- /configs/few_shot/clipseg_ade.yaml: -------------------------------------------------------------------------------- 1 | ORACLE: True 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPSeg" 4 | INPUT: 5 | DATASET_MAPPER_NAME: "obj_part_semantic" 6 | DATASETS: 7 | TRAIN: ("ade_obj_part_sem_seg_train_few_shot",) 8 | TEST: ("ade_obj_part_sem_seg_val_few_shot",) 9 | DATALOADER: 10 | FILTER_EMPTY_ANNOTATIONS: True 11 | NUM_WORKERS: 8 12 | SOLVER: 13 | IMS_PER_BATCH: 8 14 | BASE_LR: 0.0001 15 | MAX_ITER: 20000 16 | WARMUP_FACTOR: 1.0 17 | WARMUP_ITERS: 0 18 | WEIGHT_DECAY: 0.0001 19 | OPTIMIZER: "ADAMW" 20 | LR_SCHEDULER_NAME: "WarmupPolyLR" 21 | CLIP_GRADIENTS: 22 | ENABLED: True 23 | CLIP_TYPE: "full_model" 24 | CLIP_VALUE: 0.01 25 | NORM_TYPE: 2.0 26 | TEST: 27 | EVAL_PERIOD: 5000 28 | -------------------------------------------------------------------------------- /configs/few_shot/clipseg_voc.yaml: -------------------------------------------------------------------------------- 1 | ORACLE: True 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPSeg" 4 | INPUT: 5 | DATASET_MAPPER_NAME: "obj_part_semantic" 6 | DATASETS: 7 | TRAIN: ("voc_obj_part_sem_seg_train_few_shot",) 8 | TEST: ("voc_obj_part_sem_seg_val_obj_condition",) 9 | DATALOADER: 10 | FILTER_EMPTY_ANNOTATIONS: True 11 | NUM_WORKERS: 8 12 | SOLVER: 13 | IMS_PER_BATCH: 8 14 | BASE_LR: 0.0001 15 | MAX_ITER: 20000 16 | WARMUP_FACTOR: 1.0 17 | WARMUP_ITERS: 0 18 | WEIGHT_DECAY: 0.0001 19 | OPTIMIZER: "ADAMW" 20 | LR_SCHEDULER_NAME: "WarmupPolyLR" 21 | CLIP_GRADIENTS: 22 | ENABLED: True 23 | CLIP_TYPE: "full_model" 24 | CLIP_VALUE: 0.01 25 | NORM_TYPE: 2.0 26 | TEST: 27 | EVAL_PERIOD: 5000 28 | 
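The YAML files under `configs/` follow detectron2's config conventions (`_BASE_` inheritance, dotted keys merged into a `CfgNode`). A rough sketch of loading one for inspection, assuming detectron2 is installed; in the training scripts the project-specific keys (`ORACLE`, `MODEL.SEM_SEG_HEAD.BG_ON`, `MODEL.CLIP_ADAPTER.*`, ...) are presumably registered by the repo's own `baselines/config.py`, so allowing new keys here is only a shortcut for quick inspection:

```python
from detectron2.config import get_cfg

cfg = get_cfg()
# Shortcut: accept keys the stock detectron2 config does not define.
# (set_new_allowed applies recursively in recent yacs; otherwise register the
# extra keys properly via the repo's config helpers before merging.)
cfg.set_new_allowed(True)
# merge_from_file resolves the relative _BASE_ entry (../base_catseg_config.yaml)
# before merging the leaf config on top of it.
cfg.merge_from_file("configs/zero_shot/catseg_voc.yaml")

print(cfg.MODEL.META_ARCHITECTURE)                   # CATSeg
print(cfg.DATASETS.TRAIN, cfg.DATASETS.TEST)
print(cfg.SOLVER.BASE_LR, cfg.SOLVER.MAX_ITER, cfg.SOLVER.IMS_PER_BATCH)
```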
-------------------------------------------------------------------------------- /configs/maskformer_R50_bs16_20k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: Base-VOC11K-20.yaml 2 | MODEL: 3 | META_ARCHITECTURE: "MaskFormer" 4 | SEM_SEG_HEAD: 5 | NAME: "MaskFormerHead" 6 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 7 | IGNORE_VALUE: 255 8 | NUM_CLASSES: 20 9 | COMMON_STRIDE: 4 # not used, hard-coded 10 | LOSS_WEIGHT: 1.0 11 | CONVS_DIM: 256 12 | MASK_DIM: 256 13 | NORM: "GN" 14 | MASK_FORMER: 15 | TRANSFORMER_IN_FEATURE: "res5" 16 | DEEP_SUPERVISION: True 17 | NO_OBJECT_WEIGHT: 0.1 18 | DICE_WEIGHT: 1.0 19 | MASK_WEIGHT: 20.0 20 | HIDDEN_DIM: 256 21 | NUM_OBJECT_QUERIES: 100 22 | NHEADS: 8 23 | DROPOUT: 0.1 24 | DIM_FEEDFORWARD: 2048 25 | ENC_LAYERS: 0 26 | DEC_LAYERS: 6 27 | PRE_NORM: False -------------------------------------------------------------------------------- /configs/zero_shot/catseg_ade.yaml: -------------------------------------------------------------------------------- 1 | ORACLE: True 2 | _BASE_: ../base_catseg_config.yaml 3 | MODEL: 4 | META_ARCHITECTURE: "CATSeg" 5 | BACKBONE: 6 | FREEZE_AT: 0 7 | NAME: "build_resnet_backbone" 8 | WEIGHTS: "pretrain_weights/model_final_base.pth" 9 | RESNETS: 10 | DEPTH: 101 11 | STEM_TYPE: "basic" 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4"] 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SEM_SEG_HEAD: 18 | NAME: "CATSegHead" 19 | IN_FEATURES: ["res2", "res3", "res4"] 20 | IGNORE_VALUE: 255 21 | NUM_CLASSES: 74 22 | BG_ON: True 23 | CLIP_PRETRAINED: "ViT-B/16" 24 | PROMPT_DEPTH: 0 25 | PROMPT_LENGTH: 0 26 | TEXT_GUIDANCE_DIM: 512 27 | TEXT_GUIDANCE_PROJ_DIM: 128 28 | APPEARANCE_GUIDANCE_DIM: 1024 29 | APPEARANCE_GUIDANCE_PROJ_DIM: 128 30 | DECODER_DIMS: [64, 32] 31 | DECODER_GUIDANCE_DIMS: [512, 256] 32 | DECODER_GUIDANCE_PROJ_DIMS: [32, 16] 33 | NUM_LAYERS: 2 34 | NUM_HEADS: 4 35 | HIDDEN_DIMS: 128 36 | POOLING_SIZES: [2, 2] 37 | FEATURE_RESOLUTION: [24, 24] 38 | WINDOW_SIZES: 12 39 | ATTENTION_TYPE: "linear" 40 | CLIP_FINETUNE: "" 41 | PROMPT_ENSEMBLE_TYPE: "imagenet" 42 | INPUT: 43 | DATASET_MAPPER_NAME: "obj_part_semantic" 44 | MAX_SIZE_TRAIN: 768 45 | MAX_SIZE_TEST: 768 46 | DATASETS: 47 | TRAIN: ("ade_obj_part_sem_seg_train",) 48 | TEST: ("ade_obj_part_sem_seg_val_obj_condition",) 49 | DATALOADER: 50 | FILTER_EMPTY_ANNOTATIONS: True 51 | NUM_WORKERS: 4 52 | SOLVER: 53 | BACKBONE_MULTIPLIER: 0.01 54 | BASE_LR: 0.0002 55 | IMS_PER_BATCH: 8 56 | MAX_ITER: 80000 57 | TEST: 58 | EVAL_PERIOD: 5000 59 | SLIDING_WINDOW: False 60 | -------------------------------------------------------------------------------- /configs/zero_shot/catseg_voc.yaml: -------------------------------------------------------------------------------- 1 | ORACLE: True 2 | _BASE_: ../base_catseg_config.yaml 3 | MODEL: 4 | META_ARCHITECTURE: "CATSeg" 5 | BACKBONE: 6 | FREEZE_AT: 0 7 | NAME: "build_resnet_backbone" 8 | WEIGHTS: "pretrain_weights/model_final_base.pth" 9 | RESNETS: 10 | DEPTH: 101 11 | STEM_TYPE: "basic" 12 | STEM_OUT_CHANNELS: 64 13 | STRIDE_IN_1X1: False 14 | OUT_FEATURES: ["res2", "res3", "res4"] 15 | PIXEL_MEAN: [123.675, 116.280, 103.530] 16 | PIXEL_STD: [58.395, 57.120, 57.375] 17 | SEM_SEG_HEAD: 18 | NAME: "CATSegHead" 19 | IN_FEATURES: ["res2", "res3", "res4"] 20 | IGNORE_VALUE: 255 21 | NUM_CLASSES: 74 22 | BG_ON: True 23 | CLIP_PRETRAINED: "ViT-B/16" 24 | PROMPT_DEPTH: 0 25 | PROMPT_LENGTH: 0 
26 | TEXT_GUIDANCE_DIM: 512 27 | TEXT_GUIDANCE_PROJ_DIM: 128 28 | APPEARANCE_GUIDANCE_DIM: 1024 29 | APPEARANCE_GUIDANCE_PROJ_DIM: 128 30 | DECODER_DIMS: [64, 32] 31 | DECODER_GUIDANCE_DIMS: [512, 256] 32 | DECODER_GUIDANCE_PROJ_DIMS: [32, 16] 33 | NUM_LAYERS: 2 34 | NUM_HEADS: 4 35 | HIDDEN_DIMS: 128 36 | POOLING_SIZES: [2, 2] 37 | FEATURE_RESOLUTION: [24, 24] 38 | WINDOW_SIZES: 12 39 | ATTENTION_TYPE: "linear" 40 | CLIP_FINETUNE: "" 41 | PROMPT_ENSEMBLE_TYPE: "imagenet" 42 | INPUT: 43 | DATASET_MAPPER_NAME: "obj_part_semantic" 44 | MAX_SIZE_TRAIN: 768 45 | MAX_SIZE_TEST: 768 46 | DATASETS: 47 | TRAIN: ("voc_obj_part_sem_seg_train",) 48 | TEST: ("voc_obj_part_sem_seg_val_obj_condition",) 49 | DATALOADER: 50 | FILTER_EMPTY_ANNOTATIONS: True 51 | NUM_WORKERS: 4 52 | SOLVER: 53 | BACKBONE_MULTIPLIER: 0.01 54 | BASE_LR: 0.0002 55 | IMS_PER_BATCH: 8 56 | MAX_ITER: 80000 57 | TEST: 58 | EVAL_PERIOD: 5000 59 | SLIDING_WINDOW: False 60 | -------------------------------------------------------------------------------- /configs/zero_shot/clipseg_ade.yaml: -------------------------------------------------------------------------------- 1 | ORACLE: True 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPSeg" 4 | INPUT: 5 | DATASET_MAPPER_NAME: "obj_part_semantic" 6 | DATASETS: 7 | TRAIN: ("ade_obj_part_sem_seg_train",) 8 | TEST: ("ade_obj_part_sem_seg_val_obj_condition",) 9 | DATALOADER: 10 | FILTER_EMPTY_ANNOTATIONS: True 11 | NUM_WORKERS: 8 12 | SOLVER: 13 | IMS_PER_BATCH: 8 14 | BASE_LR: 0.0001 15 | MAX_ITER: 20000 16 | WARMUP_FACTOR: 1.0 17 | WARMUP_ITERS: 0 18 | WEIGHT_DECAY: 0.0001 19 | OPTIMIZER: "ADAMW" 20 | LR_SCHEDULER_NAME: "WarmupPolyLR" 21 | CLIP_GRADIENTS: 22 | ENABLED: True 23 | CLIP_TYPE: "full_model" 24 | CLIP_VALUE: 0.01 25 | NORM_TYPE: 2.0 26 | TEST: 27 | EVAL_PERIOD: 5000 28 | -------------------------------------------------------------------------------- /configs/zero_shot/clipseg_voc.yaml: -------------------------------------------------------------------------------- 1 | ORACLE: True 2 | MODEL: 3 | META_ARCHITECTURE: "CLIPSeg" 4 | INPUT: 5 | DATASET_MAPPER_NAME: "obj_part_semantic" 6 | DATASETS: 7 | TRAIN: ("voc_obj_part_sem_seg_train",) 8 | TEST: ("voc_obj_part_sem_seg_val_obj_condition",) 9 | DATALOADER: 10 | FILTER_EMPTY_ANNOTATIONS: True 11 | NUM_WORKERS: 8 12 | SOLVER: 13 | IMS_PER_BATCH: 8 14 | BASE_LR: 0.0001 15 | MAX_ITER: 20000 16 | WARMUP_FACTOR: 1.0 17 | WARMUP_ITERS: 0 18 | WEIGHT_DECAY: 0.0001 19 | OPTIMIZER: "ADAMW" 20 | LR_SCHEDULER_NAME: "WarmupPolyLR" 21 | CLIP_GRADIENTS: 22 | ENABLED: True 23 | CLIP_TYPE: "full_model" 24 | CLIP_VALUE: 0.01 25 | NORM_TYPE: 2.0 26 | TEST: 27 | EVAL_PERIOD: 5000 28 | -------------------------------------------------------------------------------- /configs/zero_shot/zsseg+_R50_coop_ade.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_20k.yaml 2 | ORACLE: True 3 | MODEL: 4 | META_ARCHITECTURE: "ZeroShotObjPartMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "ZeroShotObjPartMaskFormerHead" 7 | NUM_CLASSES: 74 #only used in set criterion 8 | EMBEDDING_DIM: 512 9 | EMBED_LAYERS: 2 10 | CLIP_ADAPTER: 11 | PROMPT_LEARNER: "learnable_obj_part" 12 | # # for learnable prompt 13 | PROMPT_DIM: 512 14 | PROMPT_SHAPE: (4, 4 ,0) 15 | CLIP_MODEL_NAME: "ViT-B/16" 16 | MASK_FILL: "mean" 17 | MASK_EXPAND_RATIO: 1.2 18 | MASK_THR: 0.5 19 | MASK_MATTING: False 20 | REGION_RESIZED: True 21 | CLIP_ENSEMBLE: True 22 | CLIP_ENSEMBLE_WEIGHT: 0.5 23 | PROMPT_CHECKPOINT: 
'clip_weights/ade_cpt_coop_model.pth' 24 | MASK_FORMER: 25 | NUM_OBJECT_QUERIES: 50 26 | CLASS_WEIGHT: 1.0 27 | INPUT: 28 | DATASET_MAPPER_NAME: "obj_part_semantic" 29 | MAX_SIZE_TRAIN: 768 30 | MAX_SIZE_TEST: 768 31 | SOLVER: 32 | IMS_PER_BATCH: 8 33 | TEST: 34 | EVAL_PERIOD: 5000 35 | DATASETS: 36 | TRAIN: ("ade_obj_part_sem_seg_train_obj_condition",) 37 | TEST: ("ade_obj_part_sem_seg_val_obj_condition",) 38 | DATALOADER: 39 | FILTER_EMPTY_ANNOTATIONS: True 40 | NUM_WORKERS: 4 41 | -------------------------------------------------------------------------------- /configs/zero_shot/zsseg+_R50_coop_voc.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: ../maskformer_R50_bs16_20k.yaml 2 | ORACLE: True 3 | MODEL: 4 | META_ARCHITECTURE: "ZeroShotObjPartMaskFormer" 5 | SEM_SEG_HEAD: 6 | NAME: "ZeroShotObjPartMaskFormerHead" 7 | NUM_CLASSES: 74 #only used in set criterion 8 | EMBEDDING_DIM: 512 9 | EMBED_LAYERS: 2 10 | CLIP_ADAPTER: 11 | PROMPT_LEARNER: "learnable_obj_part" 12 | # # for learnable prompt 13 | PROMPT_DIM: 512 14 | PROMPT_SHAPE: (4, 4 ,0) 15 | CLIP_MODEL_NAME: "ViT-B/16" 16 | MASK_FILL: "mean" 17 | MASK_EXPAND_RATIO: 1.2 18 | MASK_THR: 0.5 19 | MASK_MATTING: False 20 | REGION_RESIZED: True 21 | CLIP_ENSEMBLE: True 22 | CLIP_ENSEMBLE_WEIGHT: 0.5 23 | PROMPT_CHECKPOINT: 'clip_weights/voc_cpt_coop_model.pth' 24 | MASK_FORMER: 25 | NUM_OBJECT_QUERIES: 50 26 | CLASS_WEIGHT: 1.0 27 | INPUT: 28 | DATASET_MAPPER_NAME: "obj_part_semantic" 29 | MAX_SIZE_TRAIN: 768 30 | MAX_SIZE_TEST: 768 31 | SOLVER: 32 | IMS_PER_BATCH: 8 33 | TEST: 34 | EVAL_PERIOD: 5000 35 | DATASETS: 36 | TRAIN: ("voc_obj_part_sem_seg_train_obj_condition",) 37 | TEST: ("voc_obj_part_sem_seg_val_obj_condition",) 38 | DATALOADER: 39 | FILTER_EMPTY_ANNOTATIONS: True 40 | NUM_WORKERS: 4 41 | -------------------------------------------------------------------------------- /open_clip/CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.1.0 2 | message: If you use this software, please cite it as below. 3 | authors: 4 | - family-names: Ilharco 5 | given-names: Gabriel 6 | - family-names: Wortsman 7 | given-names: Mitchell 8 | - family-names: Wightman 9 | given-names: Ross 10 | - family-names: Gordon 11 | given-names: Cade 12 | - family-names: Carlini 13 | given-names: Nicholas 14 | - family-names: Taori 15 | given-names: Rohan 16 | - family-names: Dave 17 | given-names: Achal 18 | - family-names: Shankar 19 | given-names: Vaishaal 20 | - family-names: Namkoong 21 | given-names: Hongseok 22 | - family-names: Miller 23 | given-names: John 24 | - family-names: Hajishirzi 25 | given-names: Hannaneh 26 | - family-names: Farhadi 27 | given-names: Ali 28 | - family-names: Schmidt 29 | given-names: Ludwig 30 | title: OpenCLIP 31 | version: v0.1 32 | doi: 10.5281/zenodo.5143773 33 | date-released: 2021-07-28 34 | -------------------------------------------------------------------------------- /open_clip/HISTORY.md: -------------------------------------------------------------------------------- 1 | ## 2.10.1 2 | 3 | * `hf-hub:org/model_id` support for loading models w/ config and weights in Hugging Face Hub 4 | 5 | ## 2.10.0 6 | 7 | * Added a ViT-bigG-14 model. 8 | * Added an up-to-date example slurm script for large training jobs. 9 | * Added a option to sync logs and checkpoints to S3 during training. 
10 | * New options for LR schedulers, constant and constant with cooldown 11 | * Fix wandb autoresuming when resume is not set 12 | * ConvNeXt `base` & `base_w` pretrained models added 13 | * `timm-` model prefix removed from configs 14 | * `timm` augmentation + regularization (dropout / drop-path) supported 15 | 16 | ## 2.9.3 17 | 18 | * Fix wandb collapsing multiple parallel runs into a single one 19 | 20 | ## 2.9.2 21 | 22 | * Fix braceexpand memory explosion for complex webdataset urls 23 | 24 | ## 2.9.1 25 | 26 | * Fix release 27 | 28 | ## 2.9.0 29 | 30 | * Add training feature to auto-resume from the latest checkpoint on restart via `--resume latest` 31 | * Allow webp in webdataset 32 | * Fix logging for number of samples when using gradient accumulation 33 | * Add model configs for convnext xxlarge 34 | 35 | ## 2.8.2 36 | 37 | * wrapped patchdropout in a torch.nn.Module 38 | 39 | ## 2.8.1 40 | 41 | * relax protobuf dependency 42 | * override the default patch dropout value in 'vision_cfg' 43 | 44 | ## 2.8.0 45 | 46 | * better support for HF models 47 | * add support for gradient accumulation 48 | * CI fixes 49 | * add support for patch dropout 50 | * add convnext configs 51 | 52 | 53 | ## 2.7.0 54 | 55 | * add multilingual H/14 xlm roberta large 56 | 57 | ## 2.6.1 58 | 59 | * fix setup.py _read_reqs 60 | 61 | ## 2.6.0 62 | 63 | * Make openclip training usable from pypi. 64 | * Add xlm roberta large vit h 14 config. 65 | 66 | ## 2.5.0 67 | 68 | * pretrained B/32 xlm roberta base: first multilingual clip trained on laion5B 69 | * pretrained B/32 roberta base: first clip trained using an HF text encoder 70 | 71 | ## 2.4.1 72 | 73 | * Add missing hf_tokenizer_name in CLIPTextCfg. 74 | 75 | ## 2.4.0 76 | 77 | * Fix #211, missing RN50x64 config. Fix type of dropout param for ResNet models 78 | * Bring back LayerNorm impl that casts to input for non bf16/fp16 79 | * zero_shot.py: set correct tokenizer based on args 80 | * training/params.py: remove hf params and get them from model config 81 | 82 | ## 2.3.1 83 | 84 | * Implement grad checkpointing for hf model. 
85 | * custom_text: True if hf_model_name is set 86 | * Disable hf tokenizer parallelism 87 | 88 | ## 2.3.0 89 | 90 | * Generalizable Text Transformer with HuggingFace Models (@iejMac) 91 | 92 | ## 2.2.0 93 | 94 | * Support for custom text tower 95 | * Add checksum verification for pretrained model weights 96 | 97 | ## 2.1.0 98 | 99 | * lot including sota models, bfloat16 option, better loading, better metrics 100 | 101 | ## 1.2.0 102 | 103 | * ViT-B/32 trained on Laion2B-en 104 | * add missing openai RN50x64 model 105 | 106 | ## 1.1.1 107 | 108 | * ViT-B/16+ 109 | * Add grad checkpointing support 110 | * more robust data loader 111 | -------------------------------------------------------------------------------- /open_clip/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman, 2 | Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar, 3 | John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi, 4 | Ludwig Schmidt 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /open_clip/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/open_clip/bpe_simple_vocab_16e6.txt.gz 2 | include src/open_clip/model_configs/*.json 3 | 4 | -------------------------------------------------------------------------------- /open_clip/Makefile: -------------------------------------------------------------------------------- 1 | install: ## [Local development] Upgrade pip, install requirements, install package. 2 | python -m pip install -U pip 3 | python -m pip install -e . 
4 | 5 | install-training: 6 | python -m pip install -r requirements-training.txt 7 | 8 | install-test: ## [Local development] Install test requirements 9 | python -m pip install -r requirements-test.txt 10 | 11 | test: ## [Local development] Run unit tests 12 | python -m pytest -x -s -v tests 13 | -------------------------------------------------------------------------------- /open_clip/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | regression_test 4 | -------------------------------------------------------------------------------- /open_clip/requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest-split==0.8.0 2 | pytest==7.2.0 3 | transformers 4 | timm==0.6.11 5 | -------------------------------------------------------------------------------- /open_clip/requirements-training.txt: -------------------------------------------------------------------------------- 1 | torch>=1.9.0 2 | torchvision 3 | webdataset>=0.2.5 4 | regex 5 | ftfy 6 | tqdm 7 | pandas 8 | braceexpand 9 | huggingface_hub 10 | transformers 11 | timm 12 | fsspec 13 | -------------------------------------------------------------------------------- /open_clip/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.9.0 2 | torchvision 3 | regex 4 | ftfy 5 | tqdm 6 | huggingface_hub 7 | sentencepiece 8 | protobuf==3.20.* 9 | timm 10 | -------------------------------------------------------------------------------- /open_clip/setup.py: -------------------------------------------------------------------------------- 1 | """ Setup 2 | """ 3 | from setuptools import setup, find_packages 4 | from codecs import open 5 | from os import path 6 | 7 | here = path.abspath(path.dirname(__file__)) 8 | 9 | # Get the long description from the README file 10 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 11 | long_description = f.read() 12 | 13 | def _read_reqs(relpath): 14 | fullpath = path.join(path.dirname(__file__), relpath) 15 | with open(fullpath) as f: 16 | return [s.strip() for s in f.readlines() if (s.strip() and not s.startswith("#"))] 17 | 18 | REQUIREMENTS = _read_reqs("requirements.txt") 19 | TRAINING_REQUIREMENTS = _read_reqs("requirements-training.txt") 20 | 21 | exec(open('src/open_clip/version.py').read()) 22 | setup( 23 | name='open_clip_torch', 24 | version=__version__, 25 | description='OpenCLIP', 26 | long_description=long_description, 27 | long_description_content_type='text/markdown', 28 | url='https://github.com/mlfoundations/open_clip', 29 | author='', 30 | author_email='', 31 | classifiers=[ 32 | # How mature is this project? 
Common values are 33 | # 3 - Alpha 34 | # 4 - Beta 35 | # 5 - Production/Stable 36 | 'Development Status :: 3 - Alpha', 37 | 'Intended Audience :: Education', 38 | 'Intended Audience :: Science/Research', 39 | 'License :: OSI Approved :: Apache Software License', 40 | 'Programming Language :: Python :: 3.7', 41 | 'Programming Language :: Python :: 3.8', 42 | 'Programming Language :: Python :: 3.9', 43 | 'Programming Language :: Python :: 3.10', 44 | 'Topic :: Scientific/Engineering', 45 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 46 | 'Topic :: Software Development', 47 | 'Topic :: Software Development :: Libraries', 48 | 'Topic :: Software Development :: Libraries :: Python Modules', 49 | ], 50 | 51 | # Note that this is a string of words separated by whitespace, not a list. 52 | keywords='CLIP pretrained', 53 | package_dir={'': 'src'}, 54 | packages=find_packages(where='src'), 55 | include_package_data=True, 56 | install_requires=REQUIREMENTS, 57 | extras_require={ 58 | "training": TRAINING_REQUIREMENTS, 59 | }, 60 | python_requires='>=3.7', 61 | ) 62 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg,\ 6 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 7 | from .openai import load_openai_model, list_openai_models 8 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\ 9 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 10 | from .tokenizer import SimpleTokenizer, tokenize 11 | from .transform import image_transform, AugmentationCfg 12 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/constants.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/constants.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/factory.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/factory.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/hf_configs.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/hf_configs.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/hf_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/hf_model.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/modified_resnet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/modified_resnet.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/openai.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/openai.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/pretrained.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/pretrained.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/timm_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/timm_model.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/transform.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/transform.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/transformer.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/__pycache__/version.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/__pycache__/version.cpython-38.pyc -------------------------------------------------------------------------------- /open_clip/src/open_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/open_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /open_clip/src/open_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/hf_configs.py: -------------------------------------------------------------------------------- 1 | # HF architecture dict: 2 | arch_dict = { 3 | # https://huggingface.co/docs/transformers/model_doc/roberta#roberta 4 | "roberta": { 5 | "config_names": { 6 | "context_length": "max_position_embeddings", 7 | "vocab_size": "vocab_size", 8 | "width": "hidden_size", 9 | "heads": "num_attention_heads", 10 | "layers": "num_hidden_layers", 11 | "layer_attr": "layer", 12 | "token_embeddings_attr": "embeddings" 13 | }, 14 | "pooler": "mean_pooler", 15 | }, 16 | # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig 17 | "xlm-roberta": { 18 | "config_names": { 19 | "context_length": "max_position_embeddings", 20 | "vocab_size": "vocab_size", 21 | "width": "hidden_size", 22 | "heads": "num_attention_heads", 23 | "layers": "num_hidden_layers", 24 | "layer_attr": "layer", 25 | "token_embeddings_attr": "embeddings" 26 | }, 27 | "pooler": "mean_pooler", 28 | }, 29 | # https://huggingface.co/docs/transformers/model_doc/mt5#mt5 30 | "mt5": { 31 | "config_names": { 32 | # unlimited seqlen 33 | # https://github.com/google-research/text-to-text-transfer-transformer/issues/273 34 | # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374 35 | "context_length": "", 36 | "vocab_size": "vocab_size", 37 | "width": "d_model", 38 | "heads": "num_heads", 39 | "layers": 
"num_layers", 40 | "layer_attr": "block", 41 | "token_embeddings_attr": "embed_tokens" 42 | }, 43 | "pooler": "mean_pooler", 44 | }, 45 | } 46 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/RN50x64.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": [ 6 | 3, 7 | 15, 8 | 36, 9 | 10 10 | ], 11 | "width": 128, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 1024, 18 | "heads": 16, 19 | "layers": 12 20 | } 21 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-H-14.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-M-16-alt.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16, 8 | "ls_init_value": 1e-4 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 384, 14 | "heads": 6, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-M-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-M-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-M-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 512, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-S-16-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-S-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-S-32-alt.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 256, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 256, 13 | "heads": 4, 14 | "layers": 10 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-S-32.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "embed_dim": 384, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 384, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 384, 13 | "heads": 6, 14 | "layers": 12 15 | } 16 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-bigG-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 1664, 7 | "head_width": 104, 8 | "mlp_ratio": 4.9231, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 32 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-e-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 56, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.5715, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1280, 15 | "heads": 20, 16 | "layers": 36 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_base_w.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_base_w_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_base", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 320 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | 
"heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_large.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 12, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_large_d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_large", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "mlp", 8 | "timm_drop": 0.1, 9 | "timm_drop_path": 0.1, 10 | "image_size": 256 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 768, 16 | "heads": 12, 17 | "layers": 16 18 | } 19 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_small.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_small", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_tiny.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_tiny", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_xlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 16 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_xxlarge.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 
-------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/convnext_xxlarge_320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "convnext_xxlarge", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 320 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/mt5-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "google/mt5-base", 11 | "hf_tokenizer_name": "google/mt5-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/mt5-xl-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "google/mt5-xl", 12 | "hf_tokenizer_name": "google/mt5-xl", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/roberta-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "roberta-base", 12 | "hf_tokenizer_name": "roberta-base", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 640, 14 | "heads": 10, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/vit_medium_patch16_gap_256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_medium_patch16_gap_256", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 256 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- 
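The mt5-* and roberta-* configs above (and the xlm-roberta-* configs that follow) replace the built-in text transformer with a Hugging Face encoder: "hf_model_name"/"hf_tokenizer_name" select the backbone, "pooler_type" chooses how token states are pooled, and "proj" maps the pooled output into the shared embed_dim. A minimal, illustrative sketch (assuming open_clip and transformers are installed and the Hugging Face Hub is reachable for the roberta-base configuration):

import torch
import open_clip

# pretrained_hf=False skips downloading pretrained roberta-base weights; the hub is
# still consulted for the model configuration.
model, _, preprocess = open_clip.create_model_and_transforms(
    "roberta-ViT-B-32", pretrained_hf=False
)
tokenizer = open_clip.get_tokenizer("roberta-ViT-B-32")  # wraps the roberta-base tokenizer

text = tokenizer(["a diagram", "a dog", "a cat"])
with torch.no_grad():
    text_features = model.encode_text(text)  # shape (3, 512) -- embed_dim from the config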
/open_clip/src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_relpos_medium_patch16_cls_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "hf_model_name": "xlm-roberta-base", 11 | "hf_tokenizer_name": "xlm-roberta-base", 12 | "proj": "mlp", 13 | "pooler_type": "mean_pooler" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "hf_model_name": "xlm-roberta-large", 12 | "hf_tokenizer_name": "xlm-roberta-large", 13 | "proj": "mlp", 14 | "pooler_type": "mean_pooler" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import repeat 2 | import collections.abc 3 | 4 | from torch import nn as nn 5 | from torchvision.ops.misc import FrozenBatchNorm2d 6 | 7 | 8 | def freeze_batch_norm_2d(module, module_match={}, name=''): 9 | """ 10 | Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is 11 | itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and 12 | returned. Otherwise, the module is walked recursively and submodules are converted in place. 13 | 14 | Args: 15 | module (torch.nn.Module): Any PyTorch module. 
16 | module_match (dict): Dictionary of full module names to freeze (all if empty) 17 | name (str): Full module name (prefix) 18 | 19 | Returns: 20 | torch.nn.Module: Resulting module 21 | 22 | Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 23 | """ 24 | res = module 25 | is_match = True 26 | if module_match: 27 | is_match = name in module_match 28 | if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)): 29 | res = FrozenBatchNorm2d(module.num_features) 30 | res.num_features = module.num_features 31 | res.affine = module.affine 32 | if module.affine: 33 | res.weight.data = module.weight.data.clone().detach() 34 | res.bias.data = module.bias.data.clone().detach() 35 | res.running_mean.data = module.running_mean.data 36 | res.running_var.data = module.running_var.data 37 | res.eps = module.eps 38 | else: 39 | for child_name, child in module.named_children(): 40 | full_child_name = '.'.join([name, child_name]) if name else child_name 41 | new_child = freeze_batch_norm_2d(child, module_match, full_child_name) 42 | if new_child is not child: 43 | res.add_module(child_name, new_child) 44 | return res 45 | 46 | 47 | # From PyTorch internals 48 | def _ntuple(n): 49 | def parse(x): 50 | if isinstance(x, collections.abc.Iterable): 51 | return x 52 | return tuple(repeat(x, n)) 53 | return parse 54 | 55 | 56 | to_1tuple = _ntuple(1) 57 | to_2tuple = _ntuple(2) 58 | to_3tuple = _ntuple(3) 59 | to_4tuple = _ntuple(4) 60 | to_ntuple = lambda n, x: _ntuple(n)(x) 61 | -------------------------------------------------------------------------------- /open_clip/src/open_clip/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.10.1' 2 | -------------------------------------------------------------------------------- /open_clip/src/open_clip_torch.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | MANIFEST.in 2 | README.md 3 | setup.py 4 | src/open_clip/__init__.py 5 | src/open_clip/bpe_simple_vocab_16e6.txt.gz 6 | src/open_clip/constants.py 7 | src/open_clip/factory.py 8 | src/open_clip/hf_configs.py 9 | src/open_clip/hf_model.py 10 | src/open_clip/loss.py 11 | src/open_clip/model.py 12 | src/open_clip/modified_resnet.py 13 | src/open_clip/openai.py 14 | src/open_clip/pretrained.py 15 | src/open_clip/timm_model.py 16 | src/open_clip/tokenizer.py 17 | src/open_clip/transform.py 18 | src/open_clip/transformer.py 19 | src/open_clip/utils.py 20 | src/open_clip/version.py 21 | src/open_clip/model_configs/RN101-quickgelu.json 22 | src/open_clip/model_configs/RN101.json 23 | src/open_clip/model_configs/RN50-quickgelu.json 24 | src/open_clip/model_configs/RN50.json 25 | src/open_clip/model_configs/RN50x16.json 26 | src/open_clip/model_configs/RN50x4.json 27 | src/open_clip/model_configs/RN50x64.json 28 | src/open_clip/model_configs/ViT-B-16-plus-240.json 29 | src/open_clip/model_configs/ViT-B-16-plus.json 30 | src/open_clip/model_configs/ViT-B-16.json 31 | src/open_clip/model_configs/ViT-B-32-plus-256.json 32 | src/open_clip/model_configs/ViT-B-32-quickgelu.json 33 | src/open_clip/model_configs/ViT-B-32.json 34 | src/open_clip/model_configs/ViT-H-14.json 35 | src/open_clip/model_configs/ViT-H-16.json 36 | src/open_clip/model_configs/ViT-L-14-280.json 37 | src/open_clip/model_configs/ViT-L-14-336.json 38 | src/open_clip/model_configs/ViT-L-14.json 39 | 
src/open_clip/model_configs/ViT-L-16-320.json 40 | src/open_clip/model_configs/ViT-L-16.json 41 | src/open_clip/model_configs/ViT-M-16-alt.json 42 | src/open_clip/model_configs/ViT-M-16.json 43 | src/open_clip/model_configs/ViT-M-32-alt.json 44 | src/open_clip/model_configs/ViT-M-32.json 45 | src/open_clip/model_configs/ViT-S-16-alt.json 46 | src/open_clip/model_configs/ViT-S-16.json 47 | src/open_clip/model_configs/ViT-S-32-alt.json 48 | src/open_clip/model_configs/ViT-S-32.json 49 | src/open_clip/model_configs/ViT-bigG-14.json 50 | src/open_clip/model_configs/ViT-e-14.json 51 | src/open_clip/model_configs/ViT-g-14.json 52 | src/open_clip/model_configs/convnext_base.json 53 | src/open_clip/model_configs/convnext_base_w.json 54 | src/open_clip/model_configs/convnext_base_w_320.json 55 | src/open_clip/model_configs/convnext_large.json 56 | src/open_clip/model_configs/convnext_large_d.json 57 | src/open_clip/model_configs/convnext_small.json 58 | src/open_clip/model_configs/convnext_tiny.json 59 | src/open_clip/model_configs/convnext_xlarge.json 60 | src/open_clip/model_configs/convnext_xxlarge.json 61 | src/open_clip/model_configs/convnext_xxlarge_320.json 62 | src/open_clip/model_configs/mt5-base-ViT-B-32.json 63 | src/open_clip/model_configs/mt5-xl-ViT-H-14.json 64 | src/open_clip/model_configs/roberta-ViT-B-32.json 65 | src/open_clip/model_configs/swin_base_patch4_window7_224.json 66 | src/open_clip/model_configs/vit_medium_patch16_gap_256.json 67 | src/open_clip/model_configs/vit_relpos_medium_patch16_cls_224.json 68 | src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json 69 | src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json 70 | src/open_clip_torch.egg-info/PKG-INFO 71 | src/open_clip_torch.egg-info/SOURCES.txt 72 | src/open_clip_torch.egg-info/dependency_links.txt 73 | src/open_clip_torch.egg-info/requires.txt 74 | src/open_clip_torch.egg-info/top_level.txt 75 | src/training/__init__.py 76 | src/training/data.py 77 | src/training/distributed.py 78 | src/training/file_utils.py 79 | src/training/imagenet_zeroshot_data.py 80 | src/training/logger.py 81 | src/training/main.py 82 | src/training/params.py 83 | src/training/precision.py 84 | src/training/profile.py 85 | src/training/scheduler.py 86 | src/training/train.py 87 | src/training/zero_shot.py -------------------------------------------------------------------------------- /open_clip/src/open_clip_torch.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /open_clip/src/open_clip_torch.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | torch>=1.9.0 2 | torchvision 3 | regex 4 | ftfy 5 | tqdm 6 | huggingface_hub 7 | sentencepiece 8 | protobuf==3.20.* 9 | timm 10 | 11 | [training] 12 | torch>=1.9.0 13 | torchvision 14 | webdataset>=0.2.5 15 | regex 16 | ftfy 17 | tqdm 18 | pandas 19 | braceexpand 20 | huggingface_hub 21 | transformers 22 | timm 23 | fsspec 24 | -------------------------------------------------------------------------------- /open_clip/src/open_clip_torch.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | open_clip 2 | training 3 | -------------------------------------------------------------------------------- /open_clip/src/training/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/open_clip/src/training/__init__.py -------------------------------------------------------------------------------- /open_clip/src/training/file_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import multiprocessing 4 | import subprocess 5 | import time 6 | import fsspec 7 | import torch 8 | from tqdm import tqdm 9 | 10 | def remote_sync_s3(local_dir, remote_dir): 11 | # skip epoch_latest which can change during sync. 12 | result = subprocess.run(["aws", "s3", "sync", local_dir, remote_dir, '--exclude', '*epoch_latest.pt'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 13 | if result.returncode != 0: 14 | logging.error(f"Error: Failed to sync with S3 bucket {result.stderr.decode('utf-8')}") 15 | return False 16 | 17 | logging.info(f"Successfully synced with S3 bucket") 18 | return True 19 | 20 | def remote_sync_fsspec(local_dir, remote_dir): 21 | # FIXME currently this is slow and not recommended. Look into speeding up. 22 | a = fsspec.get_mapper(local_dir) 23 | b = fsspec.get_mapper(remote_dir) 24 | 25 | for k in a: 26 | # skip epoch_latest which can change during sync. 27 | if 'epoch_latest.pt' in k: 28 | continue 29 | 30 | logging.info(f'Attempting to sync {k}') 31 | if k in b and len(a[k]) == len(b[k]): 32 | logging.debug(f'Skipping remote sync for {k}.') 33 | continue 34 | 35 | try: 36 | logging.info(f'Successful sync for {k}.') 37 | b[k] = a[k] 38 | except Exception as e: 39 | logging.info(f'Error during remote sync for {k}: {e}') 40 | return False 41 | 42 | return True 43 | 44 | def remote_sync(local_dir, remote_dir, protocol): 45 | logging.info('Starting remote sync.') 46 | if protocol == 's3': 47 | return remote_sync_s3(local_dir, remote_dir) 48 | elif protocol == 'fsspec': 49 | return remote_sync_fsspec(local_dir, remote_dir) 50 | else: 51 | logging.error('Remote protocol not known') 52 | return False 53 | 54 | def keep_running_remote_sync(sync_every, local_dir, remote_dir, protocol): 55 | while True: 56 | time.sleep(sync_every) 57 | remote_sync(local_dir, remote_dir, protocol) 58 | 59 | def start_sync_process(sync_every, local_dir, remote_dir, protocol): 60 | p = multiprocessing.Process(target=keep_running_remote_sync, args=(sync_every, local_dir, remote_dir, protocol)) 61 | return p 62 | 63 | # Note: we are not currently using this save function. 
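# Note on usage: start_sync_process above only constructs the multiprocessing.Process and
# returns it without starting it, so the caller is expected to call .start() on the result, e.g.
#     p = start_sync_process(sync_every, local_dir, remote_dir, protocol)
#     p.start()
# Note on pt_save below: it opens file_path through fsspec but then passes `file_path`
# (not the handle `f`) to torch.save, so the fsspec handle goes unused; the function is
# currently unused (see the comment above), and the intended call is presumably
# torch.save(pt_obj, f).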
64 | def pt_save(pt_obj, file_path): 65 | of = fsspec.open(file_path, "wb") 66 | with of as f: 67 | torch.save(pt_obj, file_path) 68 | 69 | def pt_load(file_path, map_location=None): 70 | if not file_path.startswith('/'): 71 | logging.info('Loading remote checkpoint, which may take a bit.') 72 | of = fsspec.open(file_path, "rb") 73 | with of as f: 74 | out = torch.load(f, map_location=map_location) 75 | return out 76 | 77 | def check_exists(file_path): 78 | try: 79 | with fsspec.open(file_path): 80 | pass 81 | except FileNotFoundError: 82 | return False 83 | return True 84 | -------------------------------------------------------------------------------- /open_clip/src/training/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def setup_logging(log_file, level, include_host=False): 5 | if include_host: 6 | import socket 7 | hostname = socket.gethostname() 8 | formatter = logging.Formatter( 9 | f'%(asctime)s | {hostname} | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 10 | else: 11 | formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s', datefmt='%Y-%m-%d,%H:%M:%S') 12 | 13 | logging.root.setLevel(level) 14 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 15 | for logger in loggers: 16 | logger.setLevel(level) 17 | 18 | stream_handler = logging.StreamHandler() 19 | stream_handler.setFormatter(formatter) 20 | logging.root.addHandler(stream_handler) 21 | 22 | if log_file: 23 | file_handler = logging.FileHandler(filename=log_file) 24 | file_handler.setFormatter(formatter) 25 | logging.root.addHandler(file_handler) 26 | 27 | -------------------------------------------------------------------------------- /open_clip/src/training/precision.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from contextlib import suppress 3 | 4 | 5 | def get_autocast(precision): 6 | if precision == 'amp': 7 | return torch.cuda.amp.autocast 8 | elif precision == 'amp_bfloat16' or precision == 'amp_bf16': 9 | # amp_bfloat16 is more stable than amp float16 for clip training 10 | return lambda: torch.cuda.amp.autocast(dtype=torch.bfloat16) 11 | else: 12 | return suppress 13 | -------------------------------------------------------------------------------- /open_clip/src/training/scheduler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def assign_learning_rate(optimizer, new_lr): 5 | for param_group in optimizer.param_groups: 6 | param_group["lr"] = new_lr 7 | 8 | 9 | def _warmup_lr(base_lr, warmup_length, step): 10 | return base_lr * (step + 1) / warmup_length 11 | 12 | 13 | def const_lr(optimizer, base_lr, warmup_length, steps): 14 | def _lr_adjuster(step): 15 | if step < warmup_length: 16 | lr = _warmup_lr(base_lr, warmup_length, step) 17 | else: 18 | lr = base_lr 19 | assign_learning_rate(optimizer, lr) 20 | return lr 21 | return _lr_adjuster 22 | 23 | 24 | def const_lr_cooldown(optimizer, base_lr, warmup_length, steps, cooldown_steps, cooldown_power=1.0, cooldown_end_lr=0.): 25 | def _lr_adjuster(step): 26 | start_cooldown_step = steps - cooldown_steps 27 | if step < warmup_length: 28 | lr = _warmup_lr(base_lr, warmup_length, step) 29 | else: 30 | if step < start_cooldown_step: 31 | lr = base_lr 32 | else: 33 | e = step - start_cooldown_step 34 | es = steps - start_cooldown_step 35 | # linear decay if power == 1; polynomial decay otherwise; 36 | decay 
= (1 - (e/es)) ** cooldown_power 37 | lr = decay * (base_lr - cooldown_end_lr) + cooldown_end_lr 38 | assign_learning_rate(optimizer, lr) 39 | return lr 40 | return _lr_adjuster 41 | 42 | 43 | def cosine_lr(optimizer, base_lr, warmup_length, steps): 44 | def _lr_adjuster(step): 45 | if step < warmup_length: 46 | lr = _warmup_lr(base_lr, warmup_length, step) 47 | else: 48 | e = step - warmup_length 49 | es = steps - warmup_length 50 | lr = 0.5 * (1 + np.cos(np.pi * e / es)) * base_lr 51 | assign_learning_rate(optimizer, lr) 52 | return lr 53 | return _lr_adjuster 54 | -------------------------------------------------------------------------------- /open_clip/src/training/zero_shot.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from tqdm import tqdm 6 | 7 | from open_clip import get_cast_dtype, get_tokenizer 8 | from .precision import get_autocast 9 | from .imagenet_zeroshot_data import imagenet_classnames, openai_imagenet_template 10 | 11 | 12 | def zero_shot_classifier(model, classnames, templates, args): 13 | tokenizer = get_tokenizer(args.model) 14 | with torch.no_grad(): 15 | zeroshot_weights = [] 16 | for classname in tqdm(classnames): 17 | texts = [template(classname) for template in templates] # format with class 18 | texts = tokenizer(texts).to(args.device) # tokenize 19 | if args.distributed and not args.horovod: 20 | class_embeddings = model.module.encode_text(texts) 21 | else: 22 | class_embeddings = model.encode_text(texts) 23 | class_embedding = F.normalize(class_embeddings, dim=-1).mean(dim=0) 24 | class_embedding /= class_embedding.norm() 25 | zeroshot_weights.append(class_embedding) 26 | zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(args.device) 27 | return zeroshot_weights 28 | 29 | 30 | def accuracy(output, target, topk=(1,)): 31 | pred = output.topk(max(topk), 1, True, True)[1].t() 32 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 33 | return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk] 34 | 35 | 36 | def run(model, classifier, dataloader, args): 37 | autocast = get_autocast(args.precision) 38 | cast_dtype = get_cast_dtype(args.precision) 39 | with torch.no_grad(): 40 | top1, top5, n = 0., 0., 0. 41 | for images, target in tqdm(dataloader, unit_scale=args.batch_size): 42 | images = images.to(args.device) 43 | if cast_dtype is not None: 44 | images = images.to(dtype=cast_dtype) 45 | target = target.to(args.device) 46 | 47 | with autocast(): 48 | # predict 49 | if args.distributed and not args.horovod: 50 | image_features = model.module.encode_image(images) 51 | else: 52 | image_features = model.encode_image(images) 53 | image_features = F.normalize(image_features, dim=-1) 54 | logits = 100. 
* image_features @ classifier 55 | 56 | # measure accuracy 57 | acc1, acc5 = accuracy(logits, target, topk=(1, 5)) 58 | top1 += acc1 59 | top5 += acc5 60 | n += images.size(0) 61 | 62 | top1 = (top1 / n) 63 | top5 = (top5 / n) 64 | return top1, top5 65 | 66 | 67 | def zero_shot_eval(model, data, epoch, args): 68 | if 'imagenet-val' not in data and 'imagenet-v2' not in data: 69 | return {} 70 | if args.zeroshot_frequency == 0: 71 | return {} 72 | if (epoch % args.zeroshot_frequency) != 0 and epoch != args.epochs: 73 | return {} 74 | 75 | logging.info('Starting zero-shot imagenet.') 76 | 77 | logging.info('Building zero-shot classifier') 78 | classifier = zero_shot_classifier(model, imagenet_classnames, openai_imagenet_template, args) 79 | 80 | logging.info('Using classifier') 81 | results = {} 82 | if 'imagenet-val' in data: 83 | top1, top5 = run(model, classifier, data['imagenet-val'].dataloader, args) 84 | results['imagenet-zeroshot-val-top1'] = top1 85 | results['imagenet-zeroshot-val-top5'] = top5 86 | if 'imagenet-v2' in data: 87 | top1, top5 = run(model, classifier, data['imagenet-v2'].dataloader, args) 88 | results['imagenetv2-zeroshot-val-top1'] = top1 89 | results['imagenetv2-zeroshot-val-top5'] = top5 90 | 91 | logging.info('Finished zero-shot imagenet.') 92 | 93 | return results 94 | -------------------------------------------------------------------------------- /open_clip/tests/test_hf_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import torch 4 | from open_clip.hf_model import _POOLERS, HFTextEncoder 5 | from transformers import AutoConfig 6 | from transformers.modeling_outputs import BaseModelOutput 7 | # test poolers 8 | def test_poolers(): 9 | bs, sl, d = 2, 10, 5 10 | h = torch.arange(sl).repeat(bs).reshape(bs, sl)[..., None] * torch.linspace(0.2, 1., d) 11 | mask = torch.ones(bs, sl, dtype=torch.long) 12 | mask[:2, 6:] = 0 13 | x = BaseModelOutput(h) 14 | for name, cls in _POOLERS.items(): 15 | pooler = cls() 16 | res = pooler(x, mask) 17 | assert res.shape == (bs, d), f"{name} returned wrong shape" 18 | 19 | # test HFTextEncoder 20 | @pytest.mark.parametrize("model_id", ["arampacha/roberta-tiny", "roberta-base", "xlm-roberta-base", "google/mt5-base"]) 21 | def test_pretrained_text_encoder(model_id): 22 | bs, sl, d = 2, 10, 64 23 | cfg = AutoConfig.from_pretrained(model_id) 24 | model = HFTextEncoder(model_id, d, proj='linear') 25 | x = torch.randint(0, cfg.vocab_size, (bs, sl)) 26 | with torch.no_grad(): 27 | emb = model(x) 28 | 29 | assert emb.shape == (bs, d) 30 | -------------------------------------------------------------------------------- /open_clip/tests/test_inference.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import pytest 4 | import torch 5 | import open_clip 6 | import util_test 7 | 8 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 9 | 10 | models_to_test = set(open_clip.list_models()) 11 | 12 | # testing excemptions 13 | models_to_test = models_to_test.difference({ 14 | # not available with timm yet 15 | # see https://github.com/mlfoundations/open_clip/issues/219 16 | 'convnext_xlarge', 17 | 'convnext_xxlarge', 18 | 'convnext_xxlarge_320', 19 | 'vit_medium_patch16_gap_256', 20 | # exceeds GH runner memory limit 21 | 'ViT-bigG-14', 22 | 'ViT-e-14', 23 | 'mt5-xl-ViT-H-14', 24 | }) 25 | 26 | if 'OPEN_CLIP_TEST_REG_MODELS' in os.environ: 27 | external_model_list = os.environ['OPEN_CLIP_TEST_REG_MODELS'] 28 | with 
open(external_model_list, 'r') as f: 29 | models_to_test = set(f.read().splitlines()).intersection(models_to_test) 30 | print(f"Selected models from {external_model_list}: {models_to_test}") 31 | 32 | models_to_test = list(models_to_test) 33 | models_to_test.sort() 34 | 35 | @pytest.mark.regression_test 36 | @pytest.mark.parametrize('model_name', models_to_test) 37 | def test_inference_with_data( 38 | model_name, 39 | pretrained = None, 40 | pretrained_hf = False, 41 | precision = 'fp32', 42 | jit = False, 43 | force_quick_gelu = False, 44 | ): 45 | util_test.seed_all() 46 | model, _, preprocess_val = open_clip.create_model_and_transforms( 47 | model_name, 48 | pretrained = pretrained, 49 | precision = precision, 50 | jit = jit, 51 | force_quick_gelu = force_quick_gelu, 52 | pretrained_hf = pretrained_hf 53 | ) 54 | model_id = f'{model_name}_{pretrained or pretrained_hf}_{precision}' 55 | input_dir, output_dir = util_test.get_data_dirs() 56 | # text 57 | input_text_path = os.path.join(input_dir, 'random_text.pt') 58 | gt_text_path = os.path.join(output_dir, f'{model_id}_random_text.pt') 59 | if not os.path.isfile(input_text_path): 60 | pytest.skip(reason = f"missing test data, expected at {input_text_path}") 61 | if not os.path.isfile(gt_text_path): 62 | pytest.skip(reason = f"missing test data, expected at {gt_text_path}") 63 | input_text = torch.load(input_text_path) 64 | gt_text = torch.load(gt_text_path) 65 | y_text = util_test.inference_text(model, model_name, input_text) 66 | assert (y_text == gt_text).all(), f"text output differs @ {input_text_path}" 67 | # image 68 | image_size = model.visual.image_size 69 | if not isinstance(image_size, tuple): 70 | image_size = (image_size, image_size) 71 | input_image_path = os.path.join(input_dir, f'random_image_{image_size[0]}_{image_size[1]}.pt') 72 | gt_image_path = os.path.join(output_dir, f'{model_id}_random_image.pt') 73 | if not os.path.isfile(input_image_path): 74 | pytest.skip(reason = f"missing test data, expected at {input_image_path}") 75 | if not os.path.isfile(gt_image_path): 76 | pytest.skip(reason = f"missing test data, expected at {gt_image_path}") 77 | input_image = torch.load(input_image_path) 78 | gt_image = torch.load(gt_image_path) 79 | y_image = util_test.inference_image(model, preprocess_val, input_image) 80 | assert (y_image == gt_image).all(), f"image output differs @ {input_image_path}" 81 | 82 | 83 | -------------------------------------------------------------------------------- /open_clip/tests/test_inference_simple.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from PIL import Image 4 | from open_clip.factory import get_tokenizer 5 | import pytest 6 | import open_clip 7 | import os 8 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 9 | 10 | @pytest.mark.parametrize("model_type,pretrained", [("ViT-B-32-quickgelu", "laion400m_e32"), ("roberta-ViT-B-32", "laion2b_s12b_b32k")]) 11 | def test_inference_simple(model_type, pretrained): 12 | model, _, preprocess = open_clip.create_model_and_transforms(model_type, pretrained=pretrained, jit=False) 13 | tokenizer = get_tokenizer(model_type) 14 | 15 | current_dir = os.path.dirname(os.path.realpath(__file__)) 16 | 17 | image = preprocess(Image.open(current_dir + "/../docs/CLIP.png")).unsqueeze(0) 18 | text = tokenizer(["a diagram", "a dog", "a cat"]) 19 | 20 | with torch.no_grad(): 21 | image_features = model.encode_image(image) 22 | text_features = model.encode_text(text) 23 | 24 | text_probs = (100.0 * image_features @ 
text_features.T).softmax(dim=-1) 25 | 26 | assert text_probs.cpu().numpy()[0].tolist() == [1.0, 0.0, 0.0] 27 | -------------------------------------------------------------------------------- /open_clip/tests/test_num_shards.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from training.data import get_dataset_size 4 | 5 | @pytest.mark.parametrize( 6 | "shards,expected_size", 7 | [ 8 | ('/path/to/shard.tar', 1), 9 | ('/path/to/shard_{000..000}.tar', 1), 10 | ('/path/to/shard_{000..009}.tar', 10), 11 | ('/path/to/shard_{000..009}_{000..009}.tar', 100), 12 | ('/path/to/shard.tar::/path/to/other_shard_{000..009}.tar', 11), 13 | ('/path/to/shard_{000..009}.tar::/path/to/other_shard_{000..009}.tar', 20), 14 | (['/path/to/shard.tar'], 1), 15 | (['/path/to/shard.tar', '/path/to/other_shard.tar'], 2), 16 | ] 17 | ) 18 | def test_num_shards(shards, expected_size): 19 | _, size = get_dataset_size(shards) 20 | assert size == expected_size, f'Expected {expected_size} for {shards} but found {size} instead.' 21 | -------------------------------------------------------------------------------- /open_clip/tests/test_training_simple.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import pytest 5 | from PIL import Image 6 | import torch 7 | from training.main import main 8 | 9 | os.environ["CUDA_VISIBLE_DEVICES"] = "" 10 | 11 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 12 | def test_training(): 13 | main([ 14 | '--save-frequency', '1', 15 | '--zeroshot-frequency', '1', 16 | '--dataset-type', "synthetic", 17 | '--train-num-samples', '16', 18 | '--warmup', '1', 19 | '--batch-size', '4', 20 | '--lr', '1e-3', 21 | '--wd', '0.1', 22 | '--epochs', '1', 23 | '--workers', '2', 24 | '--model', 'RN50' 25 | ]) 26 | 27 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 28 | def test_training_mt5(): 29 | main([ 30 | '--save-frequency', '1', 31 | '--zeroshot-frequency', '1', 32 | '--dataset-type', "synthetic", 33 | '--train-num-samples', '16', 34 | '--warmup', '1', 35 | '--batch-size', '4', 36 | '--lr', '1e-3', 37 | '--wd', '0.1', 38 | '--epochs', '1', 39 | '--workers', '2', 40 | '--model', 'mt5-base-ViT-B-32', 41 | '--lock-text', 42 | '--lock-text-unlocked-layers', '2' 43 | ]) 44 | 45 | 46 | 47 | @pytest.mark.skipif(sys.platform.startswith('darwin'), reason="macos pickle bug with locals") 48 | def test_training_unfreezing_vit(): 49 | main([ 50 | '--save-frequency', '1', 51 | '--zeroshot-frequency', '1', 52 | '--dataset-type', "synthetic", 53 | '--train-num-samples', '16', 54 | '--warmup', '1', 55 | '--batch-size', '4', 56 | '--lr', '1e-3', 57 | '--wd', '0.1', 58 | '--epochs', '1', 59 | '--workers', '2', 60 | '--model', 'ViT-B-32', 61 | '--lock-image', 62 | '--lock-image-unlocked-groups', '5' 63 | ]) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | timm==0.9.1 2 | scikit-image==0.15.0 3 | scikit-learn==0.24.2 4 | opencv-python==4.5.5.64 5 | hydra-core==1.3.2 6 | openmim==0.3.6 7 | mmcv-full==1.7.1 8 | mmsegmentation==0.29.1 9 | torch==1.12.1+cu113 10 | torchvision==0.13.1 11 | tokenizers==0.11.1 12 | Pillow~=9.5 13 | detectron2 #Following https://detectron2.readthedocs.io/en/latest/tutorials/install.html 14 | 
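The requirements above pin a CUDA 11.3 build of PyTorch 1.12.1 together with timm, mmcv-full/mmsegmentation and detectron2 (the latter installed separately, following the linked detectron2 instructions). As a purely illustrative sanity check (not part of the repository), the pinned stack can be verified with a short import script once the environment is built:

import torch
import torchvision
import timm
import mmcv
import mmseg          # import name of the mmsegmentation package
import detectron2     # installed separately per the detectron2 docs

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("torchvision:", torchvision.__version__)
print("timm:", timm.__version__)
print("mmcv:", mmcv.__version__)
print("mmsegmentation:", mmseg.__version__)
print("detectron2:", detectron2.__version__)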
-------------------------------------------------------------------------------- /transformers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/activations.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/activations.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/configuration_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/configuration_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/convert_slow_tokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/convert_slow_tokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/deepspeed.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/deepspeed.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/dependency_versions_check.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/dependency_versions_check.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/dependency_versions_table.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/dependency_versions_table.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/dynamic_module_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/dynamic_module_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/feature_extraction_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/feature_extraction_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/file_utils.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/file_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/image_processing_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/image_processing_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/image_transforms.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/image_transforms.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/image_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/image_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_outputs.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/modeling_outputs.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/modeling_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/modeling_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/processing_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/processing_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/pytorch_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/pytorch_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/tokenization_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_utils_base.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/tokenization_utils_base.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/__pycache__/tokenization_utils_fast.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/__pycache__/tokenization_utils_fast.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/benchmark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/benchmark/__init__.py -------------------------------------------------------------------------------- /transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from argparse import ArgumentParser 17 | 18 | 19 | class BaseTransformersCLICommand(ABC): 20 | @staticmethod 21 | @abstractmethod 22 | def register_subcommand(parser: ArgumentParser): 23 | raise NotImplementedError() 24 | 25 | @abstractmethod 26 | def run(self): 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from argparse import ArgumentParser 16 | 17 | from . 
import BaseTransformersCLICommand 18 | 19 | 20 | def download_command_factory(args): 21 | return DownloadCommand(args.model, args.cache_dir, args.force) 22 | 23 | 24 | class DownloadCommand(BaseTransformersCLICommand): 25 | @staticmethod 26 | def register_subcommand(parser: ArgumentParser): 27 | download_parser = parser.add_parser("download") 28 | download_parser.add_argument( 29 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 30 | ) 31 | download_parser.add_argument( 32 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 33 | ) 34 | download_parser.add_argument("model", type=str, help="Name of the model to download") 35 | download_parser.set_defaults(func=download_command_factory) 36 | 37 | def __init__(self, model: str, cache: str, force: bool): 38 | self._model = model 39 | self._cache = cache 40 | self._force = force 41 | 42 | def run(self): 43 | from ..models.auto import AutoModel, AutoTokenizer 44 | 45 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 46 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 47 | -------------------------------------------------------------------------------- /transformers/commands/transformers_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2020 The HuggingFace Team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | from argparse import ArgumentParser 17 | 18 | from .add_new_model import AddNewModelCommand 19 | from .add_new_model_like import AddNewModelLikeCommand 20 | from .convert import ConvertCommand 21 | from .download import DownloadCommand 22 | from .env import EnvironmentCommand 23 | from .lfs import LfsCommands 24 | from .pt_to_tf import PTtoTFCommand 25 | from .run import RunCommand 26 | from .serving import ServeCommand 27 | from .user import UserCommands 28 | 29 | 30 | def main(): 31 | parser = ArgumentParser("Transformers CLI tool", usage="transformers-cli []") 32 | commands_parser = parser.add_subparsers(help="transformers-cli command helpers") 33 | 34 | # Register commands 35 | ConvertCommand.register_subcommand(commands_parser) 36 | DownloadCommand.register_subcommand(commands_parser) 37 | EnvironmentCommand.register_subcommand(commands_parser) 38 | RunCommand.register_subcommand(commands_parser) 39 | ServeCommand.register_subcommand(commands_parser) 40 | UserCommands.register_subcommand(commands_parser) 41 | AddNewModelCommand.register_subcommand(commands_parser) 42 | AddNewModelLikeCommand.register_subcommand(commands_parser) 43 | LfsCommands.register_subcommand(commands_parser) 44 | PTtoTFCommand.register_subcommand(commands_parser) 45 | 46 | # Let's go 47 | args = parser.parse_args() 48 | 49 | if not hasattr(args, "func"): 50 | parser.print_help() 51 | exit(1) 52 | 53 | # Run 54 | service = args.func(args) 55 | service.run() 56 | 57 | 58 | if __name__ == "__main__": 59 | main() 60 | -------------------------------------------------------------------------------- /transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Seq2Seq TF Hub checkpoint.""" 16 | 17 | 18 | import argparse 19 | 20 | from . 
import ( 21 | BertConfig, 22 | BertGenerationConfig, 23 | BertGenerationDecoder, 24 | BertGenerationEncoder, 25 | load_tf_weights_in_bert_generation, 26 | logging, 27 | ) 28 | 29 | 30 | logging.set_verbosity_info() 31 | 32 | 33 | def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder): 34 | # Initialise PyTorch model 35 | bert_config = BertConfig.from_pretrained( 36 | "bert-large-cased", 37 | vocab_size=vocab_size, 38 | max_position_embeddings=512, 39 | is_decoder=True, 40 | add_cross_attention=True, 41 | ) 42 | bert_config_dict = bert_config.to_dict() 43 | del bert_config_dict["type_vocab_size"] 44 | config = BertGenerationConfig(**bert_config_dict) 45 | if is_encoder: 46 | model = BertGenerationEncoder(config) 47 | else: 48 | model = BertGenerationDecoder(config) 49 | print(f"Building PyTorch model from configuration: {config}") 50 | 51 | # Load weights from tf checkpoint 52 | load_tf_weights_in_bert_generation( 53 | model, 54 | tf_hub_path, 55 | model_class="bert", 56 | is_encoder_named_decoder=is_encoder_named_decoder, 57 | is_encoder=is_encoder, 58 | ) 59 | 60 | # Save pytorch-model 61 | print(f"Save PyTorch model and config to {pytorch_dump_path}") 62 | model.save_pretrained(pytorch_dump_path) 63 | 64 | 65 | if __name__ == "__main__": 66 | parser = argparse.ArgumentParser() 67 | # Required parameters 68 | parser.add_argument( 69 | "--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 70 | ) 71 | parser.add_argument( 72 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 73 | ) 74 | parser.add_argument( 75 | "--is_encoder_named_decoder", 76 | action="store_true", 77 | help="If decoder has to be renamed to encoder in PyTorch model.", 78 | ) 79 | parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.") 80 | parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model") 81 | args = parser.parse_args() 82 | convert_tf_checkpoint_to_pytorch( 83 | args.tf_hub_path, 84 | args.pytorch_dump_path, 85 | args.is_encoder_named_decoder, 86 | args.vocab_size, 87 | is_encoder=args.is_encoder, 88 | ) 89 | -------------------------------------------------------------------------------- /transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .data_collator import ( 16 | DataCollatorForLanguageModeling, 17 | DataCollatorForPermutationLanguageModeling, 18 | DataCollatorForSeq2Seq, 19 | DataCollatorForSOP, 20 | DataCollatorForTokenClassification, 21 | DataCollatorForWholeWordMask, 22 | DataCollatorWithPadding, 23 | DefaultDataCollator, 24 | default_data_collator, 25 | ) 26 | from .metrics import glue_compute_metrics, xnli_compute_metrics 27 | from .processors import ( 28 | DataProcessor, 29 | InputExample, 30 | InputFeatures, 31 | SingleSentenceClassificationProcessor, 32 | SquadExample, 33 | SquadFeatures, 34 | SquadV1Processor, 35 | SquadV2Processor, 36 | glue_convert_examples_to_features, 37 | glue_output_modes, 38 | glue_processors, 39 | glue_tasks_num_labels, 40 | squad_convert_examples_to_features, 41 | xnli_output_modes, 42 | xnli_processors, 43 | xnli_tasks_num_labels, 44 | ) 45 | -------------------------------------------------------------------------------- /transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .glue import GlueDataset, GlueDataTrainingArguments 16 | from .language_modeling import ( 17 | LineByLineTextDataset, 18 | LineByLineWithRefDataset, 19 | LineByLineWithSOPTextDataset, 20 | TextDataset, 21 | TextDatasetForNextSentencePrediction, 22 | ) 23 | from .squad import SquadDataset, SquadDataTrainingArguments 24 | -------------------------------------------------------------------------------- /transformers/data/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import warnings 14 | 15 | from ...utils import is_sklearn_available, requires_backends 16 | 17 | 18 | if is_sklearn_available(): 19 | from scipy.stats import pearsonr, spearmanr 20 | from sklearn.metrics import f1_score, matthews_corrcoef 21 | 22 | 23 | DEPRECATION_WARNING = ( 24 | "This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate " 25 | "library. 
You can have a look at this example script for pointers: " 26 | "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py" 27 | ) 28 | 29 | 30 | def simple_accuracy(preds, labels): 31 | warnings.warn(DEPRECATION_WARNING, FutureWarning) 32 | requires_backends(simple_accuracy, "sklearn") 33 | return (preds == labels).mean() 34 | 35 | 36 | def acc_and_f1(preds, labels): 37 | warnings.warn(DEPRECATION_WARNING, FutureWarning) 38 | requires_backends(acc_and_f1, "sklearn") 39 | acc = simple_accuracy(preds, labels) 40 | f1 = f1_score(y_true=labels, y_pred=preds) 41 | return { 42 | "acc": acc, 43 | "f1": f1, 44 | "acc_and_f1": (acc + f1) / 2, 45 | } 46 | 47 | 48 | def pearson_and_spearman(preds, labels): 49 | warnings.warn(DEPRECATION_WARNING, FutureWarning) 50 | requires_backends(pearson_and_spearman, "sklearn") 51 | pearson_corr = pearsonr(preds, labels)[0] 52 | spearman_corr = spearmanr(preds, labels)[0] 53 | return { 54 | "pearson": pearson_corr, 55 | "spearmanr": spearman_corr, 56 | "corr": (pearson_corr + spearman_corr) / 2, 57 | } 58 | 59 | 60 | def glue_compute_metrics(task_name, preds, labels): 61 | warnings.warn(DEPRECATION_WARNING, FutureWarning) 62 | requires_backends(glue_compute_metrics, "sklearn") 63 | assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}" 64 | if task_name == "cola": 65 | return {"mcc": matthews_corrcoef(labels, preds)} 66 | elif task_name == "sst-2": 67 | return {"acc": simple_accuracy(preds, labels)} 68 | elif task_name == "mrpc": 69 | return acc_and_f1(preds, labels) 70 | elif task_name == "sts-b": 71 | return pearson_and_spearman(preds, labels) 72 | elif task_name == "qqp": 73 | return acc_and_f1(preds, labels) 74 | elif task_name == "mnli": 75 | return {"mnli/acc": simple_accuracy(preds, labels)} 76 | elif task_name == "mnli-mm": 77 | return {"mnli-mm/acc": simple_accuracy(preds, labels)} 78 | elif task_name == "qnli": 79 | return {"acc": simple_accuracy(preds, labels)} 80 | elif task_name == "rte": 81 | return {"acc": simple_accuracy(preds, labels)} 82 | elif task_name == "wnli": 83 | return {"acc": simple_accuracy(preds, labels)} 84 | elif task_name == "hans": 85 | return {"acc": simple_accuracy(preds, labels)} 86 | else: 87 | raise KeyError(task_name) 88 | 89 | 90 | def xnli_compute_metrics(task_name, preds, labels): 91 | warnings.warn(DEPRECATION_WARNING, FutureWarning) 92 | requires_backends(xnli_compute_metrics, "sklearn") 93 | assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}" 94 | if task_name == "xnli": 95 | return {"acc": simple_accuracy(preds, labels)} 96 | else: 97 | raise KeyError(task_name) 98 | -------------------------------------------------------------------------------- /transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 16 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 17 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 18 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 19 | -------------------------------------------------------------------------------- /transformers/data/processors/xnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XNLI utils (dataset loading and evaluation)""" 17 | 18 | 19 | import os 20 | 21 | from ...utils import logging 22 | from .utils import DataProcessor, InputExample 23 | 24 | 25 | logger = logging.get_logger(__name__) 26 | 27 | 28 | class XnliProcessor(DataProcessor): 29 | """ 30 | Processor for the XNLI dataset. 
Adapted from 31 | https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207 32 | """ 33 | 34 | def __init__(self, language, train_language=None): 35 | self.language = language 36 | self.train_language = train_language 37 | 38 | def get_train_examples(self, data_dir): 39 | """See base class.""" 40 | lg = self.language if self.train_language is None else self.train_language 41 | lines = self._read_tsv(os.path.join(data_dir, f"XNLI-MT-1.0/multinli/multinli.train.{lg}.tsv")) 42 | examples = [] 43 | for i, line in enumerate(lines): 44 | if i == 0: 45 | continue 46 | guid = f"train-{i}" 47 | text_a = line[0] 48 | text_b = line[1] 49 | label = "contradiction" if line[2] == "contradictory" else line[2] 50 | if not isinstance(text_a, str): 51 | raise ValueError(f"Training input {text_a} is not a string") 52 | if not isinstance(text_b, str): 53 | raise ValueError(f"Training input {text_b} is not a string") 54 | if not isinstance(label, str): 55 | raise ValueError(f"Training label {label} is not a string") 56 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 57 | return examples 58 | 59 | def get_test_examples(self, data_dir): 60 | """See base class.""" 61 | lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv")) 62 | examples = [] 63 | for i, line in enumerate(lines): 64 | if i == 0: 65 | continue 66 | language = line[0] 67 | if language != self.language: 68 | continue 69 | guid = f"test-{i}" 70 | text_a = line[6] 71 | text_b = line[7] 72 | label = line[1] 73 | if not isinstance(text_a, str): 74 | raise ValueError(f"Training input {text_a} is not a string") 75 | if not isinstance(text_b, str): 76 | raise ValueError(f"Training input {text_b} is not a string") 77 | if not isinstance(label, str): 78 | raise ValueError(f"Training label {label} is not a string") 79 | examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 80 | return examples 81 | 82 | def get_labels(self): 83 | """See base class.""" 84 | return ["contradiction", "entailment", "neutral"] 85 | 86 | 87 | xnli_processors = { 88 | "xnli": XnliProcessor, 89 | } 90 | 91 | xnli_output_modes = { 92 | "xnli": "classification", 93 | } 94 | 95 | xnli_tasks_num_labels = { 96 | "xnli": 3, 97 | } 98 | -------------------------------------------------------------------------------- /transformers/data/test_generation_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import random 16 | import unittest 17 | 18 | import timeout_decorator 19 | 20 | from ..testing_utils import require_torch 21 | from ..utils import cached_property, is_torch_available 22 | 23 | 24 | if is_torch_available(): 25 | import torch 26 | 27 | from ..models.marian import MarianConfig, MarianMTModel 28 | 29 | 30 | @require_torch 31 | class GenerationUtilsTest(unittest.TestCase): 32 | @cached_property 33 | def config(self): 34 | config = MarianConfig.from_pretrained("sshleifer/tiny-marian-en-de") 35 | return config 36 | 37 | @cached_property 38 | def model(self): 39 | return MarianMTModel(self.config) 40 | 41 | def test_postprocess_next_token_scores(self): 42 | config = self.config 43 | model = self.model 44 | # Initialize an input id tensor with batch size 8 and sequence length 12 45 | input_ids = torch.arange(0, 96, 1).view((8, 12)) 46 | eos = config.eos_token_id 47 | bad_words_ids_test_cases = [[[299]], [[23, 24], [54]], [[config.eos_token_id]], []] 48 | masked_scores = [ 49 | [(0, 299), (1, 299), (2, 299), (3, 299), (4, 299), (5, 299), (6, 299), (7, 299)], 50 | [(1, 24), (0, 54), (1, 54), (2, 54), (3, 54), (4, 54), (5, 54), (6, 54), (7, 54)], 51 | [(0, eos), (1, eos), (2, eos), (3, eos), (4, eos), (5, eos), (6, eos), (7, eos)], 52 | [], 53 | ] 54 | 55 | for test_case_index, bad_words_ids in enumerate(bad_words_ids_test_cases): 56 | # Initialize a scores tensor with batch size 8 and vocabulary size 300 57 | scores = torch.rand((8, 300)) 58 | output = model.postprocess_next_token_scores( 59 | scores, 60 | input_ids, 61 | 0, 62 | bad_words_ids, 63 | 13, 64 | 15, 65 | config.max_length, 66 | config.eos_token_id, 67 | config.repetition_penalty, 68 | 32, 69 | 5, 70 | ) 71 | for masked_score in masked_scores[test_case_index]: 72 | self.assertTrue(output[masked_score[0], masked_score[1]] == -float("inf")) 73 | 74 | @timeout_decorator.timeout(10) 75 | def test_postprocess_next_token_scores_large_bad_words_list(self): 76 | config = self.config 77 | model = self.model 78 | # Initialize an input id tensor with batch size 8 and sequence length 12 79 | input_ids = torch.arange(0, 96, 1).view((8, 12)) 80 | 81 | bad_words_ids = [] 82 | for _ in range(100): 83 | length_bad_word = random.randint(1, 4) 84 | bad_words_ids.append(random.sample(range(1, 300), length_bad_word)) 85 | 86 | scores = torch.rand((8, 300)) 87 | _ = model.postprocess_next_token_scores( 88 | scores, 89 | input_ids, 90 | 0, 91 | bad_words_ids, 92 | 13, 93 | 15, 94 | config.max_length, 95 | config.eos_token_id, 96 | config.repetition_penalty, 97 | 32, 98 | 5, 99 | ) 100 | -------------------------------------------------------------------------------- /transformers/dependency_versions_check.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
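The module that follows checks a handful of core dependency pins at import time; as a rough illustration of the helper it also exposes (the package name is taken from the table further below, and the import path assumes the stock `transformers` layout rather than this vendored copy):

from transformers.dependency_versions_check import dep_version_check

# Resolves the "numpy>=1.17" pin from dependency_versions_table.deps and raises if it is not satisfied.
dep_version_check("numpy")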
14 | import sys 15 | 16 | from .dependency_versions_table import deps 17 | from .utils.versions import require_version, require_version_core 18 | 19 | 20 | # define which module versions we always want to check at run time 21 | # (usually the ones defined in `install_requires` in setup.py) 22 | # 23 | # order specific notes: 24 | # - tqdm must be checked before tokenizers 25 | 26 | pkgs_to_check_at_runtime = "python tqdm regex requests packaging filelock numpy tokenizers".split() 27 | if sys.version_info < (3, 7): 28 | pkgs_to_check_at_runtime.append("dataclasses") 29 | if sys.version_info < (3, 8): 30 | pkgs_to_check_at_runtime.append("importlib_metadata") 31 | 32 | for pkg in pkgs_to_check_at_runtime: 33 | if pkg in deps: 34 | if pkg == "tokenizers": 35 | # must be loaded here, or else tqdm check may fail 36 | from .utils import is_tokenizers_available 37 | 38 | if not is_tokenizers_available(): 39 | continue # not required, check version only if installed 40 | 41 | require_version_core(deps[pkg]) 42 | else: 43 | raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py") 44 | 45 | 46 | def dep_version_check(pkg, hint=None): 47 | require_version(deps[pkg], hint) 48 | -------------------------------------------------------------------------------- /transformers/dependency_versions_table.py: -------------------------------------------------------------------------------- 1 | # THIS FILE HAS BEEN AUTOGENERATED. To update: 2 | # 1. modify the `_deps` dict in setup.py 3 | # 2. run `make deps_table_update`` 4 | deps = { 5 | "Pillow": "Pillow", 6 | "accelerate": "accelerate>=0.17.0", 7 | "av": "av==9.2.0", 8 | "beautifulsoup4": "beautifulsoup4", 9 | "black": "black~=23.1", 10 | "codecarbon": "codecarbon==1.2.0", 11 | "cookiecutter": "cookiecutter==1.7.3", 12 | "dataclasses": "dataclasses", 13 | "datasets": "datasets!=2.5.0", 14 | "decord": "decord==0.6.0", 15 | "deepspeed": "deepspeed>=0.8.3", 16 | "dill": "dill<0.3.5", 17 | "evaluate": "evaluate>=0.2.0", 18 | "fairscale": "fairscale>0.3", 19 | "faiss-cpu": "faiss-cpu", 20 | "fastapi": "fastapi", 21 | "filelock": "filelock", 22 | "flax": "flax>=0.4.1,<=0.6.9", 23 | "ftfy": "ftfy", 24 | "fugashi": "fugashi>=1.0", 25 | "GitPython": "GitPython<3.1.19", 26 | "hf-doc-builder": "hf-doc-builder>=0.3.0", 27 | "huggingface-hub": "huggingface-hub>=0.11.0,<1.0", 28 | "importlib_metadata": "importlib_metadata", 29 | "ipadic": "ipadic>=1.0.0,<2.0", 30 | "isort": "isort>=5.5.4", 31 | "jax": "jax>=0.2.8,!=0.3.2,<=0.3.6", 32 | "jaxlib": "jaxlib>=0.1.65,<=0.3.6", 33 | "jieba": "jieba", 34 | "kenlm": "kenlm", 35 | "keras-nlp": "keras-nlp>=0.3.1", 36 | "librosa": "librosa", 37 | "nltk": "nltk", 38 | "natten": "natten>=0.14.6", 39 | "numba": "numba<0.57.0", 40 | "numpy": "numpy>=1.17", 41 | "onnxconverter-common": "onnxconverter-common", 42 | "onnxruntime-tools": "onnxruntime-tools>=1.4.2", 43 | "onnxruntime": "onnxruntime>=1.4.0", 44 | "optuna": "optuna", 45 | "optax": "optax>=0.0.8,<=0.1.4", 46 | "packaging": "packaging>=20.0", 47 | "parameterized": "parameterized", 48 | "phonemizer": "phonemizer", 49 | "protobuf": "protobuf<=3.20.2", 50 | "psutil": "psutil", 51 | "pyyaml": "pyyaml>=5.1", 52 | "pydantic": "pydantic", 53 | "pytest": "pytest", 54 | "pytest-timeout": "pytest-timeout", 55 | "pytest-xdist": "pytest-xdist", 56 | "python": "python>=3.7.0", 57 | "ray[tune]": "ray[tune]", 58 | "regex": "regex!=2019.12.17", 59 | "requests": "requests", 60 | "rhoknp": "rhoknp>=1.1.0", 61 | "rjieba": "rjieba", 62 | "rouge-score": 
"rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", 63 | "ruff": "ruff>=0.0.241,<=0.0.259", 64 | "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", 65 | "sacremoses": "sacremoses", 66 | "safetensors": "safetensors>=0.2.1", 67 | "sagemaker": "sagemaker>=2.31.0", 68 | "scikit-learn": "scikit-learn", 69 | "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", 70 | "sigopt": "sigopt", 71 | "starlette": "starlette", 72 | "sudachipy": "sudachipy>=0.6.6", 73 | "sudachidict_core": "sudachidict_core>=20220729", 74 | "tensorflow-cpu": "tensorflow-cpu>=2.4,<2.13", 75 | "tensorflow": "tensorflow>=2.4,<2.13", 76 | "tensorflow-text": "tensorflow-text<2.13", 77 | "tf2onnx": "tf2onnx", 78 | "timeout-decorator": "timeout-decorator", 79 | "timm": "timm", 80 | "tokenizers": "tokenizers>=0.11.1,!=0.11.3,<0.14", 81 | "torch": "torch>=1.9,!=1.12.0", 82 | "torchaudio": "torchaudio", 83 | "torchvision": "torchvision", 84 | "pyctcdecode": "pyctcdecode>=0.4.0", 85 | "tqdm": "tqdm>=4.27", 86 | "unidic": "unidic>=1.0.2", 87 | "unidic_lite": "unidic_lite>=1.0.7", 88 | "urllib3": "urllib3<2.0.0", 89 | "uvicorn": "uvicorn", 90 | } 91 | -------------------------------------------------------------------------------- /transformers/file_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | File utilities: utilities related to download and cache models 16 | 17 | This module should not be update anymore and is only left for backward compatibility. 18 | """ 19 | 20 | from . 
import __version__ 21 | 22 | # Backward compatibility imports, to make sure all those objects can be found in file_utils 23 | from .utils import ( 24 | CLOUDFRONT_DISTRIB_PREFIX, 25 | CONFIG_NAME, 26 | DISABLE_TELEMETRY, 27 | DUMMY_INPUTS, 28 | DUMMY_MASK, 29 | ENV_VARS_TRUE_AND_AUTO_VALUES, 30 | ENV_VARS_TRUE_VALUES, 31 | FEATURE_EXTRACTOR_NAME, 32 | FLAX_WEIGHTS_NAME, 33 | HF_MODULES_CACHE, 34 | HUGGINGFACE_CO_PREFIX, 35 | HUGGINGFACE_CO_RESOLVE_ENDPOINT, 36 | MODEL_CARD_NAME, 37 | MULTIPLE_CHOICE_DUMMY_INPUTS, 38 | PYTORCH_PRETRAINED_BERT_CACHE, 39 | PYTORCH_TRANSFORMERS_CACHE, 40 | S3_BUCKET_PREFIX, 41 | SENTENCEPIECE_UNDERLINE, 42 | SPIECE_UNDERLINE, 43 | TF2_WEIGHTS_NAME, 44 | TF_WEIGHTS_NAME, 45 | TORCH_FX_REQUIRED_VERSION, 46 | TRANSFORMERS_CACHE, 47 | TRANSFORMERS_DYNAMIC_MODULE_NAME, 48 | USE_JAX, 49 | USE_TF, 50 | USE_TORCH, 51 | WEIGHTS_INDEX_NAME, 52 | WEIGHTS_NAME, 53 | ContextManagers, 54 | DummyObject, 55 | EntryNotFoundError, 56 | ExplicitEnum, 57 | ModelOutput, 58 | PaddingStrategy, 59 | PushToHubMixin, 60 | RepositoryNotFoundError, 61 | RevisionNotFoundError, 62 | TensorType, 63 | _LazyModule, 64 | add_code_sample_docstrings, 65 | add_end_docstrings, 66 | add_start_docstrings, 67 | add_start_docstrings_to_model_forward, 68 | cached_property, 69 | copy_func, 70 | default_cache_path, 71 | define_sagemaker_information, 72 | get_cached_models, 73 | get_file_from_repo, 74 | get_full_repo_name, 75 | has_file, 76 | http_user_agent, 77 | is_apex_available, 78 | is_bs4_available, 79 | is_coloredlogs_available, 80 | is_datasets_available, 81 | is_detectron2_available, 82 | is_faiss_available, 83 | is_flax_available, 84 | is_ftfy_available, 85 | is_in_notebook, 86 | is_ipex_available, 87 | is_librosa_available, 88 | is_offline_mode, 89 | is_onnx_available, 90 | is_pandas_available, 91 | is_phonemizer_available, 92 | is_protobuf_available, 93 | is_psutil_available, 94 | is_py3nvml_available, 95 | is_pyctcdecode_available, 96 | is_pytesseract_available, 97 | is_pytorch_quantization_available, 98 | is_rjieba_available, 99 | is_sagemaker_dp_enabled, 100 | is_sagemaker_mp_enabled, 101 | is_scipy_available, 102 | is_sentencepiece_available, 103 | is_sklearn_available, 104 | is_soundfile_availble, 105 | is_spacy_available, 106 | is_speech_available, 107 | is_tensor, 108 | is_tensorflow_probability_available, 109 | is_tf2onnx_available, 110 | is_tf_available, 111 | is_timm_available, 112 | is_tokenizers_available, 113 | is_torch_available, 114 | is_torch_bf16_available, 115 | is_torch_cuda_available, 116 | is_torch_fx_available, 117 | is_torch_fx_proxy, 118 | is_torch_tf32_available, 119 | is_torch_tpu_available, 120 | is_torchaudio_available, 121 | is_training_run_on_sagemaker, 122 | is_vision_available, 123 | replace_return_docstrings, 124 | requires_backends, 125 | to_numpy, 126 | to_py_obj, 127 | torch_only_method, 128 | torch_version, 129 | ) 130 | -------------------------------------------------------------------------------- /transformers/generation/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/generation/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/generation/__pycache__/beam_constraints.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/generation/__pycache__/beam_constraints.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/generation/__pycache__/beam_search.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/generation/__pycache__/beam_search.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/generation/__pycache__/configuration_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/generation/__pycache__/configuration_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/generation/__pycache__/logits_process.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/generation/__pycache__/logits_process.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/generation/__pycache__/stopping_criteria.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/generation/__pycache__/stopping_criteria.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/generation/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/generation/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/generation_flax_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google AI Flax Team Authors, and The HuggingFace Inc. team. 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import warnings 18 | 19 | from .generation import FlaxGenerationMixin 20 | 21 | 22 | class FlaxGenerationMixin(FlaxGenerationMixin): 23 | # warning at import time 24 | warnings.warn( 25 | "Importing `FlaxGenerationMixin` from `src/transformers/generation_flax_utils.py` is deprecated and will " 26 | "be removed in Transformers v5. 
Import as `from transformers import FlaxGenerationMixin` instead.", 27 | FutureWarning, 28 | ) 29 | -------------------------------------------------------------------------------- /transformers/generation_tf_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import warnings 18 | 19 | from .generation import TFGenerationMixin 20 | 21 | 22 | class TFGenerationMixin(TFGenerationMixin): 23 | # warning at import time 24 | warnings.warn( 25 | "Importing `TFGenerationMixin` from `src/transformers/generation_tf_utils.py` is deprecated and will " 26 | "be removed in Transformers v5. Import as `from transformers import TFGenerationMixin` instead.", 27 | FutureWarning, 28 | ) 29 | -------------------------------------------------------------------------------- /transformers/generation_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import warnings 18 | 19 | from .generation import GenerationMixin 20 | 21 | 22 | class GenerationMixin(GenerationMixin): 23 | # warning at import time 24 | warnings.warn( 25 | "Importing `GenerationMixin` from `src/transformers/generation_utils.py` is deprecated and will " 26 | "be removed in Transformers v5. 
Import as `from transformers import GenerationMixin` instead.", 27 | FutureWarning, 28 | ) 29 | -------------------------------------------------------------------------------- /transformers/models/auto/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/auto/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/auto/__pycache__/auto_factory.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/auto/__pycache__/auto_factory.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/auto/__pycache__/configuration_auto.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/auto/__pycache__/configuration_auto.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/auto/__pycache__/modeling_auto.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/auto/__pycache__/modeling_auto.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/clip/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/clip/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/clip/__pycache__/tokenization_clip.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/clip/__pycache__/tokenization_clip.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/clip/__pycache__/tokenization_clip_fast.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/clip/__pycache__/tokenization_clip_fast.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/clip/feature_extraction_clip.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Feature extractor class for CLIP.""" 16 | 17 | import warnings 18 | 19 | from ...utils import logging 20 | from .image_processing_clip import CLIPImageProcessor 21 | 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | 26 | class CLIPFeatureExtractor(CLIPImageProcessor): 27 | def __init__(self, *args, **kwargs) -> None: 28 | warnings.warn( 29 | "The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" 30 | " use CLIPImageProcessor instead.", 31 | FutureWarning, 32 | ) 33 | super().__init__(*args, **kwargs) 34 | -------------------------------------------------------------------------------- /transformers/models/clipseg/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
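The `_import_structure` / `_LazyModule` boilerplate in the `__init__.py` that follows (and in the other package inits further down) defers the heavy submodule imports until an attribute is first accessed; a rough sketch of the effect, assuming torch is installed:

import transformers.models.clipseg as clipseg  # cheap: only the lazy shim is registered

# First attribute access triggers the import of modeling_clipseg behind the scenes.
seg_model_cls = clipseg.CLIPSegForImageSegmentation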
14 | from typing import TYPE_CHECKING 15 | 16 | from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available 17 | 18 | 19 | _import_structure = { 20 | "configuration_clipseg": [ 21 | "CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP", 22 | "CLIPSegConfig", 23 | "CLIPSegTextConfig", 24 | "CLIPSegVisionConfig", 25 | ], 26 | "processing_clipseg": ["CLIPSegProcessor"], 27 | } 28 | 29 | try: 30 | if not is_torch_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["modeling_clipseg"] = [ 36 | "CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST", 37 | "CLIPSegModel", 38 | "CLIPSegPreTrainedModel", 39 | "CLIPSegTextModel", 40 | "CLIPSegVisionModel", 41 | "CLIPSegForImageSegmentation", 42 | ] 43 | 44 | if TYPE_CHECKING: 45 | from .configuration_clipseg import ( 46 | CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP, 47 | CLIPSegConfig, 48 | CLIPSegTextConfig, 49 | CLIPSegVisionConfig, 50 | ) 51 | from .processing_clipseg import CLIPSegProcessor 52 | 53 | try: 54 | if not is_torch_available(): 55 | raise OptionalDependencyNotAvailable() 56 | except OptionalDependencyNotAvailable: 57 | pass 58 | else: 59 | from .modeling_clipseg import ( 60 | CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST, 61 | CLIPSegForImageSegmentation, 62 | CLIPSegModel, 63 | CLIPSegPreTrainedModel, 64 | CLIPSegTextModel, 65 | CLIPSegVisionModel, 66 | ) 67 | 68 | else: 69 | import sys 70 | 71 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 72 | -------------------------------------------------------------------------------- /transformers/models/clipseg/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/clipseg/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/clipseg/__pycache__/configuration_clipseg.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/clipseg/__pycache__/configuration_clipseg.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/clipseg/__pycache__/modeling_clipseg.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/clipseg/__pycache__/modeling_clipseg.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/clipseg/__pycache__/processing_clipseg.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/clipseg/__pycache__/processing_clipseg.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/vit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from ...utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_flax_available, 20 | is_tf_available, 21 | is_torch_available, 22 | is_vision_available, 23 | ) 24 | 25 | 26 | _import_structure = {"configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig", "ViTOnnxConfig"]} 27 | 28 | try: 29 | if not is_vision_available(): 30 | raise OptionalDependencyNotAvailable() 31 | except OptionalDependencyNotAvailable: 32 | pass 33 | else: 34 | _import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"] 35 | _import_structure["image_processing_vit"] = ["ViTImageProcessor"] 36 | 37 | try: 38 | if not is_torch_available(): 39 | raise OptionalDependencyNotAvailable() 40 | except OptionalDependencyNotAvailable: 41 | pass 42 | else: 43 | _import_structure["modeling_vit"] = [ 44 | "VIT_PRETRAINED_MODEL_ARCHIVE_LIST", 45 | "ViTForImageClassification", 46 | "ViTForMaskedImageModeling", 47 | "ViTModel", 48 | "ViTPreTrainedModel", 49 | ] 50 | 51 | try: 52 | if not is_tf_available(): 53 | raise OptionalDependencyNotAvailable() 54 | except OptionalDependencyNotAvailable: 55 | pass 56 | else: 57 | _import_structure["modeling_tf_vit"] = [ 58 | "TFViTForImageClassification", 59 | "TFViTModel", 60 | "TFViTPreTrainedModel", 61 | ] 62 | 63 | try: 64 | if not is_flax_available(): 65 | raise OptionalDependencyNotAvailable() 66 | except OptionalDependencyNotAvailable: 67 | pass 68 | else: 69 | _import_structure["modeling_flax_vit"] = [ 70 | "FlaxViTForImageClassification", 71 | "FlaxViTModel", 72 | "FlaxViTPreTrainedModel", 73 | ] 74 | 75 | if TYPE_CHECKING: 76 | from .configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig, ViTOnnxConfig 77 | 78 | try: 79 | if not is_vision_available(): 80 | raise OptionalDependencyNotAvailable() 81 | except OptionalDependencyNotAvailable: 82 | pass 83 | else: 84 | from .feature_extraction_vit import ViTFeatureExtractor 85 | from .image_processing_vit import ViTImageProcessor 86 | 87 | try: 88 | if not is_torch_available(): 89 | raise OptionalDependencyNotAvailable() 90 | except OptionalDependencyNotAvailable: 91 | pass 92 | else: 93 | from .modeling_vit import ( 94 | VIT_PRETRAINED_MODEL_ARCHIVE_LIST, 95 | ViTForImageClassification, 96 | ViTForMaskedImageModeling, 97 | ViTModel, 98 | ViTPreTrainedModel, 99 | ) 100 | 101 | try: 102 | if not is_tf_available(): 103 | raise OptionalDependencyNotAvailable() 104 | except OptionalDependencyNotAvailable: 105 | pass 106 | else: 107 | from .modeling_tf_vit import TFViTForImageClassification, TFViTModel, TFViTPreTrainedModel 108 | 109 | try: 110 | if not is_flax_available(): 111 | raise OptionalDependencyNotAvailable() 112 | except OptionalDependencyNotAvailable: 113 | pass 114 | else: 115 | from .modeling_flax_vit import FlaxViTForImageClassification, FlaxViTModel, FlaxViTPreTrainedModel 116 | 117 | 118 | else: 119 | import sys 120 | 121 | 
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 122 | -------------------------------------------------------------------------------- /transformers/models/vit/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/vit/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/vit/__pycache__/image_processing_vit.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/models/vit/__pycache__/image_processing_vit.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/models/vit/feature_extraction_vit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Feature extractor class for ViT.""" 16 | 17 | import warnings 18 | 19 | from ...utils import logging 20 | from .image_processing_vit import ViTImageProcessor 21 | 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | 26 | class ViTFeatureExtractor(ViTImageProcessor): 27 | def __init__(self, *args, **kwargs) -> None: 28 | warnings.warn( 29 | "The class ViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" 30 | " use ViTImageProcessor instead.", 31 | FutureWarning, 32 | ) 33 | super().__init__(*args, **kwargs) 34 | -------------------------------------------------------------------------------- /transformers/onnx/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
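The `transformers.onnx` package defined next is the programmatic counterpart of the `python -m transformers.onnx` exporter; a condensed sketch of the documented flow (the checkpoint name and output path are only illustrative):

from pathlib import Path

from transformers import AutoModel, AutoTokenizer
from transformers.onnx import FeaturesManager, export, validate_model_outputs

ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt)

# Look up the OnnxConfig registered for this architecture/feature pair.
model_kind, config_ctor = FeaturesManager.check_supported_model_or_raise(model, feature="default")
onnx_config = config_ctor(model.config)

onnx_path = Path("model.onnx")
onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_path)
validate_model_outputs(onnx_config, tokenizer, model, onnx_path, onnx_outputs, onnx_config.atol_for_validation)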
14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from ..utils import _LazyModule 18 | 19 | 20 | _import_structure = { 21 | "config": [ 22 | "EXTERNAL_DATA_FORMAT_SIZE_LIMIT", 23 | "OnnxConfig", 24 | "OnnxConfigWithPast", 25 | "OnnxSeq2SeqConfigWithPast", 26 | "PatchingSpec", 27 | ], 28 | "convert": ["export", "validate_model_outputs"], 29 | "features": ["FeaturesManager"], 30 | "utils": ["ParameterFormat", "compute_serialized_parameters_size"], 31 | } 32 | 33 | 34 | if TYPE_CHECKING: 35 | from .config import ( 36 | EXTERNAL_DATA_FORMAT_SIZE_LIMIT, 37 | OnnxConfig, 38 | OnnxConfigWithPast, 39 | OnnxSeq2SeqConfigWithPast, 40 | PatchingSpec, 41 | ) 42 | from .convert import export, validate_model_outputs 43 | from .features import FeaturesManager 44 | from .utils import ParameterFormat, compute_serialized_parameters_size 45 | 46 | else: 47 | import sys 48 | 49 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 50 | -------------------------------------------------------------------------------- /transformers/onnx/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from ctypes import c_float, sizeof 16 | from enum import Enum 17 | from typing import TYPE_CHECKING, Optional, Union 18 | 19 | 20 | if TYPE_CHECKING: 21 | from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer # tests_ignore 22 | 23 | 24 | class ParameterFormat(Enum): 25 | Float = c_float 26 | 27 | @property 28 | def size(self) -> int: 29 | """ 30 | Number of byte required for this data type 31 | 32 | Returns: 33 | Integer > 0 34 | """ 35 | return sizeof(self.value) 36 | 37 | 38 | def compute_effective_axis_dimension(dimension: int, fixed_dimension: int, num_token_to_add: int = 0) -> int: 39 | """ 40 | 41 | Args: 42 | dimension: 43 | fixed_dimension: 44 | num_token_to_add: 45 | 46 | Returns: 47 | 48 | """ 49 | # < 0 is possible if using a dynamic axis 50 | if dimension <= 0: 51 | dimension = fixed_dimension 52 | 53 | dimension -= num_token_to_add 54 | return dimension 55 | 56 | 57 | def compute_serialized_parameters_size(num_parameters: int, dtype: ParameterFormat) -> int: 58 | """ 59 | Compute the size taken by all the parameters in the given the storage format when serializing the model 60 | 61 | Args: 62 | num_parameters: Number of parameters to be saved 63 | dtype: The data format each parameter will be saved 64 | 65 | Returns: 66 | Size (in byte) taken to save all the parameters 67 | """ 68 | return num_parameters * dtype.size 69 | 70 | 71 | def get_preprocessor(model_name: str) -> Optional[Union["AutoTokenizer", "AutoFeatureExtractor", "AutoProcessor"]]: 72 | """ 73 | Gets a preprocessor (tokenizer, feature extractor or processor) that is available for `model_name`. 
74 | 75 | Args: 76 | model_name (`str`): Name of the model for which a preprocessor are loaded. 77 | 78 | Returns: 79 | `Optional[Union[AutoTokenizer, AutoFeatureExtractor, AutoProcessor]]`: 80 | If a processor is found, it is returned. Otherwise, if a tokenizer or a feature extractor exists, it is 81 | returned. If both a tokenizer and a feature extractor exist, an error is raised. The function returns 82 | `None` if no preprocessor is found. 83 | """ 84 | # Avoid circular imports by only importing this here. 85 | from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer # tests_ignore 86 | 87 | try: 88 | return AutoProcessor.from_pretrained(model_name) 89 | except (ValueError, OSError, KeyError): 90 | tokenizer, feature_extractor = None, None 91 | try: 92 | tokenizer = AutoTokenizer.from_pretrained(model_name) 93 | except (OSError, KeyError): 94 | pass 95 | try: 96 | feature_extractor = AutoFeatureExtractor.from_pretrained(model_name) 97 | except (OSError, KeyError): 98 | pass 99 | 100 | if tokenizer is not None and feature_extractor is not None: 101 | raise ValueError( 102 | f"Couldn't auto-detect preprocessor for {model_name}. Found both a tokenizer and a feature extractor." 103 | ) 104 | elif tokenizer is None and feature_extractor is None: 105 | return None 106 | elif tokenizer is not None: 107 | return tokenizer 108 | else: 109 | return feature_extractor 110 | -------------------------------------------------------------------------------- /transformers/sagemaker/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .trainer_sm import SageMakerTrainer 16 | from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_dp_enabled 17 | -------------------------------------------------------------------------------- /transformers/sagemaker/trainer_sm.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import warnings 15 | 16 | from ..trainer import Trainer 17 | from ..utils import logging 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | class SageMakerTrainer(Trainer): 24 | def __init__(self, args=None, **kwargs): 25 | warnings.warn( 26 | "`SageMakerTrainer` is deprecated and will be removed in v5 of Transformers. You can use `Trainer` " 27 | "instead.", 28 | FutureWarning, 29 | ) 30 | super().__init__(args=args, **kwargs) 31 | -------------------------------------------------------------------------------- /transformers/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/constants.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/constants.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/doc.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/doc.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/dummy_flax_objects.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/dummy_flax_objects.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/dummy_keras_nlp_objects.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/dummy_keras_nlp_objects.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/dummy_sentencepiece_and_tokenizers_objects.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/dummy_sentencepiece_and_tokenizers_objects.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/dummy_speech_objects.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/dummy_speech_objects.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/dummy_tensorflow_text_objects.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/dummy_tensorflow_text_objects.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/dummy_tf_objects.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/dummy_tf_objects.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/dummy_tokenizers_objects.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/dummy_tokenizers_objects.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/generic.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/generic.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/hub.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/hub.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/import_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/import_utils.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/logging.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/logging.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/quantization_config.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/quantization_config.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/__pycache__/versions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenRobotLab/OV_PARTS/939b1a56e9d42ff8b631b0aa46ee757e06d444af/transformers/utils/__pycache__/versions.cpython-38.pyc -------------------------------------------------------------------------------- /transformers/utils/constants.py: -------------------------------------------------------------------------------- 1 | IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] 2 | IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] 3 | IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5] 4 | 
--------------------------------------------------------------------------------
/transformers/utils/dummy_detectron2_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import requires_backends
3 | 
4 | 
5 | LAYOUTLM_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None
6 | 
7 | 
8 | class LayoutLMv2Model:
9 |     def __init__(self, *args, **kwargs):
10 |         requires_backends(self, ["detectron2"])
11 | 
12 |     @classmethod
13 |     def from_pretrained(cls, *args, **kwargs):
14 |         requires_backends(cls, ["detectron2"])
15 | 
--------------------------------------------------------------------------------
/transformers/utils/dummy_keras_nlp_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 | 
4 | 
5 | class TFGPT2Tokenizer(metaclass=DummyObject):
6 |     _backends = ["keras_nlp"]
7 | 
8 |     def __init__(self, *args, **kwargs):
9 |         requires_backends(self, ["keras_nlp"])
10 | 
--------------------------------------------------------------------------------
/transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 | 
4 | 
5 | SLOW_TO_FAST_CONVERTERS = None
6 | 
7 | 
8 | def convert_slow_tokenizer(*args, **kwargs):
9 |     requires_backends(convert_slow_tokenizer, ["sentencepiece", "tokenizers"])
10 | 
--------------------------------------------------------------------------------
/transformers/utils/dummy_speech_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 | 
4 | 
5 | class ASTFeatureExtractor(metaclass=DummyObject):
6 |     _backends = ["speech"]
7 | 
8 |     def __init__(self, *args, **kwargs):
9 |         requires_backends(self, ["speech"])
10 | 
11 | 
12 | class MCTCTFeatureExtractor(metaclass=DummyObject):
13 |     _backends = ["speech"]
14 | 
15 |     def __init__(self, *args, **kwargs):
16 |         requires_backends(self, ["speech"])
17 | 
18 | 
19 | class Speech2TextFeatureExtractor(metaclass=DummyObject):
20 |     _backends = ["speech"]
21 | 
22 |     def __init__(self, *args, **kwargs):
23 |         requires_backends(self, ["speech"])
24 | 
25 | 
26 | class SpeechT5FeatureExtractor(metaclass=DummyObject):
27 |     _backends = ["speech"]
28 | 
29 |     def __init__(self, *args, **kwargs):
30 |         requires_backends(self, ["speech"])
31 | 
32 | 
33 | class TvltFeatureExtractor(metaclass=DummyObject):
34 |     _backends = ["speech"]
35 | 
36 |     def __init__(self, *args, **kwargs):
37 |         requires_backends(self, ["speech"])
38 | 
--------------------------------------------------------------------------------
/transformers/utils/dummy_tensorflow_text_objects.py:
--------------------------------------------------------------------------------
1 | # This file is autogenerated by the command `make fix-copies`, do not edit.
2 | from ..utils import DummyObject, requires_backends
3 | 
4 | 
5 | class TFBertTokenizer(metaclass=DummyObject):
6 |     _backends = ["tensorflow_text"]
7 | 
8 |     def __init__(self, *args, **kwargs):
9 |         requires_backends(self, ["tensorflow_text"])
10 | 
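All of the `dummy_*_objects.py` modules follow one pattern: when an optional backend is not installed, the real class is replaced by a placeholder whose constructor (and, through the `DummyObject` metaclass, any public attribute access such as `from_pretrained`) calls `requires_backends`, so importing the name always succeeds and a clear `ImportError` is raised only when the object is actually used. The sketch below is a reduced, self-contained reimplementation of that mechanism for illustration only; the library's actual `DummyObject` and `requires_backends` live in `transformers/utils` and differ in detail:

import importlib.util


def requires_backends(obj, backends):
    """Raise an informative ImportError naming every backend that cannot be imported."""
    name = getattr(obj, "__name__", type(obj).__name__)
    missing = [b for b in backends if importlib.util.find_spec(b) is None]
    if missing:
        raise ImportError(f"{name} requires backend(s) that are not installed: {', '.join(missing)}")


class DummyObject(type):
    """Metaclass: any public attribute access on the placeholder class re-checks the backends."""

    def __getattribute__(cls, key):
        if key.startswith("_"):
            return super().__getattribute__(key)
        requires_backends(cls, cls._backends)


class TFBertTokenizer(metaclass=DummyObject):
    _backends = ["tensorflow_text"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tensorflow_text"])


# In an environment without `tensorflow_text`, both instantiation and classmethod-style
# access fail with a readable ImportError instead of an obscure failure later on.
if importlib.util.find_spec("tensorflow_text") is None:
    try:
        TFBertTokenizer()
    except ImportError as err:
        print(err)
    try:
        TFBertTokenizer.from_pretrained("bert-base-uncased")
    except ImportError as err:
        print(err)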
--------------------------------------------------------------------------------
/transformers/utils/model_parallel_utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2020 The HuggingFace Team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | from math import ceil
17 | 
18 | 
19 | def assert_device_map(device_map, num_blocks):
20 |     blocks = list(range(0, num_blocks))
21 | 
22 |     device_map_blocks = [item for sublist in list(device_map.values()) for item in sublist]
23 | 
24 |     # Duplicate check
25 |     duplicate_blocks = []
26 |     for i in device_map_blocks:
27 |         if device_map_blocks.count(i) > 1 and i not in duplicate_blocks:
28 |             duplicate_blocks.append(i)
29 |     # Missing blocks
30 |     missing_blocks = [i for i in blocks if i not in device_map_blocks]
31 |     extra_blocks = [i for i in device_map_blocks if i not in blocks]
32 | 
33 |     if len(duplicate_blocks) != 0:
34 |         raise ValueError(
35 |             "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device."
36 |             " These attention blocks were specified more than once: " + str(duplicate_blocks)
37 |         )
38 |     if len(missing_blocks) != 0:
39 |         raise ValueError(
40 |             "There are attention blocks for this model that are not specified in the device_map. Add these attention "
41 |             "blocks to a device on the device_map: " + str(missing_blocks)
42 |         )
43 |     if len(extra_blocks) != 0:
44 |         raise ValueError(
45 |             "The device_map contains more attention blocks than this model has. Remove these from the device_map:"
46 |             + str(extra_blocks)
47 |         )
48 | 
49 | 
50 | def get_device_map(n_layers, devices):
51 |     """Returns a dictionary of layers distributed evenly across all devices."""
52 |     layers = list(range(n_layers))
53 |     n_blocks = int(ceil(n_layers / len(devices)))
54 |     layers_list = [layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)]
55 | 
56 |     return dict(zip(devices, layers_list))
57 | 
--------------------------------------------------------------------------------
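`get_device_map` chunks the layer indices into contiguous blocks of `ceil(n_layers / len(devices))`, so the last device may receive fewer layers, and `assert_device_map` checks that a user-supplied map covers every block exactly once (no duplicates, no gaps, no out-of-range indices). A short usage sketch, assuming the vendored `transformers/` directory at the repository root (or an installed copy providing the same module path) is importable:

from transformers.utils.model_parallel_utils import assert_device_map, get_device_map

# 12 layers over 2 devices -> contiguous blocks of ceil(12 / 2) = 6 layers each.
device_map = get_device_map(n_layers=12, devices=[0, 1])
print(device_map)  # {0: [0, 1, 2, 3, 4, 5], 1: [6, 7, 8, 9, 10, 11]}

# A valid map passes silently; num_blocks is the model's total layer count.
assert_device_map(device_map, num_blocks=12)

# An uneven split simply leaves the remainder on the last device: 10 layers over
# 3 devices gives blocks of 4, 4 and 2.
print(get_device_map(n_layers=10, devices=[0, 1, 2]))  # {0: [0, 1, 2, 3], 1: [4, 5, 6, 7], 2: [8, 9]}

# Dropping a block makes assert_device_map raise a ValueError naming the missing index.
bad_map = {0: [0, 1, 2], 1: [4, 5, 6, 7, 8, 9, 10, 11]}
try:
    assert_device_map(bad_map, num_blocks=12)
except ValueError as err:
    print(err)  # reports that block 3 is not assigned to any device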