├── .gitignore ├── LICENSE ├── README.md ├── assets ├── comparison.png └── framework.png ├── causal-conv1d ├── AUTHORS ├── LICENSE ├── README.md ├── causal_conv1d │ ├── __init__.py │ └── causal_conv1d_interface.py ├── csrc │ ├── causal_conv1d.cpp │ ├── causal_conv1d.h │ ├── causal_conv1d_bwd.cu │ ├── causal_conv1d_common.h │ ├── causal_conv1d_fwd.cu │ ├── causal_conv1d_update.cu │ └── static_switch.h ├── setup.py └── tests │ └── test_causal_conv1d.py ├── mamba ├── .gitmodules ├── AUTHORS ├── LICENSE ├── README.md ├── assets │ └── selection.png ├── benchmarks │ └── benchmark_generation_mamba_simple.py ├── csrc │ └── selective_scan │ │ ├── reverse_scan.cuh │ │ ├── selective_scan.cpp │ │ ├── selective_scan.h │ │ ├── selective_scan_bwd_bf16_complex.cu │ │ ├── selective_scan_bwd_bf16_real.cu │ │ ├── selective_scan_bwd_fp16_complex.cu │ │ ├── selective_scan_bwd_fp16_real.cu │ │ ├── selective_scan_bwd_fp32_complex.cu │ │ ├── selective_scan_bwd_fp32_real.cu │ │ ├── selective_scan_bwd_kernel.cuh │ │ ├── selective_scan_common.h │ │ ├── selective_scan_fwd_bf16.cu │ │ ├── selective_scan_fwd_fp16.cu │ │ ├── selective_scan_fwd_fp32.cu │ │ ├── selective_scan_fwd_kernel.cuh │ │ ├── static_switch.h │ │ └── uninitialized_copy.cuh ├── evals │ └── lm_harness_eval.py ├── mamba_ssm │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ └── mixer_seq_simple.py │ ├── modules │ │ ├── __init__.py │ │ └── mamba_simple.py │ ├── ops │ │ ├── __init__.py │ │ ├── selective_scan_interface.py │ │ └── triton │ │ │ ├── __init__.py │ │ │ ├── layernorm.py │ │ │ └── selective_state_update.py │ └── utils │ │ ├── __init__.py │ │ ├── generation.py │ │ └── hf.py ├── setup.py ├── test_mamba_module.py └── tests │ └── ops │ ├── test_selective_scan.py │ └── triton │ └── test_selective_state_update.py ├── requirements.txt └── videomamba ├── README.md ├── image_sm ├── .gitignore ├── MODEL_ZOO.md ├── README.md ├── augment.py ├── datasets.py ├── engine.py ├── engine_distill.py ├── exp │ ├── videomamba_small │ │ ├── run224.sh │ │ ├── run448.sh │ │ └── run576.sh │ └── videomamba_tiny │ │ ├── run224.sh │ │ ├── run448.sh │ │ └── run576.sh ├── exp_distill │ ├── videomamba_base │ │ └── run224.sh │ └── videomamba_middle │ │ ├── run224.sh │ │ ├── run448.sh │ │ └── run576.sh ├── generate_tensorboard.py ├── generate_tensorboard_distill.py ├── hubconf.py ├── imagenet_dataset.py ├── losses.py ├── main.py ├── main_distill.py ├── models │ ├── __init__.py │ ├── deit.py │ ├── videomamba.py │ └── videomamba_distill.py ├── run_with_submitit.py ├── run_with_submitit_distill.py ├── samplers.py └── utils.py ├── video_mm ├── DATASET.md ├── MODEL_ZOO.md ├── README.md ├── configs │ ├── beit-base-patch16-224-pt22k-ft22k.json │ ├── config_bert.json │ ├── config_bert_large.json │ ├── data.py │ ├── model.py │ ├── pretrain.py │ ├── qa.py │ ├── qa_anet.py │ ├── qa_msrvtt.py │ ├── ret_anet.py │ ├── ret_coco.py │ ├── ret_didemo.py │ ├── ret_flickr.py │ ├── ret_msrvtt.py │ ├── ret_msrvtt_9k.py │ ├── ret_msrvtt_mc.py │ ├── ret_ssv2_label.py │ └── ret_ssv2_template.py ├── dataset │ ├── __init__.py │ ├── base_dataset.py │ ├── caption_dataset.py │ ├── dataloader.py │ ├── qa_dataset.py │ ├── sqlite_dataset.py │ ├── text_prompt.py │ ├── utils.py │ └── video_utils.py ├── exp_pt │ ├── videomamba_middle_17m │ │ ├── config.py │ │ └── run.sh │ ├── videomamba_middle_17m_unmasked │ │ ├── config.py │ │ └── run.sh │ ├── videomamba_middle_25m │ │ ├── config.py │ │ └── run.sh │ ├── videomamba_middle_25m_unmasked │ │ ├── config.py │ │ └── run.sh │ ├── videomamba_middle_5m │ │ ├── config.py 
│ │ └── run.sh │ └── videomamba_middle_5m_unmasked │ │ ├── config.py │ │ └── run.sh ├── exp_zs │ ├── anet │ │ ├── config.py │ │ └── run.sh │ ├── didemo │ │ ├── config.py │ │ └── run.sh │ ├── lsmdc │ │ ├── config.py │ │ └── run.sh │ ├── msrvtt │ │ ├── config.py │ │ └── run.sh │ └── msvd │ │ ├── config.py │ │ └── run.sh ├── models │ ├── __init__.py │ ├── backbones │ │ ├── __init__.py │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── tokenization_bert.py │ │ │ ├── tokenization_bert2.py │ │ │ └── xbert.py │ │ ├── clip │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── clip_text.py │ │ │ ├── tokenizer.py │ │ │ └── transformer.py │ │ ├── videomamba │ │ │ ├── __init__.py │ │ │ ├── clip.py │ │ │ └── videomamba.py │ │ └── vit │ │ │ ├── __init__.py │ │ │ ├── clip.py │ │ │ └── vit.py │ ├── criterions.py │ ├── mask.py │ ├── umt.py │ ├── umt_qa.py │ ├── umt_videomamba.py │ └── utils.py ├── tasks │ ├── pretrain.py │ ├── retrieval.py │ ├── retrieval_mc.py │ ├── retrieval_utils.py │ ├── shared_utils.py │ ├── vqa.py │ └── vqa_utils.py ├── torchrun.sh └── utils │ ├── basic_utils.py │ ├── config.py │ ├── config_utils.py │ ├── distributed.py │ ├── easydict.py │ ├── logger.py │ ├── optimizer.py │ └── scheduler.py └── video_sm ├── DATASET.md ├── MODEL_ZOO.md ├── README.md ├── datasets ├── __init__.py ├── build.py ├── kinetics.py ├── kinetics_sparse.py ├── lvu.py ├── mae.py ├── masking_generator.py ├── mixup.py ├── rand_augment.py ├── random_erasing.py ├── ssv2.py ├── transforms.py ├── video_transforms.py └── volume_transforms.py ├── engines ├── __init__.py ├── engine_for_finetuning.py ├── engine_for_finetuning_regression.py ├── engine_for_pretraining.py ├── engine_for_pretraining_umt.py └── engine_for_pretraining_videomamba.py ├── exp ├── breakfast │ ├── videomamba_middle │ │ ├── run_f32x224.sh │ │ └── run_f64x224.sh │ ├── videomamba_middle_mask │ │ ├── run_f32x224.sh │ │ └── run_f64x224.sh │ ├── videomamba_small │ │ ├── run_f32x224.sh │ │ └── run_f64x224.sh │ └── videomamba_tiny │ │ ├── run_f32x224.sh │ │ └── run_f64x224.sh ├── coin │ ├── videomamba_middle │ │ ├── run_f32x224.sh │ │ └── run_f64x224.sh │ ├── videomamba_middle_mask │ │ ├── run_f32x224.sh │ │ └── run_f64x224.sh │ ├── videomamba_small │ │ ├── run_f32x224.sh │ │ └── run_f64x224.sh │ └── videomamba_tiny │ │ ├── run_f32x224.sh │ │ └── run_f64x224.sh ├── k400 │ ├── videomamba_middle │ │ ├── run_f16x224.sh │ │ ├── run_f32x224.sh │ │ ├── run_f64x224.sh │ │ ├── run_f64x224to384.sh │ │ └── run_f8x224.sh │ ├── videomamba_middle_mask │ │ ├── run_f16x224.sh │ │ ├── run_f32x224.sh │ │ ├── run_f64x224.sh │ │ ├── run_f64x224to384.sh │ │ ├── run_f8x224.sh │ │ └── run_mask_pretrain.sh │ ├── videomamba_small │ │ ├── run_f16x224.sh │ │ ├── run_f32x224.sh │ │ ├── run_f64x224.sh │ │ ├── run_f64x224to384.sh │ │ └── run_f8x224.sh │ └── videomamba_tiny │ │ ├── run_f16x224.sh │ │ ├── run_f32x224.sh │ │ ├── run_f64x224.sh │ │ ├── run_f64x224to384.sh │ │ └── run_f8x224.sh ├── lvu │ ├── run_class.sh │ ├── run_class_trim.sh │ ├── run_regression.sh │ └── run_regression_trim.sh └── ssv2 │ ├── videomamba_middle │ ├── run_f16x224.sh │ ├── run_f16x224to288.sh │ └── run_f8x224.sh │ ├── videomamba_middle_mask │ ├── run_f16x224.sh │ ├── run_f16x224to288.sh │ ├── run_f8x224.sh │ └── run_mask_pretrain.sh │ ├── videomamba_small │ ├── run_f16x224.sh │ ├── run_f16x224to288.sh │ └── run_f8x224.sh │ └── videomamba_tiny │ ├── run_f16x224.sh │ ├── run_f16x224to288.sh │ └── run_f8x224.sh ├── functional.py ├── models ├── __init__.py ├── clip.py ├── deit.py ├── extract_clip │ └── 
extract.ipynb ├── modeling_finetune.py ├── modeling_pretrain.py ├── modeling_pretrain_umt.py ├── speed_test.py ├── videomamba.py └── videomamba_pretrain.py ├── optim_factory.py ├── run_class_finetuning.py ├── run_mae_pretraining.py ├── run_regression_finetuning.py ├── run_umt_pretraining.py ├── run_videomamba_pretraining.py └── utils.py /assets/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/assets/comparison.png -------------------------------------------------------------------------------- /assets/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/assets/framework.png -------------------------------------------------------------------------------- /causal-conv1d/AUTHORS: -------------------------------------------------------------------------------- 1 | Tri Dao, tri@tridao.me 2 | -------------------------------------------------------------------------------- /causal-conv1d/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /causal-conv1d/README.md: -------------------------------------------------------------------------------- 1 | # Causal depthwise conv1d in CUDA with a PyTorch interface 2 | -------------------------------------------------------------------------------- /causal-conv1d/causal_conv1d/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.0" 2 | 3 | from causal_conv1d.causal_conv1d_interface import causal_conv1d_fn, causal_conv1d_update 4 | -------------------------------------------------------------------------------- /causal-conv1d/csrc/causal_conv1d.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 3 | ******************************************************************************/ 4 | 5 | #pragma once 6 | 7 | //////////////////////////////////////////////////////////////////////////////////////////////////// 8 | 9 | struct ConvParamsBase { 10 | using index_t = uint32_t; 11 | 12 | int batch, dim, seqlen, width; 13 | bool silu_activation; 14 | 15 | index_t x_batch_stride; 16 | index_t x_c_stride; 17 | index_t x_l_stride; 18 | index_t weight_c_stride; 19 | index_t weight_width_stride; 20 | index_t out_batch_stride; 21 | index_t out_c_stride; 22 | index_t out_l_stride; 23 | 24 | index_t conv_state_batch_stride; 25 | index_t conv_state_c_stride; 26 | index_t conv_state_l_stride; 27 | 28 | // Common data pointers. 29 | void *__restrict__ x_ptr; 30 | void *__restrict__ weight_ptr; 31 | void *__restrict__ bias_ptr; 32 | void *__restrict__ out_ptr; 33 | 34 | void *__restrict__ conv_state_ptr; 35 | }; 36 | 37 | struct ConvParamsBwd: public ConvParamsBase { 38 | index_t dx_batch_stride; 39 | index_t dx_c_stride; 40 | index_t dx_l_stride; 41 | index_t dweight_c_stride; 42 | index_t dweight_width_stride; 43 | index_t dout_batch_stride; 44 | index_t dout_c_stride; 45 | index_t dout_l_stride; 46 | 47 | // Common data pointers. 48 | void *__restrict__ dx_ptr; 49 | void *__restrict__ dweight_ptr; 50 | void *__restrict__ dbias_ptr; 51 | void *__restrict__ dout_ptr; 52 | }; 53 | 54 | -------------------------------------------------------------------------------- /causal-conv1d/csrc/causal_conv1d_common.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 
3 | ******************************************************************************/ 4 | 5 | #pragma once 6 | 7 | #include <cuda_bf16.h> 8 | #include <cuda_fp16.h> 9 | 10 | //////////////////////////////////////////////////////////////////////////////////////////////////// 11 | 12 | template<int BYTES> struct BytesToType {}; 13 | 14 | template<> struct BytesToType<16> { 15 | using Type = uint4; 16 | static_assert(sizeof(Type) == 16); 17 | }; 18 | 19 | template<> struct BytesToType<8> { 20 | using Type = uint64_t; 21 | static_assert(sizeof(Type) == 8); 22 | }; 23 | 24 | template<> struct BytesToType<4> { 25 | using Type = uint32_t; 26 | static_assert(sizeof(Type) == 4); 27 | }; 28 | 29 | template<> struct BytesToType<2> { 30 | using Type = uint16_t; 31 | static_assert(sizeof(Type) == 2); 32 | }; 33 | 34 | template<> struct BytesToType<1> { 35 | using Type = uint8_t; 36 | static_assert(sizeof(Type) == 1); 37 | }; 38 | 39 | //////////////////////////////////////////////////////////////////////////////////////////////////// 40 | 41 | template<typename T> 42 | struct SumOp { 43 | __device__ inline T operator()(T const & x, T const & y) { return x + y; } 44 | }; 45 | 46 | template<int THREADS> 47 | struct Allreduce { 48 | static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4); 49 | template<typename T, typename Operator> 50 | static __device__ inline T run(T x, Operator &op) { 51 | constexpr int OFFSET = THREADS / 2; 52 | x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET)); 53 | return Allreduce<OFFSET>::run(x, op); 54 | } 55 | }; 56 | 57 | template<> 58 | struct Allreduce<2> { 59 | template<typename T, typename Operator> 60 | static __device__ inline T run(T x, Operator &op) { 61 | x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1)); 62 | return x; 63 | } 64 | }; 65 | -------------------------------------------------------------------------------- /causal-conv1d/csrc/static_switch.h: -------------------------------------------------------------------------------- 1 | // Inspired by https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h 2 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h 3 | 4 | #pragma once 5 | 6 | /// @param COND - a boolean expression to switch by 7 | /// @param CONST_NAME - a name given for the constexpr bool variable. 8 | /// @param ... - code to execute for true and false 9 | /// 10 | /// Usage: 11 | /// ``` 12 | /// BOOL_SWITCH(flag, BoolConst, [&] { 13 | /// some_function(...); 14 | /// }); 15 | /// ``` 16 | #define BOOL_SWITCH(COND, CONST_NAME, ...) 
\ 17 | [&] { \ 18 | if (COND) { \ 19 | static constexpr bool CONST_NAME = true; \ 20 | return __VA_ARGS__(); \ 21 | } else { \ 22 | static constexpr bool CONST_NAME = false; \ 23 | return __VA_ARGS__(); \ 24 | } \ 25 | }() 26 | -------------------------------------------------------------------------------- /mamba/.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/lm-evaluation-harness"] 2 | path = 3rdparty/lm-evaluation-harness 3 | url = https://github.com/EleutherAI/lm-evaluation-harness/ 4 | -------------------------------------------------------------------------------- /mamba/AUTHORS: -------------------------------------------------------------------------------- 1 | Tri Dao, tri@tridao.me 2 | Albert Gu, agu@andrew.cmu.edu 3 | -------------------------------------------------------------------------------- /mamba/assets/selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/mamba/assets/selection.png -------------------------------------------------------------------------------- /mamba/benchmarks/benchmark_generation_mamba_simple.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Tri Dao, Albert Gu. 2 | 3 | import argparse 4 | import time 5 | import json 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | from einops import rearrange 11 | 12 | from transformers import AutoTokenizer, AutoModelForCausalLM 13 | 14 | from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel 15 | 16 | 17 | parser = argparse.ArgumentParser(description="Generation benchmarking") 18 | parser.add_argument("--model-name", type=str, default="state-spaces/mamba-130m") 19 | parser.add_argument("--prompt", type=str, default=None) 20 | parser.add_argument("--promptlen", type=int, default=100) 21 | parser.add_argument("--genlen", type=int, default=100) 22 | parser.add_argument("--temperature", type=float, default=1.0) 23 | parser.add_argument("--topk", type=int, default=1) 24 | parser.add_argument("--topp", type=float, default=1.0) 25 | parser.add_argument("--batch", type=int, default=1) 26 | args = parser.parse_args() 27 | 28 | repeats = 3 29 | device = "cuda" 30 | dtype = torch.float16 31 | 32 | print(f"Loading model {args.model_name}") 33 | is_mamba = args.model_name.startswith("state-spaces/mamba-") or "mamba" in args.model_name 34 | 35 | if is_mamba: 36 | tokenizer = AutoTokenizer.from_pretrained("/home/zhulianghui/VisionProjects/mamba/ckpts/gpt-neox-20b-tokenizer") 37 | model = MambaLMHeadModel.from_pretrained(args.model_name, device=device, dtype=dtype) 38 | else: 39 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 40 | model = AutoModelForCausalLM.from_pretrained(args.model_name, device_map={"": device}, torch_dtype=dtype) 41 | model.eval() 42 | print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}") 43 | 44 | torch.random.manual_seed(0) 45 | if args.prompt is None: 46 | input_ids = torch.randint(1, 1000, (args.batch, args.promptlen), dtype=torch.long, device="cuda") 47 | attn_mask = torch.ones_like(input_ids, dtype=torch.long, device="cuda") 48 | else: 49 | tokens = tokenizer(args.prompt, return_tensors="pt") 50 | input_ids = tokens.input_ids.to(device=device) 51 | attn_mask = tokens.attention_mask.to(device=device) 52 | max_length = input_ids.shape[1] + args.genlen 53 | 54 | 
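# The two branches below build equivalent zero-argument callables so that the
# timing loop further down can treat both model families uniformly: for Mamba,
# cg=True asks mamba_ssm's generation utilities to capture the single-token
# decoding step with a CUDA graph, while the HF branch goes through the
# standard transformers generate() API with the same sampling controls.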
if is_mamba: 55 | fn = lambda: model.generate( 56 | input_ids=input_ids, 57 | max_length=max_length, 58 | cg=True, 59 | return_dict_in_generate=True, 60 | output_scores=True, 61 | enable_timing=False, 62 | temperature=args.temperature, 63 | top_k=args.topk, 64 | top_p=args.topp, 65 | ) 66 | else: 67 | fn = lambda: model.generate( 68 | input_ids=input_ids, 69 | attention_mask=attn_mask, 70 | max_length=max_length, 71 | return_dict_in_generate=True, 72 | pad_token_id=tokenizer.eos_token_id, 73 | do_sample=True, 74 | temperature=args.temperature, 75 | top_k=args.topk, 76 | top_p=args.topp, 77 | ) 78 | out = fn() 79 | if args.prompt is not None: 80 | print(tokenizer.batch_decode(out.sequences.tolist())) 81 | 82 | torch.cuda.synchronize() 83 | start = time.time() 84 | for _ in range(repeats): 85 | fn() 86 | torch.cuda.synchronize() 87 | print(f"Prompt length: {len(input_ids[0])}, generation length: {len(out.sequences[0]) - len(input_ids[0])}") 88 | print(f"{args.model_name} prompt processing + decoding time: {(time.time() - start) / repeats * 1000:.0f}ms") 89 | -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 3 | ******************************************************************************/ 4 | 5 | #pragma once 6 | 7 | //////////////////////////////////////////////////////////////////////////////////////////////////// 8 | 9 | struct SSMScanParamsBase { 10 | using index_t = uint32_t; 11 | 12 | int batch, seqlen, n_chunks; 13 | index_t a_batch_stride; 14 | index_t b_batch_stride; 15 | index_t out_batch_stride; 16 | 17 | // Common data pointers. 18 | void *__restrict__ a_ptr; 19 | void *__restrict__ b_ptr; 20 | void *__restrict__ out_ptr; 21 | void *__restrict__ x_ptr; 22 | }; 23 | 24 | //////////////////////////////////////////////////////////////////////////////////////////////////// 25 | 26 | struct SSMParamsBase { 27 | using index_t = uint32_t; 28 | 29 | int batch, dim, seqlen, dstate, n_groups, n_chunks; 30 | int dim_ngroups_ratio; 31 | bool is_variable_B; 32 | bool is_variable_C; 33 | 34 | bool delta_softplus; 35 | 36 | index_t A_d_stride; 37 | index_t A_dstate_stride; 38 | index_t B_batch_stride; 39 | index_t B_d_stride; 40 | index_t B_dstate_stride; 41 | index_t B_group_stride; 42 | index_t C_batch_stride; 43 | index_t C_d_stride; 44 | index_t C_dstate_stride; 45 | index_t C_group_stride; 46 | index_t u_batch_stride; 47 | index_t u_d_stride; 48 | index_t delta_batch_stride; 49 | index_t delta_d_stride; 50 | index_t z_batch_stride; 51 | index_t z_d_stride; 52 | index_t out_batch_stride; 53 | index_t out_d_stride; 54 | index_t out_z_batch_stride; 55 | index_t out_z_d_stride; 56 | 57 | // Common data pointers. 
58 | void *__restrict__ A_ptr; 59 | void *__restrict__ B_ptr; 60 | void *__restrict__ C_ptr; 61 | void *__restrict__ D_ptr; 62 | void *__restrict__ u_ptr; 63 | void *__restrict__ delta_ptr; 64 | void *__restrict__ delta_bias_ptr; 65 | void *__restrict__ out_ptr; 66 | void *__restrict__ x_ptr; 67 | void *__restrict__ z_ptr; 68 | void *__restrict__ out_z_ptr; 69 | }; 70 | 71 | struct SSMParamsBwd: public SSMParamsBase { 72 | index_t dout_batch_stride; 73 | index_t dout_d_stride; 74 | index_t dA_d_stride; 75 | index_t dA_dstate_stride; 76 | index_t dB_batch_stride; 77 | index_t dB_group_stride; 78 | index_t dB_d_stride; 79 | index_t dB_dstate_stride; 80 | index_t dC_batch_stride; 81 | index_t dC_group_stride; 82 | index_t dC_d_stride; 83 | index_t dC_dstate_stride; 84 | index_t du_batch_stride; 85 | index_t du_d_stride; 86 | index_t dz_batch_stride; 87 | index_t dz_d_stride; 88 | index_t ddelta_batch_stride; 89 | index_t ddelta_d_stride; 90 | 91 | // Common data pointers. 92 | void *__restrict__ dout_ptr; 93 | void *__restrict__ dA_ptr; 94 | void *__restrict__ dB_ptr; 95 | void *__restrict__ dC_ptr; 96 | void *__restrict__ dD_ptr; 97 | void *__restrict__ du_ptr; 98 | void *__restrict__ dz_ptr; 99 | void *__restrict__ ddelta_ptr; 100 | void *__restrict__ ddelta_bias_ptr; 101 | }; 102 | -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan_bwd_bf16_complex.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 3 | ******************************************************************************/ 4 | 5 | // Split into multiple files to compile in parallel 6 | 7 | #include "selective_scan_bwd_kernel.cuh" 8 | 9 | template void selective_scan_bwd_cuda<at::BFloat16, complex_t>(SSMParamsBwd &params, cudaStream_t stream); -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan_bwd_bf16_real.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 3 | ******************************************************************************/ 4 | 5 | // Split into multiple files to compile in parallel 6 | 7 | #include "selective_scan_bwd_kernel.cuh" 8 | 9 | template void selective_scan_bwd_cuda<at::BFloat16, float>(SSMParamsBwd &params, cudaStream_t stream); -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan_bwd_fp16_complex.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 3 | ******************************************************************************/ 4 | 5 | // Split into multiple files to compile in parallel 6 | 7 | #include "selective_scan_bwd_kernel.cuh" 8 | 9 | template void selective_scan_bwd_cuda<at::Half, complex_t>(SSMParamsBwd &params, cudaStream_t stream); -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan_bwd_fp16_real.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 
3 | ******************************************************************************/ 4 | 5 | // Split into multiple files to compile in parallel 6 | 7 | #include "selective_scan_bwd_kernel.cuh" 8 | 9 | template void selective_scan_bwd_cuda<at::Half, float>(SSMParamsBwd &params, cudaStream_t stream); -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan_bwd_fp32_complex.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 3 | ******************************************************************************/ 4 | 5 | // Split into multiple files to compile in parallel 6 | 7 | #include "selective_scan_bwd_kernel.cuh" 8 | 9 | template void selective_scan_bwd_cuda<float, complex_t>(SSMParamsBwd &params, cudaStream_t stream); -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan_bwd_fp32_real.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 3 | ******************************************************************************/ 4 | 5 | // Split into multiple files to compile in parallel 6 | 7 | #include "selective_scan_bwd_kernel.cuh" 8 | 9 | template void selective_scan_bwd_cuda<float, float>(SSMParamsBwd &params, cudaStream_t stream); -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan_fwd_bf16.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 3 | ******************************************************************************/ 4 | 5 | // Split into multiple files to compile in parallel 6 | 7 | #include "selective_scan_fwd_kernel.cuh" 8 | 9 | template void selective_scan_fwd_cuda<at::BFloat16, float>(SSMParamsBase &params, cudaStream_t stream); 10 | template void selective_scan_fwd_cuda<at::BFloat16, complex_t>(SSMParamsBase &params, cudaStream_t stream); -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan_fwd_fp16.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 3 | ******************************************************************************/ 4 | 5 | // Split into multiple files to compile in parallel 6 | 7 | #include "selective_scan_fwd_kernel.cuh" 8 | 9 | template void selective_scan_fwd_cuda<at::Half, float>(SSMParamsBase &params, cudaStream_t stream); 10 | template void selective_scan_fwd_cuda<at::Half, complex_t>(SSMParamsBase &params, cudaStream_t stream); -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/selective_scan_fwd_fp32.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2023, Tri Dao. 
3 | ******************************************************************************/ 4 | 5 | // Split into multiple files to compile in parallel 6 | 7 | #include "selective_scan_fwd_kernel.cuh" 8 | 9 | template void selective_scan_fwd_cuda<float, float>(SSMParamsBase &params, cudaStream_t stream); 10 | template void selective_scan_fwd_cuda<float, complex_t>(SSMParamsBase &params, cudaStream_t stream); -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/static_switch.h: -------------------------------------------------------------------------------- 1 | // Inspired by https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h 2 | // and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h 3 | 4 | #pragma once 5 | 6 | /// @param COND - a boolean expression to switch by 7 | /// @param CONST_NAME - a name given for the constexpr bool variable. 8 | /// @param ... - code to execute for true and false 9 | /// 10 | /// Usage: 11 | /// ``` 12 | /// BOOL_SWITCH(flag, BoolConst, [&] { 13 | /// some_function(...); 14 | /// }); 15 | /// ``` 16 | #define BOOL_SWITCH(COND, CONST_NAME, ...) \ 17 | [&] { \ 18 | if (COND) { \ 19 | constexpr bool CONST_NAME = true; \ 20 | return __VA_ARGS__(); \ 21 | } else { \ 22 | constexpr bool CONST_NAME = false; \ 23 | return __VA_ARGS__(); \ 24 | } \ 25 | }() 26 | -------------------------------------------------------------------------------- /mamba/csrc/selective_scan/uninitialized_copy.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #include <cub/config.cuh> 31 | 32 | #include <cuda/std/utility> 33 | 34 | 35 | namespace detail 36 | { 37 | 38 | #if defined(_NVHPC_CUDA) 39 | template <typename T, typename U> 40 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val) 41 | { 42 | // NVBug 3384810 43 | new (ptr) T(::cuda::std::forward<U>(val)); 44 | } 45 | #else 46 | template <typename T, 47 | typename U, 48 | typename ::cuda::std::enable_if< 49 | ::cuda::std::is_trivially_copyable<T>::value, 50 | int 51 | >::type = 0> 52 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val) 53 | { 54 | *ptr = ::cuda::std::forward<U>(val); 55 | } 56 | 57 | template <typename T, 58 | typename U, 59 | typename ::cuda::std::enable_if< 60 | !::cuda::std::is_trivially_copyable<T>::value, 61 | int 62 | >::type = 0> 63 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val) 64 | { 65 | new (ptr) T(::cuda::std::forward<U>(val)); 66 | } 67 | #endif 68 | 69 | } // namespace detail 70 | -------------------------------------------------------------------------------- /mamba/evals/lm_harness_eval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import transformers 4 | from transformers import AutoTokenizer 5 | 6 | from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel 7 | 8 | from lm_eval.api.model import LM 9 | from lm_eval.models.huggingface import HFLM 10 | from lm_eval.api.registry import register_model 11 | from lm_eval.__main__ import cli_evaluate 12 | 13 | 14 | @register_model("mamba") 15 | class MambaEvalWrapper(HFLM): 16 | 17 | AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM 18 | 19 | def __init__(self, pretrained="state-spaces/mamba-2.8b", max_length=2048, batch_size=None, device="cuda", 20 | dtype=torch.float16): 21 | LM.__init__(self) 22 | self._model = MambaLMHeadModel.from_pretrained(pretrained, device=device, dtype=dtype) 23 | self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") 24 | self.tokenizer.pad_token_id = self.tokenizer.eos_token_id 25 | self.vocab_size = self.tokenizer.vocab_size 26 | self._batch_size = batch_size if batch_size is not None else 64 27 | self._max_length = max_length 28 | self._device = torch.device(device) 29 | 30 | @property 31 | def batch_size(self): 32 | return self._batch_size 33 | 34 | def _model_generate(self, context, max_length, stop, **generation_kwargs): 35 | raise NotImplementedError() 36 | 37 | 38 | if __name__ == "__main__": 39 | cli_evaluate() 40 | -------------------------------------------------------------------------------- /mamba/mamba_ssm/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.1" 2 | 3 | from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn, bimamba_inner_fn 4 | from mamba_ssm.modules.mamba_simple import Mamba 5 | from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel 6 | -------------------------------------------------------------------------------- /mamba/mamba_ssm/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/mamba/mamba_ssm/models/__init__.py -------------------------------------------------------------------------------- /mamba/mamba_ssm/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/mamba/mamba_ssm/modules/__init__.py -------------------------------------------------------------------------------- 
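The `selective_scan_*.cu` sources above only pin down template instantiations; the recurrence they compute is easiest to see in pure PyTorch. Below is a minimal, unoptimized sketch of that scan (a simplification of the reference path behind `selective_scan_fn`, which is exported from `mamba_ssm` above): it assumes real dtypes and sequence-varying B/C, and omits the z-gating, delta bias, and softplus options that the kernels also handle.

```python
import torch

def selective_scan_sketch(u, delta, A, B, C, D=None):
    # Shapes: u, delta (batch, dim, seqlen); A (dim, dstate);
    # B, C (batch, dstate, seqlen); D (dim,). Real dtypes only.
    batch, dim, seqlen = u.shape
    dstate = A.shape[1]
    # Discretize the continuous SSM: deltaA = exp(delta * A), and delta * B * u
    # feeds the state update (zero-order hold on A, Euler step on B).
    deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
    deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
    h = u.new_zeros(batch, dim, dstate)
    ys = []
    for t in range(seqlen):  # the CUDA kernels parallelize this scan over seqlen
        h = deltaA[:, :, t] * h + deltaB_u[:, :, t]
        ys.append(torch.einsum("bdn,bn->bd", h, C[:, :, t]))
    y = torch.stack(ys, dim=-1)  # (batch, dim, seqlen)
    if D is not None:
        y = y + u * D.unsqueeze(-1)  # skip term, carried by D_ptr in the kernels
    return y
```

Under these assumptions, the fused kernels compute the same quantity with a work-efficient parallel scan; the per-dtype `.cu` files exist only so the instantiations can compile in parallel.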
/mamba/mamba_ssm/ops/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/mamba/mamba_ssm/ops/__init__.py -------------------------------------------------------------------------------- /mamba/mamba_ssm/ops/triton/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/mamba/mamba_ssm/ops/triton/__init__.py -------------------------------------------------------------------------------- /mamba/mamba_ssm/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/mamba/mamba_ssm/utils/__init__.py -------------------------------------------------------------------------------- /mamba/mamba_ssm/utils/hf.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | 5 | from transformers.utils import WEIGHTS_NAME, CONFIG_NAME 6 | from transformers.utils.hub import cached_file 7 | 8 | 9 | def load_config_hf(model_name): 10 | resolved_archive_file = cached_file(model_name, CONFIG_NAME, _raise_exceptions_for_missing_entries=False) 11 | return json.load(open(resolved_archive_file)) 12 | 13 | 14 | def load_state_dict_hf(model_name, device=None, dtype=None): 15 | # If not fp32, then we don't want to load directly to the GPU 16 | mapped_device = "cpu" if dtype not in [torch.float32, None] else device 17 | resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False) 18 | state_dict = torch.load(resolved_archive_file, map_location=mapped_device) 19 | # Convert dtype before moving to GPU to save memory 20 | if dtype is not None: 21 | state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()} 22 | state_dict = {k: v.to(device=device) for k, v in state_dict.items()} 23 | return state_dict 24 | -------------------------------------------------------------------------------- /mamba/test_mamba_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mamba_ssm import Mamba 3 | 4 | batch, length, dim = 2, 64, 768 5 | x = torch.randn(batch, length, dim).to("cuda") 6 | model = Mamba( 7 | # This module uses roughly 3 * expand * d_model^2 parameters 8 | d_model=dim, # Model dimension d_model 9 | d_state=16, # SSM state expansion factor # 64 10 | d_conv=4, # Local convolution width 11 | expand=2, # Block expansion factor 12 | use_fast_path=False, 13 | ).to("cuda") 14 | y = model(x) 15 | assert y.shape == x.shape 16 | -------------------------------------------------------------------------------- /mamba/tests/ops/triton/test_selective_state_update.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2023, Tri Dao. 
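# selective_state_update performs one recurrent decoding step: it advances the
# SSM `state` in place using the discretized (dt, A, B) dynamics and returns the
# output for a single token -- which is why the test below clones state_ref
# before comparing the updated states as well as the outputs.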
2 | 3 | import math 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | import pytest 8 | 9 | from einops import rearrange 10 | 11 | from mamba_ssm.ops.triton.selective_state_update import selective_state_update, selective_state_update_ref 12 | 13 | 14 | @pytest.mark.parametrize("itype", [torch.float32, torch.float16, torch.bfloat16]) 15 | # @pytest.mark.parametrize('itype', [torch.float16]) 16 | @pytest.mark.parametrize("has_z", [False, True]) 17 | # @pytest.mark.parametrize('has_z', [True]) 18 | @pytest.mark.parametrize("dstate", [16, 32, 64]) 19 | # @pytest.mark.parametrize("dstate", [16]) 20 | @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) 21 | # @pytest.mark.parametrize("dim", [2048]) 22 | def test_selective_state_update(dim, dstate, has_z, itype): 23 | device = "cuda" 24 | rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) 25 | if itype == torch.bfloat16: 26 | rtol, atol = 1e-2, 5e-2 27 | # set seed 28 | torch.random.manual_seed(0) 29 | batch_size = 2 30 | state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) 31 | x = torch.randn(batch_size, dim, device=device, dtype=itype) 32 | dt = torch.randn(batch_size, dim, device=device, dtype=itype) 33 | dt_bias = torch.rand(dim, device=device) - 4.0 34 | A = -torch.rand(dim, dstate, device=device) - 1.0 35 | B = torch.randn(batch_size, dstate, device=device) 36 | C = torch.randn(batch_size, dstate, device=device) 37 | D = torch.randn(dim, device=device) 38 | if has_z: 39 | z = torch.randn_like(x) 40 | else: 41 | z = None 42 | state_ref = state.detach().clone() 43 | out = selective_state_update(state, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True) 44 | out_ref = selective_state_update_ref(state_ref, x, dt, A, B, C, D=D, z=z, dt_bias=dt_bias, dt_softplus=True) 45 | 46 | print(f"Output max diff: {(out - out_ref).abs().max().item()}") 47 | print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") 48 | assert torch.allclose(state, state_ref, rtol=rtol, atol=atol) 49 | assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) 50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apex==0.1 2 | av==11.0.0 3 | decord==0.6.0 4 | deepspeed==0.13.1 5 | einops==0.7.0 6 | ftfy==6.1.3 7 | fvcore==0.1.5.post20221221 8 | imageio==2.33.1 9 | lm_eval==0.4.1 10 | numpy==1.26.4 11 | omegaconf==2.3.0 12 | opencv_python==4.8.1.78 13 | packaging==24.0 14 | pandas==2.2.1 15 | Pillow==10.1.0 16 | pytest==8.1.1 17 | PyYAML==6.0.1 18 | regex==2023.10.3 19 | Requests==2.31.0 20 | scipy==1.12.0 21 | setuptools==68.2.2 22 | skimage==0.0 23 | submitit==1.5.1 24 | tensorboardX==2.6.2.2 25 | tensorflow==2.16.1 26 | termcolor==2.4.0 27 | timm==0.4.12 28 | # torch==2.1.1+cu118 29 | # torchvision==0.16.1+cu118 30 | tqdm==4.66.1 31 | transformers==4.36.1 32 | wandb==0.16.2 33 | wheel==0.42.0 34 | xformers==0.0.24 35 | -------------------------------------------------------------------------------- /videomamba/README.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | - [x] [image_sm](./image_sm/README.md): Single-modality Image Tasks 4 | - Image Classification: [script](./image_sm/README.md) & [model](./image_sm/MODEL_ZOO.md) 5 | - [x] [video_sm](./video_sm/README.md): Single-modality Video Tasks 6 | - Short-term Video Understanding: [script](./video_sm/README.md) & [model](./video_sm/MODEL_ZOO.md) 7 | - Long-term 
Video Understanding: [script](./video_sm/README.md) & [model](./video_sm/MODEL_ZOO.md) 8 | - Masked Modeling: [script](./video_sm/README.md) & [model](./video_sm/MODEL_ZOO.md) 9 | - [x] [video_mm](./video_mm/README.md): Multi-modality Video Tasks 10 | - Video-Text Retrieval: [script](./video_mm/README.md) & [model](./video_mm/MODEL_ZOO.md) 11 | 12 | ## Installation 13 | 14 | - Clone this repo: 15 | 16 | ```shell 17 | git clone https://github.com/OpenGVLab/VideoMamba 18 | cd VideoMamba 19 | ``` 20 | 21 | - Create a Conda environment 22 | 23 | ```shell 24 | conda create -n mamba python=3.10 25 | conda activate mamba 26 | ``` 27 | 28 | 29 | - Install PyTorch 2.1.1+cu118 30 | 31 | ```shell 32 | pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu118 33 | ``` 34 | 35 | - Install `causal_conv1d` and `mamba` 36 | 37 | ```shell 38 | pip install -r requirements.txt 39 | pip install -e causal-conv1d 40 | pip install -e mamba 41 | ``` 42 | -------------------------------------------------------------------------------- /videomamba/image_sm/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | **/__pycache__/** 3 | imnet_resnet50_scratch/timm_temp/ 4 | .dumbo.json 5 | checkpoints/ 6 | -------------------------------------------------------------------------------- /videomamba/image_sm/README.md: -------------------------------------------------------------------------------- 1 | # Image Classification 2 | 3 | We currently release the code and models for: 4 | 5 | - [x] **ImageNet-1K pretraining** 6 | 7 | - [x] **Large resolution fine-tuning** 8 | 9 | 10 | 11 | ## Update 12 | 13 | - :fire: **03/12/2024**: Pretrained models on ImageNet-1K are released. 14 | 15 | 16 | 17 | ## Model Zoo 18 | 19 | See [MODEL_ZOO](./MODEL_ZOO.md). 20 | 21 | 22 | ## Usage 23 | 24 | ### Normal Training 25 | 26 | Simply run the training scripts in [exp](exp) as follows: 27 | 28 | ```shell 29 | bash ./exp/videomamba_tiny/run224.sh 30 | ``` 31 | 32 | > If the training is interrupted abnormally, you can simply rerun the script to auto-resume. If the checkpoint was not saved properly, set the model to resume from via `--resume ${OUTPUT_DIR}/ckpt/checkpoint.pth`. 33 | 34 | ### Training w/ SD 35 | 36 | Simply run the training scripts in [exp_distill](exp_distill) as follows: 37 | 38 | ```shell 39 | bash ./exp_distill/videomamba_middle/run224.sh 40 | ``` 41 | 42 | > For `teacher_model`, we use a smaller model by default. 43 | 44 | ### Large Resolution Fine-tuning 45 | 46 | Simply run the training scripts in [exp](exp) as follows: 47 | 48 | ```shell 49 | bash ./exp/videomamba_tiny/run448.sh 50 | ``` 51 | 52 | > Please set the pretrained model via `--finetune`. 53 | 54 | ### Evaluation 55 | 56 | Simply add `--eval` in the training scripts. 57 | 58 | > It will evaluate the last model by default. You can set other models via `--resume`. 59 | 60 | ### Generate curves 61 | 62 | You can generate the training curves as follows: 63 | 64 | ```shell 65 | python3 generate_tensorboard.py 66 | ``` 67 | 68 | Note that you should install `tensorboardX`. 
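Beyond TensorBoard, the per-epoch metrics can also be consumed directly from `log.txt`, which (as `generate_tensorboard.py` later in this repo assumes) stores one JSON dict per line with keys such as `epoch`, `train_loss`, `test_acc1`, and `test_acc5`. A minimal sketch; the default log path below is a placeholder for your own `${OUTPUT_DIR}/ckpt/log.txt`:

```python
import json

def best_top1(log_path="exp/videomamba_tiny/ckpt/log.txt"):
    # One JSON dict per line; the keys match those read by generate_tensorboard.py.
    with open(log_path) as f:
        logs = [json.loads(line) for line in f if line.strip()]
    best = max(logs, key=lambda log: log.get("test_acc1", float("-inf")))
    return best["epoch"], best["test_acc1"]

if __name__ == "__main__":
    epoch, acc1 = best_top1()
    print(f"best top-1: {acc1:.2f} at epoch {epoch}")
```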
69 | 70 | 71 | -------------------------------------------------------------------------------- /videomamba/image_sm/exp/videomamba_small/run224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_small_res224' 11 | OUTPUT_DIR="$(dirname $0)" 12 | LOG_DIR="./logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun --mpi=pmi2 \ 19 | -p ${PARTITION} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --use_env main.py \ 25 | --root_dir_train your_imagenet_path/train/ \ 26 | --meta_file_train your_imagenet_path/meta/train.txt \ 27 | --root_dir_val your_imagenet_path/val/ \ 28 | --meta_file_val your_imagenet_path/meta/val.txt \ 29 | --model videomamba_small \ 30 | --batch-size 512 \ 31 | --num_workers 16 \ 32 | --lr 5e-4 \ 33 | --weight-decay 0.05 \ 34 | --drop-path 0.15 \ 35 | --no-model-ema \ 36 | --output_dir ${OUTPUT_DIR}/ckpt \ 37 | --bf16 \ 38 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/exp/videomamba_small/run448.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_small_res224to448' 11 | OUTPUT_DIR="$(dirname $0)" 12 | LOG_DIR="./logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun --mpi=pmi2 \ 19 | -p ${PARTITION} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --use_env main.py \ 25 | --root_dir_train your_imagenet_path/train/ \ 26 | --meta_file_train your_imagenet_path/meta/train.txt \ 27 | --root_dir_val your_imagenet_path/val/ \ 28 | --meta_file_val your_imagenet_path/meta/val.txt \ 29 | --model videomamba_small \ 30 | --finetune your_model_path/videomamba_small_res224.pth \ 31 | --input-size 448 \ 32 | --batch-size 64 \ 33 | --num_workers 16 \ 34 | --lr 5e-6 \ 35 | --min-lr 5e-6 \ 36 | --weight-decay 1e-8 \ 37 | --warmup-epochs 5 \ 38 | --epochs 30 \ 39 | --drop-path 0.15 \ 40 | --no-model-ema \ 41 | --output_dir ${OUTPUT_DIR}/ckpt \ 42 | --bf16 \ 43 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/exp/videomamba_small/run576.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 
8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_small_res224to448to576' 11 | OUTPUT_DIR="$(dirname $0)" 12 | LOG_DIR="./logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun --mpi=pmi2 \ 19 | -p ${PARTITION} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --use_env main.py \ 25 | --root_dir_train your_imagenet_path/train/ \ 26 | --meta_file_train your_imagenet_path/meta/train.txt \ 27 | --root_dir_val your_imagenet_path/val/ \ 28 | --meta_file_val your_imagenet_path/meta/val.txt \ 29 | --model videomamba_small \ 30 | --finetune your_model_path/videomamba_small_res224to448.pth \ 31 | --input-size 576 \ 32 | --batch-size 64 \ 33 | --num_workers 16 \ 34 | --lr 5e-6 \ 35 | --min-lr 5e-6 \ 36 | --weight-decay 1e-8 \ 37 | --warmup-epochs 2 \ 38 | --epochs 10 \ 39 | --drop-path 0.15 \ 40 | --no-model-ema \ 41 | --output_dir ${OUTPUT_DIR}/ckpt \ 42 | --bf16 \ 43 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/exp/videomamba_tiny/run224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_tiny_res224' 11 | OUTPUT_DIR="$(dirname $0)" 12 | LOG_DIR="./logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun --mpi=pmi2 \ 19 | -p ${PARTITION} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --use_env main.py \ 25 | --root_dir_train your_imagenet_path/train/ \ 26 | --meta_file_train your_imagenet_path/meta/train.txt \ 27 | --root_dir_val your_imagenet_path/val/ \ 28 | --meta_file_val your_imagenet_path/meta/val.txt \ 29 | --model videomamba_tiny \ 30 | --batch-size 512 \ 31 | --num_workers 16 \ 32 | --lr 5e-4 \ 33 | --clip-grad 5.0 \ 34 | --weight-decay 0.1 \ 35 | --drop-path 0 \ 36 | --no-repeated-aug \ 37 | --aa v0 \ 38 | --no-model-ema \ 39 | --output_dir ${OUTPUT_DIR}/ckpt \ 40 | --bf16 \ 41 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/exp/videomamba_tiny/run448.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 
8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_tiny_res224to448' 11 | OUTPUT_DIR="$(dirname $0)" 12 | LOG_DIR="./logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun --mpi=pmi2 \ 19 | -p ${PARTITION} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --use_env main.py \ 25 | --root_dir_train your_imagenet_path/train/ \ 26 | --meta_file_train your_imagenet_path/meta/train.txt \ 27 | --root_dir_val your_imagenet_path/val/ \ 28 | --meta_file_val your_imagenet_path/meta/val.txt \ 29 | --model videomamba_tiny \ 30 | --finetune your_model_path/videomamba_tiny_res224.pth \ 31 | --input-size 448 \ 32 | --batch-size 256 \ 33 | --num_workers 16 \ 34 | --lr 5e-6 \ 35 | --min-lr 5e-6 \ 36 | --weight-decay 1e-8 \ 37 | --warmup-epochs 5 \ 38 | --epochs 30 \ 39 | --drop-path 0 \ 40 | --no-repeated-aug \ 41 | --aa v0 \ 42 | --no-model-ema \ 43 | --output_dir ${OUTPUT_DIR}/ckpt \ 44 | --bf16 \ 45 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/exp/videomamba_tiny/run576.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_tiny_res448to576' 11 | OUTPUT_DIR="$(dirname $0)" 12 | LOG_DIR="./logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun --mpi=pmi2 \ 19 | -p ${PARTITION} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | python -m torch.distributed.launch --nproc_per_node=${NUM_GPUS} --use_env main.py \ 25 | --root_dir_train your_imagenet_path/train/ \ 26 | --meta_file_train your_imagenet_path/meta/train.txt \ 27 | --root_dir_val your_imagenet_path/val/ \ 28 | --meta_file_val your_imagenet_path/meta/val.txt \ 29 | --model videomamba_tiny \ 30 | --finetune your_model_path/videomamba_tiny_res224to448.pth \ 31 | --input-size 576 \ 32 | --batch-size 128 \ 33 | --num_workers 16 \ 34 | --lr 5e-6 \ 35 | --min-lr 5e-6 \ 36 | --weight-decay 1e-8 \ 37 | --warmup-epochs 2 \ 38 | --epochs 10 \ 39 | --drop-path 0 \ 40 | --no-repeated-aug \ 41 | --aa v0 \ 42 | --no-model-ema \ 43 | --output_dir ${OUTPUT_DIR}/ckpt \ 44 | --bf16 \ 45 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/exp_distill/videomamba_base/run224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 
8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_base_res224' 11 | OUTPUT_DIR="$(dirname $0)" 12 | 13 | python run_with_submitit_distill.py \ 14 | --root_dir_train your_imagenet_path/train/ \ 15 | --meta_file_train your_imagenet_path/meta/train.txt \ 16 | --root_dir_val your_imagenet_path/val/ \ 17 | --meta_file_val your_imagenet_path/meta/val.txt \ 18 | --model videomamba_base \ 19 | --teacher_model videomamba_small \ 20 | --teacher_embed_dim 384 \ 21 | --teacher_pretrained_path your_model_path/videomamba_small_res224.pth \ 22 | --batch-size 128 \ 23 | --num_workers 16 \ 24 | --warmup-epochs 20 \ 25 | --lr 5e-4 \ 26 | --warmup-lr 5e-7 \ 27 | --min-lr 5e-6 \ 28 | --weight-decay 0.05 \ 29 | --drop-path 0.5 \ 30 | --clip-grad 5.0 \ 31 | --no-model-ema \ 32 | --output_dir ${OUTPUT_DIR}/ckpt \ 33 | --bf16 \ 34 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/exp_distill/videomamba_middle/run224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_middle_res224' 11 | OUTPUT_DIR="$(dirname $0)" 12 | 13 | python run_with_submitit_distill.py \ 14 | --root_dir_train your_imagenet_path/train/ \ 15 | --meta_file_train your_imagenet_path/meta/train.txt \ 16 | --root_dir_val your_imagenet_path/val/ \ 17 | --meta_file_val your_imagenet_path/meta/val.txt \ 18 | --model videomamba_middle \ 19 | --teacher_model videomamba_small \ 20 | --teacher_embed_dim 384 \ 21 | --teacher_pretrained_path your_model_path/videomamba_small_res224.pth \ 22 | --batch-size 128 \ 23 | --num_workers 16 \ 24 | --warmup-epochs 20 \ 25 | --lr 5e-4 \ 26 | --warmup-lr 5e-7 \ 27 | --min-lr 5e-6 \ 28 | --weight-decay 0.05 \ 29 | --drop-path 0.5 \ 30 | --clip-grad 5.0 \ 31 | --no-model-ema \ 32 | --output_dir ${OUTPUT_DIR}/ckpt \ 33 | --bf16 \ 34 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/exp_distill/videomamba_middle/run448.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 
8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_middle_res224to448' 11 | OUTPUT_DIR="$(dirname $0)" 12 | 13 | python run_with_submitit.py \ 14 | --root_dir_train your_imagenet_path/train/ \ 15 | --meta_file_train your_imagenet_path/meta/train.txt \ 16 | --root_dir_val your_imagenet_path/val/ \ 17 | --meta_file_val your_imagenet_path/meta/val.txt \ 18 | --model videomamba_middle \ 19 | --finetune your_model_path/videomamba_middle_res224.pth \ 20 | --input-size 448 \ 21 | --batch-size 32 \ 22 | --num_workers 16 \ 23 | --lr 5e-6 \ 24 | --min-lr 5e-6 \ 25 | --weight-decay 1e-8 \ 26 | --warmup-epochs 5 \ 27 | --epochs 30 \ 28 | --drop-path 0.5 \ 29 | --no-model-ema \ 30 | --output_dir ${OUTPUT_DIR}/ckpt \ 31 | --bf16 \ 32 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/exp_distill/videomamba_middle/run576.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='videomamba_middle_res224to448to576' 11 | OUTPUT_DIR="$(dirname $0)" 12 | 13 | python run_with_submitit.py \ 14 | --root_dir_train your_imagenet_path/train/ \ 15 | --meta_file_train your_imagenet_path/meta/train.txt \ 16 | --root_dir_val your_imagenet_path/val/ \ 17 | --meta_file_val your_imagenet_path/meta/val.txt \ 18 | --model videomamba_middle \ 19 | --finetune your_model_path/videomamba_middle_res224to448.pth \ 20 | --input-size 576 \ 21 | --batch-size 32 \ 22 | --num_workers 16 \ 23 | --lr 5e-6 \ 24 | --min-lr 5e-6 \ 25 | --weight-decay 1e-8 \ 26 | --warmup-epochs 2 \ 27 | --epochs 10 \ 28 | --drop-path 0.5 \ 29 | --no-model-ema \ 30 | --output_dir ${OUTPUT_DIR}/ckpt \ 31 | --bf16 \ 32 | --dist-eval -------------------------------------------------------------------------------- /videomamba/image_sm/generate_tensorboard.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from tensorboardX import SummaryWriter 4 | 5 | exp_path_list = ['exp'] 6 | log_keys = ['train_lr', 'train_loss', 'test_loss', 'test_acc1', 'test_acc5'] 7 | 8 | for path in exp_path_list: 9 | for exp in os.listdir(path): 10 | log_path = os.path.join('.', path, exp, 'ckpt', 'log.txt') 11 | if os.path.exists(log_path): 12 | tensorboard_path = os.path.join('.', path, exp, 'events') 13 | if os.path.exists(tensorboard_path): 14 | for old_exp in os.listdir(tensorboard_path): 15 | delete_path = os.path.join(tensorboard_path, old_exp) 16 | print('delete:', delete_path) 17 | os.remove(delete_path) 18 | tb_logger = SummaryWriter(tensorboard_path) 19 | with open(log_path, 'r') as f: 20 | lines = f.readlines() 21 | for line in lines: 22 | log = json.loads(line.rstrip()) 23 | for k in log_keys: 24 | tb_logger.add_scalar(k, log[k], log['epoch']) 25 | print("load ok in:", tensorboard_path) 26 | tb_logger.close() 27 | log_path = os.path.join('.', path, exp, 'log', 'log.txt') 28 | if os.path.exists(log_path): 29 | tensorboard_path = os.path.join('.', path, exp, 'events') 30 | if os.path.exists(tensorboard_path): 31 | for old_exp in os.listdir(tensorboard_path): 32 | delete_path = os.path.join(tensorboard_path, old_exp) 33 | print('delete:', delete_path) 34 | 
os.remove(delete_path) 35 | tb_logger = SummaryWriter(tensorboard_path) 36 | with open(log_path, 'r') as f: 37 | lines = f.readlines() 38 | for line in lines: 39 | log = json.loads(line.rstrip()) 40 | for k in log_keys: 41 | tb_logger.add_scalar(k, log[k], log['epoch']) 42 | print("load ok in:", tensorboard_path) 43 | tb_logger.close() 44 | -------------------------------------------------------------------------------- /videomamba/image_sm/generate_tensorboard_distill.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from tensorboardX import SummaryWriter 4 | 5 | exp_path_list = ['exp_distill'] 6 | log_keys = ['train_lr', 'test_loss', 'test_acc1', 'test_acc5'] 7 | special_log_keys = { 8 | 'train_loss_cls': 'train_loss', 9 | 'train_loss_distill': 'train_loss_distill' 10 | } 11 | 12 | for path in exp_path_list: 13 | for exp in os.listdir(path): 14 | log_path = os.path.join('.', path, exp, 'ckpt', 'log.txt') 15 | if os.path.exists(log_path): 16 | tensorboard_path = os.path.join('.', path, exp, 'events') 17 | if os.path.exists(tensorboard_path): 18 | for old_exp in os.listdir(tensorboard_path): 19 | delete_path = os.path.join(tensorboard_path, old_exp) 20 | print('delete:', delete_path) 21 | os.remove(delete_path) 22 | tb_logger = SummaryWriter(tensorboard_path) 23 | with open(log_path, 'r') as f: 24 | lines = f.readlines() 25 | for line in lines: 26 | log = json.loads(line.rstrip()) 27 | for k in log_keys: 28 | tb_logger.add_scalar(k, log[k], log['epoch']) 29 | for k, v in special_log_keys.items(): 30 | if k in log.keys(): 31 | tb_logger.add_scalar(v, log[k], log['epoch']) 32 | print("load ok in:", tensorboard_path) 33 | tb_logger.close() 34 | log_path = os.path.join('.', path, exp, 'log', 'log.txt') 35 | if os.path.exists(log_path): 36 | tensorboard_path = os.path.join('.', path, exp, 'events') 37 | if os.path.exists(tensorboard_path): 38 | for old_exp in os.listdir(tensorboard_path): 39 | delete_path = os.path.join(tensorboard_path, old_exp) 40 | print('delete:', delete_path) 41 | os.remove(delete_path) 42 | tb_logger = SummaryWriter(tensorboard_path) 43 | with open(log_path, 'r') as f: 44 | lines = f.readlines() 45 | for line in lines: 46 | log = json.loads(line.rstrip()) 47 | for k in log_keys: 48 | tb_logger.add_scalar(k, log[k], log['epoch']) 49 | for k, v in special_log_keys.items(): 50 | if k in log.keys(): 51 | tb_logger.add_scalar(v, log[k], log['epoch']) 52 | print("load ok in:", tensorboard_path) 53 | tb_logger.close() 54 | -------------------------------------------------------------------------------- /videomamba/image_sm/hubconf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2015-present, Facebook, Inc. 2 | # All rights reserved. 
3 | from models import *
4 | # from cait_models import *
5 | # from resmlp_models import *
6 | # from patchconvnet_models import *
7 | 
8 | dependencies = ["torch", "torchvision", "timm"]
9 | 
--------------------------------------------------------------------------------
/videomamba/image_sm/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .videomamba import (
2 |     videomamba_tiny,
3 |     videomamba_small,
4 |     videomamba_middle,
5 |     videomamba_base,
6 | )
7 | 
8 | from .videomamba_distill import (
9 |     videomamba_middle_distill,
10 |     videomamba_base_distill,
11 | )
12 | 
13 | from .deit import (
14 |     deit_tiny_patch16_224,
15 |     deit_small_patch16_224,
16 |     deit_base_patch16_224,
17 | )
18 | 
--------------------------------------------------------------------------------
/videomamba/image_sm/samplers.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-present, Facebook, Inc.
2 | # All rights reserved.
3 | import torch
4 | import torch.distributed as dist
5 | import math
6 | 
7 | 
8 | class RASampler(torch.utils.data.Sampler):
9 |     """Sampler that restricts data loading to a subset of the dataset for distributed training,
10 |     with repeated augmentation.
11 |     It ensures that each augmented version of a sample is visible to a
12 |     different process (GPU).
13 |     Heavily based on torch.utils.data.DistributedSampler.
14 |     """
15 | 
16 |     def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, num_repeats: int = 3):
17 |         if num_replicas is None:
18 |             if not dist.is_available():
19 |                 raise RuntimeError("Requires distributed package to be available")
20 |             num_replicas = dist.get_world_size()
21 |         if rank is None:
22 |             if not dist.is_available():
23 |                 raise RuntimeError("Requires distributed package to be available")
24 |             rank = dist.get_rank()
25 |         if num_repeats < 1:
26 |             raise ValueError("num_repeats should be greater than 0")
27 |         self.dataset = dataset
28 |         self.num_replicas = num_replicas
29 |         self.rank = rank
30 |         self.num_repeats = num_repeats
31 |         self.epoch = 0
32 |         self.num_samples = int(math.ceil(len(self.dataset) * self.num_repeats / self.num_replicas))
33 |         self.total_size = self.num_samples * self.num_replicas
34 |         # self.num_selected_samples = int(math.ceil(len(self.dataset) / self.num_replicas))
35 |         self.num_selected_samples = int(math.floor(len(self.dataset) // 256 * 256 / self.num_replicas))
36 |         self.shuffle = shuffle
37 | 
38 |     def __iter__(self):
39 |         if self.shuffle:
40 |             # deterministically shuffle based on epoch
41 |             g = torch.Generator()
42 |             g.manual_seed(self.epoch)
43 |             indices = torch.randperm(len(self.dataset), generator=g)
44 |         else:
45 |             indices = torch.arange(start=0, end=len(self.dataset))
46 | 
47 |         # add extra samples to make it evenly divisible
48 |         indices = torch.repeat_interleave(indices, repeats=self.num_repeats, dim=0).tolist()
49 |         padding_size: int = self.total_size - len(indices)
50 |         if padding_size > 0:
51 |             indices += indices[:padding_size]
52 |         assert len(indices) == self.total_size
53 | 
54 |         # subsample
55 |         indices = indices[self.rank:self.total_size:self.num_replicas]
56 |         assert len(indices) == self.num_samples
57 | 
58 |         return iter(indices[:self.num_selected_samples])
59 | 
60 |     def __len__(self):
61 |         return self.num_selected_samples
62 | 
63 |     def set_epoch(self, epoch):
64 |         self.epoch = epoch
65 | 
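A minimal sketch of how `RASampler` is typically wired into a `DataLoader`; the toy dataset, the explicit `num_replicas`/`rank` values (normally taken from `torch.distributed`), and the two-epoch loop are illustrative assumptions, not code from this repo:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from samplers import RASampler  # image_sm/samplers.py above

# Illustrative toy dataset; any map-style dataset works.
dataset = TensorDataset(torch.randn(1024, 3, 224, 224), torch.randint(0, 10, (1024,)))

# num_replicas/rank are passed explicitly here so the sketch runs without a
# process group; in the training scripts they come from the distributed setup.
sampler = RASampler(dataset, num_replicas=8, rank=0, shuffle=True, num_repeats=3)

loader = DataLoader(dataset, batch_size=32, sampler=sampler, num_workers=4)

for epoch in range(2):
    sampler.set_epoch(epoch)  # re-seed the shuffle so epochs differ but stay in sync across ranks
    for images, labels in loader:
        pass  # training step goes here
```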
--------------------------------------------------------------------------------
/videomamba/video_mm/DATASET.md:
--------------------------------------------------------------------------------
1 | # Dataset Preparation
2 | 
3 | We follow [VINDLU](https://github.com/klauscc/VindLU/) to prepare the datasets, but we **DO NOT** compress the videos and images. We use the original data and load JSON annotation files instead, since SQLite ran into communication problems in our environment.
4 | 
5 | :warning: If you do not have enough resources, we suggest you follow the preprocessing of [VINDLU](https://github.com/klauscc/VindLU/blob/main/DATA.md#compressing-videos-and-images).
6 | 
7 | :label: We use the same **JSON** files provided by [VINDLU](https://drive.google.com/drive/folders/12bC7WotvwyTG4pVvYeU4iZzmBLP1-6d9). However, since some videos are missing from the large-scale datasets (like CC3M, CC12M and WebVid10M), we filter out those unavailable videos.
8 | 
9 | 
10 | ## Pretraining
11 | 
12 | - CC3M images, https://github.com/google-research-datasets/conceptual-captions
13 | - CC12M images, https://github.com/google-research-datasets/conceptual-12m
14 | - SBU images, https://www.cs.rice.edu/~vo9/sbucaptions/
15 | - VG images, https://visualgenome.org/api/v0/api_home.html
16 | - COCO images, https://cocodataset.org/#download
17 | - WebVid videos, https://github.com/m-bain/webvid
18 | 
19 | 
20 | ## Video-Text Retrieval and Video Question Answering
21 | 
22 | - MSRVTT videos, https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip
23 | - MSVD videos, https://www.cs.utexas.edu/users/ml/clamp/videoDescription/
24 | - ActivityNet videos, http://activity-net.org/download.html
25 | - DiDeMo videos, https://github.com/LisaAnne/LocalizingMoments
26 | - LSMDC videos, https://sites.google.com/site/describingmovies
27 | 
--------------------------------------------------------------------------------
/videomamba/video_mm/README.md:
--------------------------------------------------------------------------------
1 | # Multi-modality Video Understanding
2 | 
3 | ## Datasets
4 | 
5 | You can find the dataset instructions in [DATASET](DATASET.md). We provide all the metadata files for our data.
6 | 
7 | ## Model Zoo
8 | 
9 | You can find all the models and the scripts in [MODEL_ZOO](./MODEL_ZOO.md).
10 | 
11 | ## Pre-Training
12 | 
13 | We use [CLIP](https://github.com/openai/CLIP) pretrained models as the unmasked teachers by default:
14 | - Follow [extract.ipynb](../video_sm/models/extract_clip/extract.ipynb) to extract the visual encoder from CLIP.
15 | - Change `MODEL_PATH` in [clip.py](./models/backbones/videomamba/clip.py).
16 | 
17 | For training, you can simply run the pretraining scripts as follows:
18 | ```shell
19 | # masked pretraining
20 | bash ./exp_pt/videomamba_middle_5m/run.sh
21 | # further unmasked pretraining for 1 epoch
22 | bash ./exp_pt/videomamba_middle_5m_unmasked/run.sh
23 | ```
24 | 
25 | > **Notes:**
26 | > 1. Set `data_dir` and the `your_..._path` placeholders (e.g., `your_webvid_path`) in [data.py](./configs/data.py) before running the scripts.
27 | > 2. Set `vision_encoder.pretrained` in the corresponding config files.
28 | > 3. Set `--rdzv_endpoint` to your `MASTER_NODE:MASTER_PORT` in [torchrun.sh](torchrun.sh).
29 | > 4. `save_latest=True` will automatically save the latest checkpoint during training.
30 | > 5. `auto_resume=True` will automatically load the best or latest checkpoint during training.
31 | > 6. For unmasked pretraining, please set `pretrained_path` to load the checkpoint from the masked pretraining stage.
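As a concrete reading of notes 4 to 6: the unmasked stage's config points `pretrained_path` at whatever the masked stage saved under its `output_dir` (`exp_pt/videomamba_middle_5m/m16_5m`, per its `run.sh`). A minimal sketch; the checkpoint filename below is an assumption about what `save_latest=True` writes, so check the real `output_dir` for the actual name:

```python
# exp_pt/videomamba_middle_5m_unmasked/config.py (sketch, not the shipped file)
save_latest = True    # keep overwriting a "latest" checkpoint while training
auto_resume = True    # pick the best/latest checkpoint back up after a restart

# ASSUMED filename: look inside the masked run's output_dir for the real one.
pretrained_path = "exp_pt/videomamba_middle_5m/m16_5m/ckpt_latest.pth"
```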
32 | 
33 | 
34 | ## Zero-shot Evaluation
35 | 
36 | For zero-shot evaluation, you can simply run the evaluation scripts as follows:
37 | ```shell
38 | bash ./exp_zs/msrvtt/run.sh
39 | ```
40 | 
41 | > **Notes:**
42 | > 1. Set `pretrained_path` in the running scripts before launching them.
43 | > 2. Set `zero_shot=True` and `evaluate=True` for zero-shot evaluation.
44 | 
45 | 
--------------------------------------------------------------------------------
/videomamba/video_mm/configs/beit-base-patch16-224-pt22k-ft22k.json:
--------------------------------------------------------------------------------
1 | {
2 |   "note": "this file is a copy of the BEiT model config, not used directly",
3 |   "architectures": [
4 |     "BeitForImageClassification"
5 |   ],
6 |   "url": "https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k/raw/main/config.json",
7 |   "attention_probs_dropout_prob": 0.0,
8 |   "drop_path_rate": 0.1,
9 |   "hidden_act": "gelu",
10 |   "hidden_dropout_prob": 0.0,
11 |   "hidden_size": 768,
12 |   "image_size": 224,
13 |   "initializer_range": 0.02,
14 |   "intermediate_size": 3072,
15 |   "layer_norm_eps": 1e-12,
16 |   "layer_scale_init_value": 0.1,
17 |   "model_type": "beit",
18 |   "num_attention_heads": 12,
19 |   "num_channels": 3,
20 |   "num_hidden_layers": 12,
21 |   "patch_size": 16,
22 |   "torch_dtype": "float32",
23 |   "transformers_version": "4.11.0.dev0",
24 |   "use_absolute_position_embeddings": false,
25 |   "use_mask_token": false,
26 |   "use_mean_pooling": true,
27 |   "use_relative_position_bias": true,
28 |   "use_shared_relative_position_bias": false,
29 |   "vocab_size": 8192
30 | }
31 | 
--------------------------------------------------------------------------------
/videomamba/video_mm/configs/config_bert.json:
--------------------------------------------------------------------------------
1 | {
2 |   "architectures": [
3 |     "BertForMaskedLM"
4 |   ],
5 |   "attention_probs_dropout_prob": 0.1,
6 |   "hidden_act": "gelu",
7 |   "hidden_dropout_prob": 0.1,
8 |   "hidden_size": 768,
9 |   "initializer_range": 0.02,
10 |   "intermediate_size": 3072,
11 |   "layer_norm_eps": 1e-12,
12 |   "max_position_embeddings": 512,
13 |   "model_type": "bert",
14 |   "num_attention_heads": 12,
15 |   "num_hidden_layers": 12,
16 |   "pad_token_id": 0,
17 |   "type_vocab_size": 2,
18 |   "vocab_size": 30522,
19 |   "fusion_layer": 9,
20 |   "encoder_width": 768,
21 |   "cross_module": "ca"
22 | }
23 | 
--------------------------------------------------------------------------------
/videomamba/video_mm/configs/config_bert_large.json:
--------------------------------------------------------------------------------
1 | {
2 |   "architectures": [
3 |     "BertForMaskedLM"
4 |   ],
5 |   "attention_probs_dropout_prob": 0.1,
6 |   "gradient_checkpointing": false,
7 |   "hidden_act": "gelu",
8 |   "hidden_dropout_prob": 0.1,
9 |   "hidden_size": 1024,
10 |   "initializer_range": 0.02,
11 |   "intermediate_size": 4096,
12 |   "layer_norm_eps": 1e-12,
13 |   "max_position_embeddings": 512,
14 |   "model_type": "bert",
15 |   "num_attention_heads": 16,
16 |   "num_hidden_layers": 24,
17 |   "pad_token_id": 0,
18 |   "position_embedding_type": "absolute",
19 |   "type_vocab_size": 2,
20 |   "use_cache": true,
21 |   "vocab_size": 30522,
22 |   "fusion_layer": 19,
23 |   "encoder_width": 768,
24 |   "cross_module": "ca"
25 | }
26 | 
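The two BERT configs above are standard `transformers`-style BERT configs plus three extra keys (`fusion_layer`, `encoder_width`, `cross_module`) that are consumed by `models/backbones/bert/builder.py` further below. A small sketch of what loading one looks like; it uses vanilla `transformers.BertConfig` as a stand-in for the repo's `xbert` classes, which is an approximation:

```python
from transformers import BertConfig  # stand-in for models.backbones.bert.xbert

cfg = BertConfig.from_json_file("configs/config_bert.json")

# Unknown JSON keys are kept as plain attributes, which is how the builder
# can read the custom fields off the same config object.
print(cfg.num_hidden_layers)  # 12 BERT layers in total
print(cfg.fusion_layer)       # 9: layers 9-11 become cross-modal fusion layers
print(cfg.encoder_width)      # 768: must match the vision encoder's dimension
```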
--------------------------------------------------------------------------------
/videomamba/video_mm/configs/data.py:
--------------------------------------------------------------------------------
1 | import os as __os  # the "__" prefix keeps it out of `from .data import *`
2 | from copy import deepcopy as __deepcopy
3 | 
4 | data_dir = 'your_annotation_path'
5 | if data_dir is None:
6 |     raise ValueError("please set the environment variable `VL_DATA_DIR` before continuing")
7 | 
8 | data_root = __os.path.join(data_dir, "videos_images")
9 | anno_root_pt = __os.path.join(data_dir, "anno_pretrain")
10 | anno_root_downstream = __os.path.join(data_dir, "anno_downstream")
11 | 
12 | # ============== pretraining datasets =================
13 | available_corpus = dict(
14 |     # pretraining datasets
15 |     cc3m=[
16 |         f"{anno_root_pt}/cc3m_train.json",
17 |         "your_cc3m_path",
18 |     ],
19 |     cc12m=[
20 |         f"{anno_root_pt}/cc12m_train.json",
21 |         "your_cc12m_path",
22 |     ],
23 |     sbu=[
24 |         f"{anno_root_pt}/sbu.json",
25 |         "your_sbu_path",
26 |     ],
27 |     vg=[
28 |         f"{anno_root_pt}/vg.json",
29 |         "your_vg_path",
30 |     ],
31 |     coco=[
32 |         f"{anno_root_pt}/coco.json",
33 |         "your_coco_path",
34 |     ],
35 |     webvid=[
36 |         f"{anno_root_pt}/webvid_train.json",
37 |         "your_webvid_path",
38 |         "video"
39 |     ],
40 |     webvid_10m=[
41 |         f"{anno_root_pt}/webvid10m_train.json",
42 |         "your_webvid_10m_path",
43 |         "video",
44 |     ],
45 |     # downstream datasets.
46 | )
47 | 
48 | # composed datasets.
49 | available_corpus["data_5m"] = [
50 |     available_corpus["webvid"],
51 |     available_corpus["cc3m"]
52 | ]
53 | available_corpus["data_17m"] = [
54 |     available_corpus["webvid"],
55 |     available_corpus["cc3m"],
56 |     available_corpus["coco"],
57 |     available_corpus["vg"],
58 |     available_corpus["sbu"],
59 |     available_corpus["cc12m"],
60 | ]
61 | available_corpus["data_25m"] = [
62 |     available_corpus["webvid_10m"],
63 |     available_corpus["cc3m"],
64 |     available_corpus["coco"],
65 |     available_corpus["vg"],
66 |     available_corpus["sbu"],
67 |     available_corpus["cc12m"],
68 | ]
69 | 
70 | # ============== for validation =================
71 | available_corpus["msrvtt_1k_test"] = [
72 |     f"{anno_root_downstream}/msrvtt_test1k.json",
73 |     "your_msrvtt_path/MSRVTT_Videos",
74 |     "video",
75 | ]
76 | 
--------------------------------------------------------------------------------
/videomamba/video_mm/configs/model.py:
--------------------------------------------------------------------------------
1 | TextEncoders = dict()
2 | TextEncoders["bert"] = dict(
3 |     name="bert_base",
4 |     pretrained="bert-base-uncased",
5 |     config="configs/config_bert.json",
6 |     d_model=768,
7 |     fusion_layer=9,
8 | )
9 | TextEncoders["bert_large"] = dict(
10 |     name="bert_large",
11 |     pretrained="bert-large-uncased",
12 |     config="configs/config_bert_large.json",
13 |     d_model=1024,
14 |     fusion_layer=19,
15 | )
16 | 
--------------------------------------------------------------------------------
/videomamba/video_mm/configs/pretrain.py:
--------------------------------------------------------------------------------
1 | from .data import *
2 | from .model import *
3 | 
4 | # ========================= data ==========================
5 | train_corpus = "data_5m"  # webvid + cc3m; must be a key of `available_corpus`
6 | train_file = "${available_corpus[${train_corpus}]}"  # for lazy evaluation
7 | test_file = dict(msrvtt_1k_test=available_corpus["msrvtt_1k_test"])
8 | test_types = ["msrvtt_1k_test"]
9 | num_workers = 6
10 | 
11 | stop_key = None
12 | 
13 | # ========================= input ==========================
14 | num_frames = 4
15 | num_frames_test = 4
16 | batch_size = 64
17 | max_txt_l = 32
18 | 
19 | inputs = dict(
20 |     image_res=224,
21 |     video_input=dict(
22 |         num_frames="${num_frames}",
23 |         sample_type="rand",
24 |         num_frames_test="${num_frames_test}",
25 |         sample_type_test="middle",
26 |         random_aug=False,
27 |     ),
28 | 
max_txt_l=dict(image="${max_txt_l}", video="${max_txt_l}"), 29 | batch_size=dict(image="${batch_size}", video="${batch_size}"), 30 | batch_size_test=dict(image="${batch_size}", video="${batch_size}"), 31 | ) 32 | 33 | # ========================= model ========================== 34 | vision_enc = "beit" 35 | text_enc = "bert" 36 | model = dict( 37 | vision_encoder="${VisionEncoders[${vision_enc}]}", 38 | text_encoder="${TextEncoders[${text_enc}]}", 39 | temporal_modeling=dict( 40 | num_frames="${num_frames}", 41 | temporal_model_block="timesformer", 42 | temporal_model_position="last", 43 | temporal_model_config=dict(input_dim="${model.vision_encoder.d_model}"), 44 | use_temporal_position_embedding=True, 45 | ), 46 | vit_add_ln=True, 47 | multimodal=dict(enable=True), 48 | embed_dim=256, 49 | temp=0.07, 50 | ) 51 | 52 | criterion = dict( 53 | loss_weight=dict(vtc=1.0, mlm=1.0, vtm=1.0, mvm=0.0), # 0: disabled. 54 | vtm_hard_neg=True, 55 | mlm_masking_prob=0.5, 56 | ) 57 | 58 | optimizer = dict( 59 | opt="adamW", 60 | lr=1e-4, 61 | opt_betas=[0.9, 0.999], # default 62 | weight_decay=0.02, 63 | max_grad_norm=-1, # requires a positive float, use -1 to disable 64 | # use a different lr for some modules, e.g., larger lr for new modules 65 | different_lr=dict(enable=False, module_names=[], lr=1e-3), 66 | ) 67 | 68 | scheduler = dict(sched="cosine", epochs=10, min_lr_multi=0.01, warmup_epochs=1) 69 | 70 | evaluate = False 71 | deep_fusion = False 72 | evaluation = dict( 73 | eval_frame_ensemble="concat", # [concat, max, mean, lse] 74 | eval_x_only=False, 75 | k_test=128, 76 | eval_offload=True, # offload gpu tensors to cpu to save memory. 77 | ) 78 | 79 | fp16 = True 80 | gradient_checkpointing = True 81 | 82 | # ========================= wandb ========================== 83 | wandb = dict( 84 | enable=True, 85 | entity="user", # username or team name to store the runs, see https://docs.wandb.ai/ref/python/init 86 | project="umt", # setup in your command line 87 | ) 88 | dist_url = "env://" 89 | device = "cuda" 90 | mode = "pt" 91 | 92 | # ========================= others ========================== 93 | output_dir = None # output dir 94 | resume = False # if True, load optimizer and scheduler states as well 95 | debug = False 96 | log_freq = 100 97 | seed = 42 98 | 99 | save_latest = True 100 | auto_resume = True 101 | pretrained_path = "" # path to pretrained model weights, for resume only? 
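Throughout these configs, quoted values such as `"${num_frames}"` or `"${available_corpus[${train_corpus}]}"` are placeholders resolved when the config is loaded (hence the `# for lazy evaluation` comment above). The repo's actual resolver is not shown here, so the following is only a toy illustration of the convention for the simple `"${name}"` case; the nested-indexing form needs more machinery:

```python
import re

def resolve(value, scope):
    # Toy interpolation for plain "${name}" placeholders only; an assumption
    # about the convention, not the repo's real config machinery.
    if isinstance(value, str):
        match = re.fullmatch(r"\$\{(\w+)\}", value)
        if match:
            return resolve(scope[match.group(1)], scope)
        return value
    if isinstance(value, dict):
        return {key: resolve(val, scope) for key, val in value.items()}
    return value

scope = dict(num_frames=4, batch_size=64)
inputs = dict(
    video_input=dict(num_frames="${num_frames}"),
    batch_size=dict(image="${batch_size}", video="${batch_size}"),
)
print(resolve(inputs, scope))
# {'video_input': {'num_frames': 4}, 'batch_size': {'image': 64, 'video': 64}}
```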
102 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/qa.py: -------------------------------------------------------------------------------- 1 | from .pretrain import * 2 | 3 | del available_corpus 4 | 5 | criterion["loss_weight"]["mlm"] = 0.0 6 | scheduler["warmup_epochs"] = 0.5 7 | 8 | max_txt_l = 32 9 | batch_size = 32 10 | num_frames = 12 11 | 12 | optimizer["lr"] = 1e-5 13 | log_freq = 100 14 | 15 | # =========additional args for VQA ============ 16 | eos = "[SEP]" 17 | max_q_len = 25 18 | max_a_len = 5 19 | # =========end ================================ 20 | 21 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/qa_anet.py: -------------------------------------------------------------------------------- 1 | from .qa import * 2 | 3 | train_file = [ 4 | [ 5 | f"{anno_root_downstream}/anet_qa_train.json", 6 | f"{data_root}/activity_net_2fps_360", 7 | "video", 8 | ] 9 | ] 10 | test_file = dict( 11 | val=[ 12 | f"{anno_root_downstream}/anet_qa_val.json", 13 | f"{data_root}/activity_net_2fps_360", 14 | "video", 15 | ], 16 | test=[ 17 | f"{anno_root_downstream}/anet_qa_test.json", 18 | f"{data_root}/activity_net_2fps_360", 19 | "video", 20 | ] 21 | ) 22 | dataset_name = "anet" 23 | 24 | answer_list = f"{anno_root_downstream}/anet_qa_answer_list.json" # list of answer words 25 | 26 | test_types = ["val"] 27 | stop_key = "val" # used to choose the best ckpt. If None, save the last. 28 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/qa_msrvtt.py: -------------------------------------------------------------------------------- 1 | from .qa import * 2 | 3 | train_file = [ 4 | [ 5 | f"{anno_root_downstream}/msrvtt_qa_train.json", 6 | f"{data_root}/msrvtt_2fps_224", 7 | "video", 8 | ] 9 | ] 10 | test_file = dict( 11 | val=[ 12 | f"{anno_root_downstream}/msrvtt_qa_val.json", 13 | f"{data_root}/msrvtt_2fps_224", 14 | "video", 15 | ], 16 | test=[ 17 | f"{anno_root_downstream}/msrvtt_qa_test.json", 18 | f"{data_root}/msrvtt_2fps_224", 19 | "video", 20 | ], 21 | ) 22 | dataset_name = "msrvtt" 23 | 24 | answer_list = f"{anno_root_downstream}/msrvtt_qa_answer_list.json" # list of answer words 25 | 26 | test_types = ["val"] 27 | stop_key = "val" # used to choose the best ckpt. If None, save the last. 28 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/ret_anet.py: -------------------------------------------------------------------------------- 1 | from .pretrain import * 2 | 3 | del available_corpus 4 | 5 | train_file = [ 6 | f"{anno_root_downstream}/anet_ret_train.json", 7 | f"{data_root}/activity_net_2fps_360", 8 | "video", 9 | ] 10 | test_file = dict( 11 | test=[ 12 | f"{anno_root_downstream}/anet_ret_val_1.json", 13 | f"{data_root}/activity_net_2fps_360", 14 | "video", 15 | ], 16 | ) 17 | 18 | test_types = ["test"] 19 | stop_key = "test/" # used to choose the best ckpt. If None, save the last. 
20 | is_paragraph_retrieval = True 21 | 22 | max_txt_l = 64 23 | batch_size = 32 24 | num_frames = 12 25 | 26 | optimizer["lr"] = 1e-5 27 | log_freq = 100 28 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/ret_coco.py: -------------------------------------------------------------------------------- 1 | from .pretrain import * 2 | 3 | del available_corpus 4 | 5 | train_file = [ 6 | f"{anno_root_downstream}/coco_train.json", 7 | f"{data_root}/coco", 8 | "video", 9 | ] 10 | test_file = dict( 11 | val=[ 12 | f"{anno_root_downstream}/coco_val.json", 13 | f"{data_root}/coco", 14 | "video", 15 | ], 16 | test=[ 17 | f"{anno_root_downstream}/coco_test.json", 18 | f"{data_root}/coco", 19 | "video", 20 | ], 21 | ) 22 | 23 | test_types = ["val"] 24 | stop_key = "val/" # used to choose the best ckpt. If None, save the last. 25 | is_paragraph_retrieval = False 26 | 27 | criterion["loss_weight"]["mlm"] = 0.0 28 | scheduler["warmup_epochs"] = 0 29 | optimizer["lr"] = 1e-5 30 | 31 | 32 | max_txt_l = 22 33 | batch_size = 128 34 | num_frames = 1 35 | num_frames_test = 1 36 | 37 | log_freq = 100 38 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/ret_didemo.py: -------------------------------------------------------------------------------- 1 | from .pretrain import * 2 | 3 | del available_corpus 4 | 5 | train_file = [ 6 | f"{anno_root_downstream}/didemo_ret_train.json", 7 | f"{data_root}/didemo_2fps_360_trimed30", 8 | "video", 9 | ] 10 | test_file = dict( 11 | val=[ 12 | f"{anno_root_downstream}/didemo_ret_val.json", 13 | f"{data_root}/didemo_2fps_360_trimed30", 14 | "video", 15 | ], 16 | test=[ 17 | f"{anno_root_downstream}/didemo_ret_test.json", 18 | f"{data_root}/didemo_2fps_360_trimed30", 19 | "video", 20 | ], 21 | ) 22 | 23 | test_types = ["val"] 24 | stop_key = "val/" # used to choose the best ckpt. If None, save the last. 25 | is_paragraph_retrieval = True 26 | 27 | criterion["loss_weight"]["mlm"] = 0.0 28 | scheduler["warmup_epochs"] = 0 29 | optimizer["lr"] = 1e-5 30 | 31 | 32 | max_txt_l = 64 33 | batch_size = 32 34 | num_frames = 12 35 | 36 | log_freq = 10 37 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/ret_flickr.py: -------------------------------------------------------------------------------- 1 | from .pretrain import * 2 | 3 | del available_corpus 4 | 5 | train_file = [ 6 | f"{anno_root_downstream}/flickr30k_train.json", 7 | f"{data_root}/f30k", 8 | "video", 9 | ] 10 | test_file = dict( 11 | val=[ 12 | f"{anno_root_downstream}/flickr30k_val.json", 13 | f"{data_root}/f30k", 14 | "video", 15 | ], 16 | test=[ 17 | f"{anno_root_downstream}/flickr30k_test.json", 18 | f"{data_root}/f30k", 19 | "video", 20 | ], 21 | ) 22 | 23 | test_types = ["val"] 24 | stop_key = "val/" # used to choose the best ckpt. If None, save the last. 
25 | is_paragraph_retrieval = False 26 | 27 | criterion["loss_weight"]["mlm"] = 0.0 28 | scheduler["warmup_epochs"] = 0 29 | optimizer["lr"] = 1e-5 30 | 31 | 32 | max_txt_l = 32 33 | batch_size = 128 34 | num_frames = 1 35 | num_frames_test = 1 36 | 37 | log_freq = 100 38 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/ret_msrvtt.py: -------------------------------------------------------------------------------- 1 | from .pretrain import * 2 | 3 | del available_corpus 4 | 5 | train_file = [ 6 | f"{anno_root_downstream}/msrvtt_ret_train7k.json", 7 | f"{data_root}/msrvtt_2fps_224", 8 | "video", 9 | ] 10 | test_file = dict( 11 | test=[ 12 | f"{anno_root_downstream}/msrvtt_ret_test1k.json", 13 | f"{data_root}/msrvtt_2fps_224", 14 | "video", 15 | ], 16 | ) 17 | 18 | test_types = ["test"] 19 | stop_key = None # used to choose the best ckpt. If None, save the last. 20 | is_paragraph_retrieval = False 21 | 22 | criterion["loss_weight"]["mlm"] = 0.0 23 | scheduler["warmup_epochs"] = 0 24 | scheduler["epochs"] = 5 25 | optimizer["lr"] = 1e-5 26 | 27 | max_txt_l = 32 28 | batch_size = 32 29 | num_frames = 12 30 | 31 | log_freq = 100 32 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/ret_msrvtt_9k.py: -------------------------------------------------------------------------------- 1 | from .ret_msrvtt import * 2 | 3 | train_file = [ 4 | f"{anno_root_downstream}/msrvtt_ret_train9k.json", 5 | f"{data_root}/msrvtt_2fps_224", 6 | "video", 7 | ] 8 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/ret_msrvtt_mc.py: -------------------------------------------------------------------------------- 1 | from .pretrain import * 2 | 3 | del available_corpus 4 | 5 | train_file = [ 6 | f"{anno_root_downstream}/msrvtt_ret_train7k.json", 7 | f"{data_root}/msrvtt_2fps_224", 8 | "video", 9 | ] 10 | test_file = dict( 11 | mc_test=[ 12 | f"{anno_root_downstream}/msrvtt_mc_test.json", 13 | f"{data_root}/msrvtt_2fps_224", 14 | "video", 15 | ] 16 | ) 17 | 18 | test_types = ["mc_test"] 19 | stop_key = None # used to choose the best ckpt. If None, save the last. 20 | is_paragraph_retrieval = False 21 | 22 | criterion["loss_weight"]["mlm"] = 0.0 23 | scheduler["warmup_epochs"] = 0 24 | optimizer["lr"] = 1e-5 25 | 26 | max_txt_l = 32 27 | batch_size = 32 28 | num_frames = 12 29 | 30 | log_freq = 100 31 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/ret_ssv2_label.py: -------------------------------------------------------------------------------- 1 | from .ret_msrvtt import * 2 | 3 | train_file = [ 4 | f"{anno_root_downstream}/ssv2_ret_label_train.json", 5 | f"{data_root}/ssv2", 6 | "video", 7 | ] 8 | test_file = dict( 9 | val=[ 10 | f"{anno_root_downstream}/ssv2_ret_label_val_small.json", 11 | f"{data_root}/ssv2", 12 | "video", 13 | ], 14 | ) 15 | 16 | test_types = ["val"] 17 | stop_key = None # used to choose the best ckpt. If None, save the last. 
18 | 19 | has_multi_vision_gt = True 20 | 21 | scheduler["epochs"] = 10 22 | optimizer["lr"] = 1e-4 23 | 24 | max_txt_l = 25 25 | -------------------------------------------------------------------------------- /videomamba/video_mm/configs/ret_ssv2_template.py: -------------------------------------------------------------------------------- 1 | from .ret_msrvtt import * 2 | 3 | train_file = [ 4 | f"{anno_root_downstream}/ssv2_ret_template_train.json", 5 | f"{data_root}/ssv2", 6 | "video", 7 | ] 8 | test_file = dict( 9 | val=[ 10 | f"{anno_root_downstream}/ssv2_ret_template_val_small.json", 11 | f"{data_root}/ssv2", 12 | "video", 13 | ], 14 | ) 15 | 16 | test_types = ["val"] 17 | stop_key = None # used to choose the best ckpt. If None, save the last. 18 | 19 | has_multi_vision_gt = True 20 | 21 | scheduler["epochs"] = 10 22 | optimizer["lr"] = 1e-4 23 | 24 | max_txt_l = 22 25 | -------------------------------------------------------------------------------- /videomamba/video_mm/dataset/base_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | from torch.utils.data import Dataset 5 | from dataset.utils import load_image_from_path 6 | 7 | try: 8 | from petrel_client.client import Client 9 | has_client = True 10 | except ImportError: 11 | has_client = False 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class ImageVideoBaseDataset(Dataset): 17 | """Base class that implements the image and video loading methods""" 18 | 19 | media_type = "video" 20 | 21 | def __init__(self): 22 | assert self.media_type in ["image", "video"] 23 | self.data_root = None 24 | self.anno_list = ( 25 | None # list(dict), each dict contains {"image": str, # image or video path} 26 | ) 27 | self.transform = None 28 | self.video_reader = None 29 | self.num_tries = None 30 | self.trimmed30 = False 31 | 32 | self.client = None 33 | if has_client: 34 | self.client = Client('~/petreloss.conf') 35 | 36 | def __getitem__(self, index): 37 | raise NotImplementedError 38 | 39 | def __len__(self): 40 | raise NotImplementedError 41 | 42 | def get_anno(self, index): 43 | """obtain the annotation for one media (video or image) 44 | 45 | Args: 46 | index (int): The media index. 47 | 48 | Returns: dict. 49 | - "image": the filename, video also use "image". 50 | - "caption": The caption for this file. 
51 | 
52 |         """
53 |         anno = self.anno_list[index]
54 |         if self.data_root is not None:
55 |             anno["image"] = os.path.join(self.data_root, anno["image"])
56 |         return anno
57 | 
58 |     def load_and_transform_media_data(self, index, data_path):
59 |         if self.media_type == "image":
60 |             return self.load_and_transform_media_data_image(index, data_path)
61 |         else:
62 |             return self.load_and_transform_media_data_video(index, data_path)
63 | 
64 |     def load_and_transform_media_data_image(self, index, data_path):
65 |         image = load_image_from_path(data_path, client=self.client)
66 |         image = self.transform(image)
67 |         return image, index
68 | 
69 |     def load_and_transform_media_data_video(self, index, data_path):
70 |         for _ in range(self.num_tries):
71 |             try:
72 |                 max_num_frames = self.max_num_frames if hasattr(self, "max_num_frames") else -1
73 |                 frames, frame_indices, video_duration = self.video_reader(
74 |                     data_path, self.num_frames, self.sample_type,
75 |                     max_num_frames=max_num_frames, client=self.client,
76 |                     trimmed30=self.trimmed30
77 |                 )
78 |             except Exception as e:
79 |                 logger.warning(
80 |                     f"Caught exception {e} when loading video {data_path}, "
81 |                     f"randomly sample a new video as replacement"
82 |                 )
83 |                 index = random.randint(0, len(self) - 1)
84 |                 ann = self.get_anno(index)
85 |                 data_path = ann["image"]
86 |                 continue
87 |             # shared aug for video frames
88 |             frames = self.transform(frames)
89 |             return frames, index
90 |         else:
91 |             # for/else: this branch runs only if every retry above failed
92 |             raise RuntimeError(
93 |                 f"Failed to fetch video after {self.num_tries} tries. "
94 |                 f"This might indicate that you have many corrupted videos."
95 |             )
96 | 
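`ImageVideoBaseDataset` leaves `__getitem__`/`__len__` abstract and expects subclasses to fill in the attributes its loaders read (`anno_list`, `transform`, `video_reader`, `num_frames`, `sample_type`, `num_tries`). A minimal illustrative subclass under those assumptions; `ToyVideoDataset` is not a class from this repo:

```python
from dataset.base_dataset import ImageVideoBaseDataset
from dataset.video_utils import VIDEO_READER_FUNCS

class ToyVideoDataset(ImageVideoBaseDataset):
    media_type = "video"

    def __init__(self, anno_list, transform, num_frames=4):
        super().__init__()
        # Each entry follows get_anno()'s contract: {"image": video_path, "caption": str}.
        self.anno_list = anno_list
        self.transform = transform
        self.video_reader = VIDEO_READER_FUNCS["decord"]
        self.num_frames = num_frames
        self.sample_type = "rand"  # random frame sampling, as in the pretrain config
        self.num_tries = 3  # retries before load_and_transform_media_data_video gives up

    def __len__(self):
        return len(self.anno_list)

    def __getitem__(self, index):
        ann = self.get_anno(index)
        frames, index = self.load_and_transform_media_data(index, ann["image"])
        return frames, ann["caption"]
```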
--------------------------------------------------------------------------------
/videomamba/video_mm/dataset/dataloader.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed as dist
3 | from utils.distributed import get_rank, is_dist_avail_and_initialized, is_main_process
4 | import random
5 | import logging
6 | 
7 | logger = logging.getLogger(__name__)
8 | 
9 | 
10 | class MetaLoader(object):
11 |     """ wraps multiple data loaders """
12 |     def __init__(self, name2loader):
13 |         """Iterates over multiple dataloaders; it ensures all processes
14 |         work on data from the same dataloader. This loader ends when the
15 |         shortest dataloader is exhausted.
16 | 
17 |         name2loader: Dict, {name: dataloader}
18 |         """
19 |         self.name2loader = name2loader
20 |         self.name2iter = {name: iter(l) for name, l in name2loader.items()}
21 |         name2index = {name: idx for idx, (name, l) in enumerate(name2loader.items())}
22 |         index2name = {v: k for k, v in name2index.items()}
23 | 
24 |         iter_order = []
25 |         for n, l in name2loader.items():
26 |             iter_order.extend([name2index[n]] * len(l))
27 | 
28 |         random.shuffle(iter_order)
29 |         iter_order = torch.Tensor(iter_order).to(torch.device("cuda")).to(torch.uint8)
30 | 
31 |         # sync
32 |         if is_dist_avail_and_initialized():
33 |             # make sure all processes have the same order so that
34 |             # each step they will have data from the same loader
35 |             dist.broadcast(iter_order, src=0)
36 |         self.iter_order = [index2name[int(e.item())] for e in iter_order.cpu()]
37 | 
38 |         logger.info(str(self))
39 | 
40 |     def __str__(self):
41 |         output = [f"MetaLoader has {len(self.name2loader)} dataloaders, {len(self)} batches in total"]
42 |         for idx, (name, loader) in enumerate(self.name2loader.items()):
43 |             output.append(
44 |                 f"dataloader index={idx} name={name}, batch-size={loader.batch_size} length(#batches)={len(loader)} "
45 |             )
46 |         return "\n".join(output)
47 | 
48 |     def __len__(self):
49 |         return len(self.iter_order)
50 | 
51 |     def __iter__(self):
52 |         """ yield one (name, batch) pair per step, covering every loader once """
53 |         for name in self.iter_order:
54 |             _iter = self.name2iter[name]
55 |             batch = next(_iter)
56 |             yield name, batch
57 | 
--------------------------------------------------------------------------------
/videomamba/video_mm/dataset/qa_dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | from dataset.base_dataset import ImageVideoBaseDataset
3 | from dataset.utils import pre_text, load_anno
4 | from dataset.video_utils import VIDEO_READER_FUNCS
5 | import logging
6 | 
7 | logger = logging.getLogger(__name__)
8 | 
9 | 
10 | class ImageQADataset(ImageVideoBaseDataset):
11 |     media_type = "image"
12 | 
13 |     def __init__(self, ann_file, transform, eos="[SEP]", mode="train", answer_list=None):
14 |         super(ImageQADataset, self).__init__()
15 |         assert mode in ["train", "eval"]
16 |         self.mode = mode
17 |         self.transform = transform
18 |         self.eos = eos
19 | 
20 |         self.anno_list = load_anno(ann_file)
21 | 
22 |         if mode == "eval":
23 |             self.answer_list = json.load(open(answer_list, "r"))
24 | 
25 |     def __len__(self):
26 |         return len(self.anno_list)
27 | 
28 |     def get_answers_with_weights(self, raw_answers):
29 |         if isinstance(raw_answers, str):
30 |             raw_answers = [raw_answers]
31 |         answer_weight = {}
32 |         for answer in raw_answers:
33 |             if answer in answer_weight.keys():
34 |                 answer_weight[answer] += 1 / len(raw_answers)
35 |             else:
36 |                 answer_weight[answer] = 1 / len(raw_answers)
37 | 
38 |         answers = list(answer_weight.keys())
39 |         weights = [answer_weight[a] for a in answers]
40 |         answers = [answer + " " + self.eos for answer in answers]
41 |         return answers, weights
42 | 
43 |     def __getitem__(self, index):
44 |         ann = self.anno_list[index]
45 |         image, index = self.load_and_transform_media_data(index, ann["image"])
46 | 
47 |         question = pre_text(ann["question"])
48 |         if self.mode == "train":
49 |             answers, weights = self.get_answers_with_weights(ann["answer"])
50 |             return image, question, answers, weights
51 |         else:  # self.mode == "eval":
52 |             question_id = ann["question_id"]
53 |             return image, question, question_id
54 | 
55 | 
56 | class VideoQADataset(ImageQADataset):
57 |     media_type = "video"
58 | 
59 |     def __init__(
60 |         self,
ann_file, transform, eos="[SEP]", mode="train", answer_list=None, 61 | num_frames=4, video_reader_type="decord", sample_type="rand", num_tries=1 62 | ): 63 | super(VideoQADataset, self).__init__( 64 | ann_file, transform, eos, mode, answer_list) 65 | self.num_frames = num_frames 66 | self.video_reader_type = video_reader_type 67 | self.video_reader = VIDEO_READER_FUNCS[video_reader_type] 68 | self.sample_type = sample_type 69 | self.num_tries = num_tries 70 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_pt/videomamba_middle_17m/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_17m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=4 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/pretrain.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} 31 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_pt/videomamba_middle_17m_unmasked/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_17m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=4 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/pretrain.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} 31 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_pt/videomamba_middle_25m/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 
8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_25m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=4 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/pretrain.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} 31 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_pt/videomamba_middle_25m_unmasked/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_25m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=4 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/pretrain.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} 31 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_pt/videomamba_middle_5m/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_5m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=2 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/pretrain.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} 31 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_pt/videomamba_middle_5m_unmasked/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 
8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_5m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=2 15 | NUM_GPUS=8 16 | NUM_CPU=128 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/pretrain.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} 31 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_zs/anet/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_5m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=1 16 | NUM_CPU=1 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/retrieval.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} \ 31 | evaluate True \ 32 | pretrained_path your_model_path/videomamba_m16_25M_f8_res224.pth 33 | 34 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_zs/didemo/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_5m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=1 16 | NUM_CPU=1 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/retrieval.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} \ 31 | evaluate True \ 32 | pretrained_path your_model_path/videomamba_m16_25M_f8_res224.pth 33 | 34 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_zs/lsmdc/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 
8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_5m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=1 16 | NUM_CPU=1 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/retrieval.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} \ 31 | evaluate True \ 32 | pretrained_path your_model_path/videomamba_m16_25M_f8_res224.pth 33 | 34 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_zs/msrvtt/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_5m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=1 16 | NUM_CPU=1 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/retrieval.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} \ 31 | evaluate True \ 32 | pretrained_path your_model_path/videomamba_m16_25M_f8_res224.pth 33 | 34 | -------------------------------------------------------------------------------- /videomamba/video_mm/exp_zs/msvd/run.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | echo "PYTHONPATH: ${PYTHONPATH}" 4 | which_python=$(which python) 5 | echo "which python: ${which_python}" 6 | export PYTHONPATH=${PYTHONPATH}:${which_python} 7 | export PYTHONPATH=${PYTHONPATH}:. 
8 | echo "PYTHONPATH: ${PYTHONPATH}" 9 | 10 | JOB_NAME='m16_5m' 11 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 12 | LOG_DIR="$(dirname $0)/logs/${JOB_NAME}" 13 | PARTITION='video5' 14 | NNODE=1 15 | NUM_GPUS=1 16 | NUM_CPU=1 17 | 18 | srun -p ${PARTITION} \ 19 | --job-name=${JOB_NAME} \ 20 | -n${NNODE} \ 21 | --gres=gpu:${NUM_GPUS} \ 22 | --ntasks-per-node=1 \ 23 | --cpus-per-task=${NUM_CPU} \ 24 | bash torchrun.sh \ 25 | --nnodes=${NNODE} \ 26 | --nproc_per_node=${NUM_GPUS} \ 27 | --rdzv_backend=c10d \ 28 | tasks/retrieval.py \ 29 | $(dirname $0)/config.py \ 30 | output_dir ${OUTPUT_DIR} \ 31 | evaluate True \ 32 | pretrained_path your_model_path/videomamba_m16_25M_f8_res224.pth 33 | -------------------------------------------------------------------------------- /videomamba/video_mm/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .umt import UMT 2 | from .umt_qa import UMT_QA 3 | from .umt_videomamba import UMT_VIDEOMAMBA -------------------------------------------------------------------------------- /videomamba/video_mm/models/backbones/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/videomamba/video_mm/models/backbones/__init__.py -------------------------------------------------------------------------------- /videomamba/video_mm/models/backbones/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/videomamba/video_mm/models/backbones/bert/__init__.py -------------------------------------------------------------------------------- /videomamba/video_mm/models/backbones/bert/builder.py: -------------------------------------------------------------------------------- 1 | from .xbert import BertConfig, BertForMaskedLM, BertLMHeadModel, BertModel 2 | 3 | import logging 4 | logger = logging.getLogger(__name__) 5 | 6 | def build_bert(model_config, pretrain, checkpoint): 7 | """build text encoder. 8 | 9 | Args: 10 | model_config (dict): model config. 11 | pretrain (bool): Whether to do pretrain or finetuning. 12 | checkpoint (bool): whether to do gradient_checkpointing. 13 | 14 | Returns: TODO 15 | 16 | """ 17 | bert_config = BertConfig.from_json_file(model_config.text_encoder.config) 18 | bert_config.encoder_width = model_config.vision_encoder.get.d_model if model_config.vision_encoder.get('d_model', 0) else model_config.vision_encoder.embed_dim 19 | bert_config.gradient_checkpointing = checkpoint 20 | bert_config.fusion_layer = model_config.text_encoder.fusion_layer 21 | 22 | if not model_config.multimodal.enable: 23 | bert_config.fusion_layer = bert_config.num_hidden_layers 24 | 25 | if pretrain: 26 | text_encoder, loading_info = BertForMaskedLM.from_pretrained( 27 | model_config.text_encoder.pretrained, 28 | config=bert_config, 29 | output_loading_info=True, 30 | local_files_only=True 31 | ) 32 | else: 33 | text_encoder, loading_info = BertModel.from_pretrained( 34 | model_config.text_encoder.pretrained, 35 | config=bert_config, 36 | add_pooling_layer=False, 37 | output_loading_info=True, 38 | local_files_only=True 39 | ) 40 | 41 | return text_encoder 42 | 43 | 44 | def build_bert_decoder(model_config, checkpoint): 45 | """build text decoder the same as the multimodal encoder. 46 | 47 | Args: 48 | model_config (dict): model config. 
44 | def build_bert_decoder(model_config, checkpoint):
45 |     """build the text decoder the same way as the multimodal encoder.
46 | 
47 |     Args:
48 |         model_config (dict): model config.
49 |         checkpoint (bool): whether to do gradient_checkpointing.
50 | 
51 |     Returns: the BERT text decoder (the layers above `fusion_layer`).
52 | 
53 |     """
54 |     bert_config = BertConfig.from_json_file(model_config.text_encoder.config)
55 |     bert_config.encoder_width = model_config.vision_encoder.d_model if model_config.vision_encoder.get('d_model', 0) else model_config.vision_encoder.embed_dim
56 |     bert_config.gradient_checkpointing = checkpoint
57 | 
58 |     bert_config.fusion_layer = 0
59 |     bert_config.num_hidden_layers = (
60 |         bert_config.num_hidden_layers - model_config.text_encoder.fusion_layer
61 |     )
62 | 
63 |     text_decoder, loading_info = BertLMHeadModel.from_pretrained(
64 |         model_config.text_encoder.pretrained,
65 |         config=bert_config,
66 |         output_loading_info=True,
67 |         local_files_only=True
68 |     )
69 | 
70 |     return text_decoder
71 | 
--------------------------------------------------------------------------------
/videomamba/video_mm/models/backbones/clip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/videomamba/video_mm/models/backbones/clip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/videomamba/video_mm/models/backbones/clip/clip_text.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from .transformer import LayerNorm, TextTransformer
4 | from .tokenizer import tokenize
5 | 
6 | 
7 | class CLIP_text(nn.Module):
8 |     # stolen text encoder from EVA02_CLIP_E_psz14_plus_s9B
9 |     def __init__(self):
10 |         super().__init__()
11 |         text = TextTransformer(
12 |             context_length=77,
13 |             vocab_size=49408,
14 |             width=1280,
15 |             heads=20,
16 |             layers=32,
17 |             ls_init_value=None,
18 |             output_dim=768,  # use 768 for alignment
19 |             act_layer=nn.GELU,
20 |             norm_layer=LayerNorm,
21 |             xattn=False,
22 |             attn_mask=True,
23 |         )
24 |         self.transformer = text.transformer
25 |         self.vocab_size = text.vocab_size
26 |         self.token_embedding = text.token_embedding
27 |         self.positional_embedding = text.positional_embedding
28 |         self.ln_final = text.ln_final
29 |         self.text_projection = text.text_projection
30 |         self.register_buffer('attn_mask', text.attn_mask, persistent=False)
31 | 
32 |     def forward(self, text):
33 |         cast_dtype = self.transformer.get_cast_dtype()
34 | 
35 |         x = self.token_embedding(text).to(cast_dtype)  # [batch_size, n_ctx, d_model]
36 | 
37 |         x = x + self.positional_embedding.to(cast_dtype)
38 |         x = x.permute(1, 0, 2)  # NLD -> LND
39 |         x = self.transformer(x, attn_mask=self.attn_mask)
40 |         x = x.permute(1, 0, 2)  # LND -> NLD
41 |         x = self.ln_final(x)  # [batch_size, n_ctx, transformer.width]
42 |         # take features from the eot embedding (eot_token is the highest number in each sequence)
43 |         x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
44 |         return x
45 | 
46 | 
47 | class Tokenizer(nn.Module):
48 |     def __init__(self):
49 |         super().__init__()
50 |         self.tokenizer = tokenize
51 | 
52 |     def forward(self, text):
53 |         text = self.tokenizer(text)
54 |         return text
--------------------------------------------------------------------------------
/videomamba/video_mm/models/backbones/videomamba/__init__.py:
--------------------------------------------------------------------------------
1 | from .videomamba import build_videomamba
2 | 
3 | from .clip import clip_b16, clip_l14,
4 | 
5 | 
6 | def build_clip(config):
7 |     model_cls = config.vision_encoder.clip_teacher
8 |     model = eval(model_cls)(
9 |         input_resolution=config.vision_encoder.clip_img_size,
10 |         clip_return_layer=config.vision_encoder.clip_return_layer,
11 |         clip_return_interval=config.vision_encoder.clip_return_interval,
12 |     )
13 |     return model
14 | 
15 | 
16 | def build_text_clip(clip_teacher):
17 |     model = eval(clip_teacher)()
18 |     return model
-------------------------------------------------------------------------------- /videomamba/video_mm/models/backbones/vit/__init__.py: --------------------------------------------------------------------------------
1 | from .vit import build_vit
2 | 
3 | from .clip import clip_b16, clip_l14, clip_l14_336
4 | 
5 | 
6 | def build_clip(config):
7 |     model_cls = config.vision_encoder.clip_teacher
8 |     model = eval(model_cls)(
9 |         input_resolution=config.vision_encoder.clip_img_size,
10 |         clip_return_layer=config.vision_encoder.clip_return_layer,
11 |         clip_return_interval=config.vision_encoder.clip_return_interval,
12 |     )
13 |     return model
14 | 
15 | 
16 | def build_text_clip(clip_teacher):
17 |     model = eval(clip_teacher)()
18 |     return model
-------------------------------------------------------------------------------- /videomamba/video_mm/models/mask.py: --------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | 
4 | 
5 | def TubeMaskingGenerator(input_size, mask_ratio, batch, device='cuda'):
6 |     # tube masking: one random spatial mask per sample, repeated across all frames
7 |     frames, height, width = input_size
8 |     num_patches_per_frame = height * width
9 |     num_masks_per_frame = int(mask_ratio * num_patches_per_frame)
10 | 
11 |     mask_list = []
12 |     for _ in range(batch):
13 |         mask_per_frame = np.hstack([
14 |             np.zeros(num_patches_per_frame - num_masks_per_frame),
15 |             np.ones(num_masks_per_frame),
16 |         ])
17 |         np.random.shuffle(mask_per_frame)
18 |         mask_list.append(np.tile(mask_per_frame, (frames, 1)).flatten())
19 |     mask = torch.Tensor(np.array(mask_list)).to(device, non_blocking=True).to(torch.bool)
20 |     return mask
21 | 
22 | 
23 | def RandomMaskingGenerator(input_size, mask_ratio, batch, device='cuda'):
24 |     # random masking: masked positions drawn independently over the whole clip
25 |     frames, height, width = input_size
26 | 
27 |     num_patches = frames * height * width  # 8x14x14
28 |     num_mask = int(mask_ratio * num_patches)
29 | 
30 |     mask_list = []
31 |     for _ in range(batch):
32 |         mask = np.hstack([
33 |             np.zeros(num_patches - num_mask),
34 |             np.ones(num_mask),
35 |         ])
36 |         np.random.shuffle(mask)
37 |         mask_list.append(mask)
38 |     mask = torch.Tensor(np.array(mask_list)).to(device, non_blocking=True).to(torch.bool)
39 |     return mask
40 | 
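41 | # Minimal usage sketch (added for illustration; shapes assume 16x16 patches on
42 | # 224x224 frames, i.e. a 14x14 grid, and the mask_ratio used by the pretrain script):
43 | #
44 | #   mask = TubeMaskingGenerator((8, 14, 14), mask_ratio=0.8, batch=2, device='cpu')
45 | #   mask.shape  # torch.Size([2, 1568]): 8*14*14 bools per sample, ~80% True (masked)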
-------------------------------------------------------------------------------- /videomamba/video_mm/torchrun.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | MASTER_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
3 | ALL_NODES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
4 | 
5 | echo "All nodes used:"
6 | echo ${ALL_NODES}
7 | echo "Master node:"
8 | echo ${MASTER_NODE}
9 | echo "Args:"
10 | echo "$@"
11 | 
12 | # the rendezvous port is fixed so that every task agrees on the same endpoint
13 | torchrun --rdzv_endpoint=${MASTER_NODE}:10054 "$@"
14 | 
-------------------------------------------------------------------------------- /videomamba/video_mm/utils/scheduler.py: --------------------------------------------------------------------------------
1 | """ Scheduler Factory
2 | Hacked together by / Copyright 2020 Ross Wightman
3 | """
4 | import math
5 | from torch.optim import Optimizer
6 | from torch.optim.lr_scheduler import LambdaLR
7 | 
8 | 
9 | def create_scheduler(args, optimizer):
10 |     lr_scheduler = None
11 |     if args.sched == 'cosine':
12 |         lr_scheduler = get_cosine_schedule_with_warmup(
13 |             optimizer,
14 |             num_warmup_steps=args.num_warmup_steps,
15 |             num_training_steps=args.num_training_steps,
16 |             num_cycles=0.5,
17 |             min_lr_multi=args.min_lr_multi
18 |         )
19 |     return lr_scheduler
20 | 
21 | 
22 | def get_cosine_schedule_with_warmup(
23 |     optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int,
24 |     num_cycles: float = 0.5, min_lr_multi: float = 0., last_epoch: int = -1
25 | ):
26 |     """
27 |     Modified from https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/optimization.py
28 | 
29 |     Create a schedule with a learning rate that decreases following the values of the cosine function from the
30 |     initial lr set in the optimizer to 0, after a warmup period during which it increases linearly from 0 to the
31 |     initial lr set in the optimizer.
32 |     Args:
33 |         optimizer ([`~torch.optim.Optimizer`]):
34 |             The optimizer for which to schedule the learning rate.
35 |         num_warmup_steps (`int`):
36 |             The number of steps for the warmup phase.
37 |         num_training_steps (`int`):
38 |             The total number of training steps.
39 |         num_cycles (`float`, *optional*, defaults to 0.5):
40 |             The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
41 |             following a half-cosine).
42 |         min_lr_multi (`float`, *optional*, defaults to 0):
43 |             The minimum learning rate multiplier. Thus the minimum learning rate is base_lr * min_lr_multi.
44 |         last_epoch (`int`, *optional*, defaults to -1):
45 |             The index of the last epoch when resuming training.
46 |     Return:
47 |         `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
48 |     """
49 | 
50 |     def lr_lambda(current_step):
51 |         if current_step < num_warmup_steps:
52 |             return max(min_lr_multi, float(current_step) / float(max(1, num_warmup_steps)))
53 |         progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
54 |         return max(min_lr_multi, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
55 | 
56 |     return LambdaLR(optimizer, lr_lambda, last_epoch)
57 | 
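58 | # Worked example (added for illustration): with num_warmup_steps=10,
59 | # num_training_steps=100, num_cycles=0.5 and min_lr_multi=0, lr_lambda gives
60 | #   step 5   -> 5/10 = 0.5                    (linear warmup)
61 | #   step 10  -> 1.0                           (peak, start of the cosine decay)
62 | #   step 55  -> 0.5*(1+cos(pi*0.5)) = 0.5     (halfway through the decay)
63 | #   step 100 -> 0.5*(1+cos(pi)) = 0.0         (end of training)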
-------------------------------------------------------------------------------- /videomamba/video_sm/DATASET.md: --------------------------------------------------------------------------------
1 | # Dataset Preparation
2 | 
3 | ## Short-term Video Understanding
4 | 
5 | We follow [UniFormerV2](https://github.com/OpenGVLab/UniFormerV2/) to prepare the datasets. All the files can be found [here](https://drive.google.com/drive/folders/17VB-XdF3Kfr9ORmnGyXCxTMs86n0L4QL?usp=sharing), including:
6 | - [Kinetics-400](https://www.deepmind.com/open-source/kinetics)
7 | - [Something-Something V2](https://developer.qualcomm.com/software/ai-datasets/something-something)
8 | 
9 | > Since some videos in Kinetics may no longer be available, this leads to a small performance gap. Our version of Kinetics-400 can be downloaded via **[Baidu Cloud](https://pan.baidu.com/s/150AE6OK9GjWQQvXv_db8vw) (password: li0f)**
10 | 
11 | 
12 | ## Long-term Video Understanding
13 | 
14 | We follow [ViS4mer](https://github.com/md-mohaiminul/ViS4mer) to prepare the datasets. All the files can be found [here](https://github.com/md-mohaiminul/ViS4mer/tree/main/data), including:
15 | - [Breakfast](https://serre-lab.clps.brown.edu/resource/breakfast-actions-dataset/)
16 | - [COIN](https://coin-dataset.github.io/)
17 | - [LVU](https://github.com/chaoyuaw/lvu)
18 | 
19 | > We simply use the raw videos, instead of extracting features.
-------------------------------------------------------------------------------- /videomamba/video_sm/README.md: --------------------------------------------------------------------------------
1 | # Single-modality Video Understanding
2 | 
3 | We currently release the code and models for:
4 | 
5 | - [x] **Masked Pretraining**
6 | 
7 | - [x] **Short-term Video Understanding**
8 |   - K400 and SthSthV2
9 | 
10 | - [x] **Long-term Video Understanding**
11 |   - Breakfast, COIN and LVU
12 | 
13 | 
14 | 
15 | 
16 | ## Update
17 | 
18 | - :fire: **03/12/2024**: Pretrained models on ImageNet-1K are released.
19 | 
20 | 
21 | ## Datasets
22 | 
23 | You can find the dataset instructions in [DATASET](./DATASET.md).
24 | 
25 | ## Model ZOO
26 | 
27 | You can find all the models and the scripts in [MODEL_ZOO](./MODEL_ZOO.md).
28 | 
29 | ## Usage
30 | 
31 | ### Masked Pretraining
32 | 
33 | We use [CLIP](https://github.com/openai/CLIP) pretrained models as the unmasked teachers by default:
34 | - Follow [extract.ipynb](./models/extract_clip/extract.ipynb) to extract the visual encoder from CLIP.
35 | - Change `MODEL_PATH` in [clip.py](./models/clip.py).
36 | 
37 | For training, you can simply run the pretraining scripts as follows:
38 | ```shell
39 | bash ./exp/k400/videomamba_middle_mask/run_mask_pretrain.sh
40 | ```
41 | 
42 | > **Notes:**
43 | > 1. Change `DATA_PATH` to your data path before running the scripts.
44 | > 2. `--sampling_rate` is set to 1 for **sparse sampling**.
45 | > 3. The latest checkpoint is saved automatically during training, so we use a large `--save_ckpt_freq`.
46 | > 4. For VideoMamba-M, we use CLIP-B-ViT as the teacher.
47 | 
48 | 
49 | ### Short-term Video Understanding
50 | 
51 | For finetuning, you can simply run the fine-tuning scripts as follows:
52 | ```shell
53 | bash ./exp/k400/videomamba_middle_mask/run_f8x224.sh
54 | ```
55 | 
56 | > **Notes:**
57 | > 1. Change `DATA_PATH` and `PREFIX` to your data paths before running the scripts.
58 | > 2. Set `--finetune` when using a masked pretrained model.
59 | > 3. The best checkpoint will be automatically evaluated with `--test_best`.
60 | > 4. Set `--test_num_segment` and `--test_num_crop` for different evaluation strategies.
61 | > 5. To only run evaluation, just set `--eval`.
62 | 
63 | 
64 | ### Long-term Video Understanding
65 | 
66 | For Breakfast and COIN, you can simply run the fine-tuning scripts as follows:
67 | ```shell
68 | bash ./exp/breakfast/videomamba_middle_mask/run_f32x224.sh
69 | ```
70 | 
71 | For LVU, which has both classification and regression tasks, you can run the fine-tuning scripts as follows:
72 | ```shell
73 | # classification
74 | bash ./exp/lvu/run_class.sh
75 | # regression
76 | bash ./exp/lvu/run_regression.sh
77 | ```
78 | > **Notes:**
79 | > For regression tasks, the data should be preprocessed with normalization as in [ViS4mer](https://github.com/md-mohaiminul/ViS4mer/blob/main/datasets/lvu_dataset.py).
80 | 
81 | 
82 | ### :warning: Using Trimmed Video
83 | 
84 | By default, we use the `Kinetics_sparse` dataset class for all of these datasets.
**However, in [ViS4mer](https://github.com/md-mohaiminul/ViS4mer/blob/main/datasets/lvu_dataset.py), the authors use trimmed clips with a sliding window, which may improve the results.** We also provide a dataset variant with a sliding window, which can be used as follows:
85 | ```shell
86 | # classification
87 | bash ./exp/lvu/run_class_trim.sh
88 | # regression
89 | bash ./exp/lvu/run_regression_trim.sh
90 | ```
91 | 
92 | > **Notes:**
93 | > 1. Set `trimmed` to the length of the trimmed videos.
94 | > 2. Set `time_stride` to the length of the sliding window.
-------------------------------------------------------------------------------- /videomamba/video_sm/datasets/__init__.py: --------------------------------------------------------------------------------
1 | from .build import build_dataset, build_pretraining_dataset
-------------------------------------------------------------------------------- /videomamba/video_sm/engines/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenGVLab/VideoMamba/37355c26d0ae99ca2459f6d4044a5f509031a79f/videomamba/video_sm/engines/__init__.py
-------------------------------------------------------------------------------- /videomamba/video_sm/exp/breakfast/videomamba_middle/run_f32x224.sh: --------------------------------------------------------------------------------
1 | export MASTER_PORT=$((12000 + $RANDOM % 20000))
2 | export OMP_NUM_THREADS=1
3 | 
4 | JOB_NAME='videomamba_middle_f32_res224'
5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME"
6 | LOG_DIR="./logs/${JOB_NAME}"
7 | PREFIX='your_breakfast_path'
8 | DATA_PATH='your_breakfast_metadata_path'
9 | 
10 | PARTITION='video5'
11 | GPUS=8
12 | GPUS_PER_NODE=8
13 | CPUS_PER_TASK=16
14 | 
15 | srun -p $PARTITION \
16 | --job-name=${JOB_NAME} \
17 | --gres=gpu:${GPUS_PER_NODE} \
18 | --ntasks=${GPUS} \
19 | --ntasks-per-node=${GPUS_PER_NODE} \
20 | --cpus-per-task=${CPUS_PER_TASK} \
21 | --kill-on-bad-exit=1 \
22 | python run_class_finetuning.py \
23 | --model videomamba_middle \
24 | --finetune your_model_path/videomamba_m16_k400_f32_res224.pth \
25 | --data_path ${DATA_PATH} \
26 | --prefix ${PREFIX} \
27 | --data_set 'Kinetics_sparse' \
28 | --split ',' \
29 | --nb_classes 10 \
30 | --log_dir ${OUTPUT_DIR} \
31 | --output_dir ${OUTPUT_DIR} \
32 | --batch_size 8 \
33 | --num_sample 2 \
34 | --input_size 224 \
35 | --short_side_size 224 \
36 | --save_ckpt_freq 100 \
37 | --num_frames 32 \
38 | --orig_t_size 32 \
39 | --num_workers 12 \
40 | --warmup_epochs 5 \
41 | --tubelet_size 1 \
42 | --epochs 50 \
43 | --lr 2e-4 \
44 | --layer_decay 0.8 \
45 | --drop_path 0.8 \
46 | --opt adamw \
47 | --opt_betas 0.9 0.999 \
48 | --weight_decay 0.05 \
49 | --test_num_segment 4 \
50 | --test_num_crop 3 \
51 | --dist_eval \
52 | --test_best \
53 | --bf16
54 | 
-------------------------------------------------------------------------------- /videomamba/video_sm/exp/breakfast/videomamba_middle/run_f64x224.sh: --------------------------------------------------------------------------------
1 | export MASTER_PORT=$((12000 + $RANDOM % 20000))
2 | export OMP_NUM_THREADS=1
3 | 
4 | JOB_NAME='videomamba_middle_f64_res224'
5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME"
6 | LOG_DIR="./logs/${JOB_NAME}"
7 | PREFIX='your_breakfast_path'
8 | DATA_PATH='your_breakfast_metadata_path'
9 | 
10 | PARTITION='video5'
11 | GPUS=8
12 | GPUS_PER_NODE=8
13 | CPUS_PER_TASK=16
14 | 
15 | srun -p $PARTITION \
16 | --job-name=${JOB_NAME} \
17 | --gres=gpu:${GPUS_PER_NODE} \
18 | --ntasks=${GPUS} \
19 | 
--ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 10 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 4 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --orig_t_size 64 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 50 \ 43 | --lr 2e-4 \ 44 | --layer_decay 0.8 \ 45 | --drop_path 0.8 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.05 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/breakfast/videomamba_middle_mask/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_breakfast_path' 8 | DATA_PATH='your_breakfast_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_mask_ft_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 10 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 45 \ 43 | --lr 2e-4 \ 44 | --layer_decay 0.8 \ 45 | --drop_path 0.4 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.05 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/breakfast/videomamba_middle_mask/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_breakfast_path' 8 | DATA_PATH='your_breakfast_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | 
--finetune your_model_path/videomamba_m16_k400_mask_ft_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 10 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 4 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --orig_t_size 64 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 45 \ 43 | --lr 2e-4 \ 44 | --layer_decay 0.8 \ 45 | --drop_path 0.4 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.05 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/breakfast/videomamba_small/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_breakfast_path' 8 | DATA_PATH='your_breakfast_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --finetune your_model_path/videomamba_s16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 10 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 16 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 50 \ 43 | --lr 2e-4 \ 44 | --drop_path 0.4 \ 45 | --opt adamw \ 46 | --opt_betas 0.9 0.999 \ 47 | --weight_decay 0.05 \ 48 | --test_num_segment 4 \ 49 | --test_num_crop 3 \ 50 | --dist_eval \ 51 | --test_best \ 52 | --bf16 53 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/breakfast/videomamba_small/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_breakfast_path' 8 | DATA_PATH='your_breakfast_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --finetune your_model_path/videomamba_s16_k400_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 10 \ 30 | --log_dir 
${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --orig_t_size 64 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 50 \ 43 | --lr 2e-4 \ 44 | --drop_path 0.4 \ 45 | --opt adamw \ 46 | --opt_betas 0.9 0.999 \ 47 | --weight_decay 0.05 \ 48 | --test_num_segment 4 \ 49 | --test_num_crop 3 \ 50 | --dist_eval \ 51 | --test_best \ 52 | --bf16 53 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/breakfast/videomamba_tiny/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_breakfast_path' 8 | DATA_PATH='your_breakfast_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 10 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 32 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 70 \ 43 | --lr 2e-4 \ 44 | --drop_path 0.1 \ 45 | --aa rand-m5-n2-mstd0.25-inc1 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.1 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/breakfast/videomamba_tiny/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_breakfast_path' 8 | DATA_PATH='your_breakfast_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 10 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 16 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --orig_t_size 64 \ 
39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 70 \ 43 | --lr 2e-4 \ 44 | --drop_path 0.1 \ 45 | --aa rand-m5-n2-mstd0.25-inc1 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.1 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/coin/videomamba_middle/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_coin_path' 8 | DATA_PATH='your_coin_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 180 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 30 \ 43 | --lr 2e-4 \ 44 | --layer_decay 0.8 \ 45 | --drop_path 0.8 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.05 \ 49 | --test_num_segment 10 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/coin/videomamba_middle/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_coin_path' 8 | DATA_PATH='your_coin_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 180 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 4 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --orig_t_size 64 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 30 \ 43 | --lr 2e-4 \ 44 | --layer_decay 0.8 \ 45 | --drop_path 0.8 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | 
--weight_decay 0.05 \ 49 | --test_num_segment 10 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/coin/videomamba_middle_mask/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_coin_path' 8 | DATA_PATH='your_coin_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_mask_ft_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 180 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 30 \ 43 | --lr 2e-4 \ 44 | --layer_decay 0.8 \ 45 | --drop_path 0.4 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.05 \ 49 | --test_num_segment 10 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/coin/videomamba_middle_mask/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_coin_path' 8 | DATA_PATH='your_coin_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_mask_ft_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 180 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 4 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --orig_t_size 64 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 30 \ 43 | --lr 2e-4 \ 44 | --layer_decay 0.8 \ 45 | --drop_path 0.4 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.05 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | 
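55 | # Added note (illustration, not part of the original script): with the settings
56 | # above, each test video is scored over test_num_segment x test_num_crop = 4 x 3 = 12
57 | # views, and one optimizer step sees batch_size x num_sample x GPUS = 4 x 2 x 8 = 64 clips.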
-------------------------------------------------------------------------------- /videomamba/video_sm/exp/coin/videomamba_small/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_coin_path' 8 | DATA_PATH='your_coin_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --finetune your_model_path/videomamba_s16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 180 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 16 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 35 \ 43 | --lr 2e-4 \ 44 | --drop_path 0.4 \ 45 | --opt adamw \ 46 | --opt_betas 0.9 0.999 \ 47 | --weight_decay 0.05 \ 48 | --test_num_segment 10 \ 49 | --test_num_crop 3 \ 50 | --dist_eval \ 51 | --test_best \ 52 | --bf16 53 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/coin/videomamba_small/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_coin_path' 8 | DATA_PATH='your_coin_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --finetune your_model_path/videomamba_s16_k400_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 180 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --orig_t_size 64 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 35 \ 43 | --lr 2e-4 \ 44 | --drop_path 0.4 \ 45 | --opt adamw \ 46 | --opt_betas 0.9 0.999 \ 47 | --weight_decay 0.05 \ 48 | --test_num_segment 10 \ 49 | --test_num_crop 3 \ 50 | --dist_eval \ 51 | --test_best \ 52 | --bf16 53 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/coin/videomamba_tiny/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 
| export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_coin_path' 8 | DATA_PATH='your_coin_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 180 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 32 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 40 \ 43 | --lr 2e-4 \ 44 | --drop_path 0.1 \ 45 | --aa rand-m5-n2-mstd0.25-inc1 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.1 \ 49 | --test_num_segment 10 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/coin/videomamba_tiny/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_coin_path' 8 | DATA_PATH='your_coin_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 180 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 16 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --orig_t_size 64 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 40 \ 43 | --lr 2e-4 \ 44 | --drop_path 0.1 \ 45 | --aa rand-m5-n2-mstd0.25-inc1 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.1 \ 49 | --test_num_segment 10 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle/run_f16x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f16_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | 
DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 16 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 16 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 50 \ 41 | --lr 1e-4 \ 42 | --drop_path 0.8 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.05 \ 46 | --test_num_segment 4 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 8 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 32 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 50 \ 41 | --lr 1e-4 \ 42 | --drop_path 0.8 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.05 \ 46 | --test_num_segment 4 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --data_path ${DATA_PATH} \ 25 | 
--prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 4 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 64 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 50 \ 41 | --lr 1e-4 \ 42 | --drop_path 0.8 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.05 \ 46 | --test_num_segment 4 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle/run_f64x224to384.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f64_res224to384' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 400 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 1 \ 33 | --update_freq 2 \ 34 | --num_sample 2 \ 35 | --input_size 384 \ 36 | --short_side_size 384 \ 37 | --save_ckpt_freq 100 \ 38 | --num_frames 64 \ 39 | --orig_t_size 64 \ 40 | --num_workers 12 \ 41 | --warmup_epochs 1 \ 42 | --tubelet_size 1 \ 43 | --epochs 6 \ 44 | --lr 5e-6 \ 45 | --drop_path 0.8 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 1e-8 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle/run_f8x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 32 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 8 \ 37 | --num_workers 12 \ 
38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 50 \ 41 | --lr 2e-4 \ 42 | --drop_path 0.8 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.05 \ 46 | --test_num_segment 4 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle_mask/run_f16x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_ft_f16_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_mask_pt_f8_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 400 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 16 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 16 \ 38 | --num_workers 12 \ 39 | --warmup_epochs 5 \ 40 | --tubelet_size 1 \ 41 | --epochs 45 \ 42 | --lr 1e-4 \ 43 | --layer_decay 0.8 \ 44 | --drop_path 0.4 \ 45 | --opt adamw \ 46 | --opt_betas 0.9 0.999 \ 47 | --weight_decay 0.05 \ 48 | --test_num_segment 4 \ 49 | --test_num_crop 3 \ 50 | --dist_eval \ 51 | --test_best \ 52 | --bf16 53 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle_mask/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_ft_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_mask_pt_f8_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 400 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --num_workers 12 \ 39 | --warmup_epochs 5 \ 40 | --tubelet_size 1 \ 41 | --epochs 45 \ 42 | --lr 1e-4 \ 43 | --layer_decay 0.8 \ 44 | --drop_path 0.4 \ 45 | --opt adamw \ 46 | --opt_betas 0.9 0.999 \ 47 | --weight_decay 0.05 \ 48 | --test_num_segment 4 \ 49 | --test_num_crop 
3 \ 50 | --dist_eval \ 51 | --test_best \ 52 | --bf16 53 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle_mask/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_ft_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_mask_pt_f8_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 400 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 4 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --num_workers 12 \ 39 | --warmup_epochs 5 \ 40 | --tubelet_size 1 \ 41 | --epochs 45 \ 42 | --lr 1e-4 \ 43 | --layer_decay 0.8 \ 44 | --drop_path 0.4 \ 45 | --opt adamw \ 46 | --opt_betas 0.9 0.999 \ 47 | --weight_decay 0.05 \ 48 | --test_num_segment 4 \ 49 | --test_num_crop 3 \ 50 | --dist_eval \ 51 | --test_best \ 52 | --bf16 53 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle_mask/run_f64x224to384.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_ft_f64_res224to384' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_mask_ft_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 400 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 1 \ 33 | --update_freq 2 \ 34 | --num_sample 2 \ 35 | --input_size 384 \ 36 | --short_side_size 384 \ 37 | --save_ckpt_freq 100 \ 38 | --num_frames 64 \ 39 | --orig_t_size 64 \ 40 | --num_workers 12 \ 41 | --warmup_epochs 1 \ 42 | --tubelet_size 1 \ 43 | --epochs 6 \ 44 | --lr 5e-6 \ 45 | --layer_decay 0.8 \ 46 | --drop_path 0.4 \ 47 | --opt adamw \ 48 | --opt_betas 0.9 0.999 \ 49 | --weight_decay 1e-8 \ 50 | --test_num_segment 4 \ 51 | --test_num_crop 3 \ 52 | --dist_eval \ 53 | --test_best \ 54 | --bf16 55 | 56 | -------------------------------------------------------------------------------- 
/videomamba/video_sm/exp/k400/videomamba_middle_mask/run_f8x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_ft_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_k400_mask_pt_f8_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 400 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 32 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 8 \ 38 | --num_workers 12 \ 39 | --warmup_epochs 5 \ 40 | --tubelet_size 1 \ 41 | --epochs 45 \ 42 | --lr 1e-4 \ 43 | --layer_decay 0.8 \ 44 | --drop_path 0.4 \ 45 | --opt adamw \ 46 | --opt_betas 0.9 0.999 \ 47 | --weight_decay 0.05 \ 48 | --test_num_segment 4 \ 49 | --test_num_crop 3 \ 50 | --dist_eval \ 51 | --test_best \ 52 | --bf16 53 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_middle_mask/run_mask_pretrain.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_pt_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path/train.csv' 9 | 10 | PARTITION='video5' 11 | GPUS=32 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | python -u run_videomamba_pretraining.py \ 22 | --data_path ${DATA_PATH} \ 23 | --prefix ${PREFIX} \ 24 | --num_sample 1 \ 25 | --split ',' \ 26 | --flip True \ 27 | --mask_type 'attention' \ 28 | --mask_ratio 0.8 \ 29 | --model 'videomamba_middle_pretrain' \ 30 | --clip_teacher 'clip_b16' \ 31 | --clip_loss_ratio 1 \ 32 | --clip_loss_type 'l2' \ 33 | --clip_decoder_embed_dim 576 \ 34 | --clip_output_dim 512 \ 35 | --clip_norm_type 'l2' \ 36 | --clip_return_layer 1 \ 37 | --clip_return_interval 1 \ 38 | --clip_student_return_interval 1 \ 39 | --clip_return_cls \ 40 | --clip_return_attn \ 41 | --tubelet_size 1 \ 42 | --lr 1.5e-4 \ 43 | --drop_path 0.4 \ 44 | --batch_size 64 \ 45 | --num_segments 8 \ 46 | --num_frames 8 \ 47 | --sampling_rate 1 \ 48 | --num_workers 12 \ 49 | --opt adamw \ 50 | --opt_betas 0.9 0.95 \ 51 | --warmup_epochs 40 \ 52 | --save_ckpt_freq 1000 \ 53 | --epochs 801 \ 54 | --log_dir ${OUTPUT_DIR} \ 55 | --output_dir ${OUTPUT_DIR} \ 56 | --bf16 57 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_small/run_f16x224.sh: 
-------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f16_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 32 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 16 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 50 \ 41 | --lr 2e-4 \ 42 | --drop_path 0.35 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.05 \ 46 | --test_num_segment 4 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_small/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 16 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 32 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 50 \ 41 | --lr 2e-4 \ 42 | --drop_path 0.35 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.05 \ 46 | --test_num_segment 4 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_small/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p 
$PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 8 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 64 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 50 \ 41 | --lr 2e-4 \ 42 | --drop_path 0.35 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.05 \ 46 | --test_num_segment 4 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_small/run_f64x224to384.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f64_res224to384' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=32 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --finetune your_model_path/videomamba_s16_k400_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 400 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 2 \ 33 | --update_freq 2 \ 34 | --num_sample 2 \ 35 | --input_size 384 \ 36 | --short_side_size 384 \ 37 | --save_ckpt_freq 100 \ 38 | --num_frames 64 \ 39 | --orig_t_size 64 \ 40 | --num_workers 12 \ 41 | --warmup_epochs 1 \ 42 | --tubelet_size 1 \ 43 | --epochs 6 \ 44 | --lr 5e-6 \ 45 | --drop_path 0.35 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 1e-8 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_small/run_f8x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 
| --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 64 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 8 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 50 \ 41 | --lr 2e-4 \ 42 | --drop_path 0.35 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.05 \ 46 | --test_num_segment 4 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_tiny/run_f16x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f16_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 64 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 16 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 70 \ 41 | --lr 2e-4 \ 42 | --drop_path 0.1 \ 43 | --aa rand-m5-n2-mstd0.25-inc1 \ 44 | --opt adamw \ 45 | --opt_betas 0.9 0.999 \ 46 | --weight_decay 0.1 \ 47 | --test_num_segment 4 \ 48 | --test_num_crop 3 \ 49 | --dist_eval \ 50 | --test_best \ 51 | --bf16 52 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_tiny/run_f32x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 32 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 32 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 70 \ 41 | --lr 2e-4 \ 42 | --drop_path 0.1 \ 43 | --aa 
rand-m5-n2-mstd0.25-inc1 \ 44 | --opt adamw \ 45 | --opt_betas 0.9 0.999 \ 46 | --weight_decay 0.1 \ 47 | --test_num_segment 4 \ 48 | --test_num_crop 3 \ 49 | --dist_eval \ 50 | --test_best \ 51 | --bf16 52 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_tiny/run_f64x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f64_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 16 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 64 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 70 \ 41 | --lr 2e-4 \ 42 | --drop_path 0.1 \ 43 | --aa rand-m5-n2-mstd0.25-inc1 \ 44 | --opt adamw \ 45 | --opt_betas 0.9 0.999 \ 46 | --weight_decay 0.1 \ 47 | --test_num_segment 4 \ 48 | --test_num_crop 3 \ 49 | --dist_eval \ 50 | --test_best \ 51 | --bf16 52 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/k400/videomamba_tiny/run_f64x224to384.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f64_res224to384' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_k400_f64_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 400 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 6 \ 33 | --num_sample 2 \ 34 | --input_size 384 \ 35 | --short_side_size 384 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 64 \ 38 | --orig_t_size 64 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 2 \ 41 | --tubelet_size 1 \ 42 | --epochs 10 \ 43 | --lr 5e-6 \ 44 | --drop_path 0.1 \ 45 | --aa rand-m5-n2-mstd0.25-inc1 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 1e-8 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | --------------------------------------------------------------------------------
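The recipes above all scale the same way: as --num_frames or --input_size grows, the per-GPU --batch_size drops, and more GPUs or gradient accumulation (--update_freq) keep the optimizer-step batch workable. Below is a minimal sketch of that arithmetic with numbers read off the scripts above; the effective_batch helper is illustrative, not part of the repository, and it assumes the one-process-per-GPU layout implied by --ntasks being set to the GPU count.

# Effective clips per optimizer step for some of the Slurm recipes above (sketch).
# Assumes one DDP process per GPU (--ntasks == GPUS) and that --update_freq
# (gradient accumulation) defaults to 1 when a script does not pass it.
def effective_batch(gpus: int, batch_size: int, update_freq: int = 1) -> int:
    return gpus * batch_size * update_freq

configs = {
    'videomamba_tiny_f16_res224': (8, 64, 1),
    'videomamba_tiny_f32_res224': (8, 32, 1),
    'videomamba_tiny_f64_res224': (16, 16, 1),
    'videomamba_small_f64_res224to384': (32, 2, 2),  # --update_freq 2 offsets --batch_size 2
}
for name, (gpus, bs, freq) in configs.items():
    print(f'{name}: {effective_batch(gpus, bs, freq)} clips/step')

Note that --num_sample 2 draws two augmented views per clip, so each step actually sees twice as many crops as printed here.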
/videomamba/video_sm/exp/k400/videomamba_tiny/run_f8x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_k400_path' 8 | DATA_PATH='your_k400_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'Kinetics_sparse' \ 27 | --split ',' \ 28 | --nb_classes 400 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 64 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 8 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 70 \ 41 | --lr 2e-4 \ 42 | --drop_path 0.1 \ 43 | --aa rand-m5-n2-mstd0.25-inc1 \ 44 | --opt adamw \ 45 | --opt_betas 0.9 0.999 \ 46 | --weight_decay 0.1 \ 47 | --test_num_segment 4 \ 48 | --test_num_crop 3 \ 49 | --dist_eval \ 50 | --test_best \ 51 | --bf16 52 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/lvu/run_class.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_lvu_path' 8 | DATA_PATH='your_lvu_metadata_path/director' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 10 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 70 \ 43 | --lr 4e-4 \ 44 | --drop_path 0.15 \ 45 | --aa rand-m5-n2-mstd0.25-inc1 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.1 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/lvu/run_class_trim.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | 
LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_lvu_path' 8 | DATA_PATH='your_lvu_metadata_path/director' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'LVU' \ 28 | --split ',' \ 29 | --nb_classes 10 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --trimmed 60 \ 40 | --time_stride 16 \ 41 | --num_workers 12 \ 42 | --warmup_epochs 5 \ 43 | --tubelet_size 1 \ 44 | --epochs 70 \ 45 | --lr 4e-4 \ 46 | --drop_path 0.15 \ 47 | --aa rand-m5-n2-mstd0.25-inc1 \ 48 | --opt adamw \ 49 | --opt_betas 0.9 0.999 \ 50 | --weight_decay 0.1 \ 51 | --test_num_segment 4 \ 52 | --test_num_crop 3 \ 53 | --dist_eval \ 54 | --test_best \ 55 | --bf16 56 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/lvu/run_regression.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_lvu_path' 8 | DATA_PATH='your_lvu_metadata_path/director' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_regression_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'LVU' \ 28 | --split ',' \ 29 | --nb_classes 1 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 5 \ 41 | --tubelet_size 1 \ 42 | --epochs 70 \ 43 | --lr 4e-4 \ 44 | --drop_path 0.15 \ 45 | --aa rand-m5-n2-mstd0.25-inc1 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 0.1 \ 49 | --test_num_segment 4 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/lvu/run_regression_trim.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f32_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_lvu_path' 8 | DATA_PATH='your_lvu_metadata_path/director' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun 
-p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_regression_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_k400_f32_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'LVU' \ 28 | --split ',' \ 29 | --nb_classes 1 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 8 \ 33 | --num_sample 2 \ 34 | --input_size 224 \ 35 | --short_side_size 224 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 32 \ 38 | --orig_t_size 32 \ 39 | --trimmed 60 \ 40 | --time_stride 16 \ 41 | --num_workers 12 \ 42 | --warmup_epochs 5 \ 43 | --tubelet_size 1 \ 44 | --epochs 70 \ 45 | --lr 4e-4 \ 46 | --drop_path 0.15 \ 47 | --aa rand-m5-n2-mstd0.25-inc1 \ 48 | --opt adamw \ 49 | --opt_betas 0.9 0.999 \ 50 | --weight_decay 0.1 \ 51 | --test_num_segment 4 \ 52 | --test_num_crop 3 \ 53 | --dist_eval \ 54 | --test_best \ 55 | --bf16 56 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_middle/run_f16x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f16_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'SSV2' \ 27 | --nb_classes 174 \ 28 | --log_dir ${OUTPUT_DIR} \ 29 | --output_dir ${OUTPUT_DIR} \ 30 | --batch_size 16 \ 31 | --num_sample 2 \ 32 | --input_size 224 \ 33 | --short_side_size 224 \ 34 | --save_ckpt_freq 100 \ 35 | --num_frames 16 \ 36 | --num_workers 12 \ 37 | --warmup_epochs 5 \ 38 | --tubelet_size 1 \ 39 | --epochs 30 \ 40 | --lr 4e-4 \ 41 | --drop_path 0.8 \ 42 | --opt adamw \ 43 | --opt_betas 0.9 0.999 \ 44 | --weight_decay 0.05 \ 45 | --test_num_segment 2 \ 46 | --test_num_crop 3 \ 47 | --dist_eval \ 48 | --test_best \ 49 | --bf16 50 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_middle/run_f16x224to288.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f16_res224to288' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune
your_model_path/videomamba_m16_ssv2_f16_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'SSV2' \ 28 | --nb_classes 174 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 6 \ 32 | --num_sample 2 \ 33 | --input_size 288 \ 34 | --short_side_size 288 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 16 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 2 \ 39 | --tubelet_size 1 \ 40 | --epochs 10 \ 41 | --lr 5e-6 \ 42 | --drop_path 0.8 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 1e-8 \ 46 | --test_num_segment 2 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_middle/run_f8x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'SSV2' \ 27 | --nb_classes 174 \ 28 | --log_dir ${OUTPUT_DIR} \ 29 | --output_dir ${OUTPUT_DIR} \ 30 | --batch_size 32 \ 31 | --num_sample 2 \ 32 | --input_size 224 \ 33 | --short_side_size 224 \ 34 | --save_ckpt_freq 100 \ 35 | --num_frames 8 \ 36 | --num_workers 12 \ 37 | --warmup_epochs 5 \ 38 | --tubelet_size 1 \ 39 | --epochs 30 \ 40 | --lr 4e-4 \ 41 | --drop_path 0.8 \ 42 | --opt adamw \ 43 | --opt_betas 0.9 0.999 \ 44 | --weight_decay 0.05 \ 45 | --test_num_segment 2 \ 46 | --test_num_crop 3 \ 47 | --dist_eval \ 48 | --test_best \ 49 | --bf16 50 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_middle_mask/run_f16x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_ft_f16_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_ssv2_mask_pt_f8_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'SSV2' \ 28 | --nb_classes 174 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 16 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 16 \ 37 | --num_workers 12 \ 38
| --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 30 \ 41 | --lr 1e-4 \ 42 | --layer_decay 0.8 \ 43 | --drop_path 0.4 \ 44 | --opt adamw \ 45 | --opt_betas 0.9 0.999 \ 46 | --weight_decay 0.05 \ 47 | --test_num_segment 2 \ 48 | --test_num_crop 3 \ 49 | --dist_eval \ 50 | --test_best \ 51 | --bf16 52 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_middle_mask/run_f16x224to288.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_ft_f16_res224to288' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_ssv2_mask_ft_f16_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'Kinetics_sparse' \ 28 | --split ',' \ 29 | --nb_classes 174 \ 30 | --log_dir ${OUTPUT_DIR} \ 31 | --output_dir ${OUTPUT_DIR} \ 32 | --batch_size 6 \ 33 | --num_sample 2 \ 34 | --input_size 288 \ 35 | --short_side_size 288 \ 36 | --save_ckpt_freq 100 \ 37 | --num_frames 16 \ 38 | --orig_t_size 16 \ 39 | --num_workers 12 \ 40 | --warmup_epochs 2 \ 41 | --tubelet_size 1 \ 42 | --epochs 10 \ 43 | --lr 5e-6 \ 44 | --layer_decay 0.8 \ 45 | --drop_path 0.4 \ 46 | --opt adamw \ 47 | --opt_betas 0.9 0.999 \ 48 | --weight_decay 1e-8 \ 49 | --test_num_segment 2 \ 50 | --test_num_crop 3 \ 51 | --dist_eval \ 52 | --test_best \ 53 | --bf16 54 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_middle_mask/run_f8x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_ft_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_middle \ 24 | --finetune your_model_path/videomamba_m16_ssv2_mask_pt_f8_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'SSV2' \ 28 | --nb_classes 174 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 32 \ 32 | --num_sample 2 \ 33 | --input_size 224 \ 34 | --short_side_size 224 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 8 \ 37 | --num_workers 12 \ 38 | --warmup_epochs 5 \ 39 | --tubelet_size 1 \ 40 | --epochs 30 \ 41 | --lr 1e-4 \ 42 | --layer_decay 0.8 \ 43 | --drop_path 0.4 \ 44 | --opt adamw \ 45 | --opt_betas 0.9 0.999 \ 46 |
--weight_decay 0.05 \ 47 | --test_num_segment 2 \ 48 | --test_num_crop 3 \ 49 | --dist_eval \ 50 | --test_best \ 51 | --bf16 52 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_middle_mask/run_mask_pretrain.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_middle_mask_pt_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path/train.csv' 9 | 10 | PARTITION='video5' 11 | GPUS=32 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | python -u run_videomamba_pretraining.py \ 22 | --data_path ${DATA_PATH} \ 23 | --prefix ${PREFIX} \ 24 | --num_sample 1 \ 25 | --split ',' \ 26 | --flip True \ 27 | --mask_type 'attention' \ 28 | --mask_ratio 0.8 \ 29 | --model 'videomamba_middle_pretrain' \ 30 | --clip_teacher 'clip_b16' \ 31 | --clip_loss_ratio 1 \ 32 | --clip_loss_type 'l2' \ 33 | --clip_decoder_embed_dim 576 \ 34 | --clip_output_dim 512 \ 35 | --clip_norm_type 'l2' \ 36 | --clip_return_layer 1 \ 37 | --clip_return_interval 1 \ 38 | --clip_student_return_interval 1 \ 39 | --clip_return_cls \ 40 | --clip_return_attn \ 41 | --tubelet_size 1 \ 42 | --lr 1.5e-4 \ 43 | --drop_path 0.4 \ 44 | --batch_size 64 \ 45 | --num_segments 8 \ 46 | --num_frames 8 \ 47 | --sampling_rate 1 \ 48 | --num_workers 12 \ 49 | --opt adamw \ 50 | --opt_betas 0.9 0.95 \ 51 | --warmup_epochs 40 \ 52 | --save_ckpt_freq 1000 \ 53 | --epochs 801 \ 54 | --log_dir ${OUTPUT_DIR} \ 55 | --output_dir ${OUTPUT_DIR} \ 56 | --bf16 57 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_small/run_f16x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f16_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'SSV2' \ 27 | --nb_classes 174 \ 28 | --log_dir ${OUTPUT_DIR} \ 29 | --output_dir ${OUTPUT_DIR} \ 30 | --batch_size 32 \ 31 | --num_sample 2 \ 32 | --input_size 224 \ 33 | --short_side_size 224 \ 34 | --save_ckpt_freq 100 \ 35 | --num_frames 16 \ 36 | --num_workers 12 \ 37 | --warmup_epochs 5 \ 38 | --tubelet_size 1 \ 39 | --epochs 30 \ 40 | --lr 4e-4 \ 41 | --drop_path 0.35 \ 42 | --opt adamw \ 43 | --opt_betas 0.9 0.999 \ 44 | --weight_decay 0.05 \ 45 | --test_num_segment 2 \ 46 | --test_num_crop 3 \ 47 | --dist_eval \ 48 | --test_best \ 49 | --bf16 50 | --------------------------------------------------------------------------------
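In both run_mask_pretrain.sh recipes (K400 earlier and SSv2 above), the student is trained on a small visible subset of the video tokens while a frozen clip_b16 teacher provides l2 distillation targets (--mask_type 'attention', --mask_ratio 0.8, --clip_loss_type 'l2'). A back-of-the-envelope token budget for those flags follows; the patch size of 16 is an assumption read off the m16/clip_b16 names, and the exact masking arithmetic inside run_videomamba_pretraining.py may differ.

# Token budget under the masked-pretraining flags above (sketch, not repo code).
def token_budget(input_size=224, patch=16, num_frames=8, tubelet_size=1, mask_ratio=0.8):
    per_frame = (input_size // patch) ** 2            # 14 * 14 = 196 patches per frame
    total = per_frame * (num_frames // tubelet_size)  # 1568 spatiotemporal tokens
    visible = int(total * (1 - mask_ratio))           # tokens left after 80% masking
    return total, visible

print(token_budget())  # (1568, 313): the student processes roughly 20% of the tokens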
/videomamba/video_sm/exp/ssv2/videomamba_small/run_f16x224to288.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f16_res224to288' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --finetune your_model_path/videomamba_s16_ssv2_f16_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'SSV2' \ 28 | --nb_classes 174 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 12 \ 32 | --num_sample 2 \ 33 | --input_size 288 \ 34 | --short_side_size 288 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 16 \ 37 | --orig_t_size 16 \ 38 | --num_workers 12 \ 39 | --warmup_epochs 2 \ 40 | --tubelet_size 1 \ 41 | --epochs 10 \ 42 | --lr 5e-6 \ 43 | --drop_path 0.35 \ 44 | --opt adamw \ 45 | --opt_betas 0.9 0.999 \ 46 | --weight_decay 1e-8 \ 47 | --test_num_segment 2 \ 48 | --test_num_crop 3 \ 49 | --dist_eval \ 50 | --test_best \ 51 | --bf16 52 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_small/run_f8x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_small_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_small \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'SSV2' \ 27 | --nb_classes 174 \ 28 | --log_dir ${OUTPUT_DIR} \ 29 | --output_dir ${OUTPUT_DIR} \ 30 | --batch_size 64 \ 31 | --num_sample 2 \ 32 | --input_size 224 \ 33 | --short_side_size 224 \ 34 | --save_ckpt_freq 100 \ 35 | --num_frames 8 \ 36 | --num_workers 12 \ 37 | --warmup_epochs 5 \ 38 | --tubelet_size 1 \ 39 | --epochs 30 \ 40 | --lr 4e-4 \ 41 | --drop_path 0.35 \ 42 | --opt adamw \ 43 | --opt_betas 0.9 0.999 \ 44 | --weight_decay 0.05 \ 45 | --test_num_segment 2 \ 46 | --test_num_crop 3 \ 47 | --dist_eval \ 48 | --test_best \ 49 | --bf16 50 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_tiny/run_f16x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f16_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | 
PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'SSV2' \ 27 | --nb_classes 174 \ 28 | --log_dir ${OUTPUT_DIR} \ 29 | --output_dir ${OUTPUT_DIR} \ 30 | --batch_size 64 \ 31 | --num_sample 2 \ 32 | --input_size 224 \ 33 | --short_side_size 224 \ 34 | --save_ckpt_freq 100 \ 35 | --num_frames 16 \ 36 | --num_workers 12 \ 37 | --warmup_epochs 5 \ 38 | --tubelet_size 1 \ 39 | --epochs 35 \ 40 | --lr 4e-4 \ 41 | --drop_path 0.1 \ 42 | --aa rand-m5-n2-mstd0.25-inc1 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.1 \ 46 | --test_num_segment 2 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_tiny/run_f16x224to288.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f16_res224to288' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=16 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model videomamba_tiny \ 24 | --finetune your_model_path/videomamba_t16_ssv2_f16_res224.pth \ 25 | --data_path ${DATA_PATH} \ 26 | --prefix ${PREFIX} \ 27 | --data_set 'SSV2' \ 28 | --nb_classes 174 \ 29 | --log_dir ${OUTPUT_DIR} \ 30 | --output_dir ${OUTPUT_DIR} \ 31 | --batch_size 32 \ 32 | --num_sample 2 \ 33 | --input_size 288 \ 34 | --short_side_size 288 \ 35 | --save_ckpt_freq 100 \ 36 | --num_frames 16 \ 37 | --orig_t_size 16 \ 38 | --num_workers 12 \ 39 | --warmup_epochs 2 \ 40 | --tubelet_size 1 \ 41 | --epochs 10 \ 42 | --lr 5e-6 \ 43 | --drop_path 0.1 \ 44 | --aa rand-m5-n2-mstd0.25-inc1 \ 45 | --opt adamw \ 46 | --opt_betas 0.9 0.999 \ 47 | --weight_decay 1e-8 \ 48 | --test_num_segment 2 \ 49 | --test_num_crop 3 \ 50 | --dist_eval \ 51 | --test_best \ 52 | --bf16 53 | -------------------------------------------------------------------------------- /videomamba/video_sm/exp/ssv2/videomamba_tiny/run_f8x224.sh: -------------------------------------------------------------------------------- 1 | export MASTER_PORT=$((12000 + $RANDOM % 20000)) 2 | export OMP_NUM_THREADS=1 3 | 4 | JOB_NAME='videomamba_tiny_f8_res224' 5 | OUTPUT_DIR="$(dirname $0)/$JOB_NAME" 6 | LOG_DIR="./logs/${JOB_NAME}" 7 | PREFIX='your_ssv2_path' 8 | DATA_PATH='your_ssv2_metadata_path' 9 | 10 | PARTITION='video5' 11 | GPUS=8 12 | GPUS_PER_NODE=8 13 | CPUS_PER_TASK=16 14 | 15 | srun -p $PARTITION \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS_PER_NODE} \ 18 | --ntasks=${GPUS} \ 19 | --ntasks-per-node=${GPUS_PER_NODE} \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | python run_class_finetuning.py \ 23 | --model
videomamba_tiny \ 24 | --data_path ${DATA_PATH} \ 25 | --prefix ${PREFIX} \ 26 | --data_set 'SSV2' \ 27 | --nb_classes 174 \ 28 | --log_dir ${OUTPUT_DIR} \ 29 | --output_dir ${OUTPUT_DIR} \ 30 | --batch_size 64 \ 31 | --num_sample 2 \ 32 | --input_size 224 \ 33 | --short_side_size 224 \ 34 | --save_ckpt_freq 100 \ 35 | --num_frames 8 \ 36 | --num_workers 12 \ 37 | --warmup_epochs 5 \ 38 | --tubelet_size 1 \ 39 | --epochs 35 \ 40 | --lr 4e-4 \ 41 | --drop_path 0.1 \ 42 | --aa rand-m5-n2-mstd0.25-inc1 \ 43 | --opt adamw \ 44 | --opt_betas 0.9 0.999 \ 45 | --weight_decay 0.1 \ 46 | --test_num_segment 2 \ 47 | --test_num_crop 3 \ 48 | --dist_eval \ 49 | --test_best \ 50 | --bf16 51 | -------------------------------------------------------------------------------- /videomamba/video_sm/functional.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import cv2 3 | import numpy as np 4 | import PIL 5 | import torch 6 | 7 | 8 | def _is_tensor_clip(clip): 9 | return torch.is_tensor(clip) and clip.ndimension() == 4 10 | 11 | 12 | def crop_clip(clip, min_h, min_w, h, w): 13 | if isinstance(clip[0], np.ndarray): 14 | cropped = [img[min_h:min_h + h, min_w:min_w + w, :] for img in clip] 15 | 16 | elif isinstance(clip[0], PIL.Image.Image): 17 | cropped = [ 18 | img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip 19 | ] 20 | else: 21 | raise TypeError('Expected numpy.ndarray or PIL.Image ' + 22 | 'but got list of {0}'.format(type(clip[0]))) 23 | return cropped 24 | 25 | 26 | def resize_clip(clip, size, interpolation='bilinear'): 27 | if isinstance(clip[0], np.ndarray): 28 | if isinstance(size, numbers.Number): 29 | im_h, im_w, im_c = clip[0].shape 30 | # Min spatial dim already matches minimal size 31 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 32 | and im_h == size): 33 | return clip 34 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 35 | size = (new_w, new_h) 36 | else: 37 | size = size[0], size[1] 38 | if interpolation == 'bilinear': 39 | np_inter = cv2.INTER_LINEAR 40 | else: 41 | np_inter = cv2.INTER_NEAREST 42 | scaled = [ 43 | cv2.resize(img, size, interpolation=np_inter) for img in clip 44 | ] 45 | elif isinstance(clip[0], PIL.Image.Image): 46 | if isinstance(size, numbers.Number): 47 | im_w, im_h = clip[0].size 48 | # Min spatial dim already matches minimal size 49 | if (im_w <= im_h and im_w == size) or (im_h <= im_w 50 | and im_h == size): 51 | return clip 52 | new_h, new_w = get_resize_sizes(im_h, im_w, size) 53 | size = (new_w, new_h) 54 | else: 55 | size = size[1], size[0] 56 | if interpolation == 'bilinear': 57 | pil_inter = PIL.Image.BILINEAR 58 | else: 59 | pil_inter = PIL.Image.NEAREST 60 | scaled = [img.resize(size, pil_inter) for img in clip] 61 | else: 62 | raise TypeError('Expected numpy.ndarray or PIL.Image ' + 63 | 'but got list of {0}'.format(type(clip[0]))) 64 | return scaled 65 | 66 | 67 | def get_resize_sizes(im_h, im_w, size): 68 | if im_w < im_h: 69 | ow = size 70 | oh = int(size * im_h / im_w) 71 | else: 72 | oh = size 73 | ow = int(size * im_w / im_h) 74 | return oh, ow 75 | 76 | 77 | def normalize(clip, mean, std, inplace=False): 78 | if not _is_tensor_clip(clip): 79 | raise TypeError('clip is not a 4-dimensional torch tensor.') 80 | 81 | if not inplace: 82 | clip = clip.clone() 83 | 84 | dtype = clip.dtype 85 | mean = torch.as_tensor(mean, dtype=dtype, device=clip.device) 86 | std = torch.as_tensor(std, dtype=dtype, device=clip.device) 87 | clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) 88
| 89 | return clip 90 | -------------------------------------------------------------------------------- /videomamba/video_sm/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import clip_b16, clip_l14, clip_l14_336 2 | from .modeling_finetune import ( 3 | vit_base_patch16_224, 4 | vit_base_patch16_384, 5 | vit_large_patch16_224, 6 | vit_large_patch16_384 7 | ) 8 | from .modeling_pretrain_umt import ( 9 | pretrain_umt_base_patch16_224, 10 | pretrain_umt_large_patch16_224 11 | ) 12 | from .modeling_pretrain import ( 13 | pretrain_videomae_base_patch16_224, 14 | pretrain_videomae_large_patch16_224, 15 | pretrain_videomae_huge_patch16_224 16 | ) 17 | from .deit import deit_tiny_patch16_224 18 | from .videomamba import ( 19 | videomamba_tiny, 20 | videomamba_small, 21 | videomamba_middle, 22 | ) 23 | 24 | from .videomamba_pretrain import ( 25 | videomamba_middle_pretrain 26 | ) -------------------------------------------------------------------------------- /videomamba/video_sm/models/extract_clip/extract.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import clip.clip as clip\n", 10 | "import os\n", 11 | "import torch\n", 12 | "from collections import OrderedDict" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 10, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "path = 'your_model_path/clip_visual_encoder'" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 14, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "model, _ = clip.load(\"ViT-B/16\", device='cpu')\n", 31 | "new_state_dict = OrderedDict()\n", 32 | "for k, v in model.state_dict().items():\n", 33 | " if 'visual.' in k:\n", 34 | " new_state_dict[k[7:]] = v\n", 35 | "torch.save(new_state_dict, os.path.join(path, 'vit_b16.pth'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 15, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "model, _ = clip.load(\"ViT-L/14\", device='cpu')\n", 45 | "new_state_dict = OrderedDict()\n", 46 | "for k, v in model.state_dict().items():\n", 47 | " if 'visual.' in k:\n", 48 | " new_state_dict[k[7:]] = v\n", 49 | "torch.save(new_state_dict, os.path.join(path, 'vit_l14.pth'))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "model, _ = clip.load(\"ViT-L/14@336px\", device='cpu')\n", 59 | "new_state_dict = OrderedDict()\n", 60 | "for k, v in model.state_dict().items():\n", 61 | " if 'visual.' 
in k:\n", 62 | " new_state_dict[k[7:]] = v\n", 63 | "torch.save(new_state_dict, os.path.join(path, 'vit_l14_336.pth'))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3.7.13 ('torch1.9')", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.7.13" 91 | }, 92 | "orig_nbformat": 4, 93 | "vscode": { 94 | "interpreter": { 95 | "hash": "c30e0be9d1dabfc31a056b9daab5ce1d15284c0e9e5af7f56f8931344ec84c24" 96 | } 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | --------------------------------------------------------------------------------
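The notebook above drops the 7-character 'visual.' prefix with k[7:], leaving state dicts that contain only the CLIP visual encoder, which the mask-pretraining scripts load as the clip_b16 / clip_l14 teachers. A minimal sanity check for one exported file follows; the path is the notebook's placeholder, and startswith() is used rather than the notebook's substring test ('visual.' in k), which would also match a key that merely contains the substring somewhere in the middle.

import torch

# Load one of the state dicts written by extract.ipynb (placeholder path from the notebook).
state_dict = torch.load('your_model_path/clip_visual_encoder/vit_b16.pth', map_location='cpu')

# Every key should have had its 'visual.' prefix removed by k[7:].
assert not any(k.startswith('visual.') for k in state_dict), 'prefix not stripped'
print(f'{len(state_dict)} tensors exported for the CLIP visual encoder')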