├── models
│   └── README.md
├── FSAA.png
├── splits
│   ├── example_standard.json
│   ├── finegym_test.txt
│   ├── mit_test.txt
│   ├── finegym_train.txt
│   ├── generate_dataset_json.py
│   ├── haa_test.txt
│   ├── mit_train.txt
│   └── haa_train.txt
├── ctc
│   ├── __pycache__
│   │   ├── ctc.cpython-38.pyc
│   │   ├── Common.cpython-38.pyc
│   │   ├── ctc_decode.cpython-38.pyc
│   │   └── ctc_loss.cpython-38.pyc
│   ├── Common.py
│   ├── ctc_loss.py
│   └── ctc_decode.py
├── moco
│   ├── rename.py
│   ├── moco_encoder.py
│   ├── tcn.py
│   ├── builder.py
│   ├── dataset.py
│   ├── encoder.py
│   └── main_moco.py
├── tcn.py
├── README.md
├── ctc.py
├── utils.py
├── attention_pool.py
├── test.py
├── relation_net.py
├── encoder.py
├── train.py
└── dataset.py

/models/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/FSAA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/FSAA.png
--------------------------------------------------------------------------------
/splits/example_standard.json:
--------------------------------------------------------------------------------
{"name": "", "folders": [""], "splits": [[], [], []]}
--------------------------------------------------------------------------------
/ctc/__pycache__/ctc.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/ctc/__pycache__/ctc.cpython-38.pyc
--------------------------------------------------------------------------------
/ctc/__pycache__/Common.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/ctc/__pycache__/Common.cpython-38.pyc
--------------------------------------------------------------------------------
/ctc/__pycache__/ctc_decode.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/ctc/__pycache__/ctc_decode.cpython-38.pyc
--------------------------------------------------------------------------------
/ctc/__pycache__/ctc_loss.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/ctc/__pycache__/ctc_loss.cpython-38.pyc
--------------------------------------------------------------------------------
/ctc/Common.py:
--------------------------------------------------------------------------------
from __future__ import division
from __future__ import print_function


def extendByBlanks(seq, b):
    "extends a label seq. by adding blanks at the beginning, end and in between each label"
    res = [b]
    for s in seq:
        res.append(s)
        res.append(b)
    return res


def wordToLabelSeq(w, classes):
    "map a word to a sequence of labels (indices)"
    res = [classes.index(c) for c in w]
    return res
--------------------------------------------------------------------------------
/splits/finegym_test.txt:
--------------------------------------------------------------------------------
38
203
22
245
110
287
56
138
36
263
53
16
34
207
33
145
71
284
112
70
270
249
55
229
14
278
41
114
268
24
37
106
26
175
52
220
82
121
58
163
222
176
15
253
137
155
1
177
238
95
23
49
42
28
89
165
135
--------------------------------------------------------------------------------
/moco/rename.py:
--------------------------------------------------------------------------------
from collections import OrderedDict
import torch
import os

checkpoint = torch.load("")
model = checkpoint['state_dict']

encoder_dict = OrderedDict()
tcn_dict = OrderedDict()

for key in model.keys():
    if 'encoder_q' in key:

        if 'c3d' in key:
            new_key = 'module.' + key[21:]
            encoder_dict[new_key] = model[key]

        elif 'tcn' in key:
            new_key = key[21:]
            tcn_dict[new_key] = model[key]

torch.save(encoder_dict, '')
torch.save(tcn_dict, '')
--------------------------------------------------------------------------------
/splits/generate_dataset_json.py:
--------------------------------------------------------------------------------
def generate_standard(folders=[""], train_split_path="",
                      val_split_path="", test_split_path="",
                      name="", output_name="example_standard.json"):
    result = dict()

    result["name"] = name
    result["folders"] = folders
    result["splits"] = [read_split(train_split_path),
                        read_split(val_split_path),
                        read_split(test_split_path)]

    with open(output_name, "w") as file:
        file.write(json.dumps(result))

generate_finegym()
generate_standard()
--------------------------------------------------------------------------------
/tcn.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm

class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()


class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super(TemporalBlock, self).__init__()
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()
        self.init_weights()

    def init_weights(self):
        self.conv1.weight.data.normal_(0, 0.01)
        self.conv2.weight.data.normal_(0, 0.01)
        if self.downsample is not None:
            self.downsample.weight.data.normal_(0, 0.01)

    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)


class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super(TemporalConvNet, self).__init__()
        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
                                     padding=(kernel_size-1) * dilation_size, dropout=dropout)]

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)
--------------------------------------------------------------------------------
/moco/tcn.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm

class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()


class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super(TemporalBlock, self).__init__()
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()

        self.init_weights()

    def init_weights(self):
        self.conv1.weight.data.normal_(0, 0.01)
        self.conv2.weight.data.normal_(0, 0.01)
        if self.downsample is not None:
            self.downsample.weight.data.normal_(0, 0.01)

    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)


class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super(TemporalConvNet, self).__init__()
        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
                                     padding=(kernel_size-1) * dilation_size, dropout=dropout)]

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)
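Both copies of `TemporalConvNet` expect input shaped `(batch, channels, time)`, and the `Chomp1d` trimming keeps each convolution causal while preserving the sequence length. A minimal usage sketch, with purely illustrative dimensions rather than the values used in training:

```python
import torch
from tcn import TemporalConvNet

# Illustrative only: 4 videos, 512-d per-clip features, 8 clips each.
features = torch.randn(4, 512, 8)  # (batch, channels, time)

# Three levels with dilations 1, 2, 4; causal padding plus Chomp1d
# keeps the output length equal to the input length.
tcn = TemporalConvNet(num_inputs=512, num_channels=[512, 512, 512], kernel_size=2)

out = tcn(features)
print(out.shape)  # torch.Size([4, 512, 8])
```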
--------------------------------------------------------------------------------
/ctc/ctc_loss.py:
--------------------------------------------------------------------------------
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import ctc.Common as Common

def recLabelingProb(t, s, mat, labelingWithBlanks, blank, cache):
    "recursively compute probability of labeling, save results of sub-problems in cache to avoid recalculating them"

    # check index of labeling
    if s < 0:
        return 0.0

    # sub-problem already computed
    if cache[t][s] is not None:
        return cache[t][s]

    # initial values
    if t == 0:
        if s == 0:
            res = mat[0, blank]
        elif s == 1:
            res = mat[0, labelingWithBlanks[1]]
        else:
            res = 0.0

        cache[t][s] = res
        return res

    # recursion on s and t
    res = (recLabelingProb(t-1, s, mat, labelingWithBlanks, blank, cache) + recLabelingProb(t-1, s-1, mat, labelingWithBlanks, blank, cache)) * mat[t, labelingWithBlanks[s]]

    # in case of a blank or a repeated label, we only consider s and s-1 at t-1, so we're done
    if labelingWithBlanks[s] == blank or (s >= 2 and labelingWithBlanks[s-2] == labelingWithBlanks[s]):
        cache[t][s] = res
        return res

    # otherwise, in case of a non-blank and non-repeated label, we additionally add s-2 at t-1
    res += recLabelingProb(t-1, s-2, mat, labelingWithBlanks, blank, cache) * mat[t, labelingWithBlanks[s]]
    cache[t][s] = res
    return res


def emptyCache(maxT, labelingWithBlanks):
    "create empty cache"
    return [[None for _ in range(len(labelingWithBlanks))] for _ in range(maxT)]


def ctcLabelingProb(mat, gt, classes):
    "calculate probability p(gt|mat) of a given labeling gt and a matrix mat according to section 'The CTC Forward-Backward Algorithm' in Graves paper"
    maxT, _ = mat.shape  # size of input matrix
    blank = len(classes)  # index of blank label
    labelingWithBlanks = Common.extendByBlanks(Common.wordToLabelSeq(gt, classes), blank)  # ground truth text as label string extended by blanks
    cache = emptyCache(maxT, labelingWithBlanks)  # cache subresults to avoid recalculating subproblems over and over again
    return recLabelingProb(maxT-1, len(labelingWithBlanks)-1, mat, labelingWithBlanks, blank, cache) + recLabelingProb(maxT-1, len(labelingWithBlanks)-2, mat, labelingWithBlanks, blank, cache)


def ctcLoss(mat, gt, classes):
    "calculate CTC loss"
    try:
        return -math.log(ctcLabelingProb(mat, gt, classes))
    except ValueError:
        # probability of 0 gives log(0): treat as infinite loss
        return float('inf')


def testLoss():
    "test loss"
    classes = 'ab'
    mat = np.array([[0.4, 0, 0.6], [0.4, 0, 0.6]])
    print('Test loss calculation')
    expected = 0.64
    actual = ctcLabelingProb(mat, 'a', classes)
    print('Expected: ' + str(expected))
    print('Actual: ' + str(actual))
    print('OK' if expected == actual else 'ERROR')


if __name__ == '__main__':
    testLoss()
--------------------------------------------------------------------------------
/splits/haa_test.txt:
--------------------------------------------------------------------------------
horizontalbar_land
skateboard_grind
play_recorder
gangnam_style_dance
curling_sweep
hopscotch_spin
sledgehammer_strike_down
play_melodic
gym_pull
play_doublebass
punching_sandbag
dabbing
using_lawn_mower_riding_type
play_grandpiano
riding_mechanical_bull
floss_dance
rock_balancing
figure_skate_backward
roller-skating_backward
pizza_dough_toss
gym_ride
adjusting_glasses
discuss_throw
trapeze_interacting
bowling
shotput_throw
baseball_run
folding_clothes
play_accordian
pushup
archery
triple_jump_run
air_hocky
brushing_hair
play_hulusi
falling_off_chair
arm_wrestling
hugging_human
leaf_blowing
climbing_rope
fist_bump
breakdancing_flare
playing_conga_drum
eating_ice_cream
dog_walking
sprint_start
play_sanxian
play_ otamatone
play_xylophone
read_newspaper
carrying_with_head
play_triangle
weightlifting_overhead
grass_skiing
cleaning_mopping
play_sitar
push_wheelchair
playing_taiko_drum
throwing_bouquet
play_maracas
haircut_scissor
play_lute
face-changing_opera
bike_fall
push_wheelchair_alone
play_harp
diving_sneak
play_saxophone
play_kendama
burping
hand_in_hand
volleyball_underhand
pottery_wheel
situp
base_jumping
answering_questions
climb_pole
balancebeam_flip
piggyback_ride
surfing
balancebeam_jump
play_timpani
tire_pull
workout_chest-pull
battle-rope_rainbow
shoveling_snow
unevenbar_flip
figure_skate_jump_spin
basketball_shoot
play_guitar
talking_megaphone
play_ocarina
ski_frontflip
long_jump_jump
stone_skipping
decorating_snowman
face_slapping
chopping_wood
hurdle_jump
shake_cocktail
cutting_onion
badminton_serve
basketball_hookshot
dice_stack_shuffle
taekwondo_kick
roller-skating_forward
conducting
peeling_banana
football_throw
using_inhaler
badminton_underswing
backflip
riding_camel
fire_dancing_circulating
screw_car_tire
swinging_axe_on_a_tree
sticking tongue out
baseball_catch_flyball
balancebeam_walk
diving_rotate
frisbee_throw
triple_jump_jump
bending_back
kiss
chainsaw_tree
brushing_teeth
diving_jump
shooting_handgun
play_hulahoop
riding_elephant
baseball_swing
play_cymbals
taekwondo_punch
speedskating_forward
bowls_throw
horizontalbar_jump
hopscotch_skip
watering_plants
building_snowman
clear_snow_off_car
climb_icecliff
pole_vault_run
baseball_catch_catcher
CPR
soccer_throw
equestrian_run
ski_backflip
guitar_flip
neck_side_pull_stretch
canoeing_slalom
ironing_clothes
underarm_turn
dice_shuffle_reveal
using_metal_detector
reading_book
gym_lunges
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Semi-supervised Few-shot Atomic Action Recognition

This repo contains the code for our paper "Semi-supervised Few-shot Atomic Action Recognition". Please check our [paper](https://arxiv.org/abs/2011.08410) and [project page](https://sausage-song.github.io/home/FSAA/) for more details.

![FSAA Architecture](https://github.com/Sausage-SONG/Few-shot-action-recognition/raw/master/FSAA.png)

Our learning strategy is divided into two parts: 1) train an encoder with unsupervised learning; 2) train the action classification module with supervised learning. The encoder provides fine-grained spatial and temporal video processing with high length flexibility: it embeds the video features and combines them temporally with a TCN. The classification module applies attention pooling and compares multi-head relations. Finally, the CTC and MSE losses enable time-invariant few-shot classification training.

# Requirements

pytorch >= 1.5.0
torchvision >= 0.6.0
numpy >= 1.18.1
scipy >= 1.4.1
[vidaug](https://github.com/okankop/vidaug) >= 0.1

# Usage

## Installation

1. Clone the repo
2. Install [required packages](#requirements)
3. Download [trained models](#trained-models) to `/models` (Optional)
4. Download the [datasets](#datasets) (Optional)

## Training

As mentioned in the [intro](#semi-supervised-few-shot-atomic-action-recognition), our model training has two parts.

### 1. Train the encoder with unsupervised learning.

Here we use [MoCo](https://github.com/facebookresearch/moco), but this part can be done with virtually any unsupervised learning method.

First clone [MoCo](https://github.com/facebookresearch/moco). Then do the following copy & replace:

```
cp '/moco/builder.py' '/moco/'
cp '/moco/{dataset.py,encoder.py,main_moco.py,moco_encoder.py,rename.py,tcn.py}' '/'
```

We recommend first reading the MoCo instructions to learn how it works, then filling in the relevant paths in `main_moco.py` and starting your training. Afterwards, use `rename.py` to split the trained model (a .tar file) into a `c3d.pkl` and a `tcn.pkl` for the next step.

### 2. Train the whole model with supervised learning.

Load your pretrained C3D and TCN models and continue:

`python3 train.py -d='./splits/.json' -n=''`

## Testing

`python3 test.py -d='./splits/.json' -c=''`

# Trained Models

TODO

# Datasets

We use three atomic action datasets:
1. [HAA](https://www.cse.ust.hk/haa/index.html)
2. [FineGym](https://sdolivia.github.io/FineGym/)
3. [MIT](http://moments.csail.mit.edu/)

Dataset splits and json files can be found under `/splits`; see the example dataset jsons, or use the scripts there to generate your own. If you want to use another dataset, make sure it has a `///<
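The surviving `generate_standard` fragment in `/splits/generate_dataset_json.py` fills the same three fields that appear in `splits/example_standard.json` (`name`, `folders`, `splits`). A minimal sketch of generating a custom dataset json, assuming the elided `read_split` helper simply reads one class name per line; the dataset name and folder paths below are hypothetical placeholders, not shipped files:

```python
import json

def read_split(path):
    # Assumed behavior of the elided helper: one class name per line.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

result = {
    "name": "haa",                                  # hypothetical dataset name
    "folders": ["/data/haa/videos"],                # hypothetical video root(s)
    "splits": [read_split("splits/haa_train.txt"),  # train classes
               [],                                  # val classes (empty here)
               read_split("splits/haa_test.txt")],  # test classes
}

with open("splits/haa.json", "w") as file:
    file.write(json.dumps(result))
```

The resulting json can then be passed to `train.py` and `test.py` through the `-d` flag shown above.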