├── models
│   └── README.md
├── FSAA.png
├── splits
│   ├── example_standard.json
│   ├── finegym_test.txt
│   ├── mit_test.txt
│   ├── finegym_train.txt
│   ├── generate_dataset_json.py
│   ├── haa_test.txt
│   ├── mit_train.txt
│   └── haa_train.txt
├── ctc
│   ├── __pycache__
│   │   ├── ctc.cpython-38.pyc
│   │   ├── Common.cpython-38.pyc
│   │   ├── ctc_decode.cpython-38.pyc
│   │   └── ctc_loss.cpython-38.pyc
│   ├── Common.py
│   ├── ctc_loss.py
│   └── ctc_decode.py
├── moco
│   ├── rename.py
│   ├── moco_encoder.py
│   ├── tcn.py
│   ├── builder.py
│   ├── dataset.py
│   ├── encoder.py
│   └── main_moco.py
├── tcn.py
├── README.md
├── ctc.py
├── utils.py
├── attention_pool.py
├── test.py
├── relation_net.py
├── encoder.py
├── train.py
└── dataset.py

/models/README.md:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/FSAA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/FSAA.png
--------------------------------------------------------------------------------
/splits/example_standard.json:
--------------------------------------------------------------------------------
{"name": "", "folders": [""], "splits": [[], [], []]}
--------------------------------------------------------------------------------
/ctc/__pycache__/ctc.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/ctc/__pycache__/ctc.cpython-38.pyc
--------------------------------------------------------------------------------
/ctc/__pycache__/Common.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/ctc/__pycache__/Common.cpython-38.pyc
--------------------------------------------------------------------------------
/ctc/__pycache__/ctc_decode.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/ctc/__pycache__/ctc_decode.cpython-38.pyc
--------------------------------------------------------------------------------
/ctc/__pycache__/ctc_loss.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sausage-SONG/Few-shot-action-recognition/HEAD/ctc/__pycache__/ctc_loss.cpython-38.pyc
--------------------------------------------------------------------------------
/ctc/Common.py:
--------------------------------------------------------------------------------
from __future__ import division
from __future__ import print_function


def extendByBlanks(seq, b):
    "extends a label seq. by adding blanks at the beginning, end and in between each label"
    res = [b]
    for s in seq:
        res.append(s)
        res.append(b)
    return res


def wordToLabelSeq(w, classes):
    "map a word to a sequence of labels (indices)"
    res = [classes.index(c) for c in w]
    return res
--------------------------------------------------------------------------------
/splits/finegym_test.txt:
--------------------------------------------------------------------------------
38
203
22
245
110
287
56
138
36
263
53
16
34
207
33
145
71
284
112
70
270
249
55
229
14
278
41
114
268
24
37
106
26
175
52
220
82
121
58
163
222
176
15
253
137
155
1
177
238
95
23
49
42
28
89
165
135
--------------------------------------------------------------------------------
/moco/rename.py:
--------------------------------------------------------------------------------
from collections import OrderedDict
import torch
import os

checkpoint = torch.load("")
model = checkpoint['state_dict']

encoder_dict = OrderedDict()
tcn_dict = OrderedDict()

for key in model.keys():
    if 'encoder_q' in key:

        if 'c3d' in key:
            new_key = 'module.' + key[21:]
            encoder_dict[new_key] = model[key]

        elif 'tcn' in key:
            new_key = key[21:]
            tcn_dict[new_key] = model[key]

torch.save(encoder_dict, '')
torch.save(tcn_dict, '')
--------------------------------------------------------------------------------
/splits/generate_dataset_json.py:
--------------------------------------------------------------------------------
def generate_standard(folders=[""], train_split_path="",
                      val_split_path="", test_split_path="",
                      name="", output_name="example_standard.json"):
    result = dict()

    result["name"] = name
    result["folders"] = folders
    result["splits"] = [read_split(train_split_path),
                        read_split(val_split_path),
                        read_split(test_split_path)]

    with open(output_name, "w") as file:
        file.write(json.dumps(result))

generate_finegym()
generate_standard()
--------------------------------------------------------------------------------
/tcn.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm

class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()


class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super(TemporalBlock, self).__init__()
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()
        self.init_weights()

    def init_weights(self):
        self.conv1.weight.data.normal_(0, 0.01)
        self.conv2.weight.data.normal_(0, 0.01)
        if self.downsample is not None:
            self.downsample.weight.data.normal_(0, 0.01)

    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)


class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super(TemporalConvNet, self).__init__()
        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
                                     padding=(kernel_size-1) * dilation_size, dropout=dropout)]

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)
--------------------------------------------------------------------------------
/moco/tcn.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm

class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()


class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super(TemporalBlock, self).__init__()
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()

        self.init_weights()

    def init_weights(self):
        self.conv1.weight.data.normal_(0, 0.01)
        self.conv2.weight.data.normal_(0, 0.01)
        if self.downsample is not None:
            self.downsample.weight.data.normal_(0, 0.01)

    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)


class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super(TemporalConvNet, self).__init__()
        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
                                     padding=(kernel_size-1) * dilation_size, dropout=dropout)]

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)
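Both copies of `TemporalConvNet` expect input shaped `(batch, channels, time)`, and the `Chomp1d` trimming keeps each convolution causal while preserving the sequence length. A minimal usage sketch, with purely illustrative dimensions rather than the values used in training:

```python
import torch
from tcn import TemporalConvNet

# Illustrative only: 4 videos, 512-d per-clip features, 8 clips each.
features = torch.randn(4, 512, 8)  # (batch, channels, time)

# Three levels with dilations 1, 2, 4; causal padding plus Chomp1d
# keeps the output length equal to the input length.
tcn = TemporalConvNet(num_inputs=512, num_channels=[512, 512, 512], kernel_size=2)

out = tcn(features)
print(out.shape)  # torch.Size([4, 512, 8])
```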
--------------------------------------------------------------------------------
/ctc/ctc_loss.py:
--------------------------------------------------------------------------------
from __future__ import division
from __future__ import print_function
import math
import numpy as np
import ctc.Common as Common

def recLabelingProb(t, s, mat, labelingWithBlanks, blank, cache):
    "recursively compute probability of labeling, save results of sub-problems in cache to avoid recalculating them"

    # check index of labeling
    if s < 0:
        return 0.0

    # sub-problem already computed
    if cache[t][s] is not None:
        return cache[t][s]

    # initial values
    if t == 0:
        if s == 0:
            res = mat[0, blank]
        elif s == 1:
            res = mat[0, labelingWithBlanks[1]]
        else:
            res = 0.0

        cache[t][s] = res
        return res

    # recursion on s and t
    res = (recLabelingProb(t-1, s, mat, labelingWithBlanks, blank, cache) + recLabelingProb(t-1, s-1, mat, labelingWithBlanks, blank, cache)) * mat[t, labelingWithBlanks[s]]

    # in case of a blank or a repeated label, we only consider s and s-1 at t-1, so we're done
    if labelingWithBlanks[s] == blank or (s >= 2 and labelingWithBlanks[s-2] == labelingWithBlanks[s]):
        cache[t][s] = res
        return res

    # otherwise, in case of a non-blank and non-repeated label, we additionally add s-2 at t-1
    res += recLabelingProb(t-1, s-2, mat, labelingWithBlanks, blank, cache) * mat[t, labelingWithBlanks[s]]
    cache[t][s] = res
    return res


def emptyCache(maxT, labelingWithBlanks):
    "create empty cache"
    return [[None for _ in range(len(labelingWithBlanks))] for _ in range(maxT)]


def ctcLabelingProb(mat, gt, classes):
    "calculate probability p(gt|mat) of a given labeling gt and a matrix mat according to section 'The CTC Forward-Backward Algorithm' in Graves paper"
    maxT, _ = mat.shape  # size of input matrix
    blank = len(classes)  # index of blank label
    labelingWithBlanks = Common.extendByBlanks(Common.wordToLabelSeq(gt, classes), blank)  # ground truth text as label string extended by blanks
    cache = emptyCache(maxT, labelingWithBlanks)  # cache subresults to avoid recalculating subproblems over and over again
    return recLabelingProb(maxT-1, len(labelingWithBlanks)-1, mat, labelingWithBlanks, blank, cache) + recLabelingProb(maxT-1, len(labelingWithBlanks)-2, mat, labelingWithBlanks, blank, cache)


def ctcLoss(mat, gt, classes):
    "calculate CTC loss"
    try:
        return -math.log(ctcLabelingProb(mat, gt, classes))
    except ValueError:
        # probability of 0 gives log(0): treat as infinite loss
        return float('inf')


def testLoss():
    "test loss"
    classes = 'ab'
    mat = np.array([[0.4, 0, 0.6], [0.4, 0, 0.6]])
    print('Test loss calculation')
    expected = 0.64
    actual = ctcLabelingProb(mat, 'a', classes)
    print('Expected: ' + str(expected))
    print('Actual: ' + str(actual))
    print('OK' if expected == actual else 'ERROR')


if __name__ == '__main__':
    testLoss()
--------------------------------------------------------------------------------
/splits/haa_test.txt:
--------------------------------------------------------------------------------
horizontalbar_land
skateboard_grind
play_recorder
gangnam_style_dance
curling_sweep
hopscotch_spin
sledgehammer_strike_down
play_melodic
gym_pull
play_doublebass
punching_sandbag
dabbing
using_lawn_mower_riding_type
play_grandpiano
riding_mechanical_bull
floss_dance
rock_balancing
figure_skate_backward
roller-skating_backward
pizza_dough_toss
gym_ride
adjusting_glasses
discuss_throw
trapeze_interacting
bowling
shotput_throw
baseball_run
folding_clothes
play_accordian
pushup
archery
triple_jump_run
air_hocky
brushing_hair
play_hulusi
falling_off_chair
arm_wrestling
hugging_human
leaf_blowing
climbing_rope
fist_bump
breakdancing_flare
playing_conga_drum
eating_ice_cream
dog_walking
sprint_start
play_sanxian
play_ otamatone
play_xylophone
read_newspaper
carrying_with_head
play_triangle
weightlifting_overhead
grass_skiing
cleaning_mopping
play_sitar
push_wheelchair
playing_taiko_drum
throwing_bouquet
play_maracas
haircut_scissor
play_lute
face-changing_opera
bike_fall
push_wheelchair_alone
play_harp
diving_sneak
play_saxophone
play_kendama
burping
hand_in_hand
volleyball_underhand
pottery_wheel
situp
base_jumping
answering_questions
climb_pole
balancebeam_flip
piggyback_ride
surfing
balancebeam_jump
play_timpani
tire_pull
workout_chest-pull
battle-rope_rainbow
shoveling_snow
unevenbar_flip
figure_skate_jump_spin
basketball_shoot
play_guitar
talking_megaphone
play_ocarina
ski_frontflip
long_jump_jump
stone_skipping
decorating_snowman
face_slapping
chopping_wood
hurdle_jump
shake_cocktail
cutting_onion
badminton_serve
basketball_hookshot
dice_stack_shuffle
taekwondo_kick
roller-skating_forward
conducting
peeling_banana
football_throw
using_inhaler
badminton_underswing
backflip
riding_camel
fire_dancing_circulating
screw_car_tire
swinging_axe_on_a_tree
sticking tongue out
baseball_catch_flyball
balancebeam_walk
diving_rotate
frisbee_throw
triple_jump_jump
bending_back
kiss
chainsaw_tree
brushing_teeth
diving_jump
shooting_handgun
play_hulahoop
riding_elephant
baseball_swing
play_cymbals
taekwondo_punch
speedskating_forward
bowls_throw
horizontalbar_jump
hopscotch_skip
watering_plants
building_snowman
clear_snow_off_car
climb_icecliff
pole_vault_run
baseball_catch_catcher
CPR
soccer_throw
equestrian_run
ski_backflip
guitar_flip
neck_side_pull_stretch
canoeing_slalom
ironing_clothes
underarm_turn
dice_shuffle_reveal
using_metal_detector
reading_book
gym_lunges
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Semi-supervised Few-shot Atomic Action Recognition

This repo contains the code for our paper "Semi-supervised Few-shot Atomic Action Recognition". Please check our [paper](https://arxiv.org/abs/2011.08410) and [project page](https://sausage-song.github.io/home/FSAA/) for more details.

![FSAA Architecture](https://github.com/Sausage-SONG/Few-shot-action-recognition/raw/master/FSAA.png)

Our learning strategy is divided into two parts: 1) train an encoder with unsupervised learning; 2) train the action classification module with supervised learning. The encoder provides fine-grained spatial and temporal video processing with high length flexibility: it embeds the video features and combines them temporally with a TCN. The classification module applies attention pooling and compares multi-head relations. Finally, the CTC and MSE losses enable time-invariant few-shot classification training.

# Requirements

pytorch >= 1.5.0
torchvision >= 0.6.0
numpy >= 1.18.1
scipy >= 1.4.1
[vidaug](https://github.com/okankop/vidaug) >= 0.1

# Usage

## Installation

1. Clone the repo
2. Install [required packages](#requirements)
3. Download [trained models](#trained-models) to `/models` (Optional)
4. Download the [datasets](#datasets) (Optional)

## Training

As mentioned in the [intro](#semi-supervised-few-shot-atomic-action-recognition), our model training has two parts.

### 1. Train the encoder with unsupervised learning.

Here we use [MoCo](https://github.com/facebookresearch/moco), but this part can be done with virtually any unsupervised learning method.

First clone [MoCo](https://github.com/facebookresearch/moco). Then do the following copy & replace:

```
cp '/moco/builder.py' '/moco/'
cp '/moco/{dataset.py,encoder.py,main_moco.py,moco_encoder.py,rename.py,tcn.py}' '/'
```

We recommend first reading the MoCo instructions to learn how it works, then filling in the relevant paths in `main_moco.py` and starting your training. Afterwards, use `rename.py` to split the trained model (a .tar file) into a `c3d.pkl` and a `tcn.pkl` for the next step.

### 2. Train the whole model with supervised learning.

Load your pretrained C3D and TCN models and continue:

`python3 train.py -d='./splits/.json' -n=''`

## Testing

`python3 test.py -d='./splits/.json' -c=''`

# Trained Models

TODO

# Datasets

We use three atomic action datasets:
1. [HAA](https://www.cse.ust.hk/haa/index.html)
2. [FineGym](https://sdolivia.github.io/FineGym/)
3. [MIT](http://moments.csail.mit.edu/)

Dataset splits and json files can be found under `/splits`; see the example dataset jsons, or use the scripts there to generate your own. If you want to use another dataset, make sure it has a `///<
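The surviving `generate_standard` fragment in `/splits/generate_dataset_json.py` fills the same three fields that appear in `splits/example_standard.json` (`name`, `folders`, `splits`). A minimal sketch of generating a custom dataset json, assuming the elided `read_split` helper simply reads one class name per line; the dataset name and folder paths below are hypothetical placeholders, not shipped files:

```python
import json

def read_split(path):
    # Assumed behavior of the elided helper: one class name per line.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

result = {
    "name": "haa",                                  # hypothetical dataset name
    "folders": ["/data/haa/videos"],                # hypothetical video root(s)
    "splits": [read_split("splits/haa_train.txt"),  # train classes
               [],                                  # val classes (empty here)
               read_split("splits/haa_test.txt")],  # test classes
}

with open("splits/haa.json", "w") as file:
    file.write(json.dumps(result))
```

The resulting json can then be passed to `train.py` and `test.py` through the `-d` flag shown above.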