├── core ├── __init__.py ├── const.py ├── metrics.py ├── config.py ├── keras_utils.py ├── config_utils.py ├── pytorch_utils.py ├── data_utils.py ├── utils.py └── image_utils.py ├── nets ├── __init__.py ├── resnet_152_pytorch.py ├── layers_pytorch.py ├── i3d_torch_charades_utils.py ├── timeception_pytorch.py ├── i3d_torch_charades.py ├── i3d_torch_charades_test.py └── resnet_152_keras.py ├── datasets └── __init__.py ├── experiments ├── __init__.py ├── test_pytorch.py ├── test_keras.py ├── train_keras.py └── train_pytorch.py ├── data └── assets │ ├── badge-keras.png │ ├── badge-pytorch.png │ ├── badge-tensorflow.png │ ├── timeception_layer.jpg │ ├── timeception_layer.pdf │ └── timeception_layer.svg ├── scripts ├── test_charades_i3d_tc4_f1024.sh └── train_charades_i3d_tc4_f1024.sh ├── requirements.txt ├── __doc__.py ├── configs ├── charades_i3d_tc2_f256.yaml ├── charades_i3d_tc3_f256.yaml ├── charades_i3d_tc3_f512.yaml └── charades_i3d_tc4_f1024.yaml ├── main.py └── README.md /core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/test_pytorch.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/assets/badge-keras.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/badge-keras.png -------------------------------------------------------------------------------- /data/assets/badge-pytorch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/badge-pytorch.png -------------------------------------------------------------------------------- /data/assets/badge-tensorflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/badge-tensorflow.png -------------------------------------------------------------------------------- /data/assets/timeception_layer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/timeception_layer.jpg -------------------------------------------------------------------------------- /data/assets/timeception_layer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/timeception_layer.pdf -------------------------------------------------------------------------------- /scripts/test_charades_i3d_tc4_f1024.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python 
../experiments/test.py --config_file charades_i3d_tc4_f1024.yaml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | opencv 3 | scikit-learn 4 | 5 | keras 6 | tensorflow-gpu 7 | torch 8 | torchvision 9 | torchsummary 10 | torchviz -------------------------------------------------------------------------------- /scripts/train_charades_i3d_tc4_f1024.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python ../experiments/train.py --config_file charades_i3d_tc4_f1024.yaml 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /__doc__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | __author__ = 'Noureldien Hussein' 24 | __copyright__ = 'Copyright (c) 2019, Noureldien Hussein' 25 | __credits__ = [''] 26 | __license__ = 'GPLv3' 27 | __version__ = '1.0.0' 28 | __maintainer__ = 'Noureldien Hussein' 29 | __email__ = 'nhussein@uva.nl' 30 | __status__ = 'Development' 31 | -------------------------------------------------------------------------------- /configs/charades_i3d_tc2_f256.yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env vim 2 | 3 | NUM_GPUS: 1 # how many gups to use 4 | LOG_PERIOD: 10 # log period 5 | DATASET_NAME: 'charades' # name of dataset 6 | 7 | MODEL: 8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl' 9 | N_CLASSES: 157 # how many classes as output 10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups 11 | N_TC_LAYERS: 2 # number of timeception layers 12 | N_TC_TIMESTEPS: 32 # how mant timesteps expected as input to the timeception layers 13 | N_INPUT_TIMESTEPS: 256 # how many timesteps (i.e. 
frames) expected as an input to the backbone CNN 14 | NAME: 'charades_timeception' # name suffex for the model to be trained 15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb'# which backbone cnn is used 16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn 17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 18 | 19 | TRAIN: 20 | BATCH_SIZE: 32 # batch size for training 21 | N_EPOCHS: 500 # how many training epochs 22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only') 23 | N_WORKERS: 10 # how many parallel workers in the data generator 24 | 25 | TEST: 26 | BATCH_SIZE: 64 27 | N_SAMPLES: 10 28 | 29 | SOLVER: 30 | NAME: 'adam' 31 | LR: 0.01 32 | ADAM_EPSILON: 0.0001 33 | SGD_WEIGHT_DECAY: 0.0001 34 | SGD_MOMENTUM: 0.9 35 | SGD_NESTEROV: True -------------------------------------------------------------------------------- /configs/charades_i3d_tc3_f256.yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env vim 2 | 3 | NUM_GPUS: 1 # how many gups to use 4 | LOG_PERIOD: 10 # log period 5 | DATASET_NAME: 'charades' # name of dataset 6 | 7 | MODEL: 8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl' 9 | N_CLASSES: 157 # how many classes as output 10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups 11 | N_TC_LAYERS: 3 # number of timeception layers 12 | N_TC_TIMESTEPS: 32 # how mant timesteps expected as input to the timeception layers 13 | N_INPUT_TIMESTEPS: 256 # how many timesteps (i.e. frames) expected as an input to the backbone CNN 14 | NAME: 'charades_timeception' # name suffex for the model to be trained 15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb'# which backbone cnn is used 16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn 17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 18 | 19 | TRAIN: 20 | BATCH_SIZE: 32 # batch size for training 21 | N_EPOCHS: 500 # how many training epochs 22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only') 23 | N_WORKERS: 10 # how many parallel workers in the data generator 24 | 25 | TEST: 26 | BATCH_SIZE: 50 27 | N_SAMPLES: 10 28 | 29 | SOLVER: 30 | NAME: 'adam' 31 | LR: 0.01 32 | ADAM_EPSILON: 0.0001 33 | SGD_WEIGHT_DECAY: 0.0001 34 | SGD_MOMENTUM: 0.9 35 | SGD_NESTEROV: True -------------------------------------------------------------------------------- /configs/charades_i3d_tc3_f512.yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env vim 2 | 3 | NUM_GPUS: 1 # how many gups to use 4 | LOG_PERIOD: 10 # log period 5 | DATASET_NAME: 'charades' # name of dataset 6 | 7 | MODEL: 8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl' 9 | N_CLASSES: 157 # how many classes as output 10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups 11 | N_TC_LAYERS: 3 # number of timeception layers 12 | N_TC_TIMESTEPS: 64 # how mant timesteps expected as input to the timeception layers 13 | N_INPUT_TIMESTEPS: 512 # how many timesteps (i.e. 
frames) expected as an input to the backbone CNN 14 | NAME: 'charades_timeception' # name suffex for the model to be trained 15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb'# which backbone cnn is used 16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn 17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 18 | 19 | TRAIN: 20 | BATCH_SIZE: 20 # batch size for training 21 | N_EPOCHS: 500 # how many training epochs 22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only') 23 | N_WORKERS: 10 # how many parallel workers in the data generator 24 | 25 | TEST: 26 | BATCH_SIZE: 40 27 | N_SAMPLES: 10 28 | 29 | SOLVER: 30 | NAME: 'adam' 31 | LR: 0.01 32 | ADAM_EPSILON: 0.0001 33 | SGD_WEIGHT_DECAY: 0.0001 34 | SGD_MOMENTUM: 0.9 35 | SGD_NESTEROV: True -------------------------------------------------------------------------------- /configs/charades_i3d_tc4_f1024.yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env vim 2 | 3 | NUM_GPUS: 1 # how many gups to use 4 | LOG_PERIOD: 10 # log period 5 | DATASET_NAME: 'charades' # name of dataset 6 | 7 | MODEL: 8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl' 9 | N_CLASSES: 157 # how many classes as output 10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups 11 | N_TC_LAYERS: 4 # number of timeception layers 12 | N_TC_TIMESTEPS: 128 # how mant timesteps expected as input to the timeception layers 13 | N_INPUT_TIMESTEPS: 1024 # how many timesteps (i.e. frames) expected as an input to the backbone CNN 14 | NAME: 'charades_timeception' # name suffex for the model to be trained 15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb'# which backbone cnn is used 16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn 17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 18 | 19 | TRAIN: 20 | BATCH_SIZE: 16 # batch size for training 21 | N_EPOCHS: 500 # how many training epochs 22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only') 23 | N_WORKERS: 10 # how many parallel workers in the data generator 24 | 25 | TEST: 26 | BATCH_SIZE: 32 27 | N_SAMPLES: 10 28 | 29 | SOLVER: 30 | NAME: 'adam' 31 | LR: 0.01 32 | ADAM_EPSILON: 0.0001 33 | SGD_WEIGHT_DECAY: 0.0001 34 | SGD_MOMENTUM: 0.9 35 | SGD_NESTEROV: True -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 
21 | ######################################################################## 22 | 23 | """ 24 | Main file of the project. 25 | """ 26 | 27 | def __main(): 28 | from experiments import train_keras, test_keras, train_pytorch, test_pytorch 29 | 30 | # to train Timeception using keras 31 | train_keras.__main() 32 | 33 | # or using pytorch 34 | # train_pytorch.__main() 35 | 36 | # to test Timeception using keras 37 | # test_keras.__main() 38 | 39 | # or using pytorch 40 | # test_pytorch.__main() 41 | 42 | if __name__ == '__main__': 43 | __main() 44 | pass 45 | -------------------------------------------------------------------------------- /experiments/test_keras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Test Timeception models. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | import os 34 | import datetime 35 | from optparse import OptionParser 36 | 37 | import tensorflow as tf 38 | import keras.backend as K 39 | from keras.layers import Dense, LeakyReLU, Dropout, Input, Activation 40 | from keras.optimizers import SGD, Adam 41 | from keras.models import Sequential, Model 42 | from keras.layers.normalization import BatchNormalization 43 | 44 | from nets import timeception 45 | from nets.layers_keras import MaxLayer 46 | from core import utils, keras_utils, image_utils, config_utils, const, config, data_utils 47 | from core.utils import Path as Pth 48 | 49 | logger = logging.getLogger(__name__) 50 | 51 | def test_tco(): 52 | pass -------------------------------------------------------------------------------- /core/const.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Constants for project. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import os 33 | import platform 34 | import numpy as np 35 | 36 | DL_FRAMEWORKS = np.array(['caffe', 'tensorflow', 'pytorch', 'keras', 'caffe2']) 37 | DL_FRAMEWORK = None 38 | GPU_CORE_ID = 0 39 | 40 | CNN_FEATURE_SIZES = np.array([2048, 2048, 1000, 1024, 1000, 2048, 2048]) 41 | CNN_FEATURE_TYPES = np.array(['fc6', 'fc7', 'fc1000', 'fc1024', 'fc365', 'prob', 'pool5', 'fc8a', 'res3b7', 'res4b35', 'res5c']) 42 | CNN_MODEL_TYPES = np.array(['resnet152', 'googlenet1k', 'vgg16', 'places365-resnet152', 'places365-vgg', 'googlenet13k']) 43 | RESIZE_TYPES = np.array(['resize', 'resize_crop', 'resize_crop_scaled', 'resize_keep_aspect_ratio_padded']) 44 | ROOT_PATH_TYPES = np.array(['data', 'project']) 45 | TRAIN_SCHEMES = np.array(['ete', 'tco']) 46 | MODEL_CLASSIFICATION_TYPES = np.array(['ml', 'sl']) 47 | MODEL_MULTISCALE_TYPES = np.array(['dl', 'ks']) 48 | SOLVER_NAMES = np.array(['adam', 'sgd']) 49 | DATASET_NAMES = np.array(['charades', 'kinetics400', 'breakfast_actions', 'you_cook_2', 'multi_thumos']) 50 | DATA_ROOT_PATH = './data' 51 | PROJECT_ROOT_PATH = '../' 52 | MACHINE_NAME = platform.node() 53 | -------------------------------------------------------------------------------- /core/metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Evaluation functions. 
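A usage sketch (shapes inferred from the code below, not from separate documentation): `map_charades(y_true, y_pred)` and `map_sklearn(y_true, y_pred)` expect numpy arrays of shape (n_samples, n_classes) holding binary ground-truth labels and prediction scores respectively, and return the mean average precision as a scalar, e.g. `m_ap = map_charades(y_true, y_pred)`.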
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import numpy as np 33 | from sklearn.metrics import average_precision_score 34 | 35 | def map_charades(y_true, y_pred): 36 | """ Returns mAP """ 37 | m_aps = [] 38 | n_classes = y_pred.shape[1] 39 | for oc_i in range(n_classes): 40 | pred_row = y_pred[:, oc_i] 41 | sorted_idxs = np.argsort(-pred_row) 42 | true_row = y_true[:, oc_i] 43 | tp = true_row[sorted_idxs] == 1 44 | fp = np.invert(tp) 45 | n_pos = tp.sum() 46 | if n_pos < 0.1: 47 | m_aps.append(float('nan')) 48 | continue 49 | f_pcs = np.cumsum(fp) 50 | t_pcs = np.cumsum(tp) 51 | prec = t_pcs / (f_pcs + t_pcs).astype(float) 52 | avg_prec = 0 53 | for i in range(y_pred.shape[0]): 54 | if tp[i]: 55 | avg_prec += prec[i] 56 | m_aps.append(avg_prec / n_pos.astype(float)) 57 | m_aps = np.array(m_aps) 58 | m_ap = np.mean(m_aps) 59 | return m_ap 60 | 61 | def map_sklearn(y_true, y_pred): 62 | # """ Returns mAP """ 63 | n_classes = y_true.shape[1] 64 | map = [average_precision_score(y_true[:, i], y_pred[:, i]) for i in range(n_classes)] 65 | map = np.nan_to_num(map) 66 | map = np.mean(map) 67 | return map 68 | 69 | def accuracy(y_true, y_pred): 70 | idx = np.argmax(y_pred, axis=1) 71 | n_items = len(y_true) 72 | accuracy = np.sum(idx == y_true) / float(n_items) 73 | return accuracy 74 | 75 | def acuracy_top_n(n_top, y_true, y_pred): 76 | n_corrects = 0 77 | for gt, pr in zip(y_true, y_pred): 78 | idx = np.argsort(pr)[::-1] 79 | idx = idx[0:n_top] 80 | gt = np.where(gt == 1)[0][0] 81 | if gt in idx: 82 | n_corrects += 1 83 | n = len(y_true) 84 | score = n_corrects / float(n) 85 | return score 86 | 87 | -------------------------------------------------------------------------------- /core/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Definition for all configuration options for training/testing Timeception model on various datasets. 
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | import sys 34 | 35 | from core.utils import AttrDict 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | __C = AttrDict() 40 | cfg = __C 41 | 42 | # region Misc 43 | 44 | __C.DEBUG = False # is debugging 45 | __C.NUM_GPUS = 1 # how many gups to use 46 | __C.LOG_PERIOD = 10 # log period 47 | __C.DATASET_NAME = str('') # name of dataset 48 | 49 | # endregion 50 | 51 | # region Model 52 | 53 | __C.MODEL = AttrDict() 54 | __C.MODEL.CLASSIFICATION_TYPE = str('') # either multi-label 'ml' or single-label 'sl' 55 | __C.MODEL.N_CLASSES = 157 # how many classes as output 56 | __C.MODEL.N_CHAMNNEL_GROUPS = 8 # how many channel groups 57 | __C.MODEL.N_TC_LAYERS = 4 # number of timeception layers 58 | __C.MODEL.N_TC_TIMESTEPS = 64 # how mant timesteps expected as input to the timeception layers 59 | __C.MODEL.N_INPUT_TIMESTEPS = 512 # how many timesteps (i.e. frames) expected as an input to the backbone CNN 60 | __C.MODEL.NAME = str('') # name suffex for the model to be trained 61 | __C.MODEL.BACKBONE_CNN = str('') # which backbone cnn is used 62 | __C.MODEL.BACKBONE_FEATURE = str('') # type of feature output from backbone cnn 63 | __C.MODEL.MULTISCALE_TYPE = str('') # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 64 | 65 | # endregion 66 | 67 | # region Train 68 | 69 | __C.TRAIN = AttrDict() 70 | __C.TRAIN.BATCH_SIZE = 64 # batch size for training 71 | __C.TRAIN.N_EPOCHS = 500 # how many training epochs 72 | __C.TRAIN.SCHEME = str('') # either 'ete' (end-to-end) or tco ('timeception-only') 73 | __C.TRAIN.N_WORKERS = 10 # 74 | 75 | # endregion 76 | 77 | # region Test 78 | 79 | __C.TEST = AttrDict() 80 | __C.TEST.BATCH_SIZE = 64 81 | __C.TEST.N_SAMPLES = 10 82 | 83 | # endregion 84 | 85 | # region Solver 86 | 87 | __C.SOLVER = AttrDict() 88 | __C.SOLVER.NAME = str('adam') 89 | __C.SOLVER.LR = 0.0001 90 | __C.SOLVER.ADAM_EPSILON = 1e-4 91 | __C.SOLVER.SGD_WEIGHT_DECAY = 0.0001 92 | __C.SOLVER.SGD_MOMENTUM = 0.9 93 | __C.SOLVER.SGD_NESTEROV = True 94 | 95 | # endregion 96 | -------------------------------------------------------------------------------- /nets/resnet_152_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | ResNet-152 fine-tuned on Charades. 
25 | https://github.com/gsig/charades-algorithms/tree/master/pytorch 26 | """ 27 | 28 | from __future__ import absolute_import 29 | from __future__ import division 30 | from __future__ import print_function 31 | from __future__ import unicode_literals 32 | 33 | import logging 34 | import warnings 35 | import os 36 | import random 37 | import sys 38 | import time 39 | import datetime 40 | import math 41 | import shutil 42 | import random 43 | 44 | import numpy as np 45 | import cv2 46 | import scipy.io 47 | import h5py 48 | from collections import OrderedDict 49 | 50 | from core import const as c, utils 51 | from core import image_utils 52 | 53 | logger = logging.getLogger(__name__) 54 | 55 | if c.DL_FRAMEWORK == 'tensorflow': 56 | import tensorflow as tf 57 | elif c.DL_FRAMEWORK == 'caffe': 58 | import caffe 59 | elif c.DL_FRAMEWORK == 'pytorch': 60 | import torch 61 | import torch.nn as nn 62 | import torch.nn.parallel 63 | import torch.backends.cudnn as cudnn 64 | import torch.distributed as dist 65 | import torchvision.models as tmodels 66 | import importlib 67 | elif c.DL_FRAMEWORK == 'keras': 68 | import tensorflow as tf 69 | import keras.backend as K 70 | 71 | def get_resnet_152_charades_model(): 72 | import torch 73 | import torch.nn as nn 74 | import torch.nn.parallel 75 | import torch.backends.cudnn as cudnn 76 | import torch.distributed as dist 77 | import torchvision.models as tmodels 78 | import importlib 79 | import torch.utils.model_zoo as model_zoo 80 | 81 | root_path = c.DATA_ROOT_PATH 82 | model_arch = 'resnet152' 83 | model_checkpoint_path = '%s/Charades/baseline_models/resnet_rgb.pth.tar' % (root_path) 84 | 85 | # load model 86 | print("=> creating model '{}'".format(model_arch)) 87 | model = tmodels.__dict__[model_arch](pretrained=False) 88 | cudnn.benchmark = True 89 | 90 | # load checkpoint 91 | checkpoint = torch.load(model_checkpoint_path) 92 | checkpoint = checkpoint['state_dict'] 93 | 94 | # fix keys of state dict 95 | unwanted_keys = ['fc.weight', 'fc.bias'] 96 | state_dict = OrderedDict() 97 | for k, v in checkpoint.iteritems(): 98 | key = k.replace('module.', '') 99 | if key not in unwanted_keys: 100 | state_dict[key] = v 101 | 102 | # remove fc and avgpool layers 103 | layers = model._modules.items() 104 | layers = list(layers)[:-2] 105 | layers = OrderedDict(layers) 106 | model = nn.Sequential(layers) 107 | 108 | # load the dictionary 109 | model.load_state_dict(state_dict) 110 | 111 | # if parrallize the model 112 | # model = torch.nn.DataParallel(model).cuda() 113 | 114 | # make sure it's only for testing 115 | model.train(False) 116 | 117 | # convert to eval model 118 | model.eval() 119 | 120 | # convert to gpu model 121 | model.cuda() 122 | 123 | return model 124 | 125 | def get_mean_std_for_resnet_152_pytorch_model(): 126 | img_mean = [0.485, 0.456, 0.406] 127 | img_std = [0.229, 0.224, 0.225] 128 | return img_mean, img_std 129 | -------------------------------------------------------------------------------- /nets/layers_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of 
the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Layers for pytorch. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import numpy as np 33 | import logging 34 | 35 | import torch 36 | from torch.nn import Module, Conv2d, Conv1d 37 | from torch.nn import functional as F 38 | 39 | from core import pytorch_utils 40 | 41 | logger = logging.getLogger(__name__) 42 | 43 | # region Basic Layers 44 | 45 | class ChannelShuffleLayer(Module): 46 | """ 47 | Shuffle the channels across groups. 48 | """ 49 | 50 | def __init__(self, n_channels, n_groups): 51 | super(ChannelShuffleLayer, self).__init__() 52 | 53 | n_channels_per_group = int(n_channels / n_groups) 54 | assert n_channels_per_group * n_groups == n_channels 55 | 56 | self.n_channels_per_group = n_channels_per_group 57 | self.n_groups = n_groups 58 | 59 | def forward(self, input): 60 | """ 61 | input shape (None, 1024, 20, 7, 7), or (BN, C, T, H, W) 62 | """ 63 | 64 | input_shape = input.size() 65 | n_samples, n_channels, n_timesteps, side_dim1, side_dim2 = input_shape 66 | 67 | n_groups = self.n_groups 68 | n_channels_per_group = self.n_channels_per_group 69 | 70 | tensor = input.view(n_samples, n_groups, n_channels_per_group, n_timesteps, side_dim1, side_dim2) 71 | tensor = tensor.permute(0, 2, 1, 3, 4, 5) 72 | tensor = tensor.contiguous() 73 | tensor = tensor.view(n_samples, n_channels, n_timesteps, side_dim1, side_dim2) 74 | 75 | return tensor 76 | 77 | # endregion 78 | 79 | # region Timeception Layers 80 | 81 | class DepthwiseConv1DLayer(Module): 82 | """ 83 | Shuffle the channels across groups. 84 | """ 85 | 86 | def __init__(self, input_shape, kernel_size, dilation, name): 87 | super(DepthwiseConv1DLayer, self).__init__() 88 | 89 | assert len(input_shape) == 5 90 | 91 | self.kernel_size = kernel_size 92 | self.dilation = dilation 93 | self._name = name 94 | 95 | n_channels = input_shape[1] 96 | n_timesteps = input_shape[2] 97 | 98 | # TODO: support using different dilation rates. 
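        # Note: with in_channels == out_channels == n_channels and groups=n_channels, the Conv1d below is a
        # depthwise convolution, i.e. each channel is convolved with its own temporal kernel; the padding value
        # returned by calc_padding_1d is intended to keep the temporal length unchanged ('same'-style padding).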
99 | padding = pytorch_utils.calc_padding_1d(n_timesteps, kernel_size) 100 | self.depthwise_conv1d = Conv1d(n_channels, n_channels, kernel_size, dilation=dilation, groups=n_channels, padding=padding) 101 | self.depthwise_conv1d._name = name 102 | 103 | def forward(self, input): 104 | """ 105 | input shape (None, 1024, 20, 7, 7), or (BN, C, T, H, W) 106 | """ 107 | 108 | input_shape = input.size() 109 | 110 | n, c, t, h, w = input_shape 111 | 112 | # transpose and reshape to hide the spatial dimension, only expose the temporal dimension for depthwise conv 113 | tensor = input.permute(0, 3, 4, 1, 2) # (None, 7, 7, 1024, 20) 114 | tensor = tensor.contiguous() 115 | tensor = tensor.view(-1, c, t) # (None*7*7, 1024, 20) 116 | 117 | # depthwise conv on the temporal dimension, as if it was the spatial dimension 118 | tensor = self.depthwise_conv1d(tensor) # (None*7*7, 1024, 20) 119 | 120 | # get timesteps after convolution 121 | t = tensor.size()[-1] 122 | 123 | # reshape to get the spatial dimensions 124 | tensor = tensor.view(n, h, w, c, t) # (None, 7, 7, 1024, 20) 125 | 126 | # finally, transpose to get the desired output shape 127 | tensor = tensor.permute(0, 3, 4, 1, 2) # (None, 1024, 20, 7, 7) 128 | 129 | return tensor 130 | 131 | # endregion 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Timeception for Complex Action Recognition 2 | 3 | ![Keras](./data/assets/badge-keras.png "Keras") ![Keras](./data/assets/badge-tensorflow.png "TensorFlow") ![Keras](./data/assets/badge-pytorch.png "PyTorch") 4 | 5 | This code repository is the implementation for the paper [Timeception for Complex Action Recognition](https://arxiv.org/abs/1812.01289). 6 | We provide the implementation for 3 different libraries: `keras`, `tensorflow` and `pytorch`. 7 | 8 | ![Timeception for Complex Action Recognition](./data/assets/timeception_layer.jpg "Timeception Block") 9 | 10 | ### Citation 11 | 12 | Please consider citing this work using this BibTeX entry 13 | 14 | ```bibtex 15 | @inproceedings{hussein2018timeception, 16 | title = {Timeception for Complex Action Recognition}, 17 | author = {Hussein, Noureldien and Gavves, Efstratios and Smeulders, Arnold WM}, 18 | booktitle = {CVPR}, 19 | year = {2019} 20 | } 21 | ``` 22 | 23 | ### How to Use? 24 | 25 | ###### Keras 26 | 27 | Using `keras`, we can define `timeception` as a sub-model. 28 | Then we use it along with another model definition. 29 | For example, here we define 4 `timeception` layers followed by a `dense` layer for classification. 
30 | 31 | ```python 32 | from keras import Model 33 | from keras.layers import Input, Dense 34 | from nets.layers_keras import MaxLayer 35 | from nets.timeception import Timeception 36 | 37 | # define the timeception layers 38 | timeception = Timeception(1024, n_layers=4) 39 | 40 | # define network for classification 41 | input = Input(shape=(128, 7, 7, 1024)) 42 | tensor = timeception(input) 43 | tensor = MaxLayer(axis=(1, 2, 3))(tensor) 44 | output = Dense(100, activation='softmax')(tensor) 45 | model = Model(inputs=input, outputs=output) 46 | model.summary() 47 | ``` 48 | 49 | This results in the model defined as: 50 | 51 | ``` 52 | Layer (type) Output Shape Param # 53 | ================================================ 54 | (InputLayer) (None, 128, 7, 7, 1024) 0 55 | (Timeception) (None, 8, 7, 7, 2480) 1494304 56 | (MaxLayer) (None, 2480) 0 57 | (Dense) (None, 100) 248100 58 | ================================================ 59 | Total params: 1,742,404 60 | ``` 61 | 62 | ###### TensorFlow 63 | 64 | Using `tensorflow`, we can define `timeception` as a list of nodes in the computational graph. 65 | Then we use it along with another model definition. 66 | For example, here a function defines 4 `timeception` layers. 67 | It takes the input tensor, feeds it forward to the `timeception` layers, and returns the output tensor `output`. 68 | 69 | ```python 70 | import tensorflow as tf 71 | from nets import timeception 72 | 73 | # define input tensor 74 | input = tf.placeholder(tf.float32, shape=(None, 128, 7, 7, 1024)) 75 | 76 | # feedforward the input to the timeception layers 77 | tensor = timeception.timeception_layers(input, n_layers=4) 78 | 79 | # the output is (?, 8, 7, 7, 2480) 80 | print (tensor.get_shape()) 81 | ``` 82 | 83 | ###### PyTorch 84 | 85 | Using `pytorch`, we can define `timeception` as a module. 86 | Then we use it along with another model definition. 87 | For example, here we define 4 `timeception` layers followed by a `dense` layer for classification. 88 | 89 | ```python 90 | import numpy as np 91 | import torch as T 92 | from nets import timeception_pytorch 93 | 94 | # define input tensor 95 | input = T.tensor(np.zeros((32, 1024, 128, 7, 7)), dtype=T.float32) 96 | 97 | # define 4 layers of timeception 98 | module = timeception_pytorch.Timeception(input.size(), n_layers=4) 99 | 100 | # feedforward the input to the timeception layers 101 | tensor = module(input) 102 | 103 | # the output is (32, 2480, 8, 7, 7) 104 | print (tensor.size()) 105 | ``` 106 | 107 | ### Installation 108 | 109 | We use Python 2.7.15, provided by Anaconda 4.6.2, and we depend on the following Python packages. 110 | - Keras 2.2.4 111 | - TensorFlow 1.10.1 112 | - PyTorch 1.0.1 113 | 114 | ### Training 115 | 116 | ### Testing 117 | 118 | ### Fine-tuning 119 | 120 | ### Pretrained Models 121 | 122 | #### Charades 123 | 124 | We will add all pretrained models for Charades by the end of April. 125 | For testing, start with the script `./scripts/test_charades_timeception.sh`. 126 | In order to change which baseline is used for testing, set the `--config_file` argument to one of the following options. 127 | 128 | ###### 2D-ResNet-152 129 | 130 | Timeception on top of 2D-ResNet-152 as backbone. 
131 | 132 | | Config File | Backbone | TC Layers | Frames | mAP (%) | Model | 133 | |---|:---:|:---:|:---:|:---:|:---:| 134 | | [charades_r2d_tc3_f32.yaml](./configs/charades_r2d_tc3_f32.yaml) | R2D | 3 | 32 | 30.37 | [Link](./data/charades/charades_r2d_tc3_f32.pkl) | 135 | | [charades_r2d_tc3_f64.yaml](./configs/charades_r2d_tc3_f64.yaml) | R2D | 3 | 64 | 31.25 | [Link](./data/charades/charades_r2d_tc3_f64.pkl) | 136 | | [charades_r2d_tc4_f128.yaml](./configs/charades_r2d_tc4_f128.yaml) | R2D | 4 | 128 | 31.82 | [Link](./data/charades/charades_r2d_tc4_f128.pkl) | 137 | 138 | ###### I3D 139 | 140 | Timeception on top of I3D as backbone. 141 | 142 | | Config File | Backbone | TC Layers | Frames | mAP (%) | Model | 143 | |---|:---:|:---:|:---:|:---:|:---:| 144 | | [charades_i3d_tc3_f256.yaml](./configs/charades_i3d_tc3_f256.yaml) | I3D | 3 | 256 | 33.89 | [Link](./data/charades/charades_i3d_tc3_f256.pkl) | 145 | | [charades_i3d_tc3_f512.yaml](./configs/charades_i3d_tc3_f512.yaml) | I3D | 3 | 512 | 35.46 | [Link](./data/charades/charades_i3d_tc3_f512.pkl) | 146 | | [charades_i3d_tc4_f1024.yaml](./configs/charades_i3d_tc4_f1024.yaml) | I3D | 4 | 1024 | 37.20 | [Link](./data/charades/charades_i3d_tc4_f1024.pkl) | 147 | 148 | ###### 3D-ResNet-100 149 | Timeception on top of 3D-ResNet-100 as backbone. 150 | 151 | 152 | | Config File | Backbone | TC Layers | Frames | mAP (%) | Model | 153 | |---|:---:|:---:|:---:|:---:|:---:| 154 | | [charades_r3d_tc4_f1024.yaml](./configs/charades_r3d_tc4_f1024.yaml) | R3D | 4 | 1024 | 41.1 | [Link](./data/charades/charades_r3d_tc4_f1024.pkl) | 155 | 156 | 157 | #### Kinetics 400 158 | 159 | We will add all pretrained models for Kinetics 400 by the end of June. 160 | 161 | ### License 162 | 163 | The code and the models in this repo are released under the GNU GPL v3.0 [LICENSE](LICENSE). 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /core/keras_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helper functions for keras. 
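These include JSON/weights save and load helpers (`save_model`, `load_model`), a TensorFlow/Keras implementation of the Charades mAP metric (`map_charades`), and a `SaveCallback` that writes the model definition and weights under ./data/<dataset_name>/models/<model_name>/ after every epoch.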
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import os 33 | import json 34 | import pydot 35 | import logging 36 | import numpy as np 37 | 38 | import tensorflow as tf 39 | from tensorflow.contrib import framework as tf_framework 40 | 41 | import keras.backend as K 42 | from keras.callbacks import Callback 43 | from keras.utils import vis_utils 44 | from keras.models import Sequential, model_from_json 45 | 46 | from core import config_utils 47 | 48 | logger = logging.getLogger(__name__) 49 | 50 | # region Constants 51 | 52 | EPS_VALUE = 1e-9 53 | LOSSES = ['categorical_crossentropy', 'mean_squared_error', 'mean_absolute_error', 'binary_crossentropy'] 54 | METRICS = ['accuracy', 'mean_squared_error', 'mean_absolute_error'] 55 | OPTIMIZERS = ['sgd', 'rmsprop', 'adam'] 56 | ACTIVATIONS = ['tanh', 'relu', 'sigmoid', 'softmax'] 57 | 58 | # endregion 59 | 60 | # region Functions 61 | 62 | def save_model_figure(model, file_path='/.model.eps'): 63 | vis_utils.plot_model(model, file_path, show_shapes=True, show_layer_names=True) 64 | 65 | def load_model(json_path, weight_path, metrics=None, loss=None, optimizer=None, custom_objects=None, is_compile=True): 66 | with open(json_path, 'r') as f: 67 | model_json_string = json.load(f) 68 | model_json_dict = json.loads(model_json_string) 69 | model = model_from_json(model_json_string, custom_objects=custom_objects) 70 | model.load_weights(weight_path) 71 | 72 | if is_compile: 73 | if optimizer is None: 74 | optimizer = model_json_dict['optimizer']['name'] 75 | 76 | if loss is None: 77 | loss = model_json_dict['loss'] 78 | 79 | if metrics is None: 80 | model.compile(loss=loss, optimizer=optimizer) 81 | else: 82 | model.compile(loss=loss, optimizer=optimizer, metrics=metrics) 83 | 84 | return model 85 | 86 | def save_model(model, json_path, weight_path): 87 | model.save_weights(weight_path, overwrite=True) 88 | model_json = model.to_json() 89 | with open(json_path, 'w') as f: 90 | json.dump(model_json, f) 91 | 92 | def layer_exist(model, layer_name): 93 | exist = False 94 | for layer in model.layers: 95 | if layer.name == layer_name: 96 | exist = True 97 | break 98 | 99 | return exist 100 | 101 | def calc_num_batches(n_samples, batch_size): 102 | n_batch = int(n_samples / float(batch_size)) 103 | n_batch = n_batch if n_samples % batch_size == 0 else n_batch + 1 104 | return n_batch 105 | 106 | # endregion 107 | 108 | # region Metrics 109 | 110 | def map_charades(y_true, y_pred): 111 | """ 112 | Returns mAP 113 | """ 114 | m_aps = [] 115 | 116 | tf_one = tf.constant(1, dtype=tf.float32) 117 | 118 | n_classes = y_pred.shape[1] 119 | for oc_i in range(n_classes): 120 | pred_row = y_pred[:, oc_i] 121 | sorted_idxs = tf_framework.argsort(-pred_row) 122 | true_row = y_true[:, oc_i] 123 | true_row = tf.map_fn(lambda i: true_row[i], sorted_idxs, dtype=np.float32) 124 | tp_poolean = tf.equal(true_row, tf_one) 125 | tp = tf.cast(tp_poolean, dtype=np.float32) 126 | fp = K.reverse(tp, axes=0) 127 | n_pos = tf.reduce_sum(tp) 128 | f_pcs = tf.cumsum(fp) 129 | t_pcs = tf.cumsum(tp) 130 | s = f_pcs + t_pcs 131 | 132 | s = tf.cast(s, tf.float32) 133 | t_pcs = tf.cast(t_pcs, tf.float32) 134 | tp_float = tf.cast(tp_poolean, np.float32) 135 | 136 | prec = t_pcs / s 137 | avg_prec = prec * tp_float 138 | 139 | n_pos = tf.cast(n_pos, tf.float32) 140 | avg_prec = avg_prec / n_pos 141 | avg_prec = tf.expand_dims(avg_prec, axis=0) 142 | 
m_aps.append(avg_prec) 143 | 144 | m_aps = K.concatenate(m_aps, axis=0) 145 | mAP = K.mean(m_aps) 146 | return mAP 147 | 148 | # endregion 149 | 150 | # region Callbacks 151 | 152 | class SaveCallback(Callback): 153 | def __init__(self, dataset_name, model_name): 154 | self.model_name = model_name 155 | 156 | model_root_path = './data/%s/models' % (dataset_name) 157 | assert os.path.exists(model_root_path) 158 | 159 | model_root_path = './data/%s/models/%s' % (dataset_name, model_name) 160 | if not os.path.exists(model_root_path): 161 | os.mkdir(model_root_path) 162 | 163 | self.model_root_path = model_root_path 164 | 165 | super(SaveCallback, self).__init__() 166 | 167 | def on_epoch_end(self, idx_epoch, logs=None): 168 | """ 169 | Save the model. 170 | """ 171 | 172 | epoch_num = idx_epoch + 1 173 | self.__save(epoch_num) 174 | 175 | def __save(self, epoch_num): 176 | model_root_path = self.model_root_path 177 | model = self.model 178 | 179 | # hfpy accept only strings as a path 180 | model_json_path = str('%s/%03d.json' % (model_root_path, epoch_num)) 181 | model_weight_path = str('%s/%03d.pkl' % (model_root_path, epoch_num)) 182 | 183 | # save model definition as json, and save model weights 184 | model.save_weights(model_weight_path, overwrite=True) 185 | model_json = model.to_json() 186 | with open(model_json_path, 'w') as f: 187 | json.dump(model_json, f) 188 | 189 | # endregion 190 | -------------------------------------------------------------------------------- /nets/i3d_torch_charades_utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import os 3 | import random 4 | import sys 5 | import time 6 | import datetime 7 | import math 8 | import shutil 9 | import random 10 | import threading 11 | 12 | import numpy as np 13 | import cv2 14 | import scipy.io 15 | import h5py 16 | from optparse import OptionParser 17 | from collections import OrderedDict 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.nn.parallel 22 | import torch.backends.cudnn as cudnn 23 | import torch.distributed as dist 24 | import torchvision.models as tmodels 25 | import importlib 26 | import torchsummary 27 | from core import pytorch_utils 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.optim import lr_scheduler 31 | from torch.autograd import Variable 32 | 33 | import torchvision 34 | from torchvision import datasets, transforms 35 | 36 | from core import const as c, utils 37 | from core import image_utils 38 | from nets import i3d_torch_charades_test 39 | 40 | def extract_features_rgb(): 41 | from core import config_utils 42 | 43 | is_local = config_utils.is_local_machine() 44 | if is_local: 45 | begin_num = None 46 | end_num = None 47 | else: 48 | parser = OptionParser() 49 | parser.add_option("-b", "--begin_num", dest="begin_num", help="begin_num") 50 | parser.add_option("-e", "--end_num", dest="end_num", help="end_num") 51 | parser.add_option("-c", "--gpu_core_id", dest="gpu_core_id", help="gpu_core_id") 52 | (options, args) = parser.parse_args() 53 | begin_num = int(options.begin_num) 54 | end_num = int(options.end_num) 55 | gpu_core_id = int(options.gpu_core_id) 56 | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_core_id) 57 | 58 | __extract_features_rgb(begin_num, end_num) 59 | 60 | def load_model_i3d_charades_rgb_for_testing(model_path): 61 | import torch 62 | from nets.i3d_torch_charades_test import InceptionI3d 63 | 64 | # setup the model 65 | state_dict = torch.load(model_path) 66 | 
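    # build the I3D architecture, swap its logits layer for the 157 Charades classes, load the fine-tuned
    # weights from the checkpoint, then switch the model to inference mode on the GPU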
model = InceptionI3d() 67 | model.replace_logits(157) 68 | model.load_state_dict(state_dict) 69 | model.train(False) 70 | model.eval() 71 | model.cuda() 72 | return model 73 | 74 | def __extract_features_rgb(begin_num=None, end_num=None): 75 | root_path = c.DATA_ROOT_PATH 76 | annotation_path = '%s/Charades/annotation/frames_dict_trimmed_multi_label_i3d_160_frames.pkl' % (root_path) 77 | features_root_path = '%s/Charades/features_i3d_charades_rgb_mixed_5c_trimmed_20_frames' % (root_path) 78 | video_frames_root_path = '%s/Charades/frames/Charades_v1_rgb' % (root_path) 79 | model_path = '%s/Charades/baseline_models/i3d/rgb_charades.pt' % (root_path) 80 | feature_name = 'Mixed_5c' 81 | 82 | (video_frames_dict_tr, video_frames_dict_te) = utils.pkl_load(annotation_path) 83 | video_frames_dict = dict() 84 | video_frames_dict.update(video_frames_dict_tr) 85 | video_frames_dict.update(video_frames_dict_te) 86 | video_names = video_frames_dict.keys() 87 | 88 | n_videos = len(video_names) 89 | frame_count = 0 90 | 91 | if not os.path.exists(features_root_path): 92 | print('Sorry, path does not exist: %s' % (features_root_path)) 93 | return 94 | 95 | t1 = time.time() 96 | print('extracting training features') 97 | print('start time: %s' % utils.timestamp()) 98 | 99 | # aync reader, and get load images for the first video 100 | img_reader = image_utils.AsyncImageReaderCharadesForI3DTorchModel(n_threads=20) 101 | img_reader.load_imgs_in_batch(__get_video_frame_pathes(video_names[0], video_frames_root_path, video_frames_dict)) 102 | 103 | # load the model 104 | model = __load_i3d_model_rgb(model_path) 105 | torchsummary.summary(model, input_size=(3, 160, 224, 224)) 106 | 107 | # loop on list of videos 108 | for idx_video in range(n_videos): 109 | video_num = idx_video + 1 110 | 111 | if begin_num is not None and end_num is not None: 112 | if video_num <= begin_num or video_num > end_num: 113 | continue 114 | 115 | video_name = video_names[idx_video] 116 | 117 | # wait untill the image_batch is loaded 118 | t1 = time.time() 119 | while img_reader.is_busy(): 120 | threading._sleep(0.1) 121 | t2 = time.time() 122 | duration_waited = t2 - t1 123 | print('...... video %d/%d: %s, waited: %d' % (video_num, n_videos, video_name, duration_waited)) 124 | 125 | # get the video frames 126 | video_frames = img_reader.get_images() 127 | 128 | # pre-load for the next video 129 | if video_num < n_videos: 130 | next_video_name = video_names[idx_video + 1] 131 | img_reader.load_imgs_in_batch(__get_video_frame_pathes(next_video_name, video_frames_root_path, video_frames_dict)) 132 | 133 | video_features_path = '%s/%s.pkl' % (features_root_path, video_name) 134 | # if os.path.exists(video_features_path): 135 | # print ('... features for video already exist: %s.pkl' % (video_name)) 136 | # continue 137 | 138 | if len(video_frames) != 160: 139 | print('... 
wrong n frames: %d' % (video_num)) 140 | continue 141 | 142 | # transpose to have the channel_first (160, 224, 224, 3) => (3, 160, 224, 224) 143 | video_frames = np.transpose(video_frames, (3, 0, 1, 2)) 144 | 145 | # add one dimension to represent the batch size 146 | video_frames = np.expand_dims(video_frames, axis=0) 147 | 148 | # prepare input variable 149 | with torch.no_grad(): 150 | # extract features 151 | input_var = torch.from_numpy(video_frames).cuda() 152 | output_var = model(input_var) 153 | output_var = output_var.cpu() 154 | features = output_var.data.numpy() # (1, 1024, 20, 7, 7) 155 | 156 | # don't forget to clean up variables 157 | del input_var 158 | del output_var 159 | 160 | # squeeze to remove the dimension of the batch_size 161 | features = features[0] # (1024, 20, 7, 7) 162 | 163 | # transpose to have the channel_last 164 | features = np.transpose(features, (1, 2, 3, 0)) # (20, 7, 7, 1024) 165 | 166 | # path to save the features 167 | utils.pkl_dump(features, video_features_path, is_highest=True) 168 | 169 | # increment counts 170 | frame_count += len(video_frames) 171 | 172 | t2 = time.time() 173 | print('finish extracting %d features in %d seconds' % (frame_count, t2 - t1)) 174 | print('end time: %s' % utils.timestamp()) 175 | 176 | def __get_video_frame_pathes(video_name, video_frames_root_path, video_frames_dict): 177 | video_frame_names = video_frames_dict[video_name] 178 | video_frame_pathes = [('%s/%s/%s') % (video_frames_root_path, video_name, n) for n in video_frame_names] 179 | video_frame_pathes = np.array(video_frame_pathes) 180 | return video_frame_pathes 181 | 182 | def __load_i3d_model_rgb(model_path): 183 | # setup the model 184 | state_dict = torch.load(model_path) 185 | model = i3d_torch_charades_test.InceptionI3d() 186 | model.replace_logits(157) 187 | model.load_state_dict(state_dict) 188 | model.cuda() 189 | model.train(True) 190 | return model 191 | 192 | if __name__ == '__main__': 193 | print('Hello World!') 194 | extract_features_rgb() 195 | -------------------------------------------------------------------------------- /core/config_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Configurations for project. 
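A minimal usage sketch: `cfg_from_file('./configs/charades_i3d_tc4_f1024.yaml')` merges a YAML file into the default options defined in `core.config`, and (unless called with `is_check=False`) runs `cfg_sanity_check()` to validate the chosen values against the allowed constants in `core.const`.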
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import os 33 | import platform 34 | import argparse 35 | import logging 36 | import yaml 37 | import pprint 38 | from ast import literal_eval 39 | 40 | from core.config import __C 41 | from core.utils import AttrDict 42 | from core import const, config, utils 43 | 44 | logger = logging.getLogger(__name__) 45 | 46 | # region Misc 47 | 48 | def get_machine_name(): 49 | return platform.node() 50 | 51 | def import_dl_platform(): 52 | if const.DL_FRAMEWORK == 'tensorflow': 53 | import tensorflow as tf 54 | elif const.DL_FRAMEWORK == 'pytorch': 55 | import torch 56 | elif const.DL_FRAMEWORK == 'caffe': 57 | import caffe 58 | elif const.DL_FRAMEWORK == 'keras': 59 | import keras.backend as K 60 | 61 | # endregion 62 | 63 | # region Config GPU 64 | 65 | def config_gpu(): 66 | if const.DL_FRAMEWORK == 'tensorflow': 67 | __config_gpu_for_tensorflow() 68 | elif const.DL_FRAMEWORK == 'pytorch': 69 | __config_gpu_for_pytorch() 70 | elif const.DL_FRAMEWORK == 'keras': 71 | __config_gpu_for_keras() 72 | elif const.DL_FRAMEWORK == 'caffe': 73 | __config_gpu_for_caffe() 74 | 75 | def __config_gpu_for_tensorflow(): 76 | import tensorflow as tf 77 | 78 | gpu_core_id = __parse_gpu_id() 79 | 80 | # import os 81 | # import tensorflow as tf 82 | # set the logging level of tensorflow 83 | # 1: filter out INFO 84 | # 2: filter out WARNING 85 | # 3: filter out ERROR 86 | # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # or any {'0', '1', '2'} 87 | 88 | # set which device to be used 89 | const.GPU_CORE_ID = gpu_core_id 90 | pass 91 | 92 | def __config_gpu_for_keras(): 93 | import tensorflow as tf 94 | import keras.backend as K 95 | 96 | gpu_core_id = __parse_gpu_id() 97 | 98 | K.clear_session() 99 | config = tf.ConfigProto() 100 | config.gpu_options.visible_device_list = str(gpu_core_id) 101 | config.gpu_options.allow_growth = True 102 | session = tf.Session(config=config) 103 | K.set_session(session) 104 | 105 | # set which device to be used 106 | const.GPU_CORE_ID = gpu_core_id 107 | 108 | def __config_gpu_for_pytorch(): 109 | import torch 110 | 111 | gpu_core_id = __parse_gpu_id() 112 | 113 | torch.cuda.set_device(gpu_core_id) 114 | 115 | # set which device to be used 116 | const.GPU_CORE_ID = gpu_core_id 117 | 118 | def __config_gpu_for_caffe(): 119 | import os 120 | 121 | gpu_core_id = __parse_gpu_id() 122 | 123 | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_core_id) 124 | 125 | # set which device to be used 126 | const.GPU_CORE_ID = gpu_core_id 127 | 128 | def __parse_gpu_id(): 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument('-c', '--gpu_core_id', default='-1', type=int) 131 | args = parser.parse_args() 132 | gpu_core_id = args.gpu_core_id 133 | return gpu_core_id 134 | 135 | # endregion 136 | 137 | # region Config File Helpers 138 | 139 | def cfg_print_cfg(): 140 | logger.info('Config file is:') 141 | logger.info(pprint.pformat(__C)) 142 | 143 | def cfg_merge_dicts(dict_a, dict_b): 144 | from ast import literal_eval 145 | 146 | for key, value in dict_a.items(): 147 | if key not in dict_b: 148 | raise KeyError('Invalid key in config file: {}'.format(key)) 149 | if type(value) is dict: 150 | dict_a[key] = value = AttrDict(value) 151 | if isinstance(value, str): 152 | try: 153 | value = literal_eval(value) 154 | except BaseException: 155 | pass 156 | # the types must match, too 157 | old_type = 
type(dict_b[key]) 158 | if old_type is not type(value) and value is not None: 159 | raise ValueError('Type mismatch ({} vs. {}) for config key: {}'.format(type(dict_b[key]), type(value), key)) 160 | # recursively merge dicts 161 | if isinstance(value, AttrDict): 162 | try: 163 | cfg_merge_dicts(dict_a[key], dict_b[key]) 164 | except BaseException: 165 | raise Exception('Error under config key: {}'.format(key)) 166 | else: 167 | dict_b[key] = value 168 | 169 | def cfg_from_file(file_path, is_check=True): 170 | """ 171 | Load a config file and merge it into the default options. 172 | """ 173 | 174 | # read from file 175 | yaml_config = utils.yaml_load(file_path) 176 | 177 | # merge to project config 178 | cfg_merge_dicts(yaml_config, __C) 179 | 180 | # make sure everything is okay 181 | if is_check: 182 | cfg_sanity_check() 183 | 184 | def cfg_from_attrdict(attr_dict): 185 | cfg_merge_dicts(attr_dict, __C) 186 | 187 | def cfg_from_dict(args_dict): 188 | """Set config keys via list (e.g., from command line).""" 189 | 190 | for key, value in args_dict.iteritems(): 191 | key_list = key.split('.') 192 | cfg = __C 193 | for subkey in key_list[:-1]: 194 | assert subkey in cfg, 'Config key {} not found'.format(subkey) 195 | cfg = cfg[subkey] 196 | subkey = key_list[-1] 197 | if subkey not in cfg: 198 | raise Exception('Config key {} not found'.format(subkey)) 199 | try: 200 | # handle the case when v is a string literal 201 | val = literal_eval(value) 202 | except BaseException: 203 | val = value 204 | if isinstance(val, type(cfg[subkey])) or cfg[subkey] is None: 205 | pass 206 | else: 207 | type1 = type(val) 208 | type2 = type(cfg[subkey]) 209 | msg = 'type {} does not match original type {}'.format(type1, type2) 210 | raise Exception(msg) 211 | cfg[subkey] = val 212 | 213 | def cfg_from_list(args_list): 214 | """ 215 | Set config keys via list (e.g., from command line). 
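    The list alternates dotted key paths and values; each value is parsed with
    ast.literal_eval when possible and must match the type of the default value
    it overrides. A minimal sketch (keys and values are illustrative only):

        cfg_from_list(['TRAIN.BATCH_SIZE', '32', 'SOLVER.LR', '0.001'])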
216 | """ 217 | from ast import literal_eval 218 | 219 | assert len(args_list) % 2 == 0, 'Specify values or keys for args' 220 | for key, value in zip(args_list[0::2], args_list[1::2]): 221 | key_list = key.split('.') 222 | cfg = __C 223 | for subkey in key_list[:-1]: 224 | assert subkey in cfg, 'Config key {} not found'.format(subkey) 225 | cfg = cfg[subkey] 226 | subkey = key_list[-1] 227 | assert subkey in cfg, 'Config key {} not found'.format(subkey) 228 | try: 229 | # handle the case when v is a string literal 230 | val = literal_eval(value) 231 | except BaseException: 232 | val = value 233 | msg = 'type {} does not match original type {}'.format(type(val), type(cfg[subkey])) 234 | assert isinstance(val, type(cfg[subkey])) or cfg[subkey] is None, msg 235 | cfg[subkey] = val 236 | 237 | def cfg_sanity_check(): 238 | assert __C.TRAIN.SCHEME in const.TRAIN_SCHEMES 239 | assert __C.MODEL.CLASSIFICATION_TYPE in const.MODEL_CLASSIFICATION_TYPES 240 | assert __C.MODEL.MULTISCALE_TYPE in const.MODEL_MULTISCALE_TYPES 241 | assert __C.SOLVER.NAME in const.SOLVER_NAMES 242 | assert __C.DATASET_NAME in const.DATASET_NAMES 243 | 244 | # endregion 245 | -------------------------------------------------------------------------------- /experiments/train_keras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Train Timeception layers on different datasets. There are two different ways to train Timeception. 25 | 1. Timeception-only (TCO): only timeception layers are trained, using features extracted from backbone CNNs. 26 | 2. End-to-end (ETE): timeception is trained on top of backbone CNN. The input is video frames passed throughtout the backboneCNN 27 | and then the resulted feature is fed to Timeception layers. Here, you enjoy all the benefits of end-to-end training. 28 | For example, do pre-processing to the input frames, randomly sample the frames, temporal jittering, ...., etc. 
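The scheme is selected by the TRAIN.SCHEME key of the YAML config file passed to this script ('tco' trains Timeception-only, otherwise the end-to-end scheme is used); see __main() below. A fragment of such a config (a sketch, not a complete file):

    TRAIN:
      SCHEME: 'tco'
      N_EPOCHS: 100
      BATCH_SIZE: 32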
29 | """ 30 | 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | from __future__ import unicode_literals 35 | 36 | import logging 37 | import os 38 | import datetime 39 | import numpy as np 40 | from optparse import OptionParser 41 | 42 | import tensorflow as tf 43 | import keras.backend as K 44 | from keras.layers import Dense, LeakyReLU, Dropout, Input, Activation, BatchNormalization 45 | from keras.optimizers import SGD, Adam 46 | from keras.models import Model 47 | 48 | from nets import timeception 49 | from nets.layers_keras import MaxLayer 50 | from core import utils, keras_utils, image_utils, config_utils, const, config, data_utils 51 | from core.utils import Path as Pth 52 | 53 | logger = logging.getLogger(__name__) 54 | 55 | def train_tco(): 56 | """ 57 | Train Timeception layers based on the given configurations. 58 | This train scheme is Timeception-only (TCO). 59 | """ 60 | 61 | # get some configs for the training 62 | n_workers = config.cfg.TRAIN.N_WORKERS 63 | n_epochs = config.cfg.TRAIN.N_EPOCHS 64 | dataset_name = config.cfg.DATASET_NAME 65 | model_name = '%s_%s' % (config.cfg.MODEL.NAME, utils.timestamp()) 66 | 67 | # data generators 68 | data_generator_tr = __define_data_generator(is_training=True) 69 | data_generator_te = __define_data_generator(is_training=False) 70 | 71 | logger.info('--- start time') 72 | logger.info(datetime.datetime.now()) 73 | logger.info('... [tr]: n_samples, n_batch, batch_size: %d, %d, %d' % (data_generator_tr.n_samples, data_generator_tr.n_batches, config.cfg.TRAIN.BATCH_SIZE)) 74 | logger.info('... [te]: n_samples, n_batch, batch_size: %d, %d, %d' % (data_generator_te.n_samples, data_generator_te.n_batches, config.cfg.TEST.BATCH_SIZE)) 75 | 76 | # callback to save the model 77 | save_callback = keras_utils.SaveCallback(dataset_name, model_name) 78 | 79 | # load model 80 | model = __define_timeception_model() 81 | logger.info(model.summary()) 82 | 83 | # train the model 84 | model.fit_generator(epochs=n_epochs, generator=data_generator_tr, validation_data=data_generator_te, use_multiprocessing=True, workers=n_workers, callbacks=[save_callback], verbose=2) 85 | 86 | logger.info('--- finish time') 87 | logger.info(datetime.datetime.now()) 88 | 89 | def train_ete(): 90 | """ 91 | Train Timeception layers based on the given configurations. 92 | This train scheme is End-to-end (ETE). 93 | """ 94 | 95 | model = __define_timeception_model() 96 | 97 | raise Exception('Sorry, not implemented yet!') 98 | 99 | def __define_data_generator(is_training): 100 | """ 101 | Define data generator. 
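    The generator class is chosen by the dataset name, and it is given the
    feature folder name and feature shape derived from the config. For example
    (illustrative values, assuming an 'i3d_rgb' backbone with 'mixed_5c'
    features and 256 timesteps):

        feature_name = 'features_i3d_rgb_mixed_5c_256f'
        feature_dim  = (256, 7, 7, 1024)   # (T, H, W, C)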
102 | """ 103 | 104 | # get some configs for the training 105 | n_classes = config.cfg.MODEL.N_CLASSES 106 | dataset_name = config.cfg.DATASET_NAME 107 | backbone_model_name = config.cfg.MODEL.BACKBONE_CNN 108 | backbone_feature_name = config.cfg.MODEL.BACKBONE_FEATURE 109 | n_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS 110 | 111 | batch_size_tr = config.cfg.TRAIN.BATCH_SIZE 112 | batch_size_te = config.cfg.TEST.BATCH_SIZE 113 | batch_size = batch_size_tr if is_training else batch_size_te 114 | 115 | # size and name of feature 116 | feature_name = 'features_%s_%s_%sf' % (backbone_model_name, backbone_feature_name, n_timesteps) 117 | c, h, w = utils.get_model_feat_maps_info(backbone_model_name, backbone_feature_name) 118 | feature_dim = (n_timesteps, h, w, c) 119 | 120 | # data generators 121 | params = {'batch_size': batch_size, 'n_classes': n_classes, 'feature_name': feature_name, 'feature_dim': feature_dim, 'is_shuffle': True, 'is_training': is_training} 122 | data_generator_class = data_utils.KERAS_DATA_GENERATORS_DICT[dataset_name] 123 | data_generator = data_generator_class(**params) 124 | 125 | return data_generator 126 | 127 | def __define_timeception_model(): 128 | """ 129 | Define Timeception classifier. 130 | """ 131 | 132 | # some configurations for the model 133 | classification_type = config.cfg.MODEL.CLASSIFICATION_TYPE 134 | solver_name = config.cfg.SOLVER.NAME 135 | solver_lr = config.cfg.SOLVER.LR 136 | adam_epsilon = config.cfg.SOLVER.ADAM_EPSILON 137 | n_tc_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS 138 | backbone_name = config.cfg.MODEL.BACKBONE_CNN 139 | feature_name = config.cfg.MODEL.BACKBONE_FEATURE 140 | n_tc_layers = config.cfg.MODEL.N_TC_LAYERS 141 | n_classes = config.cfg.MODEL.N_CLASSES 142 | is_dilated = config.cfg.MODEL.MULTISCALE_TYPE 143 | n_channels_in, channel_h, channel_w = utils.get_model_feat_maps_info(backbone_name, feature_name) 144 | n_groups = int(n_channels_in / 128.0) 145 | 146 | # optimizer and loss for either multi-label "ml" or single-label "sl" classification 147 | if classification_type == 'ml': 148 | loss = keras_utils.LOSSES[3] 149 | output_activation = keras_utils.ACTIVATIONS[2] 150 | metric_function = keras_utils.map_charades 151 | else: 152 | loss = keras_utils.LOSSES[0] 153 | output_activation = keras_utils.ACTIVATIONS[3] 154 | metric_function = keras_utils.METRICS[0] 155 | 156 | # define the optimizer 157 | optimizer = SGD(lr=0.01) if solver_name == 'sgd' else Adam(lr=solver_lr, epsilon=adam_epsilon) 158 | 159 | # input layer 160 | input_shape = (n_tc_timesteps, channel_h, channel_w, n_channels_in) # (T, H, W, C) 161 | tensor_input = Input(shape=input_shape, name='input') # (T, H, W, C) 162 | 163 | # define timeception layers, as a standalone module 164 | timeception_module = timeception.Timeception(n_channels_in, n_tc_layers, n_groups, is_dilated=is_dilated) 165 | tensor = timeception_module(tensor_input) # (T, H, W, C) 166 | 167 | # but if you fancy, you can define timeception layers as a series of layers 168 | # tensor = timeception.timeception_layers(tensor_input, n_tc_layers, n_groups, is_dilated=is_dilated) # (T, H, W, C) 169 | 170 | # max-pool over space-time 171 | tensor = MaxLayer(axis=(1, 2, 3), name='maxpool_t_s')(tensor) 172 | 173 | # dense layers for classification 174 | tensor = Dropout(0.5)(tensor) 175 | tensor = Dense(512)(tensor) 176 | tensor = BatchNormalization()(tensor) 177 | tensor = LeakyReLU(alpha=0.2)(tensor) 178 | tensor = Dropout(0.25)(tensor) 179 | tensor = Dense(n_classes)(tensor) 180 | tensor_output = 
Activation(output_activation)(tensor) 181 | 182 | # define the model 183 | model = Model(inputs=tensor_input, outputs=tensor_output) 184 | model.compile(loss=loss, optimizer=optimizer, metrics=[metric_function]) 185 | 186 | return model 187 | 188 | def __main(): 189 | """ 190 | Run this script to train Timeception. 191 | """ 192 | 193 | default_config_file = 'charades_i3d_tc4_f1024.yaml' 194 | default_config_file = 'charades_i3d_tc2_f256.yaml' 195 | 196 | # Parse the arguments 197 | parser = OptionParser() 198 | parser.add_option('-c', '--config_file', dest='config_file', default=default_config_file, help='Yaml config file that contains all training details.') 199 | (options, args) = parser.parse_args() 200 | config_file = options.config_file 201 | 202 | # check if exist 203 | if config_file is None or config_file == '': 204 | msg = 'Config file not passed, default config is used: %s' % (config_file) 205 | logging.warning(msg) 206 | config_file = default_config_file 207 | 208 | # path of config file 209 | config_path = './configs/%s' % (config_file) 210 | 211 | # check if file exist 212 | if not os.path.exists(config_path): 213 | msg = 'Sorry, could not find config file with the following path: %s' % (config_path) 214 | logging.error(msg) 215 | else: 216 | # read the config from file and copy it to the project configuration "cfg" 217 | config_utils.cfg_from_file(config_path) 218 | 219 | # choose which training scheme, either 'ete' or 'tco' 220 | training_scheme = config.cfg.TRAIN.SCHEME 221 | 222 | # start training 223 | if training_scheme == 'tco': 224 | train_tco() 225 | else: 226 | train_ete() 227 | 228 | if __name__ == '__main__': 229 | __main() 230 | -------------------------------------------------------------------------------- /core/pytorch_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helper functions for pytorch. 
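Includes model save/load helpers, "same"-padding calculators, a custom summary() that prints custom layer names, and a ModelSaver class. As a worked example of calc_padding_1d (which solves o = i in the conv equation o = [i + 2p - k - (k-1)(d-1)] / s + 1 for stride s = 1): a kernel of size k = 3 with dilation d = 1 needs padding p = 1, and k = 7 with d = 1 needs p = 3.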
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import os 33 | import logging 34 | import json 35 | import numpy as np 36 | from collections import OrderedDict 37 | 38 | import torch 39 | from torch import nn 40 | from torch.nn import functional as F 41 | from torch.autograd import Variable 42 | 43 | import torchviz 44 | import torchvision 45 | import torchsummary 46 | 47 | logger = logging.getLogger(__name__) 48 | 49 | # region Helpers 50 | 51 | def save_model(model, path): 52 | model.save_state_dict(path) 53 | 54 | def load_model(model, path): 55 | model_dict = torch.load(path) 56 | model.load_state_dict(model_dict) 57 | 58 | def padding1d(tensor, filter): 59 | it, = tensor.shape[2:] 60 | ft = filter 61 | 62 | pt = max(0, (it - 1) + (ft - 1) + 1 - it) 63 | oddt = (pt % it != 0) 64 | 65 | mode = str('constant') 66 | if any([oddt]): 67 | pad = [0, int(oddt)] 68 | tensor = F.pad(tensor, pad, mode=mode) 69 | 70 | padding = (pt // it,) 71 | return tensor, padding 72 | 73 | def padding3d(tensor, filter, mode=str('constant')): 74 | """ 75 | Input shape (BN, C, T, H, W) 76 | """ 77 | 78 | it, ih, iw = tensor.shape[2:] 79 | ft, fh, fw = filter.shape 80 | 81 | pt = max(0, (it - 1) + (ft - 1) + 1 - it) 82 | ph = max(0, (ih - 1) + (fh - 1) + 1 - ih) 83 | pw = max(0, (iw - 1) + (fw - 1) + 1 - iw) 84 | 85 | oddt = (pt % 2 != 0) 86 | oddh = (ph % 2 != 0) 87 | oddw = (pw % 2 != 0) 88 | 89 | if any([oddt, oddh, oddw]): 90 | pad = [0, int(oddt), 0, int(oddh), 0, int(oddw)] 91 | tensor = F.pad(tensor, pad, mode=mode) 92 | 93 | padding = (pt // 2, ph // 2, pw // 2) 94 | tensor = F.conv3d(tensor, filter, padding=padding) 95 | 96 | return tensor 97 | 98 | def calc_padding_1d(input_size, kernel_size, stride=1, dilation=1): 99 | """ 100 | Calculate the padding. 101 | """ 102 | 103 | # i = input 104 | # o = output 105 | # p = padding 106 | # k = kernel_size 107 | # s = stride 108 | # d = dilation 109 | # the equation is 110 | # o = [i + 2 * p - k - (k - 1) * (d - 1)] / s + 1 111 | # give that we want i = o, then we solve the equation for p gives us 112 | 113 | i = input_size 114 | s = stride 115 | k = kernel_size 116 | d = dilation 117 | 118 | padding = 0.5 * (k - i + s * (i - 1) + (k - 1) * (d - 1)) 119 | padding = int(padding) 120 | 121 | return padding 122 | 123 | def summary(model, input_size, batch_size=-1, device="cuda"): 124 | """ 125 | Custom summary function, to print the custom name of module, instead of the assigned layer name. 
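    A typical call (a sketch; the input size assumes I3D 'mixed_5c' features
    with 32 timesteps, i.e. shape (C, T, H, W) without the batch dimension):

        summary(model, (1024, 32, 7, 7), batch_size=2, device='cuda')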
126 | :param model: 127 | :param input_size: 128 | :param batch_size: 129 | :param device: 130 | :return: 131 | """ 132 | 133 | # this has to be imported here, not to create import-loop between "nets.layers_pytorch" and "core.pytorch_utils" 134 | from nets.layers_pytorch import DepthwiseConv1DLayer 135 | 136 | def register_hook(module): 137 | 138 | def hook(module, input, output): 139 | 140 | # old code 141 | # class_name = str(module.__class__).split(".")[-1].split("'")[0] 142 | # m_key = "%s-%i" % (class_name, module_idx + 1) 143 | 144 | # don't consider this layer 145 | if type(module) == DepthwiseConv1DLayer: 146 | return 147 | 148 | # new code 149 | if hasattr(module, '_name'): 150 | m_key = str(module._name) 151 | else: 152 | module_idx = len(summary) 153 | class_name = str(module.__class__).split(".")[-1].split("'")[0] 154 | m_key = "%s-%i" % (class_name, module_idx + 1) 155 | 156 | summary[m_key] = OrderedDict() 157 | summary[m_key]["input_shape"] = list(input[0].size()) 158 | summary[m_key]["input_shape"][0] = batch_size 159 | if isinstance(output, (list, tuple)): 160 | summary[m_key]["output_shape"] = [ 161 | [-1] + list(o.size())[1:] for o in output 162 | ] 163 | else: 164 | summary[m_key]["output_shape"] = list(output.size()) 165 | summary[m_key]["output_shape"][0] = batch_size 166 | 167 | params = 0 168 | if hasattr(module, "weight") and hasattr(module.weight, "size"): 169 | params += torch.prod(torch.LongTensor(list(module.weight.size()))) 170 | summary[m_key]["trainable"] = module.weight.requires_grad 171 | if hasattr(module, "bias") and hasattr(module.bias, "size"): 172 | params += torch.prod(torch.LongTensor(list(module.bias.size()))) 173 | summary[m_key]["nb_params"] = params 174 | 175 | if (not isinstance(module, nn.Sequential) and not isinstance(module, nn.ModuleList) and not (module == model)): 176 | hooks.append(module.register_forward_hook(hook)) 177 | 178 | device = device.lower() 179 | assert device in [ 180 | "cuda", 181 | "cpu", 182 | ], "Input device is not valid, please specify 'cuda' or 'cpu'" 183 | 184 | if device == "cuda" and torch.cuda.is_available(): 185 | dtype = torch.cuda.FloatTensor 186 | else: 187 | dtype = torch.FloatTensor 188 | 189 | # multiple inputs to the network 190 | if isinstance(input_size, tuple): 191 | input_size = [input_size] 192 | 193 | # batch_size of 2 for batchnorm 194 | x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size] 195 | # print(type(x[0])) 196 | 197 | # create properties 198 | summary = OrderedDict() 199 | hooks = [] 200 | 201 | # register hook 202 | model.apply(register_hook) 203 | 204 | # make a forward pass 205 | # print(x.shape) 206 | model(*x) 207 | 208 | # remove these hooks 209 | for h in hooks: 210 | h.remove() 211 | 212 | print("----------------------------------------------------------------") 213 | line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #") 214 | print(line_new) 215 | print("================================================================") 216 | total_params = 0 217 | total_output = 0 218 | trainable_params = 0 219 | for layer in summary: 220 | # input_shape, output_shape, trainable, nb_params 221 | line_new = "{:>20} {:>25} {:>15}".format(layer, str(summary[layer]["output_shape"]), "{0:,}".format(summary[layer]["nb_params"]), ) 222 | total_params += summary[layer]["nb_params"] 223 | total_output += np.prod(summary[layer]["output_shape"]) 224 | if "trainable" in summary[layer]: 225 | if summary[layer]["trainable"] == True: 226 | trainable_params += 
summary[layer]["nb_params"] 227 | print(line_new) 228 | 229 | # assume 4 bytes/number (float on cuda). 230 | total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.)) 231 | total_output_size = abs(2. * total_output * 4. / (1024 ** 2.)) # x2 for gradients 232 | total_params_size = abs(total_params.numpy() * 4. / (1024 ** 2.)) 233 | total_size = total_params_size + total_output_size + total_input_size 234 | 235 | print("================================================================") 236 | print("Total params: {0:,}".format(total_params)) 237 | print("Trainable params: {0:,}".format(trainable_params)) 238 | print("Non-trainable params: {0:,}".format(total_params - trainable_params)) 239 | print("----------------------------------------------------------------") 240 | print("Input size (MB): %0.2f" % total_input_size) 241 | print("Forward/backward pass size (MB): %0.2f" % total_output_size) 242 | print("Params size (MB): %0.2f" % total_params_size) 243 | print("Estimated Total Size (MB): %0.2f" % total_size) 244 | print("----------------------------------------------------------------") 245 | # return summary 246 | 247 | # endregion 248 | 249 | # region Classes 250 | 251 | class ModelSaver(): 252 | def __init__(self, model, dataset_name, model_name): 253 | self.model = model 254 | self.model_name = model_name 255 | 256 | model_root_path = './data/%s/models' % (dataset_name) 257 | assert os.path.exists(model_root_path) 258 | 259 | model_root_path = './data/%s/models/%s' % (dataset_name, model_name) 260 | if not os.path.exists(model_root_path): 261 | os.mkdir(model_root_path) 262 | 263 | self.model_root_path = model_root_path 264 | 265 | def save(self, idx_epoch): 266 | """ 267 | Save the model. 268 | """ 269 | epoch_num = idx_epoch + 1 270 | model_root_path = self.model_root_path 271 | model_state_path = str('%s/%03d.pt' % (model_root_path, epoch_num)) 272 | 273 | # save model state using pytorch 274 | model_state = self.model.state_dict() 275 | torch.save(model_state, model_state_path) 276 | 277 | 278 | # endregion 279 | -------------------------------------------------------------------------------- /core/data_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helpful functions and classes to deal with data. 
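Two flavours are provided per dataset: a keras.utils.Sequence generator and a torch.utils.data.Dataset, registered in KERAS_DATA_GENERATORS_DICT and PYTORCH_DATASETS_DICT respectively. A minimal sketch of selecting one (parameter values are illustrative):

    from core import data_utils

    generator_class = data_utils.KERAS_DATA_GENERATORS_DICT['charades']
    generator = generator_class(batch_size=32, n_classes=157,
                                feature_dim=(256, 7, 7, 1024),
                                feature_name='features_i3d_rgb_mixed_5c_256f',
                                is_training=True)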
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | import random 34 | import numpy as np 35 | import pickle as pkl 36 | from datetime import datetime 37 | from multiprocessing.dummy import Pool 38 | 39 | import keras.utils 40 | import torch.utils.data 41 | import torchvision 42 | 43 | from core import utils, config 44 | from core.utils import Path as Pth 45 | 46 | logger = logging.getLogger(__name__) 47 | 48 | # region Async File Loader 49 | 50 | class AsyncLoaderVideoFeatures(): 51 | """ 52 | Load features for the video frames. 53 | """ 54 | 55 | def __init__(self, feats_path, target, n_frames_per_video, batch_size, n_feat_maps, feat_map_side_dim, n_threads=10, annotation_dict=None): 56 | random.seed(101) 57 | np.random.seed(101) 58 | 59 | self.__feats_pathes = feats_path 60 | self.__n_frames_per_video = n_frames_per_video 61 | self.__n_feat_maps = n_feat_maps 62 | self.__feat_map_side_dim = feat_map_side_dim 63 | self.__annotation_dict = annotation_dict 64 | 65 | self.__batch_size = batch_size 66 | self.__y = target 67 | 68 | self.__is_busy = False 69 | self.__batch_features = None 70 | self.__batch_y = None 71 | self.__n_threads_in_pool = n_threads 72 | self.__pool = Pool(self.__n_threads_in_pool) 73 | 74 | def load_feats_in_batch(self, batch_number): 75 | self.__is_busy = True 76 | 77 | idx_batch = batch_number - 1 78 | start_idx = idx_batch * self.__batch_size 79 | stop_idx = (idx_batch + 1) * self.__batch_size 80 | 81 | batch_feat_pathes = self.__feats_pathes[start_idx:stop_idx] 82 | batch_y = self.__y[start_idx:stop_idx] 83 | 84 | n_batch_feats = len(batch_feat_pathes) 85 | n_batch_y = len(batch_y) 86 | idxces = range(0, n_batch_feats) 87 | 88 | assert n_batch_feats == n_batch_y 89 | 90 | # parameters passed to the reading function 91 | params = [data_item for data_item in zip(idxces, batch_feat_pathes)] 92 | 93 | # set list of batch features before start reading 94 | batch_feats_shape = (n_batch_feats, self.__n_frames_per_video, self.__feat_map_side_dim, self.__feat_map_side_dim, self.__n_feat_maps) 95 | 96 | self.__batch_features = np.zeros(batch_feats_shape, dtype=np.float32) 97 | self.__batch_y = batch_y 98 | 99 | # start pool of threads 100 | self.__pool.map_async(self.__load_features, params, callback=self.__thread_pool_callback) 101 | 102 | def get_batch_data(self): 103 | if self.__is_busy: 104 | raise Exception('Sorry, you can\'t get features while threads are running!') 105 | else: 106 | return (self.__batch_features, self.__batch_y) 107 | 108 | def get_y(self): 109 | return self.__y 110 | 111 | def is_busy(self): 112 | return self.__is_busy 113 | 114 | def __thread_pool_callback(self, args): 115 | self.__is_busy = False 116 | 117 | def __load_features(self, params): 118 | 119 | idx_video = params[0] 120 | feats_path = params[1] 121 | video_name = feats_path.split('/')[-1] 122 | 123 | try: 124 | # load feature from file 125 | feats = utils.pkl_load(feats_path) 126 | 127 | n_feats = len(feats) 128 | assert n_feats == self.__n_frames_per_video, 'Sorry, wrong number of frames, expected: %d, got: %d' % (self.__n_frames_per_video, n_feats) 129 | self.__batch_features[idx_video] = feats 130 | 131 | except Exception as exp: 132 | print('\nSorry, error in loading feature %s' % (feats_path)) 133 | print(exp) 134 | 135 | def shuffle_data(self): 136 | """ 137 | shuffle these data: self.__feats_pathes, self.__class_names, 
self.__y 138 | :return: 139 | """ 140 | 141 | n_samples = len(self.__feats_pathes) 142 | 143 | idx = range(n_samples) 144 | np.random.shuffle(idx) 145 | self.__feats_pathes = self.__feats_pathes[idx] 146 | self.__y = self.__y[idx] 147 | 148 | def close(self): 149 | self.__pool.close() 150 | self.__pool.terminate() 151 | 152 | # endregion 153 | 154 | # region Data Generators (Keras) 155 | 156 | class DataGeneratorCharades(keras.utils.Sequence): 157 | 'Generates data for Keras' 158 | 159 | def __init__(self, batch_size, n_classes, feature_dim, feature_name, is_training, is_shuffle=True): 160 | """ 161 | Initialization 162 | """ 163 | self.batch_size = batch_size 164 | self.is_training = is_training 165 | self.n_classes = n_classes 166 | self.feature_dim = feature_dim 167 | self.feature_name = feature_name 168 | self.is_shuffle = is_shuffle 169 | self.dataset_name = 'charades' 170 | 171 | # load annotation 172 | root_path = './data/charades' 173 | annotation_path = '%s/annotation/video_annotation.pkl' % (root_path) 174 | if self.is_training: 175 | (video_names, y, _, _) = utils.pkl_load(annotation_path) 176 | else: 177 | (_, _, video_names, y) = utils.pkl_load(annotation_path) 178 | 179 | # convert relative to root pathes 180 | feats_path = np.array(['%s/%s/%s.pkl' % (root_path, feature_name, p) for p in video_names]) 181 | 182 | n_samples = len(y) 183 | self.n_samples = n_samples 184 | self.n_batches = utils.calc_num_batches(n_samples, batch_size) 185 | self.feats_path = feats_path 186 | self.y = y 187 | 188 | # shuffle the data 189 | if self.is_shuffle: 190 | self.__shuffle() 191 | 192 | def __len__(self): 193 | """ 194 | Denotes the number of batches per epoc 195 | """ 196 | return self.n_batches 197 | 198 | def __getitem__(self, index): 199 | """ 200 | Generate one batch of data. 201 | """ 202 | 203 | idx_start = index * self.batch_size 204 | idx_stop = (index + 1) * self.batch_size 205 | y = self.y[idx_start:idx_stop] 206 | feats_path = self.feats_path[idx_start:idx_stop] 207 | 208 | n_items = len(feats_path) 209 | x_shape = tuple([n_items] + list(self.feature_dim)) 210 | x = np.zeros(x_shape, dtype=np.float32) 211 | 212 | # loop of feature pathes and load them 213 | for idx, p in enumerate(feats_path): 214 | x[idx] = utils.pkl_load(p) 215 | 216 | return x, y 217 | 218 | def on_epoch_end(self): 219 | """ 220 | Shuffle after finishing the epoch. 
221 | :return: 222 | """ 223 | 224 | if self.is_shuffle: 225 | self.__shuffle() 226 | 227 | def __shuffle(self): 228 | 229 | idx = range(self.n_samples) 230 | np.random.shuffle(idx) 231 | self.feats_path = self.feats_path[idx] 232 | self.y = self.y[idx] 233 | 234 | # endregion 235 | 236 | # region Data Loaders (PyTorch) 237 | 238 | class DatasetCharades(torch.utils.data.Dataset): 239 | def __init__(self, batch_size, n_classes, feature_dim, feature_name, is_training, is_shuffle=True): 240 | """ 241 | Initialization 242 | """ 243 | 244 | self.batch_size = batch_size 245 | self.is_training = is_training 246 | self.n_classes = n_classes 247 | self.feature_dim = feature_dim 248 | self.feature_name = feature_name 249 | self.is_shuffle = is_shuffle 250 | self.dataset_name = 'charades' 251 | 252 | # load annotation 253 | root_path = './data/charades' 254 | annotation_path = '%s/annotation/video_annotation.pkl' % (root_path) 255 | if self.is_training: 256 | (video_names, y, _, _) = utils.pkl_load(annotation_path) 257 | else: 258 | (_, _, video_names, y) = utils.pkl_load(annotation_path) 259 | 260 | # in case of single label classification, debinarize the labels 261 | if config.cfg.MODEL.CLASSIFICATION_TYPE == 'sl': 262 | y = utils.debinarize_label(y) 263 | 264 | # in any case, make sure target is float 265 | y = y.astype(np.float32) 266 | 267 | # convert relative to root pathes 268 | feats_path = np.array(['%s/%s/%s.pkl' % (root_path, feature_name, p) for p in video_names]) 269 | 270 | n_samples = len(y) 271 | self.n_samples = n_samples 272 | self.n_batches = utils.calc_num_batches(n_samples, batch_size) 273 | self.feats_path = feats_path 274 | self.y = y 275 | 276 | # shuffle the data 277 | if self.is_shuffle: 278 | self.__shuffle() 279 | 280 | def __getitem__(self, index): 281 | """ 282 | Generate one batch of data 283 | """ 284 | 285 | y = self.y[index] 286 | p = self.feats_path[index] 287 | x = utils.pkl_load(p) # (T, H, W, C) 288 | 289 | # convert to channel last 290 | x = np.transpose(x, (3, 0, 1, 2)) # (T, H, W, C) 291 | 292 | return x, y 293 | 294 | def __len__(self): 295 | return self.n_samples 296 | 297 | def __shuffle(self): 298 | idx = range(self.n_samples) 299 | np.random.shuffle(idx) 300 | self.feats_path = self.feats_path[idx] 301 | self.y = self.y[idx] 302 | 303 | # endregion 304 | 305 | # region Constants 306 | 307 | KERAS_DATA_GENERATORS_DICT = {'charades': DataGeneratorCharades} 308 | PYTORCH_DATASETS_DICT = {'charades': DatasetCharades} 309 | 310 | # endregion 311 | -------------------------------------------------------------------------------- /experiments/train_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 
18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Train Timeception layers on different datasets. There are two different ways to train Timeception. 25 | 1. Timeception-only (TCO): only timeception layers are trained, using features extracted from backbone CNNs. 26 | 2. End-to-end (ETE): timeception is trained on top of backbone CNN. The input is video frames passed throughtout the backboneCNN 27 | and then the resulted feature is fed to Timeception layers. Here, you enjoy all the benefits of end-to-end training. 28 | For example, do pre-processing to the input frames, randomly sample the frames, temporal jittering, ...., etc. 29 | """ 30 | 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | from __future__ import unicode_literals 35 | 36 | import os 37 | import sys 38 | import time 39 | import logging 40 | import datetime 41 | import numpy as np 42 | from optparse import OptionParser 43 | 44 | import torch 45 | import torch.utils.data 46 | 47 | from torch.nn import functional as F 48 | from torch.nn import Module, Dropout, BatchNorm1d, LeakyReLU, Linear, LogSoftmax, Sigmoid 49 | from torch.optim import SGD, Adam 50 | from torch.autograd import Variable 51 | from torch.utils.data import DataLoader 52 | from torchvision import datasets, transforms 53 | 54 | import torchviz 55 | import torchvision 56 | import torchsummary 57 | 58 | from nets import timeception_pytorch 59 | from core import utils, pytorch_utils, image_utils, config_utils, const, config, data_utils, metrics 60 | from core.utils import Path as Pth 61 | 62 | logger = logging.getLogger(__name__) 63 | 64 | def train_tco(): 65 | """ 66 | Train Timeception layers based on the given configurations. 67 | This train scheme is Timeception-only (TCO). 68 | """ 69 | 70 | # get some configs for the training 71 | n_epochs = config.cfg.TRAIN.N_EPOCHS 72 | dataset_name = config.cfg.DATASET_NAME 73 | model_name = '%s_%s' % (config.cfg.MODEL.NAME, utils.timestamp()) 74 | device = 'cuda' 75 | 76 | # data generators 77 | loader_tr, n_samples_tr, n_batches_tr = __define_loader(is_training=True) 78 | loader_te, n_samples_te, n_batches_te = __define_loader(is_training=False) 79 | 80 | logger.info('--- start time') 81 | logger.info(datetime.datetime.now()) 82 | logger.info('... [tr]: n_samples, n_batch, batch_size: %d, %d, %d' % (n_samples_tr, n_batches_tr, config.cfg.TRAIN.BATCH_SIZE)) 83 | logger.info('... 
[te]: n_samples, n_batch, batch_size: %d, %d, %d' % (n_samples_te, n_batches_te, config.cfg.TEST.BATCH_SIZE)) 84 | 85 | # load model 86 | model, optimizer, loss_fn, metric_fn, metric_fn_name = __define_timeception_model(device) 87 | logger.info(pytorch_utils.summary(model, model._input_shape[1:], batch_size=2, device='cuda')) 88 | 89 | # save the model 90 | model_saver = pytorch_utils.ModelSaver(model, dataset_name, model_name) 91 | 92 | # loop on the epochs 93 | sys.stdout.write('\n') 94 | for idx_epoch in range(n_epochs): 95 | 96 | epoch_num = idx_epoch + 1 97 | 98 | loss_tr = 0.0 99 | acc_tr = 0.0 100 | loss_te = 0.0 101 | acc_te = 0.0 102 | 103 | tt1 = time.time() 104 | 105 | # flag model as training 106 | model.train() 107 | 108 | # training 109 | for idx_batch, (x, y_true) in enumerate(loader_tr): 110 | batch_num = idx_batch + 1 111 | 112 | x, y_true = x.to(device), y_true.to(device) 113 | optimizer.zero_grad() 114 | y_pred = model(x) 115 | loss = loss_fn(y_pred, y_true) 116 | loss.backward() 117 | optimizer.step() 118 | 119 | # calculate accuracy 120 | y_true = y_true.cpu().numpy().astype(np.int32) 121 | y_pred = y_pred.cpu().detach().numpy() 122 | loss_b_tr = loss.cpu().detach().numpy() 123 | acc_b_tr = metric_fn(y_true, y_pred) 124 | 125 | loss_tr += loss_b_tr 126 | acc_tr += acc_b_tr 127 | loss_b_tr = loss_tr / float(batch_num) 128 | acc_b_tr = acc_tr / float(batch_num) 129 | tt2 = time.time() 130 | duration = tt2 - tt1 131 | sys.stdout.write('\r%04ds - epoch: %02d/%02d, batch [tr]: %02d/%02d, loss, %s: %0.2f, %0.2f ' % (duration, epoch_num, n_epochs, batch_num, n_batches_tr, metric_fn_name, loss_b_tr, acc_b_tr)) 132 | 133 | # flag model as testing 134 | model.eval() 135 | 136 | # testing 137 | for idx_batch, (x, y_true) in enumerate(loader_te): 138 | batch_num = idx_batch + 1 139 | 140 | x, y_true = x.to(device), y_true.to(device) 141 | y_pred = model(x) 142 | loss_b_te = loss_fn(y_pred, y_true).cpu().detach().numpy() 143 | y_true = y_true.cpu().numpy().astype(np.int32) 144 | y_pred = y_pred.cpu().detach().numpy() 145 | acc_b_te = metric_fn(y_true, y_pred) 146 | 147 | loss_te += loss_b_te 148 | acc_te += acc_b_te 149 | loss_b_te = loss_te / float(batch_num) 150 | acc_b_te = acc_te / float(batch_num) 151 | tt2 = time.time() 152 | duration = tt2 - tt1 153 | sys.stdout.write('\r%04ds - epoch: %02d/%02d, batch [te]: %02d/%02d, loss, %s: %0.2f, %0.2f ' % (duration, epoch_num, n_epochs, batch_num, n_batches_te, metric_fn_name, loss_b_te, acc_b_te)) 154 | 155 | loss_tr /= float(n_batches_tr) 156 | loss_te /= float(n_batches_te) 157 | acc_tr /= float(n_batches_tr) 158 | acc_te /= float(n_batches_te) 159 | 160 | tt2 = time.time() 161 | duration = tt2 - tt1 162 | sys.stdout.write('\r%04ds - epoch: %02d/%02d, [tr]: %0.2f, %0.2f, [te]: %0.2f, %0.2f \n' % (duration, epoch_num, n_epochs, loss_tr, acc_te, loss_te, acc_te)) 163 | 164 | # after each epoch, save data 165 | model_saver.save(idx_epoch) 166 | 167 | logger.info('--- finish time') 168 | logger.info(datetime.datetime.now()) 169 | 170 | def train_ete(): 171 | """ 172 | Train Timeception layers based on the given configurations. 173 | This train scheme is End-to-end (ETE). 174 | """ 175 | 176 | raise Exception('Sorry, not implemented yet!') 177 | 178 | def __define_loader(is_training): 179 | """ 180 | Define data loader. 
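    Unlike the Keras generator, the feature shape here is channel-first, e.g.
    (1024, 32, 7, 7) for I3D 'mixed_5c' features with 32 timesteps, and the
    dataset is wrapped in a torch DataLoader (a sketch of the call made below):

        data_loader = DataLoader(dataset, batch_size=batch_size,
                                 num_workers=n_workers, shuffle=True)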
181 | """ 182 | 183 | # get some configs for the training 184 | n_classes = config.cfg.MODEL.N_CLASSES 185 | dataset_name = config.cfg.DATASET_NAME 186 | backbone_model_name = config.cfg.MODEL.BACKBONE_CNN 187 | backbone_feature_name = config.cfg.MODEL.BACKBONE_FEATURE 188 | n_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS 189 | n_workers = config.cfg.TRAIN.N_WORKERS 190 | 191 | batch_size_tr = config.cfg.TRAIN.BATCH_SIZE 192 | batch_size_te = config.cfg.TEST.BATCH_SIZE 193 | batch_size = batch_size_tr if is_training else batch_size_te 194 | 195 | # size and name of feature 196 | feature_name = 'features_%s_%s_%sf' % (backbone_model_name, backbone_feature_name, n_timesteps) 197 | c, h, w = utils.get_model_feat_maps_info(backbone_model_name, backbone_feature_name) 198 | feature_dim = (c, n_timesteps, h, w) 199 | 200 | # data generators 201 | params = {'batch_size': batch_size, 'n_classes': n_classes, 'feature_name': feature_name, 'feature_dim': feature_dim, 'is_training': is_training} 202 | dataset_class = data_utils.PYTORCH_DATASETS_DICT[dataset_name] 203 | dataset = dataset_class(**params) 204 | n_samples = dataset.n_samples 205 | n_batches = dataset.n_batches 206 | 207 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=n_workers, shuffle=True) 208 | 209 | return data_loader, n_samples, n_batches 210 | 211 | def __define_timeception_model(device): 212 | """ 213 | Define model, optimizer, loss function and metric function. 214 | """ 215 | # some configurations 216 | classification_type = config.cfg.MODEL.CLASSIFICATION_TYPE 217 | solver_name = config.cfg.SOLVER.NAME 218 | solver_lr = config.cfg.SOLVER.LR 219 | adam_epsilon = config.cfg.SOLVER.ADAM_EPSILON 220 | 221 | # define model 222 | model = Model().to(device) 223 | model_param = model.parameters() 224 | 225 | # define the optimizer 226 | optimizer = SGD(model_param, lr=0.01) if solver_name == 'sgd' else Adam(model_param, lr=solver_lr, eps=adam_epsilon) 227 | 228 | # loss and evaluation function for either multi-label "ml" or single-label "sl" classification 229 | if classification_type == 'ml': 230 | loss_fn = torch.nn.BCELoss() 231 | metric_fn = metrics.map_charades 232 | metric_fn_name = 'map' 233 | else: 234 | loss_fn = torch.nn.NLLLoss() 235 | metric_fn = metrics.accuracy 236 | metric_fn_name = 'acc' 237 | 238 | return model, optimizer, loss_fn, metric_fn, metric_fn_name 239 | 240 | class Model(Module): 241 | """ 242 | Define Timeception classifier. 
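    A rough sketch of the tensor shapes through forward(), assuming I3D
    'mixed_5c' input with 32 timesteps and 4 timeception layers (each layer
    halves T and expands C by roughly the 1.25 expansion factor; exact channel
    counts depend on the per-branch rounding):

        input:             (N, 1024, 32, 7, 7)   # (N, C, T, H, W)
        after timeception: (N, ~2480, 2, 7, 7)
        after max-pool:    (N, ~2480)
        output:            (N, n_classes)        # Sigmoid ('ml') or LogSoftmax ('sl')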
243 | """ 244 | 245 | def __init__(self): 246 | super(Model, self).__init__() 247 | 248 | # some configurations for the model 249 | n_tc_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS 250 | backbone_name = config.cfg.MODEL.BACKBONE_CNN 251 | feature_name = config.cfg.MODEL.BACKBONE_FEATURE 252 | n_tc_layers = config.cfg.MODEL.N_TC_LAYERS 253 | n_classes = config.cfg.MODEL.N_CLASSES 254 | is_dilated = config.cfg.MODEL.MULTISCALE_TYPE 255 | OutputActivation = Sigmoid if config.cfg.MODEL.CLASSIFICATION_TYPE == 'ml' else LogSoftmax 256 | n_channels_in, channel_h, channel_w = utils.get_model_feat_maps_info(backbone_name, feature_name) 257 | n_groups = int(n_channels_in / 128.0) 258 | 259 | input_shape = (None, n_channels_in, n_tc_timesteps, channel_h, channel_w) # (C, T, H, W) 260 | self._input_shape = input_shape 261 | 262 | # define 4 layers of timeception 263 | self.timeception = timeception_pytorch.Timeception(input_shape, n_tc_layers, n_groups, is_dilated) # (C, T, H, W) 264 | 265 | # get number of output channels after timeception 266 | n_channels_in = self.timeception.n_channels_out 267 | 268 | # define layers for classifier 269 | self.do1 = Dropout(0.5) 270 | self.l1 = Linear(n_channels_in, 512) 271 | self.bn1 = BatchNorm1d(512) 272 | self.ac1 = LeakyReLU(0.2) 273 | self.do2 = Dropout(0.25) 274 | self.l2 = Linear(512, n_classes) 275 | self.ac2 = OutputActivation() 276 | 277 | def forward(self, input): 278 | # feedforward the input to the timeception layers 279 | tensor = self.timeception(input) 280 | 281 | # max-pool over space-time 282 | bn, c, t, h, w = tensor.size() 283 | tensor = tensor.view(bn, c, t * h * w) 284 | tensor = torch.max(tensor, dim=2, keepdim=False) 285 | tensor = tensor[0] 286 | 287 | # dense layers for classification 288 | tensor = self.do1(tensor) 289 | tensor = self.l1(tensor) 290 | tensor = self.bn1(tensor) 291 | tensor = self.ac1(tensor) 292 | tensor = self.do2(tensor) 293 | tensor = self.l2(tensor) 294 | tensor = self.ac2(tensor) 295 | 296 | return tensor 297 | 298 | def __main(): 299 | """ 300 | Run this script to train Timeception. 
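    Example invocation from the project root (the config file must exist under
    ./configs/):

        python experiments/train_pytorch.py --config_file charades_i3d_tc4_f1024.yaml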
301 | """ 302 | 303 | default_config_file = 'charades_i3d_tc4_f1024.yaml' 304 | default_config_file = 'charades_i3d_tc2_f256.yaml' 305 | 306 | # Parse the arguments 307 | parser = OptionParser() 308 | parser.add_option('-c', '--config_file', dest='config_file', default=default_config_file, help='Yaml config file that contains all training details.') 309 | (options, args) = parser.parse_args() 310 | config_file = options.config_file 311 | 312 | # check if exist 313 | if config_file is None or config_file == '': 314 | msg = 'Config file not passed, default config is used: %s' % (config_file) 315 | logging.warning(msg) 316 | config_file = default_config_file 317 | 318 | # path of config file 319 | config_path = './configs/%s' % (config_file) 320 | 321 | # check if file exist 322 | if not os.path.exists(config_path): 323 | msg = 'Sorry, could not find config file with the following path: %s' % (config_path) 324 | logging.error(msg) 325 | else: 326 | # read the config from file and copy it to the project configuration "cfg" 327 | config_utils.cfg_from_file(config_path) 328 | 329 | # choose which training scheme, either 'ete' or 'tco' 330 | training_scheme = config.cfg.TRAIN.SCHEME 331 | 332 | # start training 333 | if training_scheme == 'tco': 334 | train_tco() 335 | else: 336 | train_ete() 337 | 338 | if __name__ == '__main__': 339 | __main() 340 | -------------------------------------------------------------------------------- /core/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helper functions for many things. Also, some needed classes. 
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | import time 34 | import h5py 35 | import yaml 36 | import numpy as np 37 | import pickle as pkl 38 | import pandas as pd 39 | from datetime import datetime 40 | import matplotlib.pyplot as plt 41 | from sklearn.preprocessing import label_binarize 42 | from sklearn import preprocessing, manifold 43 | import scipy.io as sio 44 | 45 | import os 46 | import json 47 | import natsort 48 | import random 49 | from multiprocessing.dummy import Pool 50 | 51 | from core import const 52 | 53 | logger = logging.getLogger(__name__) 54 | 55 | # region Load and Dump 56 | 57 | def pkl_load(path): 58 | with open(path, 'r') as f: 59 | data = pkl.load(f) 60 | return data 61 | 62 | def txt_load(path): 63 | with open(path, 'r') as f: 64 | lines = f.read().splitlines() 65 | lines = np.array(lines) 66 | return lines 67 | 68 | def byte_load(path): 69 | with open(path, 'rb') as f: 70 | data = f.read() 71 | return data 72 | 73 | def json_load(path): 74 | with open(path, 'r') as f: 75 | data = json.load(f) 76 | 77 | return data 78 | 79 | def yaml_load(file_path): 80 | with open(file_path, 'r') as f: 81 | data = yaml.load(f) 82 | data = AttrDict(data) 83 | 84 | data = convert_dict_to_attrdict(data) 85 | return data 86 | 87 | def h5_load(path, dataset_name='data'): 88 | h5_file = h5py.File(path, 'r') 89 | data = h5_file[dataset_name].value 90 | h5_file.close() 91 | return data 92 | 93 | def h5_load_multi(path, dataset_names): 94 | h5_file = h5py.File(path, 'r') 95 | data = [h5_file[name].value for name in dataset_names] 96 | h5_file.close() 97 | return data 98 | 99 | def txt_dump(data, path): 100 | l = len(data) - 1 101 | with open(path, 'w') as f: 102 | for i, k in enumerate(data): 103 | if i < l: 104 | k = ('%s\n' % k) 105 | else: 106 | k = ('%s' % k) 107 | f.writelines(k) 108 | 109 | def byte_dump(data, path): 110 | with open(path, 'wb') as f: 111 | f.write(data) 112 | 113 | def pkl_dump(data, path, is_highest=True): 114 | with open(path, 'w') as f: 115 | if not is_highest: 116 | pkl.dump(data, f) 117 | else: 118 | pkl.dump(data, f, pkl.HIGHEST_PROTOCOL) 119 | 120 | def json_dump(data, path): 121 | with open(path, 'w') as f: 122 | json.dump(data, f) 123 | 124 | def h5_dump(data, path, dataset_name='data'): 125 | h5_file = h5py.File(path, 'w') 126 | h5_file.create_dataset(dataset_name, data=data, dtype=data.dtype) 127 | h5_file.close() 128 | 129 | def h5_dump_multi(data, dataset_names, path): 130 | h5_file = h5py.File(path, 'w') 131 | n_items = len(data) 132 | for i in range(n_items): 133 | item_data = data[i] 134 | item_name = dataset_names[i] 135 | h5_file.create_dataset(item_name, data=item_data, dtype=item_data.dtype) 136 | h5_file.close() 137 | 138 | def csv_load(path, sep=',', header='infer'): 139 | df = pd.read_csv(path, sep=sep, header=header) 140 | data = df.values 141 | return data 142 | 143 | def mat_load(path, m_dict=None): 144 | """ 145 | Load mat files. 
146 | :param path: 147 | :return: 148 | """ 149 | if m_dict is None: 150 | data = sio.loadmat(path) 151 | else: 152 | data = sio.loadmat(path, m_dict) 153 | 154 | return data 155 | 156 | # endregion 157 | 158 | # region File/Folder Names/Pathes 159 | 160 | def file_names(path, is_nat_sort=False): 161 | if not os.path.exists(path): 162 | exp_msg = 'Sorry, folder path does not exist: %s' % (path) 163 | raise Exception(exp_msg) 164 | 165 | names = os.walk(path).next()[2] 166 | 167 | if is_nat_sort: 168 | names = natsort.natsorted(names) 169 | 170 | return names 171 | 172 | def file_pathes(path, is_nat_sort=False): 173 | if not os.path.exists(path): 174 | exp_msg = 'Sorry, folder path does not exist: %s' % (path) 175 | raise Exception(exp_msg) 176 | 177 | names = os.walk(path).next()[2] 178 | 179 | if is_nat_sort: 180 | names = natsort.natsorted(names) 181 | 182 | pathes = ['%s/%s' % (path, n) for n in names] 183 | return pathes 184 | 185 | def folder_names(path, is_nat_sort=False): 186 | if not os.path.exists(path): 187 | exp_msg = 'Sorry, folder path does not exist: %s' % (path) 188 | raise Exception(exp_msg) 189 | 190 | names = os.walk(path).next()[1] 191 | 192 | if is_nat_sort: 193 | names = natsort.natsorted(names) 194 | 195 | return names 196 | 197 | def folder_pathes(path, is_nat_sort=False): 198 | if not os.path.exists(path): 199 | exp_msg = 'Sorry, folder path does not exist: %s' % (path) 200 | raise Exception(exp_msg) 201 | 202 | names = os.walk(path).next()[1] 203 | 204 | if is_nat_sort: 205 | names = natsort.natsorted(names) 206 | 207 | pathes = ['%s/%s' % (path, n) for n in names] 208 | return pathes 209 | 210 | # endregion 211 | 212 | # region Normalization 213 | 214 | def normalize_mean_std(x): 215 | mean = np.mean(x, axis=0) 216 | std = np.std(x, axis=0) 217 | x -= mean 218 | x /= std 219 | return x 220 | 221 | def normalize_mean(x): 222 | mean = np.mean(x, axis=0) 223 | x /= mean 224 | return x 225 | 226 | def normalize_sum(x): 227 | sum = np.sum(x, axis=1) 228 | x = np.array([x_i / sum_i for x_i, sum_i in zip(x, sum)]) 229 | return x 230 | 231 | def normalize_l2(x): 232 | return preprocessing.normalize(x) 233 | 234 | def normalize_l1(x): 235 | return preprocessing.normalize(x, norm='l1') 236 | 237 | def normalize_range_0_to_1(x): 238 | x = np.add(x, -x.min()) 239 | x = np.divide(x, x.max()) 240 | return x 241 | 242 | # endregion 243 | 244 | # region Array Helpers 245 | 246 | def array_to_text(a, separator=', '): 247 | text = separator.join([str(s) for s in a]) 248 | return text 249 | 250 | def get_size_in_kb(size): 251 | size /= float(1024) 252 | return size 253 | 254 | def get_size_in_mb(size): 255 | size /= float(1024 * 1024) 256 | return size 257 | 258 | def get_size_in_gb(size): 259 | size /= float(1024 * 1024 * 1024) 260 | return size 261 | 262 | def get_array_memory_size(a): 263 | if type(a) is not np.ndarray: 264 | raise Exception('Sorry, input is not numpy array!') 265 | 266 | dtype = a.dtype 267 | if dtype == np.float16: 268 | n_bytes = 2 269 | elif dtype == np.float32: 270 | n_bytes = 4 271 | else: 272 | raise Exception('Sorry, unsupported dtype:', dtype) 273 | 274 | s = a.size 275 | size = s * n_bytes 276 | return size 277 | 278 | def get_expected_memory_size(array_shape, array_dtype): 279 | dtype = array_dtype 280 | if dtype == np.float16: 281 | n_bytes = 2 282 | elif dtype == np.float32: 283 | n_bytes = 4 284 | else: 285 | raise Exception('Sorry, unsupported dtype:', dtype) 286 | 287 | s = 1 288 | for dim_size in array_shape: 289 | s *= dim_size 290 | 291 | size 
= s * n_bytes 292 | return size 293 | 294 | def print_array(a): 295 | for item in a: 296 | print(item) 297 | 298 | def print_array_joined(a): 299 | s = ', '.join([str(i) for i in a]) 300 | print(s) 301 | 302 | # endregion 303 | 304 | # region Misc 305 | 306 | def learn_manifold(manifold_type, feats, n_components=2): 307 | if manifold_type == 'tsne': 308 | feats_fitted = manifold.TSNE(n_components=n_components, random_state=0).fit_transform(feats) 309 | elif manifold_type == 'isomap': 310 | feats_fitted = manifold.Isomap(n_components=n_components).fit_transform(feats) 311 | elif manifold_type == 'mds': 312 | feats_fitted = manifold.MDS(n_components=n_components).fit_transform(feats) 313 | elif manifold_type == 'spectral': 314 | feats_fitted = manifold.SpectralEmbedding(n_components=n_components).fit_transform(feats) 315 | else: 316 | raise Exception('wrong maniford type!') 317 | 318 | # methods = ['standard', 'ltsa', 'hessian', 'modified'] 319 | # feats_fitted = manifold.LocallyLinearEmbedding(n_components=n_components, method=methods[0]).fit_transform(pred) 320 | 321 | return feats_fitted 322 | 323 | def debinarize_label(labels): 324 | debinarized = np.array([np.where(l == 1)[0][0] for l in labels]) 325 | return debinarized 326 | 327 | def timestamp(): 328 | time_stamp = "{0:%y}.{0:%m}.{0:%d}-{0:%I}:{0:%M}:{0:%S}".format(datetime.now()) 329 | return time_stamp 330 | 331 | def remove_extension(name): 332 | name = name[:-4] 333 | return name 334 | 335 | def get_file_extension(name): 336 | name = name.split('.')[-1] 337 | return name 338 | 339 | def print_counter(num, total, freq=None): 340 | if freq is None: 341 | logger.info('... %d/%d' % (num, total)) 342 | elif num % freq == 0: 343 | logger.info('... %d/%d' % (num, total)) 344 | 345 | def calc_num_batches(n_samples, batch_size): 346 | n_batch = int(n_samples / float(batch_size)) 347 | n_batch = n_batch if n_samples % batch_size == 0 else n_batch + 1 348 | return n_batch 349 | 350 | def convert_dict_to_attrdict(d): 351 | for k, v in d.iteritems(): 352 | if isinstance(v, dict): 353 | v = convert_dict_to_attrdict(v) 354 | d[k] = v 355 | 356 | if isinstance(d, dict): 357 | d = AttrDict(d) 358 | 359 | return d 360 | 361 | def get_model_feat_maps_info(model_type, feature_type): 362 | """ 363 | Get feature map details according to model type and feature type. 
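    For example, get_model_feat_maps_info('i3d_rgb', 'mixed_5c') returns
    (1024, 7, 7), i.e. (n_channels, height, width) of the backbone feature map.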
364 | :param model_type: 365 | :param feature_type: 366 | :return: 367 | """ 368 | 369 | if model_type in ['vgg', 'vgg_charades_rgb']: 370 | if feature_type == 'pool5': 371 | return 512, 7, 7 372 | elif feature_type == 'conv5_3': 373 | return 512, 14, 14 374 | else: 375 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 376 | elif model_type in ['resnet152', 'resnet152_charades_rgb']: 377 | if feature_type == 'res4b35': 378 | return 1024, 14, 14 379 | elif feature_type == 'res5c': 380 | return 2048, 7, 7 381 | elif feature_type == 'pool5': 382 | return 2048, 1, 1 383 | else: 384 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 385 | elif model_type in ['i3d_rgb', 'i3d_pytorch_charades_rgb', 'i3d_kinetics_keras', 'i3d_keras_kinetics_rgb']: 386 | if feature_type == 'mixed_5c': 387 | return 1024, 7, 7 388 | elif feature_type == 'mixed_4f': 389 | return 832, 7, 7 390 | else: 391 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 392 | elif model_type in ['i3d_resnet_50_kinetics_rgb', 'i3d_resnet_101_kinetics_rgb']: 393 | if feature_type == 'pool5': 394 | return 2048, 7, 7 395 | else: 396 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 397 | elif model_type in ['i3d_resnet101_charades_rgb']: 398 | if feature_type == 'res5_2': 399 | return 2048, 7, 7 400 | else: 401 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 402 | else: 403 | raise Exception('Sorry, unsupported model type: %s' % (model_type)) 404 | 405 | # endregion 406 | 407 | # region Classes 408 | 409 | class Path(str): 410 | def __new__(self, relative_path, args=None, root_type=const.ROOT_PATH_TYPES[0]): 411 | assert root_type in const.ROOT_PATH_TYPES 412 | root_types = list(const.ROOT_PATH_TYPES) 413 | idx_root_type = root_types.index(root_type) 414 | 415 | root_paths = [const.DATA_ROOT_PATH, const.PROJECT_ROOT_PATH] 416 | root_path = root_paths[idx_root_type] 417 | 418 | relative_path = relative_path % args if args is not None else relative_path 419 | path = os.path.join(root_path, relative_path) 420 | 421 | self.__path = path 422 | return self.__path 423 | 424 | def __str__(self): 425 | return self.__path 426 | 427 | def __repr__(self): 428 | return self.__path 429 | 430 | class DurationTimer(object): 431 | def __init__(self): 432 | self.start_time = time.time() 433 | 434 | def duration(self, is_string=True): 435 | stop_time = time.time() 436 | durtation = stop_time - self.start_time 437 | if is_string: 438 | durtation = self.format_duration(durtation) 439 | return durtation 440 | 441 | def format_duration(self, duration): 442 | if duration < 60: 443 | return str(duration) + " sec" 444 | elif duration < (60 * 60): 445 | return str(duration / 60) + " min" 446 | else: 447 | return str(duration / (60 * 60)) + " hr" 448 | 449 | class AttrDict(dict): 450 | """ 451 | Subclass dict and define getter-setter. This behaves as both dict and obj. 
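    For example:

        d = AttrDict({'LR': 0.01})
        assert d.LR == d['LR']
        d.LR = 0.001    # equivalent to d['LR'] = 0.001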
452 | """ 453 | 454 | def __getattr__(self, key): 455 | return self[key] 456 | 457 | def __setattr__(self, key, value): 458 | if key in self.__dict__: 459 | self.__dict__[key] = value 460 | else: 461 | self[key] = value 462 | 463 | # endregion 464 | -------------------------------------------------------------------------------- /nets/timeception_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Definition of Timeception as a pytorch model. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | 34 | import torch 35 | import torch.nn 36 | import torchvision 37 | import torchviz 38 | import torchsummary 39 | 40 | from torch.nn import Module, Conv3d, BatchNorm3d, MaxPool3d, ReLU 41 | from torch.nn import functional as F 42 | 43 | from nets.layers_pytorch import ChannelShuffleLayer, DepthwiseConv1DLayer 44 | 45 | # region Timeception as Module 46 | 47 | class Timeception(Module): 48 | """ 49 | Timeception is defined as a pytorch model. 50 | """ 51 | 52 | def __init__(self, input_shape, n_layers=4, n_groups=8, is_dilated=True): 53 | 54 | super(Timeception, self).__init__() 55 | 56 | # TODO: Add support for multi-scale using dilation rates 57 | # currently, for pytorch, we only support multi-scale using kernel sizes 58 | is_dilated = False 59 | 60 | expansion_factor = 1.25 61 | self.expansion_factor = expansion_factor 62 | self.n_layers = n_layers 63 | self.is_dilated = is_dilated 64 | self.n_groups = n_groups 65 | self.n_channels_out = None 66 | 67 | # convert it to a list 68 | input_shape = list(input_shape) 69 | 70 | # define timeception layers 71 | n_channels_out = self.__define_timeception_layers(input_shape, n_layers, n_groups, expansion_factor, is_dilated) 72 | 73 | # set the output channels 74 | self.n_channels_out = n_channels_out 75 | 76 | def forward(self, input): 77 | 78 | n_layers = self.n_layers 79 | n_groups = self.n_groups 80 | expansion_factor = self.expansion_factor 81 | 82 | output = self.__call_timeception_layers(input, n_layers, n_groups, expansion_factor) 83 | 84 | return output 85 | 86 | def __define_timeception_layers(self, input_shape, n_layers, n_groups, expansion_factor, is_dilated): 87 | """ 88 | Define layers inside the timeception layers.
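    As a worked example (illustrative): with n_groups=8, expansion_factor=1.25 and
    1024 input channels, each branch gets int(1024 * 1.25 / (5 * 8)) = 32 channels,
    so one layer outputs 32 * 5 * 8 = 1280 channels, while the temporal max-pooling
    halves the number of timesteps. Stacking 4 layers on an input of shape
    (batch, 1024, 128, 7, 7) gives (batch, 2480, 8, 7, 7): channels grow
    1024 -> 1280 -> 1600 -> 2000 -> 2480 and timesteps shrink 128 -> 64 -> 32 -> 16 -> 8.
    A minimal usage sketch (assuming the layers imported from nets.layers_pytorch
    preserve the temporal dimension as described):
        >>> x = torch.randn(4, 1024, 128, 7, 7)    # (batch, channels, time, height, width)
        >>> tc = Timeception(x.size(), n_layers=4)
        >>> tuple(tc(x).size())
        (4, 2480, 8, 7, 7)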
89 | """ 90 | 91 | n_channels_in = input_shape[1] 92 | 93 | # how many layers of timeception 94 | for i in range(n_layers): 95 | layer_num = i + 1 96 | 97 | # get details about grouping 98 | n_channels_per_branch, n_channels_out = self.__get_n_channels_per_branch(n_groups, expansion_factor, n_channels_in) 99 | 100 | # temporal conv per group 101 | self.__define_grouped_convolutions(input_shape, n_groups, n_channels_per_branch, is_dilated, layer_num) 102 | 103 | # downsample over time 104 | layer_name = 'maxpool_tc%d' % (layer_num) 105 | layer = MaxPool3d(kernel_size=(2, 1, 1)) 106 | layer._name = layer_name 107 | setattr(self, layer_name, layer) 108 | 109 | n_channels_in = n_channels_out 110 | input_shape[1] = n_channels_in 111 | 112 | return n_channels_in 113 | 114 | def __define_grouped_convolutions(self, input_shape, n_groups, n_channels_per_branch, is_dilated, layer_num): 115 | """ 116 | Define layers inside grouped convolutional block. 117 | """ 118 | 119 | n_channels_in = input_shape[1] 120 | 121 | n_branches = 5 122 | n_channels_per_group_in = int(n_channels_in / n_groups) 123 | n_channels_out = int(n_groups * n_branches * n_channels_per_branch) 124 | n_channels_per_group_out = int(n_channels_out / n_groups) 125 | 126 | assert n_channels_in % n_groups == 0 127 | assert n_channels_out % n_groups == 0 128 | 129 | # type of multi-scale kernels to use: either multi_kernel_sizes or multi_dilation_rates 130 | if is_dilated: 131 | kernel_sizes = (3, 3, 3) 132 | dilation_rates = (1, 2, 3) 133 | else: 134 | kernel_sizes = (3, 5, 7) 135 | dilation_rates = (1, 1, 1) 136 | 137 | input_shape_per_group = list(input_shape) 138 | input_shape_per_group[1] = n_channels_per_group_in 139 | 140 | # loop on groups, and define convolutions in each group 141 | for idx_group in range(n_groups): 142 | group_num = idx_group + 1 143 | self.__define_temporal_convolutional_block(input_shape_per_group, n_channels_per_branch, kernel_sizes, dilation_rates, layer_num, group_num) 144 | 145 | # activation 146 | layer_name = 'relu_tc%d' % (layer_num) 147 | layer = ReLU() 148 | layer._name = layer_name 149 | setattr(self, layer_name, layer) 150 | 151 | # shuffle channels 152 | layer_name = 'shuffle_tc%d' % (layer_num) 153 | layer = ChannelShuffleLayer(n_channels_out, n_groups) 154 | layer._name = layer_name 155 | setattr(self, layer_name, layer) 156 | 157 | def __define_temporal_convolutional_block(self, input_shape, n_channels_per_branch_out, kernel_sizes, dilation_rates, layer_num, group_num): 158 | """ 159 | Define 5 branches of convolutions that operate of channels of each group. 
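    For reference, the five branches defined below are:
        1. 1x1x1 conv for channel reduction only (no temporal conv)
        2. 1x1x1 conv followed by a depthwise temporal conv with kernel size 3
        3. 1x1x1 conv followed by a depthwise temporal conv with kernel size 5
        4. 1x1x1 conv followed by a depthwise temporal conv with kernel size 7
        5. 1x1x1 conv followed by temporal max-pooling (with replication padding)
    Each branch outputs n_channels_per_branch_out channels; the branch outputs are
    concatenated along the channel dimension in the forward pass.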
160 | """ 161 | 162 | n_channels_in = input_shape[1] 163 | 164 | dw_input_shape = list(input_shape) 165 | dw_input_shape[1] = n_channels_per_branch_out 166 | 167 | # branch 1: dimension reduction only and no temporal conv 168 | layer_name = 'conv_b1_g%d_tc%d' % (group_num, layer_num) 169 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 170 | layer._name = layer_name 171 | setattr(self, layer_name, layer) 172 | layer_name = 'bn_b1_g%d_tc%d' % (group_num, layer_num) 173 | layer = BatchNorm3d(n_channels_per_branch_out) 174 | layer._name = layer_name 175 | setattr(self, layer_name, layer) 176 | 177 | # branch 2: dimension reduction followed by depth-wise temp conv (kernel-size 3) 178 | layer_name = 'conv_b2_g%d_tc%d' % (group_num, layer_num) 179 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 180 | layer._name = layer_name 181 | setattr(self, layer_name, layer) 182 | layer_name = 'convdw_b2_g%d_tc%d' % (group_num, layer_num) 183 | layer = DepthwiseConv1DLayer(dw_input_shape, kernel_sizes[0], dilation_rates[0], layer_name) 184 | setattr(self, layer_name, layer) 185 | layer_name = 'bn_b2_g%d_tc%d' % (group_num, layer_num) 186 | layer = BatchNorm3d(n_channels_per_branch_out) 187 | layer._name = layer_name 188 | setattr(self, layer_name, layer) 189 | 190 | # branch 3: dimension reduction followed by depth-wise temp conv (kernel-size 5) 191 | layer_name = 'conv_b3_g%d_tc%d' % (group_num, layer_num) 192 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 193 | layer._name = layer_name 194 | setattr(self, layer_name, layer) 195 | layer_name = 'convdw_b3_g%d_tc%d' % (group_num, layer_num) 196 | layer = DepthwiseConv1DLayer(dw_input_shape, kernel_sizes[1], dilation_rates[1], layer_name) 197 | setattr(self, layer_name, layer) 198 | layer_name = 'bn_b3_g%d_tc%d' % (group_num, layer_num) 199 | layer = BatchNorm3d(n_channels_per_branch_out) 200 | layer._name = layer_name 201 | setattr(self, layer_name, layer) 202 | 203 | # branch 4: dimension reduction followed by depth-wise temp conv (kernel-size 7) 204 | layer_name = 'conv_b4_g%d_tc%d' % (group_num, layer_num) 205 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 206 | layer._name = layer_name 207 | setattr(self, layer_name, layer) 208 | layer_name = 'convdw_b4_g%d_tc%d' % (group_num, layer_num) 209 | layer = DepthwiseConv1DLayer(dw_input_shape, kernel_sizes[2], dilation_rates[2], layer_name) 210 | setattr(self, layer_name, layer) 211 | layer_name = 'bn_b4_g%d_tc%d' % (group_num, layer_num) 212 | layer = BatchNorm3d(n_channels_per_branch_out) 213 | layer._name = layer_name 214 | setattr(self, layer_name, layer) 215 | 216 | # branch 5: dimension reduction followed by temporal max pooling 217 | layer_name = 'conv_b5_g%d_tc%d' % (group_num, layer_num) 218 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 219 | layer._name = layer_name 220 | setattr(self, layer_name, layer) 221 | layer_name = 'maxpool_b5_g%d_tc%d' % (group_num, layer_num) 222 | layer = MaxPool3d(kernel_size=(2, 1, 1), stride=(1, 1, 1)) 223 | layer._name = layer_name 224 | setattr(self, layer_name, layer) 225 | layer_name = 'padding_b5_g%d_tc%d' % (group_num, layer_num) 226 | layer = torch.nn.ReplicationPad3d((0, 0, 0, 0, 1, 0)) # left, right, top, bottom, front, back 227 | layer._name = layer_name 228 | setattr(self, layer_name, layer) 229 | layer_name = 'bn_b5_g%d_tc%d' % (group_num, layer_num) 230 | layer = 
BatchNorm3d(n_channels_per_branch_out) 231 | layer._name = layer_name 232 | setattr(self, layer_name, layer) 233 | 234 | def __call_timeception_layers(self, tensor, n_layers, n_groups, expansion_factor): 235 | input_shape = tensor.size() 236 | n_channels_in = input_shape[1] 237 | 238 | # how many layers of timeception 239 | for i in range(n_layers): 240 | layer_num = i + 1 241 | 242 | # get details about grouping 243 | n_channels_per_branch, n_channels_out = self.__get_n_channels_per_branch(n_groups, expansion_factor, n_channels_in) 244 | 245 | # temporal conv per group 246 | tensor = self.__call_grouped_convolutions(tensor, n_groups, layer_num) 247 | 248 | # downsample over time 249 | tensor = getattr(self, 'maxpool_tc%d' % (layer_num))(tensor) 250 | n_channels_in = n_channels_out 251 | 252 | return tensor 253 | 254 | def __call_grouped_convolutions(self, tensor_input, n_groups, layer_num): 255 | 256 | n_channels_in = tensor_input.size()[1] 257 | n_channels_per_group_in = int(n_channels_in / n_groups) 258 | 259 | # loop on groups 260 | t_outputs = [] 261 | for idx_group in range(n_groups): 262 | group_num = idx_group + 1 263 | 264 | # slice maps to get maps per group 265 | idx_start = idx_group * n_channels_per_group_in 266 | idx_end = (idx_group + 1) * n_channels_per_group_in 267 | tensor = tensor_input[:, idx_start:idx_end] 268 | 269 | tensor = self.__call_temporal_convolutional_block(tensor, layer_num, group_num) 270 | t_outputs.append(tensor) 271 | 272 | # concatenate channels of groups 273 | tensor = torch.cat(t_outputs, dim=1) 274 | # activation 275 | tensor = getattr(self, 'relu_tc%d' % (layer_num))(tensor) 276 | # shuffle channels 277 | tensor = getattr(self, 'shuffle_tc%d' % (layer_num))(tensor) 278 | 279 | return tensor 280 | 281 | def __call_temporal_convolutional_block(self, tensor, layer_num, group_num): 282 | """ 283 | Feedforward for 5 branches of convolutions that operate of channels of each group. 
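    Illustrative shape flow for one group (assuming the depthwise temporal convs keep
    the number of timesteps): an input slice of shape (batch, C_group_in, T, H, W) goes
    through the five branches, each producing (batch, n_channels_per_branch, T, H, W);
    concatenating along dim=1 yields (batch, 5 * n_channels_per_branch, T, H, W).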
284 | """ 285 | 286 | # branch 1: dimension reduction only and no temporal conv 287 | t_1 = getattr(self, 'conv_b1_g%d_tc%d' % (group_num, layer_num))(tensor) 288 | t_1 = getattr(self, 'bn_b1_g%d_tc%d' % (group_num, layer_num))(t_1) 289 | 290 | # branch 2: dimension reduction followed by depth-wise temp conv (kernel-size 3) 291 | t_2 = getattr(self, 'conv_b2_g%d_tc%d' % (group_num, layer_num))(tensor) 292 | t_2 = getattr(self, 'convdw_b2_g%d_tc%d' % (group_num, layer_num))(t_2) 293 | t_2 = getattr(self, 'bn_b2_g%d_tc%d' % (group_num, layer_num))(t_2) 294 | 295 | # branch 3: dimension reduction followed by depth-wise temp conv (kernel-size 5) 296 | t_3 = getattr(self, 'conv_b3_g%d_tc%d' % (group_num, layer_num))(tensor) 297 | t_3 = getattr(self, 'convdw_b3_g%d_tc%d' % (group_num, layer_num))(t_3) 298 | t_3 = getattr(self, 'bn_b3_g%d_tc%d' % (group_num, layer_num))(t_3) 299 | 300 | # branch 4: dimension reduction followed by depth-wise temp conv (kernel-size 7) 301 | t_4 = getattr(self, 'conv_b4_g%d_tc%d' % (group_num, layer_num))(tensor) 302 | t_4 = getattr(self, 'convdw_b4_g%d_tc%d' % (group_num, layer_num))(t_4) 303 | t_4 = getattr(self, 'bn_b4_g%d_tc%d' % (group_num, layer_num))(t_4) 304 | 305 | # branch 5: dimension reduction followed by temporal max pooling 306 | t_5 = getattr(self, 'conv_b5_g%d_tc%d' % (group_num, layer_num))(tensor) 307 | t_5 = getattr(self, 'maxpool_b5_g%d_tc%d' % (group_num, layer_num))(t_5) 308 | t_5 = getattr(self, 'padding_b5_g%d_tc%d' % (group_num, layer_num))(t_5) 309 | t_5 = getattr(self, 'bn_b5_g%d_tc%d' % (group_num, layer_num))(t_5) 310 | 311 | # concatenate channels of branches 312 | tensors = (t_1, t_2, t_3, t_4, t_5) 313 | tensor = torch.cat(tensors, dim=1) 314 | 315 | return tensor 316 | 317 | def __get_n_channels_per_branch(self, n_groups, expansion_factor, n_channels_in): 318 | n_branches = 5 319 | n_channels_per_branch = int(n_channels_in * expansion_factor / float(n_branches * n_groups)) 320 | n_channels_per_branch = int(n_channels_per_branch) 321 | n_channels_out = int(n_channels_per_branch * n_groups * n_branches) 322 | n_channels_out = int(n_channels_out) 323 | 324 | return n_channels_per_branch, n_channels_out 325 | 326 | # endregion 327 | -------------------------------------------------------------------------------- /nets/i3d_torch_charades.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | import numpy as np 7 | 8 | import os 9 | import sys 10 | from collections import OrderedDict 11 | 12 | class MaxPool3dSamePadding(nn.MaxPool3d): 13 | 14 | def compute_pad(self, dim, s): 15 | if s % self.stride[dim] == 0: 16 | return max(self.kernel_size[dim] - self.stride[dim], 0) 17 | else: 18 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 19 | 20 | def forward(self, x): 21 | # compute 'same' padding 22 | (batch, channel, t, h, w) = x.size() 23 | # print t,h,w 24 | out_t = np.ceil(float(t) / float(self.stride[0])) 25 | out_h = np.ceil(float(h) / float(self.stride[1])) 26 | out_w = np.ceil(float(w) / float(self.stride[2])) 27 | # print out_t, out_h, out_w 28 | pad_t = self.compute_pad(0, t) 29 | pad_h = self.compute_pad(1, h) 30 | pad_w = self.compute_pad(2, w) 31 | # print pad_t, pad_h, pad_w 32 | 33 | pad_t_f = pad_t // 2 34 | pad_t_b = pad_t - pad_t_f 35 | pad_h_f = pad_h // 2 36 | pad_h_b = pad_h - pad_h_f 37 | pad_w_f = pad_w // 2 38 | pad_w_b = pad_w - pad_w_f 39 | 40 | pad 
= (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 41 | # print x.size() 42 | # print pad 43 | x = F.pad(x, pad) 44 | return super(MaxPool3dSamePadding, self).forward(x) 45 | 46 | class Unit3D(nn.Module): 47 | 48 | def __init__(self, in_channels, 49 | output_channels, 50 | kernel_shape=(1, 1, 1), 51 | stride=(1, 1, 1), 52 | padding=0, 53 | activation_fn=F.relu, 54 | use_batch_norm=True, 55 | use_bias=False, 56 | name='unit_3d'): 57 | 58 | """Initializes Unit3D module.""" 59 | super(Unit3D, self).__init__() 60 | 61 | self._output_channels = output_channels 62 | self._kernel_shape = kernel_shape 63 | self._stride = stride 64 | self._use_batch_norm = use_batch_norm 65 | self._activation_fn = activation_fn 66 | self._use_bias = use_bias 67 | self.name = name 68 | self.padding = padding 69 | 70 | self.conv3d = nn.Conv3d(in_channels=in_channels, 71 | out_channels=self._output_channels, 72 | kernel_size=self._kernel_shape, 73 | stride=self._stride, 74 | padding=0, # we always want padding to be 0 here. We will dynamically pad based on input size in forward function 75 | bias=self._use_bias) 76 | 77 | if self._use_batch_norm: 78 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 79 | 80 | def compute_pad(self, dim, s): 81 | if s % self._stride[dim] == 0: 82 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 83 | else: 84 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 85 | 86 | def forward(self, x): 87 | # compute 'same' padding 88 | (batch, channel, t, h, w) = x.size() 89 | # print t,h,w 90 | out_t = np.ceil(float(t) / float(self._stride[0])) 91 | out_h = np.ceil(float(h) / float(self._stride[1])) 92 | out_w = np.ceil(float(w) / float(self._stride[2])) 93 | # print out_t, out_h, out_w 94 | pad_t = self.compute_pad(0, t) 95 | pad_h = self.compute_pad(1, h) 96 | pad_w = self.compute_pad(2, w) 97 | # print pad_t, pad_h, pad_w 98 | 99 | pad_t_f = pad_t // 2 100 | pad_t_b = pad_t - pad_t_f 101 | pad_h_f = pad_h // 2 102 | pad_h_b = pad_h - pad_h_f 103 | pad_w_f = pad_w // 2 104 | pad_w_b = pad_w - pad_w_f 105 | 106 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 107 | # print x.size() 108 | # print pad 109 | x = F.pad(x, pad) 110 | # print x.size() 111 | 112 | x = self.conv3d(x) 113 | if self._use_batch_norm: 114 | x = self.bn(x) 115 | if self._activation_fn is not None: 116 | x = self._activation_fn(x) 117 | return x 118 | 119 | class InceptionModule(nn.Module): 120 | def __init__(self, in_channels, out_channels, name): 121 | super(InceptionModule, self).__init__() 122 | 123 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, 124 | name=name + '/Branch_0/Conv3d_0a_1x1') 125 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, 126 | name=name + '/Branch_1/Conv3d_0a_1x1') 127 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], 128 | name=name + '/Branch_1/Conv3d_0b_3x3') 129 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, 130 | name=name + '/Branch_2/Conv3d_0a_1x1') 131 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], 132 | name=name + '/Branch_2/Conv3d_0b_3x3') 133 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], 134 | stride=(1, 1, 1), padding=0) 135 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], 
kernel_shape=[1, 1, 1], padding=0, 136 | name=name + '/Branch_3/Conv3d_0b_1x1') 137 | self.name = name 138 | 139 | def forward(self, x): 140 | b0 = self.b0(x) 141 | b1 = self.b1b(self.b1a(x)) 142 | b2 = self.b2b(self.b2a(x)) 143 | b3 = self.b3b(self.b3a(x)) 144 | return torch.cat([b0, b1, b2, b3], dim=1) 145 | 146 | class InceptionI3d(nn.Module): 147 | """Inception-v1 I3D architecture. 148 | The model is introduced in: 149 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 150 | Joao Carreira, Andrew Zisserman 151 | https://arxiv.org/pdf/1705.07750v1.pdf. 152 | See also the Inception architecture, introduced in: 153 | Going deeper with convolutions 154 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 155 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 156 | http://arxiv.org/pdf/1409.4842v1.pdf. 157 | """ 158 | 159 | # Endpoints of the model in order. During construction, all the endpoints up 160 | # to a designated `final_endpoint` are returned in a dictionary as the 161 | # second return value. 162 | VALID_ENDPOINTS = ( 163 | 'Conv3d_1a_7x7', 164 | 'MaxPool3d_2a_3x3', 165 | 'Conv3d_2b_1x1', 166 | 'Conv3d_2c_3x3', 167 | 'MaxPool3d_3a_3x3', 168 | 'Mixed_3b', 169 | 'Mixed_3c', 170 | 'MaxPool3d_4a_3x3', 171 | 'Mixed_4b', 172 | 'Mixed_4c', 173 | 'Mixed_4d', 174 | 'Mixed_4e', 175 | 'Mixed_4f', 176 | 'MaxPool3d_5a_2x2', 177 | 'Mixed_5b', 178 | 'Mixed_5c', 179 | 'Logits', 180 | 'Predictions', 181 | ) 182 | 183 | def __init__(self, num_classes=400, spatial_squeeze=True, final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5): 184 | """Initializes I3D model instance. 185 | Args: 186 | num_classes: The number of outputs in the logit layer (default 400, which 187 | matches the Kinetics dataset). 188 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits 189 | before returning (default True). 190 | final_endpoint: The model contains many possible endpoints. 191 | `final_endpoint` specifies the last endpoint for the model to be built 192 | up to. In addition to the output at `final_endpoint`, all the outputs 193 | at endpoints up to `final_endpoint` will also be returned, in a 194 | dictionary. `final_endpoint` must be one of 195 | InceptionI3d.VALID_ENDPOINTS (default 'Logits'). 196 | name: A string (optional). The name of this module. 197 | Raises: 198 | ValueError: if `final_endpoint` is not recognized. 
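        A minimal illustrative sketch: to run the backbone up to the 1024-channel
        'Mixed_5c' endpoint (the I3D feature map listed in
        core.utils.get_model_feat_maps_info), one could write
            i3d = InceptionI3d(final_endpoint='Mixed_5c', in_channels=3)
            feats = i3d(video)   # video: (batch, 3, n_frames, 224, 224)
        Alternatively, keep the default 'Logits' endpoint and call
        replace_logits(num_classes) to re-purpose the classifier for a new dataset.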
199 | """ 200 | 201 | if final_endpoint not in self.VALID_ENDPOINTS: 202 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 203 | 204 | super(InceptionI3d, self).__init__() 205 | self._num_classes = num_classes 206 | self._spatial_squeeze = spatial_squeeze 207 | self._final_endpoint = final_endpoint 208 | self.logits = None 209 | 210 | if self._final_endpoint not in self.VALID_ENDPOINTS: 211 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint) 212 | 213 | self.end_points = {} 214 | end_point = 'Conv3d_1a_7x7' 215 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], stride=(2, 2, 2), padding=(3, 3, 3), name=name + end_point) 216 | if self._final_endpoint == end_point: return 217 | 218 | end_point = 'MaxPool3d_2a_3x3' 219 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 220 | padding=0) 221 | if self._final_endpoint == end_point: return 222 | 223 | end_point = 'Conv3d_2b_1x1' 224 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, name=name + end_point) 225 | if self._final_endpoint == end_point: return 226 | 227 | end_point = 'Conv3d_2c_3x3' 228 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, name=name + end_point) 229 | if self._final_endpoint == end_point: return 230 | 231 | end_point = 'MaxPool3d_3a_3x3' 232 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0) 233 | if self._final_endpoint == end_point: return 234 | 235 | end_point = 'Mixed_3b' 236 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point) 237 | if self._final_endpoint == end_point: return 238 | 239 | end_point = 'Mixed_3c' 240 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point) 241 | if self._final_endpoint == end_point: return 242 | 243 | end_point = 'MaxPool3d_4a_3x3' 244 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), 245 | padding=0) 246 | if self._final_endpoint == end_point: return 247 | 248 | end_point = 'Mixed_4b' 249 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) 250 | if self._final_endpoint == end_point: return 251 | 252 | end_point = 'Mixed_4c' 253 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) 254 | if self._final_endpoint == end_point: return 255 | 256 | end_point = 'Mixed_4d' 257 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) 258 | if self._final_endpoint == end_point: return 259 | 260 | end_point = 'Mixed_4e' 261 | self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) 262 | if self._final_endpoint == end_point: return 263 | 264 | end_point = 'Mixed_4f' 265 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point) 266 | if self._final_endpoint == end_point: return 267 | 268 | end_point = 'MaxPool3d_5a_2x2' 269 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0) 270 | if self._final_endpoint == end_point: return 271 | 272 | end_point = 'Mixed_5b' 273 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, 
[256, 160, 320, 32, 128, 128], name + end_point) 274 | if self._final_endpoint == end_point: return 275 | 276 | end_point = 'Mixed_5c' 277 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point) 278 | if self._final_endpoint == end_point: return 279 | 280 | end_point = 'Logits' 281 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1)) 282 | self.dropout = nn.Dropout(dropout_keep_prob) 283 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits') 284 | 285 | self.build() 286 | 287 | def replace_logits(self, num_classes): 288 | self._num_classes = num_classes 289 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits') 290 | 291 | def build(self): 292 | for k in self.end_points.keys(): 293 | self.add_module(k, self.end_points[k]) 294 | 295 | # def forward(self, x): 296 | # for end_point in self.VALID_ENDPOINTS: 297 | # if end_point in self.end_points: 298 | # x = self._modules[end_point](x) # use _modules to work with dataparallel 299 | # 300 | # x = self.logits(self.dropout(self.avg_pool(x))) 301 | # if self._spatial_squeeze: 302 | # logits = x.squeeze(3).squeeze(3) 303 | # # logits is batch X time X classes, which is what we want to work with 304 | # return logits 305 | 306 | def forward(self, x): 307 | for end_point in self.VALID_ENDPOINTS: 308 | if end_point in self.end_points: 309 | x = self.end_points[end_point](x) # use _modules to work with dataparallel 310 | return x 311 | 312 | # for end_point in self.VALID_ENDPOINTS: 313 | # if end_point in self.end_points: 314 | # x = self._modules[end_point](x) # use _modules to work with dataparallel 315 | # return x 316 | 317 | def extract_features(self, x): 318 | for end_point in self.VALID_ENDPOINTS: 319 | if end_point in self.end_points: 320 | x = self._modules[end_point](x) 321 | return self.avg_pool(x) 322 | -------------------------------------------------------------------------------- /nets/i3d_torch_charades_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | import numpy as np 7 | 8 | import os 9 | import sys 10 | from collections import OrderedDict 11 | 12 | class MaxPool3dSamePadding(nn.MaxPool3d): 13 | 14 | def compute_pad(self, dim, s): 15 | if s % self.stride[dim] == 0: 16 | return max(self.kernel_size[dim] - self.stride[dim], 0) 17 | else: 18 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 19 | 20 | def forward(self, x): 21 | # compute 'same' padding 22 | (batch, channel, t, h, w) = x.size() 23 | # print t,h,w 24 | out_t = np.ceil(float(t) / float(self.stride[0])) 25 | out_h = np.ceil(float(h) / float(self.stride[1])) 26 | out_w = np.ceil(float(w) / float(self.stride[2])) 27 | # print out_t, out_h, out_w 28 | pad_t = self.compute_pad(0, t) 29 | pad_h = self.compute_pad(1, h) 30 | pad_w = self.compute_pad(2, w) 31 | # print pad_t, pad_h, pad_w 32 | 33 | pad_t_f = pad_t // 2 34 | pad_t_b = pad_t - pad_t_f 35 | pad_h_f = pad_h // 2 36 | pad_h_b = pad_h - pad_h_f 37 | pad_w_f = pad_w // 2 38 | pad_w_b = pad_w - pad_w_f 39 | 40 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 41 | # 
print x.size() 42 | # print pad 43 | x = F.pad(x, pad) 44 | return super(MaxPool3dSamePadding, self).forward(x) 45 | 46 | class Unit3D(nn.Module): 47 | 48 | def __init__(self, in_channels, output_channels, kernel_shape=(1, 1, 1), stride=(1, 1, 1), padding=0, activation_fn=F.relu, use_batch_norm=True, use_bias=False, name='unit_3d'): 49 | 50 | """Initializes Unit3D module.""" 51 | super(Unit3D, self).__init__() 52 | 53 | self._output_channels = output_channels 54 | self._kernel_shape = kernel_shape 55 | self._stride = stride 56 | self._use_batch_norm = use_batch_norm 57 | self._activation_fn = activation_fn 58 | self._use_bias = use_bias 59 | self.name = name 60 | self.padding = padding 61 | 62 | # we always want padding to be 0 here. We will dynamically pad based on input size in forward function 63 | self.conv3d = nn.Conv3d(in_channels=in_channels, out_channels=self._output_channels, kernel_size=self._kernel_shape, stride=self._stride, padding=0, bias=self._use_bias) 64 | 65 | if self._use_batch_norm: 66 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 67 | 68 | def compute_pad(self, dim, s): 69 | if s % self._stride[dim] == 0: 70 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 71 | else: 72 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 73 | 74 | def forward(self, x): 75 | # compute 'same' padding 76 | (batch, channel, t, h, w) = x.size() 77 | # print t,h,w 78 | # out_t = np.ceil(float(t) / float(self._stride[0])) 79 | # out_h = np.ceil(float(h) / float(self._stride[1])) 80 | # out_w = np.ceil(float(w) / float(self._stride[2])) 81 | # print out_t, out_h, out_w 82 | pad_t = self.compute_pad(0, t) 83 | pad_h = self.compute_pad(1, h) 84 | pad_w = self.compute_pad(2, w) 85 | # print pad_t, pad_h, pad_w 86 | 87 | pad_t_f = pad_t // 2 88 | pad_t_b = pad_t - pad_t_f 89 | pad_h_f = pad_h // 2 90 | pad_h_b = pad_h - pad_h_f 91 | pad_w_f = pad_w // 2 92 | pad_w_b = pad_w - pad_w_f 93 | 94 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 95 | # print x.size() 96 | # print pad 97 | x = F.pad(x, pad) 98 | # print x.size() 99 | 100 | x = self.conv3d(x) 101 | if self._use_batch_norm: 102 | x = self.bn(x) 103 | if self._activation_fn is not None: 104 | x = self._activation_fn(x) 105 | return x 106 | 107 | class InceptionModule(nn.Module): 108 | def __init__(self, in_channels, out_channels, name): 109 | super(InceptionModule, self).__init__() 110 | 111 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_0/Conv3d_0a_1x1') 112 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_1/Conv3d_0a_1x1') 113 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], name=name + '/Branch_1/Conv3d_0b_3x3') 114 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_2/Conv3d_0a_1x1') 115 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], name=name + '/Branch_2/Conv3d_0b_3x3') 116 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(1, 1, 1), padding=0) 117 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_3/Conv3d_0b_1x1') 118 | self.name = name 119 | 120 | def forward(self, x): 121 | b0 = self.b0(x) 122 | b1 = 
self.b1b(self.b1a(x)) 123 | b2 = self.b2b(self.b2a(x)) 124 | b3 = self.b3b(self.b3a(x)) 125 | return torch.cat([b0, b1, b2, b3], dim=1) 126 | 127 | class InceptionI3d(nn.Module): 128 | """Inception-v1 I3D architecture. 129 | The model is introduced in: 130 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 131 | Joao Carreira, Andrew Zisserman 132 | https://arxiv.org/pdf/1705.07750v1.pdf. 133 | See also the Inception architecture, introduced in: 134 | Going deeper with convolutions 135 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 136 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 137 | http://arxiv.org/pdf/1409.4842v1.pdf. 138 | """ 139 | 140 | # Endpoints of the model in order. During construction, all the endpoints up 141 | # to a designated `final_endpoint` are returned in a dictionary as the 142 | # second return value. 143 | VALID_ENDPOINTS = ( 144 | 'Conv3d_1a_7x7', 145 | 'MaxPool3d_2a_3x3', 146 | 'Conv3d_2b_1x1', 147 | 'Conv3d_2c_3x3', 148 | 'MaxPool3d_3a_3x3', 149 | 'Mixed_3b', 150 | 'Mixed_3c', 151 | 'MaxPool3d_4a_3x3', 152 | 'Mixed_4b', 153 | 'Mixed_4c', 154 | 'Mixed_4d', 155 | 'Mixed_4e', 156 | 'Mixed_4f', 157 | 'MaxPool3d_5a_2x2', 158 | 'Mixed_5b', 159 | 'Mixed_5c', 160 | 'Logits', 161 | 'Predictions', 162 | ) 163 | 164 | def __init__(self, num_classes=400, spatial_squeeze=True, final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5): 165 | """Initializes I3D model instance. 166 | Args: 167 | num_classes: The number of outputs in the logit layer (default 400, which 168 | matches the Kinetics dataset). 169 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits 170 | before returning (default True). 171 | final_endpoint: The model contains many possible endpoints. 172 | `final_endpoint` specifies the last endpoint for the model to be built 173 | up to. In addition to the output at `final_endpoint`, all the outputs 174 | at endpoints up to `final_endpoint` will also be returned, in a 175 | dictionary. `final_endpoint` must be one of 176 | InceptionI3d.VALID_ENDPOINTS (default 'Logits'). 177 | name: A string (optional). The name of this module. 178 | Raises: 179 | ValueError: if `final_endpoint` is not recognized. 
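        Note for this test-time variant: every endpoint constructed below is passed
        through __freeze_layer, which sets requires_grad=False on its parameters, so
        the backbone acts as a fixed feature extractor. An illustrative check, where
        `model` is an instance of this class:
            frozen = not any(p.requires_grad for p in model.parameters())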
180 | """ 181 | 182 | if final_endpoint not in self.VALID_ENDPOINTS: 183 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 184 | 185 | super(InceptionI3d, self).__init__() 186 | self._num_classes = num_classes 187 | self._spatial_squeeze = spatial_squeeze 188 | self._final_endpoint = final_endpoint 189 | self.logits = None 190 | 191 | if self._final_endpoint not in self.VALID_ENDPOINTS: 192 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint) 193 | 194 | self.end_points = {} 195 | end_point = 'Conv3d_1a_7x7' 196 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], stride=(2, 2, 2), padding=(3, 3, 3), name=name + end_point) 197 | self.__freeze_layer(self.end_points[end_point]) 198 | if self._final_endpoint == end_point: 199 | return 200 | 201 | end_point = 'MaxPool3d_2a_3x3' 202 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0) 203 | self.__freeze_layer(self.end_points[end_point]) 204 | if self._final_endpoint == end_point: 205 | return 206 | 207 | end_point = 'Conv3d_2b_1x1' 208 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, name=name + end_point) 209 | self.__freeze_layer(self.end_points[end_point]) 210 | if self._final_endpoint == end_point: 211 | return 212 | 213 | end_point = 'Conv3d_2c_3x3' 214 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, name=name + end_point) 215 | self.__freeze_layer(self.end_points[end_point]) 216 | if self._final_endpoint == end_point: 217 | return 218 | 219 | end_point = 'MaxPool3d_3a_3x3' 220 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0) 221 | self.__freeze_layer(self.end_points[end_point]) 222 | if self._final_endpoint == end_point: 223 | return 224 | 225 | end_point = 'Mixed_3b' 226 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point) 227 | self.__freeze_layer(self.end_points[end_point]) 228 | if self._final_endpoint == end_point: 229 | return 230 | 231 | end_point = 'Mixed_3c' 232 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point) 233 | self.__freeze_layer(self.end_points[end_point]) 234 | if self._final_endpoint == end_point: 235 | return 236 | 237 | end_point = 'MaxPool3d_4a_3x3' 238 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0) 239 | self.__freeze_layer(self.end_points[end_point]) 240 | if self._final_endpoint == end_point: 241 | return 242 | 243 | end_point = 'Mixed_4b' 244 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) 245 | self.__freeze_layer(self.end_points[end_point]) 246 | if self._final_endpoint == end_point: 247 | return 248 | 249 | end_point = 'Mixed_4c' 250 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) 251 | self.__freeze_layer(self.end_points[end_point]) 252 | if self._final_endpoint == end_point: 253 | return 254 | 255 | end_point = 'Mixed_4d' 256 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) 257 | self.__freeze_layer(self.end_points[end_point]) 258 | if self._final_endpoint == end_point: 259 | return 260 | 261 | end_point = 'Mixed_4e' 262 | self.end_points[end_point] = 
InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) 263 | self.__freeze_layer(self.end_points[end_point]) 264 | if self._final_endpoint == end_point: 265 | return 266 | 267 | end_point = 'Mixed_4f' 268 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point) 269 | self.__freeze_layer(self.end_points[end_point]) 270 | if self._final_endpoint == end_point: 271 | return 272 | 273 | end_point = 'MaxPool3d_5a_2x2' 274 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0) 275 | self.__freeze_layer(self.end_points[end_point]) 276 | if self._final_endpoint == end_point: 277 | return 278 | 279 | end_point = 'Mixed_5b' 280 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], name + end_point) 281 | self.__freeze_layer(self.end_points[end_point]) 282 | if self._final_endpoint == end_point: 283 | return 284 | 285 | end_point = 'Mixed_5c' 286 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point) 287 | self.__freeze_layer(self.end_points[end_point]) 288 | if self._final_endpoint == end_point: 289 | return 290 | 291 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1)) 292 | self.__freeze_layer(self.avg_pool) 293 | self.dropout = nn.Dropout(dropout_keep_prob) 294 | self.__freeze_layer(self.dropout) 295 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits') 296 | self.__freeze_layer(self.logits) 297 | 298 | self.build() 299 | 300 | def replace_logits(self, num_classes): 301 | self._num_classes = num_classes 302 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits') 303 | pass 304 | 305 | def build(self): 306 | for k in self.end_points.keys(): 307 | self.add_module(k, self.end_points[k]) 308 | 309 | def forward(self, x): 310 | for end_point in self.VALID_ENDPOINTS: 311 | if end_point in self.end_points: 312 | # use _modules to work with dataparallel 313 | # x = self.end_points[end_point](x) 314 | x = self._modules[end_point](x) 315 | return x 316 | 317 | def extract_features(self, x): 318 | for end_point in self.VALID_ENDPOINTS: 319 | if end_point in self.end_points: 320 | x = self._modules[end_point](x) 321 | return self.avg_pool(x) 322 | 323 | def __freeze_layer(self, layer): 324 | layer_params = layer.parameters() 325 | for param in layer_params: 326 | param.requires_grad = False 327 | -------------------------------------------------------------------------------- /nets/resnet_152_keras.py: -------------------------------------------------------------------------------- 1 | import os 2 | import keras.backend as K 3 | 4 | from keras import initializers 5 | from keras.layers import Input 6 | from keras.layers import Dense 7 | from keras.layers import Conv2D 8 | from keras.layers import MaxPooling2D 9 | from keras.layers import AveragePooling2D 10 | from keras.layers import ZeroPadding2D 11 | from keras.layers import Flatten 12 | from keras.layers import Activation 13 | from keras.layers import add 14 | from keras.layers import BatchNormalization 15 | from keras.layers import GlobalAveragePooling2D 16 | from keras.layers import GlobalMaxPooling2D 17 | 
18 | from keras.models import Model 19 | from keras.engine import Layer, InputSpec 20 | from keras.engine import get_source_inputs 21 | 22 | from keras.utils.data_utils import get_file 23 | from keras.applications.imagenet_utils import imagenet_utils 24 | 25 | from core import const as c 26 | 27 | # WEIGHTS_PATH = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels.h5' 28 | # WEIGHTS_PATH_NO_TOP = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5' 29 | WEIGHTS_PATH = '%s/keras_models/resnet_152/resnet152_weights_tf_dim_ordering_tf_kernels.h5' % (c.DATA_ROOT_PATH) 30 | WEIGHTS_PATH_NO_TOP = '%s/keras_models/resnet_152/resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5' % (c.DATA_ROOT_PATH) 31 | 32 | class Scale(Layer): 33 | """ Custom Layer for ResNet used for BatchNormalization. 34 | 35 | Learns a set of weights and biases used for scaling the input data. 36 | the output consists simply in an element-wise multiplication of the input 37 | and a sum of a set of constants: 38 | out = in * gamma + beta, 39 | where 'gamma' and 'beta' are the weights and biases larned. 40 | # Arguments 41 | axis: integer, axis along which to normalize in mode 0. For instance, 42 | if your input tensor has shape (samples, channels, rows, cols), 43 | set axis to 1 to normalize per feature map (channels axis). 44 | momentum: momentum in the computation of the 45 | exponential average of the mean and standard deviation 46 | of the data, for feature-wise normalization. 47 | weights: Initialization weights. 48 | List of 2 Numpy arrays, with shapes: 49 | `[(input_shape,), (input_shape,)]` 50 | beta_init: name of initialization function for shift parameter 51 | (see [initializers](../initializers.md)), or alternatively, 52 | Theano/TensorFlow function to use for weights initialization. 53 | This parameter is only relevant if you don't pass a `weights` argument. 54 | gamma_init: name of initialization function for scale parameter (see 55 | [initializers](../initializers.md)), or alternatively, 56 | Theano/TensorFlow function to use for weights initialization. 57 | This parameter is only relevant if you don't pass a `weights` argument. 
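        As a small illustrative example: for an input of shape (samples, rows, cols, channels)
        and axis=-1, the layer reshapes `gamma` and `beta` to (1, 1, 1, channels) and returns
        x * gamma + beta, i.e. one learned scale and one learned shift per channel.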
58 | """ 59 | 60 | def __init__(self, weights=None, axis=-1, momentum=0.9, beta_init='zero', gamma_init='one', **kwargs): 61 | self.momentum = momentum 62 | self.axis = axis 63 | self.beta_init = initializers.get(beta_init) 64 | self.gamma_init = initializers.get(gamma_init) 65 | self.initial_weights = weights 66 | super(Scale, self).__init__(**kwargs) 67 | 68 | def build(self, input_shape): 69 | self.input_spec = [InputSpec(shape=input_shape)] 70 | shape = (int(input_shape[self.axis]),) 71 | 72 | self.gamma = K.variable(self.gamma_init(shape), name='%s_gamma' % self.name) 73 | self.beta = K.variable(self.beta_init(shape), name='%s_beta' % self.name) 74 | self.trainable_weights = [self.gamma, self.beta] 75 | 76 | if self.initial_weights is not None: 77 | self.set_weights(self.initial_weights) 78 | del self.initial_weights 79 | 80 | def call(self, x, mask=None): 81 | input_shape = self.input_spec[0].shape 82 | broadcast_shape = [1] * len(input_shape) 83 | broadcast_shape[self.axis] = input_shape[self.axis] 84 | 85 | out = K.reshape(self.gamma, broadcast_shape) * x + K.reshape(self.beta, broadcast_shape) 86 | return out 87 | 88 | def get_config(self): 89 | config = {"momentum": self.momentum, "axis": self.axis} 90 | base_config = super(Scale, self).get_config() 91 | return dict(list(base_config.items()) + list(config.items())) 92 | 93 | def identity_block(input_tensor, kernel_size, filters, stage, block): 94 | """ 95 | The identity_block is the block that has no conv layer at shortcut 96 | # Arguments 97 | input_tensor: input tensor 98 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 99 | filters: list of integers, the nb_filters of 3 conv layer at main path 100 | stage: integer, current stage label, used for generating layer names 101 | block: 'a','b'..., current block label, used for generating layer names 102 | """ 103 | eps = 1.1e-5 104 | nb_filter1, nb_filter2, nb_filter3 = filters 105 | conv_name_base = 'res' + str(stage) + block + '_branch' 106 | bn_name_base = 'bn' + str(stage) + block + '_branch' 107 | scale_name_base = 'scale' + str(stage) + block + '_branch' 108 | 109 | if K.image_data_format() == 'channels_last': 110 | bn_axis = 3 111 | else: 112 | bn_axis = 1 113 | 114 | x = Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', use_bias=False)(input_tensor) 115 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x) 116 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x) 117 | x = Activation('relu', name=conv_name_base + '2a_relu')(x) 118 | 119 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x) 120 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), name=conv_name_base + '2b', use_bias=False)(x) 121 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x) 122 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x) 123 | x = Activation('relu', name=conv_name_base + '2b_relu')(x) 124 | 125 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x) 126 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x) 127 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x) 128 | 129 | x = add([x, input_tensor], name='res' + str(stage) + block) 130 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x) 131 | return x 132 | 133 | def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)): 134 | """ conv_block is the block that has a conv layer at shortcut 135 | # Arguments 136 | input_tensor: input 
tensor 137 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 138 | filters: list of integers, the nb_filters of 3 conv layer at main path 139 | stage: integer, current stage label, used for generating layer names 140 | block: 'a','b'..., current block label, used for generating layer names 141 | Note that from stage 3, the first conv layer at main path is with subsample=(2,2) 142 | And the shortcut should have subsample=(2,2) as well 143 | """ 144 | 145 | eps = 1.1e-5 146 | nb_filter1, nb_filter2, nb_filter3 = filters 147 | conv_name_base = 'res' + str(stage) + block + '_branch' 148 | bn_name_base = 'bn' + str(stage) + block + '_branch' 149 | scale_name_base = 'scale' + str(stage) + block + '_branch' 150 | 151 | if K.image_data_format() == 'channels_last': 152 | bn_axis = 3 153 | else: 154 | bn_axis = 1 155 | 156 | x = Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', use_bias=False)(input_tensor) 157 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x) 158 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x) 159 | x = Activation('relu', name=conv_name_base + '2a_relu')(x) 160 | 161 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x) 162 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), 163 | name=conv_name_base + '2b', use_bias=False)(x) 164 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x) 165 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x) 166 | x = Activation('relu', name=conv_name_base + '2b_relu')(x) 167 | 168 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x) 169 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x) 170 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x) 171 | 172 | shortcut = Conv2D(nb_filter3, (1, 1), strides=strides, 173 | name=conv_name_base + '1', use_bias=False)(input_tensor) 174 | shortcut = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '1')(shortcut) 175 | shortcut = Scale(axis=bn_axis, name=scale_name_base + '1')(shortcut) 176 | 177 | x = add([x, shortcut], name='res' + str(stage) + block) 178 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x) 179 | return x 180 | 181 | def ResNet152(include_top=True, weights='imagenet', input_tensor=None, input_shape=None, pooling=None, classes=1000): 182 | """ Instantiates the ResNet152 architecture. 183 | Optionally loads weights pre-trained 184 | on ImageNet. Note that when using TensorFlow, 185 | for best performance you should set 186 | `image_data_format='channels_last'` in your Keras config 187 | at ~/.keras/keras.json. 188 | The model and the weights are compatible only with 189 | TensorFlow. The data format 190 | convention used by the model is the one 191 | specified in your Keras config file. 192 | # Arguments 193 | include_top: whether to include the fully-connected 194 | layer at the top of the network. 195 | weights: one of `None` (random initialization), 196 | 'imagenet' (pre-training on ImageNet), 197 | or the path to the weights file to be loaded. 198 | input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) 199 | to use as image input for the model. 200 | input_shape: optional shape tuple, only to be specified 201 | if `include_top` is False (otherwise the input shape 202 | has to be `(224, 224, 3)` (with `channels_last` data format) 203 | or `(3, 224, 224)` (with `channels_first` data format). 
204 | It should have exactly 3 inputs channels, 205 | and width and height should be no smaller than 197. 206 | E.g. `(200, 200, 3)` would be one valid value. 207 | pooling: Optional pooling mode for feature extraction 208 | when `include_top` is `False`. 209 | - `None` means that the output of the model will be 210 | the 4D tensor output of the 211 | last convolutional layer. 212 | - `avg` means that global average pooling 213 | will be applied to the output of the 214 | last convolutional layer, and thus 215 | the output of the model will be a 2D tensor. 216 | - `max` means that global max pooling will 217 | be applied. 218 | classes: optional number of classes to classify images 219 | into, only to be specified if `include_top` is True, and 220 | if no `weights` argument is specified. 221 | # Returns 222 | A Keras model instance. 223 | # Raises 224 | ValueError: in case of invalid argument for `weights`, 225 | or invalid input shape. 226 | """ 227 | 228 | eps = 1.1e-5 229 | 230 | if not (weights in {'imagenet', None} or os.path.exists(weights)): 231 | raise ValueError('The `weights` argument should be either ' 232 | '`None` (random initialization), `imagenet` ' 233 | '(pre-training on ImageNet), ' 234 | 'or the path to the weights file to be loaded.') 235 | 236 | if weights == 'imagenet' and include_top and classes != 1000: 237 | raise ValueError('If using `weights` as imagenet with `include_top`' 238 | ' as true, `classes` should be 1000') 239 | 240 | # Determine proper input shape 241 | input_shape = imagenet_utils._obtain_input_shape(input_shape, 242 | default_size=224, 243 | min_size=197, 244 | data_format=K.image_data_format(), 245 | require_flatten=include_top, 246 | weights=weights) 247 | 248 | if input_tensor is None: 249 | img_input = Input(shape=input_shape) 250 | else: 251 | if not K.is_keras_tensor(input_tensor): 252 | img_input = Input(tensor=input_tensor, shape=input_shape, name='data') 253 | else: 254 | img_input = input_tensor 255 | 256 | # Handle dimension ordering for different backends 257 | if K.image_dim_ordering() == 'tf': 258 | bn_axis = 3 259 | else: 260 | bn_axis = 1 261 | 262 | x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input) 263 | x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=False)(x) 264 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name='bn_conv1')(x) 265 | x = Scale(axis=bn_axis, name='scale_conv1')(x) 266 | x = Activation('relu', name='conv1_relu')(x) 267 | x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1', padding='same')(x) 268 | 269 | x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) 270 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') 271 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') 272 | 273 | x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') 274 | for i in range(1, 8): 275 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='b' + str(i)) 276 | 277 | x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') 278 | for i in range(1, 36): 279 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b' + str(i)) 280 | 281 | x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') 282 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') 283 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') 284 | 285 | if include_top: 286 | # Classification block 287 | x = AveragePooling2D((7, 7), name='avg_pool')(x) 288 | x = Flatten()(x) 289 | x = Dense(classes, activation='softmax', name='fc1000')(x) 290 | else: 291 | if pooling == 
'avg': 292 | x = GlobalAveragePooling2D()(x) 293 | elif pooling == 'max': 294 | x = GlobalMaxPooling2D()(x) 295 | 296 | # Ensure that the model takes into account 297 | # any potential predecessors of `input_tensor`. 298 | if input_tensor is not None: 299 | inputs = get_source_inputs(input_tensor) 300 | else: 301 | inputs = img_input 302 | 303 | # Create model 304 | model = Model(inputs, x, name='resnet152') 305 | 306 | # Load weights 307 | if weights == 'imagenet': 308 | if include_top: 309 | weights_path = WEIGHTS_PATH 310 | else: 311 | weights_path = WEIGHTS_PATH_NO_TOP 312 | model.load_weights(weights_path) 313 | 314 | elif weights is not None: 315 | model.load_weights(weights) 316 | 317 | return model 318 | -------------------------------------------------------------------------------- /core/image_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helper functions for images. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import cv2 33 | import numpy as np 34 | import random 35 | import math 36 | from multiprocessing.dummy import Pool 37 | 38 | from core import utils 39 | 40 | # region Frame Resizing 41 | 42 | def resize_frame(image, target_height=224, target_width=224): 43 | return __resize_frame(image, target_height, target_width) 44 | 45 | def resize_keep_aspect_ratio_max_dim(image, max_dim=None): 46 | return __resize_keep_aspect_ratio_max_dim(image, max_dim) 47 | 48 | def resize_keep_aspect_ratio_min_dim(image, min_dim=None): 49 | return __resize_keep_aspect_ratio_min_dim(image, min_dim) 50 | 51 | def resize_crop(image, target_height=224, target_width=224): 52 | return __resize_crop(image, target_height, target_width) 53 | 54 | def resize_crop_scaled(image, target_height=224, target_width=224): 55 | return __resize_crop_scaled(image, target_height, target_width) 56 | 57 | def resize_keep_aspect_ratio_padded(image, target_height=224, target_width=224): 58 | return __resize_keep_aspect_ratio_padded(image, target_height, target_width) 59 | 60 | def __resize_frame(image, target_height=224, target_width=224): 61 | """ 62 | Resize to the given dimensions. Don't care about maintaining the aspect ratio of the given image. 
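    Note: cv2.resize expects dsize as (width, height); the call below passes
    (target_height, target_width), which matches the intended output only when the two
    targets are equal, as with the default 224 x 224.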
63 | """ 64 | if len(image.shape) == 2: 65 | image = np.tile(image[:, :, None], 3) 66 | elif len(image.shape) == 4: 67 | image = image[:, :, :, 0] 68 | 69 | resized_image = cv2.resize(image, dsize=(target_height, target_width)) 70 | return resized_image 71 | 72 | def __resize_keep_aspect_ratio_max_dim(image, max_dim=224): 73 | """ 74 | Resize the given image while maintaining the aspect ratio. 75 | """ 76 | if len(image.shape) == 2: 77 | image = np.tile(image[:, :, None], 3) 78 | elif len(image.shape) == 4: 79 | image = image[:, :, :, 0] 80 | 81 | height = image.shape[0] 82 | width = image.shape[1] 83 | 84 | if height > width: 85 | target_height = max_dim 86 | target_width = int(target_height * width / float(height)) 87 | else: 88 | target_width = max_dim 89 | target_height = int(target_width * height / float(width)) 90 | 91 | resized_image = cv2.resize(image, dsize=(target_width, target_height)) 92 | return resized_image 93 | 94 | def __resize_keep_aspect_ratio_min_dim(image, min_dim=224): 95 | """ 96 | Resize the given image while maintaining the aspect ratio. 97 | """ 98 | if len(image.shape) == 2: 99 | image = np.tile(image[:, :, None], 3) 100 | elif len(image.shape) == 4: 101 | image = image[:, :, :, 0] 102 | 103 | height = image.shape[0] 104 | width = image.shape[1] 105 | 106 | if height > width: 107 | target_width = min_dim 108 | target_height = int(target_width * height / float(width)) 109 | else: 110 | target_height = min_dim 111 | target_width = int(target_height * width / float(height)) 112 | 113 | resized_image = cv2.resize(image, dsize=(target_width, target_height)) 114 | return resized_image 115 | 116 | def __resize_crop(image, target_height=224, target_width=224): 117 | if len(image.shape) == 2: 118 | image = np.tile(image[:, :, None], 3) 119 | elif len(image.shape) == 4: 120 | image = image[:, :, :, 0] 121 | 122 | height, width, rgb = image.shape 123 | if width == height: 124 | resized_image = cv2.resize(image, (target_height, target_width)) 125 | 126 | elif height < width: 127 | resized_image = cv2.resize(image, (int(width * float(target_height) / height), target_width)) 128 | cropping_length = int((resized_image.shape[1] - target_height) / 2) 129 | resized_image = resized_image[:, cropping_length:resized_image.shape[1] - cropping_length] 130 | 131 | else: 132 | resized_image = cv2.resize(image, (target_height, int(height * float(target_width) / width))) 133 | cropping_length = int((resized_image.shape[0] - target_width) / 2) 134 | resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length, :] 135 | 136 | resized_image = cv2.resize(resized_image, (target_height, target_width)) 137 | return resized_image 138 | 139 | def __resize_crop_scaled(image, target_height=224, target_width=224): 140 | # re-scale the image by ratio 3/4 so a landscape or portrait image becomes square 141 | # then resize_crop it 142 | 143 | # for example, if input image is (height*width) is 400*1000 it will be (400 * 1000 * 3/4) = 400 * 750 144 | 145 | if len(image.shape) == 2: 146 | image = np.tile(image[:, :, None], 3) 147 | elif len(image.shape) == 4: 148 | image = image[:, :, :, 0] 149 | 150 | height, width, _ = image.shape 151 | if width == height: 152 | resized_image = cv2.resize(image, (target_height, target_width)) 153 | else: 154 | 155 | # first, rescale it, only if the rescale won't bring the scaled dimention to lower than target_dim (= 224) 156 | scale_factor = 3 / 4.0 157 | if height < width: 158 | new_width = int(width * scale_factor) 159 | if new_width >= 
target_width: 160 | image = cv2.resize(image, (new_width, height)) 161 | else: 162 | new_height = int(height * scale_factor) 163 | if new_height >= target_height: 164 | image = cv2.resize(image, (width, new_height)) 165 | 166 | # now, resize and crop 167 | height, width, _ = image.shape 168 | if height < width: 169 | resized_image = cv2.resize(image, (int(width * float(target_height) / height), target_width)) 170 | cropping_length = int((resized_image.shape[1] - target_height) / 2) 171 | resized_image = resized_image[:, cropping_length:resized_image.shape[1] - cropping_length] 172 | 173 | else: 174 | resized_image = cv2.resize(image, (target_height, int(height * float(target_width) / width))) 175 | cropping_length = int((resized_image.shape[0] - target_width) / 2) 176 | resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length, :] 177 | 178 | # this line is important, because sometimes the cropping there is a 1 pixel more 179 | height, width, _ = resized_image.shape 180 | if height > target_height or width > target_width: 181 | resized_image = cv2.resize(resized_image, (target_height, target_width)) 182 | 183 | return resized_image 184 | 185 | def __resize_keep_aspect_ratio_padded(image, target_height=224, target_width=224): 186 | """ 187 | Resize the frame while keeping aspect ratio. Also, to result in an image with the given dimensions, the resized image is zero-padded. 188 | """ 189 | 190 | if len(image.shape) == 2: 191 | image = np.tile(image[:, :, None], 3) 192 | elif len(image.shape) == 4: 193 | image = image[:, :, :, 0] 194 | 195 | original_height, original_width, _ = image.shape 196 | original_aspect_ratio = original_height / float(original_width) 197 | target_aspect_ratio = target_height / float(target_width) 198 | 199 | if target_aspect_ratio >= original_aspect_ratio: 200 | if original_width >= original_height: 201 | max_dim = target_width 202 | else: 203 | max_dim = int(original_height * target_width / float(original_width)) 204 | else: 205 | if original_height >= original_width: 206 | max_dim = target_height 207 | else: 208 | max_dim = int(original_width * target_height / float(original_height)) 209 | 210 | image = __resize_keep_aspect_ratio_max_dim(image, max_dim=max_dim) 211 | 212 | new_height, new_width, _ = image.shape 213 | new_aspect_ratio = new_height / float(new_width) 214 | 215 | # do zero-padding for the image (vertical or horizontal) 216 | img_padded = np.zeros((target_height, target_width, 3), dtype=image.dtype) 217 | 218 | if target_aspect_ratio < new_aspect_ratio: 219 | # horizontal padding 220 | y1 = 0 221 | y2 = new_height 222 | x1 = int((target_width - new_width) / 2.0) 223 | x2 = x1 + new_width 224 | else: 225 | # vertical padding 226 | x1 = 0 227 | x2 = new_width 228 | y1 = int((target_height - new_height) / 2.0) 229 | y2 = y1 + new_height 230 | 231 | img_padded[y1:y2, x1:x2, :] = image 232 | return img_padded 233 | 234 | # endregion 235 | 236 | # region Image Reader ResNet-152 Keras 237 | 238 | class AsyncImageReaderResNet152Keras(): 239 | def __init__(self, bgr_mean, n_threads=20): 240 | random.seed(101) 241 | np.random.seed(101) 242 | 243 | self.__is_busy = False 244 | self.__images = None 245 | self.__n_channels = 3 246 | self.__img_dim = 224 247 | self.__bgr_mean = bgr_mean 248 | 249 | self.__n_threads_in_pool = n_threads 250 | self.__pool = Pool(self.__n_threads_in_pool) 251 | 252 | def load_imgs_in_batch(self, image_pathes): 253 | self.__is_busy = True 254 | 255 | n_pathes = len(image_pathes) 256 | idxces = np.arange(0, 
n_pathes) 257 | 258 | # parameters passed to the reading function 259 | params = [data_item for data_item in zip(idxces, image_pathes)] 260 | 261 | # set list of images before start reading 262 | imgs_shape = (n_pathes, self.__img_dim, self.__img_dim, self.__n_channels) 263 | self.__images = np.zeros(imgs_shape, dtype=np.float32) 264 | 265 | # start pool of threads 266 | self.__pool.map_async(self.__preprocess_img_wrapper, params, callback=self.__thread_pool_callback) 267 | 268 | def get_images(self): 269 | if self.__is_busy: 270 | raise Exception('Sorry, you can\'t get images while threads are running!') 271 | else: 272 | return self.__images 273 | 274 | def is_busy(self): 275 | return self.__is_busy 276 | 277 | def __thread_pool_callback(self, args): 278 | self.__is_busy = False 279 | 280 | def __preprocess_img_wrapper(self, params): 281 | try: 282 | self.__preprocess_img(params) 283 | except Exception as exp: 284 | print ('Error in __preprocess_img') 285 | print (exp) 286 | 287 | def __preprocess_img(self, params): 288 | 289 | idx = params[0] 290 | path = params[1] 291 | 292 | img = cv2.imread(path) 293 | img = img.astype(np.float32) 294 | 295 | # subtract mean pixel from image 296 | img[:, :, 0] -= self.__bgr_mean[0] 297 | img[:, :, 1] -= self.__bgr_mean[1] 298 | img[:, :, 2] -= self.__bgr_mean[2] 299 | 300 | # convert from bgr to rgb 301 | img = img[:, :, (2, 1, 0)] 302 | 303 | self.__images[idx] = img 304 | 305 | def close(self): 306 | self.__pool.close() 307 | self.__pool.terminate() 308 | 309 | # endregion 310 | 311 | # region Image/Video Readers MultiTHUMOS 312 | 313 | class AsyncImageReaderMultiTHUMOSForI3DKerasModel(): 314 | def __init__(self, n_threads=20): 315 | random.seed(101) 316 | np.random.seed(101) 317 | 318 | self.__is_busy = False 319 | self.__images = None 320 | self.__n_channels = 3 321 | self.__img_dim = 224 322 | 323 | self.__n_threads_in_pool = n_threads 324 | self.__pool = Pool(self.__n_threads_in_pool) 325 | 326 | def load_imgs_in_batch(self, image_pathes): 327 | self.__is_busy = True 328 | 329 | n_pathes = len(image_pathes) 330 | idxces = np.arange(0, n_pathes) 331 | 332 | # parameters passed to the reading function 333 | params = [data_item for data_item in zip(idxces, image_pathes)] 334 | 335 | # set list of images before start reading 336 | imgs_shape = (n_pathes, self.__img_dim, self.__img_dim, self.__n_channels) 337 | self.__images = np.zeros(imgs_shape, dtype=np.float32) 338 | 339 | # start pool of threads 340 | self.__pool.map_async(self.__preprocess_img_wrapper, params, callback=self.__thread_pool_callback) 341 | 342 | def get_images(self): 343 | if self.__is_busy: 344 | raise Exception('Sorry, you can\'t get images while threads are running!') 345 | else: 346 | return self.__images 347 | 348 | def is_busy(self): 349 | return self.__is_busy 350 | 351 | def __thread_pool_callback(self, args): 352 | self.__is_busy = False 353 | 354 | def __preprocess_img_wrapper(self, params): 355 | try: 356 | self.__preprocess_img(params) 357 | except Exception as exp: 358 | print ('Error in __preprocess_img') 359 | print (exp) 360 | 361 | def __preprocess_img(self, params): 362 | 363 | idx = params[0] 364 | path = params[1] 365 | 366 | img = cv2.imread(path) 367 | img = img.astype(np.float32) 368 | # normalize such that values range from -1 to 1 369 | img /= float(127.5) 370 | img -= 1.0 371 | # convert from bgr to rgb 372 | img = img[:, :, (2, 1, 0)] 373 | 374 | self.__images[idx] = img 375 | 376 | def close(self): 377 | self.__pool.close() 378 | 
        self.__pool.terminate()
379 | 
380 | # endregion
381 | 
382 | # region Image/Video Readers Breakfast
383 | 
384 | class AsyncImageReaderBreakfastForI3DKerasModel():
385 |     def __init__(self, n_threads=20):
386 |         random.seed(101)
387 |         np.random.seed(101)
388 | 
389 |         self.__is_busy = False
390 |         self.__images = None
391 |         self.__n_channels = 3
392 |         self.__img_dim = 224
393 | 
394 |         self.__n_threads_in_pool = n_threads
395 |         self.__pool = Pool(self.__n_threads_in_pool)
396 | 
397 |     def load_imgs_in_batch(self, image_pathes):
398 |         self.__is_busy = True
399 | 
400 |         n_pathes = len(image_pathes)
401 |         idxces = np.arange(0, n_pathes)
402 | 
403 |         # parameters passed to the reading function
404 |         params = [data_item for data_item in zip(idxces, image_pathes)]
405 | 
406 |         # set list of images before start reading
407 |         imgs_shape = (n_pathes, self.__img_dim, self.__img_dim, self.__n_channels)
408 |         self.__images = np.zeros(imgs_shape, dtype=np.float32)
409 | 
410 |         # start pool of threads
411 |         self.__pool.map_async(self.__preprocess_img_wrapper, params, callback=self.__thread_pool_callback)
412 | 
413 |     def get_images(self):
414 |         if self.__is_busy:
415 |             raise Exception('Sorry, you can\'t get images while threads are running!')
416 |         else:
417 |             return self.__images
418 | 
419 |     def is_busy(self):
420 |         return self.__is_busy
421 | 
422 |     def __thread_pool_callback(self, args):
423 |         self.__is_busy = False
424 | 
425 |     def __preprocess_img_wrapper(self, params):
426 |         try:
427 |             self.__preprocess_img(params)
428 |         except Exception as exp:
429 |             print ('Error in __preprocess_img')
430 |             print (exp)
431 | 
432 |     def __preprocess_img(self, params):
433 | 
434 |         idx = params[0]
435 |         path = params[1]
436 | 
437 |         img = cv2.imread(path)
438 |         img = img.astype(np.float32)
439 |         # normalize such that values range from -1 to 1
440 |         img /= float(127.5)
441 |         img -= 1.0
442 |         # convert from bgr to rgb
443 |         img = img[:, :, (2, 1, 0)]
444 | 
445 |         self.__images[idx] = img
446 | 
447 |     def close(self):
448 |         self.__pool.close()
449 |         self.__pool.terminate()
450 | 
451 | # endregion
452 | 
--------------------------------------------------------------------------------
/data/assets/timeception_layer.svg:
--------------------------------------------------------------------------------
[Figure: the Timeception layer (data/assets/timeception_layer.svg); text labels from the drawing:]
(a) Timeception Layer — the input (T × L × L × C) is split into N channel groups; each group (T × L × L × C/N) passes through a Temp Conv module, the group outputs are merged by Concat + Shuffle back to T × L × L × C, and a Max 1D pooling (k=2, s=2) reduces the temporal dimension, giving T/2 × L × L × C.
(b) Temporal Conv Module — per group (T × L × L × C/N): branches built from Conv 2D (k=1x1, s=1) reductions, temporal Conv 1D convolutions (k=3, 5, 7, s=1) and a Max 1D pooling (k=2, s=1); each branch output is T × L × L × C/(M.N), and the branches are concatenated into T × L × L × 5C/(M.N).
--------------------------------------------------------------------------------
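
Usage sketch (not part of the repository): a minimal example of how the asynchronous reader defined in core/image_utils.py above could prepare a batch of frames for the Keras ResNet-152 feature extractor. The class and method names are taken from the file above; the BGR mean values, the frame paths, and the `feature_model` object are placeholders (the model is assumed to be built by the constructor in nets/resnet_152_keras.py with `include_top=False, pooling='avg'`), and the reader assumes the frames on disk are already 224x224, since it does not resize them.

import time
import numpy as np
from core.image_utils import AsyncImageReaderResNet152Keras

# hypothetical inputs: mean BGR pixel of the training images and a list of 224x224 frame paths
bgr_mean = np.array([103.939, 116.779, 123.68], dtype=np.float32)
frame_pathes = ['/some/path/frame_0001.jpg', '/some/path/frame_0002.jpg']

reader = AsyncImageReaderResNet152Keras(bgr_mean, n_threads=8)
reader.load_imgs_in_batch(frame_pathes)  # dispatches the thread pool and returns immediately

# wait until all frames are read, mean-subtracted and converted from BGR to RGB
while reader.is_busy():
    time.sleep(0.1)

images = reader.get_images()  # float32 array of shape (n_frames, 224, 224, 3)
# features = feature_model.predict(images)  # e.g. a ResNet-152 built with include_top=False, pooling='avg'
reader.close()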