├── core ├── __init__.py ├── const.py ├── metrics.py ├── config.py ├── keras_utils.py ├── config_utils.py ├── pytorch_utils.py ├── data_utils.py ├── utils.py └── image_utils.py ├── nets ├── __init__.py ├── resnet_152_pytorch.py ├── layers_pytorch.py ├── i3d_torch_charades_utils.py ├── timeception_pytorch.py ├── i3d_torch_charades.py ├── i3d_torch_charades_test.py └── resnet_152_keras.py ├── datasets └── __init__.py ├── experiments ├── __init__.py ├── test_pytorch.py ├── test_keras.py ├── train_keras.py └── train_pytorch.py ├── data └── assets │ ├── badge-keras.png │ ├── badge-pytorch.png │ ├── badge-tensorflow.png │ ├── timeception_layer.jpg │ ├── timeception_layer.pdf │ └── timeception_layer.svg ├── scripts ├── test_charades_i3d_tc4_f1024.sh └── train_charades_i3d_tc4_f1024.sh ├── requirements.txt ├── __doc__.py ├── configs ├── charades_i3d_tc2_f256.yaml ├── charades_i3d_tc3_f256.yaml ├── charades_i3d_tc3_f512.yaml └── charades_i3d_tc4_f1024.yaml ├── main.py └── README.md /core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/test_pytorch.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/assets/badge-keras.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/badge-keras.png -------------------------------------------------------------------------------- /data/assets/badge-pytorch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/badge-pytorch.png -------------------------------------------------------------------------------- /data/assets/badge-tensorflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/badge-tensorflow.png -------------------------------------------------------------------------------- /data/assets/timeception_layer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/timeception_layer.jpg -------------------------------------------------------------------------------- /data/assets/timeception_layer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/timeception_layer.pdf -------------------------------------------------------------------------------- /scripts/test_charades_i3d_tc4_f1024.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python 
../experiments/test.py --config_file charades_i3d_tc4_f1024.yaml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | opencv 3 | scikit-learn 4 | 5 | keras 6 | tensorflow-gpu 7 | torch 8 | torchvision 9 | torchsummary 10 | torchviz -------------------------------------------------------------------------------- /scripts/train_charades_i3d_tc4_f1024.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python ../experiments/train.py --config_file charades_i3d_tc4_f1024.yaml 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /__doc__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | __author__ = 'Noureldien Hussein' 24 | __copyright__ = 'Copyright (c) 2019, Noureldien Hussein' 25 | __credits__ = [''] 26 | __license__ = 'GPLv3' 27 | __version__ = '1.0.0' 28 | __maintainer__ = 'Noureldien Hussein' 29 | __email__ = 'nhussein@uva.nl' 30 | __status__ = 'Development' 31 | -------------------------------------------------------------------------------- /configs/charades_i3d_tc2_f256.yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env vim 2 | 3 | NUM_GPUS: 1 # how many gups to use 4 | LOG_PERIOD: 10 # log period 5 | DATASET_NAME: 'charades' # name of dataset 6 | 7 | MODEL: 8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl' 9 | N_CLASSES: 157 # how many classes as output 10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups 11 | N_TC_LAYERS: 2 # number of timeception layers 12 | N_TC_TIMESTEPS: 32 # how mant timesteps expected as input to the timeception layers 13 | N_INPUT_TIMESTEPS: 256 # how many timesteps (i.e. 
frames) expected as an input to the backbone CNN 14 | NAME: 'charades_timeception' # name suffex for the model to be trained 15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb'# which backbone cnn is used 16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn 17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 18 | 19 | TRAIN: 20 | BATCH_SIZE: 32 # batch size for training 21 | N_EPOCHS: 500 # how many training epochs 22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only') 23 | N_WORKERS: 10 # how many parallel workers in the data generator 24 | 25 | TEST: 26 | BATCH_SIZE: 64 27 | N_SAMPLES: 10 28 | 29 | SOLVER: 30 | NAME: 'adam' 31 | LR: 0.01 32 | ADAM_EPSILON: 0.0001 33 | SGD_WEIGHT_DECAY: 0.0001 34 | SGD_MOMENTUM: 0.9 35 | SGD_NESTEROV: True -------------------------------------------------------------------------------- /configs/charades_i3d_tc3_f256.yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env vim 2 | 3 | NUM_GPUS: 1 # how many gups to use 4 | LOG_PERIOD: 10 # log period 5 | DATASET_NAME: 'charades' # name of dataset 6 | 7 | MODEL: 8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl' 9 | N_CLASSES: 157 # how many classes as output 10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups 11 | N_TC_LAYERS: 3 # number of timeception layers 12 | N_TC_TIMESTEPS: 32 # how mant timesteps expected as input to the timeception layers 13 | N_INPUT_TIMESTEPS: 256 # how many timesteps (i.e. frames) expected as an input to the backbone CNN 14 | NAME: 'charades_timeception' # name suffex for the model to be trained 15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb'# which backbone cnn is used 16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn 17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 18 | 19 | TRAIN: 20 | BATCH_SIZE: 32 # batch size for training 21 | N_EPOCHS: 500 # how many training epochs 22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only') 23 | N_WORKERS: 10 # how many parallel workers in the data generator 24 | 25 | TEST: 26 | BATCH_SIZE: 50 27 | N_SAMPLES: 10 28 | 29 | SOLVER: 30 | NAME: 'adam' 31 | LR: 0.01 32 | ADAM_EPSILON: 0.0001 33 | SGD_WEIGHT_DECAY: 0.0001 34 | SGD_MOMENTUM: 0.9 35 | SGD_NESTEROV: True -------------------------------------------------------------------------------- /configs/charades_i3d_tc3_f512.yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env vim 2 | 3 | NUM_GPUS: 1 # how many gups to use 4 | LOG_PERIOD: 10 # log period 5 | DATASET_NAME: 'charades' # name of dataset 6 | 7 | MODEL: 8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl' 9 | N_CLASSES: 157 # how many classes as output 10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups 11 | N_TC_LAYERS: 3 # number of timeception layers 12 | N_TC_TIMESTEPS: 64 # how mant timesteps expected as input to the timeception layers 13 | N_INPUT_TIMESTEPS: 512 # how many timesteps (i.e. 
frames) expected as an input to the backbone CNN 14 | NAME: 'charades_timeception' # name suffex for the model to be trained 15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb'# which backbone cnn is used 16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn 17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 18 | 19 | TRAIN: 20 | BATCH_SIZE: 20 # batch size for training 21 | N_EPOCHS: 500 # how many training epochs 22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only') 23 | N_WORKERS: 10 # how many parallel workers in the data generator 24 | 25 | TEST: 26 | BATCH_SIZE: 40 27 | N_SAMPLES: 10 28 | 29 | SOLVER: 30 | NAME: 'adam' 31 | LR: 0.01 32 | ADAM_EPSILON: 0.0001 33 | SGD_WEIGHT_DECAY: 0.0001 34 | SGD_MOMENTUM: 0.9 35 | SGD_NESTEROV: True -------------------------------------------------------------------------------- /configs/charades_i3d_tc4_f1024.yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env vim 2 | 3 | NUM_GPUS: 1 # how many gups to use 4 | LOG_PERIOD: 10 # log period 5 | DATASET_NAME: 'charades' # name of dataset 6 | 7 | MODEL: 8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl' 9 | N_CLASSES: 157 # how many classes as output 10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups 11 | N_TC_LAYERS: 4 # number of timeception layers 12 | N_TC_TIMESTEPS: 128 # how mant timesteps expected as input to the timeception layers 13 | N_INPUT_TIMESTEPS: 1024 # how many timesteps (i.e. frames) expected as an input to the backbone CNN 14 | NAME: 'charades_timeception' # name suffex for the model to be trained 15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb'# which backbone cnn is used 16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn 17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 18 | 19 | TRAIN: 20 | BATCH_SIZE: 16 # batch size for training 21 | N_EPOCHS: 500 # how many training epochs 22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only') 23 | N_WORKERS: 10 # how many parallel workers in the data generator 24 | 25 | TEST: 26 | BATCH_SIZE: 32 27 | N_SAMPLES: 10 28 | 29 | SOLVER: 30 | NAME: 'adam' 31 | LR: 0.01 32 | ADAM_EPSILON: 0.0001 33 | SGD_WEIGHT_DECAY: 0.0001 34 | SGD_MOMENTUM: 0.9 35 | SGD_NESTEROV: True -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 
21 | ######################################################################## 22 | 23 | """ 24 | Main file of the project. 25 | """ 26 | 27 | def __main(): 28 | from experiments import train_keras, test_keras, train_pytorch, test_pytorch 29 | 30 | # to train Timeception using keras 31 | train_keras.__main() 32 | 33 | # or using pytorch 34 | # train_pytorch.__main() 35 | 36 | # to test Timeception using keras 37 | # test_keras.__main() 38 | 39 | # or using pytorch 40 | # test_pytorch.__main() 41 | 42 | if __name__ == '__main__': 43 | __main() 44 | pass 45 | -------------------------------------------------------------------------------- /experiments/test_keras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Test Timeception models. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | import os 34 | import datetime 35 | from optparse import OptionParser 36 | 37 | import tensorflow as tf 38 | import keras.backend as K 39 | from keras.layers import Dense, LeakyReLU, Dropout, Input, Activation 40 | from keras.optimizers import SGD, Adam 41 | from keras.models import Sequential, Model 42 | from keras.layers.normalization import BatchNormalization 43 | 44 | from nets import timeception 45 | from nets.layers_keras import MaxLayer 46 | from core import utils, keras_utils, image_utils, config_utils, const, config, data_utils 47 | from core.utils import Path as Pth 48 | 49 | logger = logging.getLogger(__name__) 50 | 51 | def test_tco(): 52 | pass -------------------------------------------------------------------------------- /core/const.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Constants for project. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import os 33 | import platform 34 | import numpy as np 35 | 36 | DL_FRAMEWORKS = np.array(['caffe', 'tensorflow', 'pytorch', 'keras', 'caffe2']) 37 | DL_FRAMEWORK = None 38 | GPU_CORE_ID = 0 39 | 40 | CNN_FEATURE_SIZES = np.array([2048, 2048, 1000, 1024, 1000, 2048, 2048]) 41 | CNN_FEATURE_TYPES = np.array(['fc6', 'fc7', 'fc1000', 'fc1024', 'fc365', 'prob', 'pool5', 'fc8a', 'res3b7', 'res4b35', 'res5c']) 42 | CNN_MODEL_TYPES = np.array(['resnet152', 'googlenet1k', 'vgg16', 'places365-resnet152', 'places365-vgg', 'googlenet13k']) 43 | RESIZE_TYPES = np.array(['resize', 'resize_crop', 'resize_crop_scaled', 'resize_keep_aspect_ratio_padded']) 44 | ROOT_PATH_TYPES = np.array(['data', 'project']) 45 | TRAIN_SCHEMES = np.array(['ete', 'tco']) 46 | MODEL_CLASSIFICATION_TYPES = np.array(['ml', 'sl']) 47 | MODEL_MULTISCALE_TYPES = np.array(['dl', 'ks']) 48 | SOLVER_NAMES = np.array(['adam', 'sgd']) 49 | DATASET_NAMES = np.array(['charades', 'kinetics400', 'breakfast_actions', 'you_cook_2', 'multi_thumos']) 50 | DATA_ROOT_PATH = './data' 51 | PROJECT_ROOT_PATH = '../' 52 | MACHINE_NAME = platform.node() 53 | -------------------------------------------------------------------------------- /core/metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Evaluation functions. 
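A usage sketch (shapes inferred from the code below, not from separate documentation): `map_charades(y_true, y_pred)` and `map_sklearn(y_true, y_pred)` expect numpy arrays of shape (n_samples, n_classes) holding binary ground-truth labels and prediction scores respectively, and return the mean average precision as a scalar, e.g. `m_ap = map_charades(y_true, y_pred)`.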
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import numpy as np 33 | from sklearn.metrics import average_precision_score 34 | 35 | def map_charades(y_true, y_pred): 36 | """ Returns mAP """ 37 | m_aps = [] 38 | n_classes = y_pred.shape[1] 39 | for oc_i in range(n_classes): 40 | pred_row = y_pred[:, oc_i] 41 | sorted_idxs = np.argsort(-pred_row) 42 | true_row = y_true[:, oc_i] 43 | tp = true_row[sorted_idxs] == 1 44 | fp = np.invert(tp) 45 | n_pos = tp.sum() 46 | if n_pos < 0.1: 47 | m_aps.append(float('nan')) 48 | continue 49 | f_pcs = np.cumsum(fp) 50 | t_pcs = np.cumsum(tp) 51 | prec = t_pcs / (f_pcs + t_pcs).astype(float) 52 | avg_prec = 0 53 | for i in range(y_pred.shape[0]): 54 | if tp[i]: 55 | avg_prec += prec[i] 56 | m_aps.append(avg_prec / n_pos.astype(float)) 57 | m_aps = np.array(m_aps) 58 | m_ap = np.mean(m_aps) 59 | return m_ap 60 | 61 | def map_sklearn(y_true, y_pred): 62 | # """ Returns mAP """ 63 | n_classes = y_true.shape[1] 64 | map = [average_precision_score(y_true[:, i], y_pred[:, i]) for i in range(n_classes)] 65 | map = np.nan_to_num(map) 66 | map = np.mean(map) 67 | return map 68 | 69 | def accuracy(y_true, y_pred): 70 | idx = np.argmax(y_pred, axis=1) 71 | n_items = len(y_true) 72 | accuracy = np.sum(idx == y_true) / float(n_items) 73 | return accuracy 74 | 75 | def acuracy_top_n(n_top, y_true, y_pred): 76 | n_corrects = 0 77 | for gt, pr in zip(y_true, y_pred): 78 | idx = np.argsort(pr)[::-1] 79 | idx = idx[0:n_top] 80 | gt = np.where(gt == 1)[0][0] 81 | if gt in idx: 82 | n_corrects += 1 83 | n = len(y_true) 84 | score = n_corrects / float(n) 85 | return score 86 | 87 | -------------------------------------------------------------------------------- /core/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Definition for all configuration options for training/testing Timeception model on various datasets. 
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | import sys 34 | 35 | from core.utils import AttrDict 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | __C = AttrDict() 40 | cfg = __C 41 | 42 | # region Misc 43 | 44 | __C.DEBUG = False # is debugging 45 | __C.NUM_GPUS = 1 # how many gups to use 46 | __C.LOG_PERIOD = 10 # log period 47 | __C.DATASET_NAME = str('') # name of dataset 48 | 49 | # endregion 50 | 51 | # region Model 52 | 53 | __C.MODEL = AttrDict() 54 | __C.MODEL.CLASSIFICATION_TYPE = str('') # either multi-label 'ml' or single-label 'sl' 55 | __C.MODEL.N_CLASSES = 157 # how many classes as output 56 | __C.MODEL.N_CHAMNNEL_GROUPS = 8 # how many channel groups 57 | __C.MODEL.N_TC_LAYERS = 4 # number of timeception layers 58 | __C.MODEL.N_TC_TIMESTEPS = 64 # how mant timesteps expected as input to the timeception layers 59 | __C.MODEL.N_INPUT_TIMESTEPS = 512 # how many timesteps (i.e. frames) expected as an input to the backbone CNN 60 | __C.MODEL.NAME = str('') # name suffex for the model to be trained 61 | __C.MODEL.BACKBONE_CNN = str('') # which backbone cnn is used 62 | __C.MODEL.BACKBONE_FEATURE = str('') # type of feature output from backbone cnn 63 | __C.MODEL.MULTISCALE_TYPE = str('') # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks" 64 | 65 | # endregion 66 | 67 | # region Train 68 | 69 | __C.TRAIN = AttrDict() 70 | __C.TRAIN.BATCH_SIZE = 64 # batch size for training 71 | __C.TRAIN.N_EPOCHS = 500 # how many training epochs 72 | __C.TRAIN.SCHEME = str('') # either 'ete' (end-to-end) or tco ('timeception-only') 73 | __C.TRAIN.N_WORKERS = 10 # 74 | 75 | # endregion 76 | 77 | # region Test 78 | 79 | __C.TEST = AttrDict() 80 | __C.TEST.BATCH_SIZE = 64 81 | __C.TEST.N_SAMPLES = 10 82 | 83 | # endregion 84 | 85 | # region Solver 86 | 87 | __C.SOLVER = AttrDict() 88 | __C.SOLVER.NAME = str('adam') 89 | __C.SOLVER.LR = 0.0001 90 | __C.SOLVER.ADAM_EPSILON = 1e-4 91 | __C.SOLVER.SGD_WEIGHT_DECAY = 0.0001 92 | __C.SOLVER.SGD_MOMENTUM = 0.9 93 | __C.SOLVER.SGD_NESTEROV = True 94 | 95 | # endregion 96 | -------------------------------------------------------------------------------- /nets/resnet_152_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | ResNet-152 fine-tuned on Charades. 
25 | https://github.com/gsig/charades-algorithms/tree/master/pytorch 26 | """ 27 | 28 | from __future__ import absolute_import 29 | from __future__ import division 30 | from __future__ import print_function 31 | from __future__ import unicode_literals 32 | 33 | import logging 34 | import warnings 35 | import os 36 | import random 37 | import sys 38 | import time 39 | import datetime 40 | import math 41 | import shutil 42 | import random 43 | 44 | import numpy as np 45 | import cv2 46 | import scipy.io 47 | import h5py 48 | from collections import OrderedDict 49 | 50 | from core import const as c, utils 51 | from core import image_utils 52 | 53 | logger = logging.getLogger(__name__) 54 | 55 | if c.DL_FRAMEWORK == 'tensorflow': 56 | import tensorflow as tf 57 | elif c.DL_FRAMEWORK == 'caffe': 58 | import caffe 59 | elif c.DL_FRAMEWORK == 'pytorch': 60 | import torch 61 | import torch.nn as nn 62 | import torch.nn.parallel 63 | import torch.backends.cudnn as cudnn 64 | import torch.distributed as dist 65 | import torchvision.models as tmodels 66 | import importlib 67 | elif c.DL_FRAMEWORK == 'keras': 68 | import tensorflow as tf 69 | import keras.backend as K 70 | 71 | def get_resnet_152_charades_model(): 72 | import torch 73 | import torch.nn as nn 74 | import torch.nn.parallel 75 | import torch.backends.cudnn as cudnn 76 | import torch.distributed as dist 77 | import torchvision.models as tmodels 78 | import importlib 79 | import torch.utils.model_zoo as model_zoo 80 | 81 | root_path = c.DATA_ROOT_PATH 82 | model_arch = 'resnet152' 83 | model_checkpoint_path = '%s/Charades/baseline_models/resnet_rgb.pth.tar' % (root_path) 84 | 85 | # load model 86 | print("=> creating model '{}'".format(model_arch)) 87 | model = tmodels.__dict__[model_arch](pretrained=False) 88 | cudnn.benchmark = True 89 | 90 | # load checkpoint 91 | checkpoint = torch.load(model_checkpoint_path) 92 | checkpoint = checkpoint['state_dict'] 93 | 94 | # fix keys of state dict 95 | unwanted_keys = ['fc.weight', 'fc.bias'] 96 | state_dict = OrderedDict() 97 | for k, v in checkpoint.iteritems(): 98 | key = k.replace('module.', '') 99 | if key not in unwanted_keys: 100 | state_dict[key] = v 101 | 102 | # remove fc and avgpool layers 103 | layers = model._modules.items() 104 | layers = list(layers)[:-2] 105 | layers = OrderedDict(layers) 106 | model = nn.Sequential(layers) 107 | 108 | # load the dictionary 109 | model.load_state_dict(state_dict) 110 | 111 | # if parrallize the model 112 | # model = torch.nn.DataParallel(model).cuda() 113 | 114 | # make sure it's only for testing 115 | model.train(False) 116 | 117 | # convert to eval model 118 | model.eval() 119 | 120 | # convert to gpu model 121 | model.cuda() 122 | 123 | return model 124 | 125 | def get_mean_std_for_resnet_152_pytorch_model(): 126 | img_mean = [0.485, 0.456, 0.406] 127 | img_std = [0.229, 0.224, 0.225] 128 | return img_mean, img_std 129 | -------------------------------------------------------------------------------- /nets/layers_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of 
the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Layers for pytorch. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import numpy as np 33 | import logging 34 | 35 | import torch 36 | from torch.nn import Module, Conv2d, Conv1d 37 | from torch.nn import functional as F 38 | 39 | from core import pytorch_utils 40 | 41 | logger = logging.getLogger(__name__) 42 | 43 | # region Basic Layers 44 | 45 | class ChannelShuffleLayer(Module): 46 | """ 47 | Shuffle the channels across groups. 48 | """ 49 | 50 | def __init__(self, n_channels, n_groups): 51 | super(ChannelShuffleLayer, self).__init__() 52 | 53 | n_channels_per_group = int(n_channels / n_groups) 54 | assert n_channels_per_group * n_groups == n_channels 55 | 56 | self.n_channels_per_group = n_channels_per_group 57 | self.n_groups = n_groups 58 | 59 | def forward(self, input): 60 | """ 61 | input shape (None, 1024, 20, 7, 7), or (BN, C, T, H, W) 62 | """ 63 | 64 | input_shape = input.size() 65 | n_samples, n_channels, n_timesteps, side_dim1, side_dim2 = input_shape 66 | 67 | n_groups = self.n_groups 68 | n_channels_per_group = self.n_channels_per_group 69 | 70 | tensor = input.view(n_samples, n_groups, n_channels_per_group, n_timesteps, side_dim1, side_dim2) 71 | tensor = tensor.permute(0, 2, 1, 3, 4, 5) 72 | tensor = tensor.contiguous() 73 | tensor = tensor.view(n_samples, n_channels, n_timesteps, side_dim1, side_dim2) 74 | 75 | return tensor 76 | 77 | # endregion 78 | 79 | # region Timeception Layers 80 | 81 | class DepthwiseConv1DLayer(Module): 82 | """ 83 | Shuffle the channels across groups. 84 | """ 85 | 86 | def __init__(self, input_shape, kernel_size, dilation, name): 87 | super(DepthwiseConv1DLayer, self).__init__() 88 | 89 | assert len(input_shape) == 5 90 | 91 | self.kernel_size = kernel_size 92 | self.dilation = dilation 93 | self._name = name 94 | 95 | n_channels = input_shape[1] 96 | n_timesteps = input_shape[2] 97 | 98 | # TODO: support using different dilation rates. 
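        # Note: with in_channels == out_channels == n_channels and groups=n_channels, the Conv1d below is a
        # depthwise convolution, i.e. each channel is convolved with its own temporal kernel; the padding value
        # returned by calc_padding_1d is intended to keep the temporal length unchanged ('same'-style padding).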
99 | padding = pytorch_utils.calc_padding_1d(n_timesteps, kernel_size) 100 | self.depthwise_conv1d = Conv1d(n_channels, n_channels, kernel_size, dilation=dilation, groups=n_channels, padding=padding) 101 | self.depthwise_conv1d._name = name 102 | 103 | def forward(self, input): 104 | """ 105 | input shape (None, 1024, 20, 7, 7), or (BN, C, T, H, W) 106 | """ 107 | 108 | input_shape = input.size() 109 | 110 | n, c, t, h, w = input_shape 111 | 112 | # transpose and reshape to hide the spatial dimension, only expose the temporal dimension for depthwise conv 113 | tensor = input.permute(0, 3, 4, 1, 2) # (None, 7, 7, 1024, 20) 114 | tensor = tensor.contiguous() 115 | tensor = tensor.view(-1, c, t) # (None*7*7, 1024, 20) 116 | 117 | # depthwise conv on the temporal dimension, as if it was the spatial dimension 118 | tensor = self.depthwise_conv1d(tensor) # (None*7*7, 1024, 20) 119 | 120 | # get timesteps after convolution 121 | t = tensor.size()[-1] 122 | 123 | # reshape to get the spatial dimensions 124 | tensor = tensor.view(n, h, w, c, t) # (None, 7, 7, 1024, 20) 125 | 126 | # finally, transpose to get the desired output shape 127 | tensor = tensor.permute(0, 3, 4, 1, 2) # (None, 1024, 20, 7, 7) 128 | 129 | return tensor 130 | 131 | # endregion 132 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Timeception for Complex Action Recognition 2 | 3 | ![Keras](./data/assets/badge-keras.png "Keras") ![Keras](./data/assets/badge-tensorflow.png "TensorFlow") ![Keras](./data/assets/badge-pytorch.png "PyTorch") 4 | 5 | This code repository is the implementation for the paper [Timeception for Complex Action Recognition](https://arxiv.org/abs/1812.01289). 6 | We provide the implementation for 3 different libraries: `keras`, `tensorflow` and `pytorch`. 7 | 8 | ![Timeception for Complex Action Recognition](./data/assets/timeception_layer.jpg "Timeception Block") 9 | 10 | ### Citation 11 | 12 | Please consider citing this work using this BibTeX entry 13 | 14 | ```bibtex 15 | @inproceedings{hussein2018timeception, 16 | title = {Timeception for Complex Action Recognition}, 17 | author = {Hussein, Noureldien and Gavves, Efstratios and Smeulders, Arnold WM}, 18 | booktitle = {CVPR}, 19 | year = {2019} 20 | } 21 | ``` 22 | 23 | ### How to Use? 24 | 25 | ###### Keras 26 | 27 | Using `keras`, we can define `timeception` as a sub-model. 28 | Then we use it along with another model definition. 29 | For example, here we define 4 `timeception` layers followed by a `dense` layer for classification. 
30 | 31 | ```python 32 | from keras import Model 33 | from keras.layers import Input, Dense 34 | from nets.layers_keras import MaxLayer 35 | from nets.timeception import Timeception 36 | 37 | # define the timeception layers 38 | timeception = Timeception(1024, n_layers=4) 39 | 40 | # define network for classification 41 | input = Input(shape=(128, 7, 7, 1024)) 42 | tensor = timeception(input) 43 | tensor = MaxLayer(axis=(1, 2, 3))(tensor) 44 | output = Dense(100, activation='softmax')(tensor) 45 | model = Model(inputs=input, outputs=output) 46 | model.summary() 47 | ``` 48 | 49 | This results in the model defined as: 50 | 51 | ``` 52 | Layer (type) Output Shape Param # 53 | ================================================ 54 | (InputLayer) (None, 128, 7, 7, 1024) 0 55 | (Timeception) (None, 8, 7, 7, 2480) 1494304 56 | (MaxLayer) (None, 2480) 0 57 | (Dense) (None, 100) 248100 58 | ================================================ 59 | Total params: 1,742,404 60 | ``` 61 | 62 | ###### TensorFlow 63 | 64 | Using `tensorflow`, we can define `timeception` as a list of nodes in the computational graph. 65 | Then we use it along with another model definition. 66 | For example, here a function defines 4 `timeception` layers. 67 | It takes the input tensor, feeds it forward to the `timeception` layers, and returns the output tensor `output`. 68 | 69 | ```python 70 | import tensorflow as tf 71 | from nets import timeception 72 | 73 | # define input tensor 74 | input = tf.placeholder(tf.float32, shape=(None, 128, 7, 7, 1024)) 75 | 76 | # feedforward the input to the timeception layers 77 | tensor = timeception.timeception_layers(input, n_layers=4) 78 | 79 | # the output is (?, 8, 7, 7, 2480) 80 | print (tensor.get_shape()) 81 | ``` 82 | 83 | ###### PyTorch 84 | 85 | Using `pytorch`, we can define `timeception` as a module. 86 | Then we use it along with another model definition. 87 | For example, here we define 4 `timeception` layers followed by a `dense` layer for classification. 88 | 89 | ```python 90 | import numpy as np 91 | import torch as T 92 | from nets import timeception_pytorch 93 | 94 | # define input tensor 95 | input = T.tensor(np.zeros((32, 1024, 128, 7, 7)), dtype=T.float32) 96 | 97 | # define 4 layers of timeception 98 | module = timeception_pytorch.Timeception(input.size(), n_layers=4) 99 | 100 | # feedforward the input to the timeception layers 101 | tensor = module(input) 102 | 103 | # the output is (32, 2480, 8, 7, 7) 104 | print (tensor.size()) 105 | ``` 106 | 107 | ### Installation 108 | 109 | We use Python 2.7.15, provided by Anaconda 4.6.2, and we depend on the following Python packages. 110 | - Keras 2.2.4 111 | - TensorFlow 1.10.1 112 | - PyTorch 1.0.1 113 | 114 | ### Training 115 | 116 | ### Testing 117 | 118 | ### Fine-tuning 119 | 120 | ### Pretrained Models 121 | 122 | #### Charades 123 | 124 | We will add all pretrained models for Charades by the end of April. 125 | For testing, start with the script `./scripts/test_charades_timeception.sh`. 126 | In order to change which baseline is used for testing, set the `--config_file` argument to one of the following options. 127 | 128 | ###### 2D-ResNet-152 129 | 130 | Timeception on top of 2D-ResNet-152 as backbone. 
131 | 132 | | Config File | Backbone | TC Layers | Frames | mAP (%) | Model | 133 | |---|:---:|:---:|:---:|:---:|:---:| 134 | | [charades_r2d_tc3_f32.yaml](./configs/charades_r2d_tc3_f32.yaml) | R2D | 3 | 32 | 30.37 | [Link](./data/charades/charades_r2d_tc3_f32.pkl) | 135 | | [charades_r2d_tc3_f64.yaml](./configs/charades_r2d_tc3_f64.yaml) | R2D | 3 | 64 | 31.25 | [Link](./data/charades/charades_r2d_tc3_f64.pkl) | 136 | | [charades_r2d_tc4_f128.yaml](./configs/charades_r2d_tc4_f128.yaml) | R2D | 4 | 128 | 31.82 | [Link](./data/charades/charades_r2d_tc4_f128.pkl) | 137 | 138 | ###### I3D 139 | 140 | Timeception on top of I3D as backbone. 141 | 142 | | Config File | Backbone | TC Layers | Frames | mAP (%) | Model | 143 | |---|:---:|:---:|:---:|:---:|:---:| 144 | | [charades_i3d_tc3_f256.yaml](./configs/charades_i3d_tc3_f256.yaml) | I3D | 3 | 256 | 33.89 | [Link](./data/charades/charades_i3d_tc3_f256.pkl) | 145 | | [charades_i3d_tc3_f512.yaml](./configs/charades_i3d_tc3_f512.yaml) | I3D | 3 | 512 | 35.46 | [Link](./data/charades/charades_i3d_tc3_f512.pkl) | 146 | | [charades_i3d_tc4_f1024.yaml](./configs/charades_i3d_tc4_f1024.yaml) | I3D | 4 | 1024 | 37.20 | [Link](./data/charades/charades_i3d_tc4_f1024.pkl) | 147 | 148 | ###### 3D-ResNet-100 149 | Timeception on top of 3D-ResNet-100 as backbone. 150 | 151 | 152 | | Config File | Backbone | TC Layers | Frames | mAP (%) | Model | 153 | |---|:---:|:---:|:---:|:---:|:---:| 154 | | [charades_r3d_tc4_f1024.yaml](./configs/charades_r3d_tc4_f1024.yaml) | R3D | 4 | 1024 | 41.1 | [Link](./data/charades/charades_r3d_tc4_f1024.pkl) | 155 | 156 | 157 | #### Kinetics 400 158 | 159 | We will add all pretrained models for Kinetics 400 by the end of June. 160 | 161 | ### License 162 | 163 | The code and the models in this repo are released under the GNU GPL v3.0 [LICENSE](LICENSE). 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /core/keras_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helper functions for keras. 
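These include JSON/weights save and load helpers (`save_model`, `load_model`), a TensorFlow/Keras implementation of the Charades mAP metric (`map_charades`), and a `SaveCallback` that writes the model definition and weights under ./data/<dataset_name>/models/<model_name>/ after every epoch.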
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import os 33 | import json 34 | import pydot 35 | import logging 36 | import numpy as np 37 | 38 | import tensorflow as tf 39 | from tensorflow.contrib import framework as tf_framework 40 | 41 | import keras.backend as K 42 | from keras.callbacks import Callback 43 | from keras.utils import vis_utils 44 | from keras.models import Sequential, model_from_json 45 | 46 | from core import config_utils 47 | 48 | logger = logging.getLogger(__name__) 49 | 50 | # region Constants 51 | 52 | EPS_VALUE = 1e-9 53 | LOSSES = ['categorical_crossentropy', 'mean_squared_error', 'mean_absolute_error', 'binary_crossentropy'] 54 | METRICS = ['accuracy', 'mean_squared_error', 'mean_absolute_error'] 55 | OPTIMIZERS = ['sgd', 'rmsprop', 'adam'] 56 | ACTIVATIONS = ['tanh', 'relu', 'sigmoid', 'softmax'] 57 | 58 | # endregion 59 | 60 | # region Functions 61 | 62 | def save_model_figure(model, file_path='/.model.eps'): 63 | vis_utils.plot_model(model, file_path, show_shapes=True, show_layer_names=True) 64 | 65 | def load_model(json_path, weight_path, metrics=None, loss=None, optimizer=None, custom_objects=None, is_compile=True): 66 | with open(json_path, 'r') as f: 67 | model_json_string = json.load(f) 68 | model_json_dict = json.loads(model_json_string) 69 | model = model_from_json(model_json_string, custom_objects=custom_objects) 70 | model.load_weights(weight_path) 71 | 72 | if is_compile: 73 | if optimizer is None: 74 | optimizer = model_json_dict['optimizer']['name'] 75 | 76 | if loss is None: 77 | loss = model_json_dict['loss'] 78 | 79 | if metrics is None: 80 | model.compile(loss=loss, optimizer=optimizer) 81 | else: 82 | model.compile(loss=loss, optimizer=optimizer, metrics=metrics) 83 | 84 | return model 85 | 86 | def save_model(model, json_path, weight_path): 87 | model.save_weights(weight_path, overwrite=True) 88 | model_json = model.to_json() 89 | with open(json_path, 'w') as f: 90 | json.dump(model_json, f) 91 | 92 | def layer_exist(model, layer_name): 93 | exist = False 94 | for layer in model.layers: 95 | if layer.name == layer_name: 96 | exist = True 97 | break 98 | 99 | return exist 100 | 101 | def calc_num_batches(n_samples, batch_size): 102 | n_batch = int(n_samples / float(batch_size)) 103 | n_batch = n_batch if n_samples % batch_size == 0 else n_batch + 1 104 | return n_batch 105 | 106 | # endregion 107 | 108 | # region Metrics 109 | 110 | def map_charades(y_true, y_pred): 111 | """ 112 | Returns mAP 113 | """ 114 | m_aps = [] 115 | 116 | tf_one = tf.constant(1, dtype=tf.float32) 117 | 118 | n_classes = y_pred.shape[1] 119 | for oc_i in range(n_classes): 120 | pred_row = y_pred[:, oc_i] 121 | sorted_idxs = tf_framework.argsort(-pred_row) 122 | true_row = y_true[:, oc_i] 123 | true_row = tf.map_fn(lambda i: true_row[i], sorted_idxs, dtype=np.float32) 124 | tp_poolean = tf.equal(true_row, tf_one) 125 | tp = tf.cast(tp_poolean, dtype=np.float32) 126 | fp = K.reverse(tp, axes=0) 127 | n_pos = tf.reduce_sum(tp) 128 | f_pcs = tf.cumsum(fp) 129 | t_pcs = tf.cumsum(tp) 130 | s = f_pcs + t_pcs 131 | 132 | s = tf.cast(s, tf.float32) 133 | t_pcs = tf.cast(t_pcs, tf.float32) 134 | tp_float = tf.cast(tp_poolean, np.float32) 135 | 136 | prec = t_pcs / s 137 | avg_prec = prec * tp_float 138 | 139 | n_pos = tf.cast(n_pos, tf.float32) 140 | avg_prec = avg_prec / n_pos 141 | avg_prec = tf.expand_dims(avg_prec, axis=0) 142 | 
m_aps.append(avg_prec) 143 | 144 | m_aps = K.concatenate(m_aps, axis=0) 145 | mAP = K.mean(m_aps) 146 | return mAP 147 | 148 | # endregion 149 | 150 | # region Callbacks 151 | 152 | class SaveCallback(Callback): 153 | def __init__(self, dataset_name, model_name): 154 | self.model_name = model_name 155 | 156 | model_root_path = './data/%s/models' % (dataset_name) 157 | assert os.path.exists(model_root_path) 158 | 159 | model_root_path = './data/%s/models/%s' % (dataset_name, model_name) 160 | if not os.path.exists(model_root_path): 161 | os.mkdir(model_root_path) 162 | 163 | self.model_root_path = model_root_path 164 | 165 | super(SaveCallback, self).__init__() 166 | 167 | def on_epoch_end(self, idx_epoch, logs=None): 168 | """ 169 | Save the model. 170 | """ 171 | 172 | epoch_num = idx_epoch + 1 173 | self.__save(epoch_num) 174 | 175 | def __save(self, epoch_num): 176 | model_root_path = self.model_root_path 177 | model = self.model 178 | 179 | # hfpy accept only strings as a path 180 | model_json_path = str('%s/%03d.json' % (model_root_path, epoch_num)) 181 | model_weight_path = str('%s/%03d.pkl' % (model_root_path, epoch_num)) 182 | 183 | # save model definition as json, and save model weights 184 | model.save_weights(model_weight_path, overwrite=True) 185 | model_json = model.to_json() 186 | with open(model_json_path, 'w') as f: 187 | json.dump(model_json, f) 188 | 189 | # endregion 190 | -------------------------------------------------------------------------------- /nets/i3d_torch_charades_utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import os 3 | import random 4 | import sys 5 | import time 6 | import datetime 7 | import math 8 | import shutil 9 | import random 10 | import threading 11 | 12 | import numpy as np 13 | import cv2 14 | import scipy.io 15 | import h5py 16 | from optparse import OptionParser 17 | from collections import OrderedDict 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.nn.parallel 22 | import torch.backends.cudnn as cudnn 23 | import torch.distributed as dist 24 | import torchvision.models as tmodels 25 | import importlib 26 | import torchsummary 27 | from core import pytorch_utils 28 | import torch.nn.functional as F 29 | import torch.optim as optim 30 | from torch.optim import lr_scheduler 31 | from torch.autograd import Variable 32 | 33 | import torchvision 34 | from torchvision import datasets, transforms 35 | 36 | from core import const as c, utils 37 | from core import image_utils 38 | from nets import i3d_torch_charades_test 39 | 40 | def extract_features_rgb(): 41 | from core import config_utils 42 | 43 | is_local = config_utils.is_local_machine() 44 | if is_local: 45 | begin_num = None 46 | end_num = None 47 | else: 48 | parser = OptionParser() 49 | parser.add_option("-b", "--begin_num", dest="begin_num", help="begin_num") 50 | parser.add_option("-e", "--end_num", dest="end_num", help="end_num") 51 | parser.add_option("-c", "--gpu_core_id", dest="gpu_core_id", help="gpu_core_id") 52 | (options, args) = parser.parse_args() 53 | begin_num = int(options.begin_num) 54 | end_num = int(options.end_num) 55 | gpu_core_id = int(options.gpu_core_id) 56 | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_core_id) 57 | 58 | __extract_features_rgb(begin_num, end_num) 59 | 60 | def load_model_i3d_charades_rgb_for_testing(model_path): 61 | import torch 62 | from nets.i3d_torch_charades_test import InceptionI3d 63 | 64 | # setup the model 65 | state_dict = torch.load(model_path) 66 | 
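    # build the I3D architecture, swap its logits layer for the 157 Charades classes, load the fine-tuned
    # weights from the checkpoint, then switch the model to inference mode on the GPU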
model = InceptionI3d() 67 | model.replace_logits(157) 68 | model.load_state_dict(state_dict) 69 | model.train(False) 70 | model.eval() 71 | model.cuda() 72 | return model 73 | 74 | def __extract_features_rgb(begin_num=None, end_num=None): 75 | root_path = c.DATA_ROOT_PATH 76 | annotation_path = '%s/Charades/annotation/frames_dict_trimmed_multi_label_i3d_160_frames.pkl' % (root_path) 77 | features_root_path = '%s/Charades/features_i3d_charades_rgb_mixed_5c_trimmed_20_frames' % (root_path) 78 | video_frames_root_path = '%s/Charades/frames/Charades_v1_rgb' % (root_path) 79 | model_path = '%s/Charades/baseline_models/i3d/rgb_charades.pt' % (root_path) 80 | feature_name = 'Mixed_5c' 81 | 82 | (video_frames_dict_tr, video_frames_dict_te) = utils.pkl_load(annotation_path) 83 | video_frames_dict = dict() 84 | video_frames_dict.update(video_frames_dict_tr) 85 | video_frames_dict.update(video_frames_dict_te) 86 | video_names = video_frames_dict.keys() 87 | 88 | n_videos = len(video_names) 89 | frame_count = 0 90 | 91 | if not os.path.exists(features_root_path): 92 | print('Sorry, path does not exist: %s' % (features_root_path)) 93 | return 94 | 95 | t1 = time.time() 96 | print('extracting training features') 97 | print('start time: %s' % utils.timestamp()) 98 | 99 | # aync reader, and get load images for the first video 100 | img_reader = image_utils.AsyncImageReaderCharadesForI3DTorchModel(n_threads=20) 101 | img_reader.load_imgs_in_batch(__get_video_frame_pathes(video_names[0], video_frames_root_path, video_frames_dict)) 102 | 103 | # load the model 104 | model = __load_i3d_model_rgb(model_path) 105 | torchsummary.summary(model, input_size=(3, 160, 224, 224)) 106 | 107 | # loop on list of videos 108 | for idx_video in range(n_videos): 109 | video_num = idx_video + 1 110 | 111 | if begin_num is not None and end_num is not None: 112 | if video_num <= begin_num or video_num > end_num: 113 | continue 114 | 115 | video_name = video_names[idx_video] 116 | 117 | # wait untill the image_batch is loaded 118 | t1 = time.time() 119 | while img_reader.is_busy(): 120 | threading._sleep(0.1) 121 | t2 = time.time() 122 | duration_waited = t2 - t1 123 | print('...... video %d/%d: %s, waited: %d' % (video_num, n_videos, video_name, duration_waited)) 124 | 125 | # get the video frames 126 | video_frames = img_reader.get_images() 127 | 128 | # pre-load for the next video 129 | if video_num < n_videos: 130 | next_video_name = video_names[idx_video + 1] 131 | img_reader.load_imgs_in_batch(__get_video_frame_pathes(next_video_name, video_frames_root_path, video_frames_dict)) 132 | 133 | video_features_path = '%s/%s.pkl' % (features_root_path, video_name) 134 | # if os.path.exists(video_features_path): 135 | # print ('... features for video already exist: %s.pkl' % (video_name)) 136 | # continue 137 | 138 | if len(video_frames) != 160: 139 | print('... 
wrong n frames: %d' % (video_num)) 140 | continue 141 | 142 | # transpose to have the channel_first (160, 224, 224, 3) => (3, 160, 224, 224) 143 | video_frames = np.transpose(video_frames, (3, 0, 1, 2)) 144 | 145 | # add one dimension to represent the batch size 146 | video_frames = np.expand_dims(video_frames, axis=0) 147 | 148 | # prepare input variable 149 | with torch.no_grad(): 150 | # extract features 151 | input_var = torch.from_numpy(video_frames).cuda() 152 | output_var = model(input_var) 153 | output_var = output_var.cpu() 154 | features = output_var.data.numpy() # (1, 1024, 20, 7, 7) 155 | 156 | # don't forget to clean up variables 157 | del input_var 158 | del output_var 159 | 160 | # squeeze to remove the dimension of the batch_size 161 | features = features[0] # (1024, 20, 7, 7) 162 | 163 | # transpose to have the channel_last 164 | features = np.transpose(features, (1, 2, 3, 0)) # (20, 7, 7, 1024) 165 | 166 | # path to save the features 167 | utils.pkl_dump(features, video_features_path, is_highest=True) 168 | 169 | # increment counts 170 | frame_count += len(video_frames) 171 | 172 | t2 = time.time() 173 | print('finish extracting %d features in %d seconds' % (frame_count, t2 - t1)) 174 | print('end time: %s' % utils.timestamp()) 175 | 176 | def __get_video_frame_pathes(video_name, video_frames_root_path, video_frames_dict): 177 | video_frame_names = video_frames_dict[video_name] 178 | video_frame_pathes = [('%s/%s/%s') % (video_frames_root_path, video_name, n) for n in video_frame_names] 179 | video_frame_pathes = np.array(video_frame_pathes) 180 | return video_frame_pathes 181 | 182 | def __load_i3d_model_rgb(model_path): 183 | # setup the model 184 | state_dict = torch.load(model_path) 185 | model = i3d_torch_charades_test.InceptionI3d() 186 | model.replace_logits(157) 187 | model.load_state_dict(state_dict) 188 | model.cuda() 189 | model.train(True) 190 | return model 191 | 192 | if __name__ == '__main__': 193 | print('Hello World!') 194 | extract_features_rgb() 195 | -------------------------------------------------------------------------------- /core/config_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Configurations for project. 
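A minimal usage sketch: `cfg_from_file('./configs/charades_i3d_tc4_f1024.yaml')` merges a YAML file into the default options defined in `core.config`, and (unless called with `is_check=False`) runs `cfg_sanity_check()` to validate the chosen values against the allowed constants in `core.const`.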
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import os 33 | import platform 34 | import argparse 35 | import logging 36 | import yaml 37 | import pprint 38 | from ast import literal_eval 39 | 40 | from core.config import __C 41 | from core.utils import AttrDict 42 | from core import const, config, utils 43 | 44 | logger = logging.getLogger(__name__) 45 | 46 | # region Misc 47 | 48 | def get_machine_name(): 49 | return platform.node() 50 | 51 | def import_dl_platform(): 52 | if const.DL_FRAMEWORK == 'tensorflow': 53 | import tensorflow as tf 54 | elif const.DL_FRAMEWORK == 'pytorch': 55 | import torch 56 | elif const.DL_FRAMEWORK == 'caffe': 57 | import caffe 58 | elif const.DL_FRAMEWORK == 'keras': 59 | import keras.backend as K 60 | 61 | # endregion 62 | 63 | # region Config GPU 64 | 65 | def config_gpu(): 66 | if const.DL_FRAMEWORK == 'tensorflow': 67 | __config_gpu_for_tensorflow() 68 | elif const.DL_FRAMEWORK == 'pytorch': 69 | __config_gpu_for_pytorch() 70 | elif const.DL_FRAMEWORK == 'keras': 71 | __config_gpu_for_keras() 72 | elif const.DL_FRAMEWORK == 'caffe': 73 | __config_gpu_for_caffe() 74 | 75 | def __config_gpu_for_tensorflow(): 76 | import tensorflow as tf 77 | 78 | gpu_core_id = __parse_gpu_id() 79 | 80 | # import os 81 | # import tensorflow as tf 82 | # set the logging level of tensorflow 83 | # 1: filter out INFO 84 | # 2: filter out WARNING 85 | # 3: filter out ERROR 86 | # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # or any {'0', '1', '2'} 87 | 88 | # set which device to be used 89 | const.GPU_CORE_ID = gpu_core_id 90 | pass 91 | 92 | def __config_gpu_for_keras(): 93 | import tensorflow as tf 94 | import keras.backend as K 95 | 96 | gpu_core_id = __parse_gpu_id() 97 | 98 | K.clear_session() 99 | config = tf.ConfigProto() 100 | config.gpu_options.visible_device_list = str(gpu_core_id) 101 | config.gpu_options.allow_growth = True 102 | session = tf.Session(config=config) 103 | K.set_session(session) 104 | 105 | # set which device to be used 106 | const.GPU_CORE_ID = gpu_core_id 107 | 108 | def __config_gpu_for_pytorch(): 109 | import torch 110 | 111 | gpu_core_id = __parse_gpu_id() 112 | 113 | torch.cuda.set_device(gpu_core_id) 114 | 115 | # set which device to be used 116 | const.GPU_CORE_ID = gpu_core_id 117 | 118 | def __config_gpu_for_caffe(): 119 | import os 120 | 121 | gpu_core_id = __parse_gpu_id() 122 | 123 | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_core_id) 124 | 125 | # set which device to be used 126 | const.GPU_CORE_ID = gpu_core_id 127 | 128 | def __parse_gpu_id(): 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument('-c', '--gpu_core_id', default='-1', type=int) 131 | args = parser.parse_args() 132 | gpu_core_id = args.gpu_core_id 133 | return gpu_core_id 134 | 135 | # endregion 136 | 137 | # region Config File Helpers 138 | 139 | def cfg_print_cfg(): 140 | logger.info('Config file is:') 141 | logger.info(pprint.pformat(__C)) 142 | 143 | def cfg_merge_dicts(dict_a, dict_b): 144 | from ast import literal_eval 145 | 146 | for key, value in dict_a.items(): 147 | if key not in dict_b: 148 | raise KeyError('Invalid key in config file: {}'.format(key)) 149 | if type(value) is dict: 150 | dict_a[key] = value = AttrDict(value) 151 | if isinstance(value, str): 152 | try: 153 | value = literal_eval(value) 154 | except BaseException: 155 | pass 156 | # the types must match, too 157 | old_type = 
type(dict_b[key]) 158 | if old_type is not type(value) and value is not None: 159 | raise ValueError('Type mismatch ({} vs. {}) for config key: {}'.format(type(dict_b[key]), type(value), key)) 160 | # recursively merge dicts 161 | if isinstance(value, AttrDict): 162 | try: 163 | cfg_merge_dicts(dict_a[key], dict_b[key]) 164 | except BaseException: 165 | raise Exception('Error under config key: {}'.format(key)) 166 | else: 167 | dict_b[key] = value 168 | 169 | def cfg_from_file(file_path, is_check=True): 170 | """ 171 | Load a config file and merge it into the default options. 172 | """ 173 | 174 | # read from file 175 | yaml_config = utils.yaml_load(file_path) 176 | 177 | # merge to project config 178 | cfg_merge_dicts(yaml_config, __C) 179 | 180 | # make sure everything is okay 181 | if is_check: 182 | cfg_sanity_check() 183 | 184 | def cfg_from_attrdict(attr_dict): 185 | cfg_merge_dicts(attr_dict, __C) 186 | 187 | def cfg_from_dict(args_dict): 188 | """Set config keys via list (e.g., from command line).""" 189 | 190 | for key, value in args_dict.iteritems(): 191 | key_list = key.split('.') 192 | cfg = __C 193 | for subkey in key_list[:-1]: 194 | assert subkey in cfg, 'Config key {} not found'.format(subkey) 195 | cfg = cfg[subkey] 196 | subkey = key_list[-1] 197 | if subkey not in cfg: 198 | raise Exception('Config key {} not found'.format(subkey)) 199 | try: 200 | # handle the case when v is a string literal 201 | val = literal_eval(value) 202 | except BaseException: 203 | val = value 204 | if isinstance(val, type(cfg[subkey])) or cfg[subkey] is None: 205 | pass 206 | else: 207 | type1 = type(val) 208 | type2 = type(cfg[subkey]) 209 | msg = 'type {} does not match original type {}'.format(type1, type2) 210 | raise Exception(msg) 211 | cfg[subkey] = val 212 | 213 | def cfg_from_list(args_list): 214 | """ 215 | Set config keys via list (e.g., from command line). 
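    The list alternates dotted key paths and values; each value is parsed with
    ast.literal_eval when possible and must match the type of the default value
    it overrides. A minimal sketch (keys and values are illustrative only):

        cfg_from_list(['TRAIN.BATCH_SIZE', '32', 'SOLVER.LR', '0.001'])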
216 | """ 217 | from ast import literal_eval 218 | 219 | assert len(args_list) % 2 == 0, 'Specify values or keys for args' 220 | for key, value in zip(args_list[0::2], args_list[1::2]): 221 | key_list = key.split('.') 222 | cfg = __C 223 | for subkey in key_list[:-1]: 224 | assert subkey in cfg, 'Config key {} not found'.format(subkey) 225 | cfg = cfg[subkey] 226 | subkey = key_list[-1] 227 | assert subkey in cfg, 'Config key {} not found'.format(subkey) 228 | try: 229 | # handle the case when v is a string literal 230 | val = literal_eval(value) 231 | except BaseException: 232 | val = value 233 | msg = 'type {} does not match original type {}'.format(type(val), type(cfg[subkey])) 234 | assert isinstance(val, type(cfg[subkey])) or cfg[subkey] is None, msg 235 | cfg[subkey] = val 236 | 237 | def cfg_sanity_check(): 238 | assert __C.TRAIN.SCHEME in const.TRAIN_SCHEMES 239 | assert __C.MODEL.CLASSIFICATION_TYPE in const.MODEL_CLASSIFICATION_TYPES 240 | assert __C.MODEL.MULTISCALE_TYPE in const.MODEL_MULTISCALE_TYPES 241 | assert __C.SOLVER.NAME in const.SOLVER_NAMES 242 | assert __C.DATASET_NAME in const.DATASET_NAMES 243 | 244 | # endregion 245 | -------------------------------------------------------------------------------- /experiments/train_keras.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Train Timeception layers on different datasets. There are two different ways to train Timeception. 25 | 1. Timeception-only (TCO): only timeception layers are trained, using features extracted from backbone CNNs. 26 | 2. End-to-end (ETE): timeception is trained on top of backbone CNN. The input is video frames passed throughtout the backboneCNN 27 | and then the resulted feature is fed to Timeception layers. Here, you enjoy all the benefits of end-to-end training. 28 | For example, do pre-processing to the input frames, randomly sample the frames, temporal jittering, ...., etc. 
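The scheme is selected by the TRAIN.SCHEME key of the YAML config file passed to this script ('tco' trains Timeception-only, otherwise the end-to-end scheme is used); see __main() below. A fragment of such a config (a sketch, not a complete file):

    TRAIN:
      SCHEME: 'tco'
      N_EPOCHS: 100
      BATCH_SIZE: 32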
29 | """ 30 | 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | from __future__ import unicode_literals 35 | 36 | import logging 37 | import os 38 | import datetime 39 | import numpy as np 40 | from optparse import OptionParser 41 | 42 | import tensorflow as tf 43 | import keras.backend as K 44 | from keras.layers import Dense, LeakyReLU, Dropout, Input, Activation, BatchNormalization 45 | from keras.optimizers import SGD, Adam 46 | from keras.models import Model 47 | 48 | from nets import timeception 49 | from nets.layers_keras import MaxLayer 50 | from core import utils, keras_utils, image_utils, config_utils, const, config, data_utils 51 | from core.utils import Path as Pth 52 | 53 | logger = logging.getLogger(__name__) 54 | 55 | def train_tco(): 56 | """ 57 | Train Timeception layers based on the given configurations. 58 | This train scheme is Timeception-only (TCO). 59 | """ 60 | 61 | # get some configs for the training 62 | n_workers = config.cfg.TRAIN.N_WORKERS 63 | n_epochs = config.cfg.TRAIN.N_EPOCHS 64 | dataset_name = config.cfg.DATASET_NAME 65 | model_name = '%s_%s' % (config.cfg.MODEL.NAME, utils.timestamp()) 66 | 67 | # data generators 68 | data_generator_tr = __define_data_generator(is_training=True) 69 | data_generator_te = __define_data_generator(is_training=False) 70 | 71 | logger.info('--- start time') 72 | logger.info(datetime.datetime.now()) 73 | logger.info('... [tr]: n_samples, n_batch, batch_size: %d, %d, %d' % (data_generator_tr.n_samples, data_generator_tr.n_batches, config.cfg.TRAIN.BATCH_SIZE)) 74 | logger.info('... [te]: n_samples, n_batch, batch_size: %d, %d, %d' % (data_generator_te.n_samples, data_generator_te.n_batches, config.cfg.TEST.BATCH_SIZE)) 75 | 76 | # callback to save the model 77 | save_callback = keras_utils.SaveCallback(dataset_name, model_name) 78 | 79 | # load model 80 | model = __define_timeception_model() 81 | logger.info(model.summary()) 82 | 83 | # train the model 84 | model.fit_generator(epochs=n_epochs, generator=data_generator_tr, validation_data=data_generator_te, use_multiprocessing=True, workers=n_workers, callbacks=[save_callback], verbose=2) 85 | 86 | logger.info('--- finish time') 87 | logger.info(datetime.datetime.now()) 88 | 89 | def train_ete(): 90 | """ 91 | Train Timeception layers based on the given configurations. 92 | This train scheme is End-to-end (ETE). 93 | """ 94 | 95 | model = __define_timeception_model() 96 | 97 | raise Exception('Sorry, not implemented yet!') 98 | 99 | def __define_data_generator(is_training): 100 | """ 101 | Define data generator. 
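    The generator class is chosen by the dataset name, and it is given the
    feature folder name and feature shape derived from the config. For example
    (illustrative values, assuming an 'i3d_rgb' backbone with 'mixed_5c'
    features and 256 timesteps):

        feature_name = 'features_i3d_rgb_mixed_5c_256f'
        feature_dim  = (256, 7, 7, 1024)   # (T, H, W, C)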
102 | """ 103 | 104 | # get some configs for the training 105 | n_classes = config.cfg.MODEL.N_CLASSES 106 | dataset_name = config.cfg.DATASET_NAME 107 | backbone_model_name = config.cfg.MODEL.BACKBONE_CNN 108 | backbone_feature_name = config.cfg.MODEL.BACKBONE_FEATURE 109 | n_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS 110 | 111 | batch_size_tr = config.cfg.TRAIN.BATCH_SIZE 112 | batch_size_te = config.cfg.TEST.BATCH_SIZE 113 | batch_size = batch_size_tr if is_training else batch_size_te 114 | 115 | # size and name of feature 116 | feature_name = 'features_%s_%s_%sf' % (backbone_model_name, backbone_feature_name, n_timesteps) 117 | c, h, w = utils.get_model_feat_maps_info(backbone_model_name, backbone_feature_name) 118 | feature_dim = (n_timesteps, h, w, c) 119 | 120 | # data generators 121 | params = {'batch_size': batch_size, 'n_classes': n_classes, 'feature_name': feature_name, 'feature_dim': feature_dim, 'is_shuffle': True, 'is_training': is_training} 122 | data_generator_class = data_utils.KERAS_DATA_GENERATORS_DICT[dataset_name] 123 | data_generator = data_generator_class(**params) 124 | 125 | return data_generator 126 | 127 | def __define_timeception_model(): 128 | """ 129 | Define Timeception classifier. 130 | """ 131 | 132 | # some configurations for the model 133 | classification_type = config.cfg.MODEL.CLASSIFICATION_TYPE 134 | solver_name = config.cfg.SOLVER.NAME 135 | solver_lr = config.cfg.SOLVER.LR 136 | adam_epsilon = config.cfg.SOLVER.ADAM_EPSILON 137 | n_tc_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS 138 | backbone_name = config.cfg.MODEL.BACKBONE_CNN 139 | feature_name = config.cfg.MODEL.BACKBONE_FEATURE 140 | n_tc_layers = config.cfg.MODEL.N_TC_LAYERS 141 | n_classes = config.cfg.MODEL.N_CLASSES 142 | is_dilated = config.cfg.MODEL.MULTISCALE_TYPE 143 | n_channels_in, channel_h, channel_w = utils.get_model_feat_maps_info(backbone_name, feature_name) 144 | n_groups = int(n_channels_in / 128.0) 145 | 146 | # optimizer and loss for either multi-label "ml" or single-label "sl" classification 147 | if classification_type == 'ml': 148 | loss = keras_utils.LOSSES[3] 149 | output_activation = keras_utils.ACTIVATIONS[2] 150 | metric_function = keras_utils.map_charades 151 | else: 152 | loss = keras_utils.LOSSES[0] 153 | output_activation = keras_utils.ACTIVATIONS[3] 154 | metric_function = keras_utils.METRICS[0] 155 | 156 | # define the optimizer 157 | optimizer = SGD(lr=0.01) if solver_name == 'sgd' else Adam(lr=solver_lr, epsilon=adam_epsilon) 158 | 159 | # input layer 160 | input_shape = (n_tc_timesteps, channel_h, channel_w, n_channels_in) # (T, H, W, C) 161 | tensor_input = Input(shape=input_shape, name='input') # (T, H, W, C) 162 | 163 | # define timeception layers, as a standalone module 164 | timeception_module = timeception.Timeception(n_channels_in, n_tc_layers, n_groups, is_dilated=is_dilated) 165 | tensor = timeception_module(tensor_input) # (T, H, W, C) 166 | 167 | # but if you fancy, you can define timeception layers as a series of layers 168 | # tensor = timeception.timeception_layers(tensor_input, n_tc_layers, n_groups, is_dilated=is_dilated) # (T, H, W, C) 169 | 170 | # max-pool over space-time 171 | tensor = MaxLayer(axis=(1, 2, 3), name='maxpool_t_s')(tensor) 172 | 173 | # dense layers for classification 174 | tensor = Dropout(0.5)(tensor) 175 | tensor = Dense(512)(tensor) 176 | tensor = BatchNormalization()(tensor) 177 | tensor = LeakyReLU(alpha=0.2)(tensor) 178 | tensor = Dropout(0.25)(tensor) 179 | tensor = Dense(n_classes)(tensor) 180 | tensor_output = 
Activation(output_activation)(tensor) 181 | 182 | # define the model 183 | model = Model(inputs=tensor_input, outputs=tensor_output) 184 | model.compile(loss=loss, optimizer=optimizer, metrics=[metric_function]) 185 | 186 | return model 187 | 188 | def __main(): 189 | """ 190 | Run this script to train Timeception. 191 | """ 192 | 193 | default_config_file = 'charades_i3d_tc4_f1024.yaml' 194 | default_config_file = 'charades_i3d_tc2_f256.yaml' 195 | 196 | # Parse the arguments 197 | parser = OptionParser() 198 | parser.add_option('-c', '--config_file', dest='config_file', default=default_config_file, help='Yaml config file that contains all training details.') 199 | (options, args) = parser.parse_args() 200 | config_file = options.config_file 201 | 202 | # check if exist 203 | if config_file is None or config_file == '': 204 | msg = 'Config file not passed, default config is used: %s' % (config_file) 205 | logging.warning(msg) 206 | config_file = default_config_file 207 | 208 | # path of config file 209 | config_path = './configs/%s' % (config_file) 210 | 211 | # check if file exist 212 | if not os.path.exists(config_path): 213 | msg = 'Sorry, could not find config file with the following path: %s' % (config_path) 214 | logging.error(msg) 215 | else: 216 | # read the config from file and copy it to the project configuration "cfg" 217 | config_utils.cfg_from_file(config_path) 218 | 219 | # choose which training scheme, either 'ete' or 'tco' 220 | training_scheme = config.cfg.TRAIN.SCHEME 221 | 222 | # start training 223 | if training_scheme == 'tco': 224 | train_tco() 225 | else: 226 | train_ete() 227 | 228 | if __name__ == '__main__': 229 | __main() 230 | -------------------------------------------------------------------------------- /core/pytorch_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helper functions for pytorch. 
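Includes model save/load helpers, "same"-padding calculators, a custom summary() that prints custom layer names, and a ModelSaver class. As a worked example of calc_padding_1d (which solves o = i in the conv equation o = [i + 2p - k - (k-1)(d-1)] / s + 1 for stride s = 1): a kernel of size k = 3 with dilation d = 1 needs padding p = 1, and k = 7 with d = 1 needs p = 3.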
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import os 33 | import logging 34 | import json 35 | import numpy as np 36 | from collections import OrderedDict 37 | 38 | import torch 39 | from torch import nn 40 | from torch.nn import functional as F 41 | from torch.autograd import Variable 42 | 43 | import torchviz 44 | import torchvision 45 | import torchsummary 46 | 47 | logger = logging.getLogger(__name__) 48 | 49 | # region Helpers 50 | 51 | def save_model(model, path): 52 | model.save_state_dict(path) 53 | 54 | def load_model(model, path): 55 | model_dict = torch.load(path) 56 | model.load_state_dict(model_dict) 57 | 58 | def padding1d(tensor, filter): 59 | it, = tensor.shape[2:] 60 | ft = filter 61 | 62 | pt = max(0, (it - 1) + (ft - 1) + 1 - it) 63 | oddt = (pt % it != 0) 64 | 65 | mode = str('constant') 66 | if any([oddt]): 67 | pad = [0, int(oddt)] 68 | tensor = F.pad(tensor, pad, mode=mode) 69 | 70 | padding = (pt // it,) 71 | return tensor, padding 72 | 73 | def padding3d(tensor, filter, mode=str('constant')): 74 | """ 75 | Input shape (BN, C, T, H, W) 76 | """ 77 | 78 | it, ih, iw = tensor.shape[2:] 79 | ft, fh, fw = filter.shape 80 | 81 | pt = max(0, (it - 1) + (ft - 1) + 1 - it) 82 | ph = max(0, (ih - 1) + (fh - 1) + 1 - ih) 83 | pw = max(0, (iw - 1) + (fw - 1) + 1 - iw) 84 | 85 | oddt = (pt % 2 != 0) 86 | oddh = (ph % 2 != 0) 87 | oddw = (pw % 2 != 0) 88 | 89 | if any([oddt, oddh, oddw]): 90 | pad = [0, int(oddt), 0, int(oddh), 0, int(oddw)] 91 | tensor = F.pad(tensor, pad, mode=mode) 92 | 93 | padding = (pt // 2, ph // 2, pw // 2) 94 | tensor = F.conv3d(tensor, filter, padding=padding) 95 | 96 | return tensor 97 | 98 | def calc_padding_1d(input_size, kernel_size, stride=1, dilation=1): 99 | """ 100 | Calculate the padding. 101 | """ 102 | 103 | # i = input 104 | # o = output 105 | # p = padding 106 | # k = kernel_size 107 | # s = stride 108 | # d = dilation 109 | # the equation is 110 | # o = [i + 2 * p - k - (k - 1) * (d - 1)] / s + 1 111 | # give that we want i = o, then we solve the equation for p gives us 112 | 113 | i = input_size 114 | s = stride 115 | k = kernel_size 116 | d = dilation 117 | 118 | padding = 0.5 * (k - i + s * (i - 1) + (k - 1) * (d - 1)) 119 | padding = int(padding) 120 | 121 | return padding 122 | 123 | def summary(model, input_size, batch_size=-1, device="cuda"): 124 | """ 125 | Custom summary function, to print the custom name of module, instead of the assigned layer name. 
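    A typical call (a sketch; the input size assumes I3D 'mixed_5c' features
    with 32 timesteps, i.e. shape (C, T, H, W) without the batch dimension):

        summary(model, (1024, 32, 7, 7), batch_size=2, device='cuda')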
126 | :param model: 127 | :param input_size: 128 | :param batch_size: 129 | :param device: 130 | :return: 131 | """ 132 | 133 | # this has to be imported here, not to create import-loop between "nets.layers_pytorch" and "core.pytorch_utils" 134 | from nets.layers_pytorch import DepthwiseConv1DLayer 135 | 136 | def register_hook(module): 137 | 138 | def hook(module, input, output): 139 | 140 | # old code 141 | # class_name = str(module.__class__).split(".")[-1].split("'")[0] 142 | # m_key = "%s-%i" % (class_name, module_idx + 1) 143 | 144 | # don't consider this layer 145 | if type(module) == DepthwiseConv1DLayer: 146 | return 147 | 148 | # new code 149 | if hasattr(module, '_name'): 150 | m_key = str(module._name) 151 | else: 152 | module_idx = len(summary) 153 | class_name = str(module.__class__).split(".")[-1].split("'")[0] 154 | m_key = "%s-%i" % (class_name, module_idx + 1) 155 | 156 | summary[m_key] = OrderedDict() 157 | summary[m_key]["input_shape"] = list(input[0].size()) 158 | summary[m_key]["input_shape"][0] = batch_size 159 | if isinstance(output, (list, tuple)): 160 | summary[m_key]["output_shape"] = [ 161 | [-1] + list(o.size())[1:] for o in output 162 | ] 163 | else: 164 | summary[m_key]["output_shape"] = list(output.size()) 165 | summary[m_key]["output_shape"][0] = batch_size 166 | 167 | params = 0 168 | if hasattr(module, "weight") and hasattr(module.weight, "size"): 169 | params += torch.prod(torch.LongTensor(list(module.weight.size()))) 170 | summary[m_key]["trainable"] = module.weight.requires_grad 171 | if hasattr(module, "bias") and hasattr(module.bias, "size"): 172 | params += torch.prod(torch.LongTensor(list(module.bias.size()))) 173 | summary[m_key]["nb_params"] = params 174 | 175 | if (not isinstance(module, nn.Sequential) and not isinstance(module, nn.ModuleList) and not (module == model)): 176 | hooks.append(module.register_forward_hook(hook)) 177 | 178 | device = device.lower() 179 | assert device in [ 180 | "cuda", 181 | "cpu", 182 | ], "Input device is not valid, please specify 'cuda' or 'cpu'" 183 | 184 | if device == "cuda" and torch.cuda.is_available(): 185 | dtype = torch.cuda.FloatTensor 186 | else: 187 | dtype = torch.FloatTensor 188 | 189 | # multiple inputs to the network 190 | if isinstance(input_size, tuple): 191 | input_size = [input_size] 192 | 193 | # batch_size of 2 for batchnorm 194 | x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size] 195 | # print(type(x[0])) 196 | 197 | # create properties 198 | summary = OrderedDict() 199 | hooks = [] 200 | 201 | # register hook 202 | model.apply(register_hook) 203 | 204 | # make a forward pass 205 | # print(x.shape) 206 | model(*x) 207 | 208 | # remove these hooks 209 | for h in hooks: 210 | h.remove() 211 | 212 | print("----------------------------------------------------------------") 213 | line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #") 214 | print(line_new) 215 | print("================================================================") 216 | total_params = 0 217 | total_output = 0 218 | trainable_params = 0 219 | for layer in summary: 220 | # input_shape, output_shape, trainable, nb_params 221 | line_new = "{:>20} {:>25} {:>15}".format(layer, str(summary[layer]["output_shape"]), "{0:,}".format(summary[layer]["nb_params"]), ) 222 | total_params += summary[layer]["nb_params"] 223 | total_output += np.prod(summary[layer]["output_shape"]) 224 | if "trainable" in summary[layer]: 225 | if summary[layer]["trainable"] == True: 226 | trainable_params += 
summary[layer]["nb_params"] 227 | print(line_new) 228 | 229 | # assume 4 bytes/number (float on cuda). 230 | total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.)) 231 | total_output_size = abs(2. * total_output * 4. / (1024 ** 2.)) # x2 for gradients 232 | total_params_size = abs(total_params.numpy() * 4. / (1024 ** 2.)) 233 | total_size = total_params_size + total_output_size + total_input_size 234 | 235 | print("================================================================") 236 | print("Total params: {0:,}".format(total_params)) 237 | print("Trainable params: {0:,}".format(trainable_params)) 238 | print("Non-trainable params: {0:,}".format(total_params - trainable_params)) 239 | print("----------------------------------------------------------------") 240 | print("Input size (MB): %0.2f" % total_input_size) 241 | print("Forward/backward pass size (MB): %0.2f" % total_output_size) 242 | print("Params size (MB): %0.2f" % total_params_size) 243 | print("Estimated Total Size (MB): %0.2f" % total_size) 244 | print("----------------------------------------------------------------") 245 | # return summary 246 | 247 | # endregion 248 | 249 | # region Classes 250 | 251 | class ModelSaver(): 252 | def __init__(self, model, dataset_name, model_name): 253 | self.model = model 254 | self.model_name = model_name 255 | 256 | model_root_path = './data/%s/models' % (dataset_name) 257 | assert os.path.exists(model_root_path) 258 | 259 | model_root_path = './data/%s/models/%s' % (dataset_name, model_name) 260 | if not os.path.exists(model_root_path): 261 | os.mkdir(model_root_path) 262 | 263 | self.model_root_path = model_root_path 264 | 265 | def save(self, idx_epoch): 266 | """ 267 | Save the model. 268 | """ 269 | epoch_num = idx_epoch + 1 270 | model_root_path = self.model_root_path 271 | model_state_path = str('%s/%03d.pt' % (model_root_path, epoch_num)) 272 | 273 | # save model state using pytorch 274 | model_state = self.model.state_dict() 275 | torch.save(model_state, model_state_path) 276 | 277 | 278 | # endregion 279 | -------------------------------------------------------------------------------- /core/data_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helpful functions and classes to deal with data. 
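Two flavours are provided per dataset: a keras.utils.Sequence generator and a torch.utils.data.Dataset, registered in KERAS_DATA_GENERATORS_DICT and PYTORCH_DATASETS_DICT respectively. A minimal sketch of selecting one (parameter values are illustrative):

    from core import data_utils

    generator_class = data_utils.KERAS_DATA_GENERATORS_DICT['charades']
    generator = generator_class(batch_size=32, n_classes=157,
                                feature_dim=(256, 7, 7, 1024),
                                feature_name='features_i3d_rgb_mixed_5c_256f',
                                is_training=True)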
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | import random 34 | import numpy as np 35 | import pickle as pkl 36 | from datetime import datetime 37 | from multiprocessing.dummy import Pool 38 | 39 | import keras.utils 40 | import torch.utils.data 41 | import torchvision 42 | 43 | from core import utils, config 44 | from core.utils import Path as Pth 45 | 46 | logger = logging.getLogger(__name__) 47 | 48 | # region Async File Loader 49 | 50 | class AsyncLoaderVideoFeatures(): 51 | """ 52 | Load features for the video frames. 53 | """ 54 | 55 | def __init__(self, feats_path, target, n_frames_per_video, batch_size, n_feat_maps, feat_map_side_dim, n_threads=10, annotation_dict=None): 56 | random.seed(101) 57 | np.random.seed(101) 58 | 59 | self.__feats_pathes = feats_path 60 | self.__n_frames_per_video = n_frames_per_video 61 | self.__n_feat_maps = n_feat_maps 62 | self.__feat_map_side_dim = feat_map_side_dim 63 | self.__annotation_dict = annotation_dict 64 | 65 | self.__batch_size = batch_size 66 | self.__y = target 67 | 68 | self.__is_busy = False 69 | self.__batch_features = None 70 | self.__batch_y = None 71 | self.__n_threads_in_pool = n_threads 72 | self.__pool = Pool(self.__n_threads_in_pool) 73 | 74 | def load_feats_in_batch(self, batch_number): 75 | self.__is_busy = True 76 | 77 | idx_batch = batch_number - 1 78 | start_idx = idx_batch * self.__batch_size 79 | stop_idx = (idx_batch + 1) * self.__batch_size 80 | 81 | batch_feat_pathes = self.__feats_pathes[start_idx:stop_idx] 82 | batch_y = self.__y[start_idx:stop_idx] 83 | 84 | n_batch_feats = len(batch_feat_pathes) 85 | n_batch_y = len(batch_y) 86 | idxces = range(0, n_batch_feats) 87 | 88 | assert n_batch_feats == n_batch_y 89 | 90 | # parameters passed to the reading function 91 | params = [data_item for data_item in zip(idxces, batch_feat_pathes)] 92 | 93 | # set list of batch features before start reading 94 | batch_feats_shape = (n_batch_feats, self.__n_frames_per_video, self.__feat_map_side_dim, self.__feat_map_side_dim, self.__n_feat_maps) 95 | 96 | self.__batch_features = np.zeros(batch_feats_shape, dtype=np.float32) 97 | self.__batch_y = batch_y 98 | 99 | # start pool of threads 100 | self.__pool.map_async(self.__load_features, params, callback=self.__thread_pool_callback) 101 | 102 | def get_batch_data(self): 103 | if self.__is_busy: 104 | raise Exception('Sorry, you can\'t get features while threads are running!') 105 | else: 106 | return (self.__batch_features, self.__batch_y) 107 | 108 | def get_y(self): 109 | return self.__y 110 | 111 | def is_busy(self): 112 | return self.__is_busy 113 | 114 | def __thread_pool_callback(self, args): 115 | self.__is_busy = False 116 | 117 | def __load_features(self, params): 118 | 119 | idx_video = params[0] 120 | feats_path = params[1] 121 | video_name = feats_path.split('/')[-1] 122 | 123 | try: 124 | # load feature from file 125 | feats = utils.pkl_load(feats_path) 126 | 127 | n_feats = len(feats) 128 | assert n_feats == self.__n_frames_per_video, 'Sorry, wrong number of frames, expected: %d, got: %d' % (self.__n_frames_per_video, n_feats) 129 | self.__batch_features[idx_video] = feats 130 | 131 | except Exception as exp: 132 | print('\nSorry, error in loading feature %s' % (feats_path)) 133 | print(exp) 134 | 135 | def shuffle_data(self): 136 | """ 137 | shuffle these data: self.__feats_pathes, self.__class_names, 
self.__y 138 | :return: 139 | """ 140 | 141 | n_samples = len(self.__feats_pathes) 142 | 143 | idx = range(n_samples) 144 | np.random.shuffle(idx) 145 | self.__feats_pathes = self.__feats_pathes[idx] 146 | self.__y = self.__y[idx] 147 | 148 | def close(self): 149 | self.__pool.close() 150 | self.__pool.terminate() 151 | 152 | # endregion 153 | 154 | # region Data Generators (Keras) 155 | 156 | class DataGeneratorCharades(keras.utils.Sequence): 157 | 'Generates data for Keras' 158 | 159 | def __init__(self, batch_size, n_classes, feature_dim, feature_name, is_training, is_shuffle=True): 160 | """ 161 | Initialization 162 | """ 163 | self.batch_size = batch_size 164 | self.is_training = is_training 165 | self.n_classes = n_classes 166 | self.feature_dim = feature_dim 167 | self.feature_name = feature_name 168 | self.is_shuffle = is_shuffle 169 | self.dataset_name = 'charades' 170 | 171 | # load annotation 172 | root_path = './data/charades' 173 | annotation_path = '%s/annotation/video_annotation.pkl' % (root_path) 174 | if self.is_training: 175 | (video_names, y, _, _) = utils.pkl_load(annotation_path) 176 | else: 177 | (_, _, video_names, y) = utils.pkl_load(annotation_path) 178 | 179 | # convert relative to root pathes 180 | feats_path = np.array(['%s/%s/%s.pkl' % (root_path, feature_name, p) for p in video_names]) 181 | 182 | n_samples = len(y) 183 | self.n_samples = n_samples 184 | self.n_batches = utils.calc_num_batches(n_samples, batch_size) 185 | self.feats_path = feats_path 186 | self.y = y 187 | 188 | # shuffle the data 189 | if self.is_shuffle: 190 | self.__shuffle() 191 | 192 | def __len__(self): 193 | """ 194 | Denotes the number of batches per epoc 195 | """ 196 | return self.n_batches 197 | 198 | def __getitem__(self, index): 199 | """ 200 | Generate one batch of data. 201 | """ 202 | 203 | idx_start = index * self.batch_size 204 | idx_stop = (index + 1) * self.batch_size 205 | y = self.y[idx_start:idx_stop] 206 | feats_path = self.feats_path[idx_start:idx_stop] 207 | 208 | n_items = len(feats_path) 209 | x_shape = tuple([n_items] + list(self.feature_dim)) 210 | x = np.zeros(x_shape, dtype=np.float32) 211 | 212 | # loop of feature pathes and load them 213 | for idx, p in enumerate(feats_path): 214 | x[idx] = utils.pkl_load(p) 215 | 216 | return x, y 217 | 218 | def on_epoch_end(self): 219 | """ 220 | Shuffle after finishing the epoch. 
221 | :return: 222 | """ 223 | 224 | if self.is_shuffle: 225 | self.__shuffle() 226 | 227 | def __shuffle(self): 228 | 229 | idx = range(self.n_samples) 230 | np.random.shuffle(idx) 231 | self.feats_path = self.feats_path[idx] 232 | self.y = self.y[idx] 233 | 234 | # endregion 235 | 236 | # region Data Loaders (PyTorch) 237 | 238 | class DatasetCharades(torch.utils.data.Dataset): 239 | def __init__(self, batch_size, n_classes, feature_dim, feature_name, is_training, is_shuffle=True): 240 | """ 241 | Initialization 242 | """ 243 | 244 | self.batch_size = batch_size 245 | self.is_training = is_training 246 | self.n_classes = n_classes 247 | self.feature_dim = feature_dim 248 | self.feature_name = feature_name 249 | self.is_shuffle = is_shuffle 250 | self.dataset_name = 'charades' 251 | 252 | # load annotation 253 | root_path = './data/charades' 254 | annotation_path = '%s/annotation/video_annotation.pkl' % (root_path) 255 | if self.is_training: 256 | (video_names, y, _, _) = utils.pkl_load(annotation_path) 257 | else: 258 | (_, _, video_names, y) = utils.pkl_load(annotation_path) 259 | 260 | # in case of single label classification, debinarize the labels 261 | if config.cfg.MODEL.CLASSIFICATION_TYPE == 'sl': 262 | y = utils.debinarize_label(y) 263 | 264 | # in any case, make sure target is float 265 | y = y.astype(np.float32) 266 | 267 | # convert relative to root pathes 268 | feats_path = np.array(['%s/%s/%s.pkl' % (root_path, feature_name, p) for p in video_names]) 269 | 270 | n_samples = len(y) 271 | self.n_samples = n_samples 272 | self.n_batches = utils.calc_num_batches(n_samples, batch_size) 273 | self.feats_path = feats_path 274 | self.y = y 275 | 276 | # shuffle the data 277 | if self.is_shuffle: 278 | self.__shuffle() 279 | 280 | def __getitem__(self, index): 281 | """ 282 | Generate one batch of data 283 | """ 284 | 285 | y = self.y[index] 286 | p = self.feats_path[index] 287 | x = utils.pkl_load(p) # (T, H, W, C) 288 | 289 | # convert to channel last 290 | x = np.transpose(x, (3, 0, 1, 2)) # (T, H, W, C) 291 | 292 | return x, y 293 | 294 | def __len__(self): 295 | return self.n_samples 296 | 297 | def __shuffle(self): 298 | idx = range(self.n_samples) 299 | np.random.shuffle(idx) 300 | self.feats_path = self.feats_path[idx] 301 | self.y = self.y[idx] 302 | 303 | # endregion 304 | 305 | # region Constants 306 | 307 | KERAS_DATA_GENERATORS_DICT = {'charades': DataGeneratorCharades} 308 | PYTORCH_DATASETS_DICT = {'charades': DatasetCharades} 309 | 310 | # endregion 311 | -------------------------------------------------------------------------------- /experiments/train_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 
18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Train Timeception layers on different datasets. There are two different ways to train Timeception. 25 | 1. Timeception-only (TCO): only timeception layers are trained, using features extracted from backbone CNNs. 26 | 2. End-to-end (ETE): timeception is trained on top of backbone CNN. The input is video frames passed throughtout the backboneCNN 27 | and then the resulted feature is fed to Timeception layers. Here, you enjoy all the benefits of end-to-end training. 28 | For example, do pre-processing to the input frames, randomly sample the frames, temporal jittering, ...., etc. 29 | """ 30 | 31 | from __future__ import absolute_import 32 | from __future__ import division 33 | from __future__ import print_function 34 | from __future__ import unicode_literals 35 | 36 | import os 37 | import sys 38 | import time 39 | import logging 40 | import datetime 41 | import numpy as np 42 | from optparse import OptionParser 43 | 44 | import torch 45 | import torch.utils.data 46 | 47 | from torch.nn import functional as F 48 | from torch.nn import Module, Dropout, BatchNorm1d, LeakyReLU, Linear, LogSoftmax, Sigmoid 49 | from torch.optim import SGD, Adam 50 | from torch.autograd import Variable 51 | from torch.utils.data import DataLoader 52 | from torchvision import datasets, transforms 53 | 54 | import torchviz 55 | import torchvision 56 | import torchsummary 57 | 58 | from nets import timeception_pytorch 59 | from core import utils, pytorch_utils, image_utils, config_utils, const, config, data_utils, metrics 60 | from core.utils import Path as Pth 61 | 62 | logger = logging.getLogger(__name__) 63 | 64 | def train_tco(): 65 | """ 66 | Train Timeception layers based on the given configurations. 67 | This train scheme is Timeception-only (TCO). 68 | """ 69 | 70 | # get some configs for the training 71 | n_epochs = config.cfg.TRAIN.N_EPOCHS 72 | dataset_name = config.cfg.DATASET_NAME 73 | model_name = '%s_%s' % (config.cfg.MODEL.NAME, utils.timestamp()) 74 | device = 'cuda' 75 | 76 | # data generators 77 | loader_tr, n_samples_tr, n_batches_tr = __define_loader(is_training=True) 78 | loader_te, n_samples_te, n_batches_te = __define_loader(is_training=False) 79 | 80 | logger.info('--- start time') 81 | logger.info(datetime.datetime.now()) 82 | logger.info('... [tr]: n_samples, n_batch, batch_size: %d, %d, %d' % (n_samples_tr, n_batches_tr, config.cfg.TRAIN.BATCH_SIZE)) 83 | logger.info('... 
[te]: n_samples, n_batch, batch_size: %d, %d, %d' % (n_samples_te, n_batches_te, config.cfg.TEST.BATCH_SIZE)) 84 | 85 | # load model 86 | model, optimizer, loss_fn, metric_fn, metric_fn_name = __define_timeception_model(device) 87 | logger.info(pytorch_utils.summary(model, model._input_shape[1:], batch_size=2, device='cuda')) 88 | 89 | # save the model 90 | model_saver = pytorch_utils.ModelSaver(model, dataset_name, model_name) 91 | 92 | # loop on the epochs 93 | sys.stdout.write('\n') 94 | for idx_epoch in range(n_epochs): 95 | 96 | epoch_num = idx_epoch + 1 97 | 98 | loss_tr = 0.0 99 | acc_tr = 0.0 100 | loss_te = 0.0 101 | acc_te = 0.0 102 | 103 | tt1 = time.time() 104 | 105 | # flag model as training 106 | model.train() 107 | 108 | # training 109 | for idx_batch, (x, y_true) in enumerate(loader_tr): 110 | batch_num = idx_batch + 1 111 | 112 | x, y_true = x.to(device), y_true.to(device) 113 | optimizer.zero_grad() 114 | y_pred = model(x) 115 | loss = loss_fn(y_pred, y_true) 116 | loss.backward() 117 | optimizer.step() 118 | 119 | # calculate accuracy 120 | y_true = y_true.cpu().numpy().astype(np.int32) 121 | y_pred = y_pred.cpu().detach().numpy() 122 | loss_b_tr = loss.cpu().detach().numpy() 123 | acc_b_tr = metric_fn(y_true, y_pred) 124 | 125 | loss_tr += loss_b_tr 126 | acc_tr += acc_b_tr 127 | loss_b_tr = loss_tr / float(batch_num) 128 | acc_b_tr = acc_tr / float(batch_num) 129 | tt2 = time.time() 130 | duration = tt2 - tt1 131 | sys.stdout.write('\r%04ds - epoch: %02d/%02d, batch [tr]: %02d/%02d, loss, %s: %0.2f, %0.2f ' % (duration, epoch_num, n_epochs, batch_num, n_batches_tr, metric_fn_name, loss_b_tr, acc_b_tr)) 132 | 133 | # flag model as testing 134 | model.eval() 135 | 136 | # testing 137 | for idx_batch, (x, y_true) in enumerate(loader_te): 138 | batch_num = idx_batch + 1 139 | 140 | x, y_true = x.to(device), y_true.to(device) 141 | y_pred = model(x) 142 | loss_b_te = loss_fn(y_pred, y_true).cpu().detach().numpy() 143 | y_true = y_true.cpu().numpy().astype(np.int32) 144 | y_pred = y_pred.cpu().detach().numpy() 145 | acc_b_te = metric_fn(y_true, y_pred) 146 | 147 | loss_te += loss_b_te 148 | acc_te += acc_b_te 149 | loss_b_te = loss_te / float(batch_num) 150 | acc_b_te = acc_te / float(batch_num) 151 | tt2 = time.time() 152 | duration = tt2 - tt1 153 | sys.stdout.write('\r%04ds - epoch: %02d/%02d, batch [te]: %02d/%02d, loss, %s: %0.2f, %0.2f ' % (duration, epoch_num, n_epochs, batch_num, n_batches_te, metric_fn_name, loss_b_te, acc_b_te)) 154 | 155 | loss_tr /= float(n_batches_tr) 156 | loss_te /= float(n_batches_te) 157 | acc_tr /= float(n_batches_tr) 158 | acc_te /= float(n_batches_te) 159 | 160 | tt2 = time.time() 161 | duration = tt2 - tt1 162 | sys.stdout.write('\r%04ds - epoch: %02d/%02d, [tr]: %0.2f, %0.2f, [te]: %0.2f, %0.2f \n' % (duration, epoch_num, n_epochs, loss_tr, acc_te, loss_te, acc_te)) 163 | 164 | # after each epoch, save data 165 | model_saver.save(idx_epoch) 166 | 167 | logger.info('--- finish time') 168 | logger.info(datetime.datetime.now()) 169 | 170 | def train_ete(): 171 | """ 172 | Train Timeception layers based on the given configurations. 173 | This train scheme is End-to-end (ETE). 174 | """ 175 | 176 | raise Exception('Sorry, not implemented yet!') 177 | 178 | def __define_loader(is_training): 179 | """ 180 | Define data loader. 
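    Unlike the Keras generator, the feature shape here is channel-first, e.g.
    (1024, 32, 7, 7) for I3D 'mixed_5c' features with 32 timesteps, and the
    dataset is wrapped in a torch DataLoader (a sketch of the call made below):

        data_loader = DataLoader(dataset, batch_size=batch_size,
                                 num_workers=n_workers, shuffle=True)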
181 | """ 182 | 183 | # get some configs for the training 184 | n_classes = config.cfg.MODEL.N_CLASSES 185 | dataset_name = config.cfg.DATASET_NAME 186 | backbone_model_name = config.cfg.MODEL.BACKBONE_CNN 187 | backbone_feature_name = config.cfg.MODEL.BACKBONE_FEATURE 188 | n_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS 189 | n_workers = config.cfg.TRAIN.N_WORKERS 190 | 191 | batch_size_tr = config.cfg.TRAIN.BATCH_SIZE 192 | batch_size_te = config.cfg.TEST.BATCH_SIZE 193 | batch_size = batch_size_tr if is_training else batch_size_te 194 | 195 | # size and name of feature 196 | feature_name = 'features_%s_%s_%sf' % (backbone_model_name, backbone_feature_name, n_timesteps) 197 | c, h, w = utils.get_model_feat_maps_info(backbone_model_name, backbone_feature_name) 198 | feature_dim = (c, n_timesteps, h, w) 199 | 200 | # data generators 201 | params = {'batch_size': batch_size, 'n_classes': n_classes, 'feature_name': feature_name, 'feature_dim': feature_dim, 'is_training': is_training} 202 | dataset_class = data_utils.PYTORCH_DATASETS_DICT[dataset_name] 203 | dataset = dataset_class(**params) 204 | n_samples = dataset.n_samples 205 | n_batches = dataset.n_batches 206 | 207 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=n_workers, shuffle=True) 208 | 209 | return data_loader, n_samples, n_batches 210 | 211 | def __define_timeception_model(device): 212 | """ 213 | Define model, optimizer, loss function and metric function. 214 | """ 215 | # some configurations 216 | classification_type = config.cfg.MODEL.CLASSIFICATION_TYPE 217 | solver_name = config.cfg.SOLVER.NAME 218 | solver_lr = config.cfg.SOLVER.LR 219 | adam_epsilon = config.cfg.SOLVER.ADAM_EPSILON 220 | 221 | # define model 222 | model = Model().to(device) 223 | model_param = model.parameters() 224 | 225 | # define the optimizer 226 | optimizer = SGD(model_param, lr=0.01) if solver_name == 'sgd' else Adam(model_param, lr=solver_lr, eps=adam_epsilon) 227 | 228 | # loss and evaluation function for either multi-label "ml" or single-label "sl" classification 229 | if classification_type == 'ml': 230 | loss_fn = torch.nn.BCELoss() 231 | metric_fn = metrics.map_charades 232 | metric_fn_name = 'map' 233 | else: 234 | loss_fn = torch.nn.NLLLoss() 235 | metric_fn = metrics.accuracy 236 | metric_fn_name = 'acc' 237 | 238 | return model, optimizer, loss_fn, metric_fn, metric_fn_name 239 | 240 | class Model(Module): 241 | """ 242 | Define Timeception classifier. 
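    A rough sketch of the tensor shapes through forward(), assuming I3D
    'mixed_5c' input with 32 timesteps and 4 timeception layers (each layer
    halves T and expands C by roughly the 1.25 expansion factor; exact channel
    counts depend on the per-branch rounding):

        input:             (N, 1024, 32, 7, 7)   # (N, C, T, H, W)
        after timeception: (N, ~2480, 2, 7, 7)
        after max-pool:    (N, ~2480)
        output:            (N, n_classes)        # Sigmoid ('ml') or LogSoftmax ('sl')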
243 | """ 244 | 245 | def __init__(self): 246 | super(Model, self).__init__() 247 | 248 | # some configurations for the model 249 | n_tc_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS 250 | backbone_name = config.cfg.MODEL.BACKBONE_CNN 251 | feature_name = config.cfg.MODEL.BACKBONE_FEATURE 252 | n_tc_layers = config.cfg.MODEL.N_TC_LAYERS 253 | n_classes = config.cfg.MODEL.N_CLASSES 254 | is_dilated = config.cfg.MODEL.MULTISCALE_TYPE 255 | OutputActivation = Sigmoid if config.cfg.MODEL.CLASSIFICATION_TYPE == 'ml' else LogSoftmax 256 | n_channels_in, channel_h, channel_w = utils.get_model_feat_maps_info(backbone_name, feature_name) 257 | n_groups = int(n_channels_in / 128.0) 258 | 259 | input_shape = (None, n_channels_in, n_tc_timesteps, channel_h, channel_w) # (C, T, H, W) 260 | self._input_shape = input_shape 261 | 262 | # define 4 layers of timeception 263 | self.timeception = timeception_pytorch.Timeception(input_shape, n_tc_layers, n_groups, is_dilated) # (C, T, H, W) 264 | 265 | # get number of output channels after timeception 266 | n_channels_in = self.timeception.n_channels_out 267 | 268 | # define layers for classifier 269 | self.do1 = Dropout(0.5) 270 | self.l1 = Linear(n_channels_in, 512) 271 | self.bn1 = BatchNorm1d(512) 272 | self.ac1 = LeakyReLU(0.2) 273 | self.do2 = Dropout(0.25) 274 | self.l2 = Linear(512, n_classes) 275 | self.ac2 = OutputActivation() 276 | 277 | def forward(self, input): 278 | # feedforward the input to the timeception layers 279 | tensor = self.timeception(input) 280 | 281 | # max-pool over space-time 282 | bn, c, t, h, w = tensor.size() 283 | tensor = tensor.view(bn, c, t * h * w) 284 | tensor = torch.max(tensor, dim=2, keepdim=False) 285 | tensor = tensor[0] 286 | 287 | # dense layers for classification 288 | tensor = self.do1(tensor) 289 | tensor = self.l1(tensor) 290 | tensor = self.bn1(tensor) 291 | tensor = self.ac1(tensor) 292 | tensor = self.do2(tensor) 293 | tensor = self.l2(tensor) 294 | tensor = self.ac2(tensor) 295 | 296 | return tensor 297 | 298 | def __main(): 299 | """ 300 | Run this script to train Timeception. 
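    Example invocation from the project root (the config file must exist under
    ./configs/):

        python experiments/train_pytorch.py --config_file charades_i3d_tc4_f1024.yaml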
301 | """ 302 | 303 | default_config_file = 'charades_i3d_tc4_f1024.yaml' 304 | default_config_file = 'charades_i3d_tc2_f256.yaml' 305 | 306 | # Parse the arguments 307 | parser = OptionParser() 308 | parser.add_option('-c', '--config_file', dest='config_file', default=default_config_file, help='Yaml config file that contains all training details.') 309 | (options, args) = parser.parse_args() 310 | config_file = options.config_file 311 | 312 | # check if exist 313 | if config_file is None or config_file == '': 314 | msg = 'Config file not passed, default config is used: %s' % (config_file) 315 | logging.warning(msg) 316 | config_file = default_config_file 317 | 318 | # path of config file 319 | config_path = './configs/%s' % (config_file) 320 | 321 | # check if file exist 322 | if not os.path.exists(config_path): 323 | msg = 'Sorry, could not find config file with the following path: %s' % (config_path) 324 | logging.error(msg) 325 | else: 326 | # read the config from file and copy it to the project configuration "cfg" 327 | config_utils.cfg_from_file(config_path) 328 | 329 | # choose which training scheme, either 'ete' or 'tco' 330 | training_scheme = config.cfg.TRAIN.SCHEME 331 | 332 | # start training 333 | if training_scheme == 'tco': 334 | train_tco() 335 | else: 336 | train_ete() 337 | 338 | if __name__ == '__main__': 339 | __main() 340 | -------------------------------------------------------------------------------- /core/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helper functions for many things. Also, some needed classes. 
25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | import time 34 | import h5py 35 | import yaml 36 | import numpy as np 37 | import pickle as pkl 38 | import pandas as pd 39 | from datetime import datetime 40 | import matplotlib.pyplot as plt 41 | from sklearn.preprocessing import label_binarize 42 | from sklearn import preprocessing, manifold 43 | import scipy.io as sio 44 | 45 | import os 46 | import json 47 | import natsort 48 | import random 49 | from multiprocessing.dummy import Pool 50 | 51 | from core import const 52 | 53 | logger = logging.getLogger(__name__) 54 | 55 | # region Load and Dump 56 | 57 | def pkl_load(path): 58 | with open(path, 'r') as f: 59 | data = pkl.load(f) 60 | return data 61 | 62 | def txt_load(path): 63 | with open(path, 'r') as f: 64 | lines = f.read().splitlines() 65 | lines = np.array(lines) 66 | return lines 67 | 68 | def byte_load(path): 69 | with open(path, 'rb') as f: 70 | data = f.read() 71 | return data 72 | 73 | def json_load(path): 74 | with open(path, 'r') as f: 75 | data = json.load(f) 76 | 77 | return data 78 | 79 | def yaml_load(file_path): 80 | with open(file_path, 'r') as f: 81 | data = yaml.load(f) 82 | data = AttrDict(data) 83 | 84 | data = convert_dict_to_attrdict(data) 85 | return data 86 | 87 | def h5_load(path, dataset_name='data'): 88 | h5_file = h5py.File(path, 'r') 89 | data = h5_file[dataset_name].value 90 | h5_file.close() 91 | return data 92 | 93 | def h5_load_multi(path, dataset_names): 94 | h5_file = h5py.File(path, 'r') 95 | data = [h5_file[name].value for name in dataset_names] 96 | h5_file.close() 97 | return data 98 | 99 | def txt_dump(data, path): 100 | l = len(data) - 1 101 | with open(path, 'w') as f: 102 | for i, k in enumerate(data): 103 | if i < l: 104 | k = ('%s\n' % k) 105 | else: 106 | k = ('%s' % k) 107 | f.writelines(k) 108 | 109 | def byte_dump(data, path): 110 | with open(path, 'wb') as f: 111 | f.write(data) 112 | 113 | def pkl_dump(data, path, is_highest=True): 114 | with open(path, 'w') as f: 115 | if not is_highest: 116 | pkl.dump(data, f) 117 | else: 118 | pkl.dump(data, f, pkl.HIGHEST_PROTOCOL) 119 | 120 | def json_dump(data, path): 121 | with open(path, 'w') as f: 122 | json.dump(data, f) 123 | 124 | def h5_dump(data, path, dataset_name='data'): 125 | h5_file = h5py.File(path, 'w') 126 | h5_file.create_dataset(dataset_name, data=data, dtype=data.dtype) 127 | h5_file.close() 128 | 129 | def h5_dump_multi(data, dataset_names, path): 130 | h5_file = h5py.File(path, 'w') 131 | n_items = len(data) 132 | for i in range(n_items): 133 | item_data = data[i] 134 | item_name = dataset_names[i] 135 | h5_file.create_dataset(item_name, data=item_data, dtype=item_data.dtype) 136 | h5_file.close() 137 | 138 | def csv_load(path, sep=',', header='infer'): 139 | df = pd.read_csv(path, sep=sep, header=header) 140 | data = df.values 141 | return data 142 | 143 | def mat_load(path, m_dict=None): 144 | """ 145 | Load mat files. 
146 | :param path: 147 | :return: 148 | """ 149 | if m_dict is None: 150 | data = sio.loadmat(path) 151 | else: 152 | data = sio.loadmat(path, m_dict) 153 | 154 | return data 155 | 156 | # endregion 157 | 158 | # region File/Folder Names/Pathes 159 | 160 | def file_names(path, is_nat_sort=False): 161 | if not os.path.exists(path): 162 | exp_msg = 'Sorry, folder path does not exist: %s' % (path) 163 | raise Exception(exp_msg) 164 | 165 | names = os.walk(path).next()[2] 166 | 167 | if is_nat_sort: 168 | names = natsort.natsorted(names) 169 | 170 | return names 171 | 172 | def file_pathes(path, is_nat_sort=False): 173 | if not os.path.exists(path): 174 | exp_msg = 'Sorry, folder path does not exist: %s' % (path) 175 | raise Exception(exp_msg) 176 | 177 | names = os.walk(path).next()[2] 178 | 179 | if is_nat_sort: 180 | names = natsort.natsorted(names) 181 | 182 | pathes = ['%s/%s' % (path, n) for n in names] 183 | return pathes 184 | 185 | def folder_names(path, is_nat_sort=False): 186 | if not os.path.exists(path): 187 | exp_msg = 'Sorry, folder path does not exist: %s' % (path) 188 | raise Exception(exp_msg) 189 | 190 | names = os.walk(path).next()[1] 191 | 192 | if is_nat_sort: 193 | names = natsort.natsorted(names) 194 | 195 | return names 196 | 197 | def folder_pathes(path, is_nat_sort=False): 198 | if not os.path.exists(path): 199 | exp_msg = 'Sorry, folder path does not exist: %s' % (path) 200 | raise Exception(exp_msg) 201 | 202 | names = os.walk(path).next()[1] 203 | 204 | if is_nat_sort: 205 | names = natsort.natsorted(names) 206 | 207 | pathes = ['%s/%s' % (path, n) for n in names] 208 | return pathes 209 | 210 | # endregion 211 | 212 | # region Normalization 213 | 214 | def normalize_mean_std(x): 215 | mean = np.mean(x, axis=0) 216 | std = np.std(x, axis=0) 217 | x -= mean 218 | x /= std 219 | return x 220 | 221 | def normalize_mean(x): 222 | mean = np.mean(x, axis=0) 223 | x /= mean 224 | return x 225 | 226 | def normalize_sum(x): 227 | sum = np.sum(x, axis=1) 228 | x = np.array([x_i / sum_i for x_i, sum_i in zip(x, sum)]) 229 | return x 230 | 231 | def normalize_l2(x): 232 | return preprocessing.normalize(x) 233 | 234 | def normalize_l1(x): 235 | return preprocessing.normalize(x, norm='l1') 236 | 237 | def normalize_range_0_to_1(x): 238 | x = np.add(x, -x.min()) 239 | x = np.divide(x, x.max()) 240 | return x 241 | 242 | # endregion 243 | 244 | # region Array Helpers 245 | 246 | def array_to_text(a, separator=', '): 247 | text = separator.join([str(s) for s in a]) 248 | return text 249 | 250 | def get_size_in_kb(size): 251 | size /= float(1024) 252 | return size 253 | 254 | def get_size_in_mb(size): 255 | size /= float(1024 * 1024) 256 | return size 257 | 258 | def get_size_in_gb(size): 259 | size /= float(1024 * 1024 * 1024) 260 | return size 261 | 262 | def get_array_memory_size(a): 263 | if type(a) is not np.ndarray: 264 | raise Exception('Sorry, input is not numpy array!') 265 | 266 | dtype = a.dtype 267 | if dtype == np.float16: 268 | n_bytes = 2 269 | elif dtype == np.float32: 270 | n_bytes = 4 271 | else: 272 | raise Exception('Sorry, unsupported dtype:', dtype) 273 | 274 | s = a.size 275 | size = s * n_bytes 276 | return size 277 | 278 | def get_expected_memory_size(array_shape, array_dtype): 279 | dtype = array_dtype 280 | if dtype == np.float16: 281 | n_bytes = 2 282 | elif dtype == np.float32: 283 | n_bytes = 4 284 | else: 285 | raise Exception('Sorry, unsupported dtype:', dtype) 286 | 287 | s = 1 288 | for dim_size in array_shape: 289 | s *= dim_size 290 | 291 | size 
= s * n_bytes 292 | return size 293 | 294 | def print_array(a): 295 | for item in a: 296 | print(item) 297 | 298 | def print_array_joined(a): 299 | s = ', '.join([str(i) for i in a]) 300 | print(s) 301 | 302 | # endregion 303 | 304 | # region Misc 305 | 306 | def learn_manifold(manifold_type, feats, n_components=2): 307 | if manifold_type == 'tsne': 308 | feats_fitted = manifold.TSNE(n_components=n_components, random_state=0).fit_transform(feats) 309 | elif manifold_type == 'isomap': 310 | feats_fitted = manifold.Isomap(n_components=n_components).fit_transform(feats) 311 | elif manifold_type == 'mds': 312 | feats_fitted = manifold.MDS(n_components=n_components).fit_transform(feats) 313 | elif manifold_type == 'spectral': 314 | feats_fitted = manifold.SpectralEmbedding(n_components=n_components).fit_transform(feats) 315 | else: 316 | raise Exception('wrong maniford type!') 317 | 318 | # methods = ['standard', 'ltsa', 'hessian', 'modified'] 319 | # feats_fitted = manifold.LocallyLinearEmbedding(n_components=n_components, method=methods[0]).fit_transform(pred) 320 | 321 | return feats_fitted 322 | 323 | def debinarize_label(labels): 324 | debinarized = np.array([np.where(l == 1)[0][0] for l in labels]) 325 | return debinarized 326 | 327 | def timestamp(): 328 | time_stamp = "{0:%y}.{0:%m}.{0:%d}-{0:%I}:{0:%M}:{0:%S}".format(datetime.now()) 329 | return time_stamp 330 | 331 | def remove_extension(name): 332 | name = name[:-4] 333 | return name 334 | 335 | def get_file_extension(name): 336 | name = name.split('.')[-1] 337 | return name 338 | 339 | def print_counter(num, total, freq=None): 340 | if freq is None: 341 | logger.info('... %d/%d' % (num, total)) 342 | elif num % freq == 0: 343 | logger.info('... %d/%d' % (num, total)) 344 | 345 | def calc_num_batches(n_samples, batch_size): 346 | n_batch = int(n_samples / float(batch_size)) 347 | n_batch = n_batch if n_samples % batch_size == 0 else n_batch + 1 348 | return n_batch 349 | 350 | def convert_dict_to_attrdict(d): 351 | for k, v in d.iteritems(): 352 | if isinstance(v, dict): 353 | v = convert_dict_to_attrdict(v) 354 | d[k] = v 355 | 356 | if isinstance(d, dict): 357 | d = AttrDict(d) 358 | 359 | return d 360 | 361 | def get_model_feat_maps_info(model_type, feature_type): 362 | """ 363 | Get feature map details according to model type and feature type. 
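    For example, get_model_feat_maps_info('i3d_rgb', 'mixed_5c') returns
    (1024, 7, 7), i.e. (n_channels, height, width) of the backbone feature map.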
364 | :param model_type: 365 | :param feature_type: 366 | :return: 367 | """ 368 | 369 | if model_type in ['vgg', 'vgg_charades_rgb']: 370 | if feature_type == 'pool5': 371 | return 512, 7, 7 372 | elif feature_type == 'conv5_3': 373 | return 512, 14, 14 374 | else: 375 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 376 | elif model_type in ['resnet152', 'resnet152_charades_rgb']: 377 | if feature_type == 'res4b35': 378 | return 1024, 14, 14 379 | elif feature_type == 'res5c': 380 | return 2048, 7, 7 381 | elif feature_type == 'pool5': 382 | return 2048, 1, 1 383 | else: 384 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 385 | elif model_type in ['i3d_rgb', 'i3d_pytorch_charades_rgb', 'i3d_kinetics_keras', 'i3d_keras_kinetics_rgb']: 386 | if feature_type == 'mixed_5c': 387 | return 1024, 7, 7 388 | elif feature_type == 'mixed_4f': 389 | return 832, 7, 7 390 | else: 391 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 392 | elif model_type in ['i3d_resnet_50_kinetics_rgb', 'i3d_resnet_101_kinetics_rgb']: 393 | if feature_type == 'pool5': 394 | return 2048, 7, 7 395 | else: 396 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 397 | elif model_type in ['i3d_resnet101_charades_rgb']: 398 | if feature_type == 'res5_2': 399 | return 2048, 7, 7 400 | else: 401 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type)) 402 | else: 403 | raise Exception('Sorry, unsupported model type: %s' % (model_type)) 404 | 405 | # endregion 406 | 407 | # region Classes 408 | 409 | class Path(str): 410 | def __new__(self, relative_path, args=None, root_type=const.ROOT_PATH_TYPES[0]): 411 | assert root_type in const.ROOT_PATH_TYPES 412 | root_types = list(const.ROOT_PATH_TYPES) 413 | idx_root_type = root_types.index(root_type) 414 | 415 | root_paths = [const.DATA_ROOT_PATH, const.PROJECT_ROOT_PATH] 416 | root_path = root_paths[idx_root_type] 417 | 418 | relative_path = relative_path % args if args is not None else relative_path 419 | path = os.path.join(root_path, relative_path) 420 | 421 | self.__path = path 422 | return self.__path 423 | 424 | def __str__(self): 425 | return self.__path 426 | 427 | def __repr__(self): 428 | return self.__path 429 | 430 | class DurationTimer(object): 431 | def __init__(self): 432 | self.start_time = time.time() 433 | 434 | def duration(self, is_string=True): 435 | stop_time = time.time() 436 | durtation = stop_time - self.start_time 437 | if is_string: 438 | durtation = self.format_duration(durtation) 439 | return durtation 440 | 441 | def format_duration(self, duration): 442 | if duration < 60: 443 | return str(duration) + " sec" 444 | elif duration < (60 * 60): 445 | return str(duration / 60) + " min" 446 | else: 447 | return str(duration / (60 * 60)) + " hr" 448 | 449 | class AttrDict(dict): 450 | """ 451 | Subclass dict and define getter-setter. This behaves as both dict and obj. 
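    For example:

        d = AttrDict({'LR': 0.01})
        assert d.LR == d['LR']
        d.LR = 0.001    # equivalent to d['LR'] = 0.001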
452 | """ 453 | 454 | def __getattr__(self, key): 455 | return self[key] 456 | 457 | def __setattr__(self, key, value): 458 | if key in self.__dict__: 459 | self.__dict__[key] = value 460 | else: 461 | self[key] = value 462 | 463 | # endregion 464 | -------------------------------------------------------------------------------- /nets/timeception_pytorch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Definition of Timeception as a pytorch model. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import logging 33 | 34 | import torch 35 | import torch.nn 36 | import torchvision 37 | import torchviz 38 | import torchsummary 39 | 40 | from torch.nn import Module, Conv3d, BatchNorm3d, MaxPool3d, ReLU 41 | from torch.nn import functional as F 42 | 43 | from nets.layers_pytorch import ChannelShuffleLayer, DepthwiseConv1DLayer 44 | 45 | # region Timeception as Module 46 | 47 | class Timeception(Module): 48 | """ 49 | Timeception is defined as a pytorch model. 50 | """ 51 | 52 | def __init__(self, input_shape, n_layers=4, n_groups=8, is_dilated=True): 53 | 54 | super(Timeception, self).__init__() 55 | 56 | # TODO: Add support for multi-scale using dilation rates 57 | # currently, for pytorch, we only support multi-scale using kernel sizes 58 | is_dilated = False 59 | 60 | expansion_factor = 1.25 61 | self.expansion_factor = expansion_factor 62 | self.n_layers = n_layers 63 | self.is_dilated = is_dilated 64 | self.n_groups = n_groups 65 | self.n_channels_out = None 66 | 67 | # convert it to a list 68 | input_shape = list(input_shape) 69 | 70 | # define timeception layers 71 | n_channels_out = self.__define_timeception_layers(input_shape, n_layers, n_groups, expansion_factor, is_dilated) 72 | 73 | # set the output channels 74 | self.n_channels_out = n_channels_out 75 | 76 | def forward(self, input): 77 | 78 | n_layers = self.n_layers 79 | n_groups = self.n_groups 80 | expansion_factor = self.expansion_factor 81 | 82 | output = self.__call_timeception_layers(input, n_layers, n_groups, expansion_factor) 83 | 84 | return output 85 | 86 | def __define_timeception_layers(self, input_shape, n_layers, n_groups, expansion_factor, is_dilated): 87 | """ 88 | Define layers inside the timeception layers.
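    As a worked example (illustrative): with n_groups=8, expansion_factor=1.25 and
    1024 input channels, each branch gets int(1024 * 1.25 / (5 * 8)) = 32 channels,
    so one layer outputs 32 * 5 * 8 = 1280 channels, while the temporal max-pooling
    halves the number of timesteps. Stacking 4 layers on an input of shape
    (batch, 1024, 128, 7, 7) gives (batch, 2480, 8, 7, 7): channels grow
    1024 -> 1280 -> 1600 -> 2000 -> 2480 and timesteps shrink 128 -> 64 -> 32 -> 16 -> 8.
    A minimal usage sketch (assuming the layers imported from nets.layers_pytorch
    preserve the temporal dimension as described):
        >>> x = torch.randn(4, 1024, 128, 7, 7)    # (batch, channels, time, height, width)
        >>> tc = Timeception(x.size(), n_layers=4)
        >>> tuple(tc(x).size())
        (4, 2480, 8, 7, 7)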
89 | """ 90 | 91 | n_channels_in = input_shape[1] 92 | 93 | # how many layers of timeception 94 | for i in range(n_layers): 95 | layer_num = i + 1 96 | 97 | # get details about grouping 98 | n_channels_per_branch, n_channels_out = self.__get_n_channels_per_branch(n_groups, expansion_factor, n_channels_in) 99 | 100 | # temporal conv per group 101 | self.__define_grouped_convolutions(input_shape, n_groups, n_channels_per_branch, is_dilated, layer_num) 102 | 103 | # downsample over time 104 | layer_name = 'maxpool_tc%d' % (layer_num) 105 | layer = MaxPool3d(kernel_size=(2, 1, 1)) 106 | layer._name = layer_name 107 | setattr(self, layer_name, layer) 108 | 109 | n_channels_in = n_channels_out 110 | input_shape[1] = n_channels_in 111 | 112 | return n_channels_in 113 | 114 | def __define_grouped_convolutions(self, input_shape, n_groups, n_channels_per_branch, is_dilated, layer_num): 115 | """ 116 | Define layers inside grouped convolutional block. 117 | """ 118 | 119 | n_channels_in = input_shape[1] 120 | 121 | n_branches = 5 122 | n_channels_per_group_in = int(n_channels_in / n_groups) 123 | n_channels_out = int(n_groups * n_branches * n_channels_per_branch) 124 | n_channels_per_group_out = int(n_channels_out / n_groups) 125 | 126 | assert n_channels_in % n_groups == 0 127 | assert n_channels_out % n_groups == 0 128 | 129 | # type of multi-scale kernels to use: either multi_kernel_sizes or multi_dilation_rates 130 | if is_dilated: 131 | kernel_sizes = (3, 3, 3) 132 | dilation_rates = (1, 2, 3) 133 | else: 134 | kernel_sizes = (3, 5, 7) 135 | dilation_rates = (1, 1, 1) 136 | 137 | input_shape_per_group = list(input_shape) 138 | input_shape_per_group[1] = n_channels_per_group_in 139 | 140 | # loop on groups, and define convolutions in each group 141 | for idx_group in range(n_groups): 142 | group_num = idx_group + 1 143 | self.__define_temporal_convolutional_block(input_shape_per_group, n_channels_per_branch, kernel_sizes, dilation_rates, layer_num, group_num) 144 | 145 | # activation 146 | layer_name = 'relu_tc%d' % (layer_num) 147 | layer = ReLU() 148 | layer._name = layer_name 149 | setattr(self, layer_name, layer) 150 | 151 | # shuffle channels 152 | layer_name = 'shuffle_tc%d' % (layer_num) 153 | layer = ChannelShuffleLayer(n_channels_out, n_groups) 154 | layer._name = layer_name 155 | setattr(self, layer_name, layer) 156 | 157 | def __define_temporal_convolutional_block(self, input_shape, n_channels_per_branch_out, kernel_sizes, dilation_rates, layer_num, group_num): 158 | """ 159 | Define 5 branches of convolutions that operate of channels of each group. 
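    For reference, the five branches defined below are:
        1. 1x1x1 conv for channel reduction only (no temporal conv)
        2. 1x1x1 conv followed by a depthwise temporal conv with kernel size 3
        3. 1x1x1 conv followed by a depthwise temporal conv with kernel size 5
        4. 1x1x1 conv followed by a depthwise temporal conv with kernel size 7
        5. 1x1x1 conv followed by temporal max-pooling (with replication padding)
    Each branch outputs n_channels_per_branch_out channels; the branch outputs are
    concatenated along the channel dimension in the forward pass.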
160 | """ 161 | 162 | n_channels_in = input_shape[1] 163 | 164 | dw_input_shape = list(input_shape) 165 | dw_input_shape[1] = n_channels_per_branch_out 166 | 167 | # branch 1: dimension reduction only and no temporal conv 168 | layer_name = 'conv_b1_g%d_tc%d' % (group_num, layer_num) 169 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 170 | layer._name = layer_name 171 | setattr(self, layer_name, layer) 172 | layer_name = 'bn_b1_g%d_tc%d' % (group_num, layer_num) 173 | layer = BatchNorm3d(n_channels_per_branch_out) 174 | layer._name = layer_name 175 | setattr(self, layer_name, layer) 176 | 177 | # branch 2: dimension reduction followed by depth-wise temp conv (kernel-size 3) 178 | layer_name = 'conv_b2_g%d_tc%d' % (group_num, layer_num) 179 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 180 | layer._name = layer_name 181 | setattr(self, layer_name, layer) 182 | layer_name = 'convdw_b2_g%d_tc%d' % (group_num, layer_num) 183 | layer = DepthwiseConv1DLayer(dw_input_shape, kernel_sizes[0], dilation_rates[0], layer_name) 184 | setattr(self, layer_name, layer) 185 | layer_name = 'bn_b2_g%d_tc%d' % (group_num, layer_num) 186 | layer = BatchNorm3d(n_channels_per_branch_out) 187 | layer._name = layer_name 188 | setattr(self, layer_name, layer) 189 | 190 | # branch 3: dimension reduction followed by depth-wise temp conv (kernel-size 5) 191 | layer_name = 'conv_b3_g%d_tc%d' % (group_num, layer_num) 192 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 193 | layer._name = layer_name 194 | setattr(self, layer_name, layer) 195 | layer_name = 'convdw_b3_g%d_tc%d' % (group_num, layer_num) 196 | layer = DepthwiseConv1DLayer(dw_input_shape, kernel_sizes[1], dilation_rates[1], layer_name) 197 | setattr(self, layer_name, layer) 198 | layer_name = 'bn_b3_g%d_tc%d' % (group_num, layer_num) 199 | layer = BatchNorm3d(n_channels_per_branch_out) 200 | layer._name = layer_name 201 | setattr(self, layer_name, layer) 202 | 203 | # branch 4: dimension reduction followed by depth-wise temp conv (kernel-size 7) 204 | layer_name = 'conv_b4_g%d_tc%d' % (group_num, layer_num) 205 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 206 | layer._name = layer_name 207 | setattr(self, layer_name, layer) 208 | layer_name = 'convdw_b4_g%d_tc%d' % (group_num, layer_num) 209 | layer = DepthwiseConv1DLayer(dw_input_shape, kernel_sizes[2], dilation_rates[2], layer_name) 210 | setattr(self, layer_name, layer) 211 | layer_name = 'bn_b4_g%d_tc%d' % (group_num, layer_num) 212 | layer = BatchNorm3d(n_channels_per_branch_out) 213 | layer._name = layer_name 214 | setattr(self, layer_name, layer) 215 | 216 | # branch 5: dimension reduction followed by temporal max pooling 217 | layer_name = 'conv_b5_g%d_tc%d' % (group_num, layer_num) 218 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1)) 219 | layer._name = layer_name 220 | setattr(self, layer_name, layer) 221 | layer_name = 'maxpool_b5_g%d_tc%d' % (group_num, layer_num) 222 | layer = MaxPool3d(kernel_size=(2, 1, 1), stride=(1, 1, 1)) 223 | layer._name = layer_name 224 | setattr(self, layer_name, layer) 225 | layer_name = 'padding_b5_g%d_tc%d' % (group_num, layer_num) 226 | layer = torch.nn.ReplicationPad3d((0, 0, 0, 0, 1, 0)) # left, right, top, bottom, front, back 227 | layer._name = layer_name 228 | setattr(self, layer_name, layer) 229 | layer_name = 'bn_b5_g%d_tc%d' % (group_num, layer_num) 230 | layer = 
BatchNorm3d(n_channels_per_branch_out) 231 | layer._name = layer_name 232 | setattr(self, layer_name, layer) 233 | 234 | def __call_timeception_layers(self, tensor, n_layers, n_groups, expansion_factor): 235 | input_shape = tensor.size() 236 | n_channels_in = input_shape[1] 237 | 238 | # how many layers of timeception 239 | for i in range(n_layers): 240 | layer_num = i + 1 241 | 242 | # get details about grouping 243 | n_channels_per_branch, n_channels_out = self.__get_n_channels_per_branch(n_groups, expansion_factor, n_channels_in) 244 | 245 | # temporal conv per group 246 | tensor = self.__call_grouped_convolutions(tensor, n_groups, layer_num) 247 | 248 | # downsample over time 249 | tensor = getattr(self, 'maxpool_tc%d' % (layer_num))(tensor) 250 | n_channels_in = n_channels_out 251 | 252 | return tensor 253 | 254 | def __call_grouped_convolutions(self, tensor_input, n_groups, layer_num): 255 | 256 | n_channels_in = tensor_input.size()[1] 257 | n_channels_per_group_in = int(n_channels_in / n_groups) 258 | 259 | # loop on groups 260 | t_outputs = [] 261 | for idx_group in range(n_groups): 262 | group_num = idx_group + 1 263 | 264 | # slice maps to get maps per group 265 | idx_start = idx_group * n_channels_per_group_in 266 | idx_end = (idx_group + 1) * n_channels_per_group_in 267 | tensor = tensor_input[:, idx_start:idx_end] 268 | 269 | tensor = self.__call_temporal_convolutional_block(tensor, layer_num, group_num) 270 | t_outputs.append(tensor) 271 | 272 | # concatenate channels of groups 273 | tensor = torch.cat(t_outputs, dim=1) 274 | # activation 275 | tensor = getattr(self, 'relu_tc%d' % (layer_num))(tensor) 276 | # shuffle channels 277 | tensor = getattr(self, 'shuffle_tc%d' % (layer_num))(tensor) 278 | 279 | return tensor 280 | 281 | def __call_temporal_convolutional_block(self, tensor, layer_num, group_num): 282 | """ 283 | Feedforward for 5 branches of convolutions that operate of channels of each group. 
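    Illustrative shape flow for one group (assuming the depthwise temporal convs keep
    the number of timesteps): an input slice of shape (batch, C_group_in, T, H, W) goes
    through the five branches, each producing (batch, n_channels_per_branch, T, H, W);
    concatenating along dim=1 yields (batch, 5 * n_channels_per_branch, T, H, W).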
284 | """ 285 | 286 | # branch 1: dimension reduction only and no temporal conv 287 | t_1 = getattr(self, 'conv_b1_g%d_tc%d' % (group_num, layer_num))(tensor) 288 | t_1 = getattr(self, 'bn_b1_g%d_tc%d' % (group_num, layer_num))(t_1) 289 | 290 | # branch 2: dimension reduction followed by depth-wise temp conv (kernel-size 3) 291 | t_2 = getattr(self, 'conv_b2_g%d_tc%d' % (group_num, layer_num))(tensor) 292 | t_2 = getattr(self, 'convdw_b2_g%d_tc%d' % (group_num, layer_num))(t_2) 293 | t_2 = getattr(self, 'bn_b2_g%d_tc%d' % (group_num, layer_num))(t_2) 294 | 295 | # branch 3: dimension reduction followed by depth-wise temp conv (kernel-size 5) 296 | t_3 = getattr(self, 'conv_b3_g%d_tc%d' % (group_num, layer_num))(tensor) 297 | t_3 = getattr(self, 'convdw_b3_g%d_tc%d' % (group_num, layer_num))(t_3) 298 | t_3 = getattr(self, 'bn_b3_g%d_tc%d' % (group_num, layer_num))(t_3) 299 | 300 | # branch 4: dimension reduction followed by depth-wise temp conv (kernel-size 7) 301 | t_4 = getattr(self, 'conv_b4_g%d_tc%d' % (group_num, layer_num))(tensor) 302 | t_4 = getattr(self, 'convdw_b4_g%d_tc%d' % (group_num, layer_num))(t_4) 303 | t_4 = getattr(self, 'bn_b4_g%d_tc%d' % (group_num, layer_num))(t_4) 304 | 305 | # branch 5: dimension reduction followed by temporal max pooling 306 | t_5 = getattr(self, 'conv_b5_g%d_tc%d' % (group_num, layer_num))(tensor) 307 | t_5 = getattr(self, 'maxpool_b5_g%d_tc%d' % (group_num, layer_num))(t_5) 308 | t_5 = getattr(self, 'padding_b5_g%d_tc%d' % (group_num, layer_num))(t_5) 309 | t_5 = getattr(self, 'bn_b5_g%d_tc%d' % (group_num, layer_num))(t_5) 310 | 311 | # concatenate channels of branches 312 | tensors = (t_1, t_2, t_3, t_4, t_5) 313 | tensor = torch.cat(tensors, dim=1) 314 | 315 | return tensor 316 | 317 | def __get_n_channels_per_branch(self, n_groups, expansion_factor, n_channels_in): 318 | n_branches = 5 319 | n_channels_per_branch = int(n_channels_in * expansion_factor / float(n_branches * n_groups)) 320 | n_channels_per_branch = int(n_channels_per_branch) 321 | n_channels_out = int(n_channels_per_branch * n_groups * n_branches) 322 | n_channels_out = int(n_channels_out) 323 | 324 | return n_channels_per_branch, n_channels_out 325 | 326 | # endregion 327 | -------------------------------------------------------------------------------- /nets/i3d_torch_charades.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | import numpy as np 7 | 8 | import os 9 | import sys 10 | from collections import OrderedDict 11 | 12 | class MaxPool3dSamePadding(nn.MaxPool3d): 13 | 14 | def compute_pad(self, dim, s): 15 | if s % self.stride[dim] == 0: 16 | return max(self.kernel_size[dim] - self.stride[dim], 0) 17 | else: 18 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 19 | 20 | def forward(self, x): 21 | # compute 'same' padding 22 | (batch, channel, t, h, w) = x.size() 23 | # print t,h,w 24 | out_t = np.ceil(float(t) / float(self.stride[0])) 25 | out_h = np.ceil(float(h) / float(self.stride[1])) 26 | out_w = np.ceil(float(w) / float(self.stride[2])) 27 | # print out_t, out_h, out_w 28 | pad_t = self.compute_pad(0, t) 29 | pad_h = self.compute_pad(1, h) 30 | pad_w = self.compute_pad(2, w) 31 | # print pad_t, pad_h, pad_w 32 | 33 | pad_t_f = pad_t // 2 34 | pad_t_b = pad_t - pad_t_f 35 | pad_h_f = pad_h // 2 36 | pad_h_b = pad_h - pad_h_f 37 | pad_w_f = pad_w // 2 38 | pad_w_b = pad_w - pad_w_f 39 | 40 | pad 
= (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 41 | # print x.size() 42 | # print pad 43 | x = F.pad(x, pad) 44 | return super(MaxPool3dSamePadding, self).forward(x) 45 | 46 | class Unit3D(nn.Module): 47 | 48 | def __init__(self, in_channels, 49 | output_channels, 50 | kernel_shape=(1, 1, 1), 51 | stride=(1, 1, 1), 52 | padding=0, 53 | activation_fn=F.relu, 54 | use_batch_norm=True, 55 | use_bias=False, 56 | name='unit_3d'): 57 | 58 | """Initializes Unit3D module.""" 59 | super(Unit3D, self).__init__() 60 | 61 | self._output_channels = output_channels 62 | self._kernel_shape = kernel_shape 63 | self._stride = stride 64 | self._use_batch_norm = use_batch_norm 65 | self._activation_fn = activation_fn 66 | self._use_bias = use_bias 67 | self.name = name 68 | self.padding = padding 69 | 70 | self.conv3d = nn.Conv3d(in_channels=in_channels, 71 | out_channels=self._output_channels, 72 | kernel_size=self._kernel_shape, 73 | stride=self._stride, 74 | padding=0, # we always want padding to be 0 here. We will dynamically pad based on input size in forward function 75 | bias=self._use_bias) 76 | 77 | if self._use_batch_norm: 78 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 79 | 80 | def compute_pad(self, dim, s): 81 | if s % self._stride[dim] == 0: 82 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 83 | else: 84 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 85 | 86 | def forward(self, x): 87 | # compute 'same' padding 88 | (batch, channel, t, h, w) = x.size() 89 | # print t,h,w 90 | out_t = np.ceil(float(t) / float(self._stride[0])) 91 | out_h = np.ceil(float(h) / float(self._stride[1])) 92 | out_w = np.ceil(float(w) / float(self._stride[2])) 93 | # print out_t, out_h, out_w 94 | pad_t = self.compute_pad(0, t) 95 | pad_h = self.compute_pad(1, h) 96 | pad_w = self.compute_pad(2, w) 97 | # print pad_t, pad_h, pad_w 98 | 99 | pad_t_f = pad_t // 2 100 | pad_t_b = pad_t - pad_t_f 101 | pad_h_f = pad_h // 2 102 | pad_h_b = pad_h - pad_h_f 103 | pad_w_f = pad_w // 2 104 | pad_w_b = pad_w - pad_w_f 105 | 106 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 107 | # print x.size() 108 | # print pad 109 | x = F.pad(x, pad) 110 | # print x.size() 111 | 112 | x = self.conv3d(x) 113 | if self._use_batch_norm: 114 | x = self.bn(x) 115 | if self._activation_fn is not None: 116 | x = self._activation_fn(x) 117 | return x 118 | 119 | class InceptionModule(nn.Module): 120 | def __init__(self, in_channels, out_channels, name): 121 | super(InceptionModule, self).__init__() 122 | 123 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, 124 | name=name + '/Branch_0/Conv3d_0a_1x1') 125 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, 126 | name=name + '/Branch_1/Conv3d_0a_1x1') 127 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], 128 | name=name + '/Branch_1/Conv3d_0b_3x3') 129 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, 130 | name=name + '/Branch_2/Conv3d_0a_1x1') 131 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], 132 | name=name + '/Branch_2/Conv3d_0b_3x3') 133 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], 134 | stride=(1, 1, 1), padding=0) 135 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], 
kernel_shape=[1, 1, 1], padding=0, 136 | name=name + '/Branch_3/Conv3d_0b_1x1') 137 | self.name = name 138 | 139 | def forward(self, x): 140 | b0 = self.b0(x) 141 | b1 = self.b1b(self.b1a(x)) 142 | b2 = self.b2b(self.b2a(x)) 143 | b3 = self.b3b(self.b3a(x)) 144 | return torch.cat([b0, b1, b2, b3], dim=1) 145 | 146 | class InceptionI3d(nn.Module): 147 | """Inception-v1 I3D architecture. 148 | The model is introduced in: 149 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 150 | Joao Carreira, Andrew Zisserman 151 | https://arxiv.org/pdf/1705.07750v1.pdf. 152 | See also the Inception architecture, introduced in: 153 | Going deeper with convolutions 154 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 155 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 156 | http://arxiv.org/pdf/1409.4842v1.pdf. 157 | """ 158 | 159 | # Endpoints of the model in order. During construction, all the endpoints up 160 | # to a designated `final_endpoint` are returned in a dictionary as the 161 | # second return value. 162 | VALID_ENDPOINTS = ( 163 | 'Conv3d_1a_7x7', 164 | 'MaxPool3d_2a_3x3', 165 | 'Conv3d_2b_1x1', 166 | 'Conv3d_2c_3x3', 167 | 'MaxPool3d_3a_3x3', 168 | 'Mixed_3b', 169 | 'Mixed_3c', 170 | 'MaxPool3d_4a_3x3', 171 | 'Mixed_4b', 172 | 'Mixed_4c', 173 | 'Mixed_4d', 174 | 'Mixed_4e', 175 | 'Mixed_4f', 176 | 'MaxPool3d_5a_2x2', 177 | 'Mixed_5b', 178 | 'Mixed_5c', 179 | 'Logits', 180 | 'Predictions', 181 | ) 182 | 183 | def __init__(self, num_classes=400, spatial_squeeze=True, final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5): 184 | """Initializes I3D model instance. 185 | Args: 186 | num_classes: The number of outputs in the logit layer (default 400, which 187 | matches the Kinetics dataset). 188 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits 189 | before returning (default True). 190 | final_endpoint: The model contains many possible endpoints. 191 | `final_endpoint` specifies the last endpoint for the model to be built 192 | up to. In addition to the output at `final_endpoint`, all the outputs 193 | at endpoints up to `final_endpoint` will also be returned, in a 194 | dictionary. `final_endpoint` must be one of 195 | InceptionI3d.VALID_ENDPOINTS (default 'Logits'). 196 | name: A string (optional). The name of this module. 197 | Raises: 198 | ValueError: if `final_endpoint` is not recognized. 
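        A minimal illustrative sketch: to run the backbone up to the 1024-channel
        'Mixed_5c' endpoint (the I3D feature map listed in
        core.utils.get_model_feat_maps_info), one could write
            i3d = InceptionI3d(final_endpoint='Mixed_5c', in_channels=3)
            feats = i3d(video)   # video: (batch, 3, n_frames, 224, 224)
        Alternatively, keep the default 'Logits' endpoint and call
        replace_logits(num_classes) to re-purpose the classifier for a new dataset.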
199 | """ 200 | 201 | if final_endpoint not in self.VALID_ENDPOINTS: 202 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 203 | 204 | super(InceptionI3d, self).__init__() 205 | self._num_classes = num_classes 206 | self._spatial_squeeze = spatial_squeeze 207 | self._final_endpoint = final_endpoint 208 | self.logits = None 209 | 210 | if self._final_endpoint not in self.VALID_ENDPOINTS: 211 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint) 212 | 213 | self.end_points = {} 214 | end_point = 'Conv3d_1a_7x7' 215 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], stride=(2, 2, 2), padding=(3, 3, 3), name=name + end_point) 216 | if self._final_endpoint == end_point: return 217 | 218 | end_point = 'MaxPool3d_2a_3x3' 219 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 220 | padding=0) 221 | if self._final_endpoint == end_point: return 222 | 223 | end_point = 'Conv3d_2b_1x1' 224 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, name=name + end_point) 225 | if self._final_endpoint == end_point: return 226 | 227 | end_point = 'Conv3d_2c_3x3' 228 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, name=name + end_point) 229 | if self._final_endpoint == end_point: return 230 | 231 | end_point = 'MaxPool3d_3a_3x3' 232 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0) 233 | if self._final_endpoint == end_point: return 234 | 235 | end_point = 'Mixed_3b' 236 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point) 237 | if self._final_endpoint == end_point: return 238 | 239 | end_point = 'Mixed_3c' 240 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point) 241 | if self._final_endpoint == end_point: return 242 | 243 | end_point = 'MaxPool3d_4a_3x3' 244 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), 245 | padding=0) 246 | if self._final_endpoint == end_point: return 247 | 248 | end_point = 'Mixed_4b' 249 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) 250 | if self._final_endpoint == end_point: return 251 | 252 | end_point = 'Mixed_4c' 253 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) 254 | if self._final_endpoint == end_point: return 255 | 256 | end_point = 'Mixed_4d' 257 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) 258 | if self._final_endpoint == end_point: return 259 | 260 | end_point = 'Mixed_4e' 261 | self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) 262 | if self._final_endpoint == end_point: return 263 | 264 | end_point = 'Mixed_4f' 265 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point) 266 | if self._final_endpoint == end_point: return 267 | 268 | end_point = 'MaxPool3d_5a_2x2' 269 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0) 270 | if self._final_endpoint == end_point: return 271 | 272 | end_point = 'Mixed_5b' 273 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, 
[256, 160, 320, 32, 128, 128], name + end_point) 274 | if self._final_endpoint == end_point: return 275 | 276 | end_point = 'Mixed_5c' 277 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point) 278 | if self._final_endpoint == end_point: return 279 | 280 | end_point = 'Logits' 281 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1)) 282 | self.dropout = nn.Dropout(dropout_keep_prob) 283 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits') 284 | 285 | self.build() 286 | 287 | def replace_logits(self, num_classes): 288 | self._num_classes = num_classes 289 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits') 290 | 291 | def build(self): 292 | for k in self.end_points.keys(): 293 | self.add_module(k, self.end_points[k]) 294 | 295 | # def forward(self, x): 296 | # for end_point in self.VALID_ENDPOINTS: 297 | # if end_point in self.end_points: 298 | # x = self._modules[end_point](x) # use _modules to work with dataparallel 299 | # 300 | # x = self.logits(self.dropout(self.avg_pool(x))) 301 | # if self._spatial_squeeze: 302 | # logits = x.squeeze(3).squeeze(3) 303 | # # logits is batch X time X classes, which is what we want to work with 304 | # return logits 305 | 306 | def forward(self, x): 307 | for end_point in self.VALID_ENDPOINTS: 308 | if end_point in self.end_points: 309 | x = self.end_points[end_point](x) # use _modules to work with dataparallel 310 | return x 311 | 312 | # for end_point in self.VALID_ENDPOINTS: 313 | # if end_point in self.end_points: 314 | # x = self._modules[end_point](x) # use _modules to work with dataparallel 315 | # return x 316 | 317 | def extract_features(self, x): 318 | for end_point in self.VALID_ENDPOINTS: 319 | if end_point in self.end_points: 320 | x = self._modules[end_point](x) 321 | return self.avg_pool(x) 322 | -------------------------------------------------------------------------------- /nets/i3d_torch_charades_test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | import numpy as np 7 | 8 | import os 9 | import sys 10 | from collections import OrderedDict 11 | 12 | class MaxPool3dSamePadding(nn.MaxPool3d): 13 | 14 | def compute_pad(self, dim, s): 15 | if s % self.stride[dim] == 0: 16 | return max(self.kernel_size[dim] - self.stride[dim], 0) 17 | else: 18 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 19 | 20 | def forward(self, x): 21 | # compute 'same' padding 22 | (batch, channel, t, h, w) = x.size() 23 | # print t,h,w 24 | out_t = np.ceil(float(t) / float(self.stride[0])) 25 | out_h = np.ceil(float(h) / float(self.stride[1])) 26 | out_w = np.ceil(float(w) / float(self.stride[2])) 27 | # print out_t, out_h, out_w 28 | pad_t = self.compute_pad(0, t) 29 | pad_h = self.compute_pad(1, h) 30 | pad_w = self.compute_pad(2, w) 31 | # print pad_t, pad_h, pad_w 32 | 33 | pad_t_f = pad_t // 2 34 | pad_t_b = pad_t - pad_t_f 35 | pad_h_f = pad_h // 2 36 | pad_h_b = pad_h - pad_h_f 37 | pad_w_f = pad_w // 2 38 | pad_w_b = pad_w - pad_w_f 39 | 40 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 41 | # 
print x.size() 42 | # print pad 43 | x = F.pad(x, pad) 44 | return super(MaxPool3dSamePadding, self).forward(x) 45 | 46 | class Unit3D(nn.Module): 47 | 48 | def __init__(self, in_channels, output_channels, kernel_shape=(1, 1, 1), stride=(1, 1, 1), padding=0, activation_fn=F.relu, use_batch_norm=True, use_bias=False, name='unit_3d'): 49 | 50 | """Initializes Unit3D module.""" 51 | super(Unit3D, self).__init__() 52 | 53 | self._output_channels = output_channels 54 | self._kernel_shape = kernel_shape 55 | self._stride = stride 56 | self._use_batch_norm = use_batch_norm 57 | self._activation_fn = activation_fn 58 | self._use_bias = use_bias 59 | self.name = name 60 | self.padding = padding 61 | 62 | # we always want padding to be 0 here. We will dynamically pad based on input size in forward function 63 | self.conv3d = nn.Conv3d(in_channels=in_channels, out_channels=self._output_channels, kernel_size=self._kernel_shape, stride=self._stride, padding=0, bias=self._use_bias) 64 | 65 | if self._use_batch_norm: 66 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 67 | 68 | def compute_pad(self, dim, s): 69 | if s % self._stride[dim] == 0: 70 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 71 | else: 72 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 73 | 74 | def forward(self, x): 75 | # compute 'same' padding 76 | (batch, channel, t, h, w) = x.size() 77 | # print t,h,w 78 | # out_t = np.ceil(float(t) / float(self._stride[0])) 79 | # out_h = np.ceil(float(h) / float(self._stride[1])) 80 | # out_w = np.ceil(float(w) / float(self._stride[2])) 81 | # print out_t, out_h, out_w 82 | pad_t = self.compute_pad(0, t) 83 | pad_h = self.compute_pad(1, h) 84 | pad_w = self.compute_pad(2, w) 85 | # print pad_t, pad_h, pad_w 86 | 87 | pad_t_f = pad_t // 2 88 | pad_t_b = pad_t - pad_t_f 89 | pad_h_f = pad_h // 2 90 | pad_h_b = pad_h - pad_h_f 91 | pad_w_f = pad_w // 2 92 | pad_w_b = pad_w - pad_w_f 93 | 94 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) 95 | # print x.size() 96 | # print pad 97 | x = F.pad(x, pad) 98 | # print x.size() 99 | 100 | x = self.conv3d(x) 101 | if self._use_batch_norm: 102 | x = self.bn(x) 103 | if self._activation_fn is not None: 104 | x = self._activation_fn(x) 105 | return x 106 | 107 | class InceptionModule(nn.Module): 108 | def __init__(self, in_channels, out_channels, name): 109 | super(InceptionModule, self).__init__() 110 | 111 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_0/Conv3d_0a_1x1') 112 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_1/Conv3d_0a_1x1') 113 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], name=name + '/Branch_1/Conv3d_0b_3x3') 114 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_2/Conv3d_0a_1x1') 115 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], name=name + '/Branch_2/Conv3d_0b_3x3') 116 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(1, 1, 1), padding=0) 117 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_3/Conv3d_0b_1x1') 118 | self.name = name 119 | 120 | def forward(self, x): 121 | b0 = self.b0(x) 122 | b1 = 
self.b1b(self.b1a(x)) 123 | b2 = self.b2b(self.b2a(x)) 124 | b3 = self.b3b(self.b3a(x)) 125 | return torch.cat([b0, b1, b2, b3], dim=1) 126 | 127 | class InceptionI3d(nn.Module): 128 | """Inception-v1 I3D architecture. 129 | The model is introduced in: 130 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 131 | Joao Carreira, Andrew Zisserman 132 | https://arxiv.org/pdf/1705.07750v1.pdf. 133 | See also the Inception architecture, introduced in: 134 | Going deeper with convolutions 135 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 136 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 137 | http://arxiv.org/pdf/1409.4842v1.pdf. 138 | """ 139 | 140 | # Endpoints of the model in order. During construction, all the endpoints up 141 | # to a designated `final_endpoint` are returned in a dictionary as the 142 | # second return value. 143 | VALID_ENDPOINTS = ( 144 | 'Conv3d_1a_7x7', 145 | 'MaxPool3d_2a_3x3', 146 | 'Conv3d_2b_1x1', 147 | 'Conv3d_2c_3x3', 148 | 'MaxPool3d_3a_3x3', 149 | 'Mixed_3b', 150 | 'Mixed_3c', 151 | 'MaxPool3d_4a_3x3', 152 | 'Mixed_4b', 153 | 'Mixed_4c', 154 | 'Mixed_4d', 155 | 'Mixed_4e', 156 | 'Mixed_4f', 157 | 'MaxPool3d_5a_2x2', 158 | 'Mixed_5b', 159 | 'Mixed_5c', 160 | 'Logits', 161 | 'Predictions', 162 | ) 163 | 164 | def __init__(self, num_classes=400, spatial_squeeze=True, final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5): 165 | """Initializes I3D model instance. 166 | Args: 167 | num_classes: The number of outputs in the logit layer (default 400, which 168 | matches the Kinetics dataset). 169 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits 170 | before returning (default True). 171 | final_endpoint: The model contains many possible endpoints. 172 | `final_endpoint` specifies the last endpoint for the model to be built 173 | up to. In addition to the output at `final_endpoint`, all the outputs 174 | at endpoints up to `final_endpoint` will also be returned, in a 175 | dictionary. `final_endpoint` must be one of 176 | InceptionI3d.VALID_ENDPOINTS (default 'Logits'). 177 | name: A string (optional). The name of this module. 178 | Raises: 179 | ValueError: if `final_endpoint` is not recognized. 
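        Note for this test-time variant: every endpoint constructed below is passed
        through __freeze_layer, which sets requires_grad=False on its parameters, so
        the backbone acts as a fixed feature extractor. An illustrative check, where
        `model` is an instance of this class:
            frozen = not any(p.requires_grad for p in model.parameters())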
180 | """ 181 | 182 | if final_endpoint not in self.VALID_ENDPOINTS: 183 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 184 | 185 | super(InceptionI3d, self).__init__() 186 | self._num_classes = num_classes 187 | self._spatial_squeeze = spatial_squeeze 188 | self._final_endpoint = final_endpoint 189 | self.logits = None 190 | 191 | if self._final_endpoint not in self.VALID_ENDPOINTS: 192 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint) 193 | 194 | self.end_points = {} 195 | end_point = 'Conv3d_1a_7x7' 196 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], stride=(2, 2, 2), padding=(3, 3, 3), name=name + end_point) 197 | self.__freeze_layer(self.end_points[end_point]) 198 | if self._final_endpoint == end_point: 199 | return 200 | 201 | end_point = 'MaxPool3d_2a_3x3' 202 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0) 203 | self.__freeze_layer(self.end_points[end_point]) 204 | if self._final_endpoint == end_point: 205 | return 206 | 207 | end_point = 'Conv3d_2b_1x1' 208 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, name=name + end_point) 209 | self.__freeze_layer(self.end_points[end_point]) 210 | if self._final_endpoint == end_point: 211 | return 212 | 213 | end_point = 'Conv3d_2c_3x3' 214 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, name=name + end_point) 215 | self.__freeze_layer(self.end_points[end_point]) 216 | if self._final_endpoint == end_point: 217 | return 218 | 219 | end_point = 'MaxPool3d_3a_3x3' 220 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0) 221 | self.__freeze_layer(self.end_points[end_point]) 222 | if self._final_endpoint == end_point: 223 | return 224 | 225 | end_point = 'Mixed_3b' 226 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point) 227 | self.__freeze_layer(self.end_points[end_point]) 228 | if self._final_endpoint == end_point: 229 | return 230 | 231 | end_point = 'Mixed_3c' 232 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point) 233 | self.__freeze_layer(self.end_points[end_point]) 234 | if self._final_endpoint == end_point: 235 | return 236 | 237 | end_point = 'MaxPool3d_4a_3x3' 238 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0) 239 | self.__freeze_layer(self.end_points[end_point]) 240 | if self._final_endpoint == end_point: 241 | return 242 | 243 | end_point = 'Mixed_4b' 244 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) 245 | self.__freeze_layer(self.end_points[end_point]) 246 | if self._final_endpoint == end_point: 247 | return 248 | 249 | end_point = 'Mixed_4c' 250 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) 251 | self.__freeze_layer(self.end_points[end_point]) 252 | if self._final_endpoint == end_point: 253 | return 254 | 255 | end_point = 'Mixed_4d' 256 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) 257 | self.__freeze_layer(self.end_points[end_point]) 258 | if self._final_endpoint == end_point: 259 | return 260 | 261 | end_point = 'Mixed_4e' 262 | self.end_points[end_point] = 
InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) 263 | self.__freeze_layer(self.end_points[end_point]) 264 | if self._final_endpoint == end_point: 265 | return 266 | 267 | end_point = 'Mixed_4f' 268 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point) 269 | self.__freeze_layer(self.end_points[end_point]) 270 | if self._final_endpoint == end_point: 271 | return 272 | 273 | end_point = 'MaxPool3d_5a_2x2' 274 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0) 275 | self.__freeze_layer(self.end_points[end_point]) 276 | if self._final_endpoint == end_point: 277 | return 278 | 279 | end_point = 'Mixed_5b' 280 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], name + end_point) 281 | self.__freeze_layer(self.end_points[end_point]) 282 | if self._final_endpoint == end_point: 283 | return 284 | 285 | end_point = 'Mixed_5c' 286 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point) 287 | self.__freeze_layer(self.end_points[end_point]) 288 | if self._final_endpoint == end_point: 289 | return 290 | 291 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1)) 292 | self.__freeze_layer(self.avg_pool) 293 | self.dropout = nn.Dropout(dropout_keep_prob) 294 | self.__freeze_layer(self.dropout) 295 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits') 296 | self.__freeze_layer(self.logits) 297 | 298 | self.build() 299 | 300 | def replace_logits(self, num_classes): 301 | self._num_classes = num_classes 302 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits') 303 | pass 304 | 305 | def build(self): 306 | for k in self.end_points.keys(): 307 | self.add_module(k, self.end_points[k]) 308 | 309 | def forward(self, x): 310 | for end_point in self.VALID_ENDPOINTS: 311 | if end_point in self.end_points: 312 | # use _modules to work with dataparallel 313 | # x = self.end_points[end_point](x) 314 | x = self._modules[end_point](x) 315 | return x 316 | 317 | def extract_features(self, x): 318 | for end_point in self.VALID_ENDPOINTS: 319 | if end_point in self.end_points: 320 | x = self._modules[end_point](x) 321 | return self.avg_pool(x) 322 | 323 | def __freeze_layer(self, layer): 324 | layer_params = layer.parameters() 325 | for param in layer_params: 326 | param.requires_grad = False 327 | -------------------------------------------------------------------------------- /nets/resnet_152_keras.py: -------------------------------------------------------------------------------- 1 | import os 2 | import keras.backend as K 3 | 4 | from keras import initializers 5 | from keras.layers import Input 6 | from keras.layers import Dense 7 | from keras.layers import Conv2D 8 | from keras.layers import MaxPooling2D 9 | from keras.layers import AveragePooling2D 10 | from keras.layers import ZeroPadding2D 11 | from keras.layers import Flatten 12 | from keras.layers import Activation 13 | from keras.layers import add 14 | from keras.layers import BatchNormalization 15 | from keras.layers import GlobalAveragePooling2D 16 | from keras.layers import GlobalMaxPooling2D 17 | 
18 | from keras.models import Model 19 | from keras.engine import Layer, InputSpec 20 | from keras.engine import get_source_inputs 21 | 22 | from keras.utils.data_utils import get_file 23 | from keras.applications.imagenet_utils import imagenet_utils 24 | 25 | from core import const as c 26 | 27 | # WEIGHTS_PATH = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels.h5' 28 | # WEIGHTS_PATH_NO_TOP = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5' 29 | WEIGHTS_PATH = '%s/keras_models/resnet_152/resnet152_weights_tf_dim_ordering_tf_kernels.h5' % (c.DATA_ROOT_PATH) 30 | WEIGHTS_PATH_NO_TOP = '%s/keras_models/resnet_152/resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5' % (c.DATA_ROOT_PATH) 31 | 32 | class Scale(Layer): 33 | """ Custom Layer for ResNet used for BatchNormalization. 34 | 35 | Learns a set of weights and biases used for scaling the input data. 36 | the output consists simply in an element-wise multiplication of the input 37 | and a sum of a set of constants: 38 | out = in * gamma + beta, 39 | where 'gamma' and 'beta' are the weights and biases larned. 40 | # Arguments 41 | axis: integer, axis along which to normalize in mode 0. For instance, 42 | if your input tensor has shape (samples, channels, rows, cols), 43 | set axis to 1 to normalize per feature map (channels axis). 44 | momentum: momentum in the computation of the 45 | exponential average of the mean and standard deviation 46 | of the data, for feature-wise normalization. 47 | weights: Initialization weights. 48 | List of 2 Numpy arrays, with shapes: 49 | `[(input_shape,), (input_shape,)]` 50 | beta_init: name of initialization function for shift parameter 51 | (see [initializers](../initializers.md)), or alternatively, 52 | Theano/TensorFlow function to use for weights initialization. 53 | This parameter is only relevant if you don't pass a `weights` argument. 54 | gamma_init: name of initialization function for scale parameter (see 55 | [initializers](../initializers.md)), or alternatively, 56 | Theano/TensorFlow function to use for weights initialization. 57 | This parameter is only relevant if you don't pass a `weights` argument. 
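        As a small illustrative example: for an input of shape (samples, rows, cols, channels)
        and axis=-1, the layer reshapes `gamma` and `beta` to (1, 1, 1, channels) and returns
        x * gamma + beta, i.e. one learned scale and one learned shift per channel.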
58 | """ 59 | 60 | def __init__(self, weights=None, axis=-1, momentum=0.9, beta_init='zero', gamma_init='one', **kwargs): 61 | self.momentum = momentum 62 | self.axis = axis 63 | self.beta_init = initializers.get(beta_init) 64 | self.gamma_init = initializers.get(gamma_init) 65 | self.initial_weights = weights 66 | super(Scale, self).__init__(**kwargs) 67 | 68 | def build(self, input_shape): 69 | self.input_spec = [InputSpec(shape=input_shape)] 70 | shape = (int(input_shape[self.axis]),) 71 | 72 | self.gamma = K.variable(self.gamma_init(shape), name='%s_gamma' % self.name) 73 | self.beta = K.variable(self.beta_init(shape), name='%s_beta' % self.name) 74 | self.trainable_weights = [self.gamma, self.beta] 75 | 76 | if self.initial_weights is not None: 77 | self.set_weights(self.initial_weights) 78 | del self.initial_weights 79 | 80 | def call(self, x, mask=None): 81 | input_shape = self.input_spec[0].shape 82 | broadcast_shape = [1] * len(input_shape) 83 | broadcast_shape[self.axis] = input_shape[self.axis] 84 | 85 | out = K.reshape(self.gamma, broadcast_shape) * x + K.reshape(self.beta, broadcast_shape) 86 | return out 87 | 88 | def get_config(self): 89 | config = {"momentum": self.momentum, "axis": self.axis} 90 | base_config = super(Scale, self).get_config() 91 | return dict(list(base_config.items()) + list(config.items())) 92 | 93 | def identity_block(input_tensor, kernel_size, filters, stage, block): 94 | """ 95 | The identity_block is the block that has no conv layer at shortcut 96 | # Arguments 97 | input_tensor: input tensor 98 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 99 | filters: list of integers, the nb_filters of 3 conv layer at main path 100 | stage: integer, current stage label, used for generating layer names 101 | block: 'a','b'..., current block label, used for generating layer names 102 | """ 103 | eps = 1.1e-5 104 | nb_filter1, nb_filter2, nb_filter3 = filters 105 | conv_name_base = 'res' + str(stage) + block + '_branch' 106 | bn_name_base = 'bn' + str(stage) + block + '_branch' 107 | scale_name_base = 'scale' + str(stage) + block + '_branch' 108 | 109 | if K.image_data_format() == 'channels_last': 110 | bn_axis = 3 111 | else: 112 | bn_axis = 1 113 | 114 | x = Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', use_bias=False)(input_tensor) 115 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x) 116 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x) 117 | x = Activation('relu', name=conv_name_base + '2a_relu')(x) 118 | 119 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x) 120 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), name=conv_name_base + '2b', use_bias=False)(x) 121 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x) 122 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x) 123 | x = Activation('relu', name=conv_name_base + '2b_relu')(x) 124 | 125 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x) 126 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x) 127 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x) 128 | 129 | x = add([x, input_tensor], name='res' + str(stage) + block) 130 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x) 131 | return x 132 | 133 | def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)): 134 | """ conv_block is the block that has a conv layer at shortcut 135 | # Arguments 136 | input_tensor: input 
tensor 137 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 138 | filters: list of integers, the nb_filters of 3 conv layer at main path 139 | stage: integer, current stage label, used for generating layer names 140 | block: 'a','b'..., current block label, used for generating layer names 141 | Note that from stage 3, the first conv layer at main path is with subsample=(2,2) 142 | And the shortcut should have subsample=(2,2) as well 143 | """ 144 | 145 | eps = 1.1e-5 146 | nb_filter1, nb_filter2, nb_filter3 = filters 147 | conv_name_base = 'res' + str(stage) + block + '_branch' 148 | bn_name_base = 'bn' + str(stage) + block + '_branch' 149 | scale_name_base = 'scale' + str(stage) + block + '_branch' 150 | 151 | if K.image_data_format() == 'channels_last': 152 | bn_axis = 3 153 | else: 154 | bn_axis = 1 155 | 156 | x = Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', use_bias=False)(input_tensor) 157 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x) 158 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x) 159 | x = Activation('relu', name=conv_name_base + '2a_relu')(x) 160 | 161 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x) 162 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), 163 | name=conv_name_base + '2b', use_bias=False)(x) 164 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x) 165 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x) 166 | x = Activation('relu', name=conv_name_base + '2b_relu')(x) 167 | 168 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x) 169 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x) 170 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x) 171 | 172 | shortcut = Conv2D(nb_filter3, (1, 1), strides=strides, 173 | name=conv_name_base + '1', use_bias=False)(input_tensor) 174 | shortcut = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '1')(shortcut) 175 | shortcut = Scale(axis=bn_axis, name=scale_name_base + '1')(shortcut) 176 | 177 | x = add([x, shortcut], name='res' + str(stage) + block) 178 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x) 179 | return x 180 | 181 | def ResNet152(include_top=True, weights='imagenet', input_tensor=None, input_shape=None, pooling=None, classes=1000): 182 | """ Instantiates the ResNet152 architecture. 183 | Optionally loads weights pre-trained 184 | on ImageNet. Note that when using TensorFlow, 185 | for best performance you should set 186 | `image_data_format='channels_last'` in your Keras config 187 | at ~/.keras/keras.json. 188 | The model and the weights are compatible only with 189 | TensorFlow. The data format 190 | convention used by the model is the one 191 | specified in your Keras config file. 192 | # Arguments 193 | include_top: whether to include the fully-connected 194 | layer at the top of the network. 195 | weights: one of `None` (random initialization), 196 | 'imagenet' (pre-training on ImageNet), 197 | or the path to the weights file to be loaded. 198 | input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) 199 | to use as image input for the model. 200 | input_shape: optional shape tuple, only to be specified 201 | if `include_top` is False (otherwise the input shape 202 | has to be `(224, 224, 3)` (with `channels_last` data format) 203 | or `(3, 224, 224)` (with `channels_first` data format). 
204 | It should have exactly 3 inputs channels, 205 | and width and height should be no smaller than 197. 206 | E.g. `(200, 200, 3)` would be one valid value. 207 | pooling: Optional pooling mode for feature extraction 208 | when `include_top` is `False`. 209 | - `None` means that the output of the model will be 210 | the 4D tensor output of the 211 | last convolutional layer. 212 | - `avg` means that global average pooling 213 | will be applied to the output of the 214 | last convolutional layer, and thus 215 | the output of the model will be a 2D tensor. 216 | - `max` means that global max pooling will 217 | be applied. 218 | classes: optional number of classes to classify images 219 | into, only to be specified if `include_top` is True, and 220 | if no `weights` argument is specified. 221 | # Returns 222 | A Keras model instance. 223 | # Raises 224 | ValueError: in case of invalid argument for `weights`, 225 | or invalid input shape. 226 | """ 227 | 228 | eps = 1.1e-5 229 | 230 | if not (weights in {'imagenet', None} or os.path.exists(weights)): 231 | raise ValueError('The `weights` argument should be either ' 232 | '`None` (random initialization), `imagenet` ' 233 | '(pre-training on ImageNet), ' 234 | 'or the path to the weights file to be loaded.') 235 | 236 | if weights == 'imagenet' and include_top and classes != 1000: 237 | raise ValueError('If using `weights` as imagenet with `include_top`' 238 | ' as true, `classes` should be 1000') 239 | 240 | # Determine proper input shape 241 | input_shape = imagenet_utils._obtain_input_shape(input_shape, 242 | default_size=224, 243 | min_size=197, 244 | data_format=K.image_data_format(), 245 | require_flatten=include_top, 246 | weights=weights) 247 | 248 | if input_tensor is None: 249 | img_input = Input(shape=input_shape) 250 | else: 251 | if not K.is_keras_tensor(input_tensor): 252 | img_input = Input(tensor=input_tensor, shape=input_shape, name='data') 253 | else: 254 | img_input = input_tensor 255 | 256 | # Handle dimension ordering for different backends 257 | if K.image_dim_ordering() == 'tf': 258 | bn_axis = 3 259 | else: 260 | bn_axis = 1 261 | 262 | x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input) 263 | x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=False)(x) 264 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name='bn_conv1')(x) 265 | x = Scale(axis=bn_axis, name='scale_conv1')(x) 266 | x = Activation('relu', name='conv1_relu')(x) 267 | x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1', padding='same')(x) 268 | 269 | x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) 270 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') 271 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') 272 | 273 | x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') 274 | for i in range(1, 8): 275 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='b' + str(i)) 276 | 277 | x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') 278 | for i in range(1, 36): 279 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b' + str(i)) 280 | 281 | x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') 282 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b') 283 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') 284 | 285 | if include_top: 286 | # Classification block 287 | x = AveragePooling2D((7, 7), name='avg_pool')(x) 288 | x = Flatten()(x) 289 | x = Dense(classes, activation='softmax', name='fc1000')(x) 290 | else: 291 | if pooling == 
'avg': 292 | x = GlobalAveragePooling2D()(x) 293 | elif pooling == 'max': 294 | x = GlobalMaxPooling2D()(x) 295 | 296 | # Ensure that the model takes into account 297 | # any potential predecessors of `input_tensor`. 298 | if input_tensor is not None: 299 | inputs = get_source_inputs(input_tensor) 300 | else: 301 | inputs = img_input 302 | 303 | # Create model 304 | model = Model(inputs, x, name='resnet152') 305 | 306 | # Load weights 307 | if weights == 'imagenet': 308 | if include_top: 309 | weights_path = WEIGHTS_PATH 310 | else: 311 | weights_path = WEIGHTS_PATH_NO_TOP 312 | model.load_weights(weights_path) 313 | 314 | elif weights is not None: 315 | model.load_weights(weights) 316 | 317 | return model 318 | -------------------------------------------------------------------------------- /core/image_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | ######################################################################## 5 | # GNU General Public License v3.0 6 | # GNU GPLv3 7 | # Copyright (c) 2019, Noureldien Hussein 8 | # 9 | # This program is free software: you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation, either version 3 of the License, or 12 | # (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | ######################################################################## 22 | 23 | """ 24 | Helper functions for images. 25 | """ 26 | 27 | from __future__ import absolute_import 28 | from __future__ import division 29 | from __future__ import print_function 30 | from __future__ import unicode_literals 31 | 32 | import cv2 33 | import numpy as np 34 | import random 35 | import math 36 | from multiprocessing.dummy import Pool 37 | 38 | from core import utils 39 | 40 | # region Frame Resizing 41 | 42 | def resize_frame(image, target_height=224, target_width=224): 43 | return __resize_frame(image, target_height, target_width) 44 | 45 | def resize_keep_aspect_ratio_max_dim(image, max_dim=None): 46 | return __resize_keep_aspect_ratio_max_dim(image, max_dim) 47 | 48 | def resize_keep_aspect_ratio_min_dim(image, min_dim=None): 49 | return __resize_keep_aspect_ratio_min_dim(image, min_dim) 50 | 51 | def resize_crop(image, target_height=224, target_width=224): 52 | return __resize_crop(image, target_height, target_width) 53 | 54 | def resize_crop_scaled(image, target_height=224, target_width=224): 55 | return __resize_crop_scaled(image, target_height, target_width) 56 | 57 | def resize_keep_aspect_ratio_padded(image, target_height=224, target_width=224): 58 | return __resize_keep_aspect_ratio_padded(image, target_height, target_width) 59 | 60 | def __resize_frame(image, target_height=224, target_width=224): 61 | """ 62 | Resize to the given dimensions. Don't care about maintaining the aspect ratio of the given image. 
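    Note: cv2.resize expects dsize as (width, height); the call below passes
    (target_height, target_width), which matches the intended output only when the two
    targets are equal, as with the default 224 x 224.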
63 | """ 64 | if len(image.shape) == 2: 65 | image = np.tile(image[:, :, None], 3) 66 | elif len(image.shape) == 4: 67 | image = image[:, :, :, 0] 68 | 69 | resized_image = cv2.resize(image, dsize=(target_height, target_width)) 70 | return resized_image 71 | 72 | def __resize_keep_aspect_ratio_max_dim(image, max_dim=224): 73 | """ 74 | Resize the given image while maintaining the aspect ratio. 75 | """ 76 | if len(image.shape) == 2: 77 | image = np.tile(image[:, :, None], 3) 78 | elif len(image.shape) == 4: 79 | image = image[:, :, :, 0] 80 | 81 | height = image.shape[0] 82 | width = image.shape[1] 83 | 84 | if height > width: 85 | target_height = max_dim 86 | target_width = int(target_height * width / float(height)) 87 | else: 88 | target_width = max_dim 89 | target_height = int(target_width * height / float(width)) 90 | 91 | resized_image = cv2.resize(image, dsize=(target_width, target_height)) 92 | return resized_image 93 | 94 | def __resize_keep_aspect_ratio_min_dim(image, min_dim=224): 95 | """ 96 | Resize the given image while maintaining the aspect ratio. 97 | """ 98 | if len(image.shape) == 2: 99 | image = np.tile(image[:, :, None], 3) 100 | elif len(image.shape) == 4: 101 | image = image[:, :, :, 0] 102 | 103 | height = image.shape[0] 104 | width = image.shape[1] 105 | 106 | if height > width: 107 | target_width = min_dim 108 | target_height = int(target_width * height / float(width)) 109 | else: 110 | target_height = min_dim 111 | target_width = int(target_height * width / float(height)) 112 | 113 | resized_image = cv2.resize(image, dsize=(target_width, target_height)) 114 | return resized_image 115 | 116 | def __resize_crop(image, target_height=224, target_width=224): 117 | if len(image.shape) == 2: 118 | image = np.tile(image[:, :, None], 3) 119 | elif len(image.shape) == 4: 120 | image = image[:, :, :, 0] 121 | 122 | height, width, rgb = image.shape 123 | if width == height: 124 | resized_image = cv2.resize(image, (target_height, target_width)) 125 | 126 | elif height < width: 127 | resized_image = cv2.resize(image, (int(width * float(target_height) / height), target_width)) 128 | cropping_length = int((resized_image.shape[1] - target_height) / 2) 129 | resized_image = resized_image[:, cropping_length:resized_image.shape[1] - cropping_length] 130 | 131 | else: 132 | resized_image = cv2.resize(image, (target_height, int(height * float(target_width) / width))) 133 | cropping_length = int((resized_image.shape[0] - target_width) / 2) 134 | resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length, :] 135 | 136 | resized_image = cv2.resize(resized_image, (target_height, target_width)) 137 | return resized_image 138 | 139 | def __resize_crop_scaled(image, target_height=224, target_width=224): 140 | # re-scale the image by ratio 3/4 so a landscape or portrait image becomes square 141 | # then resize_crop it 142 | 143 | # for example, if input image is (height*width) is 400*1000 it will be (400 * 1000 * 3/4) = 400 * 750 144 | 145 | if len(image.shape) == 2: 146 | image = np.tile(image[:, :, None], 3) 147 | elif len(image.shape) == 4: 148 | image = image[:, :, :, 0] 149 | 150 | height, width, _ = image.shape 151 | if width == height: 152 | resized_image = cv2.resize(image, (target_height, target_width)) 153 | else: 154 | 155 | # first, rescale it, only if the rescale won't bring the scaled dimention to lower than target_dim (= 224) 156 | scale_factor = 3 / 4.0 157 | if height < width: 158 | new_width = int(width * scale_factor) 159 | if new_width >= 
target_width: 160 | image = cv2.resize(image, (new_width, height)) 161 | else: 162 | new_height = int(height * scale_factor) 163 | if new_height >= target_height: 164 | image = cv2.resize(image, (width, new_height)) 165 | 166 | # now, resize and crop 167 | height, width, _ = image.shape 168 | if height < width: 169 | resized_image = cv2.resize(image, (int(width * float(target_height) / height), target_width)) 170 | cropping_length = int((resized_image.shape[1] - target_height) / 2) 171 | resized_image = resized_image[:, cropping_length:resized_image.shape[1] - cropping_length] 172 | 173 | else: 174 | resized_image = cv2.resize(image, (target_height, int(height * float(target_width) / width))) 175 | cropping_length = int((resized_image.shape[0] - target_width) / 2) 176 | resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length, :] 177 | 178 | # this line is important, because sometimes the cropping there is a 1 pixel more 179 | height, width, _ = resized_image.shape 180 | if height > target_height or width > target_width: 181 | resized_image = cv2.resize(resized_image, (target_height, target_width)) 182 | 183 | return resized_image 184 | 185 | def __resize_keep_aspect_ratio_padded(image, target_height=224, target_width=224): 186 | """ 187 | Resize the frame while keeping aspect ratio. Also, to result in an image with the given dimensions, the resized image is zero-padded. 188 | """ 189 | 190 | if len(image.shape) == 2: 191 | image = np.tile(image[:, :, None], 3) 192 | elif len(image.shape) == 4: 193 | image = image[:, :, :, 0] 194 | 195 | original_height, original_width, _ = image.shape 196 | original_aspect_ratio = original_height / float(original_width) 197 | target_aspect_ratio = target_height / float(target_width) 198 | 199 | if target_aspect_ratio >= original_aspect_ratio: 200 | if original_width >= original_height: 201 | max_dim = target_width 202 | else: 203 | max_dim = int(original_height * target_width / float(original_width)) 204 | else: 205 | if original_height >= original_width: 206 | max_dim = target_height 207 | else: 208 | max_dim = int(original_width * target_height / float(original_height)) 209 | 210 | image = __resize_keep_aspect_ratio_max_dim(image, max_dim=max_dim) 211 | 212 | new_height, new_width, _ = image.shape 213 | new_aspect_ratio = new_height / float(new_width) 214 | 215 | # do zero-padding for the image (vertical or horizontal) 216 | img_padded = np.zeros((target_height, target_width, 3), dtype=image.dtype) 217 | 218 | if target_aspect_ratio < new_aspect_ratio: 219 | # horizontal padding 220 | y1 = 0 221 | y2 = new_height 222 | x1 = int((target_width - new_width) / 2.0) 223 | x2 = x1 + new_width 224 | else: 225 | # vertical padding 226 | x1 = 0 227 | x2 = new_width 228 | y1 = int((target_height - new_height) / 2.0) 229 | y2 = y1 + new_height 230 | 231 | img_padded[y1:y2, x1:x2, :] = image 232 | return img_padded 233 | 234 | # endregion 235 | 236 | # region Image Reader ResNet-152 Keras 237 | 238 | class AsyncImageReaderResNet152Keras(): 239 | def __init__(self, bgr_mean, n_threads=20): 240 | random.seed(101) 241 | np.random.seed(101) 242 | 243 | self.__is_busy = False 244 | self.__images = None 245 | self.__n_channels = 3 246 | self.__img_dim = 224 247 | self.__bgr_mean = bgr_mean 248 | 249 | self.__n_threads_in_pool = n_threads 250 | self.__pool = Pool(self.__n_threads_in_pool) 251 | 252 | def load_imgs_in_batch(self, image_pathes): 253 | self.__is_busy = True 254 | 255 | n_pathes = len(image_pathes) 256 | idxces = np.arange(0, 
n_pathes) 257 | 258 | # parameters passed to the reading function 259 | params = [data_item for data_item in zip(idxces, image_pathes)] 260 | 261 | # set list of images before start reading 262 | imgs_shape = (n_pathes, self.__img_dim, self.__img_dim, self.__n_channels) 263 | self.__images = np.zeros(imgs_shape, dtype=np.float32) 264 | 265 | # start pool of threads 266 | self.__pool.map_async(self.__preprocess_img_wrapper, params, callback=self.__thread_pool_callback) 267 | 268 | def get_images(self): 269 | if self.__is_busy: 270 | raise Exception('Sorry, you can\'t get images while threads are running!') 271 | else: 272 | return self.__images 273 | 274 | def is_busy(self): 275 | return self.__is_busy 276 | 277 | def __thread_pool_callback(self, args): 278 | self.__is_busy = False 279 | 280 | def __preprocess_img_wrapper(self, params): 281 | try: 282 | self.__preprocess_img(params) 283 | except Exception as exp: 284 | print ('Error in __preprocess_img') 285 | print (exp) 286 | 287 | def __preprocess_img(self, params): 288 | 289 | idx = params[0] 290 | path = params[1] 291 | 292 | img = cv2.imread(path) 293 | img = img.astype(np.float32) 294 | 295 | # subtract mean pixel from image 296 | img[:, :, 0] -= self.__bgr_mean[0] 297 | img[:, :, 1] -= self.__bgr_mean[1] 298 | img[:, :, 2] -= self.__bgr_mean[2] 299 | 300 | # convert from bgr to rgb 301 | img = img[:, :, (2, 1, 0)] 302 | 303 | self.__images[idx] = img 304 | 305 | def close(self): 306 | self.__pool.close() 307 | self.__pool.terminate() 308 | 309 | # endregion 310 | 311 | # region Image/Video Readers MultiTHUMOS 312 | 313 | class AsyncImageReaderMultiTHUMOSForI3DKerasModel(): 314 | def __init__(self, n_threads=20): 315 | random.seed(101) 316 | np.random.seed(101) 317 | 318 | self.__is_busy = False 319 | self.__images = None 320 | self.__n_channels = 3 321 | self.__img_dim = 224 322 | 323 | self.__n_threads_in_pool = n_threads 324 | self.__pool = Pool(self.__n_threads_in_pool) 325 | 326 | def load_imgs_in_batch(self, image_pathes): 327 | self.__is_busy = True 328 | 329 | n_pathes = len(image_pathes) 330 | idxces = np.arange(0, n_pathes) 331 | 332 | # parameters passed to the reading function 333 | params = [data_item for data_item in zip(idxces, image_pathes)] 334 | 335 | # set list of images before start reading 336 | imgs_shape = (n_pathes, self.__img_dim, self.__img_dim, self.__n_channels) 337 | self.__images = np.zeros(imgs_shape, dtype=np.float32) 338 | 339 | # start pool of threads 340 | self.__pool.map_async(self.__preprocess_img_wrapper, params, callback=self.__thread_pool_callback) 341 | 342 | def get_images(self): 343 | if self.__is_busy: 344 | raise Exception('Sorry, you can\'t get images while threads are running!') 345 | else: 346 | return self.__images 347 | 348 | def is_busy(self): 349 | return self.__is_busy 350 | 351 | def __thread_pool_callback(self, args): 352 | self.__is_busy = False 353 | 354 | def __preprocess_img_wrapper(self, params): 355 | try: 356 | self.__preprocess_img(params) 357 | except Exception as exp: 358 | print ('Error in __preprocess_img') 359 | print (exp) 360 | 361 | def __preprocess_img(self, params): 362 | 363 | idx = params[0] 364 | path = params[1] 365 | 366 | img = cv2.imread(path) 367 | img = img.astype(np.float32) 368 | # normalize such that values range from -1 to 1 369 | img /= float(127.5) 370 | img -= 1.0 371 | # convert from bgr to rgb 372 | img = img[:, :, (2, 1, 0)] 373 | 374 | self.__images[idx] = img 375 | 376 | def close(self): 377 | self.__pool.close() 378 | 
        self.__pool.terminate()
379 | 
380 | # endregion
381 | 
382 | # region Image/Video Readers Breakfast
383 | 
384 | class AsyncImageReaderBreakfastForI3DKerasModel():
385 |     def __init__(self, n_threads=20):
386 |         random.seed(101)
387 |         np.random.seed(101)
388 | 
389 |         self.__is_busy = False
390 |         self.__images = None
391 |         self.__n_channels = 3
392 |         self.__img_dim = 224
393 | 
394 |         self.__n_threads_in_pool = n_threads
395 |         self.__pool = Pool(self.__n_threads_in_pool)
396 | 
397 |     def load_imgs_in_batch(self, image_pathes):
398 |         self.__is_busy = True
399 | 
400 |         n_pathes = len(image_pathes)
401 |         idxces = np.arange(0, n_pathes)
402 | 
403 |         # parameters passed to the reading function
404 |         params = [data_item for data_item in zip(idxces, image_pathes)]
405 | 
406 |         # set list of images before start reading
407 |         imgs_shape = (n_pathes, self.__img_dim, self.__img_dim, self.__n_channels)
408 |         self.__images = np.zeros(imgs_shape, dtype=np.float32)
409 | 
410 |         # start pool of threads
411 |         self.__pool.map_async(self.__preprocess_img_wrapper, params, callback=self.__thread_pool_callback)
412 | 
413 |     def get_images(self):
414 |         if self.__is_busy:
415 |             raise Exception('Sorry, you can\'t get images while threads are running!')
416 |         else:
417 |             return self.__images
418 | 
419 |     def is_busy(self):
420 |         return self.__is_busy
421 | 
422 |     def __thread_pool_callback(self, args):
423 |         self.__is_busy = False
424 | 
425 |     def __preprocess_img_wrapper(self, params):
426 |         try:
427 |             self.__preprocess_img(params)
428 |         except Exception as exp:
429 |             print ('Error in __preprocess_img')
430 |             print (exp)
431 | 
432 |     def __preprocess_img(self, params):
433 | 
434 |         idx = params[0]
435 |         path = params[1]
436 | 
437 |         img = cv2.imread(path)
438 |         img = img.astype(np.float32)
439 |         # normalize such that values range from -1 to 1
440 |         img /= float(127.5)
441 |         img -= 1.0
442 |         # convert from bgr to rgb
443 |         img = img[:, :, (2, 1, 0)]
444 | 
445 |         self.__images[idx] = img
446 | 
447 |     def close(self):
448 |         self.__pool.close()
449 |         self.__pool.terminate()
450 | 
451 | # endregion
452 | 
--------------------------------------------------------------------------------
/data/assets/timeception_layer.svg:
--------------------------------------------------------------------------------
[Figure: the Timeception layer (data/assets/timeception_layer.svg); text labels from the drawing:]
(a) Timeception Layer — the input (T × L × L × C) is split into N channel groups; each group (T × L × L × C/N) passes through a Temp Conv module, the group outputs are merged by Concat + Shuffle back to T × L × L × C, and a Max 1D pooling (k=2, s=2) reduces the temporal dimension, giving T/2 × L × L × C.
(b) Temporal Conv Module — per group (T × L × L × C/N): branches built from Conv 2D (k=1x1, s=1) reductions, temporal Conv 1D convolutions (k=3, 5, 7, s=1) and a Max 1D pooling (k=2, s=1); each branch output is T × L × L × C/(M.N), and the branches are concatenated into T × L × L × 5C/(M.N).
--------------------------------------------------------------------------------
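
Usage sketch (not part of the repository): a minimal example of how the asynchronous reader defined in core/image_utils.py above could prepare a batch of frames for the Keras ResNet-152 feature extractor. The class and method names are taken from the file above; the BGR mean values, the frame paths, and the `feature_model` object are placeholders (the model is assumed to be built by the constructor in nets/resnet_152_keras.py with `include_top=False, pooling='avg'`), and the reader assumes the frames on disk are already 224x224, since it does not resize them.

import time
import numpy as np
from core.image_utils import AsyncImageReaderResNet152Keras

# hypothetical inputs: mean BGR pixel of the training images and a list of 224x224 frame paths
bgr_mean = np.array([103.939, 116.779, 123.68], dtype=np.float32)
frame_pathes = ['/some/path/frame_0001.jpg', '/some/path/frame_0002.jpg']

reader = AsyncImageReaderResNet152Keras(bgr_mean, n_threads=8)
reader.load_imgs_in_batch(frame_pathes)  # dispatches the thread pool and returns immediately

# wait until all frames are read, mean-subtracted and converted from BGR to RGB
while reader.is_busy():
    time.sleep(0.1)

images = reader.get_images()  # float32 array of shape (n_frames, 224, 224, 3)
# features = feature_model.predict(images)  # e.g. a ResNet-152 built with include_top=False, pooling='avg'
reader.close()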