├── core
│   ├── __init__.py
│   ├── const.py
│   ├── metrics.py
│   ├── config.py
│   ├── keras_utils.py
│   ├── config_utils.py
│   ├── pytorch_utils.py
│   ├── data_utils.py
│   ├── utils.py
│   └── image_utils.py
├── nets
│   ├── __init__.py
│   ├── resnet_152_pytorch.py
│   ├── layers_pytorch.py
│   ├── i3d_torch_charades_utils.py
│   ├── timeception_pytorch.py
│   ├── i3d_torch_charades.py
│   ├── i3d_torch_charades_test.py
│   └── resnet_152_keras.py
├── datasets
│   └── __init__.py
├── experiments
│   ├── __init__.py
│   ├── test_pytorch.py
│   ├── test_keras.py
│   ├── train_keras.py
│   └── train_pytorch.py
├── data
│   └── assets
│       ├── badge-keras.png
│       ├── badge-pytorch.png
│       ├── badge-tensorflow.png
│       ├── timeception_layer.jpg
│       ├── timeception_layer.pdf
│       └── timeception_layer.svg
├── scripts
│   ├── test_charades_i3d_tc4_f1024.sh
│   └── train_charades_i3d_tc4_f1024.sh
├── requirements.txt
├── __doc__.py
├── configs
│   ├── charades_i3d_tc2_f256.yaml
│   ├── charades_i3d_tc3_f256.yaml
│   ├── charades_i3d_tc3_f512.yaml
│   └── charades_i3d_tc4_f1024.yaml
├── main.py
└── README.md
/core/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/nets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/test_pytorch.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/assets/badge-keras.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/badge-keras.png
--------------------------------------------------------------------------------
/data/assets/badge-pytorch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/badge-pytorch.png
--------------------------------------------------------------------------------
/data/assets/badge-tensorflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/badge-tensorflow.png
--------------------------------------------------------------------------------
/data/assets/timeception_layer.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/timeception_layer.jpg
--------------------------------------------------------------------------------
/data/assets/timeception_layer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/noureldien/timeception/HEAD/data/assets/timeception_layer.pdf
--------------------------------------------------------------------------------
/scripts/test_charades_i3d_tc4_f1024.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | python ../experiments/test.py --config_file charades_i3d_tc4_f1024.yaml
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | opencv
3 | scikit-learn
4 |
5 | keras
6 | tensorflow-gpu
7 | torch
8 | torchvision
9 | torchsummary
10 | torchviz
--------------------------------------------------------------------------------
/scripts/train_charades_i3d_tc4_f1024.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | python ../experiments/train.py --config_file charades_i3d_tc4_f1024.yaml
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/__doc__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | __author__ = 'Noureldien Hussein'
24 | __copyright__ = 'Copyright (c) 2019, Noureldien Hussein'
25 | __credits__ = ['']
26 | __license__ = 'GPLv3'
27 | __version__ = '1.0.0'
28 | __maintainer__ = 'Noureldien Hussein'
29 | __email__ = 'nhussein@uva.nl'
30 | __status__ = 'Development'
31 |
--------------------------------------------------------------------------------
/configs/charades_i3d_tc2_f256.yaml:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env vim
2 |
3 | NUM_GPUS: 1 # how many gpus to use
4 | LOG_PERIOD: 10 # log period
5 | DATASET_NAME: 'charades' # name of dataset
6 |
7 | MODEL:
8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl'
9 | N_CLASSES: 157 # how many classes as output
10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups
11 | N_TC_LAYERS: 2 # number of timeception layers
12 | N_TC_TIMESTEPS: 32 # how many timesteps expected as input to the timeception layers
13 | N_INPUT_TIMESTEPS: 256 # how many timesteps (i.e. frames) expected as an input to the backbone CNN
14 | NAME: 'charades_timeception' # name suffix for the model to be trained
15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb' # which backbone cnn is used
16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn
17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks"
18 |
19 | TRAIN:
20 | BATCH_SIZE: 32 # batch size for training
21 | N_EPOCHS: 500 # how many training epochs
22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only')
23 | N_WORKERS: 10 # how many parallel workers in the data generator
24 |
25 | TEST:
26 | BATCH_SIZE: 64
27 | N_SAMPLES: 10
28 |
29 | SOLVER:
30 | NAME: 'adam'
31 | LR: 0.01
32 | ADAM_EPSILON: 0.0001
33 | SGD_WEIGHT_DECAY: 0.0001
34 | SGD_MOMENTUM: 0.9
35 | SGD_NESTEROV: True
--------------------------------------------------------------------------------
/configs/charades_i3d_tc3_f256.yaml:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env vim
2 |
3 | NUM_GPUS: 1 # how many gpus to use
4 | LOG_PERIOD: 10 # log period
5 | DATASET_NAME: 'charades' # name of dataset
6 |
7 | MODEL:
8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl'
9 | N_CLASSES: 157 # how many classes as output
10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups
11 | N_TC_LAYERS: 3 # number of timeception layers
12 | N_TC_TIMESTEPS: 32 # how many timesteps expected as input to the timeception layers
13 | N_INPUT_TIMESTEPS: 256 # how many timesteps (i.e. frames) expected as an input to the backbone CNN
14 | NAME: 'charades_timeception' # name suffix for the model to be trained
15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb' # which backbone cnn is used
16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn
17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks"
18 |
19 | TRAIN:
20 | BATCH_SIZE: 32 # batch size for training
21 | N_EPOCHS: 500 # how many training epochs
22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only')
23 | N_WORKERS: 10 # how many parallel workers in the data generator
24 |
25 | TEST:
26 | BATCH_SIZE: 50
27 | N_SAMPLES: 10
28 |
29 | SOLVER:
30 | NAME: 'adam'
31 | LR: 0.01
32 | ADAM_EPSILON: 0.0001
33 | SGD_WEIGHT_DECAY: 0.0001
34 | SGD_MOMENTUM: 0.9
35 | SGD_NESTEROV: True
--------------------------------------------------------------------------------
/configs/charades_i3d_tc3_f512.yaml:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env vim
2 |
3 | NUM_GPUS: 1 # how many gpus to use
4 | LOG_PERIOD: 10 # log period
5 | DATASET_NAME: 'charades' # name of dataset
6 |
7 | MODEL:
8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl'
9 | N_CLASSES: 157 # how many classes as output
10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups
11 | N_TC_LAYERS: 3 # number of timeception layers
12 | N_TC_TIMESTEPS: 64 # how many timesteps expected as input to the timeception layers
13 | N_INPUT_TIMESTEPS: 512 # how many timesteps (i.e. frames) expected as an input to the backbone CNN
14 | NAME: 'charades_timeception' # name suffix for the model to be trained
15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb' # which backbone cnn is used
16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn
17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks"
18 |
19 | TRAIN:
20 | BATCH_SIZE: 20 # batch size for training
21 | N_EPOCHS: 500 # how many training epochs
22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only')
23 | N_WORKERS: 10 # how many parallel workers in the data generator
24 |
25 | TEST:
26 | BATCH_SIZE: 40
27 | N_SAMPLES: 10
28 |
29 | SOLVER:
30 | NAME: 'adam'
31 | LR: 0.01
32 | ADAM_EPSILON: 0.0001
33 | SGD_WEIGHT_DECAY: 0.0001
34 | SGD_MOMENTUM: 0.9
35 | SGD_NESTEROV: True
--------------------------------------------------------------------------------
/configs/charades_i3d_tc4_f1024.yaml:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env vim
2 |
3 | NUM_GPUS: 1 # how many gpus to use
4 | LOG_PERIOD: 10 # log period
5 | DATASET_NAME: 'charades' # name of dataset
6 |
7 | MODEL:
8 | CLASSIFICATION_TYPE: 'ml' # either multi-label 'ml' or single-label 'sl'
9 | N_CLASSES: 157 # how many classes as output
10 | N_CHAMNNEL_GROUPS: 8 # how many channel groups
11 | N_TC_LAYERS: 4 # number of timeception layers
12 | N_TC_TIMESTEPS: 128 # how many timesteps expected as input to the timeception layers
13 | N_INPUT_TIMESTEPS: 1024 # how many timesteps (i.e. frames) expected as an input to the backbone CNN
14 | NAME: 'charades_timeception' # name suffix for the model to be trained
15 | BACKBONE_CNN: 'i3d_pytorch_charades_rgb' # which backbone cnn is used
16 | BACKBONE_FEATURE: 'mixed_5c' # type of feature output from backbone cnn
17 | MULTISCALE_TYPE: 'dl' # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks"
18 |
19 | TRAIN:
20 | BATCH_SIZE: 16 # batch size for training
21 | N_EPOCHS: 500 # how many training epochs
22 | SCHEME: 'tco' # either 'ete' (end-to-end) or 'tco' ('timeception-only')
23 | N_WORKERS: 10 # how many parallel workers in the data generator
24 |
25 | TEST:
26 | BATCH_SIZE: 32
27 | N_SAMPLES: 10
28 |
29 | SOLVER:
30 | NAME: 'adam'
31 | LR: 0.01
32 | ADAM_EPSILON: 0.0001
33 | SGD_WEIGHT_DECAY: 0.0001
34 | SGD_MOMENTUM: 0.9
35 | SGD_NESTEROV: True
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Main file of the project.
25 | """
26 |
27 | def __main():
28 | from experiments import train_keras, test_keras, train_pytorch, test_pytorch
29 |
30 | # to train Timeception using keras
31 | train_keras.__main()
32 |
33 | # or using pytorch
34 | # train_pytorch.__main()
35 |
36 | # to test Timeception using keras
37 | # test_keras.__main()
38 |
39 | # or using pytorch
40 | # test_pytorch.__main()
41 |
42 | if __name__ == '__main__':
43 | __main()
44 | pass
45 |
--------------------------------------------------------------------------------
/experiments/test_keras.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Test Timeception models.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import logging
33 | import os
34 | import datetime
35 | from optparse import OptionParser
36 |
37 | import tensorflow as tf
38 | import keras.backend as K
39 | from keras.layers import Dense, LeakyReLU, Dropout, Input, Activation
40 | from keras.optimizers import SGD, Adam
41 | from keras.models import Sequential, Model
42 | from keras.layers.normalization import BatchNormalization
43 |
44 | from nets import timeception
45 | from nets.layers_keras import MaxLayer
46 | from core import utils, keras_utils, image_utils, config_utils, const, config, data_utils
47 | from core.utils import Path as Pth
48 |
49 | logger = logging.getLogger(__name__)
50 |
51 | def test_tco():
52 | pass
--------------------------------------------------------------------------------
/core/const.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Constants for project.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import os
33 | import platform
34 | import numpy as np
35 |
36 | DL_FRAMEWORKS = np.array(['caffe', 'tensorflow', 'pytorch', 'keras', 'caffe2'])
37 | DL_FRAMEWORK = None
38 | GPU_CORE_ID = 0
39 |
40 | CNN_FEATURE_SIZES = np.array([2048, 2048, 1000, 1024, 1000, 2048, 2048])
41 | CNN_FEATURE_TYPES = np.array(['fc6', 'fc7', 'fc1000', 'fc1024', 'fc365', 'prob', 'pool5', 'fc8a', 'res3b7', 'res4b35', 'res5c'])
42 | CNN_MODEL_TYPES = np.array(['resnet152', 'googlenet1k', 'vgg16', 'places365-resnet152', 'places365-vgg', 'googlenet13k'])
43 | RESIZE_TYPES = np.array(['resize', 'resize_crop', 'resize_crop_scaled', 'resize_keep_aspect_ratio_padded'])
44 | ROOT_PATH_TYPES = np.array(['data', 'project'])
45 | TRAIN_SCHEMES = np.array(['ete', 'tco'])
46 | MODEL_CLASSIFICATION_TYPES = np.array(['ml', 'sl'])
47 | MODEL_MULTISCALE_TYPES = np.array(['dl', 'ks'])
48 | SOLVER_NAMES = np.array(['adam', 'sgd'])
49 | DATASET_NAMES = np.array(['charades', 'kinetics400', 'breakfast_actions', 'you_cook_2', 'multi_thumos'])
50 | DATA_ROOT_PATH = './data'
51 | PROJECT_ROOT_PATH = '../'
52 | MACHINE_NAME = platform.node()
53 |
--------------------------------------------------------------------------------
/core/metrics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Evaluation functions.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import numpy as np
33 | from sklearn.metrics import average_precision_score
34 |
35 | def map_charades(y_true, y_pred):
36 | """ Returns mAP """
37 | m_aps = []
38 | n_classes = y_pred.shape[1]
39 | for oc_i in range(n_classes):
40 | pred_row = y_pred[:, oc_i]
41 | sorted_idxs = np.argsort(-pred_row)
42 | true_row = y_true[:, oc_i]
43 | tp = true_row[sorted_idxs] == 1
44 | fp = np.invert(tp)
45 | n_pos = tp.sum()
46 | if n_pos < 0.1:
47 | m_aps.append(float('nan'))
48 | continue
49 | f_pcs = np.cumsum(fp)
50 | t_pcs = np.cumsum(tp)
51 | prec = t_pcs / (f_pcs + t_pcs).astype(float)
52 | avg_prec = 0
53 | for i in range(y_pred.shape[0]):
54 | if tp[i]:
55 | avg_prec += prec[i]
56 | m_aps.append(avg_prec / n_pos.astype(float))
57 | m_aps = np.array(m_aps)
58 | m_ap = np.mean(m_aps)
59 | return m_ap
60 |
61 | def map_sklearn(y_true, y_pred):
62 | # """ Returns mAP """
63 | n_classes = y_true.shape[1]
64 | map = [average_precision_score(y_true[:, i], y_pred[:, i]) for i in range(n_classes)]
65 | map = np.nan_to_num(map)
66 | map = np.mean(map)
67 | return map
68 |
69 | def accuracy(y_true, y_pred):
70 | idx = np.argmax(y_pred, axis=1)
71 | n_items = len(y_true)
72 | accuracy = np.sum(idx == y_true) / float(n_items)
73 | return accuracy
74 |
75 | def acuracy_top_n(n_top, y_true, y_pred):
76 | n_corrects = 0
77 | for gt, pr in zip(y_true, y_pred):
78 | idx = np.argsort(pr)[::-1]
79 | idx = idx[0:n_top]
80 | gt = np.where(gt == 1)[0][0]
81 | if gt in idx:
82 | n_corrects += 1
83 | n = len(y_true)
84 | score = n_corrects / float(n)
85 | return score
86 |
87 |
--------------------------------------------------------------------------------
/core/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Definition for all configuration options for training/testing Timeception model on various datasets.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import logging
33 | import sys
34 |
35 | from core.utils import AttrDict
36 |
37 | logger = logging.getLogger(__name__)
38 |
39 | __C = AttrDict()
40 | cfg = __C
41 |
42 | # region Misc
43 |
44 | __C.DEBUG = False # is debugging
45 | __C.NUM_GPUS = 1 # how many gpus to use
46 | __C.LOG_PERIOD = 10 # log period
47 | __C.DATASET_NAME = str('') # name of dataset
48 |
49 | # endregion
50 |
51 | # region Model
52 |
53 | __C.MODEL = AttrDict()
54 | __C.MODEL.CLASSIFICATION_TYPE = str('') # either multi-label 'ml' or single-label 'sl'
55 | __C.MODEL.N_CLASSES = 157 # how many classes as output
56 | __C.MODEL.N_CHAMNNEL_GROUPS = 8 # how many channel groups
57 | __C.MODEL.N_TC_LAYERS = 4 # number of timeception layers
58 | __C.MODEL.N_TC_TIMESTEPS = 64 # how many timesteps expected as input to the timeception layers
59 | __C.MODEL.N_INPUT_TIMESTEPS = 512 # how many timesteps (i.e. frames) expected as an input to the backbone CNN
60 | __C.MODEL.NAME = str('') # name suffix for the model to be trained
61 | __C.MODEL.BACKBONE_CNN = str('') # which backbone cnn is used
62 | __C.MODEL.BACKBONE_FEATURE = str('') # type of feature output from backbone cnn
63 | __C.MODEL.MULTISCALE_TYPE = str('') # use multi-scale by dilation rate "dl" or multi-scale by kernel-size "ks"
64 |
65 | # endregion
66 |
67 | # region Train
68 |
69 | __C.TRAIN = AttrDict()
70 | __C.TRAIN.BATCH_SIZE = 64 # batch size for training
71 | __C.TRAIN.N_EPOCHS = 500 # how many training epochs
72 | __C.TRAIN.SCHEME = str('') # either 'ete' (end-to-end) or 'tco' (timeception-only)
73 | __C.TRAIN.N_WORKERS = 10 # how many parallel workers in the data generator
74 |
75 | # endregion
76 |
77 | # region Test
78 |
79 | __C.TEST = AttrDict()
80 | __C.TEST.BATCH_SIZE = 64
81 | __C.TEST.N_SAMPLES = 10
82 |
83 | # endregion
84 |
85 | # region Solver
86 |
87 | __C.SOLVER = AttrDict()
88 | __C.SOLVER.NAME = str('adam')
89 | __C.SOLVER.LR = 0.0001
90 | __C.SOLVER.ADAM_EPSILON = 1e-4
91 | __C.SOLVER.SGD_WEIGHT_DECAY = 0.0001
92 | __C.SOLVER.SGD_MOMENTUM = 0.9
93 | __C.SOLVER.SGD_NESTEROV = True
94 |
95 | # endregion
96 |
--------------------------------------------------------------------------------
/nets/resnet_152_pytorch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | ResNet-152 fine-tuned on Charades.
25 | https://github.com/gsig/charades-algorithms/tree/master/pytorch
26 | """
27 |
28 | from __future__ import absolute_import
29 | from __future__ import division
30 | from __future__ import print_function
31 | from __future__ import unicode_literals
32 |
33 | import logging
34 | import warnings
35 | import os
36 | import random
37 | import sys
38 | import time
39 | import datetime
40 | import math
41 | import shutil
42 | import random
43 |
44 | import numpy as np
45 | import cv2
46 | import scipy.io
47 | import h5py
48 | from collections import OrderedDict
49 |
50 | from core import const as c, utils
51 | from core import image_utils
52 |
53 | logger = logging.getLogger(__name__)
54 |
55 | if c.DL_FRAMEWORK == 'tensorflow':
56 | import tensorflow as tf
57 | elif c.DL_FRAMEWORK == 'caffe':
58 | import caffe
59 | elif c.DL_FRAMEWORK == 'pytorch':
60 | import torch
61 | import torch.nn as nn
62 | import torch.nn.parallel
63 | import torch.backends.cudnn as cudnn
64 | import torch.distributed as dist
65 | import torchvision.models as tmodels
66 | import importlib
67 | elif c.DL_FRAMEWORK == 'keras':
68 | import tensorflow as tf
69 | import keras.backend as K
70 |
71 | def get_resnet_152_charades_model():
72 | import torch
73 | import torch.nn as nn
74 | import torch.nn.parallel
75 | import torch.backends.cudnn as cudnn
76 | import torch.distributed as dist
77 | import torchvision.models as tmodels
78 | import importlib
79 | import torch.utils.model_zoo as model_zoo
80 |
81 | root_path = c.DATA_ROOT_PATH
82 | model_arch = 'resnet152'
83 | model_checkpoint_path = '%s/Charades/baseline_models/resnet_rgb.pth.tar' % (root_path)
84 |
85 | # load model
86 | print("=> creating model '{}'".format(model_arch))
87 | model = tmodels.__dict__[model_arch](pretrained=False)
88 | cudnn.benchmark = True
89 |
90 | # load checkpoint
91 | checkpoint = torch.load(model_checkpoint_path)
92 | checkpoint = checkpoint['state_dict']
93 |
94 | # fix keys of state dict
95 | unwanted_keys = ['fc.weight', 'fc.bias']
96 | state_dict = OrderedDict()
97 | for k, v in checkpoint.iteritems():
98 | key = k.replace('module.', '')
99 | if key not in unwanted_keys:
100 | state_dict[key] = v
101 |
102 | # remove fc and avgpool layers
103 | layers = model._modules.items()
104 | layers = list(layers)[:-2]
105 | layers = OrderedDict(layers)
106 | model = nn.Sequential(layers)
107 |
108 | # load the dictionary
109 | model.load_state_dict(state_dict)
110 |
111 | # to parallelize the model, uncomment the following line
112 | # model = torch.nn.DataParallel(model).cuda()
113 |
114 | # make sure it's only for testing
115 | model.train(False)
116 |
117 | # convert to eval model
118 | model.eval()
119 |
120 | # convert to gpu model
121 | model.cuda()
122 |
123 | return model
124 |
125 | def get_mean_std_for_resnet_152_pytorch_model():
126 | img_mean = [0.485, 0.456, 0.406]
127 | img_std = [0.229, 0.224, 0.225]
128 | return img_mean, img_std
129 |
--------------------------------------------------------------------------------
/nets/layers_pytorch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Layers for pytorch.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import numpy as np
33 | import logging
34 |
35 | import torch
36 | from torch.nn import Module, Conv2d, Conv1d
37 | from torch.nn import functional as F
38 |
39 | from core import pytorch_utils
40 |
41 | logger = logging.getLogger(__name__)
42 |
43 | # region Basic Layers
44 |
45 | class ChannelShuffleLayer(Module):
46 | """
47 | Shuffle the channels across groups.
48 | """
49 |
50 | def __init__(self, n_channels, n_groups):
51 | super(ChannelShuffleLayer, self).__init__()
52 |
53 | n_channels_per_group = int(n_channels / n_groups)
54 | assert n_channels_per_group * n_groups == n_channels
55 |
56 | self.n_channels_per_group = n_channels_per_group
57 | self.n_groups = n_groups
58 |
59 | def forward(self, input):
60 | """
61 | input shape (None, 1024, 20, 7, 7), or (BN, C, T, H, W)
62 | """
63 |
64 | input_shape = input.size()
65 | n_samples, n_channels, n_timesteps, side_dim1, side_dim2 = input_shape
66 |
67 | n_groups = self.n_groups
68 | n_channels_per_group = self.n_channels_per_group
69 |
70 | tensor = input.view(n_samples, n_groups, n_channels_per_group, n_timesteps, side_dim1, side_dim2)
71 | tensor = tensor.permute(0, 2, 1, 3, 4, 5)
72 | tensor = tensor.contiguous()
73 | tensor = tensor.view(n_samples, n_channels, n_timesteps, side_dim1, side_dim2)
74 |
75 | return tensor
76 |
77 | # endregion
78 |
79 | # region Timeception Layers
80 |
81 | class DepthwiseConv1DLayer(Module):
82 | """
83 | Depthwise 1D convolution along the temporal dimension (one filter per channel).
84 | """
85 |
86 | def __init__(self, input_shape, kernel_size, dilation, name):
87 | super(DepthwiseConv1DLayer, self).__init__()
88 |
89 | assert len(input_shape) == 5
90 |
91 | self.kernel_size = kernel_size
92 | self.dilation = dilation
93 | self._name = name
94 |
95 | n_channels = input_shape[1]
96 | n_timesteps = input_shape[2]
97 |
98 | # TODO: support using different dilation rates.
99 | padding = pytorch_utils.calc_padding_1d(n_timesteps, kernel_size)
100 | self.depthwise_conv1d = Conv1d(n_channels, n_channels, kernel_size, dilation=dilation, groups=n_channels, padding=padding)
101 | self.depthwise_conv1d._name = name
102 |
103 | def forward(self, input):
104 | """
105 | input shape (None, 1024, 20, 7, 7), or (BN, C, T, H, W)
106 | """
107 |
108 | input_shape = input.size()
109 |
110 | n, c, t, h, w = input_shape
111 |
112 | # transpose and reshape to hide the spatial dimension, only expose the temporal dimension for depthwise conv
113 | tensor = input.permute(0, 3, 4, 1, 2) # (None, 7, 7, 1024, 20)
114 | tensor = tensor.contiguous()
115 | tensor = tensor.view(-1, c, t) # (None*7*7, 1024, 20)
116 |
117 | # depthwise conv on the temporal dimension, as if it was the spatial dimension
118 | tensor = self.depthwise_conv1d(tensor) # (None*7*7, 1024, 20)
119 |
120 | # get timesteps after convolution
121 | t = tensor.size()[-1]
122 |
123 | # reshape to get the spatial dimensions
124 | tensor = tensor.view(n, h, w, c, t) # (None, 7, 7, 1024, 20)
125 |
126 | # finally, transpose to get the desired output shape
127 | tensor = tensor.permute(0, 3, 4, 1, 2) # (None, 1024, 20, 7, 7)
128 |
129 | return tensor
130 |
131 | # endregion
132 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Timeception for Complex Action Recognition
2 |
3 |   
4 |
5 | This code repository is the implementation for the paper [Timeception for Complex Action Recognition](https://arxiv.org/abs/1812.01289).
6 | We provide implementations for 3 different libraries: `keras`, `tensorflow` and `pytorch`.
7 |
8 | 
9 |
10 | ### Citation
11 |
12 | Please consider citing this work using the following BibTeX entry:
13 |
14 | ```bibtex
15 | @inproceedings{hussein2018timeception,
16 | title = {Timeception for Complex Action Recognition},
17 | author = {Hussein, Noureldien and Gavves, Efstratios and Smeulders, Arnold WM},
18 | booktitle = {CVPR},
19 | year = {2019}
20 | }
21 | ```
22 |
23 | ### How to Use?
24 |
25 | ###### Keras
26 |
27 | Using `keras`, we can define `timeception` as a sub-model.
28 | Then we use it along with another model definition.
29 | For example, here we define 4 `timeception` layers followed by a `dense` layer for classification.
30 |
31 | ```python
32 | from keras import Model
33 | from keras.layers import Input, Dense
34 | from nets.layers_keras import MaxLayer
35 | from nets.timeception import Timeception
36 |
37 | # define the timeception layers
38 | timeception = Timeception(1024, n_layers=4)
39 |
40 | # define network for classification
41 | input = Input(shape=(128, 7, 7, 1024))
42 | tensor = timeception(input)
43 | tensor = MaxLayer(axis=(1, 2, 3))(tensor)
44 | output = Dense(100, activation='softmax')(tensor)
45 | model = Model(inputs=input, outputs=output)
46 | model.summary()
47 | ```
48 |
49 | This results in the model defined as:
50 |
51 | ```
52 | Layer (type) Output Shape Param #
53 | ================================================
54 | (InputLayer) (None, 128, 7, 7, 1024) 0
55 | (Timeception) (None, 8, 7, 7, 2480) 1494304
56 | (MaxLayer) (None, 2480) 0
57 | (Dense) (None, 100) 248100
58 | ================================================
59 | Total params: 1,742,404
60 | ```
61 |
62 | ###### Tensorflow
63 |
64 | Using `tensorflow`, we can define `timeception` as a list of nodes in the computational graph.
65 | Then we use it along with another model definition.
66 | For example, here a function defines 4 `timeception` layers.
67 | It takes the input tensor, feeds it forward through the `timeception` layers, and returns the output tensor `output`.
68 |
69 | ```python
70 | import tensorflow as tf
71 | from nets import timeception
72 |
73 | # define input tensor
74 | input = tf.placeholder(tf.float32, shape=(None, 128, 7, 7, 1024))
75 |
76 | # feedforward the input to the timeception layers
77 | tensor = timeception.timeception_layers(input, n_layers=4)
78 |
79 | # the output is (?, 8, 7, 7, 2480)
80 | print (tensor.get_shape())
81 | ```
82 |
83 | ###### PyTorch
84 |
85 | Using `pytorch`, we can define `timeception` as a module.
86 | Then we use it along with another model definition.
87 | For example, here we define a module of 4 `timeception` layers and feed an input tensor through it.
88 |
89 | ```python
90 | import numpy as np
91 | import torch as T
92 | from nets import timeception_pytorch
93 |
94 | # define input tensor
95 | input = T.tensor(np.zeros((32, 1024, 128, 7, 7)), dtype=T.float32)
96 |
97 | # define 4 layers of timeception
98 | module = timeception_pytorch.Timeception(input.size(), n_layers=4)
99 |
100 | # feedforward the input to the timeception layers
101 | tensor = module(input)
102 |
103 | # the output is (32, 2480, 8, 7, 7)
104 | print (tensor.size())
105 | ```
106 |
107 | ### Installation
108 |
109 | We use Python 2.7.15, provided by Anaconda 4.6.2, and depend on the following Python packages:
110 | - Keras 2.2.4
111 | - Tensorflow 1.10.1
112 | - PyTorch 1.0.1
113 |
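A minimal environment sketch (assuming an Anaconda setup with a CUDA-capable GPU; the package names follow `requirements.txt`, and the pinned versions are the ones listed above):

```bash
# create and activate a python 2.7 environment
conda create -n timeception python=2.7
source activate timeception

# packages listed in requirements.txt
conda install numpy opencv scikit-learn
pip install keras==2.2.4 tensorflow-gpu==1.10.1
pip install torch==1.0.1 torchvision torchsummary torchviz
```
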
114 | ### Training
115 |
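Training is driven by the YAML config files in `./configs` together with the wrapper scripts in `./scripts`. A minimal sketch of how training is launched (assuming the working directory is `scripts`; the script simply forwards the config file name to the training entry point):

```bash
cd scripts

# train Timeception on Charades (I3D backbone, 4 timeception layers, 1024 input frames)
bash ./train_charades_i3d_tc4_f1024.sh
```
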
116 | ### Testing
117 |
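Testing follows the same pattern, using the corresponding test script (again a sketch, assuming the working directory is `scripts`):

```bash
cd scripts

# test Timeception on Charades (I3D backbone, 4 timeception layers, 1024 input frames)
bash ./test_charades_i3d_tc4_f1024.sh
```
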
118 | ### Fine-tuning
119 |
120 | ### Pretrained Models
121 |
122 | #### Charades
123 |
124 | We will add all pretrained models for Charades by the end of April.
125 | For testing, start with the provided script `./scripts/test_charades_i3d_tc4_f1024.sh`.
126 | To change which baseline is used for testing, set the `--config_file` argument to one of the following options, as sketched below.
127 |
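For example, mirroring the command inside `./scripts/test_charades_i3d_tc4_f1024.sh` but pointing it to a different config file from the tables below (a sketch; the exact test entry point may differ per framework, e.g. `experiments/test_keras.py` vs. `experiments/test_pytorch.py`):

```bash
cd scripts
python ../experiments/test.py --config_file charades_i3d_tc3_f256.yaml
```
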
128 | ###### 2D-ResNet-152
129 |
130 | Timeception on top of 2D-ResNet-152 as backbone.
131 |
132 | | Config File | Backbone | TC Layers | Frames | mAP (%) | Model |
133 | |---|:---:|:---:|:---:|:---:|:---:|
134 | | [charades_r2d_tc3_f32.yaml](./configs/charades_r2d_tc3_f32.yaml) | R2D | 3 | 32 | 30.37 | [Link](./data/charades/charades_r2d_tc3_f32.pkl) |
135 | | [charades_r2d_tc3_f64.yaml](./configs/charades_r2d_tc3_f64.yaml) | R2D | 3 | 64 | 31.25 | [Link](./data/charades/charades_r2d_tc3_f64.pkl) |
136 | | [charades_r2d_tc4_f128.yaml](./configs/charades_r2d_tc4_f128.yaml) | R2D | 4 | 128 | 31.82 | [Link](./data/charades/charades_r2d_tc4_f128.pkl) |
137 |
138 | ###### I3D
139 |
140 | Timeception on top of I3D as backbone.
141 |
142 | | Config File | Backbone | TC Layers | Frames | mAP (%) | Model |
143 | |---|:---:|:---:|:---:|:---:|:---:|
144 | | [charades_i3d_tc3_f256.yaml](./configs/charades_i3d_tc3_f256.yaml) | I3D | 3 | 256 | 33.89 | [Link](./data/charades/charades_i3d_tc3_f256.pkl) |
145 | | [charades_i3d_tc3_f512.yaml](./configs/charades_i3d_tc3_f512.yaml) | I3D | 3 | 512 | 35.46 | [Link](./data/charades/charades_i3d_tc3_f512.pkl) |
146 | | [charades_i3d_tc4_f1024.yaml](./configs/charades_i3d_tc4_f1024.yaml) | I3D | 4 | 1024 | 37.20 | [Link](./data/charades/charades_i3d_tc4_f1024.pkl) |
147 |
148 | ###### 3D-ResNet-100
149 | Timeception on top of 3D-ResNet-100 as backbone.
150 |
151 |
152 | | Config File | Backbone | TC Layers | Frames | mAP (%) | Model |
153 | |---|:---:|:---:|:---:|:---:|:---:|
154 | | [charades_r3d_tc4_f1024.yaml](./configs/charades_r3d_tc4_f1024.yaml) | R3D | 4 | 1024 | 41.1 | [Link](./data/charades/charades_r3d_tc4_f1024.pkl) |
155 |
156 |
157 | #### Kinetics 400
158 |
159 | We will add all pretrained models for Kinetics 400 by the end of June.
160 |
161 | ### License
162 |
163 | The code and the models in this repo are released under the GNU GPL v3.0 [LICENSE](LICENSE).
164 |
165 |
166 |
167 |
--------------------------------------------------------------------------------
/core/keras_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Helper functions for keras.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import os
33 | import json
34 | import pydot
35 | import logging
36 | import numpy as np
37 |
38 | import tensorflow as tf
39 | from tensorflow.contrib import framework as tf_framework
40 |
41 | import keras.backend as K
42 | from keras.callbacks import Callback
43 | from keras.utils import vis_utils
44 | from keras.models import Sequential, model_from_json
45 |
46 | from core import config_utils
47 |
48 | logger = logging.getLogger(__name__)
49 |
50 | # region Constants
51 |
52 | EPS_VALUE = 1e-9
53 | LOSSES = ['categorical_crossentropy', 'mean_squared_error', 'mean_absolute_error', 'binary_crossentropy']
54 | METRICS = ['accuracy', 'mean_squared_error', 'mean_absolute_error']
55 | OPTIMIZERS = ['sgd', 'rmsprop', 'adam']
56 | ACTIVATIONS = ['tanh', 'relu', 'sigmoid', 'softmax']
57 |
58 | # endregion
59 |
60 | # region Functions
61 |
62 | def save_model_figure(model, file_path='/.model.eps'):
63 | vis_utils.plot_model(model, file_path, show_shapes=True, show_layer_names=True)
64 |
65 | def load_model(json_path, weight_path, metrics=None, loss=None, optimizer=None, custom_objects=None, is_compile=True):
66 | with open(json_path, 'r') as f:
67 | model_json_string = json.load(f)
68 | model_json_dict = json.loads(model_json_string)
69 | model = model_from_json(model_json_string, custom_objects=custom_objects)
70 | model.load_weights(weight_path)
71 |
72 | if is_compile:
73 | if optimizer is None:
74 | optimizer = model_json_dict['optimizer']['name']
75 |
76 | if loss is None:
77 | loss = model_json_dict['loss']
78 |
79 | if metrics is None:
80 | model.compile(loss=loss, optimizer=optimizer)
81 | else:
82 | model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
83 |
84 | return model
85 |
86 | def save_model(model, json_path, weight_path):
87 | model.save_weights(weight_path, overwrite=True)
88 | model_json = model.to_json()
89 | with open(json_path, 'w') as f:
90 | json.dump(model_json, f)
91 |
92 | def layer_exist(model, layer_name):
93 | exist = False
94 | for layer in model.layers:
95 | if layer.name == layer_name:
96 | exist = True
97 | break
98 |
99 | return exist
100 |
101 | def calc_num_batches(n_samples, batch_size):
102 | n_batch = int(n_samples / float(batch_size))
103 | n_batch = n_batch if n_samples % batch_size == 0 else n_batch + 1
104 | return n_batch
105 |
106 | # endregion
107 |
108 | # region Metrics
109 |
110 | def map_charades(y_true, y_pred):
111 | """
112 | Returns mAP
113 | """
114 | m_aps = []
115 |
116 | tf_one = tf.constant(1, dtype=tf.float32)
117 |
118 | n_classes = y_pred.shape[1]
119 | for oc_i in range(n_classes):
120 | pred_row = y_pred[:, oc_i]
121 | sorted_idxs = tf_framework.argsort(-pred_row)
122 | true_row = y_true[:, oc_i]
123 | true_row = tf.map_fn(lambda i: true_row[i], sorted_idxs, dtype=np.float32)
124 | tp_poolean = tf.equal(true_row, tf_one)
125 | tp = tf.cast(tp_poolean, dtype=np.float32)
126 | fp = K.reverse(tp, axes=0)
127 | n_pos = tf.reduce_sum(tp)
128 | f_pcs = tf.cumsum(fp)
129 | t_pcs = tf.cumsum(tp)
130 | s = f_pcs + t_pcs
131 |
132 | s = tf.cast(s, tf.float32)
133 | t_pcs = tf.cast(t_pcs, tf.float32)
134 | tp_float = tf.cast(tp_poolean, np.float32)
135 |
136 | prec = t_pcs / s
137 | avg_prec = prec * tp_float
138 |
139 | n_pos = tf.cast(n_pos, tf.float32)
140 | avg_prec = avg_prec / n_pos
141 | avg_prec = tf.expand_dims(avg_prec, axis=0)
142 | m_aps.append(avg_prec)
143 |
144 | m_aps = K.concatenate(m_aps, axis=0)
145 | mAP = K.mean(m_aps)
146 | return mAP
147 |
148 | # endregion
149 |
150 | # region Callbacks
151 |
152 | class SaveCallback(Callback):
153 | def __init__(self, dataset_name, model_name):
154 | self.model_name = model_name
155 |
156 | model_root_path = './data/%s/models' % (dataset_name)
157 | assert os.path.exists(model_root_path)
158 |
159 | model_root_path = './data/%s/models/%s' % (dataset_name, model_name)
160 | if not os.path.exists(model_root_path):
161 | os.mkdir(model_root_path)
162 |
163 | self.model_root_path = model_root_path
164 |
165 | super(SaveCallback, self).__init__()
166 |
167 | def on_epoch_end(self, idx_epoch, logs=None):
168 | """
169 | Save the model.
170 | """
171 |
172 | epoch_num = idx_epoch + 1
173 | self.__save(epoch_num)
174 |
175 | def __save(self, epoch_num):
176 | model_root_path = self.model_root_path
177 | model = self.model
178 |
179 | # h5py accepts only strings as a path
180 | model_json_path = str('%s/%03d.json' % (model_root_path, epoch_num))
181 | model_weight_path = str('%s/%03d.pkl' % (model_root_path, epoch_num))
182 |
183 | # save model definition as json, and save model weights
184 | model.save_weights(model_weight_path, overwrite=True)
185 | model_json = model.to_json()
186 | with open(model_json_path, 'w') as f:
187 | json.dump(model_json, f)
188 |
189 | # endregion
190 |
--------------------------------------------------------------------------------
/nets/i3d_torch_charades_utils.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | import os
3 | import random
4 | import sys
5 | import time
6 | import datetime
7 | import math
8 | import shutil
9 | import random
10 | import threading
11 |
12 | import numpy as np
13 | import cv2
14 | import scipy.io
15 | import h5py
16 | from optparse import OptionParser
17 | from collections import OrderedDict
18 |
19 | import torch
20 | import torch.nn as nn
21 | import torch.nn.parallel
22 | import torch.backends.cudnn as cudnn
23 | import torch.distributed as dist
24 | import torchvision.models as tmodels
25 | import importlib
26 | import torchsummary
27 | from core import pytorch_utils
28 | import torch.nn.functional as F
29 | import torch.optim as optim
30 | from torch.optim import lr_scheduler
31 | from torch.autograd import Variable
32 |
33 | import torchvision
34 | from torchvision import datasets, transforms
35 |
36 | from core import const as c, utils
37 | from core import image_utils
38 | from nets import i3d_torch_charades_test
39 |
40 | def extract_features_rgb():
41 | from core import config_utils
42 |
43 | is_local = config_utils.is_local_machine()
44 | if is_local:
45 | begin_num = None
46 | end_num = None
47 | else:
48 | parser = OptionParser()
49 | parser.add_option("-b", "--begin_num", dest="begin_num", help="begin_num")
50 | parser.add_option("-e", "--end_num", dest="end_num", help="end_num")
51 | parser.add_option("-c", "--gpu_core_id", dest="gpu_core_id", help="gpu_core_id")
52 | (options, args) = parser.parse_args()
53 | begin_num = int(options.begin_num)
54 | end_num = int(options.end_num)
55 | gpu_core_id = int(options.gpu_core_id)
56 | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_core_id)
57 |
58 | __extract_features_rgb(begin_num, end_num)
59 |
60 | def load_model_i3d_charades_rgb_for_testing(model_path):
61 | import torch
62 | from nets.i3d_torch_charades_test import InceptionI3d
63 |
64 | # setup the model
65 | state_dict = torch.load(model_path)
66 | model = InceptionI3d()
67 | model.replace_logits(157)
68 | model.load_state_dict(state_dict)
69 | model.train(False)
70 | model.eval()
71 | model.cuda()
72 | return model
73 |
74 | def __extract_features_rgb(begin_num=None, end_num=None):
75 | root_path = c.DATA_ROOT_PATH
76 | annotation_path = '%s/Charades/annotation/frames_dict_trimmed_multi_label_i3d_160_frames.pkl' % (root_path)
77 | features_root_path = '%s/Charades/features_i3d_charades_rgb_mixed_5c_trimmed_20_frames' % (root_path)
78 | video_frames_root_path = '%s/Charades/frames/Charades_v1_rgb' % (root_path)
79 | model_path = '%s/Charades/baseline_models/i3d/rgb_charades.pt' % (root_path)
80 | feature_name = 'Mixed_5c'
81 |
82 | (video_frames_dict_tr, video_frames_dict_te) = utils.pkl_load(annotation_path)
83 | video_frames_dict = dict()
84 | video_frames_dict.update(video_frames_dict_tr)
85 | video_frames_dict.update(video_frames_dict_te)
86 | video_names = video_frames_dict.keys()
87 |
88 | n_videos = len(video_names)
89 | frame_count = 0
90 |
91 | if not os.path.exists(features_root_path):
92 | print('Sorry, path does not exist: %s' % (features_root_path))
93 | return
94 |
95 | t1 = time.time()
96 | print('extracting training features')
97 | print('start time: %s' % utils.timestamp())
98 |
99 | # async reader; pre-load the images of the first video
100 | img_reader = image_utils.AsyncImageReaderCharadesForI3DTorchModel(n_threads=20)
101 | img_reader.load_imgs_in_batch(__get_video_frame_pathes(video_names[0], video_frames_root_path, video_frames_dict))
102 |
103 | # load the model
104 | model = __load_i3d_model_rgb(model_path)
105 | torchsummary.summary(model, input_size=(3, 160, 224, 224))
106 |
107 | # loop on list of videos
108 | for idx_video in range(n_videos):
109 | video_num = idx_video + 1
110 |
111 | if begin_num is not None and end_num is not None:
112 | if video_num <= begin_num or video_num > end_num:
113 | continue
114 |
115 | video_name = video_names[idx_video]
116 |
117 | # wait until the image batch is loaded
118 | t1 = time.time()
119 | while img_reader.is_busy():
120 | time.sleep(0.1)
121 | t2 = time.time()
122 | duration_waited = t2 - t1
123 | print('...... video %d/%d: %s, waited: %d' % (video_num, n_videos, video_name, duration_waited))
124 |
125 | # get the video frames
126 | video_frames = img_reader.get_images()
127 |
128 | # pre-load for the next video
129 | if video_num < n_videos:
130 | next_video_name = video_names[idx_video + 1]
131 | img_reader.load_imgs_in_batch(__get_video_frame_pathes(next_video_name, video_frames_root_path, video_frames_dict))
132 |
133 | video_features_path = '%s/%s.pkl' % (features_root_path, video_name)
134 | # if os.path.exists(video_features_path):
135 | # print ('... features for video already exist: %s.pkl' % (video_name))
136 | # continue
137 |
138 | if len(video_frames) != 160:
139 | print('... wrong n frames: %d' % (video_num))
140 | continue
141 |
142 | # transpose to have the channel_first (160, 224, 224, 3) => (3, 160, 224, 224)
143 | video_frames = np.transpose(video_frames, (3, 0, 1, 2))
144 |
145 | # add one dimension to represent the batch size
146 | video_frames = np.expand_dims(video_frames, axis=0)
147 |
148 | # prepare input variable
149 | with torch.no_grad():
150 | # extract features
151 | input_var = torch.from_numpy(video_frames).cuda()
152 | output_var = model(input_var)
153 | output_var = output_var.cpu()
154 | features = output_var.data.numpy() # (1, 1024, 20, 7, 7)
155 |
156 | # don't forget to clean up variables
157 | del input_var
158 | del output_var
159 |
160 | # squeeze to remove the dimension of the batch_size
161 | features = features[0] # (1024, 20, 7, 7)
162 |
163 | # transpose to have the channel_last
164 | features = np.transpose(features, (1, 2, 3, 0)) # (20, 7, 7, 1024)
165 |
166 | # save the features
167 | utils.pkl_dump(features, video_features_path, is_highest=True)
168 |
169 | # increment counts
170 | frame_count += len(video_frames)
171 |
172 | t2 = time.time()
173 | print('finish extracting %d features in %d seconds' % (frame_count, t2 - t1))
174 | print('end time: %s' % utils.timestamp())
175 |
176 | def __get_video_frame_pathes(video_name, video_frames_root_path, video_frames_dict):
177 | video_frame_names = video_frames_dict[video_name]
178 | video_frame_pathes = [('%s/%s/%s') % (video_frames_root_path, video_name, n) for n in video_frame_names]
179 | video_frame_pathes = np.array(video_frame_pathes)
180 | return video_frame_pathes
181 |
182 | def __load_i3d_model_rgb(model_path):
183 | # setup the model
184 | state_dict = torch.load(model_path)
185 | model = i3d_torch_charades_test.InceptionI3d()
186 | model.replace_logits(157)
187 | model.load_state_dict(state_dict)
188 | model.cuda()
189 | model.train(True)
190 | return model
191 |
192 | if __name__ == '__main__':
193 | print('Hello World!')
194 | extract_features_rgb()
195 |
--------------------------------------------------------------------------------
/core/config_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Configurations for project.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import os
33 | import platform
34 | import argparse
35 | import logging
36 | import yaml
37 | import pprint
38 | from ast import literal_eval
39 |
40 | from core.config import __C
41 | from core.utils import AttrDict
42 | from core import const, config, utils
43 |
44 | logger = logging.getLogger(__name__)
45 |
46 | # region Misc
47 |
48 | def get_machine_name():
49 | return platform.node()
50 |
51 | def import_dl_platform():
52 | if const.DL_FRAMEWORK == 'tensorflow':
53 | import tensorflow as tf
54 | elif const.DL_FRAMEWORK == 'pytorch':
55 | import torch
56 | elif const.DL_FRAMEWORK == 'caffe':
57 | import caffe
58 | elif const.DL_FRAMEWORK == 'keras':
59 | import keras.backend as K
60 |
61 | # endregion
62 |
63 | # region Config GPU
64 |
65 | def config_gpu():
66 | if const.DL_FRAMEWORK == 'tensorflow':
67 | __config_gpu_for_tensorflow()
68 | elif const.DL_FRAMEWORK == 'pytorch':
69 | __config_gpu_for_pytorch()
70 | elif const.DL_FRAMEWORK == 'keras':
71 | __config_gpu_for_keras()
72 | elif const.DL_FRAMEWORK == 'caffe':
73 | __config_gpu_for_caffe()
74 |
75 | def __config_gpu_for_tensorflow():
76 | import tensorflow as tf
77 |
78 | gpu_core_id = __parse_gpu_id()
79 |
80 | # import os
81 | # import tensorflow as tf
82 | # set the logging level of tensorflow
83 | # 1: filter out INFO
84 | # 2: filter out WARNING
85 | # 3: filter out ERROR
86 | # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # or any {'0', '1', '2'}
87 |
88 | # set which device to be used
89 | const.GPU_CORE_ID = gpu_core_id
90 | pass
91 |
92 | def __config_gpu_for_keras():
93 | import tensorflow as tf
94 | import keras.backend as K
95 |
96 | gpu_core_id = __parse_gpu_id()
97 |
98 | K.clear_session()
99 | config = tf.ConfigProto()
100 | config.gpu_options.visible_device_list = str(gpu_core_id)
101 | config.gpu_options.allow_growth = True
102 | session = tf.Session(config=config)
103 | K.set_session(session)
104 |
105 | # set which device to be used
106 | const.GPU_CORE_ID = gpu_core_id
107 |
108 | def __config_gpu_for_pytorch():
109 | import torch
110 |
111 | gpu_core_id = __parse_gpu_id()
112 |
113 | torch.cuda.set_device(gpu_core_id)
114 |
115 | # set which device to be used
116 | const.GPU_CORE_ID = gpu_core_id
117 |
118 | def __config_gpu_for_caffe():
119 | import os
120 |
121 | gpu_core_id = __parse_gpu_id()
122 |
123 | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_core_id)
124 |
125 | # set which device to be used
126 | const.GPU_CORE_ID = gpu_core_id
127 |
128 | def __parse_gpu_id():
129 | parser = argparse.ArgumentParser()
130 | parser.add_argument('-c', '--gpu_core_id', default='-1', type=int)
131 | args = parser.parse_args()
132 | gpu_core_id = args.gpu_core_id
133 | return gpu_core_id
134 |
135 | # endregion
136 |
137 | # region Config File Helpers
138 |
139 | def cfg_print_cfg():
140 | logger.info('Config file is:')
141 | logger.info(pprint.pformat(__C))
142 |
143 | def cfg_merge_dicts(dict_a, dict_b):
144 | from ast import literal_eval
145 |
146 | for key, value in dict_a.items():
147 | if key not in dict_b:
148 | raise KeyError('Invalid key in config file: {}'.format(key))
149 | if type(value) is dict:
150 | dict_a[key] = value = AttrDict(value)
151 | if isinstance(value, str):
152 | try:
153 | value = literal_eval(value)
154 | except BaseException:
155 | pass
156 | # the types must match, too
157 | old_type = type(dict_b[key])
158 | if old_type is not type(value) and value is not None:
159 | raise ValueError('Type mismatch ({} vs. {}) for config key: {}'.format(type(dict_b[key]), type(value), key))
160 | # recursively merge dicts
161 | if isinstance(value, AttrDict):
162 | try:
163 | cfg_merge_dicts(dict_a[key], dict_b[key])
164 | except BaseException:
165 | raise Exception('Error under config key: {}'.format(key))
166 | else:
167 | dict_b[key] = value
168 |
169 | def cfg_from_file(file_path, is_check=True):
170 | """
171 | Load a config file and merge it into the default options.
172 | """
173 |
174 | # read from file
175 | yaml_config = utils.yaml_load(file_path)
176 |
177 | # merge to project config
178 | cfg_merge_dicts(yaml_config, __C)
179 |
180 | # make sure everything is okay
181 | if is_check:
182 | cfg_sanity_check()
183 |
184 | def cfg_from_attrdict(attr_dict):
185 | cfg_merge_dicts(attr_dict, __C)
186 |
187 | def cfg_from_dict(args_dict):
188 |     """Set config keys via dict (e.g., from command line)."""
189 |
190 |     for key, value in args_dict.items():
191 | key_list = key.split('.')
192 | cfg = __C
193 | for subkey in key_list[:-1]:
194 | assert subkey in cfg, 'Config key {} not found'.format(subkey)
195 | cfg = cfg[subkey]
196 | subkey = key_list[-1]
197 | if subkey not in cfg:
198 | raise Exception('Config key {} not found'.format(subkey))
199 | try:
200 | # handle the case when v is a string literal
201 | val = literal_eval(value)
202 | except BaseException:
203 | val = value
204 | if isinstance(val, type(cfg[subkey])) or cfg[subkey] is None:
205 | pass
206 | else:
207 | type1 = type(val)
208 | type2 = type(cfg[subkey])
209 | msg = 'type {} does not match original type {}'.format(type1, type2)
210 | raise Exception(msg)
211 | cfg[subkey] = val
212 |
213 | def cfg_from_list(args_list):
214 | """
215 | Set config keys via list (e.g., from command line).
216 | """
217 | from ast import literal_eval
218 |
219 | assert len(args_list) % 2 == 0, 'Specify values or keys for args'
220 | for key, value in zip(args_list[0::2], args_list[1::2]):
221 | key_list = key.split('.')
222 | cfg = __C
223 | for subkey in key_list[:-1]:
224 | assert subkey in cfg, 'Config key {} not found'.format(subkey)
225 | cfg = cfg[subkey]
226 | subkey = key_list[-1]
227 | assert subkey in cfg, 'Config key {} not found'.format(subkey)
228 | try:
229 | # handle the case when v is a string literal
230 | val = literal_eval(value)
231 | except BaseException:
232 | val = value
233 | msg = 'type {} does not match original type {}'.format(type(val), type(cfg[subkey]))
234 | assert isinstance(val, type(cfg[subkey])) or cfg[subkey] is None, msg
235 | cfg[subkey] = val
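# Usage sketch for cfg_from_list / cfg_from_dict above (the key names exist in this project's
# config, the values below are only illustrative): keys are dot-separated paths into the nested
# config, and values are parsed with literal_eval before being type-checked against the defaults:
#   cfg_from_list(['TRAIN.BATCH_SIZE', '32', 'SOLVER.LR', '0.001'])
#   cfg_from_dict({'TRAIN.SCHEME': 'tco', 'MODEL.N_TC_LAYERS': '4'})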
236 |
237 | def cfg_sanity_check():
238 | assert __C.TRAIN.SCHEME in const.TRAIN_SCHEMES
239 | assert __C.MODEL.CLASSIFICATION_TYPE in const.MODEL_CLASSIFICATION_TYPES
240 | assert __C.MODEL.MULTISCALE_TYPE in const.MODEL_MULTISCALE_TYPES
241 | assert __C.SOLVER.NAME in const.SOLVER_NAMES
242 | assert __C.DATASET_NAME in const.DATASET_NAMES
243 |
244 | # endregion
245 |
--------------------------------------------------------------------------------
/experiments/train_keras.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Train Timeception layers on different datasets. There are two different ways to train Timeception.
25 | 1. Timeception-only (TCO): only timeception layers are trained, using features extracted from backbone CNNs.
26 | 2. End-to-end (ETE): timeception is trained on top of backbone CNN. The input is video frames passed throughtout the backboneCNN
27 | and then the resulted feature is fed to Timeception layers. Here, you enjoy all the benefits of end-to-end training.
28 | For example, do pre-processing to the input frames, randomly sample the frames, temporal jittering, ...., etc.
29 | """
30 |
31 | from __future__ import absolute_import
32 | from __future__ import division
33 | from __future__ import print_function
34 | from __future__ import unicode_literals
35 |
36 | import logging
37 | import os
38 | import datetime
39 | import numpy as np
40 | from optparse import OptionParser
41 |
42 | import tensorflow as tf
43 | import keras.backend as K
44 | from keras.layers import Dense, LeakyReLU, Dropout, Input, Activation, BatchNormalization
45 | from keras.optimizers import SGD, Adam
46 | from keras.models import Model
47 |
48 | from nets import timeception
49 | from nets.layers_keras import MaxLayer
50 | from core import utils, keras_utils, image_utils, config_utils, const, config, data_utils
51 | from core.utils import Path as Pth
52 |
53 | logger = logging.getLogger(__name__)
54 |
55 | def train_tco():
56 | """
57 | Train Timeception layers based on the given configurations.
58 | This train scheme is Timeception-only (TCO).
59 | """
60 |
61 | # get some configs for the training
62 | n_workers = config.cfg.TRAIN.N_WORKERS
63 | n_epochs = config.cfg.TRAIN.N_EPOCHS
64 | dataset_name = config.cfg.DATASET_NAME
65 | model_name = '%s_%s' % (config.cfg.MODEL.NAME, utils.timestamp())
66 |
67 | # data generators
68 | data_generator_tr = __define_data_generator(is_training=True)
69 | data_generator_te = __define_data_generator(is_training=False)
70 |
71 | logger.info('--- start time')
72 | logger.info(datetime.datetime.now())
73 | logger.info('... [tr]: n_samples, n_batch, batch_size: %d, %d, %d' % (data_generator_tr.n_samples, data_generator_tr.n_batches, config.cfg.TRAIN.BATCH_SIZE))
74 | logger.info('... [te]: n_samples, n_batch, batch_size: %d, %d, %d' % (data_generator_te.n_samples, data_generator_te.n_batches, config.cfg.TEST.BATCH_SIZE))
75 |
76 | # callback to save the model
77 | save_callback = keras_utils.SaveCallback(dataset_name, model_name)
78 |
79 | # load model
80 | model = __define_timeception_model()
81 | logger.info(model.summary())
82 |
83 | # train the model
84 | model.fit_generator(epochs=n_epochs, generator=data_generator_tr, validation_data=data_generator_te, use_multiprocessing=True, workers=n_workers, callbacks=[save_callback], verbose=2)
85 |
86 | logger.info('--- finish time')
87 | logger.info(datetime.datetime.now())
88 |
89 | def train_ete():
90 | """
91 | Train Timeception layers based on the given configurations.
92 | This train scheme is End-to-end (ETE).
93 | """
94 |
95 | model = __define_timeception_model()
96 |
97 | raise Exception('Sorry, not implemented yet!')
98 |
99 | def __define_data_generator(is_training):
100 | """
101 | Define data generator.
102 | """
103 |
104 | # get some configs for the training
105 | n_classes = config.cfg.MODEL.N_CLASSES
106 | dataset_name = config.cfg.DATASET_NAME
107 | backbone_model_name = config.cfg.MODEL.BACKBONE_CNN
108 | backbone_feature_name = config.cfg.MODEL.BACKBONE_FEATURE
109 | n_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS
110 |
111 | batch_size_tr = config.cfg.TRAIN.BATCH_SIZE
112 | batch_size_te = config.cfg.TEST.BATCH_SIZE
113 | batch_size = batch_size_tr if is_training else batch_size_te
114 |
115 | # size and name of feature
116 | feature_name = 'features_%s_%s_%sf' % (backbone_model_name, backbone_feature_name, n_timesteps)
117 | c, h, w = utils.get_model_feat_maps_info(backbone_model_name, backbone_feature_name)
118 | feature_dim = (n_timesteps, h, w, c)
119 |
120 | # data generators
121 | params = {'batch_size': batch_size, 'n_classes': n_classes, 'feature_name': feature_name, 'feature_dim': feature_dim, 'is_shuffle': True, 'is_training': is_training}
122 | data_generator_class = data_utils.KERAS_DATA_GENERATORS_DICT[dataset_name]
123 | data_generator = data_generator_class(**params)
124 |
125 | return data_generator
126 |
127 | def __define_timeception_model():
128 | """
129 | Define Timeception classifier.
130 | """
131 |
132 | # some configurations for the model
133 | classification_type = config.cfg.MODEL.CLASSIFICATION_TYPE
134 | solver_name = config.cfg.SOLVER.NAME
135 | solver_lr = config.cfg.SOLVER.LR
136 | adam_epsilon = config.cfg.SOLVER.ADAM_EPSILON
137 | n_tc_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS
138 | backbone_name = config.cfg.MODEL.BACKBONE_CNN
139 | feature_name = config.cfg.MODEL.BACKBONE_FEATURE
140 | n_tc_layers = config.cfg.MODEL.N_TC_LAYERS
141 | n_classes = config.cfg.MODEL.N_CLASSES
142 | is_dilated = config.cfg.MODEL.MULTISCALE_TYPE
143 | n_channels_in, channel_h, channel_w = utils.get_model_feat_maps_info(backbone_name, feature_name)
144 | n_groups = int(n_channels_in / 128.0)
145 |
146 | # optimizer and loss for either multi-label "ml" or single-label "sl" classification
147 | if classification_type == 'ml':
148 | loss = keras_utils.LOSSES[3]
149 | output_activation = keras_utils.ACTIVATIONS[2]
150 | metric_function = keras_utils.map_charades
151 | else:
152 | loss = keras_utils.LOSSES[0]
153 | output_activation = keras_utils.ACTIVATIONS[3]
154 | metric_function = keras_utils.METRICS[0]
155 |
156 | # define the optimizer
157 | optimizer = SGD(lr=0.01) if solver_name == 'sgd' else Adam(lr=solver_lr, epsilon=adam_epsilon)
158 |
159 | # input layer
160 | input_shape = (n_tc_timesteps, channel_h, channel_w, n_channels_in) # (T, H, W, C)
161 | tensor_input = Input(shape=input_shape, name='input') # (T, H, W, C)
162 |
163 | # define timeception layers, as a standalone module
164 | timeception_module = timeception.Timeception(n_channels_in, n_tc_layers, n_groups, is_dilated=is_dilated)
165 | tensor = timeception_module(tensor_input) # (T, H, W, C)
166 |
167 | # but if you fancy, you can define timeception layers as a series of layers
168 | # tensor = timeception.timeception_layers(tensor_input, n_tc_layers, n_groups, is_dilated=is_dilated) # (T, H, W, C)
169 |
170 | # max-pool over space-time
171 | tensor = MaxLayer(axis=(1, 2, 3), name='maxpool_t_s')(tensor)
172 |
173 | # dense layers for classification
174 | tensor = Dropout(0.5)(tensor)
175 | tensor = Dense(512)(tensor)
176 | tensor = BatchNormalization()(tensor)
177 | tensor = LeakyReLU(alpha=0.2)(tensor)
178 | tensor = Dropout(0.25)(tensor)
179 | tensor = Dense(n_classes)(tensor)
180 | tensor_output = Activation(output_activation)(tensor)
181 |
182 | # define the model
183 | model = Model(inputs=tensor_input, outputs=tensor_output)
184 | model.compile(loss=loss, optimizer=optimizer, metrics=[metric_function])
185 |
186 | return model
187 |
188 | def __main():
189 | """
190 | Run this script to train Timeception.
191 | """
192 |
193 | default_config_file = 'charades_i3d_tc4_f1024.yaml'
194 | default_config_file = 'charades_i3d_tc2_f256.yaml'
195 |
196 | # Parse the arguments
197 | parser = OptionParser()
198 | parser.add_option('-c', '--config_file', dest='config_file', default=default_config_file, help='Yaml config file that contains all training details.')
199 | (options, args) = parser.parse_args()
200 | config_file = options.config_file
201 |
202 |     # fall back to the default config if none was passed
203 |     if config_file is None or config_file == '':
204 |         config_file = default_config_file
205 |         msg = 'Config file not passed, default config is used: %s' % (config_file)
206 |         logging.warning(msg)
207 |
208 | # path of config file
209 | config_path = './configs/%s' % (config_file)
210 |
211 | # check if file exist
212 | if not os.path.exists(config_path):
213 | msg = 'Sorry, could not find config file with the following path: %s' % (config_path)
214 | logging.error(msg)
215 | else:
216 | # read the config from file and copy it to the project configuration "cfg"
217 | config_utils.cfg_from_file(config_path)
218 |
219 | # choose which training scheme, either 'ete' or 'tco'
220 | training_scheme = config.cfg.TRAIN.SCHEME
221 |
222 | # start training
223 | if training_scheme == 'tco':
224 | train_tco()
225 | else:
226 | train_ete()
227 |
228 | if __name__ == '__main__':
229 | __main()
230 |
--------------------------------------------------------------------------------
/core/pytorch_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Helper functions for pytorch.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import os
33 | import logging
34 | import json
35 | import numpy as np
36 | from collections import OrderedDict
37 |
38 | import torch
39 | from torch import nn
40 | from torch.nn import functional as F
41 | from torch.autograd import Variable
42 |
43 | import torchviz
44 | import torchvision
45 | import torchsummary
46 |
47 | logger = logging.getLogger(__name__)
48 |
49 | # region Helpers
50 |
51 | def save_model(model, path):
52 |     torch.save(model.state_dict(), path)
53 |
54 | def load_model(model, path):
55 | model_dict = torch.load(path)
56 | model.load_state_dict(model_dict)
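# Usage sketch (the model folder name below is hypothetical; ModelSaver in this file writes
# one '%03d.pt' checkpoint per epoch under './data/<dataset_name>/models/<model_name>/'):
#   save_model(model, './data/charades/models/my_model/001.pt')
#   load_model(model, './data/charades/models/my_model/001.pt')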
57 |
58 | def padding1d(tensor, filter):
59 | it, = tensor.shape[2:]
60 | ft = filter
61 |
62 | pt = max(0, (it - 1) + (ft - 1) + 1 - it)
63 |     oddt = (pt % 2 != 0)
64 |
65 | mode = str('constant')
66 | if any([oddt]):
67 | pad = [0, int(oddt)]
68 | tensor = F.pad(tensor, pad, mode=mode)
69 |
70 |     padding = (pt // 2,)
71 | return tensor, padding
72 |
73 | def padding3d(tensor, filter, mode=str('constant')):
74 | """
75 | Input shape (BN, C, T, H, W)
76 | """
77 |
78 | it, ih, iw = tensor.shape[2:]
79 | ft, fh, fw = filter.shape
80 |
81 | pt = max(0, (it - 1) + (ft - 1) + 1 - it)
82 | ph = max(0, (ih - 1) + (fh - 1) + 1 - ih)
83 | pw = max(0, (iw - 1) + (fw - 1) + 1 - iw)
84 |
85 | oddt = (pt % 2 != 0)
86 | oddh = (ph % 2 != 0)
87 | oddw = (pw % 2 != 0)
88 |
89 | if any([oddt, oddh, oddw]):
90 | pad = [0, int(oddt), 0, int(oddh), 0, int(oddw)]
91 | tensor = F.pad(tensor, pad, mode=mode)
92 |
93 | padding = (pt // 2, ph // 2, pw // 2)
94 | tensor = F.conv3d(tensor, filter, padding=padding)
95 |
96 | return tensor
97 |
98 | def calc_padding_1d(input_size, kernel_size, stride=1, dilation=1):
99 | """
100 | Calculate the padding.
101 | """
102 |
103 | # i = input
104 | # o = output
105 | # p = padding
106 | # k = kernel_size
107 | # s = stride
108 | # d = dilation
109 | # the equation is
110 | # o = [i + 2 * p - k - (k - 1) * (d - 1)] / s + 1
111 |     # given that we want o = i, solving the equation for p gives us
112 |
113 | i = input_size
114 | s = stride
115 | k = kernel_size
116 | d = dilation
117 |
118 | padding = 0.5 * (k - i + s * (i - 1) + (k - 1) * (d - 1))
119 | padding = int(padding)
120 |
121 | return padding
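# Worked example of the formula above: with kernel_size k=3, stride s=1 and dilation d=1,
# padding = 0.5 * (3 - i + (i - 1) + 0) = 1, i.e. pad by 1 so the output length equals the
# input length; with dilation d=2 it gives padding = 0.5 * (3 - i + (i - 1) + 2) = 2.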
122 |
123 | def summary(model, input_size, batch_size=-1, device="cuda"):
124 | """
125 |     Custom summary function that prints the custom name of a module instead of the auto-assigned layer name.
126 | :param model:
127 | :param input_size:
128 | :param batch_size:
129 | :param device:
130 | :return:
131 | """
132 |
133 | # this has to be imported here, not to create import-loop between "nets.layers_pytorch" and "core.pytorch_utils"
134 | from nets.layers_pytorch import DepthwiseConv1DLayer
135 |
136 | def register_hook(module):
137 |
138 | def hook(module, input, output):
139 |
140 | # old code
141 | # class_name = str(module.__class__).split(".")[-1].split("'")[0]
142 | # m_key = "%s-%i" % (class_name, module_idx + 1)
143 |
144 | # don't consider this layer
145 | if type(module) == DepthwiseConv1DLayer:
146 | return
147 |
148 | # new code
149 | if hasattr(module, '_name'):
150 | m_key = str(module._name)
151 | else:
152 | module_idx = len(summary)
153 | class_name = str(module.__class__).split(".")[-1].split("'")[0]
154 | m_key = "%s-%i" % (class_name, module_idx + 1)
155 |
156 | summary[m_key] = OrderedDict()
157 | summary[m_key]["input_shape"] = list(input[0].size())
158 | summary[m_key]["input_shape"][0] = batch_size
159 | if isinstance(output, (list, tuple)):
160 | summary[m_key]["output_shape"] = [
161 | [-1] + list(o.size())[1:] for o in output
162 | ]
163 | else:
164 | summary[m_key]["output_shape"] = list(output.size())
165 | summary[m_key]["output_shape"][0] = batch_size
166 |
167 | params = 0
168 | if hasattr(module, "weight") and hasattr(module.weight, "size"):
169 | params += torch.prod(torch.LongTensor(list(module.weight.size())))
170 | summary[m_key]["trainable"] = module.weight.requires_grad
171 | if hasattr(module, "bias") and hasattr(module.bias, "size"):
172 | params += torch.prod(torch.LongTensor(list(module.bias.size())))
173 | summary[m_key]["nb_params"] = params
174 |
175 | if (not isinstance(module, nn.Sequential) and not isinstance(module, nn.ModuleList) and not (module == model)):
176 | hooks.append(module.register_forward_hook(hook))
177 |
178 | device = device.lower()
179 | assert device in [
180 | "cuda",
181 | "cpu",
182 | ], "Input device is not valid, please specify 'cuda' or 'cpu'"
183 |
184 | if device == "cuda" and torch.cuda.is_available():
185 | dtype = torch.cuda.FloatTensor
186 | else:
187 | dtype = torch.FloatTensor
188 |
189 | # multiple inputs to the network
190 | if isinstance(input_size, tuple):
191 | input_size = [input_size]
192 |
193 | # batch_size of 2 for batchnorm
194 | x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size]
195 | # print(type(x[0]))
196 |
197 | # create properties
198 | summary = OrderedDict()
199 | hooks = []
200 |
201 | # register hook
202 | model.apply(register_hook)
203 |
204 | # make a forward pass
205 | # print(x.shape)
206 | model(*x)
207 |
208 | # remove these hooks
209 | for h in hooks:
210 | h.remove()
211 |
212 | print("----------------------------------------------------------------")
213 | line_new = "{:>20} {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
214 | print(line_new)
215 | print("================================================================")
216 | total_params = 0
217 | total_output = 0
218 | trainable_params = 0
219 | for layer in summary:
220 | # input_shape, output_shape, trainable, nb_params
221 | line_new = "{:>20} {:>25} {:>15}".format(layer, str(summary[layer]["output_shape"]), "{0:,}".format(summary[layer]["nb_params"]), )
222 | total_params += summary[layer]["nb_params"]
223 | total_output += np.prod(summary[layer]["output_shape"])
224 | if "trainable" in summary[layer]:
225 | if summary[layer]["trainable"] == True:
226 | trainable_params += summary[layer]["nb_params"]
227 | print(line_new)
228 |
229 | # assume 4 bytes/number (float on cuda).
230 | total_input_size = abs(np.prod(input_size) * batch_size * 4. / (1024 ** 2.))
231 | total_output_size = abs(2. * total_output * 4. / (1024 ** 2.)) # x2 for gradients
232 | total_params_size = abs(total_params.numpy() * 4. / (1024 ** 2.))
233 | total_size = total_params_size + total_output_size + total_input_size
234 |
235 | print("================================================================")
236 | print("Total params: {0:,}".format(total_params))
237 | print("Trainable params: {0:,}".format(trainable_params))
238 | print("Non-trainable params: {0:,}".format(total_params - trainable_params))
239 | print("----------------------------------------------------------------")
240 | print("Input size (MB): %0.2f" % total_input_size)
241 | print("Forward/backward pass size (MB): %0.2f" % total_output_size)
242 | print("Params size (MB): %0.2f" % total_params_size)
243 | print("Estimated Total Size (MB): %0.2f" % total_size)
244 | print("----------------------------------------------------------------")
245 | # return summary
246 |
247 | # endregion
248 |
249 | # region Classes
250 |
251 | class ModelSaver():
252 | def __init__(self, model, dataset_name, model_name):
253 | self.model = model
254 | self.model_name = model_name
255 |
256 | model_root_path = './data/%s/models' % (dataset_name)
257 | assert os.path.exists(model_root_path)
258 |
259 | model_root_path = './data/%s/models/%s' % (dataset_name, model_name)
260 | if not os.path.exists(model_root_path):
261 | os.mkdir(model_root_path)
262 |
263 | self.model_root_path = model_root_path
264 |
265 | def save(self, idx_epoch):
266 | """
267 | Save the model.
268 | """
269 | epoch_num = idx_epoch + 1
270 | model_root_path = self.model_root_path
271 | model_state_path = str('%s/%03d.pt' % (model_root_path, epoch_num))
272 |
273 | # save model state using pytorch
274 | model_state = self.model.state_dict()
275 | torch.save(model_state, model_state_path)
276 |
277 |
278 | # endregion
279 |
--------------------------------------------------------------------------------
/core/data_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Helpful functions and classes to deal with data.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import logging
33 | import random
34 | import numpy as np
35 | import pickle as pkl
36 | from datetime import datetime
37 | from multiprocessing.dummy import Pool
38 |
39 | import keras.utils
40 | import torch.utils.data
41 | import torchvision
42 |
43 | from core import utils, config
44 | from core.utils import Path as Pth
45 |
46 | logger = logging.getLogger(__name__)
47 |
48 | # region Async File Loader
49 |
50 | class AsyncLoaderVideoFeatures():
51 | """
52 | Load features for the video frames.
53 | """
54 |
55 | def __init__(self, feats_path, target, n_frames_per_video, batch_size, n_feat_maps, feat_map_side_dim, n_threads=10, annotation_dict=None):
56 | random.seed(101)
57 | np.random.seed(101)
58 |
59 | self.__feats_pathes = feats_path
60 | self.__n_frames_per_video = n_frames_per_video
61 | self.__n_feat_maps = n_feat_maps
62 | self.__feat_map_side_dim = feat_map_side_dim
63 | self.__annotation_dict = annotation_dict
64 |
65 | self.__batch_size = batch_size
66 | self.__y = target
67 |
68 | self.__is_busy = False
69 | self.__batch_features = None
70 | self.__batch_y = None
71 | self.__n_threads_in_pool = n_threads
72 | self.__pool = Pool(self.__n_threads_in_pool)
73 |
74 | def load_feats_in_batch(self, batch_number):
75 | self.__is_busy = True
76 |
77 | idx_batch = batch_number - 1
78 | start_idx = idx_batch * self.__batch_size
79 | stop_idx = (idx_batch + 1) * self.__batch_size
80 |
81 | batch_feat_pathes = self.__feats_pathes[start_idx:stop_idx]
82 | batch_y = self.__y[start_idx:stop_idx]
83 |
84 | n_batch_feats = len(batch_feat_pathes)
85 | n_batch_y = len(batch_y)
86 | idxces = range(0, n_batch_feats)
87 |
88 | assert n_batch_feats == n_batch_y
89 |
90 | # parameters passed to the reading function
91 | params = [data_item for data_item in zip(idxces, batch_feat_pathes)]
92 |
93 | # set list of batch features before start reading
94 | batch_feats_shape = (n_batch_feats, self.__n_frames_per_video, self.__feat_map_side_dim, self.__feat_map_side_dim, self.__n_feat_maps)
95 |
96 | self.__batch_features = np.zeros(batch_feats_shape, dtype=np.float32)
97 | self.__batch_y = batch_y
98 |
99 | # start pool of threads
100 | self.__pool.map_async(self.__load_features, params, callback=self.__thread_pool_callback)
101 |
102 | def get_batch_data(self):
103 | if self.__is_busy:
104 | raise Exception('Sorry, you can\'t get features while threads are running!')
105 | else:
106 | return (self.__batch_features, self.__batch_y)
107 |
108 | def get_y(self):
109 | return self.__y
110 |
111 | def is_busy(self):
112 | return self.__is_busy
113 |
114 | def __thread_pool_callback(self, args):
115 | self.__is_busy = False
116 |
117 | def __load_features(self, params):
118 |
119 | idx_video = params[0]
120 | feats_path = params[1]
121 | video_name = feats_path.split('/')[-1]
122 |
123 | try:
124 | # load feature from file
125 | feats = utils.pkl_load(feats_path)
126 |
127 | n_feats = len(feats)
128 | assert n_feats == self.__n_frames_per_video, 'Sorry, wrong number of frames, expected: %d, got: %d' % (self.__n_frames_per_video, n_feats)
129 | self.__batch_features[idx_video] = feats
130 |
131 | except Exception as exp:
132 | print('\nSorry, error in loading feature %s' % (feats_path))
133 | print(exp)
134 |
135 | def shuffle_data(self):
136 | """
137 |         Shuffle these data: self.__feats_pathes, self.__y
138 | :return:
139 | """
140 |
141 | n_samples = len(self.__feats_pathes)
142 |
143 |         idx = np.arange(n_samples)
144 |         np.random.shuffle(idx)
145 | self.__feats_pathes = self.__feats_pathes[idx]
146 | self.__y = self.__y[idx]
147 |
148 | def close(self):
149 | self.__pool.close()
150 | self.__pool.terminate()
151 |
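# Usage sketch of the async loader above (argument values are illustrative, not from a config):
#   loader = AsyncLoaderVideoFeatures(feats_path=paths, target=y, n_frames_per_video=128,
#                                     batch_size=32, n_feat_maps=1024, feat_map_side_dim=7)
#   loader.load_feats_in_batch(batch_number=1)   # batch numbers are 1-based
#   while loader.is_busy():
#       pass                                     # wait (or do other work) until loading finishes
#   x, y = loader.get_batch_data()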
152 | # endregion
153 |
154 | # region Data Generators (Keras)
155 |
156 | class DataGeneratorCharades(keras.utils.Sequence):
157 | 'Generates data for Keras'
158 |
159 | def __init__(self, batch_size, n_classes, feature_dim, feature_name, is_training, is_shuffle=True):
160 | """
161 | Initialization
162 | """
163 | self.batch_size = batch_size
164 | self.is_training = is_training
165 | self.n_classes = n_classes
166 | self.feature_dim = feature_dim
167 | self.feature_name = feature_name
168 | self.is_shuffle = is_shuffle
169 | self.dataset_name = 'charades'
170 |
171 | # load annotation
172 | root_path = './data/charades'
173 | annotation_path = '%s/annotation/video_annotation.pkl' % (root_path)
174 | if self.is_training:
175 | (video_names, y, _, _) = utils.pkl_load(annotation_path)
176 | else:
177 | (_, _, video_names, y) = utils.pkl_load(annotation_path)
178 |
179 |         # build the full paths of the feature files
180 | feats_path = np.array(['%s/%s/%s.pkl' % (root_path, feature_name, p) for p in video_names])
181 |
182 | n_samples = len(y)
183 | self.n_samples = n_samples
184 | self.n_batches = utils.calc_num_batches(n_samples, batch_size)
185 | self.feats_path = feats_path
186 | self.y = y
187 |
188 | # shuffle the data
189 | if self.is_shuffle:
190 | self.__shuffle()
191 |
192 | def __len__(self):
193 | """
194 |         Denotes the number of batches per epoch.
195 | """
196 | return self.n_batches
197 |
198 | def __getitem__(self, index):
199 | """
200 | Generate one batch of data.
201 | """
202 |
203 | idx_start = index * self.batch_size
204 | idx_stop = (index + 1) * self.batch_size
205 | y = self.y[idx_start:idx_stop]
206 | feats_path = self.feats_path[idx_start:idx_stop]
207 |
208 | n_items = len(feats_path)
209 | x_shape = tuple([n_items] + list(self.feature_dim))
210 | x = np.zeros(x_shape, dtype=np.float32)
211 |
212 |         # loop over the feature paths and load them
213 | for idx, p in enumerate(feats_path):
214 | x[idx] = utils.pkl_load(p)
215 |
216 | return x, y
217 |
218 | def on_epoch_end(self):
219 | """
220 | Shuffle after finishing the epoch.
221 | :return:
222 | """
223 |
224 | if self.is_shuffle:
225 | self.__shuffle()
226 |
227 | def __shuffle(self):
228 |
229 |         idx = np.arange(self.n_samples)
230 |         np.random.shuffle(idx)
231 | self.feats_path = self.feats_path[idx]
232 | self.y = self.y[idx]
233 |
234 | # endregion
235 |
236 | # region Data Loaders (PyTorch)
237 |
238 | class DatasetCharades(torch.utils.data.Dataset):
239 | def __init__(self, batch_size, n_classes, feature_dim, feature_name, is_training, is_shuffle=True):
240 | """
241 | Initialization
242 | """
243 |
244 | self.batch_size = batch_size
245 | self.is_training = is_training
246 | self.n_classes = n_classes
247 | self.feature_dim = feature_dim
248 | self.feature_name = feature_name
249 | self.is_shuffle = is_shuffle
250 | self.dataset_name = 'charades'
251 |
252 | # load annotation
253 | root_path = './data/charades'
254 | annotation_path = '%s/annotation/video_annotation.pkl' % (root_path)
255 | if self.is_training:
256 | (video_names, y, _, _) = utils.pkl_load(annotation_path)
257 | else:
258 | (_, _, video_names, y) = utils.pkl_load(annotation_path)
259 |
260 | # in case of single label classification, debinarize the labels
261 | if config.cfg.MODEL.CLASSIFICATION_TYPE == 'sl':
262 | y = utils.debinarize_label(y)
263 |
264 | # in any case, make sure target is float
265 | y = y.astype(np.float32)
266 |
267 |         # build the full paths of the feature files
268 | feats_path = np.array(['%s/%s/%s.pkl' % (root_path, feature_name, p) for p in video_names])
269 |
270 | n_samples = len(y)
271 | self.n_samples = n_samples
272 | self.n_batches = utils.calc_num_batches(n_samples, batch_size)
273 | self.feats_path = feats_path
274 | self.y = y
275 |
276 | # shuffle the data
277 | if self.is_shuffle:
278 | self.__shuffle()
279 |
280 | def __getitem__(self, index):
281 | """
282 | Generate one batch of data
283 | """
284 |
285 | y = self.y[index]
286 | p = self.feats_path[index]
287 | x = utils.pkl_load(p) # (T, H, W, C)
288 |
289 |         # convert from channel-last to channel-first
290 |         x = np.transpose(x, (3, 0, 1, 2)) # (C, T, H, W)
291 |
292 | return x, y
293 |
294 | def __len__(self):
295 | return self.n_samples
296 |
297 | def __shuffle(self):
298 |         idx = np.arange(self.n_samples)
299 |         np.random.shuffle(idx)
300 | self.feats_path = self.feats_path[idx]
301 | self.y = self.y[idx]
302 |
303 | # endregion
304 |
305 | # region Constants
306 |
307 | KERAS_DATA_GENERATORS_DICT = {'charades': DataGeneratorCharades}
308 | PYTORCH_DATASETS_DICT = {'charades': DatasetCharades}
309 |
310 | # endregion
311 |
--------------------------------------------------------------------------------
/experiments/train_pytorch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Train Timeception layers on different datasets. There are two different ways to train Timeception.
25 | 1. Timeception-only (TCO): only timeception layers are trained, using features extracted from backbone CNNs.
26 | 2. End-to-end (ETE): timeception is trained on top of backbone CNN. The input is video frames passed throughtout the backboneCNN
27 | and then the resulted feature is fed to Timeception layers. Here, you enjoy all the benefits of end-to-end training.
28 | For example, do pre-processing to the input frames, randomly sample the frames, temporal jittering, ...., etc.
29 | """
30 |
31 | from __future__ import absolute_import
32 | from __future__ import division
33 | from __future__ import print_function
34 | from __future__ import unicode_literals
35 |
36 | import os
37 | import sys
38 | import time
39 | import logging
40 | import datetime
41 | import numpy as np
42 | from optparse import OptionParser
43 |
44 | import torch
45 | import torch.utils.data
46 |
47 | from torch.nn import functional as F
48 | from torch.nn import Module, Dropout, BatchNorm1d, LeakyReLU, Linear, LogSoftmax, Sigmoid
49 | from torch.optim import SGD, Adam
50 | from torch.autograd import Variable
51 | from torch.utils.data import DataLoader
52 | from torchvision import datasets, transforms
53 |
54 | import torchviz
55 | import torchvision
56 | import torchsummary
57 |
58 | from nets import timeception_pytorch
59 | from core import utils, pytorch_utils, image_utils, config_utils, const, config, data_utils, metrics
60 | from core.utils import Path as Pth
61 |
62 | logger = logging.getLogger(__name__)
63 |
64 | def train_tco():
65 | """
66 | Train Timeception layers based on the given configurations.
67 | This train scheme is Timeception-only (TCO).
68 | """
69 |
70 | # get some configs for the training
71 | n_epochs = config.cfg.TRAIN.N_EPOCHS
72 | dataset_name = config.cfg.DATASET_NAME
73 | model_name = '%s_%s' % (config.cfg.MODEL.NAME, utils.timestamp())
74 | device = 'cuda'
75 |
76 | # data generators
77 | loader_tr, n_samples_tr, n_batches_tr = __define_loader(is_training=True)
78 | loader_te, n_samples_te, n_batches_te = __define_loader(is_training=False)
79 |
80 | logger.info('--- start time')
81 | logger.info(datetime.datetime.now())
82 | logger.info('... [tr]: n_samples, n_batch, batch_size: %d, %d, %d' % (n_samples_tr, n_batches_tr, config.cfg.TRAIN.BATCH_SIZE))
83 | logger.info('... [te]: n_samples, n_batch, batch_size: %d, %d, %d' % (n_samples_te, n_batches_te, config.cfg.TEST.BATCH_SIZE))
84 |
85 | # load model
86 | model, optimizer, loss_fn, metric_fn, metric_fn_name = __define_timeception_model(device)
87 | logger.info(pytorch_utils.summary(model, model._input_shape[1:], batch_size=2, device='cuda'))
88 |
89 | # save the model
90 | model_saver = pytorch_utils.ModelSaver(model, dataset_name, model_name)
91 |
92 | # loop on the epochs
93 | sys.stdout.write('\n')
94 | for idx_epoch in range(n_epochs):
95 |
96 | epoch_num = idx_epoch + 1
97 |
98 | loss_tr = 0.0
99 | acc_tr = 0.0
100 | loss_te = 0.0
101 | acc_te = 0.0
102 |
103 | tt1 = time.time()
104 |
105 | # flag model as training
106 | model.train()
107 |
108 | # training
109 | for idx_batch, (x, y_true) in enumerate(loader_tr):
110 | batch_num = idx_batch + 1
111 |
112 | x, y_true = x.to(device), y_true.to(device)
113 | optimizer.zero_grad()
114 | y_pred = model(x)
115 | loss = loss_fn(y_pred, y_true)
116 | loss.backward()
117 | optimizer.step()
118 |
119 | # calculate accuracy
120 | y_true = y_true.cpu().numpy().astype(np.int32)
121 | y_pred = y_pred.cpu().detach().numpy()
122 | loss_b_tr = loss.cpu().detach().numpy()
123 | acc_b_tr = metric_fn(y_true, y_pred)
124 |
125 | loss_tr += loss_b_tr
126 | acc_tr += acc_b_tr
127 | loss_b_tr = loss_tr / float(batch_num)
128 | acc_b_tr = acc_tr / float(batch_num)
129 | tt2 = time.time()
130 | duration = tt2 - tt1
131 | sys.stdout.write('\r%04ds - epoch: %02d/%02d, batch [tr]: %02d/%02d, loss, %s: %0.2f, %0.2f ' % (duration, epoch_num, n_epochs, batch_num, n_batches_tr, metric_fn_name, loss_b_tr, acc_b_tr))
132 |
133 | # flag model as testing
134 | model.eval()
135 |
136 | # testing
137 | for idx_batch, (x, y_true) in enumerate(loader_te):
138 | batch_num = idx_batch + 1
139 |
140 | x, y_true = x.to(device), y_true.to(device)
141 | y_pred = model(x)
142 | loss_b_te = loss_fn(y_pred, y_true).cpu().detach().numpy()
143 | y_true = y_true.cpu().numpy().astype(np.int32)
144 | y_pred = y_pred.cpu().detach().numpy()
145 | acc_b_te = metric_fn(y_true, y_pred)
146 |
147 | loss_te += loss_b_te
148 | acc_te += acc_b_te
149 | loss_b_te = loss_te / float(batch_num)
150 | acc_b_te = acc_te / float(batch_num)
151 | tt2 = time.time()
152 | duration = tt2 - tt1
153 | sys.stdout.write('\r%04ds - epoch: %02d/%02d, batch [te]: %02d/%02d, loss, %s: %0.2f, %0.2f ' % (duration, epoch_num, n_epochs, batch_num, n_batches_te, metric_fn_name, loss_b_te, acc_b_te))
154 |
155 | loss_tr /= float(n_batches_tr)
156 | loss_te /= float(n_batches_te)
157 | acc_tr /= float(n_batches_tr)
158 | acc_te /= float(n_batches_te)
159 |
160 | tt2 = time.time()
161 | duration = tt2 - tt1
162 |         sys.stdout.write('\r%04ds - epoch: %02d/%02d, [tr]: %0.2f, %0.2f, [te]: %0.2f, %0.2f \n' % (duration, epoch_num, n_epochs, loss_tr, acc_tr, loss_te, acc_te))
163 |
164 | # after each epoch, save data
165 | model_saver.save(idx_epoch)
166 |
167 | logger.info('--- finish time')
168 | logger.info(datetime.datetime.now())
169 |
170 | def train_ete():
171 | """
172 | Train Timeception layers based on the given configurations.
173 | This train scheme is End-to-end (ETE).
174 | """
175 |
176 | raise Exception('Sorry, not implemented yet!')
177 |
178 | def __define_loader(is_training):
179 | """
180 | Define data loader.
181 | """
182 |
183 | # get some configs for the training
184 | n_classes = config.cfg.MODEL.N_CLASSES
185 | dataset_name = config.cfg.DATASET_NAME
186 | backbone_model_name = config.cfg.MODEL.BACKBONE_CNN
187 | backbone_feature_name = config.cfg.MODEL.BACKBONE_FEATURE
188 | n_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS
189 | n_workers = config.cfg.TRAIN.N_WORKERS
190 |
191 | batch_size_tr = config.cfg.TRAIN.BATCH_SIZE
192 | batch_size_te = config.cfg.TEST.BATCH_SIZE
193 | batch_size = batch_size_tr if is_training else batch_size_te
194 |
195 | # size and name of feature
196 | feature_name = 'features_%s_%s_%sf' % (backbone_model_name, backbone_feature_name, n_timesteps)
197 | c, h, w = utils.get_model_feat_maps_info(backbone_model_name, backbone_feature_name)
198 | feature_dim = (c, n_timesteps, h, w)
199 |
200 | # data generators
201 | params = {'batch_size': batch_size, 'n_classes': n_classes, 'feature_name': feature_name, 'feature_dim': feature_dim, 'is_training': is_training}
202 | dataset_class = data_utils.PYTORCH_DATASETS_DICT[dataset_name]
203 | dataset = dataset_class(**params)
204 | n_samples = dataset.n_samples
205 | n_batches = dataset.n_batches
206 |
207 | data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=n_workers, shuffle=True)
208 |
209 | return data_loader, n_samples, n_batches
210 |
211 | def __define_timeception_model(device):
212 | """
213 | Define model, optimizer, loss function and metric function.
214 | """
215 | # some configurations
216 | classification_type = config.cfg.MODEL.CLASSIFICATION_TYPE
217 | solver_name = config.cfg.SOLVER.NAME
218 | solver_lr = config.cfg.SOLVER.LR
219 | adam_epsilon = config.cfg.SOLVER.ADAM_EPSILON
220 |
221 | # define model
222 | model = Model().to(device)
223 | model_param = model.parameters()
224 |
225 | # define the optimizer
226 | optimizer = SGD(model_param, lr=0.01) if solver_name == 'sgd' else Adam(model_param, lr=solver_lr, eps=adam_epsilon)
227 |
228 | # loss and evaluation function for either multi-label "ml" or single-label "sl" classification
229 | if classification_type == 'ml':
230 | loss_fn = torch.nn.BCELoss()
231 | metric_fn = metrics.map_charades
232 | metric_fn_name = 'map'
233 | else:
234 | loss_fn = torch.nn.NLLLoss()
235 | metric_fn = metrics.accuracy
236 | metric_fn_name = 'acc'
237 |
238 | return model, optimizer, loss_fn, metric_fn, metric_fn_name
239 |
240 | class Model(Module):
241 | """
242 | Define Timeception classifier.
243 | """
244 |
245 | def __init__(self):
246 | super(Model, self).__init__()
247 |
248 | # some configurations for the model
249 | n_tc_timesteps = config.cfg.MODEL.N_TC_TIMESTEPS
250 | backbone_name = config.cfg.MODEL.BACKBONE_CNN
251 | feature_name = config.cfg.MODEL.BACKBONE_FEATURE
252 | n_tc_layers = config.cfg.MODEL.N_TC_LAYERS
253 | n_classes = config.cfg.MODEL.N_CLASSES
254 | is_dilated = config.cfg.MODEL.MULTISCALE_TYPE
255 | OutputActivation = Sigmoid if config.cfg.MODEL.CLASSIFICATION_TYPE == 'ml' else LogSoftmax
256 | n_channels_in, channel_h, channel_w = utils.get_model_feat_maps_info(backbone_name, feature_name)
257 | n_groups = int(n_channels_in / 128.0)
258 |
259 |         input_shape = (None, n_channels_in, n_tc_timesteps, channel_h, channel_w) # (N, C, T, H, W)
260 | self._input_shape = input_shape
261 |
262 |         # define the stack of timeception layers (n_tc_layers of them)
263 | self.timeception = timeception_pytorch.Timeception(input_shape, n_tc_layers, n_groups, is_dilated) # (C, T, H, W)
264 |
265 | # get number of output channels after timeception
266 | n_channels_in = self.timeception.n_channels_out
267 |
268 | # define layers for classifier
269 | self.do1 = Dropout(0.5)
270 | self.l1 = Linear(n_channels_in, 512)
271 | self.bn1 = BatchNorm1d(512)
272 | self.ac1 = LeakyReLU(0.2)
273 | self.do2 = Dropout(0.25)
274 | self.l2 = Linear(512, n_classes)
275 | self.ac2 = OutputActivation()
276 |
277 | def forward(self, input):
278 | # feedforward the input to the timeception layers
279 | tensor = self.timeception(input)
280 |
281 | # max-pool over space-time
282 | bn, c, t, h, w = tensor.size()
283 | tensor = tensor.view(bn, c, t * h * w)
284 | tensor = torch.max(tensor, dim=2, keepdim=False)
285 | tensor = tensor[0]
286 |
287 | # dense layers for classification
288 | tensor = self.do1(tensor)
289 | tensor = self.l1(tensor)
290 | tensor = self.bn1(tensor)
291 | tensor = self.ac1(tensor)
292 | tensor = self.do2(tensor)
293 | tensor = self.l2(tensor)
294 | tensor = self.ac2(tensor)
295 |
296 | return tensor
297 |
298 | def __main():
299 | """
300 | Run this script to train Timeception.
301 | """
302 |
303 | default_config_file = 'charades_i3d_tc4_f1024.yaml'
304 | default_config_file = 'charades_i3d_tc2_f256.yaml'
305 |
306 | # Parse the arguments
307 | parser = OptionParser()
308 | parser.add_option('-c', '--config_file', dest='config_file', default=default_config_file, help='Yaml config file that contains all training details.')
309 | (options, args) = parser.parse_args()
310 | config_file = options.config_file
311 |
312 |     # fall back to the default config if none was passed
313 |     if config_file is None or config_file == '':
314 |         config_file = default_config_file
315 |         msg = 'Config file not passed, default config is used: %s' % (config_file)
316 |         logging.warning(msg)
317 |
318 | # path of config file
319 | config_path = './configs/%s' % (config_file)
320 |
321 | # check if file exist
322 | if not os.path.exists(config_path):
323 | msg = 'Sorry, could not find config file with the following path: %s' % (config_path)
324 | logging.error(msg)
325 | else:
326 | # read the config from file and copy it to the project configuration "cfg"
327 | config_utils.cfg_from_file(config_path)
328 |
329 | # choose which training scheme, either 'ete' or 'tco'
330 | training_scheme = config.cfg.TRAIN.SCHEME
331 |
332 | # start training
333 | if training_scheme == 'tco':
334 | train_tco()
335 | else:
336 | train_ete()
337 |
338 | if __name__ == '__main__':
339 | __main()
340 |
--------------------------------------------------------------------------------
/core/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | General helper functions and classes used across the project.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import logging
33 | import time
34 | import h5py
35 | import yaml
36 | import numpy as np
37 | import pickle as pkl
38 | import pandas as pd
39 | from datetime import datetime
40 | import matplotlib.pyplot as plt
41 | from sklearn.preprocessing import label_binarize
42 | from sklearn import preprocessing, manifold
43 | import scipy.io as sio
44 |
45 | import os
46 | import json
47 | import natsort
48 | import random
49 | from multiprocessing.dummy import Pool
50 |
51 | from core import const
52 |
53 | logger = logging.getLogger(__name__)
54 |
55 | # region Load and Dump
56 |
57 | def pkl_load(path):
58 |     with open(path, 'rb') as f:
59 | data = pkl.load(f)
60 | return data
61 |
62 | def txt_load(path):
63 | with open(path, 'r') as f:
64 | lines = f.read().splitlines()
65 | lines = np.array(lines)
66 | return lines
67 |
68 | def byte_load(path):
69 | with open(path, 'rb') as f:
70 | data = f.read()
71 | return data
72 |
73 | def json_load(path):
74 | with open(path, 'r') as f:
75 | data = json.load(f)
76 |
77 | return data
78 |
79 | def yaml_load(file_path):
80 | with open(file_path, 'r') as f:
81 | data = yaml.load(f)
82 | data = AttrDict(data)
83 |
84 | data = convert_dict_to_attrdict(data)
85 | return data
86 |
87 | def h5_load(path, dataset_name='data'):
88 | h5_file = h5py.File(path, 'r')
89 | data = h5_file[dataset_name].value
90 | h5_file.close()
91 | return data
92 |
93 | def h5_load_multi(path, dataset_names):
94 | h5_file = h5py.File(path, 'r')
95 | data = [h5_file[name].value for name in dataset_names]
96 | h5_file.close()
97 | return data
98 |
99 | def txt_dump(data, path):
100 | l = len(data) - 1
101 | with open(path, 'w') as f:
102 | for i, k in enumerate(data):
103 | if i < l:
104 | k = ('%s\n' % k)
105 | else:
106 | k = ('%s' % k)
107 | f.writelines(k)
108 |
109 | def byte_dump(data, path):
110 | with open(path, 'wb') as f:
111 | f.write(data)
112 |
113 | def pkl_dump(data, path, is_highest=True):
114 |     with open(path, 'wb') as f:
115 | if not is_highest:
116 | pkl.dump(data, f)
117 | else:
118 | pkl.dump(data, f, pkl.HIGHEST_PROTOCOL)
119 |
120 | def json_dump(data, path):
121 | with open(path, 'w') as f:
122 | json.dump(data, f)
123 |
124 | def h5_dump(data, path, dataset_name='data'):
125 | h5_file = h5py.File(path, 'w')
126 | h5_file.create_dataset(dataset_name, data=data, dtype=data.dtype)
127 | h5_file.close()
128 |
129 | def h5_dump_multi(data, dataset_names, path):
130 | h5_file = h5py.File(path, 'w')
131 | n_items = len(data)
132 | for i in range(n_items):
133 | item_data = data[i]
134 | item_name = dataset_names[i]
135 | h5_file.create_dataset(item_name, data=item_data, dtype=item_data.dtype)
136 | h5_file.close()
137 |
138 | def csv_load(path, sep=',', header='infer'):
139 | df = pd.read_csv(path, sep=sep, header=header)
140 | data = df.values
141 | return data
142 |
143 | def mat_load(path, m_dict=None):
144 | """
145 | Load mat files.
146 | :param path:
147 | :return:
148 | """
149 | if m_dict is None:
150 | data = sio.loadmat(path)
151 | else:
152 | data = sio.loadmat(path, m_dict)
153 |
154 | return data
155 |
156 | # endregion
157 |
158 | # region File/Folder Names/Pathes
159 |
160 | def file_names(path, is_nat_sort=False):
161 | if not os.path.exists(path):
162 | exp_msg = 'Sorry, folder path does not exist: %s' % (path)
163 | raise Exception(exp_msg)
164 |
165 |     names = next(os.walk(path))[2]
166 |
167 | if is_nat_sort:
168 | names = natsort.natsorted(names)
169 |
170 | return names
171 |
172 | def file_pathes(path, is_nat_sort=False):
173 | if not os.path.exists(path):
174 | exp_msg = 'Sorry, folder path does not exist: %s' % (path)
175 | raise Exception(exp_msg)
176 |
177 |     names = next(os.walk(path))[2]
178 |
179 | if is_nat_sort:
180 | names = natsort.natsorted(names)
181 |
182 | pathes = ['%s/%s' % (path, n) for n in names]
183 | return pathes
184 |
185 | def folder_names(path, is_nat_sort=False):
186 | if not os.path.exists(path):
187 | exp_msg = 'Sorry, folder path does not exist: %s' % (path)
188 | raise Exception(exp_msg)
189 |
190 |     names = next(os.walk(path))[1]
191 |
192 | if is_nat_sort:
193 | names = natsort.natsorted(names)
194 |
195 | return names
196 |
197 | def folder_pathes(path, is_nat_sort=False):
198 | if not os.path.exists(path):
199 | exp_msg = 'Sorry, folder path does not exist: %s' % (path)
200 | raise Exception(exp_msg)
201 |
202 |     names = next(os.walk(path))[1]
203 |
204 | if is_nat_sort:
205 | names = natsort.natsorted(names)
206 |
207 | pathes = ['%s/%s' % (path, n) for n in names]
208 | return pathes
209 |
210 | # endregion
211 |
212 | # region Normalization
213 |
214 | def normalize_mean_std(x):
215 | mean = np.mean(x, axis=0)
216 | std = np.std(x, axis=0)
217 | x -= mean
218 | x /= std
219 | return x
220 |
221 | def normalize_mean(x):
222 | mean = np.mean(x, axis=0)
223 | x /= mean
224 | return x
225 |
226 | def normalize_sum(x):
227 | sum = np.sum(x, axis=1)
228 | x = np.array([x_i / sum_i for x_i, sum_i in zip(x, sum)])
229 | return x
230 |
231 | def normalize_l2(x):
232 | return preprocessing.normalize(x)
233 |
234 | def normalize_l1(x):
235 | return preprocessing.normalize(x, norm='l1')
236 |
237 | def normalize_range_0_to_1(x):
238 | x = np.add(x, -x.min())
239 | x = np.divide(x, x.max())
240 | return x
241 |
242 | # endregion
243 |
244 | # region Array Helpers
245 |
246 | def array_to_text(a, separator=', '):
247 | text = separator.join([str(s) for s in a])
248 | return text
249 |
250 | def get_size_in_kb(size):
251 | size /= float(1024)
252 | return size
253 |
254 | def get_size_in_mb(size):
255 | size /= float(1024 * 1024)
256 | return size
257 |
258 | def get_size_in_gb(size):
259 | size /= float(1024 * 1024 * 1024)
260 | return size
261 |
262 | def get_array_memory_size(a):
263 | if type(a) is not np.ndarray:
264 | raise Exception('Sorry, input is not numpy array!')
265 |
266 | dtype = a.dtype
267 | if dtype == np.float16:
268 | n_bytes = 2
269 | elif dtype == np.float32:
270 | n_bytes = 4
271 | else:
272 | raise Exception('Sorry, unsupported dtype:', dtype)
273 |
274 | s = a.size
275 | size = s * n_bytes
276 | return size
277 |
278 | def get_expected_memory_size(array_shape, array_dtype):
279 | dtype = array_dtype
280 | if dtype == np.float16:
281 | n_bytes = 2
282 | elif dtype == np.float32:
283 | n_bytes = 4
284 | else:
285 | raise Exception('Sorry, unsupported dtype:', dtype)
286 |
287 | s = 1
288 | for dim_size in array_shape:
289 | s *= dim_size
290 |
291 | size = s * n_bytes
292 | return size
293 |
294 | def print_array(a):
295 | for item in a:
296 | print(item)
297 |
298 | def print_array_joined(a):
299 | s = ', '.join([str(i) for i in a])
300 | print(s)
301 |
302 | # endregion
303 |
304 | # region Misc
305 |
306 | def learn_manifold(manifold_type, feats, n_components=2):
307 | if manifold_type == 'tsne':
308 | feats_fitted = manifold.TSNE(n_components=n_components, random_state=0).fit_transform(feats)
309 | elif manifold_type == 'isomap':
310 | feats_fitted = manifold.Isomap(n_components=n_components).fit_transform(feats)
311 | elif manifold_type == 'mds':
312 | feats_fitted = manifold.MDS(n_components=n_components).fit_transform(feats)
313 | elif manifold_type == 'spectral':
314 | feats_fitted = manifold.SpectralEmbedding(n_components=n_components).fit_transform(feats)
315 | else:
316 |         raise Exception('wrong manifold type!')
317 |
318 | # methods = ['standard', 'ltsa', 'hessian', 'modified']
319 | # feats_fitted = manifold.LocallyLinearEmbedding(n_components=n_components, method=methods[0]).fit_transform(pred)
320 |
321 | return feats_fitted
322 |
323 | def debinarize_label(labels):
324 | debinarized = np.array([np.where(l == 1)[0][0] for l in labels])
325 | return debinarized
326 |
327 | def timestamp():
328 | time_stamp = "{0:%y}.{0:%m}.{0:%d}-{0:%I}:{0:%M}:{0:%S}".format(datetime.now())
329 | return time_stamp
330 |
331 | def remove_extension(name):
332 | name = name[:-4]
333 | return name
334 |
335 | def get_file_extension(name):
336 | name = name.split('.')[-1]
337 | return name
338 |
339 | def print_counter(num, total, freq=None):
340 | if freq is None:
341 | logger.info('... %d/%d' % (num, total))
342 | elif num % freq == 0:
343 | logger.info('... %d/%d' % (num, total))
344 |
345 | def calc_num_batches(n_samples, batch_size):
346 | n_batch = int(n_samples / float(batch_size))
347 | n_batch = n_batch if n_samples % batch_size == 0 else n_batch + 1
348 | return n_batch
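# e.g. calc_num_batches(100, 32) returns 4: three full batches plus one partial batch.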
349 |
350 | def convert_dict_to_attrdict(d):
351 |     for k, v in d.items():
352 | if isinstance(v, dict):
353 | v = convert_dict_to_attrdict(v)
354 | d[k] = v
355 |
356 | if isinstance(d, dict):
357 | d = AttrDict(d)
358 |
359 | return d
360 |
361 | def get_model_feat_maps_info(model_type, feature_type):
362 | """
363 | Get feature map details according to model type and feature type.
364 | :param model_type:
365 | :param feature_type:
366 | :return:
367 | """
368 |
369 | if model_type in ['vgg', 'vgg_charades_rgb']:
370 | if feature_type == 'pool5':
371 | return 512, 7, 7
372 | elif feature_type == 'conv5_3':
373 | return 512, 14, 14
374 | else:
375 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type))
376 | elif model_type in ['resnet152', 'resnet152_charades_rgb']:
377 | if feature_type == 'res4b35':
378 | return 1024, 14, 14
379 | elif feature_type == 'res5c':
380 | return 2048, 7, 7
381 | elif feature_type == 'pool5':
382 | return 2048, 1, 1
383 | else:
384 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type))
385 | elif model_type in ['i3d_rgb', 'i3d_pytorch_charades_rgb', 'i3d_kinetics_keras', 'i3d_keras_kinetics_rgb']:
386 | if feature_type == 'mixed_5c':
387 | return 1024, 7, 7
388 | elif feature_type == 'mixed_4f':
389 | return 832, 7, 7
390 | else:
391 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type))
392 | elif model_type in ['i3d_resnet_50_kinetics_rgb', 'i3d_resnet_101_kinetics_rgb']:
393 | if feature_type == 'pool5':
394 | return 2048, 7, 7
395 | else:
396 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type))
397 | elif model_type in ['i3d_resnet101_charades_rgb']:
398 | if feature_type == 'res5_2':
399 | return 2048, 7, 7
400 | else:
401 | raise Exception('Sorry, unsupported feature type: %s' % (feature_type))
402 | else:
403 | raise Exception('Sorry, unsupported model type: %s' % (model_type))
404 |
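# Illustrative usage: feature maps of the I3D backbone at 'mixed_5c' are 1024 channels of 7x7.
# >>> get_model_feat_maps_info('i3d_rgb', 'mixed_5c')
# (1024, 7, 7)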
405 | # endregion
406 |
407 | # region Classes
408 |
409 | class Path(str):
410 |     def __new__(cls, relative_path, args=None, root_type=const.ROOT_PATH_TYPES[0]):
411 |         assert root_type in const.ROOT_PATH_TYPES
412 |         root_types = list(const.ROOT_PATH_TYPES)
413 |         idx_root_type = root_types.index(root_type)
414 | 
415 |         root_paths = [const.DATA_ROOT_PATH, const.PROJECT_ROOT_PATH]
416 |         root_path = root_paths[idx_root_type]
417 | 
418 |         relative_path = relative_path % args if args is not None else relative_path
419 |         path = os.path.join(root_path, relative_path)
420 | 
421 |         # construct a proper str instance; returning a plain attribute here would bypass the str subclass
422 |         return str.__new__(cls, path)
423 | 
424 |     def __str__(self):
425 |         return str.__str__(self)
426 | 
427 |     def __repr__(self):
428 |         return str.__repr__(self)
429 |
430 | class DurationTimer(object):
431 | def __init__(self):
432 | self.start_time = time.time()
433 |
434 |     def duration(self, is_string=True):
435 |         stop_time = time.time()
436 |         duration = stop_time - self.start_time
437 |         if is_string:
438 |             duration = self.format_duration(duration)
439 |         return duration
440 |
441 | def format_duration(self, duration):
442 | if duration < 60:
443 | return str(duration) + " sec"
444 | elif duration < (60 * 60):
445 | return str(duration / 60) + " min"
446 | else:
447 | return str(duration / (60 * 60)) + " hr"
448 |
449 | class AttrDict(dict):
450 | """
451 | Subclass dict and define getter-setter. This behaves as both dict and obj.
452 | """
453 |
454 | def __getattr__(self, key):
455 | return self[key]
456 |
457 | def __setattr__(self, key, value):
458 | if key in self.__dict__:
459 | self.__dict__[key] = value
460 | else:
461 | self[key] = value
462 |
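# Illustrative sketch of AttrDict together with convert_dict_to_attrdict above; the nested keys are made up
# here, but any parsed yaml config dict behaves the same way:
# >>> cfg = convert_dict_to_attrdict({'TRAIN': {'BATCH_SIZE': 32}})
# >>> cfg.TRAIN.BATCH_SIZE
# 32
# >>> cfg['TRAIN']['BATCH_SIZE']
# 32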
463 | # endregion
464 |
--------------------------------------------------------------------------------
/nets/timeception_pytorch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Definition of Timeception as a pytorch model.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import logging
33 |
34 | import torch
35 | import torch.nn
36 | import torchvision
37 | import torchviz
38 | import torchsummary
39 |
40 | from torch.nn import Module, Conv3d, BatchNorm3d, MaxPool3d, ReLU
41 | from torch.nn import functional as F
42 |
43 | from nets.layers_pytorch import ChannelShuffleLayer, DepthwiseConv1DLayer
44 |
45 | # region Timeception as Module
46 |
47 | class Timeception(Module):
48 | """
49 |     Timeception is defined as a pytorch module.
50 | """
51 |
52 | def __init__(self, input_shape, n_layers=4, n_groups=8, is_dilated=True):
53 |
54 | super(Timeception, self).__init__()
55 |
56 | # TODO: Add support for multi-scale using dilation rates
57 |         # currently, for pytorch, multi-scale is supported only via kernel sizes, so dilation is forced off
58 | is_dilated = False
59 |
60 | expansion_factor = 1.25
61 | self.expansion_factor = expansion_factor
62 | self.n_layers = n_layers
63 | self.is_dilated = is_dilated
64 | self.n_groups = n_groups
65 | self.n_channels_out = None
66 |
67 | # convert it as a list
68 | input_shape = list(input_shape)
69 |
70 | # define timeception layers
71 | n_channels_out = self.__define_timeception_layers(input_shape, n_layers, n_groups, expansion_factor, is_dilated)
72 |
73 | # set the output channels
74 | self.n_channels_out = n_channels_out
75 |
76 | def forward(self, input):
77 |
78 | n_layers = self.n_layers
79 | n_groups = self.n_groups
80 | expansion_factor = self.expansion_factor
81 |
82 | output = self.__call_timeception_layers(input, n_layers, n_groups, expansion_factor)
83 |
84 | return output
85 |
86 | def __define_timeception_layers(self, input_shape, n_layers, n_groups, expansion_factor, is_dilated):
87 | """
88 | Define layers inside the timeception layers.
89 | """
90 |
91 | n_channels_in = input_shape[1]
92 |
93 | # how many layers of timeception
94 | for i in range(n_layers):
95 | layer_num = i + 1
96 |
97 | # get details about grouping
98 | n_channels_per_branch, n_channels_out = self.__get_n_channels_per_branch(n_groups, expansion_factor, n_channels_in)
99 |
100 | # temporal conv per group
101 | self.__define_grouped_convolutions(input_shape, n_groups, n_channels_per_branch, is_dilated, layer_num)
102 |
103 | # downsample over time
104 | layer_name = 'maxpool_tc%d' % (layer_num)
105 | layer = MaxPool3d(kernel_size=(2, 1, 1))
106 | layer._name = layer_name
107 | setattr(self, layer_name, layer)
108 |
109 | n_channels_in = n_channels_out
110 | input_shape[1] = n_channels_in
111 |
112 | return n_channels_in
113 |
114 | def __define_grouped_convolutions(self, input_shape, n_groups, n_channels_per_branch, is_dilated, layer_num):
115 | """
116 | Define layers inside grouped convolutional block.
117 | """
118 |
119 | n_channels_in = input_shape[1]
120 |
121 | n_branches = 5
122 | n_channels_per_group_in = int(n_channels_in / n_groups)
123 | n_channels_out = int(n_groups * n_branches * n_channels_per_branch)
124 | n_channels_per_group_out = int(n_channels_out / n_groups)
125 |
126 | assert n_channels_in % n_groups == 0
127 | assert n_channels_out % n_groups == 0
128 |
129 | # type of multi-scale kernels to use: either multi_kernel_sizes or multi_dilation_rates
130 | if is_dilated:
131 | kernel_sizes = (3, 3, 3)
132 | dilation_rates = (1, 2, 3)
133 | else:
134 | kernel_sizes = (3, 5, 7)
135 | dilation_rates = (1, 1, 1)
136 |
137 | input_shape_per_group = list(input_shape)
138 | input_shape_per_group[1] = n_channels_per_group_in
139 |
140 | # loop on groups, and define convolutions in each group
141 | for idx_group in range(n_groups):
142 | group_num = idx_group + 1
143 | self.__define_temporal_convolutional_block(input_shape_per_group, n_channels_per_branch, kernel_sizes, dilation_rates, layer_num, group_num)
144 |
145 | # activation
146 | layer_name = 'relu_tc%d' % (layer_num)
147 | layer = ReLU()
148 | layer._name = layer_name
149 | setattr(self, layer_name, layer)
150 |
151 | # shuffle channels
152 | layer_name = 'shuffle_tc%d' % (layer_num)
153 | layer = ChannelShuffleLayer(n_channels_out, n_groups)
154 | layer._name = layer_name
155 | setattr(self, layer_name, layer)
156 |
157 | def __define_temporal_convolutional_block(self, input_shape, n_channels_per_branch_out, kernel_sizes, dilation_rates, layer_num, group_num):
158 | """
159 |         Define 5 branches of convolutions that operate on the channels of each group.
160 | """
161 |
162 | n_channels_in = input_shape[1]
163 |
164 | dw_input_shape = list(input_shape)
165 | dw_input_shape[1] = n_channels_per_branch_out
166 |
167 | # branch 1: dimension reduction only and no temporal conv
168 | layer_name = 'conv_b1_g%d_tc%d' % (group_num, layer_num)
169 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1))
170 | layer._name = layer_name
171 | setattr(self, layer_name, layer)
172 | layer_name = 'bn_b1_g%d_tc%d' % (group_num, layer_num)
173 | layer = BatchNorm3d(n_channels_per_branch_out)
174 | layer._name = layer_name
175 | setattr(self, layer_name, layer)
176 |
177 | # branch 2: dimension reduction followed by depth-wise temp conv (kernel-size 3)
178 | layer_name = 'conv_b2_g%d_tc%d' % (group_num, layer_num)
179 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1))
180 | layer._name = layer_name
181 | setattr(self, layer_name, layer)
182 | layer_name = 'convdw_b2_g%d_tc%d' % (group_num, layer_num)
183 | layer = DepthwiseConv1DLayer(dw_input_shape, kernel_sizes[0], dilation_rates[0], layer_name)
184 | setattr(self, layer_name, layer)
185 | layer_name = 'bn_b2_g%d_tc%d' % (group_num, layer_num)
186 | layer = BatchNorm3d(n_channels_per_branch_out)
187 | layer._name = layer_name
188 | setattr(self, layer_name, layer)
189 |
190 | # branch 3: dimension reduction followed by depth-wise temp conv (kernel-size 5)
191 | layer_name = 'conv_b3_g%d_tc%d' % (group_num, layer_num)
192 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1))
193 | layer._name = layer_name
194 | setattr(self, layer_name, layer)
195 | layer_name = 'convdw_b3_g%d_tc%d' % (group_num, layer_num)
196 | layer = DepthwiseConv1DLayer(dw_input_shape, kernel_sizes[1], dilation_rates[1], layer_name)
197 | setattr(self, layer_name, layer)
198 | layer_name = 'bn_b3_g%d_tc%d' % (group_num, layer_num)
199 | layer = BatchNorm3d(n_channels_per_branch_out)
200 | layer._name = layer_name
201 | setattr(self, layer_name, layer)
202 |
203 | # branch 4: dimension reduction followed by depth-wise temp conv (kernel-size 7)
204 | layer_name = 'conv_b4_g%d_tc%d' % (group_num, layer_num)
205 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1))
206 | layer._name = layer_name
207 | setattr(self, layer_name, layer)
208 | layer_name = 'convdw_b4_g%d_tc%d' % (group_num, layer_num)
209 | layer = DepthwiseConv1DLayer(dw_input_shape, kernel_sizes[2], dilation_rates[2], layer_name)
210 | setattr(self, layer_name, layer)
211 | layer_name = 'bn_b4_g%d_tc%d' % (group_num, layer_num)
212 | layer = BatchNorm3d(n_channels_per_branch_out)
213 | layer._name = layer_name
214 | setattr(self, layer_name, layer)
215 |
216 | # branch 5: dimension reduction followed by temporal max pooling
217 | layer_name = 'conv_b5_g%d_tc%d' % (group_num, layer_num)
218 | layer = Conv3d(n_channels_in, n_channels_per_branch_out, kernel_size=(1, 1, 1))
219 | layer._name = layer_name
220 | setattr(self, layer_name, layer)
221 | layer_name = 'maxpool_b5_g%d_tc%d' % (group_num, layer_num)
222 | layer = MaxPool3d(kernel_size=(2, 1, 1), stride=(1, 1, 1))
223 | layer._name = layer_name
224 | setattr(self, layer_name, layer)
225 | layer_name = 'padding_b5_g%d_tc%d' % (group_num, layer_num)
226 | layer = torch.nn.ReplicationPad3d((0, 0, 0, 0, 1, 0)) # left, right, top, bottom, front, back
227 | layer._name = layer_name
228 | setattr(self, layer_name, layer)
229 | layer_name = 'bn_b5_g%d_tc%d' % (group_num, layer_num)
230 | layer = BatchNorm3d(n_channels_per_branch_out)
231 | layer._name = layer_name
232 | setattr(self, layer_name, layer)
233 |
234 | def __call_timeception_layers(self, tensor, n_layers, n_groups, expansion_factor):
235 | input_shape = tensor.size()
236 | n_channels_in = input_shape[1]
237 |
238 | # how many layers of timeception
239 | for i in range(n_layers):
240 | layer_num = i + 1
241 |
242 | # get details about grouping
243 | n_channels_per_branch, n_channels_out = self.__get_n_channels_per_branch(n_groups, expansion_factor, n_channels_in)
244 |
245 | # temporal conv per group
246 | tensor = self.__call_grouped_convolutions(tensor, n_groups, layer_num)
247 |
248 | # downsample over time
249 | tensor = getattr(self, 'maxpool_tc%d' % (layer_num))(tensor)
250 | n_channels_in = n_channels_out
251 |
252 | return tensor
253 |
254 | def __call_grouped_convolutions(self, tensor_input, n_groups, layer_num):
255 |
256 | n_channels_in = tensor_input.size()[1]
257 | n_channels_per_group_in = int(n_channels_in / n_groups)
258 |
259 | # loop on groups
260 | t_outputs = []
261 | for idx_group in range(n_groups):
262 | group_num = idx_group + 1
263 |
264 | # slice maps to get maps per group
265 | idx_start = idx_group * n_channels_per_group_in
266 | idx_end = (idx_group + 1) * n_channels_per_group_in
267 | tensor = tensor_input[:, idx_start:idx_end]
268 |
269 | tensor = self.__call_temporal_convolutional_block(tensor, layer_num, group_num)
270 | t_outputs.append(tensor)
271 |
272 | # concatenate channels of groups
273 | tensor = torch.cat(t_outputs, dim=1)
274 | # activation
275 | tensor = getattr(self, 'relu_tc%d' % (layer_num))(tensor)
276 | # shuffle channels
277 | tensor = getattr(self, 'shuffle_tc%d' % (layer_num))(tensor)
278 |
279 | return tensor
280 |
281 | def __call_temporal_convolutional_block(self, tensor, layer_num, group_num):
282 | """
283 |         Feedforward for 5 branches of convolutions that operate on the channels of each group.
284 | """
285 |
286 | # branch 1: dimension reduction only and no temporal conv
287 | t_1 = getattr(self, 'conv_b1_g%d_tc%d' % (group_num, layer_num))(tensor)
288 | t_1 = getattr(self, 'bn_b1_g%d_tc%d' % (group_num, layer_num))(t_1)
289 |
290 | # branch 2: dimension reduction followed by depth-wise temp conv (kernel-size 3)
291 | t_2 = getattr(self, 'conv_b2_g%d_tc%d' % (group_num, layer_num))(tensor)
292 | t_2 = getattr(self, 'convdw_b2_g%d_tc%d' % (group_num, layer_num))(t_2)
293 | t_2 = getattr(self, 'bn_b2_g%d_tc%d' % (group_num, layer_num))(t_2)
294 |
295 | # branch 3: dimension reduction followed by depth-wise temp conv (kernel-size 5)
296 | t_3 = getattr(self, 'conv_b3_g%d_tc%d' % (group_num, layer_num))(tensor)
297 | t_3 = getattr(self, 'convdw_b3_g%d_tc%d' % (group_num, layer_num))(t_3)
298 | t_3 = getattr(self, 'bn_b3_g%d_tc%d' % (group_num, layer_num))(t_3)
299 |
300 | # branch 4: dimension reduction followed by depth-wise temp conv (kernel-size 7)
301 | t_4 = getattr(self, 'conv_b4_g%d_tc%d' % (group_num, layer_num))(tensor)
302 | t_4 = getattr(self, 'convdw_b4_g%d_tc%d' % (group_num, layer_num))(t_4)
303 | t_4 = getattr(self, 'bn_b4_g%d_tc%d' % (group_num, layer_num))(t_4)
304 |
305 | # branch 5: dimension reduction followed by temporal max pooling
306 | t_5 = getattr(self, 'conv_b5_g%d_tc%d' % (group_num, layer_num))(tensor)
307 | t_5 = getattr(self, 'maxpool_b5_g%d_tc%d' % (group_num, layer_num))(t_5)
308 | t_5 = getattr(self, 'padding_b5_g%d_tc%d' % (group_num, layer_num))(t_5)
309 | t_5 = getattr(self, 'bn_b5_g%d_tc%d' % (group_num, layer_num))(t_5)
310 |
311 | # concatenate channels of branches
312 | tensors = (t_1, t_2, t_3, t_4, t_5)
313 | tensor = torch.cat(tensors, dim=1)
314 |
315 | return tensor
316 |
317 | def __get_n_channels_per_branch(self, n_groups, expansion_factor, n_channels_in):
318 | n_branches = 5
319 | n_channels_per_branch = int(n_channels_in * expansion_factor / float(n_branches * n_groups))
320 | n_channels_per_branch = int(n_channels_per_branch)
321 | n_channels_out = int(n_channels_per_branch * n_groups * n_branches)
322 | n_channels_out = int(n_channels_out)
323 |
324 | return n_channels_per_branch, n_channels_out
325 |
326 | # endregion
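# Minimal usage sketch (shapes are illustrative). Per __get_n_channels_per_branch, 1024 input channels with
# n_groups=8 and expansion_factor=1.25 give int(1024 * 1.25 / (5 * 8)) = 32 channels per branch, i.e.
# 32 * 5 * 8 = 1280 output channels after the first layer, and each layer halves the temporal dimension.
# >>> x = torch.randn(2, 1024, 32, 7, 7)                    # (batch, channels, time, height, width)
# >>> tc = Timeception(input_shape=x.size(), n_layers=4, n_groups=8)
# >>> y = tc(x)                                              # -> (2, tc.n_channels_out, 2, 7, 7)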
327 |
--------------------------------------------------------------------------------
/nets/i3d_torch_charades.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 |
6 | import numpy as np
7 |
8 | import os
9 | import sys
10 | from collections import OrderedDict
11 |
12 | class MaxPool3dSamePadding(nn.MaxPool3d):
13 |
14 | def compute_pad(self, dim, s):
15 | if s % self.stride[dim] == 0:
16 | return max(self.kernel_size[dim] - self.stride[dim], 0)
17 | else:
18 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0)
19 |
20 | def forward(self, x):
21 | # compute 'same' padding
22 | (batch, channel, t, h, w) = x.size()
23 | # print t,h,w
24 | out_t = np.ceil(float(t) / float(self.stride[0]))
25 | out_h = np.ceil(float(h) / float(self.stride[1]))
26 | out_w = np.ceil(float(w) / float(self.stride[2]))
27 | # print out_t, out_h, out_w
28 | pad_t = self.compute_pad(0, t)
29 | pad_h = self.compute_pad(1, h)
30 | pad_w = self.compute_pad(2, w)
31 | # print pad_t, pad_h, pad_w
32 |
33 | pad_t_f = pad_t // 2
34 | pad_t_b = pad_t - pad_t_f
35 | pad_h_f = pad_h // 2
36 | pad_h_b = pad_h - pad_h_f
37 | pad_w_f = pad_w // 2
38 | pad_w_b = pad_w - pad_w_f
39 |
40 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
41 | # print x.size()
42 | # print pad
43 | x = F.pad(x, pad)
44 | return super(MaxPool3dSamePadding, self).forward(x)
45 |
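# Worked example of the 'same' padding above: for MaxPool3d_4a_3x3 (kernel 3, stride 2) on an input with
# t = 7 and h = w = 15, compute_pad gives max(3 - 7 % 2, 0) = 2 along time and max(3 - 15 % 2, 0) = 2 along
# height/width, split as 1 front / 1 back, so the pooled output is ceil(7/2) x ceil(15/2) x ceil(15/2) = 4 x 8 x 8.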
46 | class Unit3D(nn.Module):
47 |
48 | def __init__(self, in_channels,
49 | output_channels,
50 | kernel_shape=(1, 1, 1),
51 | stride=(1, 1, 1),
52 | padding=0,
53 | activation_fn=F.relu,
54 | use_batch_norm=True,
55 | use_bias=False,
56 | name='unit_3d'):
57 |
58 | """Initializes Unit3D module."""
59 | super(Unit3D, self).__init__()
60 |
61 | self._output_channels = output_channels
62 | self._kernel_shape = kernel_shape
63 | self._stride = stride
64 | self._use_batch_norm = use_batch_norm
65 | self._activation_fn = activation_fn
66 | self._use_bias = use_bias
67 | self.name = name
68 | self.padding = padding
69 |
70 | self.conv3d = nn.Conv3d(in_channels=in_channels,
71 | out_channels=self._output_channels,
72 | kernel_size=self._kernel_shape,
73 | stride=self._stride,
74 | padding=0, # we always want padding to be 0 here. We will dynamically pad based on input size in forward function
75 | bias=self._use_bias)
76 |
77 | if self._use_batch_norm:
78 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)
79 |
80 | def compute_pad(self, dim, s):
81 | if s % self._stride[dim] == 0:
82 | return max(self._kernel_shape[dim] - self._stride[dim], 0)
83 | else:
84 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0)
85 |
86 | def forward(self, x):
87 | # compute 'same' padding
88 | (batch, channel, t, h, w) = x.size()
89 | # print t,h,w
90 | out_t = np.ceil(float(t) / float(self._stride[0]))
91 | out_h = np.ceil(float(h) / float(self._stride[1]))
92 | out_w = np.ceil(float(w) / float(self._stride[2]))
93 | # print out_t, out_h, out_w
94 | pad_t = self.compute_pad(0, t)
95 | pad_h = self.compute_pad(1, h)
96 | pad_w = self.compute_pad(2, w)
97 | # print pad_t, pad_h, pad_w
98 |
99 | pad_t_f = pad_t // 2
100 | pad_t_b = pad_t - pad_t_f
101 | pad_h_f = pad_h // 2
102 | pad_h_b = pad_h - pad_h_f
103 | pad_w_f = pad_w // 2
104 | pad_w_b = pad_w - pad_w_f
105 |
106 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
107 | # print x.size()
108 | # print pad
109 | x = F.pad(x, pad)
110 | # print x.size()
111 |
112 | x = self.conv3d(x)
113 | if self._use_batch_norm:
114 | x = self.bn(x)
115 | if self._activation_fn is not None:
116 | x = self._activation_fn(x)
117 | return x
118 |
119 | class InceptionModule(nn.Module):
120 | def __init__(self, in_channels, out_channels, name):
121 | super(InceptionModule, self).__init__()
122 |
123 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0,
124 | name=name + '/Branch_0/Conv3d_0a_1x1')
125 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0,
126 | name=name + '/Branch_1/Conv3d_0a_1x1')
127 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3],
128 | name=name + '/Branch_1/Conv3d_0b_3x3')
129 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0,
130 | name=name + '/Branch_2/Conv3d_0a_1x1')
131 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3],
132 | name=name + '/Branch_2/Conv3d_0b_3x3')
133 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
134 | stride=(1, 1, 1), padding=0)
135 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0,
136 | name=name + '/Branch_3/Conv3d_0b_1x1')
137 | self.name = name
138 |
139 | def forward(self, x):
140 | b0 = self.b0(x)
141 | b1 = self.b1b(self.b1a(x))
142 | b2 = self.b2b(self.b2a(x))
143 | b3 = self.b3b(self.b3a(x))
144 | return torch.cat([b0, b1, b2, b3], dim=1)
145 |
146 | class InceptionI3d(nn.Module):
147 | """Inception-v1 I3D architecture.
148 | The model is introduced in:
149 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
150 | Joao Carreira, Andrew Zisserman
151 | https://arxiv.org/pdf/1705.07750v1.pdf.
152 | See also the Inception architecture, introduced in:
153 | Going deeper with convolutions
154 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
155 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
156 | http://arxiv.org/pdf/1409.4842v1.pdf.
157 | """
158 |
159 | # Endpoints of the model in order. During construction, all the endpoints up
160 | # to a designated `final_endpoint` are returned in a dictionary as the
161 | # second return value.
162 | VALID_ENDPOINTS = (
163 | 'Conv3d_1a_7x7',
164 | 'MaxPool3d_2a_3x3',
165 | 'Conv3d_2b_1x1',
166 | 'Conv3d_2c_3x3',
167 | 'MaxPool3d_3a_3x3',
168 | 'Mixed_3b',
169 | 'Mixed_3c',
170 | 'MaxPool3d_4a_3x3',
171 | 'Mixed_4b',
172 | 'Mixed_4c',
173 | 'Mixed_4d',
174 | 'Mixed_4e',
175 | 'Mixed_4f',
176 | 'MaxPool3d_5a_2x2',
177 | 'Mixed_5b',
178 | 'Mixed_5c',
179 | 'Logits',
180 | 'Predictions',
181 | )
182 |
183 | def __init__(self, num_classes=400, spatial_squeeze=True, final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5):
184 | """Initializes I3D model instance.
185 | Args:
186 | num_classes: The number of outputs in the logit layer (default 400, which
187 | matches the Kinetics dataset).
188 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
189 | before returning (default True).
190 | final_endpoint: The model contains many possible endpoints.
191 | `final_endpoint` specifies the last endpoint for the model to be built
192 | up to. In addition to the output at `final_endpoint`, all the outputs
193 | at endpoints up to `final_endpoint` will also be returned, in a
194 | dictionary. `final_endpoint` must be one of
195 | InceptionI3d.VALID_ENDPOINTS (default 'Logits').
196 | name: A string (optional). The name of this module.
197 | Raises:
198 | ValueError: if `final_endpoint` is not recognized.
199 | """
200 |
201 | if final_endpoint not in self.VALID_ENDPOINTS:
202 | raise ValueError('Unknown final endpoint %s' % final_endpoint)
203 |
204 | super(InceptionI3d, self).__init__()
205 | self._num_classes = num_classes
206 | self._spatial_squeeze = spatial_squeeze
207 | self._final_endpoint = final_endpoint
208 | self.logits = None
209 |
210 | if self._final_endpoint not in self.VALID_ENDPOINTS:
211 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint)
212 |
213 | self.end_points = {}
214 | end_point = 'Conv3d_1a_7x7'
215 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], stride=(2, 2, 2), padding=(3, 3, 3), name=name + end_point)
216 | if self._final_endpoint == end_point: return
217 |
218 | end_point = 'MaxPool3d_2a_3x3'
219 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2),
220 | padding=0)
221 | if self._final_endpoint == end_point: return
222 |
223 | end_point = 'Conv3d_2b_1x1'
224 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, name=name + end_point)
225 | if self._final_endpoint == end_point: return
226 |
227 | end_point = 'Conv3d_2c_3x3'
228 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, name=name + end_point)
229 | if self._final_endpoint == end_point: return
230 |
231 | end_point = 'MaxPool3d_3a_3x3'
232 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0)
233 | if self._final_endpoint == end_point: return
234 |
235 | end_point = 'Mixed_3b'
236 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point)
237 | if self._final_endpoint == end_point: return
238 |
239 | end_point = 'Mixed_3c'
240 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point)
241 | if self._final_endpoint == end_point: return
242 |
243 | end_point = 'MaxPool3d_4a_3x3'
244 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2),
245 | padding=0)
246 | if self._final_endpoint == end_point: return
247 |
248 | end_point = 'Mixed_4b'
249 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point)
250 | if self._final_endpoint == end_point: return
251 |
252 | end_point = 'Mixed_4c'
253 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point)
254 | if self._final_endpoint == end_point: return
255 |
256 | end_point = 'Mixed_4d'
257 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point)
258 | if self._final_endpoint == end_point: return
259 |
260 | end_point = 'Mixed_4e'
261 | self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point)
262 | if self._final_endpoint == end_point: return
263 |
264 | end_point = 'Mixed_4f'
265 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point)
266 | if self._final_endpoint == end_point: return
267 |
268 | end_point = 'MaxPool3d_5a_2x2'
269 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0)
270 | if self._final_endpoint == end_point: return
271 |
272 | end_point = 'Mixed_5b'
273 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], name + end_point)
274 | if self._final_endpoint == end_point: return
275 |
276 | end_point = 'Mixed_5c'
277 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point)
278 | if self._final_endpoint == end_point: return
279 |
280 | end_point = 'Logits'
281 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1))
282 | self.dropout = nn.Dropout(dropout_keep_prob)
283 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits')
284 |
285 | self.build()
286 |
287 | def replace_logits(self, num_classes):
288 | self._num_classes = num_classes
289 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits')
290 |
291 | def build(self):
292 | for k in self.end_points.keys():
293 | self.add_module(k, self.end_points[k])
294 |
295 | # def forward(self, x):
296 | # for end_point in self.VALID_ENDPOINTS:
297 | # if end_point in self.end_points:
298 | # x = self._modules[end_point](x) # use _modules to work with dataparallel
299 | #
300 | # x = self.logits(self.dropout(self.avg_pool(x)))
301 | # if self._spatial_squeeze:
302 | # logits = x.squeeze(3).squeeze(3)
303 | # # logits is batch X time X classes, which is what we want to work with
304 | # return logits
305 |
306 | def forward(self, x):
307 | for end_point in self.VALID_ENDPOINTS:
308 | if end_point in self.end_points:
309 |                 x = self.end_points[end_point](x)  # apply each end-point module in order (extract_features below uses _modules for nn.DataParallel)
310 | return x
311 |
312 | # for end_point in self.VALID_ENDPOINTS:
313 | # if end_point in self.end_points:
314 | # x = self._modules[end_point](x) # use _modules to work with dataparallel
315 | # return x
316 |
317 | def extract_features(self, x):
318 | for end_point in self.VALID_ENDPOINTS:
319 | if end_point in self.end_points:
320 | x = self._modules[end_point](x)
321 | return self.avg_pool(x)
322 |
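# Minimal usage sketch (illustrative shapes; 157 is the number of Charades action classes). With the default
# final_endpoint='Logits', forward() still returns the 'Mixed_5c' feature maps, since the logits head is kept
# outside self.end_points:
# >>> net = InceptionI3d(num_classes=157, in_channels=3)
# >>> clip = torch.randn(1, 3, 64, 224, 224)   # (batch, rgb, frames, height, width)
# >>> feats = net(clip)                         # -> (1, 1024, 8, 7, 7) 'Mixed_5c' feature maps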
--------------------------------------------------------------------------------
/nets/i3d_torch_charades_test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 |
6 | import numpy as np
7 |
8 | import os
9 | import sys
10 | from collections import OrderedDict
11 |
12 | class MaxPool3dSamePadding(nn.MaxPool3d):
13 |
14 | def compute_pad(self, dim, s):
15 | if s % self.stride[dim] == 0:
16 | return max(self.kernel_size[dim] - self.stride[dim], 0)
17 | else:
18 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0)
19 |
20 | def forward(self, x):
21 | # compute 'same' padding
22 | (batch, channel, t, h, w) = x.size()
23 | # print t,h,w
24 | out_t = np.ceil(float(t) / float(self.stride[0]))
25 | out_h = np.ceil(float(h) / float(self.stride[1]))
26 | out_w = np.ceil(float(w) / float(self.stride[2]))
27 | # print out_t, out_h, out_w
28 | pad_t = self.compute_pad(0, t)
29 | pad_h = self.compute_pad(1, h)
30 | pad_w = self.compute_pad(2, w)
31 | # print pad_t, pad_h, pad_w
32 |
33 | pad_t_f = pad_t // 2
34 | pad_t_b = pad_t - pad_t_f
35 | pad_h_f = pad_h // 2
36 | pad_h_b = pad_h - pad_h_f
37 | pad_w_f = pad_w // 2
38 | pad_w_b = pad_w - pad_w_f
39 |
40 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
41 | # print x.size()
42 | # print pad
43 | x = F.pad(x, pad)
44 | return super(MaxPool3dSamePadding, self).forward(x)
45 |
46 | class Unit3D(nn.Module):
47 |
48 | def __init__(self, in_channels, output_channels, kernel_shape=(1, 1, 1), stride=(1, 1, 1), padding=0, activation_fn=F.relu, use_batch_norm=True, use_bias=False, name='unit_3d'):
49 |
50 | """Initializes Unit3D module."""
51 | super(Unit3D, self).__init__()
52 |
53 | self._output_channels = output_channels
54 | self._kernel_shape = kernel_shape
55 | self._stride = stride
56 | self._use_batch_norm = use_batch_norm
57 | self._activation_fn = activation_fn
58 | self._use_bias = use_bias
59 | self.name = name
60 | self.padding = padding
61 |
62 | # we always want padding to be 0 here. We will dynamically pad based on input size in forward function
63 | self.conv3d = nn.Conv3d(in_channels=in_channels, out_channels=self._output_channels, kernel_size=self._kernel_shape, stride=self._stride, padding=0, bias=self._use_bias)
64 |
65 | if self._use_batch_norm:
66 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)
67 |
68 | def compute_pad(self, dim, s):
69 | if s % self._stride[dim] == 0:
70 | return max(self._kernel_shape[dim] - self._stride[dim], 0)
71 | else:
72 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0)
73 |
74 | def forward(self, x):
75 | # compute 'same' padding
76 | (batch, channel, t, h, w) = x.size()
77 | # print t,h,w
78 | # out_t = np.ceil(float(t) / float(self._stride[0]))
79 | # out_h = np.ceil(float(h) / float(self._stride[1]))
80 | # out_w = np.ceil(float(w) / float(self._stride[2]))
81 | # print out_t, out_h, out_w
82 | pad_t = self.compute_pad(0, t)
83 | pad_h = self.compute_pad(1, h)
84 | pad_w = self.compute_pad(2, w)
85 | # print pad_t, pad_h, pad_w
86 |
87 | pad_t_f = pad_t // 2
88 | pad_t_b = pad_t - pad_t_f
89 | pad_h_f = pad_h // 2
90 | pad_h_b = pad_h - pad_h_f
91 | pad_w_f = pad_w // 2
92 | pad_w_b = pad_w - pad_w_f
93 |
94 | pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
95 | # print x.size()
96 | # print pad
97 | x = F.pad(x, pad)
98 | # print x.size()
99 |
100 | x = self.conv3d(x)
101 | if self._use_batch_norm:
102 | x = self.bn(x)
103 | if self._activation_fn is not None:
104 | x = self._activation_fn(x)
105 | return x
106 |
107 | class InceptionModule(nn.Module):
108 | def __init__(self, in_channels, out_channels, name):
109 | super(InceptionModule, self).__init__()
110 |
111 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_0/Conv3d_0a_1x1')
112 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_1/Conv3d_0a_1x1')
113 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], name=name + '/Branch_1/Conv3d_0b_3x3')
114 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_2/Conv3d_0a_1x1')
115 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], name=name + '/Branch_2/Conv3d_0b_3x3')
116 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(1, 1, 1), padding=0)
117 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, name=name + '/Branch_3/Conv3d_0b_1x1')
118 | self.name = name
119 |
120 | def forward(self, x):
121 | b0 = self.b0(x)
122 | b1 = self.b1b(self.b1a(x))
123 | b2 = self.b2b(self.b2a(x))
124 | b3 = self.b3b(self.b3a(x))
125 | return torch.cat([b0, b1, b2, b3], dim=1)
126 |
127 | class InceptionI3d(nn.Module):
128 | """Inception-v1 I3D architecture.
129 | The model is introduced in:
130 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
131 | Joao Carreira, Andrew Zisserman
132 | https://arxiv.org/pdf/1705.07750v1.pdf.
133 | See also the Inception architecture, introduced in:
134 | Going deeper with convolutions
135 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
136 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
137 | http://arxiv.org/pdf/1409.4842v1.pdf.
138 | """
139 |
140 | # Endpoints of the model in order. During construction, all the endpoints up
141 | # to a designated `final_endpoint` are returned in a dictionary as the
142 | # second return value.
143 | VALID_ENDPOINTS = (
144 | 'Conv3d_1a_7x7',
145 | 'MaxPool3d_2a_3x3',
146 | 'Conv3d_2b_1x1',
147 | 'Conv3d_2c_3x3',
148 | 'MaxPool3d_3a_3x3',
149 | 'Mixed_3b',
150 | 'Mixed_3c',
151 | 'MaxPool3d_4a_3x3',
152 | 'Mixed_4b',
153 | 'Mixed_4c',
154 | 'Mixed_4d',
155 | 'Mixed_4e',
156 | 'Mixed_4f',
157 | 'MaxPool3d_5a_2x2',
158 | 'Mixed_5b',
159 | 'Mixed_5c',
160 | 'Logits',
161 | 'Predictions',
162 | )
163 |
164 | def __init__(self, num_classes=400, spatial_squeeze=True, final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5):
165 | """Initializes I3D model instance.
166 | Args:
167 | num_classes: The number of outputs in the logit layer (default 400, which
168 | matches the Kinetics dataset).
169 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
170 | before returning (default True).
171 | final_endpoint: The model contains many possible endpoints.
172 | `final_endpoint` specifies the last endpoint for the model to be built
173 | up to. In addition to the output at `final_endpoint`, all the outputs
174 | at endpoints up to `final_endpoint` will also be returned, in a
175 | dictionary. `final_endpoint` must be one of
176 | InceptionI3d.VALID_ENDPOINTS (default 'Logits').
177 | name: A string (optional). The name of this module.
178 | Raises:
179 | ValueError: if `final_endpoint` is not recognized.
180 | """
181 |
182 | if final_endpoint not in self.VALID_ENDPOINTS:
183 | raise ValueError('Unknown final endpoint %s' % final_endpoint)
184 |
185 | super(InceptionI3d, self).__init__()
186 | self._num_classes = num_classes
187 | self._spatial_squeeze = spatial_squeeze
188 | self._final_endpoint = final_endpoint
189 | self.logits = None
190 |
191 | if self._final_endpoint not in self.VALID_ENDPOINTS:
192 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint)
193 |
194 | self.end_points = {}
195 | end_point = 'Conv3d_1a_7x7'
196 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], stride=(2, 2, 2), padding=(3, 3, 3), name=name + end_point)
197 | self.__freeze_layer(self.end_points[end_point])
198 | if self._final_endpoint == end_point:
199 | return
200 |
201 | end_point = 'MaxPool3d_2a_3x3'
202 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0)
203 | self.__freeze_layer(self.end_points[end_point])
204 | if self._final_endpoint == end_point:
205 | return
206 |
207 | end_point = 'Conv3d_2b_1x1'
208 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, name=name + end_point)
209 | self.__freeze_layer(self.end_points[end_point])
210 | if self._final_endpoint == end_point:
211 | return
212 |
213 | end_point = 'Conv3d_2c_3x3'
214 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, name=name + end_point)
215 | self.__freeze_layer(self.end_points[end_point])
216 | if self._final_endpoint == end_point:
217 | return
218 |
219 | end_point = 'MaxPool3d_3a_3x3'
220 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0)
221 | self.__freeze_layer(self.end_points[end_point])
222 | if self._final_endpoint == end_point:
223 | return
224 |
225 | end_point = 'Mixed_3b'
226 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point)
227 | self.__freeze_layer(self.end_points[end_point])
228 | if self._final_endpoint == end_point:
229 | return
230 |
231 | end_point = 'Mixed_3c'
232 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point)
233 | self.__freeze_layer(self.end_points[end_point])
234 | if self._final_endpoint == end_point:
235 | return
236 |
237 | end_point = 'MaxPool3d_4a_3x3'
238 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0)
239 | self.__freeze_layer(self.end_points[end_point])
240 | if self._final_endpoint == end_point:
241 | return
242 |
243 | end_point = 'Mixed_4b'
244 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point)
245 | self.__freeze_layer(self.end_points[end_point])
246 | if self._final_endpoint == end_point:
247 | return
248 |
249 | end_point = 'Mixed_4c'
250 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point)
251 | self.__freeze_layer(self.end_points[end_point])
252 | if self._final_endpoint == end_point:
253 | return
254 |
255 | end_point = 'Mixed_4d'
256 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point)
257 | self.__freeze_layer(self.end_points[end_point])
258 | if self._final_endpoint == end_point:
259 | return
260 |
261 | end_point = 'Mixed_4e'
262 | self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point)
263 | self.__freeze_layer(self.end_points[end_point])
264 | if self._final_endpoint == end_point:
265 | return
266 |
267 | end_point = 'Mixed_4f'
268 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point)
269 | self.__freeze_layer(self.end_points[end_point])
270 | if self._final_endpoint == end_point:
271 | return
272 |
273 | end_point = 'MaxPool3d_5a_2x2'
274 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0)
275 | self.__freeze_layer(self.end_points[end_point])
276 | if self._final_endpoint == end_point:
277 | return
278 |
279 | end_point = 'Mixed_5b'
280 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], name + end_point)
281 | self.__freeze_layer(self.end_points[end_point])
282 | if self._final_endpoint == end_point:
283 | return
284 |
285 | end_point = 'Mixed_5c'
286 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point)
287 | self.__freeze_layer(self.end_points[end_point])
288 | if self._final_endpoint == end_point:
289 | return
290 |
291 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1))
292 | self.__freeze_layer(self.avg_pool)
293 | self.dropout = nn.Dropout(dropout_keep_prob)
294 | self.__freeze_layer(self.dropout)
295 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits')
296 | self.__freeze_layer(self.logits)
297 |
298 | self.build()
299 |
300 | def replace_logits(self, num_classes):
301 | self._num_classes = num_classes
302 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, kernel_shape=[1, 1, 1], padding=0, activation_fn=None, use_batch_norm=False, use_bias=True, name='logits')
303 | pass
304 |
305 | def build(self):
306 | for k in self.end_points.keys():
307 | self.add_module(k, self.end_points[k])
308 |
309 | def forward(self, x):
310 | for end_point in self.VALID_ENDPOINTS:
311 | if end_point in self.end_points:
312 | # use _modules to work with dataparallel
313 | # x = self.end_points[end_point](x)
314 | x = self._modules[end_point](x)
315 | return x
316 |
317 | def extract_features(self, x):
318 | for end_point in self.VALID_ENDPOINTS:
319 | if end_point in self.end_points:
320 | x = self._modules[end_point](x)
321 | return self.avg_pool(x)
322 |
323 | def __freeze_layer(self, layer):
324 | layer_params = layer.parameters()
325 | for param in layer_params:
326 | param.requires_grad = False
327 |
--------------------------------------------------------------------------------
/nets/resnet_152_keras.py:
--------------------------------------------------------------------------------
1 | import os
2 | import keras.backend as K
3 |
4 | from keras import initializers
5 | from keras.layers import Input
6 | from keras.layers import Dense
7 | from keras.layers import Conv2D
8 | from keras.layers import MaxPooling2D
9 | from keras.layers import AveragePooling2D
10 | from keras.layers import ZeroPadding2D
11 | from keras.layers import Flatten
12 | from keras.layers import Activation
13 | from keras.layers import add
14 | from keras.layers import BatchNormalization
15 | from keras.layers import GlobalAveragePooling2D
16 | from keras.layers import GlobalMaxPooling2D
17 |
18 | from keras.models import Model
19 | from keras.engine import Layer, InputSpec
20 | from keras.engine import get_source_inputs
21 |
22 | from keras.utils.data_utils import get_file
23 | from keras.applications import imagenet_utils
24 |
25 | from core import const as c
26 |
27 | # WEIGHTS_PATH = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels.h5'
28 | # WEIGHTS_PATH_NO_TOP = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5'
29 | WEIGHTS_PATH = '%s/keras_models/resnet_152/resnet152_weights_tf_dim_ordering_tf_kernels.h5' % (c.DATA_ROOT_PATH)
30 | WEIGHTS_PATH_NO_TOP = '%s/keras_models/resnet_152/resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5' % (c.DATA_ROOT_PATH)
31 |
32 | class Scale(Layer):
33 | """ Custom Layer for ResNet used for BatchNormalization.
34 |
35 |     Learns a set of weights and biases used for scaling the input data.
36 |     The output is simply an element-wise multiplication of the input by the
37 |     learned weights 'gamma', plus the learned biases 'beta':
38 |         out = in * gamma + beta,
39 |     where 'gamma' and 'beta' are the learned weights and biases.
40 | # Arguments
41 | axis: integer, axis along which to normalize in mode 0. For instance,
42 | if your input tensor has shape (samples, channels, rows, cols),
43 | set axis to 1 to normalize per feature map (channels axis).
44 | momentum: momentum in the computation of the
45 | exponential average of the mean and standard deviation
46 | of the data, for feature-wise normalization.
47 | weights: Initialization weights.
48 | List of 2 Numpy arrays, with shapes:
49 | `[(input_shape,), (input_shape,)]`
50 | beta_init: name of initialization function for shift parameter
51 | (see [initializers](../initializers.md)), or alternatively,
52 | Theano/TensorFlow function to use for weights initialization.
53 | This parameter is only relevant if you don't pass a `weights` argument.
54 | gamma_init: name of initialization function for scale parameter (see
55 | [initializers](../initializers.md)), or alternatively,
56 | Theano/TensorFlow function to use for weights initialization.
57 | This parameter is only relevant if you don't pass a `weights` argument.
58 | """
59 |
60 | def __init__(self, weights=None, axis=-1, momentum=0.9, beta_init='zero', gamma_init='one', **kwargs):
61 | self.momentum = momentum
62 | self.axis = axis
63 | self.beta_init = initializers.get(beta_init)
64 | self.gamma_init = initializers.get(gamma_init)
65 | self.initial_weights = weights
66 | super(Scale, self).__init__(**kwargs)
67 |
68 | def build(self, input_shape):
69 | self.input_spec = [InputSpec(shape=input_shape)]
70 | shape = (int(input_shape[self.axis]),)
71 |
72 | self.gamma = K.variable(self.gamma_init(shape), name='%s_gamma' % self.name)
73 | self.beta = K.variable(self.beta_init(shape), name='%s_beta' % self.name)
74 | self.trainable_weights = [self.gamma, self.beta]
75 |
76 | if self.initial_weights is not None:
77 | self.set_weights(self.initial_weights)
78 | del self.initial_weights
79 |
80 | def call(self, x, mask=None):
81 | input_shape = self.input_spec[0].shape
82 | broadcast_shape = [1] * len(input_shape)
83 | broadcast_shape[self.axis] = input_shape[self.axis]
84 |
85 | out = K.reshape(self.gamma, broadcast_shape) * x + K.reshape(self.beta, broadcast_shape)
86 | return out
87 |
88 | def get_config(self):
89 | config = {"momentum": self.momentum, "axis": self.axis}
90 | base_config = super(Scale, self).get_config()
91 | return dict(list(base_config.items()) + list(config.items()))
92 |
93 | def identity_block(input_tensor, kernel_size, filters, stage, block):
94 | """
95 | The identity_block is the block that has no conv layer at shortcut
96 | # Arguments
97 | input_tensor: input tensor
98 |         kernel_size: default 3, the kernel size of middle conv layer at main path
99 | filters: list of integers, the nb_filters of 3 conv layer at main path
100 | stage: integer, current stage label, used for generating layer names
101 | block: 'a','b'..., current block label, used for generating layer names
102 | """
103 | eps = 1.1e-5
104 | nb_filter1, nb_filter2, nb_filter3 = filters
105 | conv_name_base = 'res' + str(stage) + block + '_branch'
106 | bn_name_base = 'bn' + str(stage) + block + '_branch'
107 | scale_name_base = 'scale' + str(stage) + block + '_branch'
108 |
109 | if K.image_data_format() == 'channels_last':
110 | bn_axis = 3
111 | else:
112 | bn_axis = 1
113 |
114 | x = Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', use_bias=False)(input_tensor)
115 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
116 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
117 | x = Activation('relu', name=conv_name_base + '2a_relu')(x)
118 |
119 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
120 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), name=conv_name_base + '2b', use_bias=False)(x)
121 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
122 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
123 | x = Activation('relu', name=conv_name_base + '2b_relu')(x)
124 |
125 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x)
126 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
127 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
128 |
129 | x = add([x, input_tensor], name='res' + str(stage) + block)
130 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
131 | return x
132 |
133 | def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
134 | """ conv_block is the block that has a conv layer at shortcut
135 | # Arguments
136 | input_tensor: input tensor
137 |         kernel_size: default 3, the kernel size of middle conv layer at main path
138 | filters: list of integers, the nb_filters of 3 conv layer at main path
139 | stage: integer, current stage label, used for generating layer names
140 | block: 'a','b'..., current block label, used for generating layer names
141 | Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
142 | And the shortcut should have subsample=(2,2) as well
143 | """
144 |
145 | eps = 1.1e-5
146 | nb_filter1, nb_filter2, nb_filter3 = filters
147 | conv_name_base = 'res' + str(stage) + block + '_branch'
148 | bn_name_base = 'bn' + str(stage) + block + '_branch'
149 | scale_name_base = 'scale' + str(stage) + block + '_branch'
150 |
151 | if K.image_data_format() == 'channels_last':
152 | bn_axis = 3
153 | else:
154 | bn_axis = 1
155 |
156 | x = Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', use_bias=False)(input_tensor)
157 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
158 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
159 | x = Activation('relu', name=conv_name_base + '2a_relu')(x)
160 |
161 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
162 | x = Conv2D(nb_filter2, (kernel_size, kernel_size),
163 | name=conv_name_base + '2b', use_bias=False)(x)
164 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
165 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
166 | x = Activation('relu', name=conv_name_base + '2b_relu')(x)
167 |
168 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x)
169 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
170 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
171 |
172 | shortcut = Conv2D(nb_filter3, (1, 1), strides=strides,
173 | name=conv_name_base + '1', use_bias=False)(input_tensor)
174 | shortcut = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '1')(shortcut)
175 | shortcut = Scale(axis=bn_axis, name=scale_name_base + '1')(shortcut)
176 |
177 | x = add([x, shortcut], name='res' + str(stage) + block)
178 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
179 | return x
180 |
181 | def ResNet152(include_top=True, weights='imagenet', input_tensor=None, input_shape=None, pooling=None, classes=1000):
182 | """ Instantiates the ResNet152 architecture.
183 | Optionally loads weights pre-trained
184 | on ImageNet. Note that when using TensorFlow,
185 | for best performance you should set
186 | `image_data_format='channels_last'` in your Keras config
187 | at ~/.keras/keras.json.
188 | The model and the weights are compatible only with
189 | TensorFlow. The data format
190 | convention used by the model is the one
191 | specified in your Keras config file.
192 | # Arguments
193 | include_top: whether to include the fully-connected
194 | layer at the top of the network.
195 | weights: one of `None` (random initialization),
196 | 'imagenet' (pre-training on ImageNet),
197 | or the path to the weights file to be loaded.
198 | input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
199 | to use as image input for the model.
200 | input_shape: optional shape tuple, only to be specified
201 | if `include_top` is False (otherwise the input shape
202 | has to be `(224, 224, 3)` (with `channels_last` data format)
203 | or `(3, 224, 224)` (with `channels_first` data format).
204 |             It should have exactly 3 input channels,
205 | and width and height should be no smaller than 197.
206 | E.g. `(200, 200, 3)` would be one valid value.
207 | pooling: Optional pooling mode for feature extraction
208 | when `include_top` is `False`.
209 | - `None` means that the output of the model will be
210 | the 4D tensor output of the
211 | last convolutional layer.
212 | - `avg` means that global average pooling
213 | will be applied to the output of the
214 | last convolutional layer, and thus
215 | the output of the model will be a 2D tensor.
216 | - `max` means that global max pooling will
217 | be applied.
218 | classes: optional number of classes to classify images
219 | into, only to be specified if `include_top` is True, and
220 | if no `weights` argument is specified.
221 | # Returns
222 | A Keras model instance.
223 | # Raises
224 | ValueError: in case of invalid argument for `weights`,
225 | or invalid input shape.
226 | """
227 |
228 | eps = 1.1e-5
229 |
230 | if not (weights in {'imagenet', None} or os.path.exists(weights)):
231 | raise ValueError('The `weights` argument should be either '
232 | '`None` (random initialization), `imagenet` '
233 | '(pre-training on ImageNet), '
234 | 'or the path to the weights file to be loaded.')
235 |
236 | if weights == 'imagenet' and include_top and classes != 1000:
237 | raise ValueError('If using `weights` as imagenet with `include_top`'
238 | ' as true, `classes` should be 1000')
239 |
240 | # Determine proper input shape
241 | input_shape = imagenet_utils._obtain_input_shape(input_shape,
242 | default_size=224,
243 | min_size=197,
244 | data_format=K.image_data_format(),
245 | require_flatten=include_top,
246 | weights=weights)
247 |
248 | if input_tensor is None:
249 | img_input = Input(shape=input_shape)
250 | else:
251 | if not K.is_keras_tensor(input_tensor):
252 | img_input = Input(tensor=input_tensor, shape=input_shape, name='data')
253 | else:
254 | img_input = input_tensor
255 |
256 | # Handle dimension ordering for different backends
257 |     if K.image_data_format() == 'channels_last':  # consistent with the blocks above; image_dim_ordering() is deprecated
258 | bn_axis = 3
259 | else:
260 | bn_axis = 1
261 |
262 | x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input)
263 | x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=False)(x)
264 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name='bn_conv1')(x)
265 | x = Scale(axis=bn_axis, name='scale_conv1')(x)
266 | x = Activation('relu', name='conv1_relu')(x)
267 | x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1', padding='same')(x)
268 |
269 | x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
270 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
271 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
272 |
273 | x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
274 | for i in range(1, 8):
275 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='b' + str(i))
276 |
277 | x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
278 | for i in range(1, 36):
279 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b' + str(i))
280 |
281 | x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
282 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
283 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
284 |
285 | if include_top:
286 | # Classification block
287 | x = AveragePooling2D((7, 7), name='avg_pool')(x)
288 | x = Flatten()(x)
289 | x = Dense(classes, activation='softmax', name='fc1000')(x)
290 | else:
291 | if pooling == 'avg':
292 | x = GlobalAveragePooling2D()(x)
293 | elif pooling == 'max':
294 | x = GlobalMaxPooling2D()(x)
295 |
296 | # Ensure that the model takes into account
297 | # any potential predecessors of `input_tensor`.
298 | if input_tensor is not None:
299 | inputs = get_source_inputs(input_tensor)
300 | else:
301 | inputs = img_input
302 |
303 | # Create model
304 | model = Model(inputs, x, name='resnet152')
305 |
306 | # Load weights
307 | if weights == 'imagenet':
308 | if include_top:
309 | weights_path = WEIGHTS_PATH
310 | else:
311 | weights_path = WEIGHTS_PATH_NO_TOP
312 | model.load_weights(weights_path)
313 |
314 | elif weights is not None:
315 | model.load_weights(weights)
316 |
317 | return model
318 |
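# Minimal usage sketch (assumes the local weight files referenced by WEIGHTS_PATH / WEIGHTS_PATH_NO_TOP exist):
# >>> model = ResNet152(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')
# >>> model.output_shape
# (None, 2048)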
--------------------------------------------------------------------------------
/core/image_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | ########################################################################
5 | # GNU General Public License v3.0
6 | # GNU GPLv3
7 | # Copyright (c) 2019, Noureldien Hussein
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
21 | ########################################################################
22 |
23 | """
24 | Helper functions for images.
25 | """
26 |
27 | from __future__ import absolute_import
28 | from __future__ import division
29 | from __future__ import print_function
30 | from __future__ import unicode_literals
31 |
32 | import cv2
33 | import numpy as np
34 | import random
35 | import math
36 | from multiprocessing.dummy import Pool
37 |
38 | from core import utils
39 |
40 | # region Frame Resizing
41 |
42 | def resize_frame(image, target_height=224, target_width=224):
43 | return __resize_frame(image, target_height, target_width)
44 |
45 | def resize_keep_aspect_ratio_max_dim(image, max_dim=None):
46 | return __resize_keep_aspect_ratio_max_dim(image, max_dim)
47 |
48 | def resize_keep_aspect_ratio_min_dim(image, min_dim=None):
49 | return __resize_keep_aspect_ratio_min_dim(image, min_dim)
50 |
51 | def resize_crop(image, target_height=224, target_width=224):
52 | return __resize_crop(image, target_height, target_width)
53 |
54 | def resize_crop_scaled(image, target_height=224, target_width=224):
55 | return __resize_crop_scaled(image, target_height, target_width)
56 |
57 | def resize_keep_aspect_ratio_padded(image, target_height=224, target_width=224):
58 | return __resize_keep_aspect_ratio_padded(image, target_height, target_width)
59 |
60 | def __resize_frame(image, target_height=224, target_width=224):
61 | """
62 |     Resize to the given dimensions without preserving the aspect ratio of the given image.
63 | """
64 | if len(image.shape) == 2:
65 | image = np.tile(image[:, :, None], 3)
66 | elif len(image.shape) == 4:
67 | image = image[:, :, :, 0]
68 |
69 |     resized_image = cv2.resize(image, dsize=(target_width, target_height))  # dsize is (width, height)
70 | return resized_image
71 |
72 | def __resize_keep_aspect_ratio_max_dim(image, max_dim=224):
73 | """
74 |     Resize the given image while maintaining the aspect ratio, so that its larger dimension equals max_dim.
75 | """
76 | if len(image.shape) == 2:
77 | image = np.tile(image[:, :, None], 3)
78 | elif len(image.shape) == 4:
79 | image = image[:, :, :, 0]
80 |
81 | height = image.shape[0]
82 | width = image.shape[1]
83 |
84 | if height > width:
85 | target_height = max_dim
86 | target_width = int(target_height * width / float(height))
87 | else:
88 | target_width = max_dim
89 | target_height = int(target_width * height / float(width))
90 |
91 | resized_image = cv2.resize(image, dsize=(target_width, target_height))
92 | return resized_image
93 |
94 | def __resize_keep_aspect_ratio_min_dim(image, min_dim=224):
95 | """
96 |     Resize the given image while maintaining the aspect ratio, so that its smaller dimension equals min_dim.
97 | """
98 | if len(image.shape) == 2:
99 | image = np.tile(image[:, :, None], 3)
100 | elif len(image.shape) == 4:
101 | image = image[:, :, :, 0]
102 |
103 | height = image.shape[0]
104 | width = image.shape[1]
105 |
106 | if height > width:
107 | target_width = min_dim
108 | target_height = int(target_width * height / float(width))
109 | else:
110 | target_height = min_dim
111 | target_width = int(target_height * width / float(height))
112 |
113 | resized_image = cv2.resize(image, dsize=(target_width, target_height))
114 | return resized_image
115 |
116 | def __resize_crop(image, target_height=224, target_width=224):
117 | if len(image.shape) == 2:
118 | image = np.tile(image[:, :, None], 3)
119 | elif len(image.shape) == 4:
120 | image = image[:, :, :, 0]
121 |
122 | height, width, rgb = image.shape
123 | if width == height:
124 |         resized_image = cv2.resize(image, (target_width, target_height))
125 | 
126 |     elif height < width:
127 |         resized_image = cv2.resize(image, (int(width * float(target_height) / height), target_height))  # dsize is (width, height)
128 |         cropping_length = int((resized_image.shape[1] - target_width) / 2)
129 |         resized_image = resized_image[:, cropping_length:resized_image.shape[1] - cropping_length]
130 | 
131 |     else:
132 |         resized_image = cv2.resize(image, (target_width, int(height * float(target_width) / width)))  # dsize is (width, height)
133 |         cropping_length = int((resized_image.shape[0] - target_height) / 2)
134 |         resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length, :]
135 | 
136 |     resized_image = cv2.resize(resized_image, (target_width, target_height))
137 | return resized_image
138 |
139 | def __resize_crop_scaled(image, target_height=224, target_width=224):
140 |     # re-scale the image by a ratio of 3/4 so a landscape or portrait image becomes closer to square,
141 |     # then resize_crop it
142 | 
143 |     # for example, if the input image (height*width) is 400*1000, it becomes 400*750, i.e. (400, 1000 * 3/4)
144 |
145 | if len(image.shape) == 2:
146 | image = np.tile(image[:, :, None], 3)
147 | elif len(image.shape) == 4:
148 | image = image[:, :, :, 0]
149 |
150 | height, width, _ = image.shape
151 | if width == height:
152 |         resized_image = cv2.resize(image, (target_width, target_height))
153 | else:
154 |
155 |         # first, rescale it, but only if rescaling does not bring the scaled dimension below the target dimension (224)
156 | scale_factor = 3 / 4.0
157 | if height < width:
158 | new_width = int(width * scale_factor)
159 | if new_width >= target_width:
160 | image = cv2.resize(image, (new_width, height))
161 | else:
162 | new_height = int(height * scale_factor)
163 | if new_height >= target_height:
164 | image = cv2.resize(image, (width, new_height))
165 |
166 | # now, resize and crop
167 | height, width, _ = image.shape
168 | if height < width:
169 |             resized_image = cv2.resize(image, (int(width * float(target_height) / height), target_height))  # dsize is (width, height)
170 |             cropping_length = int((resized_image.shape[1] - target_width) / 2)
171 |             resized_image = resized_image[:, cropping_length:resized_image.shape[1] - cropping_length]
172 | 
173 |         else:
174 |             resized_image = cv2.resize(image, (target_width, int(height * float(target_width) / width)))  # dsize is (width, height)
175 |             cropping_length = int((resized_image.shape[0] - target_height) / 2)
176 | resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length, :]
177 |
178 |         # this check is important because the integer cropping can leave the image one pixel larger than the target
179 |         height, width, _ = resized_image.shape
180 |         if height > target_height or width > target_width:
181 |             resized_image = cv2.resize(resized_image, (target_width, target_height))
182 |
183 | return resized_image
184 |
185 | def __resize_keep_aspect_ratio_padded(image, target_height=224, target_width=224):
186 | """
187 |     Resize the frame while keeping the aspect ratio, then zero-pad the result to the given dimensions.
188 | """
189 |
190 | if len(image.shape) == 2:
191 | image = np.tile(image[:, :, None], 3)
192 | elif len(image.shape) == 4:
193 | image = image[:, :, :, 0]
194 |
195 | original_height, original_width, _ = image.shape
196 | original_aspect_ratio = original_height / float(original_width)
197 | target_aspect_ratio = target_height / float(target_width)
198 |
199 | if target_aspect_ratio >= original_aspect_ratio:
200 | if original_width >= original_height:
201 | max_dim = target_width
202 | else:
203 | max_dim = int(original_height * target_width / float(original_width))
204 | else:
205 | if original_height >= original_width:
206 | max_dim = target_height
207 | else:
208 | max_dim = int(original_width * target_height / float(original_height))
209 |
210 | image = __resize_keep_aspect_ratio_max_dim(image, max_dim=max_dim)
211 |
212 | new_height, new_width, _ = image.shape
213 | new_aspect_ratio = new_height / float(new_width)
214 |
215 | # do zero-padding for the image (vertical or horizontal)
216 | img_padded = np.zeros((target_height, target_width, 3), dtype=image.dtype)
217 |
218 | if target_aspect_ratio < new_aspect_ratio:
219 | # horizontal padding
220 | y1 = 0
221 | y2 = new_height
222 | x1 = int((target_width - new_width) / 2.0)
223 | x2 = x1 + new_width
224 | else:
225 | # vertical padding
226 | x1 = 0
227 | x2 = new_width
228 | y1 = int((target_height - new_height) / 2.0)
229 | y2 = y1 + new_height
230 |
231 | img_padded[y1:y2, x1:x2, :] = image
232 | return img_padded
233 |
234 | # endregion
235 |
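
# Usage sketch for the padded resize above (illustrative; assumes the module is importable
# as core.image_utils and that numpy/cv2 are available). For a landscape 480x640 frame the
# function first scales it to 168x224 with the aspect ratio preserved, then zero-pads it
# with 28 rows at the top and bottom to reach the 224x224 target.

import numpy as np
from core import image_utils

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # hypothetical BGR frame, height=480, width=640
padded = image_utils.resize_keep_aspect_ratio_padded(frame, target_height=224, target_width=224)
assert padded.shape == (224, 224, 3)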
236 | # region Image Reader ResNet-152 Keras
237 |
238 | class AsyncImageReaderResNet152Keras():
239 | def __init__(self, bgr_mean, n_threads=20):
240 | random.seed(101)
241 | np.random.seed(101)
242 |
243 | self.__is_busy = False
244 | self.__images = None
245 | self.__n_channels = 3
246 | self.__img_dim = 224
247 | self.__bgr_mean = bgr_mean
248 |
249 | self.__n_threads_in_pool = n_threads
250 | self.__pool = Pool(self.__n_threads_in_pool)
251 |
252 | def load_imgs_in_batch(self, image_pathes):
253 | self.__is_busy = True
254 |
255 | n_pathes = len(image_pathes)
256 | idxces = np.arange(0, n_pathes)
257 |
258 | # parameters passed to the reading function
259 | params = [data_item for data_item in zip(idxces, image_pathes)]
260 |
261 |         # set the list of images before starting to read
262 | imgs_shape = (n_pathes, self.__img_dim, self.__img_dim, self.__n_channels)
263 | self.__images = np.zeros(imgs_shape, dtype=np.float32)
264 |
265 | # start pool of threads
266 | self.__pool.map_async(self.__preprocess_img_wrapper, params, callback=self.__thread_pool_callback)
267 |
268 | def get_images(self):
269 | if self.__is_busy:
270 | raise Exception('Sorry, you can\'t get images while threads are running!')
271 | else:
272 | return self.__images
273 |
274 | def is_busy(self):
275 | return self.__is_busy
276 |
277 | def __thread_pool_callback(self, args):
278 | self.__is_busy = False
279 |
280 | def __preprocess_img_wrapper(self, params):
281 | try:
282 | self.__preprocess_img(params)
283 | except Exception as exp:
284 | print ('Error in __preprocess_img')
285 | print (exp)
286 |
287 | def __preprocess_img(self, params):
288 |
289 | idx = params[0]
290 | path = params[1]
291 |
292 | img = cv2.imread(path)
293 | img = img.astype(np.float32)
294 |
295 | # subtract mean pixel from image
296 | img[:, :, 0] -= self.__bgr_mean[0]
297 | img[:, :, 1] -= self.__bgr_mean[1]
298 | img[:, :, 2] -= self.__bgr_mean[2]
299 |
300 | # convert from bgr to rgb
301 | img = img[:, :, (2, 1, 0)]
302 |
303 | self.__images[idx] = img
304 |
305 | def close(self):
306 | self.__pool.close()
307 | self.__pool.terminate()
308 |
309 | # endregion
310 |
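
# Usage sketch for the asynchronous reader above (illustrative; the frame paths and the BGR
# mean are hypothetical, the project supplies its own mean). Note that the reader only
# subtracts the mean and swaps BGR to RGB, so the frames on disk are expected to be
# 224x224 already.

import time
import numpy as np
from core import image_utils

bgr_mean = np.array([103.939, 116.779, 123.68], dtype=np.float32)  # assumed ImageNet BGR mean
reader = image_utils.AsyncImageReaderResNet152Keras(bgr_mean, n_threads=8)

frame_paths = ['/some/dataset/frame_0001.jpg', '/some/dataset/frame_0002.jpg']  # hypothetical
reader.load_imgs_in_batch(frame_paths)
while reader.is_busy():
    time.sleep(0.1)  # the thread-pool callback clears the busy flag once all frames are read

images = reader.get_images()  # float32 array of shape (len(frame_paths), 224, 224, 3)
reader.close()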
311 | # region Image/Video Readers MultiTHUMOS
312 |
313 | class AsyncImageReaderMultiTHUMOSForI3DKerasModel():
314 | def __init__(self, n_threads=20):
315 | random.seed(101)
316 | np.random.seed(101)
317 |
318 | self.__is_busy = False
319 | self.__images = None
320 | self.__n_channels = 3
321 | self.__img_dim = 224
322 |
323 | self.__n_threads_in_pool = n_threads
324 | self.__pool = Pool(self.__n_threads_in_pool)
325 |
326 | def load_imgs_in_batch(self, image_pathes):
327 | self.__is_busy = True
328 |
329 | n_pathes = len(image_pathes)
330 | idxces = np.arange(0, n_pathes)
331 |
332 | # parameters passed to the reading function
333 | params = [data_item for data_item in zip(idxces, image_pathes)]
334 |
335 |         # set the list of images before starting to read
336 | imgs_shape = (n_pathes, self.__img_dim, self.__img_dim, self.__n_channels)
337 | self.__images = np.zeros(imgs_shape, dtype=np.float32)
338 |
339 | # start pool of threads
340 | self.__pool.map_async(self.__preprocess_img_wrapper, params, callback=self.__thread_pool_callback)
341 |
342 | def get_images(self):
343 | if self.__is_busy:
344 | raise Exception('Sorry, you can\'t get images while threads are running!')
345 | else:
346 | return self.__images
347 |
348 | def is_busy(self):
349 | return self.__is_busy
350 |
351 | def __thread_pool_callback(self, args):
352 | self.__is_busy = False
353 |
354 | def __preprocess_img_wrapper(self, params):
355 | try:
356 | self.__preprocess_img(params)
357 | except Exception as exp:
358 | print ('Error in __preprocess_img')
359 | print (exp)
360 |
361 | def __preprocess_img(self, params):
362 |
363 | idx = params[0]
364 | path = params[1]
365 |
366 | img = cv2.imread(path)
367 | img = img.astype(np.float32)
368 | # normalize such that values range from -1 to 1
369 | img /= float(127.5)
370 | img -= 1.0
371 | # convert from bgr to rgb
372 | img = img[:, :, (2, 1, 0)]
373 |
374 | self.__images[idx] = img
375 |
376 | def close(self):
377 | self.__pool.close()
378 | self.__pool.terminate()
379 |
380 | # endregion
381 |
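
# The I3D readers above and below normalize pixel values to [-1, 1] instead of subtracting a
# mean. A quick check of that mapping (illustrative sketch):

import numpy as np

pixels = np.array([0.0, 127.5, 255.0], dtype=np.float32)
normalized = pixels / 127.5 - 1.0
assert np.allclose(normalized, [-1.0, 0.0, 1.0])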
382 | # region Image/Video Readers Breakfast
383 |
384 | class AsyncImageReaderBreakfastForI3DKerasModel():
385 | def __init__(self, n_threads=20):
386 | random.seed(101)
387 | np.random.seed(101)
388 |
389 | self.__is_busy = False
390 | self.__images = None
391 | self.__n_channels = 3
392 | self.__img_dim = 224
393 |
394 | self.__n_threads_in_pool = n_threads
395 | self.__pool = Pool(self.__n_threads_in_pool)
396 |
397 | def load_imgs_in_batch(self, image_pathes):
398 | self.__is_busy = True
399 |
400 | n_pathes = len(image_pathes)
401 | idxces = np.arange(0, n_pathes)
402 |
403 | # parameters passed to the reading function
404 | params = [data_item for data_item in zip(idxces, image_pathes)]
405 |
406 |         # set the list of images before starting to read
407 | imgs_shape = (n_pathes, self.__img_dim, self.__img_dim, self.__n_channels)
408 | self.__images = np.zeros(imgs_shape, dtype=np.float32)
409 |
410 | # start pool of threads
411 | self.__pool.map_async(self.__preprocess_img_wrapper, params, callback=self.__thread_pool_callback)
412 |
413 | def get_images(self):
414 | if self.__is_busy:
415 | raise Exception('Sorry, you can\'t get images while threads are running!')
416 | else:
417 | return self.__images
418 |
419 | def is_busy(self):
420 | return self.__is_busy
421 |
422 | def __thread_pool_callback(self, args):
423 | self.__is_busy = False
424 |
425 | def __preprocess_img_wrapper(self, params):
426 | try:
427 | self.__preprocess_img(params)
428 | except Exception as exp:
429 | print ('Error in __preprocess_img')
430 | print (exp)
431 |
432 | def __preprocess_img(self, params):
433 |
434 | idx = params[0]
435 | path = params[1]
436 |
437 | img = cv2.imread(path)
438 | img = img.astype(np.float32)
439 | # normalize such that values range from -1 to 1
440 | img /= float(127.5)
441 | img -= 1.0
442 | # convert from bgr to rgb
443 | img = img[:, :, (2, 1, 0)]
444 |
445 | self.__images[idx] = img
446 |
447 | def close(self):
448 | self.__pool.close()
449 | self.__pool.terminate()
450 |
451 | # endregion
452 |
--------------------------------------------------------------------------------
/data/assets/timeception_layer.svg:
--------------------------------------------------------------------------------
(SVG markup stripped from this dump; the file holds the vector version of the Timeception layer figure.)
--------------------------------------------------------------------------------