├── preprocess
│   ├── __init__.py
│   ├── audio_extractor
│   │   ├── __init__.py
│   │   ├── vggish_params.py
│   │   ├── vggish_input.py
│   │   ├── vggish_postprocess.py
│   │   └── vggish_slim.py
│   ├── imgfeat_extractor
│   │   ├── __init__.py
│   │   └── efficientnet_extractor.py
│   ├── txt_extractor
│   │   └── text_requests.py
│   └── feat_extract_main.py
├── src
│   ├── model
│   │   ├── __init__.py
│   │   ├── cover_head
│   │   │   ├── __init__.py
│   │   │   ├── nasnet
│   │   │   │   ├── __init__.py
│   │   │   │   ├── nasnet_utils_test.py
│   │   │   │   └── README.md
│   │   │   ├── mobilenet_v1_eval.py
│   │   │   ├── mobilenet_v1_train.py
│   │   │   └── nets_factory.py
│   │   ├── fusion_head
│   │   │   ├── __init__.py
│   │   │   └── fusion_se.py
│   │   ├── text_head
│   │   │   ├── __init__.py
│   │   │   └── bert_model.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── base_model.py
│   │   │   └── nextvlad_bert.py
│   │   ├── video_head
│   │   │   ├── __init__.py
│   │   │   └── nextvlad.py
│   │   ├── classify_head
│   │   │   ├── __init__.py
│   │   │   ├── logistic_model.py
│   │   │   └── moe_model.py
│   │   └── image_head
│   │       ├── efficientNet
│   │       │   └── condconv
│   │       │       ├── __init__.py
│   │       │       └── efficientnet_condconv_builder.py
│   │       └── __init__.py
│   ├── dataloader
│   │   ├── __init__.py
│   │   ├── preprocess
│   │   │   ├── __init__.py
│   │   │   ├── cnn_preprocessing
│   │   │   │   ├── __init__.py
│   │   │   │   ├── lenet_preprocessing.py
│   │   │   │   ├── preprocessing_factory.py
│   │   │   │   └── cifarnet_preprocessing.py
│   │   │   ├── text_preprocess.py
│   │   │   ├── image_preprocess.py
│   │   │   ├── label_preprocess.py
│   │   │   └── frames_npy_preprocess.py
│   │   └── dataloader.py
│   └── loss
│       ├── __init__.py
│       └── loss.py
├── utils
│   ├── metrics
│   │   ├── __init__.py
│   │   ├── pr_calculator_per_tag.py
│   │   ├── pr_calculator.py
│   │   └── mean_average_precision_calculator.py
│   ├── k_fold_prepare.py
│   ├── k_fold_fusion.py
│   ├── save_best_ckpt.py
│   └── export_model.py
├── .gitattributes
├── requirement.txt
├── LICENSE
├── .gitignore
├── configs
│   ├── config.tagging.5k.yaml
│   ├── config.tagging.5k.0.yaml
│   ├── config.tagging.5k.1.yaml
│   ├── config.tagging.5k.2.yaml
│   ├── config.tagging.5k.3.yaml
│   ├── config.tagging.5k.4.yaml
│   ├── config.tagging.5k.5.yaml
│   ├── config.tagging.5k.6.yaml
│   ├── config.tagging.5k.7.yaml
│   ├── config.tagging.5k.8.yaml
│   └── config.tagging.5k.9.yaml
├── readme.md
├── init.sh
├── train.sh
├── train.py
└── infer.sh
/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/model/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/dataloader/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/utils/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/model/cover_head/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/preprocess/audio_extractor/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/dataloader/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/preprocess/imgfeat_extractor/__init__.py:
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/model/cover_head/nasnet/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/cnn_preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /src/model/fusion_head/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.fusion_head.fusion_se import SE 2 | 3 | def get_instance(name, paramters): 4 | model = {'SE': SE}[name] 5 | return model(**paramters) -------------------------------------------------------------------------------- /src/model/text_head/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.text_head.bert_model import BERT 2 | 3 | def get_instance(name, paramters): 4 | model = {'BERT': BERT}[name] 5 | return model(**paramters) -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | tqdm 3 | munch 4 | resampy 5 | soundfile 6 | moviepy==1.0.3 7 | gast==0.2.2 8 | ipython 9 | jupyter 10 | matplotlib 11 | pandas 12 | xlrd 13 | openpyxl 14 | tomorrow3 15 | -------------------------------------------------------------------------------- /src/model/models/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.models.nextvlad_bert import NextVladBERT 2 | 3 | def get_instance(name, paramters): 4 | model = {"NextVladBERT": NextVladBERT}[name] 5 | return model(paramters) -------------------------------------------------------------------------------- /src/model/video_head/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.video_head.nextvlad import NeXtVLAD 2 | 3 | def get_instance(name, paramters_dict): 4 | model = {'NeXtVLAD': NeXtVLAD}[name] 5 | return model(**paramters_dict) -------------------------------------------------------------------------------- /src/model/models/base_model.py: -------------------------------------------------------------------------------- 1 | class BaseModel(): 2 | def __init__(self, args): 3 | raise NotImplementedError 4 | def __call__(self, inputs, is_training): 5 | raise NotImplementedError 6 | 7 | def build_loss(self): 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /src/loss/__init__.py: -------------------------------------------------------------------------------- 1 | from src.loss.loss import CrossEntropyLoss 2 | from src.loss.loss import SoftmaxLoss 3 | 4 | def get_instance(name, paramters_dict): 5 | model = {'CrossEntropyLoss': CrossEntropyLoss, 6 | 'SoftmaxLoss': SoftmaxLoss}[name] 7 | return model(**paramters_dict) -------------------------------------------------------------------------------- 
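Each of the get_instance helpers above (fusion_head, text_head, models, video_head, and this loss module) follows the same registry pattern: the class is looked up by the name string taken from the YAML config and instantiated with the accompanying parameter dict. Below is a minimal sketch of how the loss factory might be driven, assuming TF 1.x and the 82-tag setup used elsewhere in the repo; the config keys and placeholder shapes are illustrative, not the exact schema of configs/config.tagging.5k.yaml.

import tensorflow as tf
from src.loss import get_instance as build_loss

# Hypothetical config fragment; the real values come from the YAML configs.
loss_cfg = {'name': 'CrossEntropyLoss', 'paramters_dict': {}}
loss_obj = build_loss(loss_cfg['name'], loss_cfg['paramters_dict'])

predictions = tf.placeholder(tf.float32, [None, 82])  # sigmoid scores per tag
labels = tf.placeholder(tf.float32, [None, 82])       # multi-hot ground truth
# calculate_loss returns a scalar tensor; label_smooth_rate is picked up via **unused_params.
loss_op = loss_obj.calculate_loss(predictions, labels, label_smooth_rate=0.1)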
/src/model/classify_head/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.classify_head.logistic_model import LogisticModel 2 | from src.model.classify_head.moe_model import MoeModel 3 | 4 | def get_instance(name, paramters_dict): 5 | model = {'LogisticModel': LogisticModel, 6 | 'MoeModel': MoeModel}[name] 7 | return model(**paramters_dict) -------------------------------------------------------------------------------- /preprocess/txt_extractor/text_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class VideoASR(): 4 | """视频ASR""" 5 | def request(self, inp): 6 | return inp 7 | 8 | class VideoOCR(): 9 | """视频OCR""" 10 | def request(self, inp): 11 | return inp 12 | 13 | class ImageOCR(): 14 | """图像OCR""" 15 | def request(self, inp): 16 | return inp 17 | 18 | if __name__ == '__main__': 19 | test_image = './test.jpg' 20 | image_ocr = ImageOCR().request(test_image) 21 | print("image_ocr: {}".format(image_ocr)) 22 | 23 | -------------------------------------------------------------------------------- /src/model/image_head/efficientNet/condconv/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /src/model/classify_head/logistic_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow.contrib.slim as slim 2 | import tensorflow as tf 3 | 4 | class LogisticModel(): 5 | """Logistic model with L2 regularization.""" 6 | def __init__(self, num_classes, l2_penalty=None): 7 | self.num_classes = num_classes 8 | self.l2_penalty = 0.0 if l2_penalty==None else l2_penalty 9 | 10 | def __call__(self, model_input): 11 | """ 12 | model_input: 'batch' x 'num_features' matrix of input features. 
13 | Returns: The dimensions of the tensor are batch_size x num_classes.""" 14 | logits = slim.fully_connected( 15 | model_input, self.num_classes, activation_fn=None, 16 | weights_regularizer=slim.l2_regularizer(self.l2_penalty), 17 | biases_regularizer=slim.l2_regularizer(self.l2_penalty), 18 | weights_initializer=slim.variance_scaling_initializer()) 19 | output = tf.nn.sigmoid(logits) 20 | return {"predictions": output, "logits": logits} 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/model/text_head/bert_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from src.model.text_head.bert_base import BertModel,BertConfig 3 | 4 | class BERT(): 5 | def __init__(self, bert_config, bert_emb_encode_size, reuse_variables=tf.AUTO_REUSE): 6 | self.reuse_variables = reuse_variables 7 | self.bert_emb_encode_size = bert_emb_encode_size 8 | self.bert_config = BertConfig(**bert_config) 9 | 10 | def __call__(self, input_ids, is_training): 11 | input_mask = tf.cast(tf.not_equal(input_ids,0),tf.int32) 12 | bert_model = BertModel(config = self.bert_config, 13 | is_training = is_training, 14 | input_ids = input_ids, 15 | input_mask = input_mask, 16 | reuse_variables = self.reuse_variables) 17 | 18 | text_features = bert_model.get_pooled_output() 19 | text_features = tf.layers.dense(text_features, self.bert_emb_encode_size, activation=None, name='text_features', reuse=self.reuse_variables) 20 | text_features = tf.layers.batch_normalization(text_features, training=is_training, reuse=self.reuse_variables) 21 | return text_features 22 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/text_preprocess.py: -------------------------------------------------------------------------------- 1 | import tokenization 2 | import numpy as np 3 | import random 4 | import tensorflow as tf 5 | import os 6 | 7 | seed = 20210627 8 | random.seed(seed) 9 | tf.set_random_seed(seed) 10 | np.random.seed(seed) 11 | os.environ["PYTHONHASHSEED"] = str(seed) 12 | 13 | 14 | class Preprocess: 15 | 16 | def __init__(self, vocab, max_len, is_training=False): 17 | 
self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab) 18 | self.max_len = max_len 19 | self.is_training = is_training 20 | 21 | def __call__(self, text, augment): 22 | with open(text) as f: 23 | data = eval(f.read().strip()) 24 | text = data['video_ocr'] + data['video_asr'] 25 | text = text.replace("|", "") 26 | if augment > 0: 27 | text = text[random.randint(0, int(max(0, len(text) - 50))):] 28 | tokens = ['[CLS]'] + self.tokenizer.tokenize(text) 29 | if augment == 2: 30 | tokens = ['[CLS]'] + [token for token in tokens[1:] if random.random() > 0.1] 31 | ids = self.tokenizer.convert_tokens_to_ids(tokens)[:self.max_len] 32 | ids = ids + [0]*(self.max_len-len(ids)) 33 | return np.array(ids).astype('int64') -------------------------------------------------------------------------------- /src/dataloader/preprocess/image_preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | from cnn_preprocessing import inception_preprocessing 6 | 7 | class Preprocess: 8 | 9 | def __init__(self, is_training, return_idx=False): 10 | self.is_training = is_training 11 | #with tf.get_default_graph(): 12 | self.path_placeholder = tf.placeholder(shape=None,dtype=tf.string) 13 | image = tf.io.read_file(self.path_placeholder) 14 | image = tf.io.decode_image(image, channels=3) 15 | self.image_shape = (224, 224, 3) 16 | #TODO(jefxiong, 对不同模型预处理要通用) 17 | image.set_shape(self.image_shape) 18 | self.image = inception_preprocessing.preprocess_image(image, 224, 224, 19 | is_training=self.is_training, 20 | add_image_summaries=False, 21 | crop_image=self.is_training) 22 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 23 | sess_config.gpu_options.allow_growth = True 24 | self.sess = tf.Session(config=sess_config) 25 | self.return_idx = return_idx 26 | 27 | def __call__(self, path, augment): 28 | if os.path.exists(path): 29 | image = self.sess.run(self.image,feed_dict={self.path_placeholder:path}) 30 | else: 31 | image = np.zeros(self.image_shape) 32 | if self.return_idx: 33 | idx = os.path.basename(path).split('.')[0] 34 | return image, idx 35 | return image 36 | -------------------------------------------------------------------------------- /utils/k_fold_prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from sklearn.model_selection import KFold 5 | 6 | 7 | def parse_ground_truth(path): 8 | dataset = {} 9 | with open(path) as f: 10 | video_feat = [] 11 | for row in f: 12 | row = row.strip() 13 | if len(row) == 0: 14 | assert len(video_feat) == 5 15 | dataset[os.path.split(video_feat[0])[-1].split(".")[0]] = video_feat 16 | video_feat = [] 17 | else: 18 | video_feat.append(row) 19 | return dataset 20 | 21 | 22 | if __name__ == "__main__": 23 | save_train, save_valid, config_path = sys.argv[3:6] 24 | train_valid_data = parse_ground_truth(sys.argv[1]) 25 | train_valid_data.update(parse_ground_truth(sys.argv[2])) 26 | videos = list(train_valid_data.keys()) 27 | kf = KFold(n_splits=10, random_state=2021, shuffle=True) 28 | for i, (train_index, valid_index) in enumerate(kf.split(videos)): 29 | with open(config_path + 'config.tagging.5k.yaml', 'r') as fin, \ 30 | open(config_path + 'config.tagging.5k.{}.yaml'.format(i), 'w') as fout: 31 | config = fin.read().replace("train.txt", 'train_{}.txt'.format(i)) 32 | fout.write(config.replace("val.txt", 'valid_{}.txt'.format(i))) 33 | 34 | with 
open(save_train.format(i), 'w') as f: 35 | for idx in train_index: 36 | f.write(u"\n".join(train_valid_data[videos[idx]]) + '\n\n') 37 | 38 | with open(save_valid.format(i), 'w') as f: 39 | for idx in valid_index: 40 | f.write(u"\n".join(train_valid_data[videos[idx]]) + '\n\n') 41 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/cnn_preprocessing/lenet_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Provides utilities for preprocessing.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | slim = tf.contrib.slim 24 | 25 | 26 | def preprocess_image(image, output_height, output_width, is_training): 27 | """Preprocesses the given image. 28 | 29 | Args: 30 | image: A `Tensor` representing an image of arbitrary size. 31 | output_height: The height of the image after preprocessing. 32 | output_width: The width of the image after preprocessing. 33 | is_training: `True` if we're preprocessing the image for training and 34 | `False` otherwise. 35 | 36 | Returns: 37 | A preprocessed image. 
38 | """ 39 | image = tf.to_float(image) 40 | image = tf.image.resize_image_with_crop_or_pad( 41 | image, output_width, output_height) 42 | image = tf.subtract(image, 128.0) 43 | image = tf.div(image, 128.0) 44 | return image 45 | -------------------------------------------------------------------------------- /utils/metrics/pr_calculator_per_tag.py: -------------------------------------------------------------------------------- 1 | #encoding: utf-8 2 | #Author: jefxiong@tencent.com 3 | 4 | from utils.metrics.pr_calculator import PRCalculator 5 | import numpy as np 6 | import time 7 | 8 | def count_func_time(func): 9 | def call_fun(*args, **kwargs): 10 | start_time = time.time() 11 | func(*args, **kwargs) 12 | end_time = time.time() 13 | print('{} cost {:.3f} sec'.format(func.__name__, end_time-start_time)) 14 | return call_fun 15 | 16 | def map_func(obj, x1, x2): 17 | obj.accumulate(x1, x2) 18 | 19 | class PRCalculatorPerTag(): 20 | def __init__(self, tag_num): 21 | self.tag_num = tag_num 22 | self.pr_calculators = [] 23 | for i in range(self.tag_num): 24 | self.pr_calculators.append(PRCalculator()) 25 | 26 | #@count_func_time 27 | def accumulate(self, predictions, actuals): 28 | """ 29 | predictions: n_example X n_classes 30 | actuals: n_example X n_classes 31 | """ 32 | #n_example X n_classes ==> n_classes * [n_example x 1] 33 | pred_per_tag_list = np.expand_dims(predictions.transpose(), -1) 34 | actuals_per_tag_list = np.expand_dims(actuals.transpose(), -1) 35 | 36 | for i in range(self.tag_num): 37 | self.pr_calculators[i].accumulate(pred_per_tag_list[i], actuals_per_tag_list[i]) 38 | #ret = list(map(map_func, self.pr_calculators, pred_per_tag_list, actuals_per_tag_list)) 39 | 40 | def get_precision_list(self, th=0.5): 41 | return [self.pr_calculators[i].get_precision_at_conf(th) for i in range(self.tag_num)] 42 | 43 | def get_recall_list(self, th=0.5): 44 | return [self.pr_calculators[i].get_recall_at_conf(th) for i in range(self.tag_num)] 45 | 46 | def clear(self): 47 | for i in range(self.tag_num): 48 | self.pr_calculators[i].clear() 49 | -------------------------------------------------------------------------------- /src/model/image_head/__init__.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.slim.nets import resnet_v2 3 | import functools 4 | import tensorflow.contrib.slim as slim 5 | 6 | import src.model.image_head.efficientNet.efficientnet_builder as efficientnet_builder 7 | 8 | networks_map={ 9 | 'resnet_v2_50': resnet_v2.resnet_v2_50, 10 | 'resnet_v2_101': resnet_v2.resnet_v2_101, 11 | 'resnet_v2_152': resnet_v2.resnet_v2_152, 12 | 'resnet_v2_200': resnet_v2.resnet_v2_200, 13 | 'efficientnet': efficientnet_builder.build_model_base, 14 | } 15 | 16 | arg_scopes_map={ 17 | 'resnet_v2_50': resnet_v2.resnet_arg_scope, 18 | 'resnet_v2_101': resnet_v2.resnet_arg_scope, 19 | 'resnet_v2_152': resnet_v2.resnet_arg_scope, 20 | 'resnet_v2_200': resnet_v2.resnet_arg_scope, 21 | } 22 | 23 | def get_network_fn(name, model_name = None): 24 | if name not in networks_map: 25 | raise ValueError('Name of network unknown %s' % name) 26 | func = networks_map.get(name, None) 27 | if model_name is not None: 28 | func = functools.partial(func, model_name = model_name) 29 | @functools.wraps(func) 30 | def network_fn(images, is_training, **kwargs): 31 | if arg_scopes_map.get(name,None) is not None: 32 | arg_scope = arg_scopes_map[name](weight_decay=1e-5) 33 | with slim.arg_scope(arg_scope): 34 | out, _ = 
func(images, num_classes=None, is_training=is_training, **kwargs) 35 | else: 36 | out, _ = func(images, is_training=is_training, **kwargs) 37 | if len(out.get_shape()) ==4: 38 | return out[:,0,0,:] #squeeze conv feat 39 | else: 40 | return out 41 | 42 | if hasattr(func, 'default_image_size'): 43 | network_fn.default_image_size = func.default_image_size 44 | 45 | return network_fn 46 | 47 | def get_instance(name, paramters): 48 | return get_network_fn(name, **paramters) 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 | dmypy.json 112 | 113 | # Pyre type checker 114 | .pyre/ 115 | -------------------------------------------------------------------------------- /preprocess/audio_extractor/vggish_params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Global parameters for the VGGish model. 
17 | 18 | See vggish_slim.py for more information. 19 | """ 20 | 21 | # Architectural constants. 22 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. 23 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. 24 | EMBEDDING_SIZE = 128 # Size of embedding layer. 25 | 26 | # Hyperparameters used in feature and example generation. 27 | SAMPLE_RATE = 16000 28 | STFT_WINDOW_LENGTH_SECONDS = 0.025 29 | STFT_HOP_LENGTH_SECONDS = 0.010 30 | NUM_MEL_BINS = NUM_BANDS 31 | MEL_MIN_HZ = 125 32 | MEL_MAX_HZ = 7500 33 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 34 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 35 | EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 36 | 37 | # Parameters used for embedding postprocessing. 38 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' 39 | PCA_MEANS_NAME = 'pca_means' 40 | QUANTIZE_MIN_VAL = -2.0 41 | QUANTIZE_MAX_VAL = +2.0 42 | 43 | # Hyperparameters used in training. 44 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. 45 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. 46 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. 47 | 48 | # Names of ops, tensors, and features. 49 | INPUT_OP_NAME = 'vggish/input_features' 50 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' 51 | OUTPUT_OP_NAME = 'vggish/embedding' 52 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' 53 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' 54 | -------------------------------------------------------------------------------- /utils/k_fold_fusion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import sys 4 | 5 | 6 | def load_label_dict(path): 7 | str2idx, idx2str = {}, [] 8 | with open(path, encoding="utf8") as f: 9 | for line in f.readlines(): 10 | label, index = line.strip().split('\t') 11 | str2idx[label] = int(index) 12 | idx2str.append(label) 13 | return str2idx, idx2str 14 | 15 | 16 | def json_to_array(output, str2idx): 17 | target = np.zeros(82) 18 | output = output['result'][0] 19 | for label, score in zip(output['labels'], 20 | output['scores']): 21 | target[str2idx[label]] = float(score) 22 | return target 23 | 24 | 25 | if __name__ == "__main__": 26 | num_folds = int(sys.argv[1]) 27 | label_path = sys.argv[2] # './home/tione/notebook/VideoStructuring/dataset/label_id.txt' 28 | predict_path = sys.argv[3] # './home/tione/notebook/VideoStructuring/KFoldResults/test/results_{}/tagging_5k.json' 29 | output_path = sys.argv[4] # './home/tione/notebook/VideoStructuring/results/tagging_5k.json' 30 | topk = int(sys.argv[5]) # 20 31 | 32 | str2idx, idx2str = load_label_dict(label_path) 33 | 34 | full_predict = np.zeros((5000, num_folds, 82)) 35 | for fold in range(num_folds): 36 | with open(predict_path.format(fold), encoding='utf8') as f: 37 | video_names = [] 38 | for vid, (video_name, predict) in enumerate(sorted(json.load(f).items())): 39 | video_names.append(video_name) 40 | full_predict[vid, fold, :] = json_to_array(predict, str2idx) 41 | assert len(video_names) == 5000 42 | full_predict = full_predict.mean(axis=1) 43 | 44 | full_result = {} 45 | for video_name, scores in zip(video_names, full_predict): 46 | video_result = {"result": [{"labels": [], "scores":[]}]} 47 | for score, label in sorted(zip(scores, idx2str), reverse=True)[:topk]: 48 | video_result["result"][0]["labels"].append(label) 49 | video_result["result"][0]["scores"].append("%.4f" % score) 50 | full_result[video_name] = video_result 51 | 52 | 
with open(output_path, 'w', encoding='utf8') as f: 53 | json.dump(full_result, f, ensure_ascii=False, indent=4) 54 | 55 | -------------------------------------------------------------------------------- /utils/metrics/pr_calculator.py: -------------------------------------------------------------------------------- 1 | #encoding: utf-8 2 | #Author: jefxiong@tencent.com 3 | import numpy as np 4 | 5 | class PRCalculator(): 6 | def __init__(self): 7 | # use only two threshold to save eval time 8 | self.threshold_dict={0.5:0, 0.1:1} #TODO(jefxiong, range from 0.9~0.01) 9 | self.precision = np.zeros((len(self.threshold_dict))) 10 | self.recall = np.zeros((len(self.threshold_dict))) 11 | self.accumulate_count = np.zeros((len(self.threshold_dict))) 12 | 13 | def accumulate(self, predictions, actuals): 14 | """ 15 | predictions: n_example X n_classes 16 | actuals: n_example X n_classes 17 | """ 18 | #assert isinstance(predictions, np.ndarray) 19 | #assert isinstance(actuals, np.ndarray) 20 | n_example = predictions.shape[0] 21 | 22 | precision_all = np.zeros((n_example, len(self.threshold_dict))) 23 | recall_all = np.zeros((n_example, len(self.threshold_dict))) 24 | for i in range(n_example): 25 | gt_index = np.nonzero(actuals[i])[0] 26 | for th, th_index in self.threshold_dict.items(): 27 | pred_index = np.nonzero(predictions[i]>th)[0] 28 | tp = np.sum([actuals[i][k] for k in pred_index]) 29 | precision_all[i][th_index] = tp*1.0/len(pred_index) if len(pred_index)>0 else np.nan 30 | recall_all[i][th_index] = tp*1.0/len(gt_index) if len(gt_index)>0 else np.nan 31 | 32 | 33 | valid_accumlate = (np.sum(~np.isnan(precision_all), axis=0)) != 0 34 | self.accumulate_count = self.accumulate_count + valid_accumlate 35 | 36 | precision_all = np.nansum(precision_all,axis=0)/(np.sum(~np.isnan(precision_all), axis=0)+1e-10) 37 | recall_all = np.nansum(recall_all,axis=0)/(np.sum(~np.isnan(recall_all), axis=0)+1e-10) 38 | 39 | self.precision = precision_all + self.precision 40 | self.recall = recall_all + self.recall 41 | 42 | def get_precision_at_conf(self, th=0.5): 43 | index = self.threshold_dict[th] 44 | return self.precision[index]/(1e-10+self.accumulate_count[index]) 45 | 46 | def get_recall_at_conf(self, th=0.5): 47 | index = self.threshold_dict[th] 48 | return self.recall[index]/(1e-10+self.accumulate_count[index]) 49 | 50 | def clear(self): 51 | self.accumulate_count = np.zeros((len(self.threshold_dict))) 52 | self.precision = np.zeros((len(self.threshold_dict))) 53 | self.recall = np.zeros((len(self.threshold_dict))) 54 | -------------------------------------------------------------------------------- /src/model/cover_head/nasnet/nasnet_utils_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Tests for slim.nets.nasnet.nasnet_utils.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | from nets.nasnet import nasnet_utils 24 | 25 | 26 | class NasnetUtilsTest(tf.test.TestCase): 27 | 28 | def testCalcReductionLayers(self): 29 | num_cells = 18 30 | num_reduction_layers = 2 31 | reduction_layers = nasnet_utils.calc_reduction_layers( 32 | num_cells, num_reduction_layers) 33 | self.assertEqual(len(reduction_layers), 2) 34 | self.assertEqual(reduction_layers[0], 6) 35 | self.assertEqual(reduction_layers[1], 12) 36 | 37 | def testGetChannelIndex(self): 38 | data_formats = ['NHWC', 'NCHW'] 39 | for data_format in data_formats: 40 | index = nasnet_utils.get_channel_index(data_format) 41 | correct_index = 3 if data_format == 'NHWC' else 1 42 | self.assertEqual(index, correct_index) 43 | 44 | def testGetChannelDim(self): 45 | data_formats = ['NHWC', 'NCHW'] 46 | shape = [10, 20, 30, 40] 47 | for data_format in data_formats: 48 | dim = nasnet_utils.get_channel_dim(shape, data_format) 49 | correct_dim = shape[3] if data_format == 'NHWC' else shape[1] 50 | self.assertEqual(dim, correct_dim) 51 | 52 | def testGlobalAvgPool(self): 53 | data_formats = ['NHWC', 'NCHW'] 54 | inputs = tf.placeholder(tf.float32, (5, 10, 20, 10)) 55 | for data_format in data_formats: 56 | output = nasnet_utils.global_avg_pool( 57 | inputs, data_format) 58 | self.assertEqual(output.shape, [5, 10]) 59 | 60 | 61 | if __name__ == '__main__': 62 | tf.test.main() 63 | -------------------------------------------------------------------------------- /utils/save_best_ckpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def iterate_files(folder, ftype=None): 6 | assert os.path.isdir(folder), "Path should be a folder" 7 | if isinstance(ftype, str): 8 | ftype = {ftype} 9 | elif isinstance(ftype, (list, tuple)): 10 | ftype = set(ftype) 11 | 12 | for file in os.listdir(folder): 13 | file = os.path.join(folder, file) 14 | if os.path.isfile(file) and \ 15 | (ftype is None or os.path.split(file)[-1].split(".")[-1] in ftype): 16 | yield file 17 | continue 18 | elif os.path.isdir(file): 19 | for subfile in iterate_files(file, ftype): 20 | yield os.path.join(folder, subfile) 21 | 22 | 23 | def remove_folder(folder): 24 | assert os.path.isdir(folder) 25 | for file in os.listdir(folder): 26 | file = os.path.join(folder, file) 27 | if os.path.isfile(file): 28 | os.remove(file) 29 | #print("Remove File: %s" % file) 30 | continue 31 | remove_folder(file) 32 | os.rmdir(folder) 33 | #print("Remove Foler: %s" % folder) 34 | 35 | 36 | def select_best_model(folder): 37 | best_score, best_path = 0.0, None 38 | for model in os.listdir(folder): 39 | if model.startswith("step_"): 40 | _, step, score = model.split("_") 41 | score = float(score) 42 | if score > best_score: 43 | best_score, best_path = score, model 44 | return best_path 45 | 46 | 47 | def remove_bad_model(folder): 48 | assert os.path.isdir(folder), "Path should be a folder" 49 | export = os.path.join(folder, "export") 50 | assert os.path.isdir(export), "Path should contains folder: export" 51 | 52 | best_path = select_best_model(export) 53 | #print("Best Model: %s" % best_path) 54 | best_step = int(best_path.split("_")[1]) 55 | 56 | for file in os.listdir(folder): 57 | if 
file.startswith("model.ckpt-") and \ 58 | int(file.split("-")[1].split(".")[0]) != best_step: 59 | os.remove(os.path.join(folder, file)) 60 | #print("Remove: %s" % os.path.join(folder, file)) 61 | 62 | for file in os.listdir(export): 63 | if file != best_path: 64 | remove_folder(os.path.join(export, file)) 65 | 66 | 67 | if __name__ == "__main__": 68 | remove_bad_model(sys.argv[1]) 69 | -------------------------------------------------------------------------------- /src/model/classify_head/moe_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow.contrib.slim as slim 2 | import tensorflow as tf 3 | 4 | class MoeModel(): 5 | """A softmax over a mixture of logistic models (with L2 regularization).""" 6 | def __init__(self, num_classes, num_mixtures=4, l2_penalty=0.0): 7 | self.vocab_size = num_classes 8 | self.num_mixtures = num_mixtures 9 | self.l2_penalty = l2_penalty 10 | 11 | def __call__(self, model_input): 12 | """Creates a Mixture of (Logistic) Experts model. 13 | The model consists of a per-class softmax distribution over a 14 | configurable number of logistic classifiers. One of the classifiers in the 15 | mixture is not trained, and always predicts 0. 16 | Args: 17 | model_input: 'batch_size' x 'num_features' matrix of input features. 18 | vocab_size: The number of classes in the dataset. 19 | num_mixtures: The number of mixtures (excluding a dummy 'expert' that 20 | always predicts the non-existence of an entity). 21 | l2_penalty: How much to penalize the squared magnitudes of parameter 22 | values. 23 | Returns: 24 | A dictionary with a tensor containing the probability predictions of the 25 | model in the 'predictions' key. The dimensions of the tensor are 26 | batch_size x num_classes. 
27 | """ 28 | gate_activations = slim.fully_connected( 29 | model_input, 30 | self.vocab_size * (self.num_mixtures + 1), 31 | activation_fn=None, 32 | biases_initializer=None, 33 | weights_regularizer=slim.l2_regularizer(self.l2_penalty), 34 | scope="gates") 35 | expert_activations = slim.fully_connected( 36 | model_input, 37 | self.vocab_size * self.num_mixtures, 38 | activation_fn=None, 39 | weights_regularizer=slim.l2_regularizer(self.l2_penalty), 40 | scope="experts") 41 | 42 | gating_distribution = tf.nn.softmax(tf.reshape( 43 | gate_activations, 44 | [-1, self.num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1) 45 | expert_distribution = tf.nn.sigmoid(tf.reshape( 46 | expert_activations, 47 | [-1, self.num_mixtures])) # (Batch * #Labels) x num_mixtures 48 | 49 | final_probabilities_by_class_and_batch = tf.reduce_sum( 50 | gating_distribution[:, :self.num_mixtures] * expert_distribution, 1) 51 | final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, 52 | [-1, self.vocab_size]) 53 | return {"predictions": final_probabilities} 54 | -------------------------------------------------------------------------------- /src/model/fusion_head/fusion_se.py: -------------------------------------------------------------------------------- 1 | import tensorflow.contrib.slim as slim 2 | import tensorflow as tf 3 | 4 | class SE(): 5 | """Dropout + Channel Attention 6 | """ 7 | def __init__(self, drop_rate, hidden1_size, gating_reduction, gating_last_bn=False): 8 | self.drop_rate = drop_rate 9 | self.hidden1_size = hidden1_size 10 | self.gating_reduction = gating_reduction 11 | self.gating_last_bn = gating_last_bn 12 | self.expansion = 1.5 13 | 14 | def __call__(self, input_list, is_training): 15 | #features = [] 16 | #for feature in input_list: 17 | # feature = slim.dropout(feature, keep_prob=1.0 - self.drop_rate, is_training=is_training) 18 | # features.append(slim.fully_connected(feature, 1024, activation_fn=None)) 19 | concat_feat = tf.concat(input_list, 1) 20 | concat_feat = slim.dropout(concat_feat, keep_prob=1. 
- self.drop_rate, is_training=is_training, scope="concat_feat_dropout") 21 | concat_feat_dim = concat_feat.get_shape().as_list()[1] 22 | 23 | hidden1_weights = tf.get_variable("hidden1_weights",[concat_feat_dim, self.hidden1_size], 24 | initializer=slim.variance_scaling_initializer()) 25 | activation = tf.matmul(concat_feat, hidden1_weights) 26 | activation = slim.batch_norm(activation,center=True,scale=True, 27 | is_training=is_training,scope="hidden1_bn",fused=False) 28 | 29 | gating_weights_1 = tf.get_variable("gating_weights_1", 30 | [self.hidden1_size, self.hidden1_size // self.gating_reduction], 31 | initializer=slim.variance_scaling_initializer()) 32 | 33 | gates = tf.matmul(activation, gating_weights_1) 34 | 35 | gates = slim.batch_norm(gates,center=True,scale=True,is_training=is_training, 36 | activation_fn=slim.nn.relu, scope="gating_bn") 37 | gating_weights_2 = tf.get_variable("gating_weights_2", 38 | [self.hidden1_size // self.gating_reduction, self.hidden1_size], 39 | initializer=slim.variance_scaling_initializer() 40 | ) 41 | gates = tf.matmul(gates, gating_weights_2) 42 | if self.gating_last_bn: 43 | gates = slim.batch_norm(gates, center=True, scale=True, is_training=is_training, scope="gating_last_bn") 44 | 45 | gates = tf.sigmoid(gates) 46 | #tf.summary.histogram("final_gates", gates) 47 | activation = tf.multiply(activation, gates) 48 | return activation 49 | -------------------------------------------------------------------------------- /src/model/cover_head/nasnet/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow-Slim NASNet-A Implementation/Checkpoints 2 | This directory contains the code for the NASNet-A model from the paper 3 | [Learning Transferable Architectures for Scalable Image Recognition](https://arxiv.org/abs/1707.07012) by Zoph et al. 4 | In nasnet.py there are three different configurations of NASNet-A that are implementented. One of the models is the NASNet-A built for CIFAR-10 and the 5 | other two are variants of NASNet-A trained on ImageNet, which are listed below. 6 | 7 | # Pre-Trained Models 8 | Two NASNet-A checkpoints are available that have been trained on the 9 | [ILSVRC-2012-CLS](http://www.image-net.org/challenges/LSVRC/2012/) 10 | image classification dataset. Accuracies were computed by evaluating using a single image crop. 11 | 12 | Model Checkpoint | Million MACs | Million Parameters | Top-1 Accuracy| Top-5 Accuracy | 13 | :----:|:------------:|:----------:|:-------:|:-------:| 14 | [NASNet-A_Mobile_224](https://storage.googleapis.com/download.tensorflow.org/models/nasnet-a_mobile_04_10_2017.tar.gz)|564|5.3|74.0|91.6| 15 | [NASNet-A_Large_331](https://storage.googleapis.com/download.tensorflow.org/models/nasnet-a_large_04_10_2017.tar.gz)|23800|88.9|82.7|96.2| 16 | 17 | 18 | Here is an example of how to download the NASNet-A_Mobile_224 checkpoint. The way to download the NASNet-A_Large_331 is the same. 19 | 20 | ```shell 21 | CHECKPOINT_DIR=/tmp/checkpoints 22 | mkdir ${CHECKPOINT_DIR} 23 | cd ${CHECKPOINT_DIR} 24 | wget https://storage.googleapis.com/download.tensorflow.org/models/nasnet-a_mobile_04_10_2017.tar.gz 25 | tar -xvf nasnet-a_mobile_04_10_2017.tar.gz 26 | rm nasnet-a_mobile_04_10_2017.tar.gz 27 | ``` 28 | More information on integrating NASNet Models into your project can be found at the [TF-Slim Image Classification Library](https://github.com/tensorflow/models/blob/master/research/slim/README.md). 
29 | 30 | To get started running models on-device go to [TensorFlow Mobile](https://www.tensorflow.org/mobile/). 31 | 32 | ## Sample Commands for using NASNet-A Mobile and Large Checkpoints for Inference 33 | ------- 34 | Run eval with the NASNet-A mobile ImageNet model 35 | 36 | ```shell 37 | DATASET_DIR=/tmp/imagenet 38 | EVAL_DIR=/tmp/tfmodel/eval 39 | CHECKPOINT_DIR=/tmp/checkpoints/model.ckpt 40 | python tensorflow_models/research/slim/eval_image_classifier \ 41 | --checkpoint_path=${CHECKPOINT_DIR} \ 42 | --eval_dir=${EVAL_DIR} \ 43 | --dataset_dir=${DATASET_DIR} \ 44 | --dataset_name=imagenet \ 45 | --dataset_split_name=validation \ 46 | --model_name=nasnet_mobile \ 47 | --eval_image_size=224 48 | ``` 49 | 50 | Run eval with the NASNet-A large ImageNet model 51 | 52 | ```shell 53 | DATASET_DIR=/tmp/imagenet 54 | EVAL_DIR=/tmp/tfmodel/eval 55 | CHECKPOINT_DIR=/tmp/checkpoints/model.ckpt 56 | python tensorflow_models/research/slim/eval_image_classifier \ 57 | --checkpoint_path=${CHECKPOINT_DIR} \ 58 | --eval_dir=${EVAL_DIR} \ 59 | --dataset_dir=${DATASET_DIR} \ 60 | --dataset_name=imagenet \ 61 | --dataset_split_name=validation \ 62 | --model_name=nasnet_large \ 63 | --eval_image_size=331 64 | ``` 65 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/label_preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import codecs 3 | 4 | def extract_dict(dict_file): 5 | index_to_tag = {} 6 | tag_to_index = {} 7 | for i, line in enumerate(codecs.open(dict_file, 'r', encoding='utf-8')): 8 | line = line.strip() 9 | if '\t' in line: 10 | index, tag = line.split('\t')[:2] 11 | elif ' ' in line: 12 | index, tag = i, line.rsplit(' ', 1)[0] 13 | else: 14 | index, tag = i, line 15 | 16 | try: 17 | index = int(index) 18 | except: 19 | index, tag = int(tag), index 20 | 21 | index_to_tag[index] = tag 22 | tag_to_index[tag] = index 23 | return index_to_tag, tag_to_index 24 | 25 | class Preprocess_index_indentity: 26 | 27 | def __init__(self, 28 | index_dict, 29 | label_num, 30 | sep_token=',', 31 | is_training=False): 32 | self.index_to_tag,self.tag_to_index = extract_dict(index_dict) 33 | self.label_num = label_num 34 | self.sep_token = sep_token 35 | self.is_training = is_training 36 | 37 | def __call__(self, index_str): 38 | index_lst = index_str.split(self.sep_token) 39 | index_lst = [int(index) for index in index_lst] 40 | for index in index_lst: 41 | assert index in self.index_to_tag 42 | return np.array(index_lst).astype('int32') 43 | 44 | class Preprocess_index_sparse_to_dense: 45 | 46 | def __init__(self, 47 | index_dict, 48 | sep_token=',', 49 | is_training=False): 50 | self.index_to_tag,self.tag_to_index = extract_dict(index_dict) 51 | self.sep_token = sep_token 52 | self.is_training = is_training 53 | self.max_index = 0 54 | for index in self.index_to_tag: 55 | self.max_index = max(index, self.max_index) 56 | self.seq_size = self.max_index + 1 57 | self.label_num = self.seq_size 58 | 59 | def __call__(self, index_str): 60 | dense_array = np.zeros(self.seq_size) 61 | index_lst = index_str.split(self.sep_token) 62 | index_lst = [int(index) for index in index_lst] 63 | for index in index_lst: 64 | if index == -1: 65 | continue 66 | assert index in self.index_to_tag 67 | dense_array[index] = 1.0 68 | return dense_array.astype('float32') 69 | 70 | class Preprocess_label_sparse_to_dense: 71 | 72 | def __init__(self, 73 | index_dict, 74 | sep_token=',', 75 | 
is_training=False): 76 | self.index_to_tag,self.tag_to_index = extract_dict(index_dict) 77 | self.sep_token = sep_token 78 | self.is_training = is_training 79 | self.max_index = 0 80 | for index in self.index_to_tag: 81 | self.max_index = max(index, self.max_index) 82 | self.seq_size = self.max_index + 1 83 | self.label_num = self.seq_size 84 | 85 | def __call__(self, index_str, augment): 86 | dense_array = np.zeros(self.seq_size) 87 | label_lst = index_str.split(self.sep_token) 88 | for label in label_lst: 89 | if label in self.tag_to_index: 90 | index = self.tag_to_index[label] 91 | dense_array[index] = 1.0 92 | return dense_array.astype('float32') 93 | -------------------------------------------------------------------------------- /src/model/video_head/nextvlad.py: -------------------------------------------------------------------------------- 1 | import tensorflow.contrib.slim as slim 2 | import tensorflow as tf 3 | 4 | class NeXtVLAD(): 5 | def __init__(self, feature_size, max_frames, nextvlad_cluster_size, expansion, groups, directly=False): 6 | self.feature_size = feature_size 7 | self.max_frames = max_frames 8 | self.nextvlad_cluster_size = nextvlad_cluster_size 9 | self.expansion = expansion 10 | self.groups = groups 11 | self.directly = directly 12 | 13 | def __call__(self, input, is_training, mask=None): 14 | input = slim.fully_connected(input, self.expansion * self.feature_size, activation_fn=None, 15 | weights_initializer=slim.variance_scaling_initializer()) 16 | 17 | attention = slim.fully_connected(input, self.groups, activation_fn=tf.nn.sigmoid, 18 | weights_initializer=slim.variance_scaling_initializer()) 19 | if mask is not None: 20 | attention = tf.multiply(attention, tf.expand_dims(mask, -1)) 21 | attention = tf.reshape(attention, [-1, self.max_frames*self.groups, 1]) 22 | feature_size = self.expansion * self.feature_size // self.groups 23 | 24 | cluster_weights = tf.get_variable("cluster_weights", 25 | [self.expansion*self.feature_size, self.groups*self.nextvlad_cluster_size], 26 | initializer=slim.variance_scaling_initializer() 27 | ) 28 | 29 | reshaped_input = tf.reshape(input, [-1, self.expansion * self.feature_size]) 30 | activation = tf.matmul(reshaped_input, cluster_weights) 31 | 32 | activation = slim.batch_norm( 33 | activation, 34 | center=True, 35 | scale=True, 36 | is_training=is_training, 37 | scope="cluster_bn", 38 | fused=False) 39 | 40 | activation = tf.reshape(activation, [-1, self.max_frames * self.groups, self.nextvlad_cluster_size]) 41 | activation = tf.nn.softmax(activation, axis=-1) 42 | activation = tf.multiply(activation, attention) 43 | # tf.summary.histogram("cluster_output", activation) 44 | a_sum = tf.reduce_sum(activation, -2, keep_dims=True) 45 | 46 | cluster_weights2 = tf.get_variable("cluster_weights2", 47 | [1, feature_size, self.nextvlad_cluster_size], 48 | initializer=slim.variance_scaling_initializer() 49 | ) 50 | a = tf.multiply(a_sum, cluster_weights2) 51 | 52 | activation = tf.transpose(activation, perm=[0, 2, 1]) 53 | 54 | reshaped_input = tf.reshape(input, [-1, self.max_frames * self.groups, feature_size]) 55 | vlad = tf.matmul(activation, reshaped_input) 56 | vlad = tf.transpose(vlad, perm=[0, 2, 1]) 57 | vlad = tf.subtract(vlad, a) 58 | 59 | vlad = tf.nn.l2_normalize(vlad, 1) 60 | 61 | vlad = tf.reshape(vlad, [-1, self.nextvlad_cluster_size * feature_size]) 62 | #return tf.reshape(vlad, (-1, 16, self.nextvlad_cluster_size * feature_size // 16)) 63 | vlad = slim.batch_norm(vlad, 64 | center=True, 65 | scale=True, 66 | 
is_training = is_training, 67 | scope="vlad_bn", 68 | fused=False) 69 | return vlad 70 | -------------------------------------------------------------------------------- /preprocess/feat_extract_main.py: -------------------------------------------------------------------------------- 1 | #encoding: utf-8 2 | import sys,os 3 | sys.path.append(os.getcwd()) 4 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 5 | 6 | import time 7 | import argparse 8 | import tqdm 9 | import random 10 | import glob 11 | import traceback 12 | 13 | from multimodal_feature_extract import MultiModalFeatureExtract 14 | 15 | 16 | def process_file(file_path, frame_npy_path, audio_npy_path, text_txt_path, image_jpg_path): 17 | if not os.path.exists(file_path): 18 | return 19 | try: 20 | print(file_path) 21 | gen.extract_feat(file_path, frame_npy_path, audio_npy_path, text_txt_path, image_jpg_path) 22 | except Exception as e: 23 | print(traceback.format_exc()) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--test_files_dir', default=None,type=str) 28 | parser.add_argument('--postfix', default='mp4', type=str) 29 | parser.add_argument('--frame_npy_folder', default='dataset/frame_npy', type=str) 30 | parser.add_argument('--audio_npy_folder', default='dataset/audio_npy', type=str) 31 | parser.add_argument('--image_jpg_folder', default='dataset/image_jpg', type=str) 32 | parser.add_argument('--text_txt_folder', default='dataset/text_txt', type=str) 33 | parser.add_argument('--datafile_path', default='dataset/datafile.txt') 34 | 35 | parser.add_argument('--extract_type', default=0, type=int) #0:ALL #1:VIDEO #2: AUDIO #3: TEXT 36 | 37 | parser.add_argument('--image_batch_size', default=32, type=int) 38 | parser.add_argument('--imgfeat_extractor', default='Youtube8M', type=str) 39 | parser.add_argument('--do_logging', default=0, type=int) 40 | 41 | args = parser.parse_args() 42 | os.makedirs(args.frame_npy_folder, exist_ok=True) 43 | os.makedirs(args.audio_npy_folder, exist_ok=True) 44 | os.makedirs(args.text_txt_folder, exist_ok=True) 45 | os.makedirs(args.image_jpg_folder, exist_ok=True) 46 | gen = MultiModalFeatureExtract(batch_size = args.image_batch_size, 47 | imgfeat_extractor = args.imgfeat_extractor, 48 | extract_video = args.extract_type==0 or args.extract_type==1, 49 | extract_audio = args.extract_type==0 or args.extract_type==2, 50 | extract_text = args.extract_type==0 or args.extract_type==3) 51 | 52 | file_paths = glob.glob(args.test_files_dir+'/*.'+args.postfix) 53 | random.shuffle(file_paths) 54 | files = tqdm.tqdm(file_paths, total=len(file_paths)) if args.do_logging == 1 else file_paths 55 | for file_path in files: 56 | vid = os.path.basename(file_path).split('.m')[0] 57 | frame_npy_path = os.path.join(args.frame_npy_folder, vid+'.npy') 58 | audio_npy_path = os.path.join(args.audio_npy_folder, vid+'.npy') 59 | image_jpg_path = os.path.join(args.image_jpg_folder, vid+'.jpg') 60 | text_txt_path = os.path.join(args.text_txt_folder, vid+'.txt') 61 | if args.extract_type == 1: 62 | audio_npy_path, text_txt_path, image_jpg_path = None, None, None 63 | elif args.extract_type == 2: 64 | frame_npy_path, text_txt_path, image_jpg_path = None, None, None 65 | elif args.extract_type == 3: 66 | frame_npy_path, audio_npy_path, image_jpg_path = None, None, None 67 | elif args.extract_type ==4: 68 | frame_npy_path, audio_npy_path, text_txt_path = None, None, None 69 | process_file(file_path, frame_npy_path, audio_npy_path, text_txt_path, image_jpg_path) 70 | 
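# Illustrative invocation of this script (a sketch, not taken from the repo's docs;
# dataset/videos is an assumed input directory, the other paths match the argparse defaults above):
#   python preprocess/feat_extract_main.py \
#       --test_files_dir dataset/videos --postfix mp4 \
#       --frame_npy_folder dataset/frame_npy --audio_npy_folder dataset/audio_npy \
#       --image_jpg_folder dataset/image_jpg --text_txt_folder dataset/text_txt \
#       --extract_type 0 --image_batch_size 32 --imgfeat_extractor Youtube8M
# extract_type selects the modalities to extract: 0 = all, 1 = video frames, 2 = audio, 3 = text.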
-------------------------------------------------------------------------------- /src/dataloader/preprocess/cnn_preprocessing/preprocessing_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains a factory for building various models.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | from preprocessing import cifarnet_preprocessing 24 | from preprocessing import inception_preprocessing 25 | from preprocessing import lenet_preprocessing 26 | from preprocessing import vgg_preprocessing 27 | 28 | slim = tf.contrib.slim 29 | 30 | 31 | def get_preprocessing(name, is_training=False): 32 | """Returns preprocessing_fn(image, height, width, **kwargs). 33 | 34 | Args: 35 | name: The name of the preprocessing function. 36 | is_training: `True` if the model is being used for training and `False` 37 | otherwise. 38 | 39 | Returns: 40 | preprocessing_fn: A function that preprocessing a single image (pre-batch). 41 | It has the following signature: 42 | image = preprocessing_fn(image, output_height, output_width, ...). 43 | 44 | Raises: 45 | ValueError: If Preprocessing `name` is not recognized. 
46 | """ 47 | preprocessing_fn_map = { 48 | 'cifarnet': cifarnet_preprocessing, 49 | 'inception': inception_preprocessing, 50 | 'inception_v1': inception_preprocessing, 51 | 'inception_v2': inception_preprocessing, 52 | 'inception_v3': inception_preprocessing, 53 | 'inception_v4': inception_preprocessing, 54 | 'inception_resnet_v2': inception_preprocessing, 55 | 'lenet': lenet_preprocessing, 56 | 'mobilenet_v1': inception_preprocessing, 57 | 'mobilenet_v2': inception_preprocessing, 58 | 'mobilenet_v2_035': inception_preprocessing, 59 | 'mobilenet_v2_140': inception_preprocessing, 60 | 'nasnet_mobile': inception_preprocessing, 61 | 'nasnet_large': inception_preprocessing, 62 | 'pnasnet_mobile': inception_preprocessing, 63 | 'pnasnet_large': inception_preprocessing, 64 | 'resnet_v1_50': vgg_preprocessing, 65 | 'resnet_v1_101': vgg_preprocessing, 66 | 'resnet_v1_152': vgg_preprocessing, 67 | 'resnet_v1_200': vgg_preprocessing, 68 | 'resnet_v2_50': vgg_preprocessing, 69 | 'resnet_v2_101': vgg_preprocessing, 70 | 'resnet_v2_152': vgg_preprocessing, 71 | 'resnet_v2_200': vgg_preprocessing, 72 | 'densenet121': vgg_preprocessing, 73 | 'densenet161': vgg_preprocessing, 74 | 'densenet169': vgg_preprocessing, 75 | 'vgg': vgg_preprocessing, 76 | 'vgg_a': vgg_preprocessing, 77 | 'vgg_16': vgg_preprocessing, 78 | 'vgg_19': vgg_preprocessing, 79 | } 80 | 81 | if name not in preprocessing_fn_map: 82 | raise ValueError('Preprocessing name [%s] was not recognized' % name) 83 | 84 | def preprocessing_fn(image, output_height, output_width, **kwargs): 85 | return preprocessing_fn_map[name].preprocess_image( 86 | image, output_height, output_width, is_training=is_training, **kwargs) 87 | 88 | return preprocessing_fn 89 | -------------------------------------------------------------------------------- /src/loss/loss.py: -------------------------------------------------------------------------------- 1 | """Provides definitions for non-regularized training or test losses.""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class BaseLoss(object): 7 | """Inherit from this class when implementing new losses.""" 8 | 9 | def calculate_loss(self, unused_predictions, unused_labels, **unused_params): 10 | """Calculates the average loss of the examples in a mini-batch. 11 | Args: 12 | unused_predictions: a 2-d tensor storing the prediction scores, in which 13 | each row represents a sample in the mini-batch and each column 14 | represents a class. 15 | unused_labels: a 2-d tensor storing the labels, which has the same shape 16 | as the unused_predictions. The labels must be in the range of 0 and 1. 17 | unused_params: loss specific parameters. 18 | Returns: 19 | A scalar loss tensor. 20 | """ 21 | raise NotImplementedError() 22 | 23 | 24 | class CrossEntropyLoss(BaseLoss): 25 | """Calculate the cross entropy loss between the predictions and labels. 
26 | """ 27 | 28 | def calculate_loss(self, predictions, labels, **unused_params): 29 | with tf.name_scope("loss_xent"): 30 | epsilon = 1e-8 31 | label_smooth_rate = unused_params.get('label_smooth_rate', 0.0) 32 | float_labels = tf.cast(labels, tf.float32)*(1.0-label_smooth_rate) + \ 33 | (1.0-tf.cast(labels, tf.float32)) * label_smooth_rate 34 | 35 | cross_entropy_loss = float_labels * tf.log(predictions + epsilon) + ( 36 | 1 - float_labels) * tf.log(1 - predictions + epsilon) 37 | cross_entropy_loss = tf.negative(cross_entropy_loss) 38 | alpha = unused_params.get('loss_weight', 1.0) #alpha shape=[batch_size] 39 | return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1)*alpha) 40 | 41 | 42 | class HingeLoss(BaseLoss): 43 | """Calculate the hinge loss between the predictions and labels. 44 | Note the subgradient is used in the backpropagation, and thus the optimization 45 | may converge slower. The predictions trained by the hinge loss are between -1 46 | and +1. 47 | """ 48 | 49 | def calculate_loss(self, predictions, labels, b=1.0, **unused_params): 50 | with tf.name_scope("loss_hinge"): 51 | float_labels = tf.cast(labels, tf.float32) 52 | all_zeros = tf.zeros(tf.shape(float_labels), dtype=tf.float32) 53 | all_ones = tf.ones(tf.shape(float_labels), dtype=tf.float32) 54 | sign_labels = tf.subtract(tf.scalar_mul(2, float_labels), all_ones) 55 | hinge_loss = tf.maximum( 56 | all_zeros, tf.scalar_mul(b, all_ones) - sign_labels * predictions) 57 | return tf.reduce_mean(tf.reduce_sum(hinge_loss, 1)) 58 | 59 | 60 | class SoftmaxLoss(BaseLoss): 61 | """Calculate the softmax loss between the predictions and labels. 62 | The function calculates the loss in the following way: first we feed the 63 | predictions to the softmax activation function and then we calculate 64 | the minus linear dot product between the logged softmax activations and the 65 | normalized ground truth label. 66 | It is an extension to the one-hot label. It allows for more than one positive 67 | labels for each sample. 
68 | """ 69 | 70 | def calculate_loss(self, predictions, labels, **unused_params): 71 | with tf.name_scope("loss_softmax"): 72 | epsilon = 1e-8 73 | float_labels = tf.cast(labels, tf.float32) 74 | # l1 normalization (labels are no less than 0) 75 | label_rowsum = tf.maximum( 76 | tf.reduce_sum(float_labels, 1, keep_dims=True), 77 | epsilon) 78 | norm_float_labels = tf.div(float_labels, label_rowsum) 79 | softmax_outputs = tf.nn.softmax(predictions) 80 | softmax_loss = tf.negative(tf.reduce_sum( 81 | tf.multiply(norm_float_labels, tf.log(softmax_outputs)), 1)) 82 | return tf.reduce_mean(softmax_loss) 83 | -------------------------------------------------------------------------------- /src/dataloader/dataloader.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import yaml 3 | import tensorflow as tf 4 | import os,sys 5 | sys.path.append(os.getcwd()) 6 | from src.dataloader.data_generator import Data_Generator 7 | 8 | class Data_Pipeline: 9 | 10 | def __init__(self, data_config): 11 | 12 | self.data_config = data_config 13 | self.batch_size = data_config['batch_size'] 14 | self.data_generator = Data_Generator(data_config=self.data_config) 15 | self.sample_generator = self.data_generator.get_train_sample_generator 16 | self.get_valid_sample_generator_dict = self.data_generator.get_valid_sample_generator_dict 17 | self.label_num_dict = self.data_generator.label_num_dict 18 | self.dname_string_list = self.data_generator.dname_string_list 19 | self.data_shape_list = self.data_generator.data_shape_list 20 | 21 | self.data_num = len(self.dname_string_list) 22 | self.dtype_map_dict = {'bool':tf.bool, 23 | 'int16':tf.int16, 24 | 'int32': tf.int32, 25 | 'int64': tf.int64, 26 | 'float16':tf.float16, 27 | 'float32': tf.float32, 28 | 'float64': tf.float64, 29 | 'string': tf.string} 30 | self.dtype_list = [self.dtype_map_dict[string] for string in self.data_generator.dtype_string_list] 31 | self.dataset = tf.data.Dataset.from_generator(self.sample_generator, 32 | tuple(self.dtype_list), 33 | tuple(self.data_shape_list)) 34 | self.dataset = self.dataset.batch(self.batch_size).prefetch(20) 35 | self.iterator = self.dataset.make_initializable_iterator() 36 | self.data_op_lst = self.iterator.get_next() 37 | self.name_to_data_op = {} 38 | self.data_op_list = [] 39 | for index in range(self.data_num): 40 | name = self.dname_string_list[index] 41 | self.name_to_data_op[name] = self.data_op_lst[index] 42 | self.data_op_list.append(self.name_to_data_op[name]) 43 | 44 | if __name__ == '__main__': 45 | import argparse 46 | import time 47 | 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('--data_config',type=str) 50 | args = parser.parse_args() 51 | 52 | data_config = yaml.load(open(args.data_config)) 53 | data_pipeline = Data_Pipeline(data_config = data_config['DatasetConfig']) 54 | 55 | for name in data_pipeline.name_to_data_op: 56 | print(name) 57 | print(data_pipeline.name_to_data_op[name]) 58 | 59 | Sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 60 | Sess_config.gpu_options.allow_growth = True 61 | with tf.Session(config=Sess_config) as sess: 62 | sess.run(data_pipeline.iterator.initializer) 63 | sess.run(tf.local_variables_initializer()) 64 | sess.run(tf.global_variables_initializer()) 65 | for _ in range(10): 66 | print(data_pipeline.label_num_dict) 67 | start_time = time.time() 68 | data_list = sess.run(data_pipeline.data_op_list) 69 | for data,name in 
zip(data_list,data_pipeline.dname_string_list): 70 | print(name,data.shape) 71 | #time.sleep(0.5) 72 | end_time = time.time() 73 | print(end_time-start_time) 74 | 75 | def valid(): 76 | valid_sample_generator_dict = data_pipeline.get_valid_sample_generator_dict() 77 | for source_name,generator in valid_sample_generator_dict.items(): 78 | for sample in generator: 79 | for output_name, x in sample.items(): 80 | print(source_name, output_name,x.shape) 81 | valid() 82 | -------------------------------------------------------------------------------- /preprocess/audio_extractor/vggish_input.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Compute input examples for VGGish from audio waveform.""" 17 | 18 | import numpy as np 19 | import resampy 20 | 21 | from preprocess.audio_extractor import mel_features 22 | 23 | from preprocess.audio_extractor import vggish_params 24 | 25 | import soundfile as sf 26 | 27 | 28 | def waveform_to_examples(data, sample_rate): 29 | """Converts audio waveform into an array of examples for VGGish. 30 | 31 | Args: 32 | data: np.array of either one dimension (mono) or two dimensions 33 | (multi-channel, with the outer dimension representing channels). 34 | Each sample is generally expected to lie in the range [-1.0, +1.0], 35 | although this is not required. 36 | sample_rate: Sample rate of data. 37 | 38 | Returns: 39 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents 40 | a sequence of examples, each of which contains a patch of log mel 41 | spectrogram, covering num_frames frames of audio and num_bands mel frequency 42 | bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. 43 | """ 44 | # Convert to mono. 45 | if len(data.shape) > 1: 46 | data = np.mean(data, axis=1) 47 | # Resample to the rate assumed by VGGish. 48 | if sample_rate != vggish_params.SAMPLE_RATE: 49 | data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) 50 | 51 | # Compute log mel spectrogram features. 52 | log_mel = mel_features.log_mel_spectrogram( 53 | data, 54 | audio_sample_rate=vggish_params.SAMPLE_RATE, 55 | log_offset=vggish_params.LOG_OFFSET, 56 | window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, 57 | hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, 58 | num_mel_bins=vggish_params.NUM_MEL_BINS, 59 | lower_edge_hertz=vggish_params.MEL_MIN_HZ, 60 | upper_edge_hertz=vggish_params.MEL_MAX_HZ) 61 | 62 | # Frame features into examples. 
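  # A clarifying note (assuming the standard VGGish constants: 10 ms STFT hop,
  # 0.96 s example window, 0.96 s example hop): the framing below yields
  # non-overlapping examples of 96 log-mel frames each; the exact sizes follow
  # from the values defined in vggish_params.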
63 | features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS 64 | example_window_length = int(round( 65 | vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) 66 | example_hop_length = int(round( 67 | vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) 68 | log_mel_examples = mel_features.frame( 69 | log_mel, 70 | window_length=example_window_length, 71 | hop_length=example_hop_length) 72 | return log_mel_examples 73 | 74 | 75 | def wavfile_to_examples(wav_file): 76 | """Convenience wrapper around waveform_to_examples() for a common WAV format. 77 | 78 | Args: 79 | wav_file: String path to a file, or a file-like object. The file 80 | is assumed to contain WAV audio data with signed 16-bit PCM samples. 81 | 82 | Returns: 83 | See waveform_to_examples. 84 | """ 85 | #wav_data, sr = sf.read(wav_file) 86 | #print (wav_data) 87 | wav_data, sr = sf.read(wav_file, dtype='int16') 88 | #print (wav_data) 89 | #print (sr) 90 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype 91 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] 92 | #print (samples) 93 | return waveform_to_examples(samples, sr) 94 | -------------------------------------------------------------------------------- /preprocess/imgfeat_extractor/efficientnet_extractor.py: -------------------------------------------------------------------------------- 1 | # Author: wuxsmail@163.com 2 | 3 | import time 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | import cv2 8 | 9 | from efficientnet.tfkeras import EfficientNetB5 10 | from efficientnet.tfkeras import center_crop_and_resize, preprocess_input 11 | 12 | 13 | def center_crop_and_resize(frame, size): 14 | """change shape of a frame with shape (h, w, 3) into shape (size, size, 3) 15 | """ 16 | # prepare_frame 17 | assert len(frame.shape) == 3 and frame.shape[-1] == 3 18 | if frame.dtype != np.uint8: 19 | frame = frame.astype(np.uint8) 20 | 21 | # center crop process 22 | y, x = frame.shape[0:2] 23 | if x != y: 24 | min_dim = min(y, x) 25 | start_x = (x // 2) - (min_dim // 2) 26 | start_y = (y // 2) - (min_dim // 2) 27 | frame = frame[start_y:start_y+min_dim,start_x:start_x+min_dim] 28 | 29 | # resize process 30 | h, w = frame.shape[:2] 31 | if h * w < size ** 2: 32 | frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_CUBIC) 33 | elif not (h == w == size): 34 | frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_AREA) 35 | return np.expand_dims(frame, 0).astype(np.float32) 36 | 37 | 38 | class EfficientNetExtractor(object): 39 | """Extracts EfficientNet features for RGB frames. 
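    A minimal usage sketch (the frame list and batch size are illustrative;
    frames are H x W x 3 RGB arrays):
      extractor = EfficientNetExtractor(img_size=456)
      feats = extractor.extract_rgb_frame_features_list(frames, batch_size=8)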
40 | """ 41 | 42 | def __init__(self, img_size=456, max_pooling=True): 43 | self.index = 0 44 | config = tf.ConfigProto() 45 | config.gpu_options.allow_growth = True 46 | config.gpu_options.per_process_gpu_memory_fraction = 1.0 47 | self.session = tf.compat.v1.Session(config=config) 48 | self.graph = tf.compat.v1.get_default_graph() 49 | tf.compat.v1.keras.backend.set_session(self.session) 50 | self.model = EfficientNetB5( 51 | weights='pretrained/efficientnet/efficientnet-b5_noisy-student_notop.h5', 52 | include_top=False, 53 | pooling='avg') 54 | self.img_size = img_size 55 | self.block7 = self.model.output 56 | self.block6 = self.model.layers[-48].output 57 | 58 | def extract_rgb_frame_features(self, frame_rgb): 59 | assert len(frame_rgb.shape) == 4 60 | assert frame_rgb.shape[3] == 3 # 3 channels (R, G, B) 61 | with self.graph.as_default(): 62 | tf.keras.backend.set_session(self.session) 63 | block7, block6 = self.session.run([self.block7, self.block6], feed_dict={self.model.input: frame_rgb}) 64 | return np.hstack([block7, np.reshape(block6, [block6.shape[0], -1, block6.shape[-1]]).mean(1)]) 65 | 66 | def extract_rgb_frame_features_list(self, frame_rgb_list, batch_size): 67 | self.index += 1 68 | def _predict_batch(): 69 | if len(frame_list) > 0: 70 | batch_inputs = preprocess_input(np.vstack(frame_list)) 71 | batch_feat = self.extract_rgb_frame_features(batch_inputs) 72 | feature_list.extend(frame for frame in batch_feat) 73 | 74 | frame_list = [] 75 | feature_list = [] 76 | for frame in frame_rgb_list: 77 | frame_list.append(center_crop_and_resize(frame, self.img_size)) 78 | if len(frame_list) == batch_size: 79 | _predict_batch() 80 | frame_list = [] 81 | else: 82 | _predict_batch() 83 | msg = "[%s] Video-%d has Frames: %d | Feature Dimension: %s" % (time.asctime(), 84 | self.index, 85 | len(feature_list), 86 | feature_list[-1].shape[-1]) 87 | with open("/home/tione/notebook/log/extract_train.log", "a+") as f: 88 | f.write(msg + "\n") 89 | return feature_list 90 | -------------------------------------------------------------------------------- /utils/export_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities to export a model for batch prediction.""" 15 | 16 | import tensorflow as tf 17 | 18 | from tensorflow.python.saved_model import builder as saved_model_builder 19 | from tensorflow.python.saved_model import signature_constants 20 | from tensorflow.python.saved_model import signature_def_utils 21 | from tensorflow.python.saved_model import tag_constants 22 | from tensorflow.python.saved_model import utils as saved_model_utils 23 | 24 | _TOP_PREDICTIONS_IN_OUTPUT = 82 25 | 26 | class ModelExporter(object): 27 | 28 | def __init__(self, model, reader): 29 | self.model = model 30 | self.reader = reader 31 | 32 | with tf.Graph().as_default() as graph: 33 | self.inputs, self.outputs = self.build_inputs_and_outputs() 34 | self.graph = graph 35 | self.saver = tf.train.Saver(tf.global_variables(), sharded=True) 36 | 37 | def export_model(self, model_dir, global_step_val, last_checkpoint): 38 | """Exports the model so that it can used for batch predictions.""" 39 | 40 | with self.graph.as_default(): 41 | with tf.Session() as session: 42 | session.run(tf.global_variables_initializer()) 43 | self.saver.restore(session, last_checkpoint) 44 | 45 | signature = signature_def_utils.build_signature_def( 46 | inputs=self.inputs, 47 | outputs=self.outputs, 48 | method_name=signature_constants.PREDICT_METHOD_NAME) 49 | 50 | signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: 51 | signature} 52 | 53 | model_builder = saved_model_builder.SavedModelBuilder(model_dir) 54 | model_builder.add_meta_graph_and_variables(session, 55 | tags=[tag_constants.SERVING], 56 | signature_def_map=signature_map, 57 | clear_devices=True) 58 | model_builder.save() 59 | 60 | def build_inputs_and_outputs(self): 61 | input_name_list = self.reader.dname_string_list #模型输入变量名 62 | inupt_shape_list = self.reader.data_shape_list #模型输入shape 63 | input_dtype_list = self.reader.dtype_list #模型输入类型 64 | 65 | inputs_dict={} 66 | for input_name,input_shape,input_dtype in zip(input_name_list, inupt_shape_list, input_dtype_list): 67 | inputs_dict[input_name] = tf.placeholder(shape=[None]+input_shape, dtype=input_dtype, name=input_name) #add batch size dim 68 | 69 | with tf.variable_scope("tower"): 70 | result = self.model(inputs_dict,is_training=False) 71 | predictions = result["tagging_output_fusion"]["predictions"] 72 | video_embedding = result["video_embedding"] 73 | top_predictions, top_indices = tf.nn.top_k(predictions, _TOP_PREDICTIONS_IN_OUTPUT) 74 | 75 | #inputs = {"video_input_placeholder": saved_model_utils.build_tensor_info(video_input_placeholder), 76 | # "audio_input_placeholder": saved_model_utils.build_tensor_info(audio_input_placeholder), 77 | # "text_input_placeholder": saved_model_utils.build_tensor_info(text_input_placeholder), 78 | # "num_frames_placeholder": saved_model_utils.build_tensor_info(num_frames_placeholder)} 79 | inputs = {key:saved_model_utils.build_tensor_info(val) for key,val in inputs_dict.items()} 80 | outputs = { 81 | "class_indexes": saved_model_utils.build_tensor_info(top_indices), 82 | "video_embedding": saved_model_utils.build_tensor_info(video_embedding), 83 | "predictions": saved_model_utils.build_tensor_info(top_predictions)} 84 | 85 | return inputs, outputs 86 | -------------------------------------------------------------------------------- /preprocess/audio_extractor/vggish_postprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Post-process embeddings from VGGish.""" 17 | import os,sys 18 | sys.path.append(os.path.dirname(__file__)) 19 | 20 | import numpy as np 21 | 22 | import vggish_params 23 | 24 | 25 | class Postprocessor(object): 26 | """Post-processes VGGish embeddings. 27 | 28 | The initial release of AudioSet included 128-D VGGish embeddings for each 29 | segment of AudioSet. These released embeddings were produced by applying 30 | a PCA transformation (technically, a whitening transform is included as well) 31 | and 8-bit quantization to the raw embedding output from VGGish, in order to 32 | stay compatible with the YouTube-8M project which provides visual embeddings 33 | in the same format for a large set of YouTube videos. This class implements 34 | the same PCA (with whitening) transform; the 8-bit quantization step is kept below but disabled in this copy. 35 | """ 36 | 37 | def __init__(self, pca_params_npz_path): 38 | """Constructs a postprocessor. 39 | 40 | Args: 41 | pca_params_npz_path: Path to a NumPy-format .npz file that 42 | contains the PCA parameters used in postprocessing. 43 | """ 44 | params = np.load(pca_params_npz_path) 45 | self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME] 46 | # Load means into a column vector for easier broadcasting later. 47 | self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1) 48 | assert self._pca_matrix.shape == ( 49 | vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), ( 50 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,)) 51 | assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), ( 52 | 'Bad PCA means shape: %r' % (self._pca_means.shape,)) 53 | 54 | def postprocess(self, embeddings_batch): 55 | """Applies postprocessing to a batch of embeddings. 56 | 57 | Args: 58 | embeddings_batch: An nparray of shape [batch_size, embedding_size] 59 | containing output from the embedding layer of VGGish. 60 | 61 | Returns: 62 | An nparray of the same shape as the input, containing the PCA-transformed 63 | (whitened) version of the input; uint8 quantization is disabled below, so float values are returned. 64 | """ 65 | assert len(embeddings_batch.shape) == 2, ( 66 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,)) 67 | assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, ( 68 | 'Bad batch shape: %r' % (embeddings_batch.shape,)) 69 | 70 | # Apply PCA. 71 | # - Embeddings come in as [batch_size, embedding_size]. 72 | # - Transpose to [embedding_size, batch_size]. 73 | # - Subtract pca_means column vector from each column. 74 | # - Premultiply by PCA matrix of shape [output_dims, input_dims] 75 | # where both are equal to embedding_size in our case. 76 | # - Transpose result back to [batch_size, embedding_size].
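    #   For example, with the standard 128-D VGGish embeddings and a batch of 4:
    #   (128, 128) dot (128, 4) -> (128, 4), transposed back to (4, 128).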
77 | pca_applied = np.dot(self._pca_matrix, 78 | (embeddings_batch.T - self._pca_means)).T 79 | 80 | # Quantize by: 81 | # - clipping to [min, max] range 82 | #clipped_embeddings = np.clip( 83 | # pca_applied, vggish_params.QUANTIZE_MIN_VAL, 84 | # vggish_params.QUANTIZE_MAX_VAL) 85 | # - convert to 8-bit in range [0.0, 255.0] 86 | #quantized_embeddings = ( 87 | # (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) * 88 | # (255.0 / 89 | # (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL))) 90 | # - cast 8-bit float to uint8 91 | #quantized_embeddings = quantized_embeddings.astype(np.uint8) 92 | 93 | #return quantized_embeddings 94 | return pca_applied 95 | -------------------------------------------------------------------------------- /utils/metrics/mean_average_precision_calculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Calculate the mean average precision. 16 | 17 | It provides an interface for calculating mean average precision 18 | for an entire list or the top-n ranked items. 19 | 20 | Example usages: 21 | We first call the function accumulate many times to process parts of the ranked 22 | list. After processing all the parts, we call peek_map_at_n 23 | to calculate the mean average precision. 24 | 25 | ``` 26 | import random 27 | import numpy as np 28 | p = np.array([[random.random() for _ in range(50)] for _ in range(1000)]) 29 | a = np.array([[random.choice([0, 1]) for _ in range(50)] 30 | for _ in range(1000)]) 31 | 32 | # mean average precision for 50 classes. 33 | calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator( 34 | num_class=50) 35 | calculator.accumulate(p, a) 36 | aps = calculator.peek_map_at_n() 37 | ``` 38 | """ 39 | 40 | import utils.metrics.average_precision_calculator as average_precision_calculator 41 | 42 | 43 | class MeanAveragePrecisionCalculator(object): 44 | """Calculates the mean average precision across classes. 45 | """ 46 | 47 | def __init__(self, num_class): 48 | """Construct a calculator to calculate the (macro) average precision. 49 | 50 | Args: 51 | num_class: An integer greater than 1 specifying the number of classes. 52 | 53 | Raises: 58 | ValueError: An error occurred when num_class is not an integer 59 | greater than 1.
60 | """ 61 | if not isinstance(num_class, int) or num_class <= 1: 62 | raise ValueError("num_class must be a positive integer.") 63 | 64 | self._ap_calculators = [] # member of AveragePrecisionCalculator 65 | self._num_class = num_class # total number of classes 66 | for i in range(num_class): 67 | self._ap_calculators.append( 68 | average_precision_calculator.AveragePrecisionCalculator()) 69 | 70 | def accumulate(self, predictions, actuals, num_positives=None): 71 | """Accumulate the predictions and their ground truth labels. 72 | 73 | Args: 74 | predictions: A list of lists storing the prediction scores. The outer 75 | dimension corresponds to classes. 76 | actuals: A list of lists storing the ground truth labels. The dimensions 77 | should correspond to the predictions input. Any value 78 | larger than 0 will be treated as positives, otherwise as negatives. 79 | num_positives: If provided, it is a list of numbers representing the 80 | number of true positives for each class. If not provided, the number of 81 | true positives will be inferred from the 'actuals' array. 82 | 83 | Raises: 84 | ValueError: An error occurred when the shape of predictions and actuals 85 | does not match. 86 | """ 87 | if not num_positives: 88 | num_positives = [None for i in predictions.shape[1]] 89 | 90 | calculators = self._ap_calculators 91 | for i in range(len(predictions)): 92 | calculators[i].accumulate(predictions[i], actuals[i], num_positives[i]) 93 | 94 | def clear(self): 95 | for calculator in self._ap_calculators: 96 | calculator.clear() 97 | 98 | def is_empty(self): 99 | return ([calculator.heap_size for calculator in self._ap_calculators] == 100 | [0 for _ in range(self._num_class)]) 101 | 102 | def peek_map_at_n(self): 103 | """Peek the non-interpolated mean average precision at n. 104 | 105 | Returns: 106 | An array of non-interpolated average precision at n (default 0) for each 107 | class. 108 | """ 109 | aps = [self._ap_calculators[i].peek_ap_at_n() 110 | for i in range(self._num_class)] 111 | return aps 112 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/val.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.0.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_0.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_0.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.1.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_1.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_1.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.2.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_2.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_2.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.3.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_3.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_3.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.4.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_4.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_4.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.5.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_5.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_5.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.6.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_6.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_6.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.7.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_7.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_7.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.8.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_8.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_8.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.9.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_9.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_9.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/cnn_preprocessing/cifarnet_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Provides utilities to preprocess images in CIFAR-10. 16 | 17 | """ 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import tensorflow as tf 24 | 25 | _PADDING = 4 26 | 27 | slim = tf.contrib.slim 28 | 29 | 30 | def preprocess_for_train(image, 31 | output_height, 32 | output_width, 33 | padding=_PADDING, 34 | add_image_summaries=True): 35 | """Preprocesses the given image for training. 36 | 37 | Note that the actual resizing scale is sampled from 38 | [`resize_size_min`, `resize_size_max`]. 39 | 40 | Args: 41 | image: A `Tensor` representing an image of arbitrary size. 42 | output_height: The height of the image after preprocessing. 43 | output_width: The width of the image after preprocessing. 44 | padding: The amound of padding before and after each dimension of the image. 
45 | add_image_summaries: Enable image summaries. 46 | 47 | Returns: 48 | A preprocessed image. 49 | """ 50 | if add_image_summaries: 51 | tf.summary.image('image', tf.expand_dims(image, 0)) 52 | 53 | # Transform the image to floats. 54 | image = tf.to_float(image) 55 | if padding > 0: 56 | image = tf.pad(image, [[padding, padding], [padding, padding], [0, 0]]) 57 | # Randomly crop a [height, width] section of the image. 58 | distorted_image = tf.random_crop(image, 59 | [output_height, output_width, 3]) 60 | 61 | # Randomly flip the image horizontally. 62 | distorted_image = tf.image.random_flip_left_right(distorted_image) 63 | 64 | if add_image_summaries: 65 | tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0)) 66 | 67 | # Because these operations are not commutative, consider randomizing 68 | # the order their operation. 69 | distorted_image = tf.image.random_brightness(distorted_image, 70 | max_delta=63) 71 | distorted_image = tf.image.random_contrast(distorted_image, 72 | lower=0.2, upper=1.8) 73 | # Subtract off the mean and divide by the variance of the pixels. 74 | return tf.image.per_image_standardization(distorted_image) 75 | 76 | 77 | def preprocess_for_eval(image, output_height, output_width, 78 | add_image_summaries=True): 79 | """Preprocesses the given image for evaluation. 80 | 81 | Args: 82 | image: A `Tensor` representing an image of arbitrary size. 83 | output_height: The height of the image after preprocessing. 84 | output_width: The width of the image after preprocessing. 85 | add_image_summaries: Enable image summaries. 86 | 87 | Returns: 88 | A preprocessed image. 89 | """ 90 | if add_image_summaries: 91 | tf.summary.image('image', tf.expand_dims(image, 0)) 92 | # Transform the image to floats. 93 | image = tf.to_float(image) 94 | 95 | # Resize and crop if needed. 96 | resized_image = tf.image.resize_image_with_crop_or_pad(image, 97 | output_width, 98 | output_height) 99 | if add_image_summaries: 100 | tf.summary.image('resized_image', tf.expand_dims(resized_image, 0)) 101 | 102 | # Subtract off the mean and divide by the variance of the pixels. 103 | return tf.image.per_image_standardization(resized_image) 104 | 105 | 106 | def preprocess_image(image, output_height, output_width, is_training=False, 107 | add_image_summaries=True): 108 | """Preprocesses the given image. 109 | 110 | Args: 111 | image: A `Tensor` representing an image of arbitrary size. 112 | output_height: The height of the image after preprocessing. 113 | output_width: The width of the image after preprocessing. 114 | is_training: `True` if we're preprocessing the image for training and 115 | `False` otherwise. 116 | add_image_summaries: Enable image summaries. 117 | 118 | Returns: 119 | A preprocessed image. 
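  Example (illustrative only, assuming `raw_image` is a decoded image tensor;
  CIFAR-10 inputs are typically processed at 32x32):
    image = preprocess_image(raw_image, 32, 32, is_training=is_training)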
120 | """ 121 | if is_training: 122 | return preprocess_for_train( 123 | image, output_height, output_width, 124 | add_image_summaries=add_image_summaries) 125 | else: 126 | return preprocess_for_eval( 127 | image, output_height, output_width, 128 | add_image_summaries=add_image_summaries) 129 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/frames_npy_preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | import random 5 | import jieba 6 | 7 | 8 | def temporal_shift(src, shift_ratio=0.2): 9 | ts, fs = src.shape 10 | shift_dim = max(int(fs * shift_ratio) // 2, 1) 11 | out = np.zeros_like(src) 12 | out[1:, :shift_dim] = src[:-1, :shift_dim] # shift later 13 | out[:-1, -shift_dim:] = src[1:, -shift_dim:] # shift earlier 14 | out[:, shift_dim:-shift_dim] = src[:, shift_dim:-shift_dim] # no shift 15 | return out 16 | 17 | 18 | def data_augment(src, noisy=0.5): 19 | return src + np.random.normal(0, noisy * np.std(src), size=src.shape) 20 | 21 | 22 | def load_embeddings(path="/home/tione/notebook/VideoStructuring/taac2021_tagging_pytorchyyds/pretrained/word_embed/Tencent_AILab_ChineseEmbedding_cut100w.txt"): 23 | embeddings = {} 24 | with open(path) as f: 25 | for row in f: 26 | char, score = row.strip().split(" ", 1) 27 | embeddings[char] = np.fromstring(score, sep=" ") 28 | return embeddings 29 | 30 | 31 | word2vec = load_embeddings() 32 | def concat_w2v(text_path, frames): 33 | with open(text_path.replace("video_npy/Youtube8M/", "text_txt/").replace(".npy", ".txt")) as f: 34 | data = eval(f.read().strip()) 35 | tokens = list(jieba.cut(data["video_asr"].replace("|", ""))) 36 | if len(tokens) < len(frames): 37 | text = data["video_ocr"] 38 | for char in '''0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*|()~`·、[]【】?、。,,.';;‘:""''': 39 | text = text.replace(char, "") 40 | tokens = list(jieba.cut(text)) 41 | window_size = max(len(tokens) // len(frames), 1) 42 | embeddings = [] 43 | for i in range(len(frames)): 44 | start = i * window_size 45 | frame_embed = np.zeros((200,)) 46 | k = 0.0 47 | for token in tokens[start:start + window_size]: 48 | if token in word2vec: 49 | frame_embed += word2vec[token] 50 | k += 1 51 | if k > 0: 52 | frame_embed = frame_embed / k 53 | embeddings.append(frame_embed) 54 | embeddings = np.vstack(embeddings) 55 | return np.hstack([frames, embeddings]) 56 | 57 | 58 | def resize_axis(tensor, axis, new_size, fill_value=0): 59 | tensor = tf.convert_to_tensor(tensor) 60 | shape = tf.unstack(tf.shape(tensor)) 61 | 62 | pad_shape = shape[:] 63 | pad_shape[axis] = tf.maximum(0, new_size - shape[axis]) 64 | 65 | shape[axis] = tf.minimum(shape[axis], new_size) 66 | shape = tf.stack(shape) 67 | 68 | resized = tf.concat([ 69 | tf.slice(tensor, tf.zeros_like(shape), shape), 70 | tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype)) 71 | ], axis) 72 | 73 | # Update shape. 74 | new_shape = tensor.get_shape().as_list() # A copy is being made. 
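    # Fix the static shape along `axis` to `new_size`: the concat above has already
    # truncated or zero-padded the dynamic tensor, so callers (e.g. padding frame
    # features to max_frames with axis=0) see a fully defined dimension.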
75 | new_shape[axis] = new_size 76 | resized.set_shape(new_shape) 77 | return resized 78 | 79 | 80 | class Preprocess: 81 | 82 | def __init__(self, 83 | max_frames, 84 | return_frames_num, 85 | feat_dim = 128, 86 | is_training=False, 87 | return_idx = False): 88 | self.max_frames = max_frames 89 | self.return_frames_num = return_frames_num 90 | self.is_training = is_training 91 | self.return_idx = return_idx 92 | self.feat_dim = feat_dim 93 | self.frames_placeholder = tf.placeholder(shape=[None,None],dtype=tf.float32) 94 | self.num_frames = tf.minimum(tf.shape(self.frames_placeholder)[0], self.max_frames) 95 | self.feature_matrix = resize_axis(self.frames_placeholder,axis=0,new_size=self.max_frames) 96 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 97 | sess_config.gpu_options.allow_growth = True 98 | self.sess = tf.Session(config=sess_config) 99 | 100 | def __call__(self, frames_npy_fn, augment): 101 | if os.path.exists(frames_npy_fn): 102 | frames = np.load(frames_npy_fn) 103 | assert frames.shape[-1] in (2352, 128) 104 | if augment > 0: 105 | if frames.shape[-1] == 2352: 106 | frames = np.hstack([data_augment(frames[:, :2048], 0.5), 107 | data_augment(frames[:, 2048:], 0.5)]) 108 | elif frames.shape[-1] == 128: 109 | frames = data_augment(frames, 0.5) 110 | if frames.shape[-1] != 128: 111 | frames = concat_w2v(frames_npy_fn, frames) 112 | frames = temporal_shift(frames) 113 | else: 114 | print("!"*100+"\n Warning: file {} not exits".format(frames_npy_fn)) 115 | frames = np.zeros((1, self.feat_dim)) 116 | feature_matrix,num_frames = self.sess.run([self.feature_matrix, self.num_frames],feed_dict={self.frames_placeholder:frames}) 117 | idx = os.path.basename(frames_npy_fn).split('.')[0] 118 | return_list = [] 119 | return_list.append(feature_matrix) 120 | if self.return_frames_num: 121 | return_list.append(num_frames) 122 | if self.return_idx: 123 | return_list.append(idx) 124 | return tuple(return_list) 125 | -------------------------------------------------------------------------------- /src/model/cover_head/mobilenet_v1_eval.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Validate mobilenet_v1 with options for quantization.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import math 22 | import tensorflow as tf 23 | 24 | from datasets import dataset_factory 25 | from nets import mobilenet_v1 26 | from preprocessing import preprocessing_factory 27 | 28 | slim = tf.contrib.slim 29 | 30 | flags = tf.app.flags 31 | 32 | flags.DEFINE_string('master', '', 'Session master') 33 | flags.DEFINE_integer('batch_size', 250, 'Batch size') 34 | flags.DEFINE_integer('num_classes', 1001, 'Number of classes to distinguish') 35 | flags.DEFINE_integer('num_examples', 50000, 'Number of examples to evaluate') 36 | flags.DEFINE_integer('image_size', 224, 'Input image resolution') 37 | flags.DEFINE_float('depth_multiplier', 1.0, 'Depth multiplier for mobilenet') 38 | flags.DEFINE_bool('quantize', False, 'Quantize training') 39 | flags.DEFINE_string('checkpoint_dir', '', 'The directory for checkpoints') 40 | flags.DEFINE_string('eval_dir', '', 'Directory for writing eval event logs') 41 | flags.DEFINE_string('dataset_dir', '', 'Location of dataset') 42 | 43 | FLAGS = flags.FLAGS 44 | 45 | 46 | def imagenet_input(is_training): 47 | """Data reader for imagenet. 48 | 49 | Reads in imagenet data and performs pre-processing on the images. 50 | 51 | Args: 52 | is_training: bool specifying if train or validation dataset is needed. 53 | Returns: 54 | A batch of images and labels. 55 | """ 56 | if is_training: 57 | dataset = dataset_factory.get_dataset('imagenet', 'train', 58 | FLAGS.dataset_dir) 59 | else: 60 | dataset = dataset_factory.get_dataset('imagenet', 'validation', 61 | FLAGS.dataset_dir) 62 | 63 | provider = slim.dataset_data_provider.DatasetDataProvider( 64 | dataset, 65 | shuffle=is_training, 66 | common_queue_capacity=2 * FLAGS.batch_size, 67 | common_queue_min=FLAGS.batch_size) 68 | [image, label] = provider.get(['image', 'label']) 69 | 70 | image_preprocessing_fn = preprocessing_factory.get_preprocessing( 71 | 'mobilenet_v1', is_training=is_training) 72 | 73 | image = image_preprocessing_fn(image, FLAGS.image_size, FLAGS.image_size) 74 | 75 | images, labels = tf.train.batch( 76 | tensors=[image, label], 77 | batch_size=FLAGS.batch_size, 78 | num_threads=4, 79 | capacity=5 * FLAGS.batch_size) 80 | return images, labels 81 | 82 | 83 | def metrics(logits, labels): 84 | """Specify the metrics for eval. 85 | 86 | Args: 87 | logits: Logits output from the graph. 88 | labels: Ground truth labels for inputs. 89 | 90 | Returns: 91 | Eval Op for the graph. 92 | """ 93 | labels = tf.squeeze(labels) 94 | names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({ 95 | 'Accuracy': tf.metrics.accuracy(tf.argmax(logits, 1), labels), 96 | 'Recall_5': tf.metrics.recall_at_k(labels, logits, 5), 97 | }) 98 | for name, value in names_to_values.iteritems(): 99 | slim.summaries.add_scalar_summary( 100 | value, name, prefix='eval', print_summary=True) 101 | return names_to_updates.values() 102 | 103 | 104 | def build_model(): 105 | """Build the mobilenet_v1 model for evaluation. 106 | 107 | Returns: 108 | g: graph with rewrites after insertion of quantization ops and batch norm 109 | folding. 110 | eval_ops: eval ops for inference. 111 | variables_to_restore: List of variables to restore from checkpoint. 
112 | """ 113 | g = tf.Graph() 114 | with g.as_default(): 115 | inputs, labels = imagenet_input(is_training=False) 116 | 117 | scope = mobilenet_v1.mobilenet_v1_arg_scope( 118 | is_training=False, weight_decay=0.0) 119 | with slim.arg_scope(scope): 120 | logits, _ = mobilenet_v1.mobilenet_v1( 121 | inputs, 122 | is_training=False, 123 | depth_multiplier=FLAGS.depth_multiplier, 124 | num_classes=FLAGS.num_classes) 125 | 126 | if FLAGS.quantize: 127 | tf.contrib.quantize.create_eval_graph() 128 | 129 | eval_ops = metrics(logits, labels) 130 | 131 | return g, eval_ops 132 | 133 | 134 | def eval_model(): 135 | """Evaluates mobilenet_v1.""" 136 | g, eval_ops = build_model() 137 | with g.as_default(): 138 | num_batches = math.ceil(FLAGS.num_examples / float(FLAGS.batch_size)) 139 | slim.evaluation.evaluate_once( 140 | FLAGS.master, 141 | FLAGS.checkpoint_dir, 142 | logdir=FLAGS.eval_dir, 143 | num_evals=num_batches, 144 | eval_op=eval_ops) 145 | 146 | 147 | def main(unused_arg): 148 | eval_model() 149 | 150 | 151 | if __name__ == '__main__': 152 | tf.app.run(main) 153 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## 2021腾讯广告算法大赛-赛道二-第五名解决方案 2 | 3 | > 作者:吴烜圣 ([email](wuxsmail@163.com), [phone](13036864606))、杨非池 ([email](feichi.yang@usc.edu))、周童([email](zhoutong0322@163.com))、林心悦([email](xl9yr@virginia.edu)) 4 | 5 | 论文:[Rethinking the Impacts of Overfitting and Feature Quality on Small-scale Video Classification](https://dl.acm.org/doi/abs/10.1145/3474085.3479226) 6 | 7 | 视频讲解:https://www.bilibili.com/video/BV1ju411o7qz?spm_id_from=333.999.0.0 8 | 9 | #### 0 代码复现 10 | 11 | * 推荐配置:CPU: 6 Cores Memory: 16GB GPU:V100-32GB 12 | 13 | * 配置环境: 14 | 15 | 在完成下述“步骤0”至“步骤4”之后,将得到如下目录: 16 | 17 | ```shell 18 | /home/tione/notebook/ 19 | ├── algo-2021 20 | ├── envs 21 | │ └── taac2021-tagging-pytrochyyds # 本项目的conda环境(init.sh自动创建) 22 | ├── log # 运行日志(init.sh自动创建) 23 | └── VideoStructuring # 项目路径(请手动创建) 24 | ├── dataset # 数据和特征(init.sh自动创建) 25 | ├── KFoldModels # K折模型参数(init.sh自动创建) 26 | ├── KFoldResults # 单折预测结果(init.sh自动创建) 27 | └── taac2021_tagging_pytorchyyds # 项目的代码(请手动创建并拷贝代码到此处) 28 | ├── init.sh # 环境初始化脚本 29 | ├── train.sh # 训练模型脚本 30 | ├── infer.sh # 模型推断脚本 31 | ├── pretrained # 预训练模型权重(init.sh自动下载) 32 | ├── checkpoints # K折模型训练ckpt(init.sh自动创建) 33 | ├── results # k这模型预测(init.sh自动创建) 34 | ├── configs 35 | ├── infer.py 36 | ├── preprocess 37 | ├── readme.md 38 | ├── requirement.txt 39 | ├── src 40 | ├── train.py 41 | └── utils 42 | ``` 43 | 44 | * **步骤0:** 空的机器仅包含`/home/tione/notebook/algo-2021`一个文件夹 45 | 46 | * **步骤1:** 创建本项目的文件夹:`mkdir /home/tione/notebook/VideoStructuring`,除了下一步骤创建的代码目录外,`init.sh`脚本还会在此目录下自动创建存放数据的`dataset`文件夹、存放模型的`KFoldModels`文件夹和存放K这交叉结果的`KFoldResults`文件夹。 47 | 48 | * **步骤2:** 创建本项目的代码目录:`mkdir home/tione/notebook/VideoStructuring/taac2021_tagging_pytorchyyds`,并 **将项目的所有代码移动到此路径下** ,确保`init.sh`、`train.sh`和`infer.sh`三个文件位于该文件夹中。 49 | 50 | * **步骤3:** `cd /home/tione/notebook/VideoStructuring/taac2021_tagging_pytorchyyds` 51 | 52 | * **步骤4:** `sudo chmod a+x ./init.sh && ./init.sh` 53 | 54 | ```shell 55 | shell> sudo chmod a+x ./init.sh && ./init.sh 56 | [2021-07-05 18:50:24] INFO 开始拷贝数据到本地... 57 | [2021-07-05 19:07:20] INFO 数据拷贝完成! 58 | [2021-07-05 19:07:20] INFO 开始配置系统环境... 59 | [2021-07-05 19:09:23] INFO 系统环境配置完成! 60 | [2021-07-05 19:09:23] INFO 开始下载第三方Python环境... 61 | [2021-07-05 19:12:21] INFO 第三方Python环境已安装完毕! 
62 | [2021-07-05 19:12:21] INFO 开始下载预训练模型... 63 | [2021-07-05 19:16:16] INFO 预训练模型下载完成! 64 | [2021-07-05 19:16:16] INFO 系统初始化完成!请运行sudo chmod a+x ./train.sh && ./train.sh进行K折模型训练! 65 | ``` 66 | 67 | * 训练集的特征抽取和K折模型训练: 68 | * **步骤5:** `sudo chmod a+x ./train.sh && ./train.sh` 69 | 70 | ```shell 71 | shell> sudo chmod a+x ./train.sh && ./train.sh 72 | [2021-07-05 19:25:52] INFO 已启动Conda环境! 73 | [2021-07-05 19:25:52] INFO 开始拷贝ASR和OCR文本特征... 74 | [2021-07-05 19:26:16] INFO 文本特征已就绪! 75 | [2021-07-05 19:26:16] INFO 开始拷贝音频特征... 76 | [2021-07-05 19:26:39] INFO 音频特征已就绪! 77 | [2021-07-05 19:26:39] INFO 开始抽取Video特征... 78 | [2021-07-05 20:54:48] INFO 开始检查Video特征抽取结果... 79 | [2021-07-05 20:55:09] INFO 视频特征抽取完成! 80 | [2021-07-05 20:55:09] INFO 开始准备K折训练数据... 81 | [2021-07-05 20:55:09] INFO K折训练数据已就绪! 82 | [2021-07-05 20:55:09] INFO 开始进行K折训练... 83 | [2021-07-05 20:55:09] INFO 开始训练第0个模型... 84 | # ... 此处省略10个模型的训练日志 .... 85 | [2021-07-06 07:50:06] INFO 第9个模型训练完成! 86 | [2021-07-06 07:50:06] INFO K折训练已完成! 87 | [2021-07-06 07:50:06] INFO 模型训练已完成!请运行sudo chmod a+x ./infer.sh && ./infer.sh进行最终的模型预测! 88 | ``` 89 | 90 | * 测试集的特征抽取和K折模型预测: 91 | * **步骤6:** `sudo chmod a+x ./infer.sh && ./infer.sh` 92 | 93 | ```shell 94 | shell> sudo chmod a+x ./infer.sh && ./infer.sh 95 | [2021-07-06 14:49:02] INFO 已启动Conda环境! 96 | [2021-07-06 14:49:23] INFO 开始拷贝ASR和OCR文本特征... 97 | [2021-07-06 14:49:51] INFO 文本特征已就绪! 98 | [2021-07-06 14:49:51] INFO 开始拷贝音频特征... 99 | [2021-07-06 14:50:29] INFO 音频特征已就绪! 100 | [2021-07-06 14:50:29] INFO 开始抽取Video特征... 101 | [2021-07-06 15:16:48] INFO 视频特征抽取完成! 102 | [2021-07-06 15:16:48] INFO 开始进行预测... 103 | [2021-07-06 15:16:48] INFO 第0个子模型开始预测... 104 | # ... 此处省略10个模型的预测日志 ... 105 | [2021-07-06 15:37:00] INFO K折预测已完成! 106 | [2021-07-06 15:37:00] INFO 开始进行模型融合... 107 | [2021-07-06 15:37:05] INFO 模型融合结果已完成! 108 | [2021-07-06 15:37:05] INFO 'Pytorch永远滴神'团队最终预测结果已保存到:/home/tione/notebook/pytorchyyds_prediction_5k.json 109 | ``` 110 | 111 | * **预测结果:** `/home/tione/notebook/pytorchyyds_prediction_5k.json` 112 | 113 | #### 1 预训练模型 114 | 115 | ​ 模型仅使用了Video、Text、Audio三种模态,没有使用视频中间帧的Image模态。除此以外,Text流的文本表征除了Chinese Bert以外,我们还使用腾讯AI实验室的预训练词向量,用于增强Video模态中每一帧的表征。**所有需要的预训练模型已经传到[COS对象存储](https://algo-tencent-2021-1256646044.cos.ap-guangzhou.myqcloud.com/pretrained_models/pretrained.zip),在init.sh过程中会自行下载并解压。** 116 | 117 | * 与Baseline相同的预训练模型: 118 | 1. Audio模态:Vggish 119 | 2. Text模态:ChineseBert-base 120 | 121 | * 与Baseline不同的预训练模型 122 | 1. Video模态:EfficientNet-B5-NoisyStudent([Code](https://github.com/qubvel/efficientnet)、[Paper](https://arxiv.org/pdf/1905.11946.pdf)) 123 | 2. Image模态:该模态被丢弃,未使用任何预训练模型 124 | 3. Text模态:腾讯AI Lab预训练词向量 ([Code](https://ai.tencent.com/ailab/nlp/zh/embedding.html)、[Paper]([Embedding Dataset -- NLP Center, Tencent AI Lab](https://ai.tencent.com/ailab/nlp/zh/embedding.html))) 125 | 126 | #### 2 预计用时 127 | 128 | * 初始化阶段:`sudo chmod a+x ./init.sh && ./init.sh` (大约30分钟) 129 | 1. 复制原始视频数据到本地:20分钟 130 | 2. 安装系统环境:3分钟 131 | 3. 安装Python环境:3分钟 132 | 4. 安装预训练模型:5分钟 133 | * 训练阶段:`sudo chmod a+x ./train.sh && ./train.sh` (11小时 + 5k训练集文本抽取标准时间 + 5k训练集Vggish特征抽取标准时间) 134 | 1. 复制训练集Text到本地:baseline标准时间 135 | 2. 复制训练集Audio特征到本地:baseline标准时间 136 | 3. 抽取训练集Video特征:约1小时30分钟 137 | 4. 训练K折模型:约9小时30分钟 138 | * 测试阶段:`sudo chmod a+x ./infer.sh && ./infer.sh` (1小时50分钟 + 5k测试集文本抽取标准时间 + 5k测试集Vggish特征抽取标准时间) 139 | 140 | 1. 复制测试集Text到本地:baseline标准时间 141 | 2. 复制测试集Audio特征到本地:baseline标准时间 142 | 3. 抽取测试集Video特征:约1小时25分钟 143 | 4. K折模型预测:约20分钟 144 | 5. 
K折模型结果融合:5秒 145 | 146 | -------------------------------------------------------------------------------- /preprocess/audio_extractor/vggish_slim.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Defines the 'VGGish' model used to generate AudioSet embedding features. 17 | 18 | The public AudioSet release (https://research.google.com/audioset/download.html) 19 | includes 128-D features extracted from the embedding layer of a VGG-like model 20 | that was trained on a large Google-internal YouTube dataset. Here we provide 21 | a TF-Slim definition of the same model, without any dependences on libraries 22 | internal to Google. We call it 'VGGish'. 23 | 24 | Note that we only define the model up to the embedding layer, which is the 25 | penultimate layer before the final classifier layer. We also provide various 26 | hyperparameter values (in vggish_params.py) that were used to train this model 27 | internally. 28 | 29 | For comparison, here is TF-Slim's VGG definition: 30 | https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py 31 | """ 32 | 33 | import tensorflow as tf 34 | import vggish_params as params 35 | 36 | slim = tf.contrib.slim 37 | 38 | 39 | def define_vggish_slim(training=False): 40 | """Defines the VGGish TensorFlow model. 41 | 42 | All ops are created in the current default graph, under the scope 'vggish/'. 43 | 44 | The input is a placeholder named 'vggish/input_features' of type float32 and 45 | shape [batch_size, num_frames, num_bands] where batch_size is variable and 46 | num_frames and num_bands are constants, and [num_frames, num_bands] represents 47 | a log-mel-scale spectrogram patch covering num_bands frequency bands and 48 | num_frames time frames (where each frame step is usually 10ms). This is 49 | produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET). 50 | The output is an op named 'vggish/embedding' which produces the activations of 51 | a 128-D embedding layer, which is usually the penultimate layer when used as 52 | part of a full model with a final classifier layer. 53 | 54 | Args: 55 | training: If true, all parameters are marked trainable. 56 | 57 | Returns: 58 | The op 'vggish/embeddings'. 59 | """ 60 | # Defaults: 61 | # - All weights are initialized to N(0, INIT_STDDEV). 62 | # - All biases are initialized to 0. 63 | # - All activations are ReLU. 64 | # - All convolutions are 3x3 with stride 1 and SAME padding. 65 | # - All max-pools are 2x2 with stride 2 and SAME padding. 
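  # The nested arg_scopes below apply those defaults to every conv / fc / max-pool
  # layer, keeping this graph compatible with the released AudioSet VGGish
  # checkpoint that load_vggish_slim_checkpoint() restores.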
66 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 67 | weights_initializer=tf.truncated_normal_initializer( 68 | stddev=params.INIT_STDDEV), 69 | biases_initializer=tf.zeros_initializer(), 70 | activation_fn=tf.nn.relu, 71 | trainable=training), \ 72 | slim.arg_scope([slim.conv2d], 73 | kernel_size=[3, 3], stride=1, padding='SAME'), \ 74 | slim.arg_scope([slim.max_pool2d], 75 | kernel_size=[2, 2], stride=2, padding='SAME'), \ 76 | tf.variable_scope('vggish'): 77 | # Input: a batch of 2-D log-mel-spectrogram patches. 78 | features = tf.placeholder( 79 | tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS), 80 | name='input_features') 81 | # Reshape to 4-D so that we can convolve a batch with conv2d(). 82 | net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1]) 83 | 84 | # The VGG stack of alternating convolutions and max-pools. 85 | net = slim.conv2d(net, 64, scope='conv1') 86 | net = slim.max_pool2d(net, scope='pool1') 87 | net = slim.conv2d(net, 128, scope='conv2') 88 | net = slim.max_pool2d(net, scope='pool2') 89 | net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3') 90 | net = slim.max_pool2d(net, scope='pool3') 91 | net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4') 92 | net = slim.max_pool2d(net, scope='pool4') 93 | 94 | # Flatten before entering fully-connected layers 95 | net = slim.flatten(net) 96 | net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1') 97 | # The embedding layer. 98 | net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2') 99 | return tf.identity(net, name='embedding') 100 | 101 | 102 | def load_vggish_slim_checkpoint(session, checkpoint_path): 103 | """Loads a pre-trained VGGish-compatible checkpoint. 104 | 105 | This function can be used as an initialization function (referred to as 106 | init_fn in TensorFlow documentation) which is called in a Session after 107 | initializating all variables. When used as an init_fn, this will load 108 | a pre-trained checkpoint that is compatible with the VGGish model 109 | definition. Only variables defined by VGGish will be loaded. 110 | 111 | Args: 112 | session: an active TensorFlow session. 113 | checkpoint_path: path to a file containing a checkpoint that is 114 | compatible with the VGGish model definition. 115 | """ 116 | # Get the list of names of all VGGish variables that exist in 117 | # the checkpoint (i.e., all inference-mode VGGish variables). 118 | with tf.Graph().as_default(): 119 | define_vggish_slim(training=False) 120 | vggish_var_names = [v.name for v in tf.global_variables()] 121 | 122 | # Get the list of all currently existing variables that match 123 | # the list of variable names we just computed. 124 | vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names] 125 | 126 | # Use a Saver to restore just the variables selected above. 127 | saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained', 128 | write_version=1) 129 | saver.restore(session, checkpoint_path) 130 | -------------------------------------------------------------------------------- /init.sh: -------------------------------------------------------------------------------- 1 | # Confirm Position of the code 2 | CONDA_NEW_ENV=taac2021-tagging-pytrochyyds 3 | ENV_ROOT=/home/tione/notebook 4 | CODE_ROOT=${ENV_ROOT}/VideoStructuring 5 | CODE_BASE=${CODE_ROOT}/taac2021_tagging_pytorchyyds 6 | 7 | if [ ! -d "${ENV_ROOT}" ]; then 8 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 不存在环境根目录:${ENV_ROOT}" 9 | exit 1 10 | fi 11 | if [ ! 
-d "${CODE_ROOT}" ]; then 12 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 不存在项目根目录:${CODE_ROOT}" 13 | exit 1 14 | fi 15 | if [ ! -d "${CODE_BASE}" ]; then 16 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 当前代码未拷贝到目录:${CODE_BASE}" 17 | exit 1 18 | fi 19 | if [ ! "$(pwd)" = "${CODE_BASE}" ]; then 20 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 请将代码文件夹复制到${CODE_BASE},并确保工作路径为:${CODE_BASE}" 21 | exit 1 22 | fi 23 | 24 | # Copy Data from Original Shared Folder 25 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 开始拷贝数据到本地..." # 预计17分钟 26 | rm -rf /home/tione/notebook/VideoStructuring/dataset 27 | mkdir /home/tione/notebook/VideoStructuring/dataset 28 | cp /home/tione/notebook/algo-2021/dataset/label_id.txt /home/tione/notebook/VideoStructuring/dataset/ 29 | 30 | mkdir /home/tione/notebook/VideoStructuring/dataset/videos 31 | cp /home/tione/notebook/algo-2021/dataset/videos/video_5k /home/tione/notebook/VideoStructuring/dataset/videos -r 32 | cp /home/tione/notebook/algo-2021/dataset/videos/test_5k_2nd /home/tione/notebook/VideoStructuring/dataset/videos -r 33 | 34 | mkdir /home/tione/notebook/VideoStructuring/dataset/pretrained_models 35 | cp /home/tione/notebook/algo-2021/dataset/pretrained_models/* /home/tione/notebook/VideoStructuring/dataset/pretrained_models -r 36 | 37 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging 38 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/GroundTruth 39 | cp /home/tione/notebook/algo-2021/dataset/tagging/GroundTruth/* /home/tione/notebook/VideoStructuring/dataset/tagging/GroundTruth -r 40 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/ 41 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/audio_npy/ 42 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/audio_npy/Vggish 43 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/text_txt 44 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/video_npy/ 45 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/video_npy/Youtube8M 46 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/ 47 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/audio_npy/ 48 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/audio_npy/Vggish 49 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/text_txt 50 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/video_npy 51 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M 52 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 数据拷贝完成!" 53 | 54 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 开始配置系统环境..." # 预计1.5分钟 55 | CONDA_CONFIG_ROOT_PREFIX=$(conda config --show root_prefix) 56 | get_conda_root_prefix() { 57 | TMP_POS=$(awk -v a="${CONDA_CONFIG_ROOT_PREFIX}" -v b="/" 'BEGIN{print index(a, b)}') 58 | TMP_POS=$((TMP_POS-1)) 59 | if [ $TMP_POS -ge 0 ]; then 60 | echo "${CONDA_CONFIG_ROOT_PREFIX:${TMP_POS}}" 61 | fi 62 | } 63 | CONDA_ROOT=$(get_conda_root_prefix) 64 | if [ ! 
-d "${CONDA_ROOT}" ]; then 65 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 未检测到CONDA根目录:${CONDA_ROOT}" 66 | exit 1 67 | fi 68 | 69 | OS_ID=$(awk -F= '$1=="ID" { print $2 ;}' /etc/os-release) 70 | OS_ID=${OS_ID//"\""/""} 71 | 72 | if [ "${OS_ID}" == "ubuntu" ]; then 73 | sudo apt-get update 74 | sudo apt-get install -y apt-utils libsndfile1-dev ffmpeg 75 | elif [ "${OS_ID}" == "centos" ]; then 76 | yum install -y libsndfile libsndfile-devel ffmpeg ffmpeg-devel 77 | else 78 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 不支持的操作系统:${OS_ID}" 79 | exit 1 80 | fi 81 | 82 | source "${CONDA_ROOT}/etc/profile.d/conda.sh" 83 | 84 | conda create --prefix ${ENV_ROOT}/envs/${CONDA_NEW_ENV} -y cudatoolkit=10.0 cudnn=7.6.0 python=3.7 ipykernel 85 | conda activate ${ENV_ROOT}/envs/${CONDA_NEW_ENV} 86 | 87 | python -m ipykernel install --user --name ${CONDA_NEW_ENV} --display-name "TAAC2021 (${CONDA_NEW_ENV})" 88 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 系统环境配置完成!" 89 | 90 | 91 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 开始下载第三方Python环境..." 92 | pip config set global.index-url https://mirrors.tencent.com/pypi/simple/ 93 | pip install -r requirement.txt 94 | pip install tensorflow-gpu==1.14 efficientnet opencv-python torch==1.2.0 scikit-learn jieba 95 | check_env=$(python -c """ 96 | try: 97 | import tensorflow as tf, cv2, torch, efficientnet, sklearn 98 | print('[TensorFlow]', tf.__version__, '[Torch]', torch.__version__, '[EfficientNet]', efficientnet.__version__, '[OpenCV]', cv2.__version__, '[ScikitLearn]', sklearn.__version__) 99 | except Exception as e: 100 | print('环境安装存在异常!请重新执行init脚本!') 101 | """) 102 | if [ "${check_env}" == "环境安装存在异常!请重新执行init脚本!" ]; then 103 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR ${check_env}" 104 | exit 1 105 | fi 106 | sed -i "s/\.decode('utf8')//g" /home/tione/notebook/envs/taac2021-tagging-pytrochyyds/lib/python3.7/site-packages/tensorflow/python/keras/saving/hdf5_format.py 107 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 第三方Python环境已安装完毕!" 108 | 109 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 开始下载预训练模型..." 110 | rm -rf ${CODE_BASE}/pretrained 111 | wget https://algo-tencent-2021-1256646044.cos.ap-guangzhou.myqcloud.com/pretrained_models/pretrained.zip 112 | unzip pretrained.zip 113 | rm -rf pretrained.zip 114 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 预训练模型下载完成!" 115 | 116 | rm -rf ${ENV_ROOT}/log 117 | mkdir ${ENV_ROOT}/log 118 | rm -rf ${CODE_BASE}/results 119 | mkdir ${CODE_BASE}/results 120 | rm -rf ${CODE_BASE}/checkpoints 121 | mkdir ${CODE_BASE}/checkpoints 122 | rm -rf ${CODE_ROOT}/KFoldModels 123 | mkdir ${CODE_ROOT}/KFoldModels 124 | rm -rf ${CODE_ROOT}/KFoldResults 125 | mkdir ${CODE_ROOT}/KFoldResults 126 | mkdir ${CODE_ROOT}/KFoldResults/train 127 | mkdir ${CODE_ROOT}/KFoldResults/test 128 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 系统初始化完成!请运行sudo chmod a+x ./train.sh && ./train.sh进行K折模型训练!" 
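# init.sh is done at this point: the dataset copies, conda env, system packages, pretrained
# weights and the empty log/results/checkpoints/KFoldModels/KFoldResults directories all match
# the directory tree shown in readme.md, so train.sh can be run next.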
-------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | # Confirm Position of the code 2 | CONDA_NEW_ENV=taac2021-tagging-pytrochyyds 3 | ENV_ROOT=/home/tione/notebook 4 | CODE_ROOT=${ENV_ROOT}/VideoStructuring 5 | CODE_BASE=${CODE_ROOT}/taac2021_tagging_pytorchyyds 6 | DATA_BASE=${CODE_ROOT}/dataset 7 | # #################### get env directories 8 | # CONDA_ROOT 9 | CONDA_CONFIG_ROOT_PREFIX=$(conda config --show root_prefix) 10 | get_conda_root_prefix() { 11 | TMP_POS=$(awk -v a="${CONDA_CONFIG_ROOT_PREFIX}" -v b="/" 'BEGIN{print index(a, b)}') 12 | TMP_POS=$((TMP_POS-1)) 13 | if [ $TMP_POS -ge 0 ]; then 14 | echo "${CONDA_CONFIG_ROOT_PREFIX:${TMP_POS}}" 15 | else 16 | echo "" 17 | fi 18 | } 19 | CONDA_ROOT=$(get_conda_root_prefix) 20 | if [ ! -d "${CONDA_ROOT}" ]; then 21 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到Conda环境:${CONDA_ROOT}" 22 | exit 1 23 | fi 24 | 25 | if [ ! -d "${DATA_BASE}" ]; then 26 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到数据集:${DATASET_ROOT}" 27 | exit 1 28 | fi 29 | 30 | CONDA_CONFIG_FILE="${CONDA_ROOT}/etc/profile.d/conda.sh" 31 | if [ ! -f "${CONDA_CONFIG_FILE}" ]; then 32 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到Conda配置文件:${CONDA_CONFIG_FILE}" 33 | exit 1 34 | fi 35 | source "${CONDA_CONFIG_FILE}" 36 | conda activate ${ENV_ROOT}/envs/${CONDA_NEW_ENV} 37 | 38 | check_gpu=$(python -c "import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; import tensorflow as tf; print(tf.test.is_gpu_available())") 39 | if [ "${check_gpu}" == "False" ]; then 40 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR Conda环境启动失败!" 41 | exit 1 42 | fi 43 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 已启动Conda环境!" 44 | 45 | 46 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始拷贝ASR和OCR文本特征..." # 预计用时30秒 47 | cp ${ENV_ROOT}/algo-2021/dataset/tagging/tagging_dataset_train_5k/text_txt/tagging/ ${CODE_ROOT}/dataset/tagging/tagging_dataset_train_5k/text_txt/ -r 48 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 文本特征已就绪!" 49 | 50 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始拷贝音频特征..." # 预计用时1分钟 51 | cp ${ENV_ROOT}/algo-2021/dataset/tagging/tagging_dataset_train_5k/audio_npy/Vggish/tagging/ ${CODE_ROOT}/dataset/tagging/tagging_dataset_train_5k/audio_npy/Vggish/ -r 52 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 音频特征已就绪!" 53 | 54 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始抽取Video特征..." 
# 预计用时1小时30分钟 55 | rm -rf ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ 56 | mkdir ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ 57 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/video_5k/train_5k --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log & 58 | sleep 30s 59 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/video_5k/train_5k --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log & 60 | sleep 30s 61 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/video_5k/train_5k --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet --do_logging 1 | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log 62 | 63 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始检查Video特征抽取结果..." 64 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/video_5k/train_5k --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet --do_logging 1 | grep -v "I tensorflow" >> ${ENV_ROOT}/log/feat_extract.log 65 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 视频特征抽取完成!" 66 | 67 | 68 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始准备K折训练数据..." 69 | python ${CODE_BASE}/utils/k_fold_prepare.py ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt ${DATA_BASE}/tagging/GroundTruth/datafile/val.txt ${DATA_BASE}/tagging/GroundTruth/datafile/train_{}.txt ${DATA_BASE}/tagging/GroundTruth/datafile/valid_{}.txt ${CODE_BASE}/configs/ 70 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO K折训练数据已就绪!" 71 | 72 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始进行K折训练..." 73 | for fold in 0 1 2 3 4 5 6 7 8 9 74 | do 75 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始训练第${fold}个模型..." 
76 | python ${CODE_BASE}/train.py --config "${CODE_BASE}/configs/config.tagging.5k.$fold.yaml" > ${ENV_ROOT}/log/train_log_$fold.txt 77 | BEST_MODEL=$(ls -td -- ${CODE_BASE}/checkpoints/tagging5k_temp/export/* | head -n 1) 78 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 第${fold}个模型训练完成:$BEST_MODEL" 79 | python ./utils/save_best_ckpt.py ${CODE_BASE}/checkpoints/tagging5k_temp/ 80 | rm -rf ${CODE_ROOT}/KFoldModels/model_"$fold" 81 | mkdir ${CODE_ROOT}/KFoldModels/model_"$fold" 82 | cp -r ${CODE_BASE}/checkpoints/tagging5k_temp/* ${CODE_ROOT}/KFoldModels/model_"$fold" 83 | done 84 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO K折训练已完成!" 85 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 模型训练已完成!请运行sudo chmod a+x ./infer.sh && ./infer.sh进行最终的模型预测!" 86 | 87 | -------------------------------------------------------------------------------- /src/model/models/nextvlad_bert.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #Author: jefxiong@tencent.com 3 | 4 | import tensorflow as tf 5 | import tensorflow.contrib.slim as slim 6 | import src.model.video_head as video_head 7 | import src.model.text_head as text_head 8 | import src.model.image_head as image_head 9 | import src.model.fusion_head as fusion_head 10 | import src.model.classify_head as classcify_head 11 | from src.model.models.base_model import BaseModel 12 | 13 | class NextVladBERT(BaseModel): 14 | def __init__(self, model_config): 15 | self.with_video_head = model_config['with_video_head'] 16 | self.with_audio_head = model_config['with_audio_head'] 17 | self.with_text_head = model_config['with_text_head'] 18 | self.with_image_head = model_config['with_image_head'] 19 | 20 | self.use_modal_drop = model_config['use_modal_drop'] 21 | self.modal_drop_rate = model_config['modal_drop_rate'] 22 | self.with_embedding_bn = model_config['with_embedding_bn'] 23 | 24 | self.modal_name_list = [] 25 | if self.with_video_head: 26 | self.modal_name_list.append('video') 27 | self.video_max_frame = model_config['video_head_params']['max_frames'] 28 | if self.with_audio_head: 29 | self.modal_name_list.append('audio') 30 | self.audio_max_frame = model_config['audio_head_params']['max_frames'] 31 | if self.with_text_head: 32 | self.modal_name_list.append('text') 33 | if self.with_image_head: 34 | self.modal_name_list.append('image') 35 | 36 | self.fusion_head_dict={} 37 | self.classifier_dict={} 38 | self.head_dict={} 39 | 40 | for modal in (self.modal_name_list+['fusion']): 41 | fusion_head_params = model_config['fusion_head_params'].copy() 42 | fusion_head_params['drop_rate'] = fusion_head_params['drop_rate'][modal] 43 | 44 | self.fusion_head_dict[modal] = fusion_head.get_instance(model_config['fusion_head_type'], fusion_head_params) 45 | self.classifier_dict[modal] = classcify_head.get_instance(model_config['tagging_classifier_type'], model_config['tagging_classifier_params']) 46 | if modal=='video': 47 | self.head_dict[modal] = video_head.get_instance(model_config['video_head_type'], model_config['video_head_params']) 48 | elif modal=='audio': 49 | self.head_dict[modal] = video_head.get_instance(model_config['audio_head_type'], model_config['audio_head_params']) 50 | elif modal == 'text': 51 | self.head_dict[modal] = text_head.get_instance(model_config['text_head_type'], model_config['text_head_params']) 52 | elif modal == 'image': 53 | self.head_dict[modal] = image_head.get_instance(model_config['image_head_type'], model_config['image_head_params']) 54 | elif modal == 'fusion': 55 | pass 56 | else: 57 | 
raise NotImplementedError 58 | 59 | def _modal_drop(self, x, rate=0.0, noise_shape=None): 60 | """模态dropout""" 61 | random_scale = tf.random.uniform(noise_shape) 62 | keep_mask = tf.cast(random_scale >= rate, x.dtype) 63 | ret = x * keep_mask 64 | probs = tf.cast(keep_mask, tf.float32) 65 | return ret, probs 66 | 67 | def __call__(self, inputs_dict, is_training=False, train_batch_size=1): 68 | assert is_training is not None 69 | prob_dict = {} 70 | embedding_list = [] 71 | 72 | for modal_name in self.modal_name_list: 73 | #Modal Dropout 74 | if modal_name in ['video', 'audio']: 75 | drop_shape = [train_batch_size, 1, 1] 76 | mask = tf.sequence_mask(inputs_dict[modal_name+'_frames_num'], self.video_max_frame, dtype=tf.float32) 77 | elif modal_name == 'text': 78 | drop_shape = [train_batch_size, 1] 79 | elif modal_name == 'image': 80 | drop_shape = [train_batch_size, 1, 1, 1] 81 | 82 | if is_training and self.use_modal_drop: 83 | inputs_dict[modal_name], prob_dict[modal_name+'_loss_weight'] = self._modal_drop(inputs_dict[modal_name], self.modal_drop_rate, drop_shape) 84 | 85 | with tf.variable_scope(modal_name): 86 | if modal_name in ['video', 'audio']: 87 | embedding = self.head_dict[modal_name](inputs_dict[modal_name], is_training=is_training, mask=mask) 88 | else: 89 | embedding = self.head_dict[modal_name](inputs_dict[modal_name], is_training=is_training) 90 | 91 | with tf.variable_scope("tag_classifier/"+modal_name[0]): 92 | if self.with_embedding_bn: 93 | embedding = slim.batch_norm(embedding, center=True, scale=True, is_training=is_training, scope=modal_name[0]+"_feat_bn") 94 | encode_emb = self.fusion_head_dict[modal_name]([embedding], is_training=is_training) 95 | prob_dict['tagging_output_'+modal_name] = self.classifier_dict[modal_name](encode_emb) 96 | embedding_list.append(embedding) 97 | #if is_training: 98 | # tf.summary.histogram("embedding/{}".format(modal_name), embedding) 99 | # tf.summary.histogram("encode_emb/{}".format(modal_name), encode_emb) 100 | 101 | #fusion 102 | with tf.variable_scope("tag_classifier/fusion"): 103 | fusion_embedding = self.fusion_head_dict['fusion'](embedding_list, is_training = is_training) 104 | probs = self.classifier_dict['fusion'](fusion_embedding) 105 | prob_dict['tagging_output_fusion'] = probs 106 | prob_dict['video_embedding'] = fusion_embedding 107 | return prob_dict 108 | 109 | def build_loss(self, inputs, results, label_loss_fn_dict): 110 | loss_dict={} 111 | for key, loss_fn in label_loss_fn_dict.items(): 112 | if key == 'tagging': 113 | labels = inputs['tagging'] 114 | for modal in self.modal_name_list + ['fusion']: 115 | loss_weight = results.get(modal+"_loss_weight", 1.0) 116 | prediction = results["tagging_output_"+modal]["predictions"] 117 | loss_dict["tagging_loss_"+modal] = loss_fn.calculate_loss(prediction, labels, 118 | **dict(loss_weight = loss_weight)) 119 | else: 120 | raise NotImplementedError 121 | return loss_dict 122 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #Author: jefxiong@tencent.com 3 | #Author: xxx@tencent.com 4 | 5 | import sys,os 6 | sys.path.append(os.getcwd()) 7 | import time 8 | import numpy as np 9 | 10 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 11 | 12 | import tensorflow as tf 13 | from tensorflow import logging 14 | import tensorflow.contrib.slim as slim 15 | import utils.train_util as train_util 16 | from utils.base_trainer import Trainer, train_main 
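# Illustration of the modal dropout defined in src/model/models/nextvlad_bert.py above (a minimal
# NumPy rendering of _modal_drop, not the TF implementation itself). One uniform draw per sample
# decides whether an ENTIRE modality is zeroed out, and the same 0/1 keep mask is returned so that
# build_loss can weight that modality's tagging loss to zero for the dropped samples (via the
# "<modal>_loss_weight" entries in the result dict).
import numpy as np

def modal_drop_np(x, rate=0.0, noise_shape=None, seed=0):
    rng = np.random.default_rng(seed)
    random_scale = rng.uniform(size=noise_shape)         # e.g. [batch, 1, 1] for video/audio input
    keep_mask = (random_scale >= rate).astype(x.dtype)   # 1 = keep the modality, 0 = drop it
    return x * keep_mask, keep_mask.astype(np.float32)

video_feats = np.ones((4, 300, 1024), dtype=np.float32)  # [batch, max_frames, feature_dim]
dropped, loss_weight = modal_drop_np(video_feats, rate=0.3, noise_shape=(4, 1, 1))
# Samples with loss_weight[i] == 0 have their video features zeroed and contribute no video loss.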
17 | 18 | 19 | 20 | class TaggingTrainer(Trainer): 21 | def __init__(self, cluster, task, model, reader, configs): 22 | super().__init__(cluster, task, model, reader, configs) 23 | 24 | def get_train_fetch_dict(self): 25 | fetch_dict = {} 26 | fetch_dict['global_step'] = self.global_step 27 | fetch_dict['train_losses_dict'] = self.train_losses_dict 28 | fetch_dict['trian_op'] = self.train_op 29 | #标签任务相关变量 30 | fetch_dict['train_tagging_predictions'] = self.train_tagging_predictions 31 | fetch_dict['train_tagging_labels'] = self.train_inputs_dict['tagging'] 32 | return fetch_dict 33 | 34 | def get_val_fetch_dict(self): 35 | fetch_dict = {} 36 | for modal_name in ['fusion'] + self.modal_name_list: 37 | fetch_dict['tagging_output_'+modal_name] = tf.get_collection('tagging_output_'+modal_name)[0] 38 | fetch_dict['tagging_loss_'+modal_name] = tf.get_collection('tagging_loss_'+modal_name)[0] 39 | fetch_dict['val_summary_op'] = tf.get_collection("val_summary_op")[0] 40 | return fetch_dict 41 | 42 | def load_pretrained_model(self): 43 | text_pretrained_model = self.optimizer_config.pretrained_model['text_pretrained_model'] 44 | assignment_map, _ = train_util.get_assignment_map_from_checkpoint(tf.global_variables(), 45 | text_pretrained_model, 46 | var_prefix='tower/text/', 47 | show=True) 48 | tf.train.init_from_checkpoint(text_pretrained_model, assignment_map) 49 | print("load text_pretrained_model: {}".format(text_pretrained_model)) 50 | 51 | def train_metric_log(self, train_fetch_dict_val, examples_per_second): 52 | """训练集上的结果验证和训练指标tensorboard输出""" 53 | 54 | predictions_val = train_fetch_dict_val['train_tagging_predictions'] 55 | labels_val = train_fetch_dict_val['train_tagging_labels'] 56 | global_step_val = train_fetch_dict_val['global_step'] 57 | train_losses_dict = train_fetch_dict_val['train_losses_dict'] 58 | 59 | train_pr_calculator = train_util.PRCalculator() 60 | gap = train_util.calculate_gap(predictions_val, labels_val) 61 | train_pr_calculator.accumulate(predictions_val, labels_val) 62 | precision_at_1 = train_pr_calculator.get_precision_at_conf(0.1) 63 | precision_at_5 = train_pr_calculator.get_precision_at_conf(0.5) 64 | recall_at_1 = train_pr_calculator.get_recall_at_conf(0.1) 65 | recall_at_5 = train_pr_calculator.get_recall_at_conf(0.5) 66 | train_pr_calculator.clear() 67 | 68 | train_losses_info = "|".join(["{}: {:.3f}".format(k, v) for k,v in train_losses_dict.items()]) 69 | logging.info("training step {} |{} | Examples/sec: {:.2f}".format(global_step_val, train_losses_info, examples_per_second)) 70 | logging.info("GAP: {:.2f} | precision@0.1: {:.2f} | precision@0.5: {:.2f} |recall@0.1: {:.2f} | recall@0.5: {:.2f}".format(gap, 71 | precision_at_1, precision_at_5,recall_at_1, recall_at_5)) 72 | 73 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/GAP", gap), global_step_val) 74 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/precision@0.1", precision_at_1), global_step_val) 75 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/precision@0.5", precision_at_5), global_step_val) 76 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/recall@0.1", recall_at_1), global_step_val) 77 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/recall@0.5", recall_at_5), global_step_val) 78 | self.sv.summary_writer.flush() 79 | 80 | def eval(self, sess, global_step_val, data_generater, data_source_name): 81 | #taggging eval 82 | tagging_class_num = 
self.reader.label_num_dict['tagging'] 83 | self.evl_metrics = [train_util.EvaluationMetrics(tagging_class_num, top_k=20) 84 | for i in range(len(self.modal_name_list)+1)] #+1 for fusion 85 | for i in range(len(self.evl_metrics)): 86 | self.evl_metrics[i].clear() 87 | 88 | examples_processed = 0 89 | 90 | for sample in data_generater: 91 | batch_start_time = time.time() 92 | feed_dict_data = {} 93 | for input_name in self.reader.dname_string_list: 94 | var_names = tf.get_collection(input_name) 95 | assert len(var_names)==1 96 | feed_dict_data[var_names[0]] = sample[input_name] 97 | 98 | 99 | fetch_dict_eval = sess.run(self.val_fetch_dict, feed_dict=feed_dict_data) 100 | seconds_per_batch = time.time() - batch_start_time 101 | example_per_second = self.reader.batch_size / seconds_per_batch 102 | examples_processed += self.reader.batch_size 103 | 104 | for index, modal_name in enumerate(self.modal_name_list+['fusion']): 105 | pred = fetch_dict_eval['tagging_output_'+modal_name] 106 | val_label = sample['tagging'] 107 | gap = train_util.calculate_gap(pred, val_label) 108 | iteration_info_dict = self.evl_metrics[index].accumulate(pred, val_label, fetch_dict_eval['tagging_loss_'+modal_name]) 109 | iteration_info_dict['GAP'] = gap 110 | iteration_info_dict["examples_per_second"] = example_per_second 111 | iterinfo = "|".join(["{}: {:.3f}".format(k,v) for k,v in iteration_info_dict.items()]) 112 | logging.info("examples_processed: %d | %s", examples_processed, iterinfo) 113 | logging.info("Done with batched inference. Now calculating global performance metrics.") 114 | 115 | for index, modal_name in enumerate(self.modal_name_list+['fusion']): 116 | epoch_info_dict = self.evl_metrics[index].get() 117 | epoch_info_dict["epoch_id"] = global_step_val 118 | epochinfo = train_util.FormatEvalInfo(self.summary_writer, global_step_val, epoch_info_dict, prefix="val_"+modal_name) 119 | logging.info(epochinfo) 120 | self.evl_metrics[index].clear() 121 | self.summary_writer.add_summary(fetch_dict_eval['val_summary_op'], global_step_val) 122 | 123 | return epoch_info_dict['gap'] #融合特征的预测结果 124 | 125 | 126 | if __name__ == "__main__": 127 | import argparse 128 | parser = argparse.ArgumentParser() 129 | parser.add_argument('--config',default='configs/config.example.yaml',type=str) 130 | args = parser.parse_args() 131 | train_main(args.config, TaggingTrainer) 132 | -------------------------------------------------------------------------------- /infer.sh: -------------------------------------------------------------------------------- 1 | # Confirm Position of the code 2 | CONDA_NEW_ENV=taac2021-tagging-pytrochyyds 3 | ENV_ROOT=/home/tione/notebook 4 | CODE_ROOT=${ENV_ROOT}/VideoStructuring 5 | CODE_BASE=${CODE_ROOT}/taac2021_tagging_pytorchyyds 6 | DATA_BASE=${CODE_ROOT}/dataset 7 | # #################### get env directories 8 | # CONDA_ROOT 9 | CONDA_CONFIG_ROOT_PREFIX=$(conda config --show root_prefix) 10 | get_conda_root_prefix() { 11 | TMP_POS=$(awk -v a="${CONDA_CONFIG_ROOT_PREFIX}" -v b="/" 'BEGIN{print index(a, b)}') 12 | TMP_POS=$((TMP_POS-1)) 13 | if [ $TMP_POS -ge 0 ]; then 14 | echo "${CONDA_CONFIG_ROOT_PREFIX:${TMP_POS}}" 15 | else 16 | echo "" 17 | fi 18 | } 19 | CONDA_ROOT=$(get_conda_root_prefix) 20 | if [ ! -d "${CONDA_ROOT}" ]; then 21 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到Conda环境:${CONDA_ROOT}" 22 | exit 1 23 | fi 24 | 25 | if [ ! 
-d "${DATA_BASE}" ]; then 26 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到数据集:${DATASET_ROOT}" 27 | exit 1 28 | fi 29 | 30 | CONDA_CONFIG_FILE="${CONDA_ROOT}/etc/profile.d/conda.sh" 31 | if [ ! -f "${CONDA_CONFIG_FILE}" ]; then 32 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到Conda配置文件:${CONDA_CONFIG_FILE}" 33 | exit 1 34 | fi 35 | source "${CONDA_CONFIG_FILE}" 36 | conda activate ${ENV_ROOT}/envs/${CONDA_NEW_ENV} 37 | 38 | check_gpu=$(python -c "import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; import tensorflow.compat.v1 as tf; print(tf.test.is_gpu_available())") 39 | if [ "${check_gpu}" == "False" ]; then 40 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR Conda环境启动失败!" 41 | exit 1 42 | fi 43 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 已启动Conda环境!" 44 | 45 | 46 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始拷贝ASR和OCR文本特征..." 47 | cp ${ENV_ROOT}/algo-2021/dataset/tagging/tagging_dataset_test_5k_2nd/text_txt/tagging/ ${CODE_ROOT}/dataset/tagging/tagging_dataset_test_5k_2nd/text_txt/ -r 48 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 文本特征已就绪!" 49 | 50 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始拷贝音频特征..." 51 | cp ${ENV_ROOT}/algo-2021/dataset/tagging/tagging_dataset_test_5k_2nd/audio_npy/Vggish/tagging/ ${CODE_ROOT}/dataset/tagging/tagging_dataset_test_5k_2nd/audio_npy/Vggish/ -r 52 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 音频特征已就绪!" 53 | 54 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始抽取Video特征..." 55 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/test_5k_2nd/ --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log & 56 | sleep 30s 57 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/test_5k_2nd/ --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log & 58 | sleep 30s 59 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/test_5k_2nd/ --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet --do_logging 1 | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log 60 | 61 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始检查Video特征抽取结果..." 
62 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/test_5k_2nd/ --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet --do_logging 1 | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log 63 | 64 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 视频特征抽取完成!" 65 | 66 | 67 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始进行预测..." 68 | for fold in 0 2 4 6 8 69 | do 70 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 第${fold}个子模型开始预测..." 71 | rm -rf ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 72 | mkdir ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 73 | chmod 777 ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 74 | BEST_MODEL=$(ls -td -- ${CODE_ROOT}/KFoldModels/model_"$fold"/export/* | head -n 1) 75 | python ${CODE_BASE}/infer.py \ 76 | --model_pb ${BEST_MODEL} \ 77 | --tag_id_file ${CODE_ROOT}/dataset/label_id.txt \ 78 | --test_dir ${CODE_ROOT}/dataset/videos/test_5k_2nd/ \ 79 | --output_json ${CODE_ROOT}/KFoldResults/test/results_"$fold"/tagging_5k.json \ 80 | --load_feat 1 \ 81 | --feat_dir ${CODE_ROOT}/dataset/tagging/tagging_dataset_test_5k_2nd/ \ 82 | --top_k 82 > ${ENV_ROOT}/log/test_eval_log.txt & 83 | 84 | sleep 10s 85 | fold=$(($fold+1)) 86 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 第${fold}个子模型开始预测..." 87 | rm -rf ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 88 | mkdir ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 89 | chmod 777 ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 90 | BEST_MODEL=$(ls -td -- ${CODE_ROOT}/KFoldModels/model_"$fold"/export/* | head -n 1) 91 | python ${CODE_BASE}/infer.py \ 92 | --model_pb ${BEST_MODEL} \ 93 | --tag_id_file ${CODE_ROOT}/dataset/label_id.txt \ 94 | --test_dir ${CODE_ROOT}/dataset/videos/test_5k_2nd/ \ 95 | --output_json ${CODE_ROOT}/KFoldResults/test/results_"$fold"/tagging_5k.json \ 96 | --load_feat 1 \ 97 | --feat_dir ${CODE_ROOT}/dataset/tagging/tagging_dataset_test_5k_2nd/ \ 98 | --top_k 82 > ${ENV_ROOT}/log/test_eval_log.txt 99 | done 100 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO K折预测已完成!" 101 | 102 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始进行模型融合..." 103 | python ${CODE_BASE}/utils/k_fold_fusion.py 10 ${CODE_ROOT}/dataset/label_id.txt ${CODE_ROOT}/KFoldResults/test/results_{}/tagging_5k.json ${CODE_BASE}/results/tagging_5k.json 20 104 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 模型融合结果已完成!" 105 | 106 | cp ${CODE_BASE}/results/tagging_5k.json ${ENV_ROOT}/pytorchyyds_prediction_5k.json 107 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 'Pytorch永远滴神'团队最终预测结果已保存到:${ENV_ROOT}/pytorchyyds_prediction_5k.json" 108 | -------------------------------------------------------------------------------- /src/model/image_head/efficientNet/condconv/efficientnet_condconv_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Builder for EfficientNet-CondConv models.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import tensorflow.compat.v1 as tf 23 | 24 | import efficientnet_builder 25 | import efficientnet_model 26 | import utils 27 | 28 | # The input tensor is in the range of [0, 255], we need to scale them to the 29 | # range of [0, 1] 30 | MEAN_RGB = [127.0, 127.0, 127.0] 31 | STDDEV_RGB = [128.0, 128.0, 128.0] 32 | 33 | 34 | def efficientnet_condconv_params(model_name): 35 | """Get efficientnet-condconv params based on model name.""" 36 | params_dict = { 37 | # (width_coefficient, depth_coefficient, resolution, dropout_rate, 38 | # condconv_num_experts) 39 | 'efficientnet-condconv-b0-4e': (1.0, 1.0, 224, 0.25, 4), 40 | 'efficientnet-condconv-b0-8e': (1.0, 1.0, 224, 0.25, 8), 41 | 'efficientnet-condconv-b0-8e-depth': (1.0, 1.1, 224, 0.25, 8) 42 | } 43 | return params_dict[model_name] 44 | 45 | 46 | def efficientnet_condconv(width_coefficient=None, 47 | depth_coefficient=None, 48 | dropout_rate=0.2, 49 | survival_prob=0.8, 50 | condconv_num_experts=None): 51 | """Creates an efficientnet-condconv model.""" 52 | blocks_args = [ 53 | 'r1_k3_s11_e1_i32_o16_se0.25', 54 | 'r2_k3_s22_e6_i16_o24_se0.25', 55 | 'r2_k5_s22_e6_i24_o40_se0.25', 56 | 'r3_k3_s22_e6_i40_o80_se0.25', 57 | 'r3_k5_s11_e6_i80_o112_se0.25_cc', 58 | 'r4_k5_s22_e6_i112_o192_se0.25_cc', 59 | 'r1_k3_s11_e6_i192_o320_se0.25_cc', 60 | ] 61 | global_params = efficientnet_model.GlobalParams( 62 | batch_norm_momentum=0.99, 63 | batch_norm_epsilon=1e-3, 64 | dropout_rate=dropout_rate, 65 | survival_prob=survival_prob, 66 | data_format='channels_last', 67 | num_classes=1000, 68 | width_coefficient=width_coefficient, 69 | depth_coefficient=depth_coefficient, 70 | depth_divisor=8, 71 | min_depth=None, 72 | relu_fn=tf.nn.swish, 73 | # The default is TPU-specific batch norm. 74 | # The alternative is tf.layers.BatchNormalization. 75 | batch_norm=utils.TpuBatchNormalization, # TPU-specific requirement. 
76 | use_se=True, 77 | condconv_num_experts=condconv_num_experts) 78 | decoder = efficientnet_builder.BlockDecoder() 79 | return decoder.decode(blocks_args), global_params 80 | 81 | 82 | def get_model_params(model_name, override_params): 83 | """Get the block args and global params for a given model.""" 84 | if model_name.startswith('efficientnet-condconv'): 85 | (width_coefficient, depth_coefficient, _, dropout_rate, 86 | condconv_num_experts) = ( 87 | efficientnet_condconv_params(model_name)) 88 | blocks_args, global_params = efficientnet_condconv( 89 | width_coefficient=width_coefficient, 90 | depth_coefficient=depth_coefficient, 91 | dropout_rate=dropout_rate, 92 | condconv_num_experts=condconv_num_experts) 93 | else: 94 | raise NotImplementedError('model name is not pre-defined: %s' % model_name) 95 | 96 | if override_params: 97 | # ValueError will be raised here if override_params has fields not included 98 | # in global_params. 99 | global_params = global_params._replace(**override_params) 100 | 101 | tf.logging.info('global_params= %s', global_params) 102 | tf.logging.info('blocks_args= %s', blocks_args) 103 | return blocks_args, global_params 104 | 105 | 106 | def build_model(images, 107 | model_name, 108 | training, 109 | override_params=None, 110 | model_dir=None, 111 | fine_tuning=False): 112 | """A helper functiion to creates a model and returns predicted logits. 113 | 114 | Args: 115 | images: input images tensor. 116 | model_name: string, the predefined model name. 117 | training: boolean, whether the model is constructed for training. 118 | override_params: A dictionary of params for overriding. Fields must exist in 119 | efficientnet_model.GlobalParams. 120 | model_dir: string, optional model dir for saving configs. 121 | fine_tuning: boolean, whether the model is used for finetuning. 122 | 123 | Returns: 124 | logits: the logits tensor of classes. 125 | endpoints: the endpoints for each layer. 126 | 127 | Raises: 128 | When model_name specified an undefined model, raises NotImplementedError. 129 | When override_params has invalid fields, raises ValueError. 130 | """ 131 | assert isinstance(images, tf.Tensor) 132 | if not training or fine_tuning: 133 | if not override_params: 134 | override_params = {} 135 | override_params['batch_norm'] = utils.BatchNormalization 136 | blocks_args, global_params = get_model_params(model_name, override_params) 137 | if not training or fine_tuning: 138 | global_params = global_params._replace(batch_norm=utils.BatchNormalization) 139 | 140 | if model_dir: 141 | param_file = os.path.join(model_dir, 'model_params.txt') 142 | if not tf.gfile.Exists(param_file): 143 | if not tf.gfile.Exists(model_dir): 144 | tf.gfile.MakeDirs(model_dir) 145 | with tf.gfile.GFile(param_file, 'w') as f: 146 | tf.logging.info('writing to %s' % param_file) 147 | f.write('model_name= %s\n\n' % model_name) 148 | f.write('global_params= %s\n\n' % str(global_params)) 149 | f.write('blocks_args= %s\n\n' % str(blocks_args)) 150 | 151 | with tf.variable_scope(model_name): 152 | model = efficientnet_model.Model(blocks_args, global_params) 153 | logits = model(images, training=training) 154 | 155 | logits = tf.identity(logits, 'logits') 156 | return logits, model.endpoints 157 | 158 | 159 | def build_model_base(images, model_name, training, override_params=None): 160 | """A helper functiion to create a base model and return global_pool. 161 | 162 | Args: 163 | images: input images tensor. 164 | model_name: string, the model name of a pre-defined MnasNet. 
165 | training: boolean, whether the model is constructed for training. 166 | override_params: A dictionary of params for overriding. Fields must exist in 167 | mnasnet_model.GlobalParams. 168 | 169 | Returns: 170 | features: global pool features. 171 | endpoints: the endpoints for each layer. 172 | 173 | Raises: 174 | When model_name specified an undefined model, raises NotImplementedError. 175 | When override_params has invalid fields, raises ValueError. 176 | """ 177 | assert isinstance(images, tf.Tensor) 178 | blocks_args, global_params = get_model_params(model_name, override_params) 179 | 180 | with tf.variable_scope(model_name): 181 | model = efficientnet_model.Model(blocks_args, global_params) 182 | features = model(images, training=training, features_only=True) 183 | 184 | features = tf.identity(features, 'global_pool') 185 | return features, model.endpoints 186 | -------------------------------------------------------------------------------- /src/model/cover_head/mobilenet_v1_train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Build and train mobilenet_v1 with options for quantization.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | from datasets import dataset_factory 24 | from nets import mobilenet_v1 25 | from preprocessing import preprocessing_factory 26 | 27 | slim = tf.contrib.slim 28 | 29 | flags = tf.app.flags 30 | 31 | flags.DEFINE_string('master', '', 'Session master') 32 | flags.DEFINE_integer('task', 0, 'Task') 33 | flags.DEFINE_integer('ps_tasks', 0, 'Number of ps') 34 | flags.DEFINE_integer('batch_size', 64, 'Batch size') 35 | flags.DEFINE_integer('num_classes', 1001, 'Number of classes to distinguish') 36 | flags.DEFINE_integer('number_of_steps', None, 37 | 'Number of training steps to perform before stopping') 38 | flags.DEFINE_integer('image_size', 224, 'Input image resolution') 39 | flags.DEFINE_float('depth_multiplier', 1.0, 'Depth multiplier for mobilenet') 40 | flags.DEFINE_bool('quantize', False, 'Quantize training') 41 | flags.DEFINE_string('fine_tune_checkpoint', '', 42 | 'Checkpoint from which to start finetuning.') 43 | flags.DEFINE_string('checkpoint_dir', '', 44 | 'Directory for writing training checkpoints and logs') 45 | flags.DEFINE_string('dataset_dir', '', 'Location of dataset') 46 | flags.DEFINE_integer('log_every_n_steps', 100, 'Number of steps per log') 47 | flags.DEFINE_integer('save_summaries_secs', 100, 48 | 'How often to save summaries, secs') 49 | flags.DEFINE_integer('save_interval_secs', 100, 50 | 'How often to save checkpoints, secs') 51 | 52 | FLAGS = flags.FLAGS 53 | 54 | _LEARNING_RATE_DECAY_FACTOR = 0.94 55 | 56 | 57 | def 
get_learning_rate(): 58 | if FLAGS.fine_tune_checkpoint: 59 | # If we are fine tuning a checkpoint we need to start at a lower learning 60 | # rate since we are farther along on training. 61 | return 1e-4 62 | else: 63 | return 0.045 64 | 65 | 66 | def get_quant_delay(): 67 | if FLAGS.fine_tune_checkpoint: 68 | # We can start quantizing immediately if we are finetuning. 69 | return 0 70 | else: 71 | # We need to wait for the model to train a bit before we quantize if we are 72 | # training from scratch. 73 | return 250000 74 | 75 | 76 | def imagenet_input(is_training): 77 | """Data reader for imagenet. 78 | 79 | Reads in imagenet data and performs pre-processing on the images. 80 | 81 | Args: 82 | is_training: bool specifying if train or validation dataset is needed. 83 | Returns: 84 | A batch of images and labels. 85 | """ 86 | if is_training: 87 | dataset = dataset_factory.get_dataset('imagenet', 'train', 88 | FLAGS.dataset_dir) 89 | else: 90 | dataset = dataset_factory.get_dataset('imagenet', 'validation', 91 | FLAGS.dataset_dir) 92 | 93 | provider = slim.dataset_data_provider.DatasetDataProvider( 94 | dataset, 95 | shuffle=is_training, 96 | common_queue_capacity=2 * FLAGS.batch_size, 97 | common_queue_min=FLAGS.batch_size) 98 | [image, label] = provider.get(['image', 'label']) 99 | 100 | image_preprocessing_fn = preprocessing_factory.get_preprocessing( 101 | 'mobilenet_v1', is_training=is_training) 102 | 103 | image = image_preprocessing_fn(image, FLAGS.image_size, FLAGS.image_size) 104 | 105 | images, labels = tf.train.batch( 106 | [image, label], 107 | batch_size=FLAGS.batch_size, 108 | num_threads=4, 109 | capacity=5 * FLAGS.batch_size) 110 | labels = slim.one_hot_encoding(labels, FLAGS.num_classes) 111 | return images, labels 112 | 113 | 114 | def build_model(): 115 | """Builds graph for model to train with rewrites for quantization. 116 | 117 | Returns: 118 | g: Graph with fake quantization ops and batch norm folding suitable for 119 | training quantized weights. 120 | train_tensor: Train op for execution during training. 121 | """ 122 | g = tf.Graph() 123 | with g.as_default(), tf.device( 124 | tf.train.replica_device_setter(FLAGS.ps_tasks)): 125 | inputs, labels = imagenet_input(is_training=True) 126 | with slim.arg_scope(mobilenet_v1.mobilenet_v1_arg_scope(is_training=True)): 127 | logits, _ = mobilenet_v1.mobilenet_v1( 128 | inputs, 129 | is_training=True, 130 | depth_multiplier=FLAGS.depth_multiplier, 131 | num_classes=FLAGS.num_classes) 132 | 133 | tf.losses.softmax_cross_entropy(labels, logits) 134 | 135 | # Call rewriter to produce graph with fake quant ops and folded batch norms 136 | # quant_delay delays start of quantization till quant_delay steps, allowing 137 | # for better model accuracy. 138 | if FLAGS.quantize: 139 | tf.contrib.quantize.create_training_graph(quant_delay=get_quant_delay()) 140 | 141 | total_loss = tf.losses.get_total_loss(name='total_loss') 142 | # Configure the learning rate using an exponential decay. 
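# With the defaults above (batch_size=64) and the hard-coded imagenet_size below, this works out to
#   decay_steps = int(1271167 / 64 * 2.5) = 49654
# i.e. the starting rate from get_learning_rate() (0.045 from scratch, 1e-4 when fine-tuning) is
# multiplied by _LEARNING_RATE_DECAY_FACTOR = 0.94 roughly every 2.5 epochs (staircase decay).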
143 | num_epochs_per_decay = 2.5 144 | imagenet_size = 1271167 145 | decay_steps = int(imagenet_size / FLAGS.batch_size * num_epochs_per_decay) 146 | 147 | learning_rate = tf.train.exponential_decay( 148 | get_learning_rate(), 149 | tf.train.get_or_create_global_step(), 150 | decay_steps, 151 | _LEARNING_RATE_DECAY_FACTOR, 152 | staircase=True) 153 | opt = tf.train.GradientDescentOptimizer(learning_rate) 154 | 155 | train_tensor = slim.learning.create_train_op( 156 | total_loss, 157 | optimizer=opt) 158 | 159 | slim.summaries.add_scalar_summary(total_loss, 'total_loss', 'losses') 160 | slim.summaries.add_scalar_summary(learning_rate, 'learning_rate', 'training') 161 | return g, train_tensor 162 | 163 | 164 | def get_checkpoint_init_fn(): 165 | """Returns the checkpoint init_fn if the checkpoint is provided.""" 166 | if FLAGS.fine_tune_checkpoint: 167 | variables_to_restore = slim.get_variables_to_restore() 168 | global_step_reset = tf.assign(tf.train.get_or_create_global_step(), 0) 169 | # When restoring from a floating point model, the min/max values for 170 | # quantized weights and activations are not present. 171 | # We instruct slim to ignore variables that are missing during restoration 172 | # by setting ignore_missing_vars=True 173 | slim_init_fn = slim.assign_from_checkpoint_fn( 174 | FLAGS.fine_tune_checkpoint, 175 | variables_to_restore, 176 | ignore_missing_vars=True) 177 | 178 | def init_fn(sess): 179 | slim_init_fn(sess) 180 | # If we are restoring from a floating point model, we need to initialize 181 | # the global step to zero for the exponential decay to result in 182 | # reasonable learning rates. 183 | sess.run(global_step_reset) 184 | return init_fn 185 | else: 186 | return None 187 | 188 | 189 | def train_model(): 190 | """Trains mobilenet_v1.""" 191 | g, train_tensor = build_model() 192 | with g.as_default(): 193 | slim.learning.train( 194 | train_tensor, 195 | FLAGS.checkpoint_dir, 196 | is_chief=(FLAGS.task == 0), 197 | master=FLAGS.master, 198 | log_every_n_steps=FLAGS.log_every_n_steps, 199 | graph=g, 200 | number_of_steps=FLAGS.number_of_steps, 201 | save_summaries_secs=FLAGS.save_summaries_secs, 202 | save_interval_secs=FLAGS.save_interval_secs, 203 | init_fn=get_checkpoint_init_fn(), 204 | global_step=tf.train.get_global_step()) 205 | 206 | 207 | def main(unused_arg): 208 | train_model() 209 | 210 | 211 | if __name__ == '__main__': 212 | tf.app.run(main) 213 | -------------------------------------------------------------------------------- /src/model/cover_head/nets_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Contains a factory for building various models.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | import functools 21 | 22 | import tensorflow as tf 23 | 24 | from nets import alexnet 25 | from nets import cifarnet 26 | from nets import i3d 27 | from nets import inception 28 | from nets import lenet 29 | from nets import mobilenet_v1 30 | from nets import overfeat 31 | from nets import resnet_v1 32 | from nets import resnet_v2 33 | from nets import s3dg 34 | from nets import vgg 35 | from nets import densenet 36 | from nets.mobilenet import mobilenet_v2 37 | from nets.nasnet import nasnet 38 | from nets.nasnet import pnasnet 39 | 40 | 41 | slim = tf.contrib.slim 42 | 43 | networks_map = {'alexnet_v2': alexnet.alexnet_v2, 44 | 'cifarnet': cifarnet.cifarnet, 45 | 'overfeat': overfeat.overfeat, 46 | 'vgg_a': vgg.vgg_a, 47 | 'vgg_16': vgg.vgg_16, 48 | 'vgg_19': vgg.vgg_19, 49 | 'inception_v1': inception.inception_v1, 50 | 'inception_v2': inception.inception_v2, 51 | 'inception_v3': inception.inception_v3, 52 | 'inception_v4': inception.inception_v4, 53 | 'inception_resnet_v2': inception.inception_resnet_v2, 54 | 'i3d': i3d.i3d, 55 | 's3dg': s3dg.s3dg, 56 | 'lenet': lenet.lenet, 57 | 'resnet_v1_50': resnet_v1.resnet_v1_50, 58 | 'resnet_v1_101': resnet_v1.resnet_v1_101, 59 | 'resnet_v1_152': resnet_v1.resnet_v1_152, 60 | 'resnet_v1_200': resnet_v1.resnet_v1_200, 61 | 'resnet_v2_50': resnet_v2.resnet_v2_50, 62 | 'resnet_v2_101': resnet_v2.resnet_v2_101, 63 | 'resnet_v2_152': resnet_v2.resnet_v2_152, 64 | 'resnet_v2_200': resnet_v2.resnet_v2_200, 65 | 'densenet121': densenet.densenet121, 66 | 'densenet161': densenet.densenet161, 67 | 'densenet169': densenet.densenet169, 68 | 'mobilenet_v1': mobilenet_v1.mobilenet_v1, 69 | 'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_075, 70 | 'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_050, 71 | 'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_025, 72 | 'mobilenet_v2': mobilenet_v2.mobilenet, 73 | 'mobilenet_v2_140': mobilenet_v2.mobilenet_v2_140, 74 | 'mobilenet_v2_035': mobilenet_v2.mobilenet_v2_035, 75 | 'nasnet_cifar': nasnet.build_nasnet_cifar, 76 | 'nasnet_mobile': nasnet.build_nasnet_mobile, 77 | 'nasnet_large': nasnet.build_nasnet_large, 78 | 'pnasnet_large': pnasnet.build_pnasnet_large, 79 | 'pnasnet_mobile': pnasnet.build_pnasnet_mobile, 80 | } 81 | 82 | arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope, 83 | 'cifarnet': cifarnet.cifarnet_arg_scope, 84 | 'overfeat': overfeat.overfeat_arg_scope, 85 | 'vgg_a': vgg.vgg_arg_scope, 86 | 'vgg_16': vgg.vgg_arg_scope, 87 | 'vgg_19': vgg.vgg_arg_scope, 88 | 'inception_v1': inception.inception_v3_arg_scope, 89 | 'inception_v2': inception.inception_v3_arg_scope, 90 | 'inception_v3': inception.inception_v3_arg_scope, 91 | 'inception_v4': inception.inception_v4_arg_scope, 92 | 'inception_resnet_v2': 93 | inception.inception_resnet_v2_arg_scope, 94 | 'i3d': i3d.i3d_arg_scope, 95 | 's3dg': s3dg.s3dg_arg_scope, 96 | 'lenet': lenet.lenet_arg_scope, 97 | 'resnet_v1_50': resnet_v1.resnet_arg_scope, 98 | 'resnet_v1_101': resnet_v1.resnet_arg_scope, 99 | 'resnet_v1_152': resnet_v1.resnet_arg_scope, 100 | 'resnet_v1_200': resnet_v1.resnet_arg_scope, 101 | 'resnet_v2_50': resnet_v2.resnet_arg_scope, 102 | 'resnet_v2_101': resnet_v2.resnet_arg_scope, 103 | 'resnet_v2_152': resnet_v2.resnet_arg_scope, 104 | 'resnet_v2_200': 
resnet_v2.resnet_arg_scope, 105 | 'densenet121': densenet.densenet_arg_scope, 106 | 'densenet161': densenet.densenet_arg_scope, 107 | 'densenet169': densenet.densenet_arg_scope, 108 | 'mobilenet_v1': mobilenet_v1.mobilenet_v1_arg_scope, 109 | 'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_arg_scope, 110 | 'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_arg_scope, 111 | 'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_arg_scope, 112 | 'mobilenet_v2': mobilenet_v2.training_scope, 113 | 'mobilenet_v2_035': mobilenet_v2.training_scope, 114 | 'mobilenet_v2_140': mobilenet_v2.training_scope, 115 | 'nasnet_cifar': nasnet.nasnet_cifar_arg_scope, 116 | 'nasnet_mobile': nasnet.nasnet_mobile_arg_scope, 117 | 'nasnet_large': nasnet.nasnet_large_arg_scope, 118 | 'pnasnet_large': pnasnet.pnasnet_large_arg_scope, 119 | 'pnasnet_mobile': pnasnet.pnasnet_mobile_arg_scope, 120 | } 121 | 122 | 123 | def get_network_fn(name, num_classes, weight_decay=0.0, is_training=False): 124 | """Returns a network_fn such as `logits, end_points = network_fn(images)`. 125 | 126 | Args: 127 | name: The name of the network. 128 | num_classes: The number of classes to use for classification. If 0 or None, 129 | the logits layer is omitted and its input features are returned instead. 130 | weight_decay: The l2 coefficient for the model weights. 131 | is_training: `True` if the model is being used for training and `False` 132 | otherwise. 133 | 134 | Returns: 135 | network_fn: A function that applies the model to a batch of images. It has 136 | the following signature: 137 | net, end_points = network_fn(images) 138 | The `images` input is a tensor of shape [batch_size, height, width, 3] 139 | with height = width = network_fn.default_image_size. (The permissibility 140 | and treatment of other sizes depends on the network_fn.) 141 | The returned `end_points` are a dictionary of intermediate activations. 142 | The returned `net` is the topmost layer, depending on `num_classes`: 143 | If `num_classes` was a non-zero integer, `net` is a logits tensor 144 | of shape [batch_size, num_classes]. 145 | If `num_classes` was 0 or `None`, `net` is a tensor with the input 146 | to the logits layer of shape [batch_size, 1, 1, num_features] or 147 | [batch_size, num_features]. Dropout has not been applied to this 148 | (even if the network's original classification does); it remains for 149 | the caller to do this or not. 150 | 151 | Raises: 152 | ValueError: If network `name` is not recognized. 153 | """ 154 | if name not in networks_map: 155 | raise ValueError('Name of network unknown %s' % name) 156 | func = networks_map[name] 157 | @functools.wraps(func) 158 | def network_fn(images, **kwargs): 159 | arg_scope = arg_scopes_map[name](weight_decay=weight_decay) 160 | with slim.arg_scope(arg_scope): 161 | return func(images, num_classes=num_classes, is_training=is_training, 162 | **kwargs) 163 | if hasattr(func, 'default_image_size'): 164 | network_fn.default_image_size = func.default_image_size 165 | 166 | return network_fn 167 | --------------------------------------------------------------------------------
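For reference, a minimal usage sketch of get_network_fn as described in its docstring. It assumes TF 1.x and the slim "nets" package importable on PYTHONPATH, exactly as nets_factory.py itself assumes; the model name, weight decay and shapes here are illustrative rather than taken from this repo's configs.

import tensorflow as tf
import nets_factory  # i.e. src/model/cover_head/nets_factory.py

network_fn = nets_factory.get_network_fn('resnet_v1_50', num_classes=1000,
                                         weight_decay=1e-4, is_training=False)
size = network_fn.default_image_size        # 224 for resnet_v1_50
images = tf.placeholder(tf.float32, [None, size, size, 3])
logits, end_points = network_fn(images)     # logits: [batch, 1000]; end_points: dict of activations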