├── preprocess
│   ├── __init__.py
│   ├── audio_extractor
│   │   ├── __init__.py
│   │   ├── vggish_params.py
│   │   ├── vggish_input.py
│   │   ├── vggish_postprocess.py
│   │   └── vggish_slim.py
│   ├── imgfeat_extractor
│   │   ├── __init__.py
│   │   └── efficientnet_extractor.py
│   ├── txt_extractor
│   │   └── text_requests.py
│   └── feat_extract_main.py
├── src
│   ├── model
│   │   ├── __init__.py
│   │   ├── cover_head
│   │   │   ├── __init__.py
│   │   │   ├── nasnet
│   │   │   │   ├── __init__.py
│   │   │   │   ├── nasnet_utils_test.py
│   │   │   │   └── README.md
│   │   │   ├── mobilenet_v1_eval.py
│   │   │   ├── mobilenet_v1_train.py
│   │   │   └── nets_factory.py
│   │   ├── fusion_head
│   │   │   ├── __init__.py
│   │   │   └── fusion_se.py
│   │   ├── text_head
│   │   │   ├── __init__.py
│   │   │   └── bert_model.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── base_model.py
│   │   │   └── nextvlad_bert.py
│   │   ├── video_head
│   │   │   ├── __init__.py
│   │   │   └── nextvlad.py
│   │   ├── classify_head
│   │   │   ├── __init__.py
│   │   │   ├── logistic_model.py
│   │   │   └── moe_model.py
│   │   └── image_head
│   │       ├── efficientNet
│   │       │   └── condconv
│   │       │       ├── __init__.py
│   │       │       └── efficientnet_condconv_builder.py
│   │       └── __init__.py
│   ├── dataloader
│   │   ├── __init__.py
│   │   ├── preprocess
│   │   │   ├── __init__.py
│   │   │   ├── cnn_preprocessing
│   │   │   │   ├── __init__.py
│   │   │   │   ├── lenet_preprocessing.py
│   │   │   │   ├── preprocessing_factory.py
│   │   │   │   └── cifarnet_preprocessing.py
│   │   │   ├── text_preprocess.py
│   │   │   ├── image_preprocess.py
│   │   │   ├── label_preprocess.py
│   │   │   └── frames_npy_preprocess.py
│   │   └── dataloader.py
│   └── loss
│       ├── __init__.py
│       └── loss.py
├── utils
│   ├── metrics
│   │   ├── __init__.py
│   │   ├── pr_calculator_per_tag.py
│   │   ├── pr_calculator.py
│   │   └── mean_average_precision_calculator.py
│   ├── k_fold_prepare.py
│   ├── k_fold_fusion.py
│   ├── save_best_ckpt.py
│   └── export_model.py
├── .gitattributes
├── requirement.txt
├── LICENSE
├── .gitignore
├── configs
│   ├── config.tagging.5k.yaml
│   ├── config.tagging.5k.0.yaml
│   ├── config.tagging.5k.1.yaml
│   ├── config.tagging.5k.2.yaml
│   ├── config.tagging.5k.3.yaml
│   ├── config.tagging.5k.4.yaml
│   ├── config.tagging.5k.5.yaml
│   ├── config.tagging.5k.6.yaml
│   ├── config.tagging.5k.7.yaml
│   ├── config.tagging.5k.8.yaml
│   └── config.tagging.5k.9.yaml
├── readme.md
├── init.sh
├── train.sh
├── train.py
└── infer.sh
/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/model/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/dataloader/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/utils/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/model/cover_head/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/preprocess/audio_extractor/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/dataloader/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/preprocess/imgfeat_extractor/__init__.py:
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/model/cover_head/nasnet/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/cnn_preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /src/model/fusion_head/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.fusion_head.fusion_se import SE 2 | 3 | def get_instance(name, paramters): 4 | model = {'SE': SE}[name] 5 | return model(**paramters) -------------------------------------------------------------------------------- /src/model/text_head/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.text_head.bert_model import BERT 2 | 3 | def get_instance(name, paramters): 4 | model = {'BERT': BERT}[name] 5 | return model(**paramters) -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | tqdm 3 | munch 4 | resampy 5 | soundfile 6 | moviepy==1.0.3 7 | gast==0.2.2 8 | ipython 9 | jupyter 10 | matplotlib 11 | pandas 12 | xlrd 13 | openpyxl 14 | tomorrow3 15 | -------------------------------------------------------------------------------- /src/model/models/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.models.nextvlad_bert import NextVladBERT 2 | 3 | def get_instance(name, paramters): 4 | model = {"NextVladBERT": NextVladBERT}[name] 5 | return model(paramters) -------------------------------------------------------------------------------- /src/model/video_head/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.video_head.nextvlad import NeXtVLAD 2 | 3 | def get_instance(name, paramters_dict): 4 | model = {'NeXtVLAD': NeXtVLAD}[name] 5 | return model(**paramters_dict) -------------------------------------------------------------------------------- /src/model/models/base_model.py: -------------------------------------------------------------------------------- 1 | class BaseModel(): 2 | def __init__(self, args): 3 | raise NotImplementedError 4 | def __call__(self, inputs, is_training): 5 | raise NotImplementedError 6 | 7 | def build_loss(self): 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /src/loss/__init__.py: -------------------------------------------------------------------------------- 1 | from src.loss.loss import CrossEntropyLoss 2 | from src.loss.loss import SoftmaxLoss 3 | 4 | def get_instance(name, paramters_dict): 5 | model = {'CrossEntropyLoss': CrossEntropyLoss, 6 | 'SoftmaxLoss': SoftmaxLoss}[name] 7 | return model(**paramters_dict) -------------------------------------------------------------------------------- 
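Each of the get_instance helpers above (fusion_head, text_head, models, video_head, and this loss module) follows the same registry pattern: the class is looked up by the name string taken from the YAML config and instantiated with the accompanying parameter dict. Below is a minimal sketch of how the loss factory might be driven, assuming TF 1.x and the 82-tag setup used elsewhere in the repo; the config keys and placeholder shapes are illustrative, not the exact schema of configs/config.tagging.5k.yaml.

import tensorflow as tf
from src.loss import get_instance as build_loss

# Hypothetical config fragment; the real values come from the YAML configs.
loss_cfg = {'name': 'CrossEntropyLoss', 'paramters_dict': {}}
loss_obj = build_loss(loss_cfg['name'], loss_cfg['paramters_dict'])

predictions = tf.placeholder(tf.float32, [None, 82])  # sigmoid scores per tag
labels = tf.placeholder(tf.float32, [None, 82])       # multi-hot ground truth
# calculate_loss returns a scalar tensor; label_smooth_rate is picked up via **unused_params.
loss_op = loss_obj.calculate_loss(predictions, labels, label_smooth_rate=0.1)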
/src/model/classify_head/__init__.py: -------------------------------------------------------------------------------- 1 | from src.model.classify_head.logistic_model import LogisticModel 2 | from src.model.classify_head.moe_model import MoeModel 3 | 4 | def get_instance(name, paramters_dict): 5 | model = {'LogisticModel': LogisticModel, 6 | 'MoeModel': MoeModel}[name] 7 | return model(**paramters_dict) -------------------------------------------------------------------------------- /preprocess/txt_extractor/text_requests.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | class VideoASR(): 4 | """视频ASR""" 5 | def request(self, inp): 6 | return inp 7 | 8 | class VideoOCR(): 9 | """视频OCR""" 10 | def request(self, inp): 11 | return inp 12 | 13 | class ImageOCR(): 14 | """图像OCR""" 15 | def request(self, inp): 16 | return inp 17 | 18 | if __name__ == '__main__': 19 | test_image = './test.jpg' 20 | image_ocr = ImageOCR().request(test_image) 21 | print("image_ocr: {}".format(image_ocr)) 22 | 23 | -------------------------------------------------------------------------------- /src/model/image_head/efficientNet/condconv/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /src/model/classify_head/logistic_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow.contrib.slim as slim 2 | import tensorflow as tf 3 | 4 | class LogisticModel(): 5 | """Logistic model with L2 regularization.""" 6 | def __init__(self, num_classes, l2_penalty=None): 7 | self.num_classes = num_classes 8 | self.l2_penalty = 0.0 if l2_penalty==None else l2_penalty 9 | 10 | def __call__(self, model_input): 11 | """ 12 | model_input: 'batch' x 'num_features' matrix of input features. 
13 | Returns: The dimensions of the tensor are batch_size x num_classes.""" 14 | logits = slim.fully_connected( 15 | model_input, self.num_classes, activation_fn=None, 16 | weights_regularizer=slim.l2_regularizer(self.l2_penalty), 17 | biases_regularizer=slim.l2_regularizer(self.l2_penalty), 18 | weights_initializer=slim.variance_scaling_initializer()) 19 | output = tf.nn.sigmoid(logits) 20 | return {"predictions": output, "logits": logits} 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/model/text_head/bert_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from src.model.text_head.bert_base import BertModel,BertConfig 3 | 4 | class BERT(): 5 | def __init__(self, bert_config, bert_emb_encode_size, reuse_variables=tf.AUTO_REUSE): 6 | self.reuse_variables = reuse_variables 7 | self.bert_emb_encode_size = bert_emb_encode_size 8 | self.bert_config = BertConfig(**bert_config) 9 | 10 | def __call__(self, input_ids, is_training): 11 | input_mask = tf.cast(tf.not_equal(input_ids,0),tf.int32) 12 | bert_model = BertModel(config = self.bert_config, 13 | is_training = is_training, 14 | input_ids = input_ids, 15 | input_mask = input_mask, 16 | reuse_variables = self.reuse_variables) 17 | 18 | text_features = bert_model.get_pooled_output() 19 | text_features = tf.layers.dense(text_features, self.bert_emb_encode_size, activation=None, name='text_features', reuse=self.reuse_variables) 20 | text_features = tf.layers.batch_normalization(text_features, training=is_training, reuse=self.reuse_variables) 21 | return text_features 22 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/text_preprocess.py: -------------------------------------------------------------------------------- 1 | import tokenization 2 | import numpy as np 3 | import random 4 | import tensorflow as tf 5 | import os 6 | 7 | seed = 20210627 8 | random.seed(seed) 9 | tf.set_random_seed(seed) 10 | np.random.seed(seed) 11 | os.environ["PYTHONHASHSEED"] = str(seed) 12 | 13 | 14 | class Preprocess: 15 | 16 | def __init__(self, vocab, max_len, is_training=False): 17 | 
self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab) 18 | self.max_len = max_len 19 | self.is_training = is_training 20 | 21 | def __call__(self, text, augment): 22 | with open(text) as f: 23 | data = eval(f.read().strip()) 24 | text = data['video_ocr'] + data['video_asr'] 25 | text = text.replace("|", "") 26 | if augment > 0: 27 | text = text[random.randint(0, int(max(0, len(text) - 50))):] 28 | tokens = ['[CLS]'] + self.tokenizer.tokenize(text) 29 | if augment == 2: 30 | tokens = ['[CLS]'] + [token for token in tokens[1:] if random.random() > 0.1] 31 | ids = self.tokenizer.convert_tokens_to_ids(tokens)[:self.max_len] 32 | ids = ids + [0]*(self.max_len-len(ids)) 33 | return np.array(ids).astype('int64') -------------------------------------------------------------------------------- /src/dataloader/preprocess/image_preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | from cnn_preprocessing import inception_preprocessing 6 | 7 | class Preprocess: 8 | 9 | def __init__(self, is_training, return_idx=False): 10 | self.is_training = is_training 11 | #with tf.get_default_graph(): 12 | self.path_placeholder = tf.placeholder(shape=None,dtype=tf.string) 13 | image = tf.io.read_file(self.path_placeholder) 14 | image = tf.io.decode_image(image, channels=3) 15 | self.image_shape = (224, 224, 3) 16 | #TODO(jefxiong, 对不同模型预处理要通用) 17 | image.set_shape(self.image_shape) 18 | self.image = inception_preprocessing.preprocess_image(image, 224, 224, 19 | is_training=self.is_training, 20 | add_image_summaries=False, 21 | crop_image=self.is_training) 22 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 23 | sess_config.gpu_options.allow_growth = True 24 | self.sess = tf.Session(config=sess_config) 25 | self.return_idx = return_idx 26 | 27 | def __call__(self, path, augment): 28 | if os.path.exists(path): 29 | image = self.sess.run(self.image,feed_dict={self.path_placeholder:path}) 30 | else: 31 | image = np.zeros(self.image_shape) 32 | if self.return_idx: 33 | idx = os.path.basename(path).split('.')[0] 34 | return image, idx 35 | return image 36 | -------------------------------------------------------------------------------- /utils/k_fold_prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from sklearn.model_selection import KFold 5 | 6 | 7 | def parse_ground_truth(path): 8 | dataset = {} 9 | with open(path) as f: 10 | video_feat = [] 11 | for row in f: 12 | row = row.strip() 13 | if len(row) == 0: 14 | assert len(video_feat) == 5 15 | dataset[os.path.split(video_feat[0])[-1].split(".")[0]] = video_feat 16 | video_feat = [] 17 | else: 18 | video_feat.append(row) 19 | return dataset 20 | 21 | 22 | if __name__ == "__main__": 23 | save_train, save_valid, config_path = sys.argv[3:6] 24 | train_valid_data = parse_ground_truth(sys.argv[1]) 25 | train_valid_data.update(parse_ground_truth(sys.argv[2])) 26 | videos = list(train_valid_data.keys()) 27 | kf = KFold(n_splits=10, random_state=2021, shuffle=True) 28 | for i, (train_index, valid_index) in enumerate(kf.split(videos)): 29 | with open(config_path + 'config.tagging.5k.yaml', 'r') as fin, \ 30 | open(config_path + 'config.tagging.5k.{}.yaml'.format(i), 'w') as fout: 31 | config = fin.read().replace("train.txt", 'train_{}.txt'.format(i)) 32 | fout.write(config.replace("val.txt", 'valid_{}.txt'.format(i))) 33 | 34 | with 
open(save_train.format(i), 'w') as f: 35 | for idx in train_index: 36 | f.write(u"\n".join(train_valid_data[videos[idx]]) + '\n\n') 37 | 38 | with open(save_valid.format(i), 'w') as f: 39 | for idx in valid_index: 40 | f.write(u"\n".join(train_valid_data[videos[idx]]) + '\n\n') 41 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/cnn_preprocessing/lenet_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Provides utilities for preprocessing.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | slim = tf.contrib.slim 24 | 25 | 26 | def preprocess_image(image, output_height, output_width, is_training): 27 | """Preprocesses the given image. 28 | 29 | Args: 30 | image: A `Tensor` representing an image of arbitrary size. 31 | output_height: The height of the image after preprocessing. 32 | output_width: The width of the image after preprocessing. 33 | is_training: `True` if we're preprocessing the image for training and 34 | `False` otherwise. 35 | 36 | Returns: 37 | A preprocessed image. 
38 | """ 39 | image = tf.to_float(image) 40 | image = tf.image.resize_image_with_crop_or_pad( 41 | image, output_width, output_height) 42 | image = tf.subtract(image, 128.0) 43 | image = tf.div(image, 128.0) 44 | return image 45 | -------------------------------------------------------------------------------- /utils/metrics/pr_calculator_per_tag.py: -------------------------------------------------------------------------------- 1 | #encoding: utf-8 2 | #Author: jefxiong@tencent.com 3 | 4 | from utils.metrics.pr_calculator import PRCalculator 5 | import numpy as np 6 | import time 7 | 8 | def count_func_time(func): 9 | def call_fun(*args, **kwargs): 10 | start_time = time.time() 11 | func(*args, **kwargs) 12 | end_time = time.time() 13 | print('{} cost {:.3f} sec'.format(func.__name__, end_time-start_time)) 14 | return call_fun 15 | 16 | def map_func(obj, x1, x2): 17 | obj.accumulate(x1, x2) 18 | 19 | class PRCalculatorPerTag(): 20 | def __init__(self, tag_num): 21 | self.tag_num = tag_num 22 | self.pr_calculators = [] 23 | for i in range(self.tag_num): 24 | self.pr_calculators.append(PRCalculator()) 25 | 26 | #@count_func_time 27 | def accumulate(self, predictions, actuals): 28 | """ 29 | predictions: n_example X n_classes 30 | actuals: n_example X n_classes 31 | """ 32 | #n_example X n_classes ==> n_classes * [n_example x 1] 33 | pred_per_tag_list = np.expand_dims(predictions.transpose(), -1) 34 | actuals_per_tag_list = np.expand_dims(actuals.transpose(), -1) 35 | 36 | for i in range(self.tag_num): 37 | self.pr_calculators[i].accumulate(pred_per_tag_list[i], actuals_per_tag_list[i]) 38 | #ret = list(map(map_func, self.pr_calculators, pred_per_tag_list, actuals_per_tag_list)) 39 | 40 | def get_precision_list(self, th=0.5): 41 | return [self.pr_calculators[i].get_precision_at_conf(th) for i in range(self.tag_num)] 42 | 43 | def get_recall_list(self, th=0.5): 44 | return [self.pr_calculators[i].get_recall_at_conf(th) for i in range(self.tag_num)] 45 | 46 | def clear(self): 47 | for i in range(self.tag_num): 48 | self.pr_calculators[i].clear() 49 | -------------------------------------------------------------------------------- /src/model/image_head/__init__.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.slim.nets import resnet_v2 3 | import functools 4 | import tensorflow.contrib.slim as slim 5 | 6 | import src.model.image_head.efficientNet.efficientnet_builder as efficientnet_builder 7 | 8 | networks_map={ 9 | 'resnet_v2_50': resnet_v2.resnet_v2_50, 10 | 'resnet_v2_101': resnet_v2.resnet_v2_101, 11 | 'resnet_v2_152': resnet_v2.resnet_v2_152, 12 | 'resnet_v2_200': resnet_v2.resnet_v2_200, 13 | 'efficientnet': efficientnet_builder.build_model_base, 14 | } 15 | 16 | arg_scopes_map={ 17 | 'resnet_v2_50': resnet_v2.resnet_arg_scope, 18 | 'resnet_v2_101': resnet_v2.resnet_arg_scope, 19 | 'resnet_v2_152': resnet_v2.resnet_arg_scope, 20 | 'resnet_v2_200': resnet_v2.resnet_arg_scope, 21 | } 22 | 23 | def get_network_fn(name, model_name = None): 24 | if name not in networks_map: 25 | raise ValueError('Name of network unknown %s' % name) 26 | func = networks_map.get(name, None) 27 | if model_name is not None: 28 | func = functools.partial(func, model_name = model_name) 29 | @functools.wraps(func) 30 | def network_fn(images, is_training, **kwargs): 31 | if arg_scopes_map.get(name,None) is not None: 32 | arg_scope = arg_scopes_map[name](weight_decay=1e-5) 33 | with slim.arg_scope(arg_scope): 34 | out, _ = 
func(images, num_classes=None, is_training=is_training, **kwargs) 35 | else: 36 | out, _ = func(images, is_training=is_training, **kwargs) 37 | if len(out.get_shape()) ==4: 38 | return out[:,0,0,:] #squeeze conv feat 39 | else: 40 | return out 41 | 42 | if hasattr(func, 'default_image_size'): 43 | network_fn.default_image_size = func.default_image_size 44 | 45 | return network_fn 46 | 47 | def get_instance(name, paramters): 48 | return get_network_fn(name, **paramters) 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # IPython 77 | profile_default/ 78 | ipython_config.py 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | .dmypy.json 111 | dmypy.json 112 | 113 | # Pyre type checker 114 | .pyre/ 115 | -------------------------------------------------------------------------------- /preprocess/audio_extractor/vggish_params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Global parameters for the VGGish model. 
17 | 18 | See vggish_slim.py for more information. 19 | """ 20 | 21 | # Architectural constants. 22 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. 23 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. 24 | EMBEDDING_SIZE = 128 # Size of embedding layer. 25 | 26 | # Hyperparameters used in feature and example generation. 27 | SAMPLE_RATE = 16000 28 | STFT_WINDOW_LENGTH_SECONDS = 0.025 29 | STFT_HOP_LENGTH_SECONDS = 0.010 30 | NUM_MEL_BINS = NUM_BANDS 31 | MEL_MIN_HZ = 125 32 | MEL_MAX_HZ = 7500 33 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 34 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 35 | EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. 36 | 37 | # Parameters used for embedding postprocessing. 38 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' 39 | PCA_MEANS_NAME = 'pca_means' 40 | QUANTIZE_MIN_VAL = -2.0 41 | QUANTIZE_MAX_VAL = +2.0 42 | 43 | # Hyperparameters used in training. 44 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. 45 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. 46 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. 47 | 48 | # Names of ops, tensors, and features. 49 | INPUT_OP_NAME = 'vggish/input_features' 50 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' 51 | OUTPUT_OP_NAME = 'vggish/embedding' 52 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' 53 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' 54 | -------------------------------------------------------------------------------- /utils/k_fold_fusion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import sys 4 | 5 | 6 | def load_label_dict(path): 7 | str2idx, idx2str = {}, [] 8 | with open(path, encoding="utf8") as f: 9 | for line in f.readlines(): 10 | label, index = line.strip().split('\t') 11 | str2idx[label] = int(index) 12 | idx2str.append(label) 13 | return str2idx, idx2str 14 | 15 | 16 | def json_to_array(output, str2idx): 17 | target = np.zeros(82) 18 | output = output['result'][0] 19 | for label, score in zip(output['labels'], 20 | output['scores']): 21 | target[str2idx[label]] = float(score) 22 | return target 23 | 24 | 25 | if __name__ == "__main__": 26 | num_folds = int(sys.argv[1]) 27 | label_path = sys.argv[2] # './home/tione/notebook/VideoStructuring/dataset/label_id.txt' 28 | predict_path = sys.argv[3] # './home/tione/notebook/VideoStructuring/KFoldResults/test/results_{}/tagging_5k.json' 29 | output_path = sys.argv[4] # './home/tione/notebook/VideoStructuring/results/tagging_5k.json' 30 | topk = int(sys.argv[5]) # 20 31 | 32 | str2idx, idx2str = load_label_dict(label_path) 33 | 34 | full_predict = np.zeros((5000, num_folds, 82)) 35 | for fold in range(num_folds): 36 | with open(predict_path.format(fold), encoding='utf8') as f: 37 | video_names = [] 38 | for vid, (video_name, predict) in enumerate(sorted(json.load(f).items())): 39 | video_names.append(video_name) 40 | full_predict[vid, fold, :] = json_to_array(predict, str2idx) 41 | assert len(video_names) == 5000 42 | full_predict = full_predict.mean(axis=1) 43 | 44 | full_result = {} 45 | for video_name, scores in zip(video_names, full_predict): 46 | video_result = {"result": [{"labels": [], "scores":[]}]} 47 | for score, label in sorted(zip(scores, idx2str), reverse=True)[:topk]: 48 | video_result["result"][0]["labels"].append(label) 49 | video_result["result"][0]["scores"].append("%.4f" % score) 50 | full_result[video_name] = video_result 51 | 52 | 
with open(output_path, 'w', encoding='utf8') as f: 53 | json.dump(full_result, f, ensure_ascii=False, indent=4) 54 | 55 | -------------------------------------------------------------------------------- /utils/metrics/pr_calculator.py: -------------------------------------------------------------------------------- 1 | #encoding: utf-8 2 | #Author: jefxiong@tencent.com 3 | import numpy as np 4 | 5 | class PRCalculator(): 6 | def __init__(self): 7 | # use only two threshold to save eval time 8 | self.threshold_dict={0.5:0, 0.1:1} #TODO(jefxiong, range from 0.9~0.01) 9 | self.precision = np.zeros((len(self.threshold_dict))) 10 | self.recall = np.zeros((len(self.threshold_dict))) 11 | self.accumulate_count = np.zeros((len(self.threshold_dict))) 12 | 13 | def accumulate(self, predictions, actuals): 14 | """ 15 | predictions: n_example X n_classes 16 | actuals: n_example X n_classes 17 | """ 18 | #assert isinstance(predictions, np.ndarray) 19 | #assert isinstance(actuals, np.ndarray) 20 | n_example = predictions.shape[0] 21 | 22 | precision_all = np.zeros((n_example, len(self.threshold_dict))) 23 | recall_all = np.zeros((n_example, len(self.threshold_dict))) 24 | for i in range(n_example): 25 | gt_index = np.nonzero(actuals[i])[0] 26 | for th, th_index in self.threshold_dict.items(): 27 | pred_index = np.nonzero(predictions[i]>th)[0] 28 | tp = np.sum([actuals[i][k] for k in pred_index]) 29 | precision_all[i][th_index] = tp*1.0/len(pred_index) if len(pred_index)>0 else np.nan 30 | recall_all[i][th_index] = tp*1.0/len(gt_index) if len(gt_index)>0 else np.nan 31 | 32 | 33 | valid_accumlate = (np.sum(~np.isnan(precision_all), axis=0)) != 0 34 | self.accumulate_count = self.accumulate_count + valid_accumlate 35 | 36 | precision_all = np.nansum(precision_all,axis=0)/(np.sum(~np.isnan(precision_all), axis=0)+1e-10) 37 | recall_all = np.nansum(recall_all,axis=0)/(np.sum(~np.isnan(recall_all), axis=0)+1e-10) 38 | 39 | self.precision = precision_all + self.precision 40 | self.recall = recall_all + self.recall 41 | 42 | def get_precision_at_conf(self, th=0.5): 43 | index = self.threshold_dict[th] 44 | return self.precision[index]/(1e-10+self.accumulate_count[index]) 45 | 46 | def get_recall_at_conf(self, th=0.5): 47 | index = self.threshold_dict[th] 48 | return self.recall[index]/(1e-10+self.accumulate_count[index]) 49 | 50 | def clear(self): 51 | self.accumulate_count = np.zeros((len(self.threshold_dict))) 52 | self.precision = np.zeros((len(self.threshold_dict))) 53 | self.recall = np.zeros((len(self.threshold_dict))) 54 | -------------------------------------------------------------------------------- /src/model/cover_head/nasnet/nasnet_utils_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Tests for slim.nets.nasnet.nasnet_utils.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | from nets.nasnet import nasnet_utils 24 | 25 | 26 | class NasnetUtilsTest(tf.test.TestCase): 27 | 28 | def testCalcReductionLayers(self): 29 | num_cells = 18 30 | num_reduction_layers = 2 31 | reduction_layers = nasnet_utils.calc_reduction_layers( 32 | num_cells, num_reduction_layers) 33 | self.assertEqual(len(reduction_layers), 2) 34 | self.assertEqual(reduction_layers[0], 6) 35 | self.assertEqual(reduction_layers[1], 12) 36 | 37 | def testGetChannelIndex(self): 38 | data_formats = ['NHWC', 'NCHW'] 39 | for data_format in data_formats: 40 | index = nasnet_utils.get_channel_index(data_format) 41 | correct_index = 3 if data_format == 'NHWC' else 1 42 | self.assertEqual(index, correct_index) 43 | 44 | def testGetChannelDim(self): 45 | data_formats = ['NHWC', 'NCHW'] 46 | shape = [10, 20, 30, 40] 47 | for data_format in data_formats: 48 | dim = nasnet_utils.get_channel_dim(shape, data_format) 49 | correct_dim = shape[3] if data_format == 'NHWC' else shape[1] 50 | self.assertEqual(dim, correct_dim) 51 | 52 | def testGlobalAvgPool(self): 53 | data_formats = ['NHWC', 'NCHW'] 54 | inputs = tf.placeholder(tf.float32, (5, 10, 20, 10)) 55 | for data_format in data_formats: 56 | output = nasnet_utils.global_avg_pool( 57 | inputs, data_format) 58 | self.assertEqual(output.shape, [5, 10]) 59 | 60 | 61 | if __name__ == '__main__': 62 | tf.test.main() 63 | -------------------------------------------------------------------------------- /utils/save_best_ckpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def iterate_files(folder, ftype=None): 6 | assert os.path.isdir(folder), "Path should be a folder" 7 | if isinstance(ftype, str): 8 | ftype = {ftype} 9 | elif isinstance(ftype, (list, tuple)): 10 | ftype = set(ftype) 11 | 12 | for file in os.listdir(folder): 13 | file = os.path.join(folder, file) 14 | if os.path.isfile(file) and \ 15 | (ftype is None or os.path.split(file)[-1].split(".")[-1] in ftype): 16 | yield file 17 | continue 18 | elif os.path.isdir(file): 19 | for subfile in iterate_files(file, ftype): 20 | yield os.path.join(folder, subfile) 21 | 22 | 23 | def remove_folder(folder): 24 | assert os.path.isdir(folder) 25 | for file in os.listdir(folder): 26 | file = os.path.join(folder, file) 27 | if os.path.isfile(file): 28 | os.remove(file) 29 | #print("Remove File: %s" % file) 30 | continue 31 | remove_folder(file) 32 | os.rmdir(folder) 33 | #print("Remove Foler: %s" % folder) 34 | 35 | 36 | def select_best_model(folder): 37 | best_score, best_path = 0.0, None 38 | for model in os.listdir(folder): 39 | if model.startswith("step_"): 40 | _, step, score = model.split("_") 41 | score = float(score) 42 | if score > best_score: 43 | best_score, best_path = score, model 44 | return best_path 45 | 46 | 47 | def remove_bad_model(folder): 48 | assert os.path.isdir(folder), "Path should be a folder" 49 | export = os.path.join(folder, "export") 50 | assert os.path.isdir(export), "Path should contains folder: export" 51 | 52 | best_path = select_best_model(export) 53 | #print("Best Model: %s" % best_path) 54 | best_step = int(best_path.split("_")[1]) 55 | 56 | for file in os.listdir(folder): 57 | if 
file.startswith("model.ckpt-") and \ 58 | int(file.split("-")[1].split(".")[0]) != best_step: 59 | os.remove(os.path.join(folder, file)) 60 | #print("Remove: %s" % os.path.join(folder, file)) 61 | 62 | for file in os.listdir(export): 63 | if file != best_path: 64 | remove_folder(os.path.join(export, file)) 65 | 66 | 67 | if __name__ == "__main__": 68 | remove_bad_model(sys.argv[1]) 69 | -------------------------------------------------------------------------------- /src/model/classify_head/moe_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow.contrib.slim as slim 2 | import tensorflow as tf 3 | 4 | class MoeModel(): 5 | """A softmax over a mixture of logistic models (with L2 regularization).""" 6 | def __init__(self, num_classes, num_mixtures=4, l2_penalty=0.0): 7 | self.vocab_size = num_classes 8 | self.num_mixtures = num_mixtures 9 | self.l2_penalty = l2_penalty 10 | 11 | def __call__(self, model_input): 12 | """Creates a Mixture of (Logistic) Experts model. 13 | The model consists of a per-class softmax distribution over a 14 | configurable number of logistic classifiers. One of the classifiers in the 15 | mixture is not trained, and always predicts 0. 16 | Args: 17 | model_input: 'batch_size' x 'num_features' matrix of input features. 18 | vocab_size: The number of classes in the dataset. 19 | num_mixtures: The number of mixtures (excluding a dummy 'expert' that 20 | always predicts the non-existence of an entity). 21 | l2_penalty: How much to penalize the squared magnitudes of parameter 22 | values. 23 | Returns: 24 | A dictionary with a tensor containing the probability predictions of the 25 | model in the 'predictions' key. The dimensions of the tensor are 26 | batch_size x num_classes. 
27 | """ 28 | gate_activations = slim.fully_connected( 29 | model_input, 30 | self.vocab_size * (self.num_mixtures + 1), 31 | activation_fn=None, 32 | biases_initializer=None, 33 | weights_regularizer=slim.l2_regularizer(self.l2_penalty), 34 | scope="gates") 35 | expert_activations = slim.fully_connected( 36 | model_input, 37 | self.vocab_size * self.num_mixtures, 38 | activation_fn=None, 39 | weights_regularizer=slim.l2_regularizer(self.l2_penalty), 40 | scope="experts") 41 | 42 | gating_distribution = tf.nn.softmax(tf.reshape( 43 | gate_activations, 44 | [-1, self.num_mixtures + 1])) # (Batch * #Labels) x (num_mixtures + 1) 45 | expert_distribution = tf.nn.sigmoid(tf.reshape( 46 | expert_activations, 47 | [-1, self.num_mixtures])) # (Batch * #Labels) x num_mixtures 48 | 49 | final_probabilities_by_class_and_batch = tf.reduce_sum( 50 | gating_distribution[:, :self.num_mixtures] * expert_distribution, 1) 51 | final_probabilities = tf.reshape(final_probabilities_by_class_and_batch, 52 | [-1, self.vocab_size]) 53 | return {"predictions": final_probabilities} 54 | -------------------------------------------------------------------------------- /src/model/fusion_head/fusion_se.py: -------------------------------------------------------------------------------- 1 | import tensorflow.contrib.slim as slim 2 | import tensorflow as tf 3 | 4 | class SE(): 5 | """Dropout + Channel Attention 6 | """ 7 | def __init__(self, drop_rate, hidden1_size, gating_reduction, gating_last_bn=False): 8 | self.drop_rate = drop_rate 9 | self.hidden1_size = hidden1_size 10 | self.gating_reduction = gating_reduction 11 | self.gating_last_bn = gating_last_bn 12 | self.expansion = 1.5 13 | 14 | def __call__(self, input_list, is_training): 15 | #features = [] 16 | #for feature in input_list: 17 | # feature = slim.dropout(feature, keep_prob=1.0 - self.drop_rate, is_training=is_training) 18 | # features.append(slim.fully_connected(feature, 1024, activation_fn=None)) 19 | concat_feat = tf.concat(input_list, 1) 20 | concat_feat = slim.dropout(concat_feat, keep_prob=1. 
- self.drop_rate, is_training=is_training, scope="concat_feat_dropout") 21 | concat_feat_dim = concat_feat.get_shape().as_list()[1] 22 | 23 | hidden1_weights = tf.get_variable("hidden1_weights",[concat_feat_dim, self.hidden1_size], 24 | initializer=slim.variance_scaling_initializer()) 25 | activation = tf.matmul(concat_feat, hidden1_weights) 26 | activation = slim.batch_norm(activation,center=True,scale=True, 27 | is_training=is_training,scope="hidden1_bn",fused=False) 28 | 29 | gating_weights_1 = tf.get_variable("gating_weights_1", 30 | [self.hidden1_size, self.hidden1_size // self.gating_reduction], 31 | initializer=slim.variance_scaling_initializer()) 32 | 33 | gates = tf.matmul(activation, gating_weights_1) 34 | 35 | gates = slim.batch_norm(gates,center=True,scale=True,is_training=is_training, 36 | activation_fn=slim.nn.relu, scope="gating_bn") 37 | gating_weights_2 = tf.get_variable("gating_weights_2", 38 | [self.hidden1_size // self.gating_reduction, self.hidden1_size], 39 | initializer=slim.variance_scaling_initializer() 40 | ) 41 | gates = tf.matmul(gates, gating_weights_2) 42 | if self.gating_last_bn: 43 | gates = slim.batch_norm(gates, center=True, scale=True, is_training=is_training, scope="gating_last_bn") 44 | 45 | gates = tf.sigmoid(gates) 46 | #tf.summary.histogram("final_gates", gates) 47 | activation = tf.multiply(activation, gates) 48 | return activation 49 | -------------------------------------------------------------------------------- /src/model/cover_head/nasnet/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow-Slim NASNet-A Implementation/Checkpoints 2 | This directory contains the code for the NASNet-A model from the paper 3 | [Learning Transferable Architectures for Scalable Image Recognition](https://arxiv.org/abs/1707.07012) by Zoph et al. 4 | In nasnet.py there are three different configurations of NASNet-A that are implementented. One of the models is the NASNet-A built for CIFAR-10 and the 5 | other two are variants of NASNet-A trained on ImageNet, which are listed below. 6 | 7 | # Pre-Trained Models 8 | Two NASNet-A checkpoints are available that have been trained on the 9 | [ILSVRC-2012-CLS](http://www.image-net.org/challenges/LSVRC/2012/) 10 | image classification dataset. Accuracies were computed by evaluating using a single image crop. 11 | 12 | Model Checkpoint | Million MACs | Million Parameters | Top-1 Accuracy| Top-5 Accuracy | 13 | :----:|:------------:|:----------:|:-------:|:-------:| 14 | [NASNet-A_Mobile_224](https://storage.googleapis.com/download.tensorflow.org/models/nasnet-a_mobile_04_10_2017.tar.gz)|564|5.3|74.0|91.6| 15 | [NASNet-A_Large_331](https://storage.googleapis.com/download.tensorflow.org/models/nasnet-a_large_04_10_2017.tar.gz)|23800|88.9|82.7|96.2| 16 | 17 | 18 | Here is an example of how to download the NASNet-A_Mobile_224 checkpoint. The way to download the NASNet-A_Large_331 is the same. 19 | 20 | ```shell 21 | CHECKPOINT_DIR=/tmp/checkpoints 22 | mkdir ${CHECKPOINT_DIR} 23 | cd ${CHECKPOINT_DIR} 24 | wget https://storage.googleapis.com/download.tensorflow.org/models/nasnet-a_mobile_04_10_2017.tar.gz 25 | tar -xvf nasnet-a_mobile_04_10_2017.tar.gz 26 | rm nasnet-a_mobile_04_10_2017.tar.gz 27 | ``` 28 | More information on integrating NASNet Models into your project can be found at the [TF-Slim Image Classification Library](https://github.com/tensorflow/models/blob/master/research/slim/README.md). 
29 | 30 | To get started running models on-device go to [TensorFlow Mobile](https://www.tensorflow.org/mobile/). 31 | 32 | ## Sample Commands for using NASNet-A Mobile and Large Checkpoints for Inference 33 | ------- 34 | Run eval with the NASNet-A mobile ImageNet model 35 | 36 | ```shell 37 | DATASET_DIR=/tmp/imagenet 38 | EVAL_DIR=/tmp/tfmodel/eval 39 | CHECKPOINT_DIR=/tmp/checkpoints/model.ckpt 40 | python tensorflow_models/research/slim/eval_image_classifier \ 41 | --checkpoint_path=${CHECKPOINT_DIR} \ 42 | --eval_dir=${EVAL_DIR} \ 43 | --dataset_dir=${DATASET_DIR} \ 44 | --dataset_name=imagenet \ 45 | --dataset_split_name=validation \ 46 | --model_name=nasnet_mobile \ 47 | --eval_image_size=224 48 | ``` 49 | 50 | Run eval with the NASNet-A large ImageNet model 51 | 52 | ```shell 53 | DATASET_DIR=/tmp/imagenet 54 | EVAL_DIR=/tmp/tfmodel/eval 55 | CHECKPOINT_DIR=/tmp/checkpoints/model.ckpt 56 | python tensorflow_models/research/slim/eval_image_classifier \ 57 | --checkpoint_path=${CHECKPOINT_DIR} \ 58 | --eval_dir=${EVAL_DIR} \ 59 | --dataset_dir=${DATASET_DIR} \ 60 | --dataset_name=imagenet \ 61 | --dataset_split_name=validation \ 62 | --model_name=nasnet_large \ 63 | --eval_image_size=331 64 | ``` 65 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/label_preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import codecs 3 | 4 | def extract_dict(dict_file): 5 | index_to_tag = {} 6 | tag_to_index = {} 7 | for i, line in enumerate(codecs.open(dict_file, 'r', encoding='utf-8')): 8 | line = line.strip() 9 | if '\t' in line: 10 | index, tag = line.split('\t')[:2] 11 | elif ' ' in line: 12 | index, tag = i, line.rsplit(' ', 1)[0] 13 | else: 14 | index, tag = i, line 15 | 16 | try: 17 | index = int(index) 18 | except: 19 | index, tag = int(tag), index 20 | 21 | index_to_tag[index] = tag 22 | tag_to_index[tag] = index 23 | return index_to_tag, tag_to_index 24 | 25 | class Preprocess_index_indentity: 26 | 27 | def __init__(self, 28 | index_dict, 29 | label_num, 30 | sep_token=',', 31 | is_training=False): 32 | self.index_to_tag,self.tag_to_index = extract_dict(index_dict) 33 | self.label_num = label_num 34 | self.sep_token = sep_token 35 | self.is_training = is_training 36 | 37 | def __call__(self, index_str): 38 | index_lst = index_str.split(self.sep_token) 39 | index_lst = [int(index) for index in index_lst] 40 | for index in index_lst: 41 | assert index in self.index_to_tag 42 | return np.array(index_lst).astype('int32') 43 | 44 | class Preprocess_index_sparse_to_dense: 45 | 46 | def __init__(self, 47 | index_dict, 48 | sep_token=',', 49 | is_training=False): 50 | self.index_to_tag,self.tag_to_index = extract_dict(index_dict) 51 | self.sep_token = sep_token 52 | self.is_training = is_training 53 | self.max_index = 0 54 | for index in self.index_to_tag: 55 | self.max_index = max(index, self.max_index) 56 | self.seq_size = self.max_index + 1 57 | self.label_num = self.seq_size 58 | 59 | def __call__(self, index_str): 60 | dense_array = np.zeros(self.seq_size) 61 | index_lst = index_str.split(self.sep_token) 62 | index_lst = [int(index) for index in index_lst] 63 | for index in index_lst: 64 | if index == -1: 65 | continue 66 | assert index in self.index_to_tag 67 | dense_array[index] = 1.0 68 | return dense_array.astype('float32') 69 | 70 | class Preprocess_label_sparse_to_dense: 71 | 72 | def __init__(self, 73 | index_dict, 74 | sep_token=',', 75 | 
is_training=False): 76 | self.index_to_tag,self.tag_to_index = extract_dict(index_dict) 77 | self.sep_token = sep_token 78 | self.is_training = is_training 79 | self.max_index = 0 80 | for index in self.index_to_tag: 81 | self.max_index = max(index, self.max_index) 82 | self.seq_size = self.max_index + 1 83 | self.label_num = self.seq_size 84 | 85 | def __call__(self, index_str, augment): 86 | dense_array = np.zeros(self.seq_size) 87 | label_lst = index_str.split(self.sep_token) 88 | for label in label_lst: 89 | if label in self.tag_to_index: 90 | index = self.tag_to_index[label] 91 | dense_array[index] = 1.0 92 | return dense_array.astype('float32') 93 | -------------------------------------------------------------------------------- /src/model/video_head/nextvlad.py: -------------------------------------------------------------------------------- 1 | import tensorflow.contrib.slim as slim 2 | import tensorflow as tf 3 | 4 | class NeXtVLAD(): 5 | def __init__(self, feature_size, max_frames, nextvlad_cluster_size, expansion, groups, directly=False): 6 | self.feature_size = feature_size 7 | self.max_frames = max_frames 8 | self.nextvlad_cluster_size = nextvlad_cluster_size 9 | self.expansion = expansion 10 | self.groups = groups 11 | self.directly = directly 12 | 13 | def __call__(self, input, is_training, mask=None): 14 | input = slim.fully_connected(input, self.expansion * self.feature_size, activation_fn=None, 15 | weights_initializer=slim.variance_scaling_initializer()) 16 | 17 | attention = slim.fully_connected(input, self.groups, activation_fn=tf.nn.sigmoid, 18 | weights_initializer=slim.variance_scaling_initializer()) 19 | if mask is not None: 20 | attention = tf.multiply(attention, tf.expand_dims(mask, -1)) 21 | attention = tf.reshape(attention, [-1, self.max_frames*self.groups, 1]) 22 | feature_size = self.expansion * self.feature_size // self.groups 23 | 24 | cluster_weights = tf.get_variable("cluster_weights", 25 | [self.expansion*self.feature_size, self.groups*self.nextvlad_cluster_size], 26 | initializer=slim.variance_scaling_initializer() 27 | ) 28 | 29 | reshaped_input = tf.reshape(input, [-1, self.expansion * self.feature_size]) 30 | activation = tf.matmul(reshaped_input, cluster_weights) 31 | 32 | activation = slim.batch_norm( 33 | activation, 34 | center=True, 35 | scale=True, 36 | is_training=is_training, 37 | scope="cluster_bn", 38 | fused=False) 39 | 40 | activation = tf.reshape(activation, [-1, self.max_frames * self.groups, self.nextvlad_cluster_size]) 41 | activation = tf.nn.softmax(activation, axis=-1) 42 | activation = tf.multiply(activation, attention) 43 | # tf.summary.histogram("cluster_output", activation) 44 | a_sum = tf.reduce_sum(activation, -2, keep_dims=True) 45 | 46 | cluster_weights2 = tf.get_variable("cluster_weights2", 47 | [1, feature_size, self.nextvlad_cluster_size], 48 | initializer=slim.variance_scaling_initializer() 49 | ) 50 | a = tf.multiply(a_sum, cluster_weights2) 51 | 52 | activation = tf.transpose(activation, perm=[0, 2, 1]) 53 | 54 | reshaped_input = tf.reshape(input, [-1, self.max_frames * self.groups, feature_size]) 55 | vlad = tf.matmul(activation, reshaped_input) 56 | vlad = tf.transpose(vlad, perm=[0, 2, 1]) 57 | vlad = tf.subtract(vlad, a) 58 | 59 | vlad = tf.nn.l2_normalize(vlad, 1) 60 | 61 | vlad = tf.reshape(vlad, [-1, self.nextvlad_cluster_size * feature_size]) 62 | #return tf.reshape(vlad, (-1, 16, self.nextvlad_cluster_size * feature_size // 16)) 63 | vlad = slim.batch_norm(vlad, 64 | center=True, 65 | scale=True, 66 | 
is_training = is_training, 67 | scope="vlad_bn", 68 | fused=False) 69 | return vlad 70 | -------------------------------------------------------------------------------- /preprocess/feat_extract_main.py: -------------------------------------------------------------------------------- 1 | #encoding: utf-8 2 | import sys,os 3 | sys.path.append(os.getcwd()) 4 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 5 | 6 | import time 7 | import argparse 8 | import tqdm 9 | import random 10 | import glob 11 | import traceback 12 | 13 | from multimodal_feature_extract import MultiModalFeatureExtract 14 | 15 | 16 | def process_file(file_path, frame_npy_path, audio_npy_path, text_txt_path, image_jpg_path): 17 | if not os.path.exists(file_path): 18 | return 19 | try: 20 | print(file_path) 21 | gen.extract_feat(file_path, frame_npy_path, audio_npy_path, text_txt_path, image_jpg_path) 22 | except Exception as e: 23 | print(traceback.format_exc()) 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--test_files_dir', default=None,type=str) 28 | parser.add_argument('--postfix', default='mp4', type=str) 29 | parser.add_argument('--frame_npy_folder', default='dataset/frame_npy', type=str) 30 | parser.add_argument('--audio_npy_folder', default='dataset/audio_npy', type=str) 31 | parser.add_argument('--image_jpg_folder', default='dataset/image_jpg', type=str) 32 | parser.add_argument('--text_txt_folder', default='dataset/text_txt', type=str) 33 | parser.add_argument('--datafile_path', default='dataset/datafile.txt') 34 | 35 | parser.add_argument('--extract_type', default=0, type=int) #0:ALL #1:VIDEO #2: AUDIO #3: TEXT 36 | 37 | parser.add_argument('--image_batch_size', default=32, type=int) 38 | parser.add_argument('--imgfeat_extractor', default='Youtube8M', type=str) 39 | parser.add_argument('--do_logging', default=0, type=int) 40 | 41 | args = parser.parse_args() 42 | os.makedirs(args.frame_npy_folder, exist_ok=True) 43 | os.makedirs(args.audio_npy_folder, exist_ok=True) 44 | os.makedirs(args.text_txt_folder, exist_ok=True) 45 | os.makedirs(args.image_jpg_folder, exist_ok=True) 46 | gen = MultiModalFeatureExtract(batch_size = args.image_batch_size, 47 | imgfeat_extractor = args.imgfeat_extractor, 48 | extract_video = args.extract_type==0 or args.extract_type==1, 49 | extract_audio = args.extract_type==0 or args.extract_type==2, 50 | extract_text = args.extract_type==0 or args.extract_type==3) 51 | 52 | file_paths = glob.glob(args.test_files_dir+'/*.'+args.postfix) 53 | random.shuffle(file_paths) 54 | files = tqdm.tqdm(file_paths, total=len(file_paths)) if args.do_logging == 1 else file_paths 55 | for file_path in files: 56 | vid = os.path.basename(file_path).split('.m')[0] 57 | frame_npy_path = os.path.join(args.frame_npy_folder, vid+'.npy') 58 | audio_npy_path = os.path.join(args.audio_npy_folder, vid+'.npy') 59 | image_jpg_path = os.path.join(args.image_jpg_folder, vid+'.jpg') 60 | text_txt_path = os.path.join(args.text_txt_folder, vid+'.txt') 61 | if args.extract_type == 1: 62 | audio_npy_path, text_txt_path, image_jpg_path = None, None, None 63 | elif args.extract_type == 2: 64 | frame_npy_path, text_txt_path, image_jpg_path = None, None, None 65 | elif args.extract_type == 3: 66 | frame_npy_path, audio_npy_path, image_jpg_path = None, None, None 67 | elif args.extract_type ==4: 68 | frame_npy_path, audio_npy_path, text_txt_path = None, None, None 69 | process_file(file_path, frame_npy_path, audio_npy_path, text_txt_path, image_jpg_path) 70 | 
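# Illustrative invocation of this script (a sketch, not taken from the repo's docs;
# dataset/videos is an assumed input directory, the other paths match the argparse defaults above):
#   python preprocess/feat_extract_main.py \
#       --test_files_dir dataset/videos --postfix mp4 \
#       --frame_npy_folder dataset/frame_npy --audio_npy_folder dataset/audio_npy \
#       --image_jpg_folder dataset/image_jpg --text_txt_folder dataset/text_txt \
#       --extract_type 0 --image_batch_size 32 --imgfeat_extractor Youtube8M
# extract_type selects the modalities to extract: 0 = all, 1 = video frames, 2 = audio, 3 = text.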
-------------------------------------------------------------------------------- /src/dataloader/preprocess/cnn_preprocessing/preprocessing_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains a factory for building various models.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | from preprocessing import cifarnet_preprocessing 24 | from preprocessing import inception_preprocessing 25 | from preprocessing import lenet_preprocessing 26 | from preprocessing import vgg_preprocessing 27 | 28 | slim = tf.contrib.slim 29 | 30 | 31 | def get_preprocessing(name, is_training=False): 32 | """Returns preprocessing_fn(image, height, width, **kwargs). 33 | 34 | Args: 35 | name: The name of the preprocessing function. 36 | is_training: `True` if the model is being used for training and `False` 37 | otherwise. 38 | 39 | Returns: 40 | preprocessing_fn: A function that preprocessing a single image (pre-batch). 41 | It has the following signature: 42 | image = preprocessing_fn(image, output_height, output_width, ...). 43 | 44 | Raises: 45 | ValueError: If Preprocessing `name` is not recognized. 
46 | """ 47 | preprocessing_fn_map = { 48 | 'cifarnet': cifarnet_preprocessing, 49 | 'inception': inception_preprocessing, 50 | 'inception_v1': inception_preprocessing, 51 | 'inception_v2': inception_preprocessing, 52 | 'inception_v3': inception_preprocessing, 53 | 'inception_v4': inception_preprocessing, 54 | 'inception_resnet_v2': inception_preprocessing, 55 | 'lenet': lenet_preprocessing, 56 | 'mobilenet_v1': inception_preprocessing, 57 | 'mobilenet_v2': inception_preprocessing, 58 | 'mobilenet_v2_035': inception_preprocessing, 59 | 'mobilenet_v2_140': inception_preprocessing, 60 | 'nasnet_mobile': inception_preprocessing, 61 | 'nasnet_large': inception_preprocessing, 62 | 'pnasnet_mobile': inception_preprocessing, 63 | 'pnasnet_large': inception_preprocessing, 64 | 'resnet_v1_50': vgg_preprocessing, 65 | 'resnet_v1_101': vgg_preprocessing, 66 | 'resnet_v1_152': vgg_preprocessing, 67 | 'resnet_v1_200': vgg_preprocessing, 68 | 'resnet_v2_50': vgg_preprocessing, 69 | 'resnet_v2_101': vgg_preprocessing, 70 | 'resnet_v2_152': vgg_preprocessing, 71 | 'resnet_v2_200': vgg_preprocessing, 72 | 'densenet121': vgg_preprocessing, 73 | 'densenet161': vgg_preprocessing, 74 | 'densenet169': vgg_preprocessing, 75 | 'vgg': vgg_preprocessing, 76 | 'vgg_a': vgg_preprocessing, 77 | 'vgg_16': vgg_preprocessing, 78 | 'vgg_19': vgg_preprocessing, 79 | } 80 | 81 | if name not in preprocessing_fn_map: 82 | raise ValueError('Preprocessing name [%s] was not recognized' % name) 83 | 84 | def preprocessing_fn(image, output_height, output_width, **kwargs): 85 | return preprocessing_fn_map[name].preprocess_image( 86 | image, output_height, output_width, is_training=is_training, **kwargs) 87 | 88 | return preprocessing_fn 89 | -------------------------------------------------------------------------------- /src/loss/loss.py: -------------------------------------------------------------------------------- 1 | """Provides definitions for non-regularized training or test losses.""" 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class BaseLoss(object): 7 | """Inherit from this class when implementing new losses.""" 8 | 9 | def calculate_loss(self, unused_predictions, unused_labels, **unused_params): 10 | """Calculates the average loss of the examples in a mini-batch. 11 | Args: 12 | unused_predictions: a 2-d tensor storing the prediction scores, in which 13 | each row represents a sample in the mini-batch and each column 14 | represents a class. 15 | unused_labels: a 2-d tensor storing the labels, which has the same shape 16 | as the unused_predictions. The labels must be in the range of 0 and 1. 17 | unused_params: loss specific parameters. 18 | Returns: 19 | A scalar loss tensor. 20 | """ 21 | raise NotImplementedError() 22 | 23 | 24 | class CrossEntropyLoss(BaseLoss): 25 | """Calculate the cross entropy loss between the predictions and labels. 
26 | """ 27 | 28 | def calculate_loss(self, predictions, labels, **unused_params): 29 | with tf.name_scope("loss_xent"): 30 | epsilon = 1e-8 31 | label_smooth_rate = unused_params.get('label_smooth_rate', 0.0) 32 | float_labels = tf.cast(labels, tf.float32)*(1.0-label_smooth_rate) + \ 33 | (1.0-tf.cast(labels, tf.float32)) * label_smooth_rate 34 | 35 | cross_entropy_loss = float_labels * tf.log(predictions + epsilon) + ( 36 | 1 - float_labels) * tf.log(1 - predictions + epsilon) 37 | cross_entropy_loss = tf.negative(cross_entropy_loss) 38 | alpha = unused_params.get('loss_weight', 1.0) #alpha shape=[batch_size] 39 | return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1)*alpha) 40 | 41 | 42 | class HingeLoss(BaseLoss): 43 | """Calculate the hinge loss between the predictions and labels. 44 | Note the subgradient is used in the backpropagation, and thus the optimization 45 | may converge slower. The predictions trained by the hinge loss are between -1 46 | and +1. 47 | """ 48 | 49 | def calculate_loss(self, predictions, labels, b=1.0, **unused_params): 50 | with tf.name_scope("loss_hinge"): 51 | float_labels = tf.cast(labels, tf.float32) 52 | all_zeros = tf.zeros(tf.shape(float_labels), dtype=tf.float32) 53 | all_ones = tf.ones(tf.shape(float_labels), dtype=tf.float32) 54 | sign_labels = tf.subtract(tf.scalar_mul(2, float_labels), all_ones) 55 | hinge_loss = tf.maximum( 56 | all_zeros, tf.scalar_mul(b, all_ones) - sign_labels * predictions) 57 | return tf.reduce_mean(tf.reduce_sum(hinge_loss, 1)) 58 | 59 | 60 | class SoftmaxLoss(BaseLoss): 61 | """Calculate the softmax loss between the predictions and labels. 62 | The function calculates the loss in the following way: first we feed the 63 | predictions to the softmax activation function and then we calculate 64 | the minus linear dot product between the logged softmax activations and the 65 | normalized ground truth label. 66 | It is an extension to the one-hot label. It allows for more than one positive 67 | labels for each sample. 
68 | """ 69 | 70 | def calculate_loss(self, predictions, labels, **unused_params): 71 | with tf.name_scope("loss_softmax"): 72 | epsilon = 1e-8 73 | float_labels = tf.cast(labels, tf.float32) 74 | # l1 normalization (labels are no less than 0) 75 | label_rowsum = tf.maximum( 76 | tf.reduce_sum(float_labels, 1, keep_dims=True), 77 | epsilon) 78 | norm_float_labels = tf.div(float_labels, label_rowsum) 79 | softmax_outputs = tf.nn.softmax(predictions) 80 | softmax_loss = tf.negative(tf.reduce_sum( 81 | tf.multiply(norm_float_labels, tf.log(softmax_outputs)), 1)) 82 | return tf.reduce_mean(softmax_loss) 83 | -------------------------------------------------------------------------------- /src/dataloader/dataloader.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import yaml 3 | import tensorflow as tf 4 | import os,sys 5 | sys.path.append(os.getcwd()) 6 | from src.dataloader.data_generator import Data_Generator 7 | 8 | class Data_Pipeline: 9 | 10 | def __init__(self, data_config): 11 | 12 | self.data_config = data_config 13 | self.batch_size = data_config['batch_size'] 14 | self.data_generator = Data_Generator(data_config=self.data_config) 15 | self.sample_generator = self.data_generator.get_train_sample_generator 16 | self.get_valid_sample_generator_dict = self.data_generator.get_valid_sample_generator_dict 17 | self.label_num_dict = self.data_generator.label_num_dict 18 | self.dname_string_list = self.data_generator.dname_string_list 19 | self.data_shape_list = self.data_generator.data_shape_list 20 | 21 | self.data_num = len(self.dname_string_list) 22 | self.dtype_map_dict = {'bool':tf.bool, 23 | 'int16':tf.int16, 24 | 'int32': tf.int32, 25 | 'int64': tf.int64, 26 | 'float16':tf.float16, 27 | 'float32': tf.float32, 28 | 'float64': tf.float64, 29 | 'string': tf.string} 30 | self.dtype_list = [self.dtype_map_dict[string] for string in self.data_generator.dtype_string_list] 31 | self.dataset = tf.data.Dataset.from_generator(self.sample_generator, 32 | tuple(self.dtype_list), 33 | tuple(self.data_shape_list)) 34 | self.dataset = self.dataset.batch(self.batch_size).prefetch(20) 35 | self.iterator = self.dataset.make_initializable_iterator() 36 | self.data_op_lst = self.iterator.get_next() 37 | self.name_to_data_op = {} 38 | self.data_op_list = [] 39 | for index in range(self.data_num): 40 | name = self.dname_string_list[index] 41 | self.name_to_data_op[name] = self.data_op_lst[index] 42 | self.data_op_list.append(self.name_to_data_op[name]) 43 | 44 | if __name__ == '__main__': 45 | import argparse 46 | import time 47 | 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('--data_config',type=str) 50 | args = parser.parse_args() 51 | 52 | data_config = yaml.load(open(args.data_config)) 53 | data_pipeline = Data_Pipeline(data_config = data_config['DatasetConfig']) 54 | 55 | for name in data_pipeline.name_to_data_op: 56 | print(name) 57 | print(data_pipeline.name_to_data_op[name]) 58 | 59 | Sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 60 | Sess_config.gpu_options.allow_growth = True 61 | with tf.Session(config=Sess_config) as sess: 62 | sess.run(data_pipeline.iterator.initializer) 63 | sess.run(tf.local_variables_initializer()) 64 | sess.run(tf.global_variables_initializer()) 65 | for _ in range(10): 66 | print(data_pipeline.label_num_dict) 67 | start_time = time.time() 68 | data_list = sess.run(data_pipeline.data_op_list) 69 | for data,name in 
zip(data_list,data_pipeline.dname_string_list): 70 | print(name,data.shape) 71 | #time.sleep(0.5) 72 | end_time = time.time() 73 | print(end_time-start_time) 74 | 75 | def valid(): 76 | valid_sample_generator_dict = data_pipeline.get_valid_sample_generator_dict() 77 | for source_name,generator in valid_sample_generator_dict.items(): 78 | for sample in generator: 79 | for output_name, x in sample.items(): 80 | print(source_name, output_name,x.shape) 81 | valid() 82 | -------------------------------------------------------------------------------- /preprocess/audio_extractor/vggish_input.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Compute input examples for VGGish from audio waveform.""" 17 | 18 | import numpy as np 19 | import resampy 20 | 21 | from preprocess.audio_extractor import mel_features 22 | 23 | from preprocess.audio_extractor import vggish_params 24 | 25 | import soundfile as sf 26 | 27 | 28 | def waveform_to_examples(data, sample_rate): 29 | """Converts audio waveform into an array of examples for VGGish. 30 | 31 | Args: 32 | data: np.array of either one dimension (mono) or two dimensions 33 | (multi-channel, with the outer dimension representing channels). 34 | Each sample is generally expected to lie in the range [-1.0, +1.0], 35 | although this is not required. 36 | sample_rate: Sample rate of data. 37 | 38 | Returns: 39 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents 40 | a sequence of examples, each of which contains a patch of log mel 41 | spectrogram, covering num_frames frames of audio and num_bands mel frequency 42 | bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. 43 | """ 44 | # Convert to mono. 45 | if len(data.shape) > 1: 46 | data = np.mean(data, axis=1) 47 | # Resample to the rate assumed by VGGish. 48 | if sample_rate != vggish_params.SAMPLE_RATE: 49 | data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) 50 | 51 | # Compute log mel spectrogram features. 52 | log_mel = mel_features.log_mel_spectrogram( 53 | data, 54 | audio_sample_rate=vggish_params.SAMPLE_RATE, 55 | log_offset=vggish_params.LOG_OFFSET, 56 | window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, 57 | hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, 58 | num_mel_bins=vggish_params.NUM_MEL_BINS, 59 | lower_edge_hertz=vggish_params.MEL_MIN_HZ, 60 | upper_edge_hertz=vggish_params.MEL_MAX_HZ) 61 | 62 | # Frame features into examples. 
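  # A clarifying note (assuming the standard VGGish constants: 10 ms STFT hop,
  # 0.96 s example window, 0.96 s example hop): the framing below yields
  # non-overlapping examples of 96 log-mel frames each; the exact sizes follow
  # from the values defined in vggish_params.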
63 | features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS 64 | example_window_length = int(round( 65 | vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) 66 | example_hop_length = int(round( 67 | vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) 68 | log_mel_examples = mel_features.frame( 69 | log_mel, 70 | window_length=example_window_length, 71 | hop_length=example_hop_length) 72 | return log_mel_examples 73 | 74 | 75 | def wavfile_to_examples(wav_file): 76 | """Convenience wrapper around waveform_to_examples() for a common WAV format. 77 | 78 | Args: 79 | wav_file: String path to a file, or a file-like object. The file 80 | is assumed to contain WAV audio data with signed 16-bit PCM samples. 81 | 82 | Returns: 83 | See waveform_to_examples. 84 | """ 85 | #wav_data, sr = sf.read(wav_file) 86 | #print (wav_data) 87 | wav_data, sr = sf.read(wav_file, dtype='int16') 88 | #print (wav_data) 89 | #print (sr) 90 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype 91 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] 92 | #print (samples) 93 | return waveform_to_examples(samples, sr) 94 | -------------------------------------------------------------------------------- /preprocess/imgfeat_extractor/efficientnet_extractor.py: -------------------------------------------------------------------------------- 1 | # Author: wuxsmail@163.com 2 | 3 | import time 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | import cv2 8 | 9 | from efficientnet.tfkeras import EfficientNetB5 10 | from efficientnet.tfkeras import center_crop_and_resize, preprocess_input 11 | 12 | 13 | def center_crop_and_resize(frame, size): 14 | """change shape of a frame with shape (h, w, 3) into shape (size, size, 3) 15 | """ 16 | # prepare_frame 17 | assert len(frame.shape) == 3 and frame.shape[-1] == 3 18 | if frame.dtype != np.uint8: 19 | frame = frame.astype(np.uint8) 20 | 21 | # center crop process 22 | y, x = frame.shape[0:2] 23 | if x != y: 24 | min_dim = min(y, x) 25 | start_x = (x // 2) - (min_dim // 2) 26 | start_y = (y // 2) - (min_dim // 2) 27 | frame = frame[start_y:start_y+min_dim,start_x:start_x+min_dim] 28 | 29 | # resize process 30 | h, w = frame.shape[:2] 31 | if h * w < size ** 2: 32 | frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_CUBIC) 33 | elif not (h == w == size): 34 | frame = cv2.resize(frame, (size, size), interpolation=cv2.INTER_AREA) 35 | return np.expand_dims(frame, 0).astype(np.float32) 36 | 37 | 38 | class EfficientNetExtractor(object): 39 | """Extracts EfficientNet features for RGB frames. 
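    A minimal usage sketch (the frame list and batch size are illustrative;
    frames are H x W x 3 RGB arrays):
      extractor = EfficientNetExtractor(img_size=456)
      feats = extractor.extract_rgb_frame_features_list(frames, batch_size=8)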
40 | """ 41 | 42 | def __init__(self, img_size=456, max_pooling=True): 43 | self.index = 0 44 | config = tf.ConfigProto() 45 | config.gpu_options.allow_growth = True 46 | config.gpu_options.per_process_gpu_memory_fraction = 1.0 47 | self.session = tf.compat.v1.Session(config=config) 48 | self.graph = tf.compat.v1.get_default_graph() 49 | tf.compat.v1.keras.backend.set_session(self.session) 50 | self.model = EfficientNetB5( 51 | weights='pretrained/efficientnet/efficientnet-b5_noisy-student_notop.h5', 52 | include_top=False, 53 | pooling='avg') 54 | self.img_size = img_size 55 | self.block7 = self.model.output 56 | self.block6 = self.model.layers[-48].output 57 | 58 | def extract_rgb_frame_features(self, frame_rgb): 59 | assert len(frame_rgb.shape) == 4 60 | assert frame_rgb.shape[3] == 3 # 3 channels (R, G, B) 61 | with self.graph.as_default(): 62 | tf.keras.backend.set_session(self.session) 63 | block7, block6 = self.session.run([self.block7, self.block6], feed_dict={self.model.input: frame_rgb}) 64 | return np.hstack([block7, np.reshape(block6, [block6.shape[0], -1, block6.shape[-1]]).mean(1)]) 65 | 66 | def extract_rgb_frame_features_list(self, frame_rgb_list, batch_size): 67 | self.index += 1 68 | def _predict_batch(): 69 | if len(frame_list) > 0: 70 | batch_inputs = preprocess_input(np.vstack(frame_list)) 71 | batch_feat = self.extract_rgb_frame_features(batch_inputs) 72 | feature_list.extend(frame for frame in batch_feat) 73 | 74 | frame_list = [] 75 | feature_list = [] 76 | for frame in frame_rgb_list: 77 | frame_list.append(center_crop_and_resize(frame, self.img_size)) 78 | if len(frame_list) == batch_size: 79 | _predict_batch() 80 | frame_list = [] 81 | else: 82 | _predict_batch() 83 | msg = "[%s] Video-%d has Frames: %d | Feature Dimension: %s" % (time.asctime(), 84 | self.index, 85 | len(feature_list), 86 | feature_list[-1].shape[-1]) 87 | with open("/home/tione/notebook/log/extract_train.log", "a+") as f: 88 | f.write(msg + "\n") 89 | return feature_list 90 | -------------------------------------------------------------------------------- /utils/export_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities to export a model for batch prediction.""" 15 | 16 | import tensorflow as tf 17 | 18 | from tensorflow.python.saved_model import builder as saved_model_builder 19 | from tensorflow.python.saved_model import signature_constants 20 | from tensorflow.python.saved_model import signature_def_utils 21 | from tensorflow.python.saved_model import tag_constants 22 | from tensorflow.python.saved_model import utils as saved_model_utils 23 | 24 | _TOP_PREDICTIONS_IN_OUTPUT = 82 25 | 26 | class ModelExporter(object): 27 | 28 | def __init__(self, model, reader): 29 | self.model = model 30 | self.reader = reader 31 | 32 | with tf.Graph().as_default() as graph: 33 | self.inputs, self.outputs = self.build_inputs_and_outputs() 34 | self.graph = graph 35 | self.saver = tf.train.Saver(tf.global_variables(), sharded=True) 36 | 37 | def export_model(self, model_dir, global_step_val, last_checkpoint): 38 | """Exports the model so that it can used for batch predictions.""" 39 | 40 | with self.graph.as_default(): 41 | with tf.Session() as session: 42 | session.run(tf.global_variables_initializer()) 43 | self.saver.restore(session, last_checkpoint) 44 | 45 | signature = signature_def_utils.build_signature_def( 46 | inputs=self.inputs, 47 | outputs=self.outputs, 48 | method_name=signature_constants.PREDICT_METHOD_NAME) 49 | 50 | signature_map = {signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: 51 | signature} 52 | 53 | model_builder = saved_model_builder.SavedModelBuilder(model_dir) 54 | model_builder.add_meta_graph_and_variables(session, 55 | tags=[tag_constants.SERVING], 56 | signature_def_map=signature_map, 57 | clear_devices=True) 58 | model_builder.save() 59 | 60 | def build_inputs_and_outputs(self): 61 | input_name_list = self.reader.dname_string_list #模型输入变量名 62 | inupt_shape_list = self.reader.data_shape_list #模型输入shape 63 | input_dtype_list = self.reader.dtype_list #模型输入类型 64 | 65 | inputs_dict={} 66 | for input_name,input_shape,input_dtype in zip(input_name_list, inupt_shape_list, input_dtype_list): 67 | inputs_dict[input_name] = tf.placeholder(shape=[None]+input_shape, dtype=input_dtype, name=input_name) #add batch size dim 68 | 69 | with tf.variable_scope("tower"): 70 | result = self.model(inputs_dict,is_training=False) 71 | predictions = result["tagging_output_fusion"]["predictions"] 72 | video_embedding = result["video_embedding"] 73 | top_predictions, top_indices = tf.nn.top_k(predictions, _TOP_PREDICTIONS_IN_OUTPUT) 74 | 75 | #inputs = {"video_input_placeholder": saved_model_utils.build_tensor_info(video_input_placeholder), 76 | # "audio_input_placeholder": saved_model_utils.build_tensor_info(audio_input_placeholder), 77 | # "text_input_placeholder": saved_model_utils.build_tensor_info(text_input_placeholder), 78 | # "num_frames_placeholder": saved_model_utils.build_tensor_info(num_frames_placeholder)} 79 | inputs = {key:saved_model_utils.build_tensor_info(val) for key,val in inputs_dict.items()} 80 | outputs = { 81 | "class_indexes": saved_model_utils.build_tensor_info(top_indices), 82 | "video_embedding": saved_model_utils.build_tensor_info(video_embedding), 83 | "predictions": saved_model_utils.build_tensor_info(top_predictions)} 84 | 85 | return inputs, outputs 86 | -------------------------------------------------------------------------------- /preprocess/audio_extractor/vggish_postprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Post-process embeddings from VGGish.""" 17 | import os,sys 18 | sys.path.append(os.path.dirname(__file__)) 19 | 20 | import numpy as np 21 | 22 | import vggish_params 23 | 24 | 25 | class Postprocessor(object): 26 | """Post-processes VGGish embeddings. 27 | 28 | The initial release of AudioSet included 128-D VGGish embeddings for each 29 | segment of AudioSet. These released embeddings were produced by applying 30 | a PCA transformation (technically, a whitening transform is included as well) 31 | and 8-bit quantization to the raw embedding output from VGGish, in order to 32 | stay compatible with the YouTube-8M project which provides visual embeddings 33 | in the same format for a large set of YouTube videos. This class implements 34 | the same PCA (with whitening) transform; the 8-bit quantization step is kept below but disabled in this copy. 35 | """ 36 | 37 | def __init__(self, pca_params_npz_path): 38 | """Constructs a postprocessor. 39 | 40 | Args: 41 | pca_params_npz_path: Path to a NumPy-format .npz file that 42 | contains the PCA parameters used in postprocessing. 43 | """ 44 | params = np.load(pca_params_npz_path) 45 | self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME] 46 | # Load means into a column vector for easier broadcasting later. 47 | self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1) 48 | assert self._pca_matrix.shape == ( 49 | vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), ( 50 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,)) 51 | assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), ( 52 | 'Bad PCA means shape: %r' % (self._pca_means.shape,)) 53 | 54 | def postprocess(self, embeddings_batch): 55 | """Applies postprocessing to a batch of embeddings. 56 | 57 | Args: 58 | embeddings_batch: An nparray of shape [batch_size, embedding_size] 59 | containing output from the embedding layer of VGGish. 60 | 61 | Returns: 62 | An nparray of the same shape as the input, containing the PCA-transformed 63 | (whitened) version of the input; uint8 quantization is disabled below, so float values are returned. 64 | """ 65 | assert len(embeddings_batch.shape) == 2, ( 66 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,)) 67 | assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, ( 68 | 'Bad batch shape: %r' % (embeddings_batch.shape,)) 69 | 70 | # Apply PCA. 71 | # - Embeddings come in as [batch_size, embedding_size]. 72 | # - Transpose to [embedding_size, batch_size]. 73 | # - Subtract pca_means column vector from each column. 74 | # - Premultiply by PCA matrix of shape [output_dims, input_dims] 75 | # where both are equal to embedding_size in our case. 76 | # - Transpose result back to [batch_size, embedding_size].
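    #   For example, with the standard 128-D VGGish embeddings and a batch of 4:
    #   (128, 128) dot (128, 4) -> (128, 4), transposed back to (4, 128).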
77 | pca_applied = np.dot(self._pca_matrix, 78 | (embeddings_batch.T - self._pca_means)).T 79 | 80 | # Quantize by: 81 | # - clipping to [min, max] range 82 | #clipped_embeddings = np.clip( 83 | # pca_applied, vggish_params.QUANTIZE_MIN_VAL, 84 | # vggish_params.QUANTIZE_MAX_VAL) 85 | # - convert to 8-bit in range [0.0, 255.0] 86 | #quantized_embeddings = ( 87 | # (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) * 88 | # (255.0 / 89 | # (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL))) 90 | # - cast 8-bit float to uint8 91 | #quantized_embeddings = quantized_embeddings.astype(np.uint8) 92 | 93 | #return quantized_embeddings 94 | return pca_applied 95 | -------------------------------------------------------------------------------- /utils/metrics/mean_average_precision_calculator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Calculate the mean average precision. 16 | 17 | It provides an interface for calculating mean average precision 18 | for an entire list or the top-n ranked items. 19 | 20 | Example usages: 21 | We first call the function accumulate many times to process parts of the ranked 22 | list. After processing all the parts, we call peek_map_at_n 23 | to calculate the mean average precision. 24 | 25 | ``` 26 | import random 27 | import numpy as np 28 | p = np.array([[random.random() for _ in range(50)] for _ in range(1000)]) 29 | a = np.array([[random.choice([0, 1]) for _ in range(50)] 30 | for _ in range(1000)]) 31 | 32 | # mean average precision for 50 classes. 33 | calculator = mean_average_precision_calculator.MeanAveragePrecisionCalculator( 34 | num_class=50) 35 | calculator.accumulate(p, a) 36 | aps = calculator.peek_map_at_n() 37 | ``` 38 | """ 39 | 40 | import utils.metrics.average_precision_calculator as average_precision_calculator 41 | 42 | 43 | class MeanAveragePrecisionCalculator(object): 44 | """Calculates the mean average precision across classes. 45 | """ 46 | 47 | def __init__(self, num_class): 48 | """Construct a calculator to calculate the (macro) average precision. 49 | 50 | Args: 51 | num_class: An integer greater than 1 specifying the number of classes. 52 | 53 | Raises: 58 | ValueError: An error occurred when num_class is not an integer 59 | greater than 1.
60 | """ 61 | if not isinstance(num_class, int) or num_class <= 1: 62 | raise ValueError("num_class must be a positive integer.") 63 | 64 | self._ap_calculators = [] # member of AveragePrecisionCalculator 65 | self._num_class = num_class # total number of classes 66 | for i in range(num_class): 67 | self._ap_calculators.append( 68 | average_precision_calculator.AveragePrecisionCalculator()) 69 | 70 | def accumulate(self, predictions, actuals, num_positives=None): 71 | """Accumulate the predictions and their ground truth labels. 72 | 73 | Args: 74 | predictions: A list of lists storing the prediction scores. The outer 75 | dimension corresponds to classes. 76 | actuals: A list of lists storing the ground truth labels. The dimensions 77 | should correspond to the predictions input. Any value 78 | larger than 0 will be treated as positives, otherwise as negatives. 79 | num_positives: If provided, it is a list of numbers representing the 80 | number of true positives for each class. If not provided, the number of 81 | true positives will be inferred from the 'actuals' array. 82 | 83 | Raises: 84 | ValueError: An error occurred when the shape of predictions and actuals 85 | does not match. 86 | """ 87 | if not num_positives: 88 | num_positives = [None for i in predictions.shape[1]] 89 | 90 | calculators = self._ap_calculators 91 | for i in range(len(predictions)): 92 | calculators[i].accumulate(predictions[i], actuals[i], num_positives[i]) 93 | 94 | def clear(self): 95 | for calculator in self._ap_calculators: 96 | calculator.clear() 97 | 98 | def is_empty(self): 99 | return ([calculator.heap_size for calculator in self._ap_calculators] == 100 | [0 for _ in range(self._num_class)]) 101 | 102 | def peek_map_at_n(self): 103 | """Peek the non-interpolated mean average precision at n. 104 | 105 | Returns: 106 | An array of non-interpolated average precision at n (default 0) for each 107 | class. 108 | """ 109 | aps = [self._ap_calculators[i].peek_ap_at_n() 110 | for i in range(self._num_class)] 111 | return aps 112 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/val.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.0.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_0.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_0.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.1.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_1.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_1.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.2.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_2.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_2.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.3.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_3.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_3.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.4.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_4.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_4.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.5.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_5.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_5.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.6.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_6.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_6.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.7.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_7.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_7.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.8.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_8.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_8.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /configs/config.tagging.5k.9.yaml: -------------------------------------------------------------------------------- 1 | ############################################################# 2 | # 1. 
Model Define Configs 3 | ############################################################# 4 | ModelConfig: 5 | model_type: 'NextVladBERT' 6 | use_modal_drop: False #在训练过程中,对多模态特征的某一模态进行丢弃 7 | with_embedding_bn: False #对不同模态输入特征进行BN归一化 8 | modal_drop_rate: 0.3 9 | with_video_head: True #视频特征 10 | with_audio_head: True #音频特征 11 | with_text_head: True #文本特征 12 | with_image_head: False # False #图片特征 13 | 14 | #视频特征(16384) 15 | video_head_type: 'NeXtVLAD' 16 | video_head_params: 17 | nextvlad_cluster_size: 128 18 | groups: 16 19 | expansion: 2 20 | feature_size: 2552 #inception feature dim 21 | directly: True 22 | max_frames: 300 23 | 24 | #语音特征(1024) 25 | audio_head_type: 'NeXtVLAD' 26 | audio_head_params: 27 | nextvlad_cluster_size: 64 28 | groups: 16 29 | expansion: 2 30 | feature_size: 128 #vggfish feature dim 31 | directly: True 32 | max_frames: 300 33 | 34 | #文本特征(1024) 35 | text_head_type: 'BERT' 36 | text_head_params: 37 | bert_config: 38 | attention_probs_dropout_prob: 0.1 39 | hidden_act: "gelu" 40 | hidden_dropout_prob: 0.1 41 | hidden_size: 768 42 | initializer_range: 0.02 43 | intermediate_size: 3072 44 | max_position_embeddings: 512 45 | num_attention_heads: 12 46 | num_hidden_layers: 12 47 | type_vocab_size: 2 48 | vocab_size: 21128 49 | bert_emb_encode_size: 1024 50 | 51 | #图片特征(2048) 52 | image_head_type: 'resnet_v2_50' 53 | image_head_params: {} 54 | 55 | 56 | #多模态特征融合方式 57 | fusion_head_type: 'SE' 58 | fusion_head_params: 59 | hidden1_size: 1024 60 | gating_reduction: 8 # reduction factor in se context gating 61 | drop_rate: 62 | video: 0.8 63 | audio: 0.8 64 | image: 0.5 65 | text: 0.4 66 | fusion: 0.9 67 | 68 | #tagging分类器参数 69 | tagging_classifier_type: 'MoeModel' 70 | tagging_classifier_params: 71 | num_classes: 82 #标签数目, 按需修改 72 | num_mixtures: 2 73 | #l2_penalty: 0.0 74 | 75 | ############################################################# 76 | #2. Optimizer & Train Configs 77 | ############################################################# 78 | OptimizerConfig: 79 | optimizer: 'AdamOptimizer' 80 | optimizer_init_params: {} 81 | clip_gradient_norm: 1.0 82 | learning_rate_dict: 83 | video: 0.0001 84 | audio: 0.0001 85 | text: 0.00001 86 | image: 0.0001 87 | classifier: 0.0005 88 | loss_type_dict: 89 | tagging: "CrossEntropyLoss" 90 | max_step_num: 6500 91 | export_model_steps: 500 92 | learning_rate_decay: 0.1 93 | start_new_model: True # 如果为True,重新训练; 如果False,则resume 94 | num_gpu: 1 95 | log_device_placement: False 96 | gpu_allow_growth: True 97 | pretrained_model: 98 | text_pretrained_model: 'pretrained/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 99 | image_pretrained_model: 'pretrained/resnet_v2_50/resnet_v2_50.ckpt' 100 | train_dir: './checkpoints/tagging5k_temp' #训练模型保存目录,按需修改 101 | 102 | ############################################################# 103 | # 3. 
DataSet Config 104 | ############################################################# 105 | DatasetConfig: 106 | batch_size: 16 107 | shuffle: True 108 | train_data_source_list: 109 | train799: 110 | file: '../dataset/tagging/GroundTruth/datafile/train_9.txt' # preprocessing脚本生成文件,按需求修改 (datafile) 111 | batch_size: 16 112 | 113 | valid_data_source_list: 114 | val799: 115 | file: '../dataset/tagging/GroundTruth/datafile/valid_9.txt' # preprocessing脚本生成文件,按需求修改 116 | batch_size: 128 117 | 118 | preprocess_root: 'src/dataloader/preprocess/' 119 | preprocess_config: 120 | feature: 121 | - name: 'video,video_frames_num,idx' 122 | shape: [[300,2552], [],[]] 123 | dtype: 'float32,int32,string' 124 | class: 'frames_npy_preprocess.Preprocess' 125 | extra_args: 126 | max_frames: 300 127 | feat_dim: 2552 128 | return_frames_num: True 129 | return_idx: True 130 | 131 | - name: 'audio,audio_frames_num' 132 | shape: [[300,128], []] 133 | dtype: 'float32,int32' 134 | class: 'frames_npy_preprocess.Preprocess' 135 | extra_args: 136 | max_frames: 300 137 | feat_dim: 128 138 | return_frames_num: True 139 | 140 | - name: 'image' 141 | shape: [[224,224,3]] 142 | dtype: 'float32' 143 | class: 'image_preprocess.Preprocess' 144 | 145 | - name: 'text' 146 | shape: [[300]] 147 | dtype: 'int64' 148 | class: 'text_preprocess.Preprocess' 149 | extra_args: 150 | vocab: 'pretrained/bert/chinese_L-12_H-768_A-12/vocab.txt' 151 | max_len: 300 152 | label: 153 | - name: 'tagging' 154 | dtype: 'float32' 155 | shape: [[82]] # 根据 num_classes修改 156 | class: 'label_preprocess.Preprocess_label_sparse_to_dense' 157 | extra_args: 158 | index_dict: '../dataset/label_id.txt' # 按需求更改 159 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/cnn_preprocessing/cifarnet_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Provides utilities to preprocess images in CIFAR-10. 16 | 17 | """ 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import tensorflow as tf 24 | 25 | _PADDING = 4 26 | 27 | slim = tf.contrib.slim 28 | 29 | 30 | def preprocess_for_train(image, 31 | output_height, 32 | output_width, 33 | padding=_PADDING, 34 | add_image_summaries=True): 35 | """Preprocesses the given image for training. 36 | 37 | Note that the actual resizing scale is sampled from 38 | [`resize_size_min`, `resize_size_max`]. 39 | 40 | Args: 41 | image: A `Tensor` representing an image of arbitrary size. 42 | output_height: The height of the image after preprocessing. 43 | output_width: The width of the image after preprocessing. 44 | padding: The amound of padding before and after each dimension of the image. 
45 | add_image_summaries: Enable image summaries. 46 | 47 | Returns: 48 | A preprocessed image. 49 | """ 50 | if add_image_summaries: 51 | tf.summary.image('image', tf.expand_dims(image, 0)) 52 | 53 | # Transform the image to floats. 54 | image = tf.to_float(image) 55 | if padding > 0: 56 | image = tf.pad(image, [[padding, padding], [padding, padding], [0, 0]]) 57 | # Randomly crop a [height, width] section of the image. 58 | distorted_image = tf.random_crop(image, 59 | [output_height, output_width, 3]) 60 | 61 | # Randomly flip the image horizontally. 62 | distorted_image = tf.image.random_flip_left_right(distorted_image) 63 | 64 | if add_image_summaries: 65 | tf.summary.image('distorted_image', tf.expand_dims(distorted_image, 0)) 66 | 67 | # Because these operations are not commutative, consider randomizing 68 | # the order their operation. 69 | distorted_image = tf.image.random_brightness(distorted_image, 70 | max_delta=63) 71 | distorted_image = tf.image.random_contrast(distorted_image, 72 | lower=0.2, upper=1.8) 73 | # Subtract off the mean and divide by the variance of the pixels. 74 | return tf.image.per_image_standardization(distorted_image) 75 | 76 | 77 | def preprocess_for_eval(image, output_height, output_width, 78 | add_image_summaries=True): 79 | """Preprocesses the given image for evaluation. 80 | 81 | Args: 82 | image: A `Tensor` representing an image of arbitrary size. 83 | output_height: The height of the image after preprocessing. 84 | output_width: The width of the image after preprocessing. 85 | add_image_summaries: Enable image summaries. 86 | 87 | Returns: 88 | A preprocessed image. 89 | """ 90 | if add_image_summaries: 91 | tf.summary.image('image', tf.expand_dims(image, 0)) 92 | # Transform the image to floats. 93 | image = tf.to_float(image) 94 | 95 | # Resize and crop if needed. 96 | resized_image = tf.image.resize_image_with_crop_or_pad(image, 97 | output_width, 98 | output_height) 99 | if add_image_summaries: 100 | tf.summary.image('resized_image', tf.expand_dims(resized_image, 0)) 101 | 102 | # Subtract off the mean and divide by the variance of the pixels. 103 | return tf.image.per_image_standardization(resized_image) 104 | 105 | 106 | def preprocess_image(image, output_height, output_width, is_training=False, 107 | add_image_summaries=True): 108 | """Preprocesses the given image. 109 | 110 | Args: 111 | image: A `Tensor` representing an image of arbitrary size. 112 | output_height: The height of the image after preprocessing. 113 | output_width: The width of the image after preprocessing. 114 | is_training: `True` if we're preprocessing the image for training and 115 | `False` otherwise. 116 | add_image_summaries: Enable image summaries. 117 | 118 | Returns: 119 | A preprocessed image. 
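  Example (illustrative only, assuming `raw_image` is a decoded image tensor;
  CIFAR-10 inputs are typically processed at 32x32):
    image = preprocess_image(raw_image, 32, 32, is_training=is_training)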
120 | """ 121 | if is_training: 122 | return preprocess_for_train( 123 | image, output_height, output_width, 124 | add_image_summaries=add_image_summaries) 125 | else: 126 | return preprocess_for_eval( 127 | image, output_height, output_width, 128 | add_image_summaries=add_image_summaries) 129 | -------------------------------------------------------------------------------- /src/dataloader/preprocess/frames_npy_preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import numpy as np 4 | import random 5 | import jieba 6 | 7 | 8 | def temporal_shift(src, shift_ratio=0.2): 9 | ts, fs = src.shape 10 | shift_dim = max(int(fs * shift_ratio) // 2, 1) 11 | out = np.zeros_like(src) 12 | out[1:, :shift_dim] = src[:-1, :shift_dim] # shift later 13 | out[:-1, -shift_dim:] = src[1:, -shift_dim:] # shift earlier 14 | out[:, shift_dim:-shift_dim] = src[:, shift_dim:-shift_dim] # no shift 15 | return out 16 | 17 | 18 | def data_augment(src, noisy=0.5): 19 | return src + np.random.normal(0, noisy * np.std(src), size=src.shape) 20 | 21 | 22 | def load_embeddings(path="/home/tione/notebook/VideoStructuring/taac2021_tagging_pytorchyyds/pretrained/word_embed/Tencent_AILab_ChineseEmbedding_cut100w.txt"): 23 | embeddings = {} 24 | with open(path) as f: 25 | for row in f: 26 | char, score = row.strip().split(" ", 1) 27 | embeddings[char] = np.fromstring(score, sep=" ") 28 | return embeddings 29 | 30 | 31 | word2vec = load_embeddings() 32 | def concat_w2v(text_path, frames): 33 | with open(text_path.replace("video_npy/Youtube8M/", "text_txt/").replace(".npy", ".txt")) as f: 34 | data = eval(f.read().strip()) 35 | tokens = list(jieba.cut(data["video_asr"].replace("|", ""))) 36 | if len(tokens) < len(frames): 37 | text = data["video_ocr"] 38 | for char in '''0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*|()~`·、[]【】?、。,,.';;‘:""''': 39 | text = text.replace(char, "") 40 | tokens = list(jieba.cut(text)) 41 | window_size = max(len(tokens) // len(frames), 1) 42 | embeddings = [] 43 | for i in range(len(frames)): 44 | start = i * window_size 45 | frame_embed = np.zeros((200,)) 46 | k = 0.0 47 | for token in tokens[start:start + window_size]: 48 | if token in word2vec: 49 | frame_embed += word2vec[token] 50 | k += 1 51 | if k > 0: 52 | frame_embed = frame_embed / k 53 | embeddings.append(frame_embed) 54 | embeddings = np.vstack(embeddings) 55 | return np.hstack([frames, embeddings]) 56 | 57 | 58 | def resize_axis(tensor, axis, new_size, fill_value=0): 59 | tensor = tf.convert_to_tensor(tensor) 60 | shape = tf.unstack(tf.shape(tensor)) 61 | 62 | pad_shape = shape[:] 63 | pad_shape[axis] = tf.maximum(0, new_size - shape[axis]) 64 | 65 | shape[axis] = tf.minimum(shape[axis], new_size) 66 | shape = tf.stack(shape) 67 | 68 | resized = tf.concat([ 69 | tf.slice(tensor, tf.zeros_like(shape), shape), 70 | tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype)) 71 | ], axis) 72 | 73 | # Update shape. 74 | new_shape = tensor.get_shape().as_list() # A copy is being made. 
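    # Fix the static shape along `axis` to `new_size`: the concat above has already
    # truncated or zero-padded the dynamic tensor, so callers (e.g. padding frame
    # features to max_frames with axis=0) see a fully defined dimension.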
75 | new_shape[axis] = new_size 76 | resized.set_shape(new_shape) 77 | return resized 78 | 79 | 80 | class Preprocess: 81 | 82 | def __init__(self, 83 | max_frames, 84 | return_frames_num, 85 | feat_dim = 128, 86 | is_training=False, 87 | return_idx = False): 88 | self.max_frames = max_frames 89 | self.return_frames_num = return_frames_num 90 | self.is_training = is_training 91 | self.return_idx = return_idx 92 | self.feat_dim = feat_dim 93 | self.frames_placeholder = tf.placeholder(shape=[None,None],dtype=tf.float32) 94 | self.num_frames = tf.minimum(tf.shape(self.frames_placeholder)[0], self.max_frames) 95 | self.feature_matrix = resize_axis(self.frames_placeholder,axis=0,new_size=self.max_frames) 96 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 97 | sess_config.gpu_options.allow_growth = True 98 | self.sess = tf.Session(config=sess_config) 99 | 100 | def __call__(self, frames_npy_fn, augment): 101 | if os.path.exists(frames_npy_fn): 102 | frames = np.load(frames_npy_fn) 103 | assert frames.shape[-1] in (2352, 128) 104 | if augment > 0: 105 | if frames.shape[-1] == 2352: 106 | frames = np.hstack([data_augment(frames[:, :2048], 0.5), 107 | data_augment(frames[:, 2048:], 0.5)]) 108 | elif frames.shape[-1] == 128: 109 | frames = data_augment(frames, 0.5) 110 | if frames.shape[-1] != 128: 111 | frames = concat_w2v(frames_npy_fn, frames) 112 | frames = temporal_shift(frames) 113 | else: 114 | print("!"*100+"\n Warning: file {} not exits".format(frames_npy_fn)) 115 | frames = np.zeros((1, self.feat_dim)) 116 | feature_matrix,num_frames = self.sess.run([self.feature_matrix, self.num_frames],feed_dict={self.frames_placeholder:frames}) 117 | idx = os.path.basename(frames_npy_fn).split('.')[0] 118 | return_list = [] 119 | return_list.append(feature_matrix) 120 | if self.return_frames_num: 121 | return_list.append(num_frames) 122 | if self.return_idx: 123 | return_list.append(idx) 124 | return tuple(return_list) 125 | -------------------------------------------------------------------------------- /src/model/cover_head/mobilenet_v1_eval.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Validate mobilenet_v1 with options for quantization.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import math 22 | import tensorflow as tf 23 | 24 | from datasets import dataset_factory 25 | from nets import mobilenet_v1 26 | from preprocessing import preprocessing_factory 27 | 28 | slim = tf.contrib.slim 29 | 30 | flags = tf.app.flags 31 | 32 | flags.DEFINE_string('master', '', 'Session master') 33 | flags.DEFINE_integer('batch_size', 250, 'Batch size') 34 | flags.DEFINE_integer('num_classes', 1001, 'Number of classes to distinguish') 35 | flags.DEFINE_integer('num_examples', 50000, 'Number of examples to evaluate') 36 | flags.DEFINE_integer('image_size', 224, 'Input image resolution') 37 | flags.DEFINE_float('depth_multiplier', 1.0, 'Depth multiplier for mobilenet') 38 | flags.DEFINE_bool('quantize', False, 'Quantize training') 39 | flags.DEFINE_string('checkpoint_dir', '', 'The directory for checkpoints') 40 | flags.DEFINE_string('eval_dir', '', 'Directory for writing eval event logs') 41 | flags.DEFINE_string('dataset_dir', '', 'Location of dataset') 42 | 43 | FLAGS = flags.FLAGS 44 | 45 | 46 | def imagenet_input(is_training): 47 | """Data reader for imagenet. 48 | 49 | Reads in imagenet data and performs pre-processing on the images. 50 | 51 | Args: 52 | is_training: bool specifying if train or validation dataset is needed. 53 | Returns: 54 | A batch of images and labels. 55 | """ 56 | if is_training: 57 | dataset = dataset_factory.get_dataset('imagenet', 'train', 58 | FLAGS.dataset_dir) 59 | else: 60 | dataset = dataset_factory.get_dataset('imagenet', 'validation', 61 | FLAGS.dataset_dir) 62 | 63 | provider = slim.dataset_data_provider.DatasetDataProvider( 64 | dataset, 65 | shuffle=is_training, 66 | common_queue_capacity=2 * FLAGS.batch_size, 67 | common_queue_min=FLAGS.batch_size) 68 | [image, label] = provider.get(['image', 'label']) 69 | 70 | image_preprocessing_fn = preprocessing_factory.get_preprocessing( 71 | 'mobilenet_v1', is_training=is_training) 72 | 73 | image = image_preprocessing_fn(image, FLAGS.image_size, FLAGS.image_size) 74 | 75 | images, labels = tf.train.batch( 76 | tensors=[image, label], 77 | batch_size=FLAGS.batch_size, 78 | num_threads=4, 79 | capacity=5 * FLAGS.batch_size) 80 | return images, labels 81 | 82 | 83 | def metrics(logits, labels): 84 | """Specify the metrics for eval. 85 | 86 | Args: 87 | logits: Logits output from the graph. 88 | labels: Ground truth labels for inputs. 89 | 90 | Returns: 91 | Eval Op for the graph. 92 | """ 93 | labels = tf.squeeze(labels) 94 | names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({ 95 | 'Accuracy': tf.metrics.accuracy(tf.argmax(logits, 1), labels), 96 | 'Recall_5': tf.metrics.recall_at_k(labels, logits, 5), 97 | }) 98 | for name, value in names_to_values.iteritems(): 99 | slim.summaries.add_scalar_summary( 100 | value, name, prefix='eval', print_summary=True) 101 | return names_to_updates.values() 102 | 103 | 104 | def build_model(): 105 | """Build the mobilenet_v1 model for evaluation. 106 | 107 | Returns: 108 | g: graph with rewrites after insertion of quantization ops and batch norm 109 | folding. 110 | eval_ops: eval ops for inference. 111 | variables_to_restore: List of variables to restore from checkpoint. 
112 | """ 113 | g = tf.Graph() 114 | with g.as_default(): 115 | inputs, labels = imagenet_input(is_training=False) 116 | 117 | scope = mobilenet_v1.mobilenet_v1_arg_scope( 118 | is_training=False, weight_decay=0.0) 119 | with slim.arg_scope(scope): 120 | logits, _ = mobilenet_v1.mobilenet_v1( 121 | inputs, 122 | is_training=False, 123 | depth_multiplier=FLAGS.depth_multiplier, 124 | num_classes=FLAGS.num_classes) 125 | 126 | if FLAGS.quantize: 127 | tf.contrib.quantize.create_eval_graph() 128 | 129 | eval_ops = metrics(logits, labels) 130 | 131 | return g, eval_ops 132 | 133 | 134 | def eval_model(): 135 | """Evaluates mobilenet_v1.""" 136 | g, eval_ops = build_model() 137 | with g.as_default(): 138 | num_batches = math.ceil(FLAGS.num_examples / float(FLAGS.batch_size)) 139 | slim.evaluation.evaluate_once( 140 | FLAGS.master, 141 | FLAGS.checkpoint_dir, 142 | logdir=FLAGS.eval_dir, 143 | num_evals=num_batches, 144 | eval_op=eval_ops) 145 | 146 | 147 | def main(unused_arg): 148 | eval_model() 149 | 150 | 151 | if __name__ == '__main__': 152 | tf.app.run(main) 153 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## 2021腾讯广告算法大赛-赛道二-第五名解决方案 2 | 3 | > 作者:吴烜圣 ([email](wuxsmail@163.com), [phone](13036864606))、杨非池 ([email](feichi.yang@usc.edu))、周童([email](zhoutong0322@163.com))、林心悦([email](xl9yr@virginia.edu)) 4 | 5 | 论文:[Rethinking the Impacts of Overfitting and Feature Quality on Small-scale Video Classification](https://dl.acm.org/doi/abs/10.1145/3474085.3479226) 6 | 7 | 视频讲解:https://www.bilibili.com/video/BV1ju411o7qz?spm_id_from=333.999.0.0 8 | 9 | #### 0 代码复现 10 | 11 | * 推荐配置:CPU: 6 Cores Memory: 16GB GPU:V100-32GB 12 | 13 | * 配置环境: 14 | 15 | 在完成下述“步骤0”至“步骤4”之后,将得到如下目录: 16 | 17 | ```shell 18 | /home/tione/notebook/ 19 | ├── algo-2021 20 | ├── envs 21 | │ └── taac2021-tagging-pytrochyyds # 本项目的conda环境(init.sh自动创建) 22 | ├── log # 运行日志(init.sh自动创建) 23 | └── VideoStructuring # 项目路径(请手动创建) 24 | ├── dataset # 数据和特征(init.sh自动创建) 25 | ├── KFoldModels # K折模型参数(init.sh自动创建) 26 | ├── KFoldResults # 单折预测结果(init.sh自动创建) 27 | └── taac2021_tagging_pytorchyyds # 项目的代码(请手动创建并拷贝代码到此处) 28 | ├── init.sh # 环境初始化脚本 29 | ├── train.sh # 训练模型脚本 30 | ├── infer.sh # 模型推断脚本 31 | ├── pretrained # 预训练模型权重(init.sh自动下载) 32 | ├── checkpoints # K折模型训练ckpt(init.sh自动创建) 33 | ├── results # k这模型预测(init.sh自动创建) 34 | ├── configs 35 | ├── infer.py 36 | ├── preprocess 37 | ├── readme.md 38 | ├── requirement.txt 39 | ├── src 40 | ├── train.py 41 | └── utils 42 | ``` 43 | 44 | * **步骤0:** 空的机器仅包含`/home/tione/notebook/algo-2021`一个文件夹 45 | 46 | * **步骤1:** 创建本项目的文件夹:`mkdir /home/tione/notebook/VideoStructuring`,除了下一步骤创建的代码目录外,`init.sh`脚本还会在此目录下自动创建存放数据的`dataset`文件夹、存放模型的`KFoldModels`文件夹和存放K这交叉结果的`KFoldResults`文件夹。 47 | 48 | * **步骤2:** 创建本项目的代码目录:`mkdir home/tione/notebook/VideoStructuring/taac2021_tagging_pytorchyyds`,并 **将项目的所有代码移动到此路径下** ,确保`init.sh`、`train.sh`和`infer.sh`三个文件位于该文件夹中。 49 | 50 | * **步骤3:** `cd /home/tione/notebook/VideoStructuring/taac2021_tagging_pytorchyyds` 51 | 52 | * **步骤4:** `sudo chmod a+x ./init.sh && ./init.sh` 53 | 54 | ```shell 55 | shell> sudo chmod a+x ./init.sh && ./init.sh 56 | [2021-07-05 18:50:24] INFO 开始拷贝数据到本地... 57 | [2021-07-05 19:07:20] INFO 数据拷贝完成! 58 | [2021-07-05 19:07:20] INFO 开始配置系统环境... 59 | [2021-07-05 19:09:23] INFO 系统环境配置完成! 60 | [2021-07-05 19:09:23] INFO 开始下载第三方Python环境... 61 | [2021-07-05 19:12:21] INFO 第三方Python环境已安装完毕! 
62 | [2021-07-05 19:12:21] INFO 开始下载预训练模型... 63 | [2021-07-05 19:16:16] INFO 预训练模型下载完成! 64 | [2021-07-05 19:16:16] INFO 系统初始化完成!请运行sudo chmod a+x ./train.sh && ./train.sh进行K折模型训练! 65 | ``` 66 | 67 | * 训练集的特征抽取和K折模型训练: 68 | * **步骤5:** `sudo chmod a+x ./train.sh && ./train.sh` 69 | 70 | ```shell 71 | shell> sudo chmod a+x ./train.sh && ./train.sh 72 | [2021-07-05 19:25:52] INFO 已启动Conda环境! 73 | [2021-07-05 19:25:52] INFO 开始拷贝ASR和OCR文本特征... 74 | [2021-07-05 19:26:16] INFO 文本特征已就绪! 75 | [2021-07-05 19:26:16] INFO 开始拷贝音频特征... 76 | [2021-07-05 19:26:39] INFO 音频特征已就绪! 77 | [2021-07-05 19:26:39] INFO 开始抽取Video特征... 78 | [2021-07-05 20:54:48] INFO 开始检查Video特征抽取结果... 79 | [2021-07-05 20:55:09] INFO 视频特征抽取完成! 80 | [2021-07-05 20:55:09] INFO 开始准备K折训练数据... 81 | [2021-07-05 20:55:09] INFO K折训练数据已就绪! 82 | [2021-07-05 20:55:09] INFO 开始进行K折训练... 83 | [2021-07-05 20:55:09] INFO 开始训练第0个模型... 84 | # ... 此处省略10个模型的训练日志 .... 85 | [2021-07-06 07:50:06] INFO 第9个模型训练完成! 86 | [2021-07-06 07:50:06] INFO K折训练已完成! 87 | [2021-07-06 07:50:06] INFO 模型训练已完成!请运行sudo chmod a+x ./infer.sh && ./infer.sh进行最终的模型预测! 88 | ``` 89 | 90 | * 测试集的特征抽取和K折模型预测: 91 | * **步骤6:** `sudo chmod a+x ./infer.sh && ./infer.sh` 92 | 93 | ```shell 94 | shell> sudo chmod a+x ./infer.sh && ./infer.sh 95 | [2021-07-06 14:49:02] INFO 已启动Conda环境! 96 | [2021-07-06 14:49:23] INFO 开始拷贝ASR和OCR文本特征... 97 | [2021-07-06 14:49:51] INFO 文本特征已就绪! 98 | [2021-07-06 14:49:51] INFO 开始拷贝音频特征... 99 | [2021-07-06 14:50:29] INFO 音频特征已就绪! 100 | [2021-07-06 14:50:29] INFO 开始抽取Video特征... 101 | [2021-07-06 15:16:48] INFO 视频特征抽取完成! 102 | [2021-07-06 15:16:48] INFO 开始进行预测... 103 | [2021-07-06 15:16:48] INFO 第0个子模型开始预测... 104 | # ... 此处省略10个模型的预测日志 ... 105 | [2021-07-06 15:37:00] INFO K折预测已完成! 106 | [2021-07-06 15:37:00] INFO 开始进行模型融合... 107 | [2021-07-06 15:37:05] INFO 模型融合结果已完成! 108 | [2021-07-06 15:37:05] INFO 'Pytorch永远滴神'团队最终预测结果已保存到:/home/tione/notebook/pytorchyyds_prediction_5k.json 109 | ``` 110 | 111 | * **预测结果:** `/home/tione/notebook/pytorchyyds_prediction_5k.json` 112 | 113 | #### 1 预训练模型 114 | 115 | ​ 模型仅使用了Video、Text、Audio三种模态,没有使用视频中间帧的Image模态。除此以外,Text流的文本表征除了Chinese Bert以外,我们还使用腾讯AI实验室的预训练词向量,用于增强Video模态中每一帧的表征。**所有需要的预训练模型已经传到[COS对象存储](https://algo-tencent-2021-1256646044.cos.ap-guangzhou.myqcloud.com/pretrained_models/pretrained.zip),在init.sh过程中会自行下载并解压。** 116 | 117 | * 与Baseline相同的预训练模型: 118 | 1. Audio模态:Vggish 119 | 2. Text模态:ChineseBert-base 120 | 121 | * 与Baseline不同的预训练模型 122 | 1. Video模态:EfficientNet-B5-NoisyStudent([Code](https://github.com/qubvel/efficientnet)、[Paper](https://arxiv.org/pdf/1905.11946.pdf)) 123 | 2. Image模态:该模态被丢弃,未使用任何预训练模型 124 | 3. Text模态:腾讯AI Lab预训练词向量 ([Code](https://ai.tencent.com/ailab/nlp/zh/embedding.html)、[Paper]([Embedding Dataset -- NLP Center, Tencent AI Lab](https://ai.tencent.com/ailab/nlp/zh/embedding.html))) 125 | 126 | #### 2 预计用时 127 | 128 | * 初始化阶段:`sudo chmod a+x ./init.sh && ./init.sh` (大约30分钟) 129 | 1. 复制原始视频数据到本地:20分钟 130 | 2. 安装系统环境:3分钟 131 | 3. 安装Python环境:3分钟 132 | 4. 安装预训练模型:5分钟 133 | * 训练阶段:`sudo chmod a+x ./train.sh && ./train.sh` (11小时 + 5k训练集文本抽取标准时间 + 5k训练集Vggish特征抽取标准时间) 134 | 1. 复制训练集Text到本地:baseline标准时间 135 | 2. 复制训练集Audio特征到本地:baseline标准时间 136 | 3. 抽取训练集Video特征:约1小时30分钟 137 | 4. 训练K折模型:约9小时30分钟 138 | * 测试阶段:`sudo chmod a+x ./infer.sh && ./infer.sh` (1小时50分钟 + 5k测试集文本抽取标准时间 + 5k测试集Vggish特征抽取标准时间) 139 | 140 | 1. 复制测试集Text到本地:baseline标准时间 141 | 2. 复制测试集Audio特征到本地:baseline标准时间 142 | 3. 抽取测试集Video特征:约1小时25分钟 143 | 4. K折模型预测:约20分钟 144 | 5. 
K折模型结果融合:5秒 145 | 146 | -------------------------------------------------------------------------------- /preprocess/audio_extractor/vggish_slim.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Defines the 'VGGish' model used to generate AudioSet embedding features. 17 | 18 | The public AudioSet release (https://research.google.com/audioset/download.html) 19 | includes 128-D features extracted from the embedding layer of a VGG-like model 20 | that was trained on a large Google-internal YouTube dataset. Here we provide 21 | a TF-Slim definition of the same model, without any dependences on libraries 22 | internal to Google. We call it 'VGGish'. 23 | 24 | Note that we only define the model up to the embedding layer, which is the 25 | penultimate layer before the final classifier layer. We also provide various 26 | hyperparameter values (in vggish_params.py) that were used to train this model 27 | internally. 28 | 29 | For comparison, here is TF-Slim's VGG definition: 30 | https://github.com/tensorflow/models/blob/master/research/slim/nets/vgg.py 31 | """ 32 | 33 | import tensorflow as tf 34 | import vggish_params as params 35 | 36 | slim = tf.contrib.slim 37 | 38 | 39 | def define_vggish_slim(training=False): 40 | """Defines the VGGish TensorFlow model. 41 | 42 | All ops are created in the current default graph, under the scope 'vggish/'. 43 | 44 | The input is a placeholder named 'vggish/input_features' of type float32 and 45 | shape [batch_size, num_frames, num_bands] where batch_size is variable and 46 | num_frames and num_bands are constants, and [num_frames, num_bands] represents 47 | a log-mel-scale spectrogram patch covering num_bands frequency bands and 48 | num_frames time frames (where each frame step is usually 10ms). This is 49 | produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET). 50 | The output is an op named 'vggish/embedding' which produces the activations of 51 | a 128-D embedding layer, which is usually the penultimate layer when used as 52 | part of a full model with a final classifier layer. 53 | 54 | Args: 55 | training: If true, all parameters are marked trainable. 56 | 57 | Returns: 58 | The op 'vggish/embeddings'. 59 | """ 60 | # Defaults: 61 | # - All weights are initialized to N(0, INIT_STDDEV). 62 | # - All biases are initialized to 0. 63 | # - All activations are ReLU. 64 | # - All convolutions are 3x3 with stride 1 and SAME padding. 65 | # - All max-pools are 2x2 with stride 2 and SAME padding. 
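  # The nested arg_scopes below apply those defaults to every conv / fc / max-pool
  # layer, keeping this graph compatible with the released AudioSet VGGish
  # checkpoint that load_vggish_slim_checkpoint() restores.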
66 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 67 | weights_initializer=tf.truncated_normal_initializer( 68 | stddev=params.INIT_STDDEV), 69 | biases_initializer=tf.zeros_initializer(), 70 | activation_fn=tf.nn.relu, 71 | trainable=training), \ 72 | slim.arg_scope([slim.conv2d], 73 | kernel_size=[3, 3], stride=1, padding='SAME'), \ 74 | slim.arg_scope([slim.max_pool2d], 75 | kernel_size=[2, 2], stride=2, padding='SAME'), \ 76 | tf.variable_scope('vggish'): 77 | # Input: a batch of 2-D log-mel-spectrogram patches. 78 | features = tf.placeholder( 79 | tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS), 80 | name='input_features') 81 | # Reshape to 4-D so that we can convolve a batch with conv2d(). 82 | net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1]) 83 | 84 | # The VGG stack of alternating convolutions and max-pools. 85 | net = slim.conv2d(net, 64, scope='conv1') 86 | net = slim.max_pool2d(net, scope='pool1') 87 | net = slim.conv2d(net, 128, scope='conv2') 88 | net = slim.max_pool2d(net, scope='pool2') 89 | net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3') 90 | net = slim.max_pool2d(net, scope='pool3') 91 | net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4') 92 | net = slim.max_pool2d(net, scope='pool4') 93 | 94 | # Flatten before entering fully-connected layers 95 | net = slim.flatten(net) 96 | net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1') 97 | # The embedding layer. 98 | net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2') 99 | return tf.identity(net, name='embedding') 100 | 101 | 102 | def load_vggish_slim_checkpoint(session, checkpoint_path): 103 | """Loads a pre-trained VGGish-compatible checkpoint. 104 | 105 | This function can be used as an initialization function (referred to as 106 | init_fn in TensorFlow documentation) which is called in a Session after 107 | initializating all variables. When used as an init_fn, this will load 108 | a pre-trained checkpoint that is compatible with the VGGish model 109 | definition. Only variables defined by VGGish will be loaded. 110 | 111 | Args: 112 | session: an active TensorFlow session. 113 | checkpoint_path: path to a file containing a checkpoint that is 114 | compatible with the VGGish model definition. 115 | """ 116 | # Get the list of names of all VGGish variables that exist in 117 | # the checkpoint (i.e., all inference-mode VGGish variables). 118 | with tf.Graph().as_default(): 119 | define_vggish_slim(training=False) 120 | vggish_var_names = [v.name for v in tf.global_variables()] 121 | 122 | # Get the list of all currently existing variables that match 123 | # the list of variable names we just computed. 124 | vggish_vars = [v for v in tf.global_variables() if v.name in vggish_var_names] 125 | 126 | # Use a Saver to restore just the variables selected above. 127 | saver = tf.train.Saver(vggish_vars, name='vggish_load_pretrained', 128 | write_version=1) 129 | saver.restore(session, checkpoint_path) 130 | -------------------------------------------------------------------------------- /init.sh: -------------------------------------------------------------------------------- 1 | # Confirm Position of the code 2 | CONDA_NEW_ENV=taac2021-tagging-pytrochyyds 3 | ENV_ROOT=/home/tione/notebook 4 | CODE_ROOT=${ENV_ROOT}/VideoStructuring 5 | CODE_BASE=${CODE_ROOT}/taac2021_tagging_pytorchyyds 6 | 7 | if [ ! -d "${ENV_ROOT}" ]; then 8 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 不存在环境根目录:${ENV_ROOT}" 9 | exit 1 10 | fi 11 | if [ ! 
-d "${CODE_ROOT}" ]; then 12 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 不存在项目根目录:${CODE_ROOT}" 13 | exit 1 14 | fi 15 | if [ ! -d "${CODE_BASE}" ]; then 16 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 当前代码未拷贝到目录:${CODE_BASE}" 17 | exit 1 18 | fi 19 | if [ ! "$(pwd)" = "${CODE_BASE}" ]; then 20 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 请将代码文件夹复制到${CODE_BASE},并确保工作路径为:${CODE_BASE}" 21 | exit 1 22 | fi 23 | 24 | # Copy Data from Original Shared Folder 25 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 开始拷贝数据到本地..." # 预计17分钟 26 | rm -rf /home/tione/notebook/VideoStructuring/dataset 27 | mkdir /home/tione/notebook/VideoStructuring/dataset 28 | cp /home/tione/notebook/algo-2021/dataset/label_id.txt /home/tione/notebook/VideoStructuring/dataset/ 29 | 30 | mkdir /home/tione/notebook/VideoStructuring/dataset/videos 31 | cp /home/tione/notebook/algo-2021/dataset/videos/video_5k /home/tione/notebook/VideoStructuring/dataset/videos -r 32 | cp /home/tione/notebook/algo-2021/dataset/videos/test_5k_2nd /home/tione/notebook/VideoStructuring/dataset/videos -r 33 | 34 | mkdir /home/tione/notebook/VideoStructuring/dataset/pretrained_models 35 | cp /home/tione/notebook/algo-2021/dataset/pretrained_models/* /home/tione/notebook/VideoStructuring/dataset/pretrained_models -r 36 | 37 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging 38 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/GroundTruth 39 | cp /home/tione/notebook/algo-2021/dataset/tagging/GroundTruth/* /home/tione/notebook/VideoStructuring/dataset/tagging/GroundTruth -r 40 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/ 41 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/audio_npy/ 42 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/audio_npy/Vggish 43 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/text_txt 44 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/video_npy/ 45 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_train_5k/video_npy/Youtube8M 46 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/ 47 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/audio_npy/ 48 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/audio_npy/Vggish 49 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/text_txt 50 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/video_npy 51 | mkdir /home/tione/notebook/VideoStructuring/dataset/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M 52 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 数据拷贝完成!" 53 | 54 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 开始配置系统环境..." # 预计1.5分钟 55 | CONDA_CONFIG_ROOT_PREFIX=$(conda config --show root_prefix) 56 | get_conda_root_prefix() { 57 | TMP_POS=$(awk -v a="${CONDA_CONFIG_ROOT_PREFIX}" -v b="/" 'BEGIN{print index(a, b)}') 58 | TMP_POS=$((TMP_POS-1)) 59 | if [ $TMP_POS -ge 0 ]; then 60 | echo "${CONDA_CONFIG_ROOT_PREFIX:${TMP_POS}}" 61 | fi 62 | } 63 | CONDA_ROOT=$(get_conda_root_prefix) 64 | if [ ! 
-d "${CONDA_ROOT}" ]; then 65 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 未检测到CONDA根目录:${CONDA_ROOT}" 66 | exit 1 67 | fi 68 | 69 | OS_ID=$(awk -F= '$1=="ID" { print $2 ;}' /etc/os-release) 70 | OS_ID=${OS_ID//"\""/""} 71 | 72 | if [ "${OS_ID}" == "ubuntu" ]; then 73 | sudo apt-get update 74 | sudo apt-get install -y apt-utils libsndfile1-dev ffmpeg 75 | elif [ "${OS_ID}" == "centos" ]; then 76 | yum install -y libsndfile libsndfile-devel ffmpeg ffmpeg-devel 77 | else 78 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 不支持的操作系统:${OS_ID}" 79 | exit 1 80 | fi 81 | 82 | source "${CONDA_ROOT}/etc/profile.d/conda.sh" 83 | 84 | conda create --prefix ${ENV_ROOT}/envs/${CONDA_NEW_ENV} -y cudatoolkit=10.0 cudnn=7.6.0 python=3.7 ipykernel 85 | conda activate ${ENV_ROOT}/envs/${CONDA_NEW_ENV} 86 | 87 | python -m ipykernel install --user --name ${CONDA_NEW_ENV} --display-name "TAAC2021 (${CONDA_NEW_ENV})" 88 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 系统环境配置完成!" 89 | 90 | 91 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 开始下载第三方Python环境..." 92 | pip config set global.index-url https://mirrors.tencent.com/pypi/simple/ 93 | pip install -r requirement.txt 94 | pip install tensorflow-gpu==1.14 efficientnet opencv-python torch==1.2.0 scikit-learn jieba 95 | check_env=$(python -c """ 96 | try: 97 | import tensorflow as tf, cv2, torch, efficientnet, sklearn 98 | print('[TensorFlow]', tf.__version__, '[Torch]', torch.__version__, '[EfficientNet]', efficientnet.__version__, '[OpenCV]', cv2.__version__, '[ScikitLearn]', sklearn.__version__) 99 | except Exception as e: 100 | print('环境安装存在异常!请重新执行init脚本!') 101 | """) 102 | if [ "${check_env}" == "环境安装存在异常!请重新执行init脚本!" ]; then 103 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR ${check_env}" 104 | exit 1 105 | fi 106 | sed -i "s/\.decode('utf8')//g" /home/tione/notebook/envs/taac2021-tagging-pytrochyyds/lib/python3.7/site-packages/tensorflow/python/keras/saving/hdf5_format.py 107 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 第三方Python环境已安装完毕!" 108 | 109 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 开始下载预训练模型..." 110 | rm -rf ${CODE_BASE}/pretrained 111 | wget https://algo-tencent-2021-1256646044.cos.ap-guangzhou.myqcloud.com/pretrained_models/pretrained.zip 112 | unzip pretrained.zip 113 | rm -rf pretrained.zip 114 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 预训练模型下载完成!" 115 | 116 | rm -rf ${ENV_ROOT}/log 117 | mkdir ${ENV_ROOT}/log 118 | rm -rf ${CODE_BASE}/results 119 | mkdir ${CODE_BASE}/results 120 | rm -rf ${CODE_BASE}/checkpoints 121 | mkdir ${CODE_BASE}/checkpoints 122 | rm -rf ${CODE_ROOT}/KFoldModels 123 | mkdir ${CODE_ROOT}/KFoldModels 124 | rm -rf ${CODE_ROOT}/KFoldResults 125 | mkdir ${CODE_ROOT}/KFoldResults 126 | mkdir ${CODE_ROOT}/KFoldResults/train 127 | mkdir ${CODE_ROOT}/KFoldResults/test 128 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] INFO 系统初始化完成!请运行sudo chmod a+x ./train.sh && ./train.sh进行K折模型训练!" 
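# init.sh is done at this point: the dataset copies, conda env, system packages, pretrained
# weights and the empty log/results/checkpoints/KFoldModels/KFoldResults directories all match
# the directory tree shown in readme.md, so train.sh can be run next.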
-------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | # Confirm Position of the code 2 | CONDA_NEW_ENV=taac2021-tagging-pytrochyyds 3 | ENV_ROOT=/home/tione/notebook 4 | CODE_ROOT=${ENV_ROOT}/VideoStructuring 5 | CODE_BASE=${CODE_ROOT}/taac2021_tagging_pytorchyyds 6 | DATA_BASE=${CODE_ROOT}/dataset 7 | # #################### get env directories 8 | # CONDA_ROOT 9 | CONDA_CONFIG_ROOT_PREFIX=$(conda config --show root_prefix) 10 | get_conda_root_prefix() { 11 | TMP_POS=$(awk -v a="${CONDA_CONFIG_ROOT_PREFIX}" -v b="/" 'BEGIN{print index(a, b)}') 12 | TMP_POS=$((TMP_POS-1)) 13 | if [ $TMP_POS -ge 0 ]; then 14 | echo "${CONDA_CONFIG_ROOT_PREFIX:${TMP_POS}}" 15 | else 16 | echo "" 17 | fi 18 | } 19 | CONDA_ROOT=$(get_conda_root_prefix) 20 | if [ ! -d "${CONDA_ROOT}" ]; then 21 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到Conda环境:${CONDA_ROOT}" 22 | exit 1 23 | fi 24 | 25 | if [ ! -d "${DATA_BASE}" ]; then 26 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到数据集:${DATASET_ROOT}" 27 | exit 1 28 | fi 29 | 30 | CONDA_CONFIG_FILE="${CONDA_ROOT}/etc/profile.d/conda.sh" 31 | if [ ! -f "${CONDA_CONFIG_FILE}" ]; then 32 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到Conda配置文件:${CONDA_CONFIG_FILE}" 33 | exit 1 34 | fi 35 | source "${CONDA_CONFIG_FILE}" 36 | conda activate ${ENV_ROOT}/envs/${CONDA_NEW_ENV} 37 | 38 | check_gpu=$(python -c "import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; import tensorflow as tf; print(tf.test.is_gpu_available())") 39 | if [ "${check_gpu}" == "False" ]; then 40 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR Conda环境启动失败!" 41 | exit 1 42 | fi 43 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 已启动Conda环境!" 44 | 45 | 46 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始拷贝ASR和OCR文本特征..." # 预计用时30秒 47 | cp ${ENV_ROOT}/algo-2021/dataset/tagging/tagging_dataset_train_5k/text_txt/tagging/ ${CODE_ROOT}/dataset/tagging/tagging_dataset_train_5k/text_txt/ -r 48 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 文本特征已就绪!" 49 | 50 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始拷贝音频特征..." # 预计用时1分钟 51 | cp ${ENV_ROOT}/algo-2021/dataset/tagging/tagging_dataset_train_5k/audio_npy/Vggish/tagging/ ${CODE_ROOT}/dataset/tagging/tagging_dataset_train_5k/audio_npy/Vggish/ -r 52 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 音频特征已就绪!" 53 | 54 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始抽取Video特征..." 
# 预计用时1小时30分钟 55 | rm -rf ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ 56 | mkdir ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ 57 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/video_5k/train_5k --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log & 58 | sleep 30s 59 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/video_5k/train_5k --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log & 60 | sleep 30s 61 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/video_5k/train_5k --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet --do_logging 1 | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log 62 | 63 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始检查Video特征抽取结果..." 64 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/video_5k/train_5k --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_train_5k/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet --do_logging 1 | grep -v "I tensorflow" >> ${ENV_ROOT}/log/feat_extract.log 65 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 视频特征抽取完成!" 66 | 67 | 68 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始准备K折训练数据..." 69 | python ${CODE_BASE}/utils/k_fold_prepare.py ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt ${DATA_BASE}/tagging/GroundTruth/datafile/val.txt ${DATA_BASE}/tagging/GroundTruth/datafile/train_{}.txt ${DATA_BASE}/tagging/GroundTruth/datafile/valid_{}.txt ${CODE_BASE}/configs/ 70 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO K折训练数据已就绪!" 71 | 72 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始进行K折训练..." 73 | for fold in 0 1 2 3 4 5 6 7 8 9 74 | do 75 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始训练第${fold}个模型..." 
76 | python ${CODE_BASE}/train.py --config "${CODE_BASE}/configs/config.tagging.5k.$fold.yaml" > ${ENV_ROOT}/log/train_log_$fold.txt 77 | BEST_MODEL=$(ls -td -- ${CODE_BASE}/checkpoints/tagging5k_temp/export/* | head -n 1) 78 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 第${fold}个模型训练完成:$BEST_MODEL" 79 | python ./utils/save_best_ckpt.py ${CODE_BASE}/checkpoints/tagging5k_temp/ 80 | rm -rf ${CODE_ROOT}/KFoldModels/model_"$fold" 81 | mkdir ${CODE_ROOT}/KFoldModels/model_"$fold" 82 | cp -r ${CODE_BASE}/checkpoints/tagging5k_temp/* ${CODE_ROOT}/KFoldModels/model_"$fold" 83 | done 84 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO K折训练已完成!" 85 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 模型训练已完成!请运行sudo chmod a+x ./infer.sh && ./infer.sh进行最终的模型预测!" 86 | 87 | -------------------------------------------------------------------------------- /src/model/models/nextvlad_bert.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #Author: jefxiong@tencent.com 3 | 4 | import tensorflow as tf 5 | import tensorflow.contrib.slim as slim 6 | import src.model.video_head as video_head 7 | import src.model.text_head as text_head 8 | import src.model.image_head as image_head 9 | import src.model.fusion_head as fusion_head 10 | import src.model.classify_head as classcify_head 11 | from src.model.models.base_model import BaseModel 12 | 13 | class NextVladBERT(BaseModel): 14 | def __init__(self, model_config): 15 | self.with_video_head = model_config['with_video_head'] 16 | self.with_audio_head = model_config['with_audio_head'] 17 | self.with_text_head = model_config['with_text_head'] 18 | self.with_image_head = model_config['with_image_head'] 19 | 20 | self.use_modal_drop = model_config['use_modal_drop'] 21 | self.modal_drop_rate = model_config['modal_drop_rate'] 22 | self.with_embedding_bn = model_config['with_embedding_bn'] 23 | 24 | self.modal_name_list = [] 25 | if self.with_video_head: 26 | self.modal_name_list.append('video') 27 | self.video_max_frame = model_config['video_head_params']['max_frames'] 28 | if self.with_audio_head: 29 | self.modal_name_list.append('audio') 30 | self.audio_max_frame = model_config['audio_head_params']['max_frames'] 31 | if self.with_text_head: 32 | self.modal_name_list.append('text') 33 | if self.with_image_head: 34 | self.modal_name_list.append('image') 35 | 36 | self.fusion_head_dict={} 37 | self.classifier_dict={} 38 | self.head_dict={} 39 | 40 | for modal in (self.modal_name_list+['fusion']): 41 | fusion_head_params = model_config['fusion_head_params'].copy() 42 | fusion_head_params['drop_rate'] = fusion_head_params['drop_rate'][modal] 43 | 44 | self.fusion_head_dict[modal] = fusion_head.get_instance(model_config['fusion_head_type'], fusion_head_params) 45 | self.classifier_dict[modal] = classcify_head.get_instance(model_config['tagging_classifier_type'], model_config['tagging_classifier_params']) 46 | if modal=='video': 47 | self.head_dict[modal] = video_head.get_instance(model_config['video_head_type'], model_config['video_head_params']) 48 | elif modal=='audio': 49 | self.head_dict[modal] = video_head.get_instance(model_config['audio_head_type'], model_config['audio_head_params']) 50 | elif modal == 'text': 51 | self.head_dict[modal] = text_head.get_instance(model_config['text_head_type'], model_config['text_head_params']) 52 | elif modal == 'image': 53 | self.head_dict[modal] = image_head.get_instance(model_config['image_head_type'], model_config['image_head_params']) 54 | elif modal == 'fusion': 55 | pass 56 | else: 57 | 
raise NotImplementedError 58 | 59 | def _modal_drop(self, x, rate=0.0, noise_shape=None): 60 | """模态dropout""" 61 | random_scale = tf.random.uniform(noise_shape) 62 | keep_mask = tf.cast(random_scale >= rate, x.dtype) 63 | ret = x * keep_mask 64 | probs = tf.cast(keep_mask, tf.float32) 65 | return ret, probs 66 | 67 | def __call__(self, inputs_dict, is_training=False, train_batch_size=1): 68 | assert is_training is not None 69 | prob_dict = {} 70 | embedding_list = [] 71 | 72 | for modal_name in self.modal_name_list: 73 | #Modal Dropout 74 | if modal_name in ['video', 'audio']: 75 | drop_shape = [train_batch_size, 1, 1] 76 | mask = tf.sequence_mask(inputs_dict[modal_name+'_frames_num'], self.video_max_frame, dtype=tf.float32) 77 | elif modal_name == 'text': 78 | drop_shape = [train_batch_size, 1] 79 | elif modal_name == 'image': 80 | drop_shape = [train_batch_size, 1, 1, 1] 81 | 82 | if is_training and self.use_modal_drop: 83 | inputs_dict[modal_name], prob_dict[modal_name+'_loss_weight'] = self._modal_drop(inputs_dict[modal_name], self.modal_drop_rate, drop_shape) 84 | 85 | with tf.variable_scope(modal_name): 86 | if modal_name in ['video', 'audio']: 87 | embedding = self.head_dict[modal_name](inputs_dict[modal_name], is_training=is_training, mask=mask) 88 | else: 89 | embedding = self.head_dict[modal_name](inputs_dict[modal_name], is_training=is_training) 90 | 91 | with tf.variable_scope("tag_classifier/"+modal_name[0]): 92 | if self.with_embedding_bn: 93 | embedding = slim.batch_norm(embedding, center=True, scale=True, is_training=is_training, scope=modal_name[0]+"_feat_bn") 94 | encode_emb = self.fusion_head_dict[modal_name]([embedding], is_training=is_training) 95 | prob_dict['tagging_output_'+modal_name] = self.classifier_dict[modal_name](encode_emb) 96 | embedding_list.append(embedding) 97 | #if is_training: 98 | # tf.summary.histogram("embedding/{}".format(modal_name), embedding) 99 | # tf.summary.histogram("encode_emb/{}".format(modal_name), encode_emb) 100 | 101 | #fusion 102 | with tf.variable_scope("tag_classifier/fusion"): 103 | fusion_embedding = self.fusion_head_dict['fusion'](embedding_list, is_training = is_training) 104 | probs = self.classifier_dict['fusion'](fusion_embedding) 105 | prob_dict['tagging_output_fusion'] = probs 106 | prob_dict['video_embedding'] = fusion_embedding 107 | return prob_dict 108 | 109 | def build_loss(self, inputs, results, label_loss_fn_dict): 110 | loss_dict={} 111 | for key, loss_fn in label_loss_fn_dict.items(): 112 | if key == 'tagging': 113 | labels = inputs['tagging'] 114 | for modal in self.modal_name_list + ['fusion']: 115 | loss_weight = results.get(modal+"_loss_weight", 1.0) 116 | prediction = results["tagging_output_"+modal]["predictions"] 117 | loss_dict["tagging_loss_"+modal] = loss_fn.calculate_loss(prediction, labels, 118 | **dict(loss_weight = loss_weight)) 119 | else: 120 | raise NotImplementedError 121 | return loss_dict 122 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #Author: jefxiong@tencent.com 3 | #Author: xxx@tencent.com 4 | 5 | import sys,os 6 | sys.path.append(os.getcwd()) 7 | import time 8 | import numpy as np 9 | 10 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 11 | 12 | import tensorflow as tf 13 | from tensorflow import logging 14 | import tensorflow.contrib.slim as slim 15 | import utils.train_util as train_util 16 | from utils.base_trainer import Trainer, train_main 
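# Illustration of the modal dropout defined in src/model/models/nextvlad_bert.py above (a minimal
# NumPy rendering of _modal_drop, not the TF implementation itself). One uniform draw per sample
# decides whether an ENTIRE modality is zeroed out, and the same 0/1 keep mask is returned so that
# build_loss can weight that modality's tagging loss to zero for the dropped samples (via the
# "<modal>_loss_weight" entries in the result dict).
import numpy as np

def modal_drop_np(x, rate=0.0, noise_shape=None, seed=0):
    rng = np.random.default_rng(seed)
    random_scale = rng.uniform(size=noise_shape)         # e.g. [batch, 1, 1] for video/audio input
    keep_mask = (random_scale >= rate).astype(x.dtype)   # 1 = keep the modality, 0 = drop it
    return x * keep_mask, keep_mask.astype(np.float32)

video_feats = np.ones((4, 300, 1024), dtype=np.float32)  # [batch, max_frames, feature_dim]
dropped, loss_weight = modal_drop_np(video_feats, rate=0.3, noise_shape=(4, 1, 1))
# Samples with loss_weight[i] == 0 have their video features zeroed and contribute no video loss.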
17 | 18 | 19 | 20 | class TaggingTrainer(Trainer): 21 | def __init__(self, cluster, task, model, reader, configs): 22 | super().__init__(cluster, task, model, reader, configs) 23 | 24 | def get_train_fetch_dict(self): 25 | fetch_dict = {} 26 | fetch_dict['global_step'] = self.global_step 27 | fetch_dict['train_losses_dict'] = self.train_losses_dict 28 | fetch_dict['trian_op'] = self.train_op 29 | #标签任务相关变量 30 | fetch_dict['train_tagging_predictions'] = self.train_tagging_predictions 31 | fetch_dict['train_tagging_labels'] = self.train_inputs_dict['tagging'] 32 | return fetch_dict 33 | 34 | def get_val_fetch_dict(self): 35 | fetch_dict = {} 36 | for modal_name in ['fusion'] + self.modal_name_list: 37 | fetch_dict['tagging_output_'+modal_name] = tf.get_collection('tagging_output_'+modal_name)[0] 38 | fetch_dict['tagging_loss_'+modal_name] = tf.get_collection('tagging_loss_'+modal_name)[0] 39 | fetch_dict['val_summary_op'] = tf.get_collection("val_summary_op")[0] 40 | return fetch_dict 41 | 42 | def load_pretrained_model(self): 43 | text_pretrained_model = self.optimizer_config.pretrained_model['text_pretrained_model'] 44 | assignment_map, _ = train_util.get_assignment_map_from_checkpoint(tf.global_variables(), 45 | text_pretrained_model, 46 | var_prefix='tower/text/', 47 | show=True) 48 | tf.train.init_from_checkpoint(text_pretrained_model, assignment_map) 49 | print("load text_pretrained_model: {}".format(text_pretrained_model)) 50 | 51 | def train_metric_log(self, train_fetch_dict_val, examples_per_second): 52 | """训练集上的结果验证和训练指标tensorboard输出""" 53 | 54 | predictions_val = train_fetch_dict_val['train_tagging_predictions'] 55 | labels_val = train_fetch_dict_val['train_tagging_labels'] 56 | global_step_val = train_fetch_dict_val['global_step'] 57 | train_losses_dict = train_fetch_dict_val['train_losses_dict'] 58 | 59 | train_pr_calculator = train_util.PRCalculator() 60 | gap = train_util.calculate_gap(predictions_val, labels_val) 61 | train_pr_calculator.accumulate(predictions_val, labels_val) 62 | precision_at_1 = train_pr_calculator.get_precision_at_conf(0.1) 63 | precision_at_5 = train_pr_calculator.get_precision_at_conf(0.5) 64 | recall_at_1 = train_pr_calculator.get_recall_at_conf(0.1) 65 | recall_at_5 = train_pr_calculator.get_recall_at_conf(0.5) 66 | train_pr_calculator.clear() 67 | 68 | train_losses_info = "|".join(["{}: {:.3f}".format(k, v) for k,v in train_losses_dict.items()]) 69 | logging.info("training step {} |{} | Examples/sec: {:.2f}".format(global_step_val, train_losses_info, examples_per_second)) 70 | logging.info("GAP: {:.2f} | precision@0.1: {:.2f} | precision@0.5: {:.2f} |recall@0.1: {:.2f} | recall@0.5: {:.2f}".format(gap, 71 | precision_at_1, precision_at_5,recall_at_1, recall_at_5)) 72 | 73 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/GAP", gap), global_step_val) 74 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/precision@0.1", precision_at_1), global_step_val) 75 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/precision@0.5", precision_at_5), global_step_val) 76 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/recall@0.1", recall_at_1), global_step_val) 77 | self.sv.summary_writer.add_summary(train_util.MakeSummary("TrainMetric/recall@0.5", recall_at_5), global_step_val) 78 | self.sv.summary_writer.flush() 79 | 80 | def eval(self, sess, global_step_val, data_generater, data_source_name): 81 | #taggging eval 82 | tagging_class_num = 
self.reader.label_num_dict['tagging'] 83 | self.evl_metrics = [train_util.EvaluationMetrics(tagging_class_num, top_k=20) 84 | for i in range(len(self.modal_name_list)+1)] #+1 for fusion 85 | for i in range(len(self.evl_metrics)): 86 | self.evl_metrics[i].clear() 87 | 88 | examples_processed = 0 89 | 90 | for sample in data_generater: 91 | batch_start_time = time.time() 92 | feed_dict_data = {} 93 | for input_name in self.reader.dname_string_list: 94 | var_names = tf.get_collection(input_name) 95 | assert len(var_names)==1 96 | feed_dict_data[var_names[0]] = sample[input_name] 97 | 98 | 99 | fetch_dict_eval = sess.run(self.val_fetch_dict, feed_dict=feed_dict_data) 100 | seconds_per_batch = time.time() - batch_start_time 101 | example_per_second = self.reader.batch_size / seconds_per_batch 102 | examples_processed += self.reader.batch_size 103 | 104 | for index, modal_name in enumerate(self.modal_name_list+['fusion']): 105 | pred = fetch_dict_eval['tagging_output_'+modal_name] 106 | val_label = sample['tagging'] 107 | gap = train_util.calculate_gap(pred, val_label) 108 | iteration_info_dict = self.evl_metrics[index].accumulate(pred, val_label, fetch_dict_eval['tagging_loss_'+modal_name]) 109 | iteration_info_dict['GAP'] = gap 110 | iteration_info_dict["examples_per_second"] = example_per_second 111 | iterinfo = "|".join(["{}: {:.3f}".format(k,v) for k,v in iteration_info_dict.items()]) 112 | logging.info("examples_processed: %d | %s", examples_processed, iterinfo) 113 | logging.info("Done with batched inference. Now calculating global performance metrics.") 114 | 115 | for index, modal_name in enumerate(self.modal_name_list+['fusion']): 116 | epoch_info_dict = self.evl_metrics[index].get() 117 | epoch_info_dict["epoch_id"] = global_step_val 118 | epochinfo = train_util.FormatEvalInfo(self.summary_writer, global_step_val, epoch_info_dict, prefix="val_"+modal_name) 119 | logging.info(epochinfo) 120 | self.evl_metrics[index].clear() 121 | self.summary_writer.add_summary(fetch_dict_eval['val_summary_op'], global_step_val) 122 | 123 | return epoch_info_dict['gap'] #融合特征的预测结果 124 | 125 | 126 | if __name__ == "__main__": 127 | import argparse 128 | parser = argparse.ArgumentParser() 129 | parser.add_argument('--config',default='configs/config.example.yaml',type=str) 130 | args = parser.parse_args() 131 | train_main(args.config, TaggingTrainer) 132 | -------------------------------------------------------------------------------- /infer.sh: -------------------------------------------------------------------------------- 1 | # Confirm Position of the code 2 | CONDA_NEW_ENV=taac2021-tagging-pytrochyyds 3 | ENV_ROOT=/home/tione/notebook 4 | CODE_ROOT=${ENV_ROOT}/VideoStructuring 5 | CODE_BASE=${CODE_ROOT}/taac2021_tagging_pytorchyyds 6 | DATA_BASE=${CODE_ROOT}/dataset 7 | # #################### get env directories 8 | # CONDA_ROOT 9 | CONDA_CONFIG_ROOT_PREFIX=$(conda config --show root_prefix) 10 | get_conda_root_prefix() { 11 | TMP_POS=$(awk -v a="${CONDA_CONFIG_ROOT_PREFIX}" -v b="/" 'BEGIN{print index(a, b)}') 12 | TMP_POS=$((TMP_POS-1)) 13 | if [ $TMP_POS -ge 0 ]; then 14 | echo "${CONDA_CONFIG_ROOT_PREFIX:${TMP_POS}}" 15 | else 16 | echo "" 17 | fi 18 | } 19 | CONDA_ROOT=$(get_conda_root_prefix) 20 | if [ ! -d "${CONDA_ROOT}" ]; then 21 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到Conda环境:${CONDA_ROOT}" 22 | exit 1 23 | fi 24 | 25 | if [ ! 
-d "${DATA_BASE}" ]; then 26 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到数据集:${DATASET_ROOT}" 27 | exit 1 28 | fi 29 | 30 | CONDA_CONFIG_FILE="${CONDA_ROOT}/etc/profile.d/conda.sh" 31 | if [ ! -f "${CONDA_CONFIG_FILE}" ]; then 32 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR 找不到Conda配置文件:${CONDA_CONFIG_FILE}" 33 | exit 1 34 | fi 35 | source "${CONDA_CONFIG_FILE}" 36 | conda activate ${ENV_ROOT}/envs/${CONDA_NEW_ENV} 37 | 38 | check_gpu=$(python -c "import os; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'; import tensorflow.compat.v1 as tf; print(tf.test.is_gpu_available())") 39 | if [ "${check_gpu}" == "False" ]; then 40 | echo "[$(date "+%Y-%m-%d %H:%M:%S")] ERROR Conda环境启动失败!" 41 | exit 1 42 | fi 43 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 已启动Conda环境!" 44 | 45 | 46 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始拷贝ASR和OCR文本特征..." 47 | cp ${ENV_ROOT}/algo-2021/dataset/tagging/tagging_dataset_test_5k_2nd/text_txt/tagging/ ${CODE_ROOT}/dataset/tagging/tagging_dataset_test_5k_2nd/text_txt/ -r 48 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 文本特征已就绪!" 49 | 50 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始拷贝音频特征..." 51 | cp ${ENV_ROOT}/algo-2021/dataset/tagging/tagging_dataset_test_5k_2nd/audio_npy/Vggish/tagging/ ${CODE_ROOT}/dataset/tagging/tagging_dataset_test_5k_2nd/audio_npy/Vggish/ -r 52 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 音频特征已就绪!" 53 | 54 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始抽取Video特征..." 55 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/test_5k_2nd/ --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log & 56 | sleep 30s 57 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/test_5k_2nd/ --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log & 58 | sleep 30s 59 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/test_5k_2nd/ --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet --do_logging 1 | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log 60 | 61 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始检查Video特征抽取结果..." 
62 | python ${CODE_BASE}/preprocess/feat_extract_main.py --test_files_dir ${DATA_BASE}/videos/test_5k_2nd/ --frame_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/video_npy/Youtube8M/tagging/ --audio_npy_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/audio_npy --image_jpg_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/image_jpg --text_txt_folder ${DATA_BASE}/tagging/tagging_dataset_test_5k_2nd/text_txt --datafile_path ${DATA_BASE}/tagging/GroundTruth/datafile/train.txt --extract_type 1 --image_batch_size 300 --imgfeat_extractor efficientnet --do_logging 1 | grep -v "I tensorflow" >> /home/tione/notebook/log/feat_extract.log 63 | 64 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 视频特征抽取完成!" 65 | 66 | 67 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始进行预测..." 68 | for fold in 0 2 4 6 8 69 | do 70 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 第${fold}个子模型开始预测..." 71 | rm -rf ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 72 | mkdir ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 73 | chmod 777 ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 74 | BEST_MODEL=$(ls -td -- ${CODE_ROOT}/KFoldModels/model_"$fold"/export/* | head -n 1) 75 | python ${CODE_BASE}/infer.py \ 76 | --model_pb ${BEST_MODEL} \ 77 | --tag_id_file ${CODE_ROOT}/dataset/label_id.txt \ 78 | --test_dir ${CODE_ROOT}/dataset/videos/test_5k_2nd/ \ 79 | --output_json ${CODE_ROOT}/KFoldResults/test/results_"$fold"/tagging_5k.json \ 80 | --load_feat 1 \ 81 | --feat_dir ${CODE_ROOT}/dataset/tagging/tagging_dataset_test_5k_2nd/ \ 82 | --top_k 82 > ${ENV_ROOT}/log/test_eval_log.txt & 83 | 84 | sleep 10s 85 | fold=$(($fold+1)) 86 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 第${fold}个子模型开始预测..." 87 | rm -rf ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 88 | mkdir ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 89 | chmod 777 ${CODE_ROOT}/KFoldResults/test/results_"$fold"/ 90 | BEST_MODEL=$(ls -td -- ${CODE_ROOT}/KFoldModels/model_"$fold"/export/* | head -n 1) 91 | python ${CODE_BASE}/infer.py \ 92 | --model_pb ${BEST_MODEL} \ 93 | --tag_id_file ${CODE_ROOT}/dataset/label_id.txt \ 94 | --test_dir ${CODE_ROOT}/dataset/videos/test_5k_2nd/ \ 95 | --output_json ${CODE_ROOT}/KFoldResults/test/results_"$fold"/tagging_5k.json \ 96 | --load_feat 1 \ 97 | --feat_dir ${CODE_ROOT}/dataset/tagging/tagging_dataset_test_5k_2nd/ \ 98 | --top_k 82 > ${ENV_ROOT}/log/test_eval_log.txt 99 | done 100 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO K折预测已完成!" 101 | 102 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 开始进行模型融合..." 103 | python ${CODE_BASE}/utils/k_fold_fusion.py 10 ${CODE_ROOT}/dataset/label_id.txt ${CODE_ROOT}/KFoldResults/test/results_{}/tagging_5k.json ${CODE_BASE}/results/tagging_5k.json 20 104 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 模型融合结果已完成!" 105 | 106 | cp ${CODE_BASE}/results/tagging_5k.json ${ENV_ROOT}/pytorchyyds_prediction_5k.json 107 | echo "[$(date '+%Y-%m-%d %H:%M:%S')] INFO 'Pytorch永远滴神'团队最终预测结果已保存到:${ENV_ROOT}/pytorchyyds_prediction_5k.json" 108 | -------------------------------------------------------------------------------- /src/model/image_head/efficientNet/condconv/efficientnet_condconv_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Builder for EfficientNet-CondConv models.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import tensorflow.compat.v1 as tf 23 | 24 | import efficientnet_builder 25 | import efficientnet_model 26 | import utils 27 | 28 | # The input tensor is in the range of [0, 255], we need to scale them to the 29 | # range of [0, 1] 30 | MEAN_RGB = [127.0, 127.0, 127.0] 31 | STDDEV_RGB = [128.0, 128.0, 128.0] 32 | 33 | 34 | def efficientnet_condconv_params(model_name): 35 | """Get efficientnet-condconv params based on model name.""" 36 | params_dict = { 37 | # (width_coefficient, depth_coefficient, resolution, dropout_rate, 38 | # condconv_num_experts) 39 | 'efficientnet-condconv-b0-4e': (1.0, 1.0, 224, 0.25, 4), 40 | 'efficientnet-condconv-b0-8e': (1.0, 1.0, 224, 0.25, 8), 41 | 'efficientnet-condconv-b0-8e-depth': (1.0, 1.1, 224, 0.25, 8) 42 | } 43 | return params_dict[model_name] 44 | 45 | 46 | def efficientnet_condconv(width_coefficient=None, 47 | depth_coefficient=None, 48 | dropout_rate=0.2, 49 | survival_prob=0.8, 50 | condconv_num_experts=None): 51 | """Creates an efficientnet-condconv model.""" 52 | blocks_args = [ 53 | 'r1_k3_s11_e1_i32_o16_se0.25', 54 | 'r2_k3_s22_e6_i16_o24_se0.25', 55 | 'r2_k5_s22_e6_i24_o40_se0.25', 56 | 'r3_k3_s22_e6_i40_o80_se0.25', 57 | 'r3_k5_s11_e6_i80_o112_se0.25_cc', 58 | 'r4_k5_s22_e6_i112_o192_se0.25_cc', 59 | 'r1_k3_s11_e6_i192_o320_se0.25_cc', 60 | ] 61 | global_params = efficientnet_model.GlobalParams( 62 | batch_norm_momentum=0.99, 63 | batch_norm_epsilon=1e-3, 64 | dropout_rate=dropout_rate, 65 | survival_prob=survival_prob, 66 | data_format='channels_last', 67 | num_classes=1000, 68 | width_coefficient=width_coefficient, 69 | depth_coefficient=depth_coefficient, 70 | depth_divisor=8, 71 | min_depth=None, 72 | relu_fn=tf.nn.swish, 73 | # The default is TPU-specific batch norm. 74 | # The alternative is tf.layers.BatchNormalization. 75 | batch_norm=utils.TpuBatchNormalization, # TPU-specific requirement. 
76 | use_se=True, 77 | condconv_num_experts=condconv_num_experts) 78 | decoder = efficientnet_builder.BlockDecoder() 79 | return decoder.decode(blocks_args), global_params 80 | 81 | 82 | def get_model_params(model_name, override_params): 83 | """Get the block args and global params for a given model.""" 84 | if model_name.startswith('efficientnet-condconv'): 85 | (width_coefficient, depth_coefficient, _, dropout_rate, 86 | condconv_num_experts) = ( 87 | efficientnet_condconv_params(model_name)) 88 | blocks_args, global_params = efficientnet_condconv( 89 | width_coefficient=width_coefficient, 90 | depth_coefficient=depth_coefficient, 91 | dropout_rate=dropout_rate, 92 | condconv_num_experts=condconv_num_experts) 93 | else: 94 | raise NotImplementedError('model name is not pre-defined: %s' % model_name) 95 | 96 | if override_params: 97 | # ValueError will be raised here if override_params has fields not included 98 | # in global_params. 99 | global_params = global_params._replace(**override_params) 100 | 101 | tf.logging.info('global_params= %s', global_params) 102 | tf.logging.info('blocks_args= %s', blocks_args) 103 | return blocks_args, global_params 104 | 105 | 106 | def build_model(images, 107 | model_name, 108 | training, 109 | override_params=None, 110 | model_dir=None, 111 | fine_tuning=False): 112 | """A helper functiion to creates a model and returns predicted logits. 113 | 114 | Args: 115 | images: input images tensor. 116 | model_name: string, the predefined model name. 117 | training: boolean, whether the model is constructed for training. 118 | override_params: A dictionary of params for overriding. Fields must exist in 119 | efficientnet_model.GlobalParams. 120 | model_dir: string, optional model dir for saving configs. 121 | fine_tuning: boolean, whether the model is used for finetuning. 122 | 123 | Returns: 124 | logits: the logits tensor of classes. 125 | endpoints: the endpoints for each layer. 126 | 127 | Raises: 128 | When model_name specified an undefined model, raises NotImplementedError. 129 | When override_params has invalid fields, raises ValueError. 130 | """ 131 | assert isinstance(images, tf.Tensor) 132 | if not training or fine_tuning: 133 | if not override_params: 134 | override_params = {} 135 | override_params['batch_norm'] = utils.BatchNormalization 136 | blocks_args, global_params = get_model_params(model_name, override_params) 137 | if not training or fine_tuning: 138 | global_params = global_params._replace(batch_norm=utils.BatchNormalization) 139 | 140 | if model_dir: 141 | param_file = os.path.join(model_dir, 'model_params.txt') 142 | if not tf.gfile.Exists(param_file): 143 | if not tf.gfile.Exists(model_dir): 144 | tf.gfile.MakeDirs(model_dir) 145 | with tf.gfile.GFile(param_file, 'w') as f: 146 | tf.logging.info('writing to %s' % param_file) 147 | f.write('model_name= %s\n\n' % model_name) 148 | f.write('global_params= %s\n\n' % str(global_params)) 149 | f.write('blocks_args= %s\n\n' % str(blocks_args)) 150 | 151 | with tf.variable_scope(model_name): 152 | model = efficientnet_model.Model(blocks_args, global_params) 153 | logits = model(images, training=training) 154 | 155 | logits = tf.identity(logits, 'logits') 156 | return logits, model.endpoints 157 | 158 | 159 | def build_model_base(images, model_name, training, override_params=None): 160 | """A helper functiion to create a base model and return global_pool. 161 | 162 | Args: 163 | images: input images tensor. 164 | model_name: string, the model name of a pre-defined MnasNet. 
165 | training: boolean, whether the model is constructed for training. 166 | override_params: A dictionary of params for overriding. Fields must exist in 167 | mnasnet_model.GlobalParams. 168 | 169 | Returns: 170 | features: global pool features. 171 | endpoints: the endpoints for each layer. 172 | 173 | Raises: 174 | When model_name specified an undefined model, raises NotImplementedError. 175 | When override_params has invalid fields, raises ValueError. 176 | """ 177 | assert isinstance(images, tf.Tensor) 178 | blocks_args, global_params = get_model_params(model_name, override_params) 179 | 180 | with tf.variable_scope(model_name): 181 | model = efficientnet_model.Model(blocks_args, global_params) 182 | features = model(images, training=training, features_only=True) 183 | 184 | features = tf.identity(features, 'global_pool') 185 | return features, model.endpoints 186 | -------------------------------------------------------------------------------- /src/model/cover_head/mobilenet_v1_train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Build and train mobilenet_v1 with options for quantization.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | from datasets import dataset_factory 24 | from nets import mobilenet_v1 25 | from preprocessing import preprocessing_factory 26 | 27 | slim = tf.contrib.slim 28 | 29 | flags = tf.app.flags 30 | 31 | flags.DEFINE_string('master', '', 'Session master') 32 | flags.DEFINE_integer('task', 0, 'Task') 33 | flags.DEFINE_integer('ps_tasks', 0, 'Number of ps') 34 | flags.DEFINE_integer('batch_size', 64, 'Batch size') 35 | flags.DEFINE_integer('num_classes', 1001, 'Number of classes to distinguish') 36 | flags.DEFINE_integer('number_of_steps', None, 37 | 'Number of training steps to perform before stopping') 38 | flags.DEFINE_integer('image_size', 224, 'Input image resolution') 39 | flags.DEFINE_float('depth_multiplier', 1.0, 'Depth multiplier for mobilenet') 40 | flags.DEFINE_bool('quantize', False, 'Quantize training') 41 | flags.DEFINE_string('fine_tune_checkpoint', '', 42 | 'Checkpoint from which to start finetuning.') 43 | flags.DEFINE_string('checkpoint_dir', '', 44 | 'Directory for writing training checkpoints and logs') 45 | flags.DEFINE_string('dataset_dir', '', 'Location of dataset') 46 | flags.DEFINE_integer('log_every_n_steps', 100, 'Number of steps per log') 47 | flags.DEFINE_integer('save_summaries_secs', 100, 48 | 'How often to save summaries, secs') 49 | flags.DEFINE_integer('save_interval_secs', 100, 50 | 'How often to save checkpoints, secs') 51 | 52 | FLAGS = flags.FLAGS 53 | 54 | _LEARNING_RATE_DECAY_FACTOR = 0.94 55 | 56 | 57 | def 
get_learning_rate(): 58 | if FLAGS.fine_tune_checkpoint: 59 | # If we are fine tuning a checkpoint we need to start at a lower learning 60 | # rate since we are farther along on training. 61 | return 1e-4 62 | else: 63 | return 0.045 64 | 65 | 66 | def get_quant_delay(): 67 | if FLAGS.fine_tune_checkpoint: 68 | # We can start quantizing immediately if we are finetuning. 69 | return 0 70 | else: 71 | # We need to wait for the model to train a bit before we quantize if we are 72 | # training from scratch. 73 | return 250000 74 | 75 | 76 | def imagenet_input(is_training): 77 | """Data reader for imagenet. 78 | 79 | Reads in imagenet data and performs pre-processing on the images. 80 | 81 | Args: 82 | is_training: bool specifying if train or validation dataset is needed. 83 | Returns: 84 | A batch of images and labels. 85 | """ 86 | if is_training: 87 | dataset = dataset_factory.get_dataset('imagenet', 'train', 88 | FLAGS.dataset_dir) 89 | else: 90 | dataset = dataset_factory.get_dataset('imagenet', 'validation', 91 | FLAGS.dataset_dir) 92 | 93 | provider = slim.dataset_data_provider.DatasetDataProvider( 94 | dataset, 95 | shuffle=is_training, 96 | common_queue_capacity=2 * FLAGS.batch_size, 97 | common_queue_min=FLAGS.batch_size) 98 | [image, label] = provider.get(['image', 'label']) 99 | 100 | image_preprocessing_fn = preprocessing_factory.get_preprocessing( 101 | 'mobilenet_v1', is_training=is_training) 102 | 103 | image = image_preprocessing_fn(image, FLAGS.image_size, FLAGS.image_size) 104 | 105 | images, labels = tf.train.batch( 106 | [image, label], 107 | batch_size=FLAGS.batch_size, 108 | num_threads=4, 109 | capacity=5 * FLAGS.batch_size) 110 | labels = slim.one_hot_encoding(labels, FLAGS.num_classes) 111 | return images, labels 112 | 113 | 114 | def build_model(): 115 | """Builds graph for model to train with rewrites for quantization. 116 | 117 | Returns: 118 | g: Graph with fake quantization ops and batch norm folding suitable for 119 | training quantized weights. 120 | train_tensor: Train op for execution during training. 121 | """ 122 | g = tf.Graph() 123 | with g.as_default(), tf.device( 124 | tf.train.replica_device_setter(FLAGS.ps_tasks)): 125 | inputs, labels = imagenet_input(is_training=True) 126 | with slim.arg_scope(mobilenet_v1.mobilenet_v1_arg_scope(is_training=True)): 127 | logits, _ = mobilenet_v1.mobilenet_v1( 128 | inputs, 129 | is_training=True, 130 | depth_multiplier=FLAGS.depth_multiplier, 131 | num_classes=FLAGS.num_classes) 132 | 133 | tf.losses.softmax_cross_entropy(labels, logits) 134 | 135 | # Call rewriter to produce graph with fake quant ops and folded batch norms 136 | # quant_delay delays start of quantization till quant_delay steps, allowing 137 | # for better model accuracy. 138 | if FLAGS.quantize: 139 | tf.contrib.quantize.create_training_graph(quant_delay=get_quant_delay()) 140 | 141 | total_loss = tf.losses.get_total_loss(name='total_loss') 142 | # Configure the learning rate using an exponential decay. 
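# With the defaults above (batch_size=64) and the hard-coded imagenet_size below, this works out to
#   decay_steps = int(1271167 / 64 * 2.5) = 49654
# i.e. the starting rate from get_learning_rate() (0.045 from scratch, 1e-4 when fine-tuning) is
# multiplied by _LEARNING_RATE_DECAY_FACTOR = 0.94 roughly every 2.5 epochs (staircase decay).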
143 | num_epochs_per_decay = 2.5 144 | imagenet_size = 1271167 145 | decay_steps = int(imagenet_size / FLAGS.batch_size * num_epochs_per_decay) 146 | 147 | learning_rate = tf.train.exponential_decay( 148 | get_learning_rate(), 149 | tf.train.get_or_create_global_step(), 150 | decay_steps, 151 | _LEARNING_RATE_DECAY_FACTOR, 152 | staircase=True) 153 | opt = tf.train.GradientDescentOptimizer(learning_rate) 154 | 155 | train_tensor = slim.learning.create_train_op( 156 | total_loss, 157 | optimizer=opt) 158 | 159 | slim.summaries.add_scalar_summary(total_loss, 'total_loss', 'losses') 160 | slim.summaries.add_scalar_summary(learning_rate, 'learning_rate', 'training') 161 | return g, train_tensor 162 | 163 | 164 | def get_checkpoint_init_fn(): 165 | """Returns the checkpoint init_fn if the checkpoint is provided.""" 166 | if FLAGS.fine_tune_checkpoint: 167 | variables_to_restore = slim.get_variables_to_restore() 168 | global_step_reset = tf.assign(tf.train.get_or_create_global_step(), 0) 169 | # When restoring from a floating point model, the min/max values for 170 | # quantized weights and activations are not present. 171 | # We instruct slim to ignore variables that are missing during restoration 172 | # by setting ignore_missing_vars=True 173 | slim_init_fn = slim.assign_from_checkpoint_fn( 174 | FLAGS.fine_tune_checkpoint, 175 | variables_to_restore, 176 | ignore_missing_vars=True) 177 | 178 | def init_fn(sess): 179 | slim_init_fn(sess) 180 | # If we are restoring from a floating point model, we need to initialize 181 | # the global step to zero for the exponential decay to result in 182 | # reasonable learning rates. 183 | sess.run(global_step_reset) 184 | return init_fn 185 | else: 186 | return None 187 | 188 | 189 | def train_model(): 190 | """Trains mobilenet_v1.""" 191 | g, train_tensor = build_model() 192 | with g.as_default(): 193 | slim.learning.train( 194 | train_tensor, 195 | FLAGS.checkpoint_dir, 196 | is_chief=(FLAGS.task == 0), 197 | master=FLAGS.master, 198 | log_every_n_steps=FLAGS.log_every_n_steps, 199 | graph=g, 200 | number_of_steps=FLAGS.number_of_steps, 201 | save_summaries_secs=FLAGS.save_summaries_secs, 202 | save_interval_secs=FLAGS.save_interval_secs, 203 | init_fn=get_checkpoint_init_fn(), 204 | global_step=tf.train.get_global_step()) 205 | 206 | 207 | def main(unused_arg): 208 | train_model() 209 | 210 | 211 | if __name__ == '__main__': 212 | tf.app.run(main) 213 | -------------------------------------------------------------------------------- /src/model/cover_head/nets_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Contains a factory for building various models.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | import functools 21 | 22 | import tensorflow as tf 23 | 24 | from nets import alexnet 25 | from nets import cifarnet 26 | from nets import i3d 27 | from nets import inception 28 | from nets import lenet 29 | from nets import mobilenet_v1 30 | from nets import overfeat 31 | from nets import resnet_v1 32 | from nets import resnet_v2 33 | from nets import s3dg 34 | from nets import vgg 35 | from nets import densenet 36 | from nets.mobilenet import mobilenet_v2 37 | from nets.nasnet import nasnet 38 | from nets.nasnet import pnasnet 39 | 40 | 41 | slim = tf.contrib.slim 42 | 43 | networks_map = {'alexnet_v2': alexnet.alexnet_v2, 44 | 'cifarnet': cifarnet.cifarnet, 45 | 'overfeat': overfeat.overfeat, 46 | 'vgg_a': vgg.vgg_a, 47 | 'vgg_16': vgg.vgg_16, 48 | 'vgg_19': vgg.vgg_19, 49 | 'inception_v1': inception.inception_v1, 50 | 'inception_v2': inception.inception_v2, 51 | 'inception_v3': inception.inception_v3, 52 | 'inception_v4': inception.inception_v4, 53 | 'inception_resnet_v2': inception.inception_resnet_v2, 54 | 'i3d': i3d.i3d, 55 | 's3dg': s3dg.s3dg, 56 | 'lenet': lenet.lenet, 57 | 'resnet_v1_50': resnet_v1.resnet_v1_50, 58 | 'resnet_v1_101': resnet_v1.resnet_v1_101, 59 | 'resnet_v1_152': resnet_v1.resnet_v1_152, 60 | 'resnet_v1_200': resnet_v1.resnet_v1_200, 61 | 'resnet_v2_50': resnet_v2.resnet_v2_50, 62 | 'resnet_v2_101': resnet_v2.resnet_v2_101, 63 | 'resnet_v2_152': resnet_v2.resnet_v2_152, 64 | 'resnet_v2_200': resnet_v2.resnet_v2_200, 65 | 'densenet121': densenet.densenet121, 66 | 'densenet161': densenet.densenet161, 67 | 'densenet169': densenet.densenet169, 68 | 'mobilenet_v1': mobilenet_v1.mobilenet_v1, 69 | 'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_075, 70 | 'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_050, 71 | 'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_025, 72 | 'mobilenet_v2': mobilenet_v2.mobilenet, 73 | 'mobilenet_v2_140': mobilenet_v2.mobilenet_v2_140, 74 | 'mobilenet_v2_035': mobilenet_v2.mobilenet_v2_035, 75 | 'nasnet_cifar': nasnet.build_nasnet_cifar, 76 | 'nasnet_mobile': nasnet.build_nasnet_mobile, 77 | 'nasnet_large': nasnet.build_nasnet_large, 78 | 'pnasnet_large': pnasnet.build_pnasnet_large, 79 | 'pnasnet_mobile': pnasnet.build_pnasnet_mobile, 80 | } 81 | 82 | arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope, 83 | 'cifarnet': cifarnet.cifarnet_arg_scope, 84 | 'overfeat': overfeat.overfeat_arg_scope, 85 | 'vgg_a': vgg.vgg_arg_scope, 86 | 'vgg_16': vgg.vgg_arg_scope, 87 | 'vgg_19': vgg.vgg_arg_scope, 88 | 'inception_v1': inception.inception_v3_arg_scope, 89 | 'inception_v2': inception.inception_v3_arg_scope, 90 | 'inception_v3': inception.inception_v3_arg_scope, 91 | 'inception_v4': inception.inception_v4_arg_scope, 92 | 'inception_resnet_v2': 93 | inception.inception_resnet_v2_arg_scope, 94 | 'i3d': i3d.i3d_arg_scope, 95 | 's3dg': s3dg.s3dg_arg_scope, 96 | 'lenet': lenet.lenet_arg_scope, 97 | 'resnet_v1_50': resnet_v1.resnet_arg_scope, 98 | 'resnet_v1_101': resnet_v1.resnet_arg_scope, 99 | 'resnet_v1_152': resnet_v1.resnet_arg_scope, 100 | 'resnet_v1_200': resnet_v1.resnet_arg_scope, 101 | 'resnet_v2_50': resnet_v2.resnet_arg_scope, 102 | 'resnet_v2_101': resnet_v2.resnet_arg_scope, 103 | 'resnet_v2_152': resnet_v2.resnet_arg_scope, 104 | 'resnet_v2_200': 
resnet_v2.resnet_arg_scope, 105 | 'densenet121': densenet.densenet_arg_scope, 106 | 'densenet161': densenet.densenet_arg_scope, 107 | 'densenet169': densenet.densenet_arg_scope, 108 | 'mobilenet_v1': mobilenet_v1.mobilenet_v1_arg_scope, 109 | 'mobilenet_v1_075': mobilenet_v1.mobilenet_v1_arg_scope, 110 | 'mobilenet_v1_050': mobilenet_v1.mobilenet_v1_arg_scope, 111 | 'mobilenet_v1_025': mobilenet_v1.mobilenet_v1_arg_scope, 112 | 'mobilenet_v2': mobilenet_v2.training_scope, 113 | 'mobilenet_v2_035': mobilenet_v2.training_scope, 114 | 'mobilenet_v2_140': mobilenet_v2.training_scope, 115 | 'nasnet_cifar': nasnet.nasnet_cifar_arg_scope, 116 | 'nasnet_mobile': nasnet.nasnet_mobile_arg_scope, 117 | 'nasnet_large': nasnet.nasnet_large_arg_scope, 118 | 'pnasnet_large': pnasnet.pnasnet_large_arg_scope, 119 | 'pnasnet_mobile': pnasnet.pnasnet_mobile_arg_scope, 120 | } 121 | 122 | 123 | def get_network_fn(name, num_classes, weight_decay=0.0, is_training=False): 124 | """Returns a network_fn such as `logits, end_points = network_fn(images)`. 125 | 126 | Args: 127 | name: The name of the network. 128 | num_classes: The number of classes to use for classification. If 0 or None, 129 | the logits layer is omitted and its input features are returned instead. 130 | weight_decay: The l2 coefficient for the model weights. 131 | is_training: `True` if the model is being used for training and `False` 132 | otherwise. 133 | 134 | Returns: 135 | network_fn: A function that applies the model to a batch of images. It has 136 | the following signature: 137 | net, end_points = network_fn(images) 138 | The `images` input is a tensor of shape [batch_size, height, width, 3] 139 | with height = width = network_fn.default_image_size. (The permissibility 140 | and treatment of other sizes depends on the network_fn.) 141 | The returned `end_points` are a dictionary of intermediate activations. 142 | The returned `net` is the topmost layer, depending on `num_classes`: 143 | If `num_classes` was a non-zero integer, `net` is a logits tensor 144 | of shape [batch_size, num_classes]. 145 | If `num_classes` was 0 or `None`, `net` is a tensor with the input 146 | to the logits layer of shape [batch_size, 1, 1, num_features] or 147 | [batch_size, num_features]. Dropout has not been applied to this 148 | (even if the network's original classification does); it remains for 149 | the caller to do this or not. 150 | 151 | Raises: 152 | ValueError: If network `name` is not recognized. 153 | """ 154 | if name not in networks_map: 155 | raise ValueError('Name of network unknown %s' % name) 156 | func = networks_map[name] 157 | @functools.wraps(func) 158 | def network_fn(images, **kwargs): 159 | arg_scope = arg_scopes_map[name](weight_decay=weight_decay) 160 | with slim.arg_scope(arg_scope): 161 | return func(images, num_classes=num_classes, is_training=is_training, 162 | **kwargs) 163 | if hasattr(func, 'default_image_size'): 164 | network_fn.default_image_size = func.default_image_size 165 | 166 | return network_fn 167 | --------------------------------------------------------------------------------
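For reference, a minimal usage sketch of get_network_fn as described in its docstring. It assumes TF 1.x and the slim "nets" package importable on PYTHONPATH, exactly as nets_factory.py itself assumes; the model name, weight decay and shapes here are illustrative rather than taken from this repo's configs.

import tensorflow as tf
import nets_factory  # i.e. src/model/cover_head/nets_factory.py

network_fn = nets_factory.get_network_fn('resnet_v1_50', num_classes=1000,
                                         weight_decay=1e-4, is_training=False)
size = network_fn.default_image_size        # 224 for resnet_v1_50
images = tf.placeholder(tf.float32, [None, size, size, 3])
logits, end_points = network_fn(images)     # logits: [batch, 1000]; end_points: dict of activations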