├── README.md ├── baseline-cca ├── cca.py ├── dcca.py ├── dccae.py └── util.py ├── baseline-cpmnet ├── change_format.py ├── test_lianzheng.py └── util │ ├── classfiy.py │ ├── get_sn.py │ ├── model.py │ └── util.py ├── baseline-mmin ├── auto │ ├── close_screen.sh │ ├── combine_results.py │ └── task_generate.py ├── change_format.py ├── data │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── base_dataset.cpython-37.pyc │ │ ├── base_dataset.cpython-38.pyc │ │ ├── cmumosei_miss_dataset.cpython-38.pyc │ │ ├── cmumosei_multimodal_dataset.cpython-38.pyc │ │ ├── iemocapfour_miss_dataset.cpython-37.pyc │ │ ├── iemocapfour_miss_dataset.cpython-38.pyc │ │ ├── iemocapfour_multimodal_dataset.cpython-38.pyc │ │ ├── iemocapsix_miss_dataset.cpython-38.pyc │ │ └── iemocapsix_multimodal_dataset.cpython-38.pyc │ ├── base_dataset.py │ ├── cmumosei_miss_dataset.py │ ├── cmumosei_multimodal_dataset.py │ ├── cmumosi_miss_dataset.py │ ├── cmumosi_multimodal_dataset.py │ ├── comparE_dataset.py │ ├── config │ │ ├── CMUMOSEI_config.json │ │ ├── CMUMOSI_config.json │ │ ├── IEMOCAPFOUR_config.json │ │ └── IEMOCAPSIX_config.json │ ├── iemocapfour_miss_dataset.py │ ├── iemocapfour_multimodal_dataset.py │ ├── iemocapsix_miss_dataset.py │ ├── iemocapsix_multimodal_dataset.py │ ├── melspec_dataset.py │ ├── msp_miss_dataset.py │ ├── msp_multimodal_dataset.py │ ├── multimodal_dataset.py │ ├── multimodal_miss_dataset.py │ └── word_aligned_dataset.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── base_model.cpython-37.pyc │ │ ├── base_model.cpython-38.pyc │ │ ├── mmin_AE_model.cpython-38.pyc │ │ ├── mmin_CRA_model.cpython-37.pyc │ │ ├── mmin_CRA_model.cpython-38.pyc │ │ ├── mmin_model.cpython-38.pyc │ │ ├── utt_fusion_model.cpython-37.pyc │ │ └── utt_fusion_model.cpython-38.pyc │ ├── base_model.py │ ├── lstm_audio_model.py │ ├── mmin_AE_model.py │ ├── mmin_CRA_model.py │ ├── mmin_ablation_model.py │ ├── mmin_model.py │ ├── mmin_no_cycle_model.py │ ├── mmin_old_model.py │ ├── networks │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── autoencoder.cpython-37.pyc │ │ │ ├── autoencoder.cpython-38.pyc │ │ │ ├── classifier.cpython-37.pyc │ │ │ ├── classifier.cpython-38.pyc │ │ │ ├── fc.cpython-37.pyc │ │ │ ├── fc.cpython-38.pyc │ │ │ ├── lstm.cpython-37.pyc │ │ │ ├── lstm.cpython-38.pyc │ │ │ ├── textcnn.cpython-37.pyc │ │ │ ├── textcnn.cpython-38.pyc │ │ │ ├── tools.cpython-37.pyc │ │ │ └── tools.cpython-38.pyc │ │ ├── autoencoder.py │ │ ├── classifier.py │ │ ├── fc.py │ │ ├── lstm.py │ │ ├── textcnn.py │ │ └── tools.py │ ├── utils │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── config.cpython-37.pyc │ │ │ └── config.cpython-38.pyc │ │ ├── config.py │ │ └── load_pretrained.py │ ├── utt_fusion_model.py │ └── uttf_dataaug_model.py ├── opts │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── base_opts.cpython-37.pyc │ │ ├── base_opts.cpython-38.pyc │ │ ├── train_opts.cpython-37.pyc │ │ └── train_opts.cpython-38.pyc │ ├── base_opts.py │ ├── test_opts.py │ └── train_opts.py ├── preprocess │ ├── IEMOCAP │ │ ├── make_aligned.py │ │ ├── make_comparE.py │ │ ├── make_melspec.py │ │ ├── make_torch_denseface.py │ │ ├── melspec_extractor.py │ │ ├── migrate_VL_feat.py │ │ ├── migrate_compaeE_tonpy.py │ │ └── statis_comparE.py │ 
├── MSP │ │ ├── make_aligned.py │ │ ├── make_aligned_info.py │ │ └── make_comparE.py │ ├── debug.py │ └── tools │ │ ├── bert_extractor.py │ │ ├── denseface │ │ ├── densenet.py │ │ ├── densenet_train.py │ │ └── vision_network │ │ │ ├── __init__.py │ │ │ ├── data_providers │ │ │ ├── __init__.py │ │ │ ├── base_provider.py │ │ │ ├── cifar.py │ │ │ ├── downloader.py │ │ │ ├── fer.py │ │ │ ├── svhn.py │ │ │ └── utils.py │ │ │ ├── denseface_feature.py │ │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── dense_net.py │ │ │ └── run_dense_net.py │ │ └── denseface_extractor.py ├── train_baseline.py ├── train_miss.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── logger.cpython-37.pyc │ └── logger.cpython-38.pyc │ ├── image_pool.py │ └── logger.py ├── config.py ├── dataset ├── CMUMOSEI │ └── CMUMOSEI_features_raw_2way.pkl ├── CMUMOSI │ └── CMUMOSI_features_raw_2way.pkl └── IEMOCAP │ ├── IEMOCAP_features_raw_4way.pkl │ └── IEMOCAP_features_raw_6way.pkl ├── detect.py ├── environment.yml ├── face_detection_yunet_2021sep.onnx ├── feature_extraction ├── audio │ ├── __pycache__ │ │ ├── config.cpython-38.pyc │ │ ├── feature_extractor.cpython-38.pyc │ │ └── util.cpython-38.pyc │ ├── extract_handcrafted_feature.py │ ├── extract_panns_embedding.py │ ├── extract_vggish_embedding.py │ ├── extract_wav2vec2_embedding.py │ ├── extract_wav2vec_embedding.py │ ├── feature_extractor.py │ ├── panns │ │ ├── __pycache__ │ │ │ ├── models.cpython-38.pyc │ │ │ └── pytorch_utils.cpython-38.pyc │ │ ├── evaluate.py │ │ ├── finetune_template.py │ │ ├── inference.py │ │ ├── losses.py │ │ ├── main.py │ │ ├── models.py │ │ └── pytorch_utils.py │ ├── run.sh │ ├── smile.log │ ├── util.py │ └── vggish │ │ ├── README.md │ │ ├── __pycache__ │ │ ├── mel_features.cpython-38.pyc │ │ ├── vggish_input.cpython-38.pyc │ │ ├── vggish_params.cpython-38.pyc │ │ └── vggish_slim.cpython-38.pyc │ │ ├── mel_features.py │ │ ├── vggish_inference_demo.py │ │ ├── vggish_input.py │ │ ├── vggish_params.py │ │ ├── vggish_pca_params.npz │ │ ├── vggish_postprocess.py │ │ ├── vggish_slim.py │ │ ├── vggish_smoke_test.py │ │ └── vggish_train_demo.py ├── text │ ├── __pycache__ │ │ └── util.cpython-38.pyc │ ├── extract_text_embedding.py │ ├── extract_text_embedding_LZ.py │ └── util.py └── visual │ ├── __pycache__ │ ├── config.cpython-38.pyc │ ├── dataset.cpython-38.pyc │ └── util.cpython-38.pyc │ ├── dataset.py │ ├── emonet │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── data_augmentation.cpython-38.pyc │ ├── data │ │ ├── __init__.py │ │ └── affecnet.py │ ├── data_augmentation.py │ ├── evaluation.py │ ├── metrics.py │ └── models │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ └── emonet.cpython-38.pyc │ │ └── emonet.py │ ├── extract_emonet_embedding.py │ ├── extract_ferplus_embedding.py │ ├── extract_manet_embedding.py │ ├── extract_openface.py │ ├── manet │ ├── LICENSE │ ├── README.md │ ├── log │ │ ├── AffectNet7.png │ │ ├── AffectNet7.txt │ │ ├── AffectNet8.png │ │ ├── AffectNet8.txt │ │ ├── CAER-S.png │ │ ├── CAER-S.txt │ │ ├── FED-RO.png │ │ ├── FED-RO.txt │ │ ├── RAF-DB.png │ │ ├── RAF-DB.txt │ │ ├── SFEW.png │ │ ├── SFEW.txt │ │ ├── [02-08]-[16-22]-cnn.png │ │ ├── [02-08]-[16-22]-log.txt │ │ ├── [02-08]-[19-12]-cnn.png │ │ ├── [02-08]-[19-12]-log.txt │ │ ├── [02-08]-[21-19]-cnn.png │ │ ├── [02-08]-[21-19]-log.txt │ │ ├── [02-08]-[22-55]-cnn.png │ │ ├── [02-08]-[22-55]-log.txt │ │ ├── [02-12]-[19-11]-cnn.png │ │ ├── [02-12]-[19-11]-log.txt │ │ 
├── [02-12]-[19-11]-scratch-log.txt │ │ ├── [02-12]-[22-21]-cnn.png │ │ ├── [02-12]-[22-21]-log.txt │ │ ├── [02-12]-[22-21]-scratch-lr0.01-log.txt │ │ ├── [05-28]-[13-07]-cnn.png │ │ ├── [05-28]-[13-07]-log.txt │ │ └── [05-28]-[13-07]-scratch-lr0.1-log.txt │ ├── main.py │ ├── model │ │ ├── __pycache__ │ │ │ ├── attention.cpython-38.pyc │ │ │ └── manet.cpython-38.pyc │ │ ├── attention.py │ │ └── manet.py │ └── reorganize_rafdb.py │ ├── pytorch-benchmarks │ ├── .gitignore │ ├── LICENSE.md │ ├── README.md │ ├── fer2013 │ │ ├── __init__.py │ │ ├── fer.py │ │ └── fer_loader.py │ ├── imagenet │ │ ├── __init__.py │ │ ├── evaluation.py │ │ └── imagenet.py.bak │ ├── lfw_eval.py │ ├── matlab_cp2tform.py │ ├── model │ │ ├── alexnet_face_fer_bn_dag.py │ │ ├── resnet50_ferplus_dag.py │ │ ├── senet50_ferplus_dag.py │ │ ├── vgg_m_face_bn_fer_dag.py │ │ └── vgg_vd_face_fer_dag.py │ ├── run_fer_benchmarks.py │ ├── run_imagenet_benchmarks.py │ └── utils │ │ ├── __init__.py │ │ └── benchmark_helpers.py │ └── util.py ├── gcnet ├── dataloader_cmumosi.py ├── dataloader_iemocap.py ├── graph.py ├── loss.py ├── model.py ├── module.py └── train_gcnet.py ├── preprocess.py ├── requirements-cpmnet.txt ├── requirements.txt ├── run.sh ├── run_ae.sh ├── run_cca.sh ├── run_cpmnetsub1.sh ├── run_cpmnetsub2.sh ├── run_cpmnetsub3.sh ├── run_cra.sh ├── run_dcca.sh ├── run_dccae.sh ├── run_gcnet.sh └── run_mmin.sh /baseline-cpmnet/util/classfiy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import OneHotEncoder 3 | 4 | 5 | def convert_to_one_hot(y, C): 6 | return np.eye(C)[y.reshape(-1)] 7 | 8 | 9 | def vote(lsd1, lsd2, label, n=1): 10 | """Sometimes the prediction accuracy will be higher in this way. 11 | :param lsd1: train set's latent space data 12 | :param lsd2: test set's latent space data 13 | :param label: label of train set 14 | :param n: Similar to K in k-nearest neighbors algorithm 15 | :return: Predicted label 16 | """ 17 | F_h_h = np.dot(lsd2, np.transpose(lsd1)) 18 | gt_list = [] 19 | label = label.reshape(len(label), 1) 20 | for num in range(n): 21 | F_h_h_argmax = np.argmax(F_h_h, axis=1) 22 | F_h_h_onehot = convert_to_one_hot(F_h_h_argmax, len(label)) 23 | F_h_h = F_h_h - np.multiply(F_h_h, F_h_h_onehot) 24 | gt_list.append(np.dot(F_h_h_onehot, label)) 25 | gt_ = np.array(gt_list).transpose(2, 1, 0)[0].astype(np.int64) 26 | count_list = [] 27 | count_list.append([np.argmax(np.bincount(gt_[i])) for i in range(lsd2.shape[0])]) 28 | gt_pre = np.array(count_list) 29 | return gt_pre.transpose() 30 | 31 | def ave(lsd1, lsd2, label): 32 | """In most cases, this method is used to predict the highest accuracy. 
33 | :param lsd1: train set's latent space data 34 | :param lsd2: test set's latent space data 35 | :param label: label of train set 36 | :return: Predicted label 37 | """ 38 | F_h_h = np.dot(lsd2, np.transpose(lsd1)) # dot-product similarity between each test sample and all training samples 39 | label = label.reshape(len(label), 1) - 1 40 | enc = OneHotEncoder() 41 | a = enc.fit_transform(label) 42 | label_onehot = a.toarray() # one-hot labels of the training samples 43 | label_num = np.sum(label_onehot, axis=0) # number of training samples in each class 44 | F_h_h_sum = np.dot(F_h_h, label_onehot) 45 | F_h_h_mean = F_h_h_sum / label_num # average similarity to each class 46 | label_pre = np.argmax(F_h_h_mean, axis=1) + 1 47 | return label_pre # predicted labels 48 | -------------------------------------------------------------------------------- /baseline-cpmnet/util/get_sn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.random import randint 3 | from sklearn.preprocessing import OneHotEncoder 4 | 5 | 6 | def get_sn(view_num, alldata_len, missing_rate): 7 | """Randomly generate incomplete data information, simulate partial view data with complete view data 8 | :param view_num:view number 9 | :param alldata_len:number of samples 10 | :param missing_rate:Defined in section 3.2 of the paper 11 | :return:Sn 12 | """ 13 | one_rate = 1-missing_rate # missing_rate: 0.8; one_rate: 0.2 14 | 15 | if one_rate <= (1 / view_num): # 16 | enc = OneHotEncoder(categories=[np.arange(view_num)]) 17 | view_preserve = enc.fit_transform(randint(0, view_num, size=(alldata_len, 1))).toarray() # only select one view [avoid all zero input] 18 | return view_preserve # [samplenum, viewnum=2] => one value set=1, others=0 19 | 20 | if one_rate == 1: 21 | matrix = randint(1, 2, size=(alldata_len, view_num)) # [samplenum, viewnum=2] => all ones 22 | return matrix 23 | 24 | ## for one_rate between [1 / view_num, 1] => can have multi-view input 25 | ## ensure at least one view is available 26 | ## since some samples overlap, which increases the difficulty 27 | error = 1 28 | while error >= 0.005: 29 | 30 | ## gain initial view_preserve 31 | enc = OneHotEncoder(categories=[np.arange(view_num)]) 32 | view_preserve = enc.fit_transform(randint(0, view_num, size=(alldata_len, 1))).toarray() # [samplenum, viewnum=2] => one value set=1, others=0 33 | 34 | ## further generate one_num samples 35 | one_num = view_num * alldata_len * one_rate - alldata_len # one_num left after previous step 36 | ratio = one_num / (view_num * alldata_len) # ratio processed so far 37 | print (f'first ratio: {ratio}') 38 | matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int) # based on ratio => matrix_iter 39 | a = np.sum(((matrix_iter + view_preserve) > 1).astype(int)) # a: overlap number 40 | one_num_iter = one_num / (1 - a / one_num) 41 | ratio = one_num_iter / (view_num * alldata_len) 42 | print (f'second ratio: {ratio}') 43 | matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int) 44 | matrix = ((matrix_iter + view_preserve) > 0).astype(int) 45 | ratio = np.sum(matrix) / (view_num * alldata_len) 46 | print (f'third ratio: {ratio}') 47 | error = abs(one_rate - ratio) 48 | 49 | return matrix 50 | 51 | 52 | def save_Sn(Sn, str_name): 53 | np.savetxt(str_name + '.csv', Sn, delimiter=',') 54 | 55 | 56 | def load_Sn(str_name): 57 | return np.loadtxt(str_name + '.csv', delimiter=',') 58 | -------------------------------------------------------------------------------- /baseline-mmin/auto/close_screen.sh:
-------------------------------------------------------------------------------- 1 | set -e 2 | grep_name=$1 3 | echo "screen contains name $grep_name:" 4 | screen -ls | grep $grep_name 5 | while true 6 | do 7 | read -r -p "Close these screens? [Y/n] " input 8 | 9 | case $input in 10 | [yY][eE][sS]|[yY]) 11 | screen -ls | awk '{print $1}'| grep $grep_name | awk '{print "screen -S "$1" -X quit"}'| sh 12 | echo "Finished" 13 | exit 0 14 | ;; 15 | 16 | [nN][oO]|[nN]) 17 | echo "Abort" 18 | exit 1 19 | ;; 20 | 21 | *) 22 | echo "Invalid input..." 23 | ;; 24 | esac 25 | done 26 | -------------------------------------------------------------------------------- /baseline-mmin/auto/combine_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | def read_results(file): 5 | ans = [] 6 | lines = open(file).readlines() 7 | for line in lines: 8 | if not line.startswith('0'): continue 9 | ans.append(list(map(lambda x: float(x), line.strip().split('\t')))) 10 | 11 | data = np.array(ans).astype(float) 12 | assert data.shape[0] == 24 13 | val_data = data[0: 10] 14 | tst_data = data[12: 22] 15 | return val_data, tst_data 16 | 17 | def combine(result1, result2): 18 | result = result1 * (result1>=result2) + result2 * (result1: initialize the class; first call BaseModel.__init__(self, opt). 6 | -- <set_input>: unpack data from dataset and apply preprocessing. 7 | -- <forward>: produce intermediate results. 8 | -- <optimize_parameters>: calculate loss, gradients, and update network weights. 9 | -- <modify_commandline_options>: (optionally) add model-specific options and set default options. 10 | 11 | In the function <__init__>, you need to define four lists: 12 | -- self.loss_names (str list): specify the training losses that you want to plot and save. 13 | -- self.model_names (str list): define networks used in our training. 14 | -- self.visual_names (str list): specify the images that you want to display and save. 15 | -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example. 16 | 17 | Now you can use the model class by specifying flag '--model dummy'. 18 | See our template model class 'template_model.py' for more details. 19 | """ 20 | 21 | import importlib 22 | from models.base_model import BaseModel 23 | 24 | 25 | def find_model_using_name(model_name): 26 | """Import the module "models/[model_name]_model.py". 27 | 28 | In the file, the class called DatasetNameModel() will 29 | be instantiated. It has to be a subclass of BaseModel, 30 | and it is case-insensitive. 31 | """ 32 | model_filename = "models." + model_name + "_model" # 'models.mmin_model' 33 | modellib = importlib.import_module(model_filename) 34 | model = None 35 | target_model_name = model_name.replace('_', '') + 'model' 36 | for name, cls in modellib.__dict__.items(): 37 | if name.lower() == target_model_name.lower() and issubclass(cls, BaseModel): 38 | model = cls 39 | 40 | if model is None: 41 | print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase."
% (model_filename, target_model_name)) 42 | exit(0) 43 | 44 | return model 45 | 46 | 47 | def get_option_setter(model_name): 48 | """Return the static method of the model class.""" 49 | model_class = find_model_using_name(model_name) 50 | return model_class.modify_commandline_options 51 | 52 | 53 | def create_model(opt): 54 | """Create a model given the option. 55 | 56 | This function warps the class CustomDatasetDataLoader. 57 | This is the main interface between this package and 'train.py'/'test.py' 58 | 59 | Example: 60 | >>> from models import create_model 61 | >>> model = create_model(opt) 62 | """ 63 | model = find_model_using_name(opt.model) 64 | instance = model(opt) 65 | print("model [%s] was created" % type(instance).__name__) 66 | return instance 67 | -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/base_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/base_model.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/base_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/base_model.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/mmin_AE_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/mmin_AE_model.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/mmin_CRA_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/mmin_CRA_model.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/mmin_CRA_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/mmin_CRA_model.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/mmin_model.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/mmin_model.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/utt_fusion_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/utt_fusion_model.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/__pycache__/utt_fusion_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/__pycache__/utt_fusion_model.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/lstm_audio_model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import os 4 | import json 5 | import torch.nn.functional as F 6 | from models.base_model import BaseModel 7 | from models.networks.lstm import LSTMEncoder 8 | from models.networks.fc_encoder import FcEncoder 9 | 10 | 11 | class LSTMAudioModel(BaseModel): 12 | ''' 13 | A: DNN 14 | V: denseface + LSTM + maxpool 15 | L: bert + textcnn 16 | ''' 17 | @staticmethod 18 | def modify_commandline_options(parser, is_train=True): 19 | parser.add_argument('--input_dim', type=int, default=130) 20 | parser.add_argument('--cls_layers', type=str, default='256,128') 21 | parser.add_argument('--hidden_size', type=int, default=256) 22 | parser.add_argument('--embd_method', type=str, default='maxpool') 23 | return parser 24 | 25 | def __init__(self, opt): 26 | """Initialize the LSTM autoencoder class 27 | Parameters: 28 | opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions 29 | """ 30 | super().__init__(opt) 31 | # our expriment is on 10 fold setting, teacher is on 5 fold setting, the train set should match 32 | self.loss_names = ['CE'] 33 | self.model_names = ['A', 'C'] 34 | self.netA = LSTMEncoder(opt.input_dim, opt.hidden_size, embd_method=opt.embd_method) 35 | cls_layers = [int(x) for x in opt.cls_layers.split(',')] + [opt.output_dim] 36 | self.netC = FcEncoder(opt.hidden_size, cls_layers, dropout=0.3) 37 | 38 | if self.isTrain: 39 | self.criterion_ce = torch.nn.CrossEntropyLoss() 40 | # initialize optimizers; schedulers will be automatically created by function . 41 | paremeters = [{'params': getattr(self, 'net'+net).parameters()} for net in self.model_names] 42 | self.optimizer = torch.optim.Adam(paremeters, lr=opt.lr, betas=(opt.beta1, 0.998)) # 0.999 43 | self.optimizers.append(self.optimizer) 44 | self.output_dim = opt.output_dim 45 | 46 | # modify save_dir 47 | self.save_dir = os.path.join(self.save_dir, str(opt.cvNo)) 48 | if not os.path.exists(self.save_dir): 49 | os.mkdir(self.save_dir) 50 | 51 | def set_input(self, input): 52 | """ 53 | Unpack input data from the dataloader and perform necessary pre-processing steps. 54 | Parameters: 55 | input (dict): include the data itself and its metadata information. 
56 | """ 57 | self.A_feat = input['A_feat'].to(self.device) 58 | self.label = input['label'].to(self.device) 59 | self.input = input 60 | 61 | def forward(self): 62 | """Run forward pass; called by both functions and .""" 63 | self.feat = self.netA(self.A_feat) 64 | self.logits = self.netC(self.feat) 65 | self.pred = F.softmax(self.logits, dim=-1) 66 | 67 | def backward(self): 68 | """Calculate the loss for back propagation""" 69 | self.loss_CE = self.criterion_ce(self.logits, self.label) 70 | loss = self.loss_CE 71 | loss.backward() 72 | for model in self.model_names: 73 | torch.nn.utils.clip_grad_norm_(getattr(self, 'net'+model).parameters(), 5.0) # 0.1 74 | 75 | def optimize_parameters(self, epoch): 76 | """Calculate losses, gradients, and update network weights; called in every training iteration""" 77 | self.forward() 78 | self.optimizer.zero_grad() 79 | self.backward() 80 | self.optimizer.step() 81 | -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__init__.py: -------------------------------------------------------------------------------- 1 | ''' Contains network files. ''' -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/autoencoder.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/autoencoder.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/autoencoder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/autoencoder.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/classifier.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/classifier.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/classifier.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/classifier.cpython-38.pyc 
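For reference, the registration convention documented in models/__init__.py can be made concrete with a minimal sketch. This is illustrative only and not part of the repository; the model name 'dummy', its file, and the '--dummy_dim' option are hypothetical. A file models/dummy_model.py defining DummyModel would be resolved by the flag '--model dummy', since find_model_using_name imports models.dummy_model and looks for a BaseModel subclass whose lowercased name equals 'dummymodel'.

# models/dummy_model.py -- hypothetical sketch of the convention in models/__init__.py
from models.base_model import BaseModel

class DummyModel(BaseModel):
    @staticmethod
    def modify_commandline_options(parser, is_train=True):
        # (optionally) add model-specific options and set their defaults
        parser.add_argument('--dummy_dim', type=int, default=128)
        return parser

    def __init__(self, opt):
        super().__init__(opt)   # first call BaseModel.__init__(self, opt)
        self.loss_names = []    # losses to log and plot
        self.model_names = []   # networks named 'net<X>' to save/load

    def set_input(self, input):
        # unpack data from the dataloader and apply preprocessing
        self.input = input

    def forward(self):
        # produce intermediate results
        pass

    def optimize_parameters(self, epoch):
        # calculate losses, gradients, and update network weights
        self.forward()

With the baseline-mmin directory on PYTHONPATH, create_model(opt) with opt.model set to 'dummy' would then import models.dummy_model and instantiate this class.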
-------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/fc.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/fc.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/fc.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/fc.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/lstm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/lstm.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/lstm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/lstm.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/textcnn.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/textcnn.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/textcnn.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/textcnn.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/tools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/tools.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/__pycache__/tools.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/networks/__pycache__/tools.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/networks/fc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class FcEncoder(nn.Module): 5 | def __init__(self, input_dim, layers, dropout=0.5, use_bn=False): 6 | ''' Fully Connect classifier 7 | fc+relu+bn+dropout, 最后分类128-4层是直接fc的 8 | Parameters: 9 | -------------------------- 10 | input_dim: input feature dim 11 
| layers: [x1, x2, x3] will create 3 layers with x1, x2, x3 hidden nodes respectively. 12 | dropout: dropout rate 13 | use_bn: use batchnorm or not 14 | ''' 15 | super().__init__() 16 | self.all_layers = [] 17 | for i in range(0, len(layers)): 18 | self.all_layers.append(nn.Linear(input_dim, layers[i])) 19 | self.all_layers.append(nn.ReLU()) 20 | if use_bn: 21 | self.all_layers.append(nn.BatchNorm1d(layers[i])) 22 | if dropout > 0: 23 | self.all_layers.append(nn.Dropout(dropout)) 24 | input_dim = layers[i] 25 | 26 | self.module = nn.Sequential(*self.all_layers) 27 | 28 | def forward(self, x): 29 | ## make layers to a whole module 30 | feat = self.module(x) 31 | return feat -------------------------------------------------------------------------------- /baseline-mmin/models/networks/lstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class LSTMEncoder(nn.Module): 7 | ''' one directional LSTM encoder 8 | ''' 9 | def __init__(self, input_size, hidden_size, embd_method='last'): 10 | super(LSTMEncoder, self).__init__() 11 | self.input_size = input_size 12 | self.hidden_size = hidden_size 13 | self.rnn = nn.LSTM(self.input_size, self.hidden_size, batch_first=True) 14 | assert embd_method in ['maxpool', 'attention', 'last'] 15 | self.embd_method = embd_method 16 | 17 | if self.embd_method == 'attention': 18 | self.attention_vector_weight = nn.Parameter(torch.Tensor(hidden_size, 1)) 19 | self.attention_layer = nn.Sequential( 20 | nn.Linear(self.hidden_size, self.hidden_size), 21 | nn.Tanh(), 22 | ) 23 | self.softmax = nn.Softmax(dim=-1) 24 | 25 | def embd_attention(self, r_out, h_n): 26 | '''' 27 | 参考这篇博客的实现: 28 | https://blog.csdn.net/dendi_hust/article/details/94435919 29 | https://blog.csdn.net/fkyyly/article/details/82501126 30 | 论文:Hierarchical Attention Networks for Document Classification 31 | formulation: lstm_output*softmax(u * tanh(W*lstm_output + Bias) 32 | W and Bias 是映射函数,其中 Bias 可加可不加 33 | u 是 attention vector 大小等于 hidden size 34 | ''' 35 | hidden_reps = self.attention_layer(r_out) # [batch_size, seq_len, hidden_size] 36 | atten_weight = (hidden_reps @ self.attention_vector_weight) # [batch_size, seq_len, 1] 37 | atten_weight = self.softmax(atten_weight) # [batch_size, seq_len, 1] 38 | # [batch_size, seq_len, hidden_size] * [batch_size, seq_len, 1] = [batch_size, seq_len, hidden_size] 39 | sentence_vector = torch.sum(r_out * atten_weight, dim=1) # [batch_size, hidden_size] 40 | return sentence_vector 41 | 42 | def embd_maxpool(self, r_out, h_n): 43 | # embd = self.maxpool(r_out.transpose(1,2)) # r_out.size()=>[batch_size, seq_len, hidden_size] 44 | # r_out.transpose(1, 2) => [batch_size, hidden_size, seq_len] 45 | in_feat = r_out.transpose(1,2) 46 | embd = F.max_pool1d(in_feat, in_feat.size(2), in_feat.size(2)) 47 | return embd.squeeze() 48 | 49 | def embd_last(self, r_out, h_n): 50 | #Just for one layer and single direction 51 | return h_n.squeeze() 52 | 53 | def forward(self, x): 54 | ''' 55 | r_out shape: seq_len, batch, num_directions * hidden_size 56 | hn and hc shape: num_layers * num_directions, batch, hidden_size 57 | ''' 58 | r_out, (h_n, h_c) = self.rnn(x) 59 | embd = getattr(self, 'embd_'+self.embd_method)(r_out, h_n) 60 | return embd -------------------------------------------------------------------------------- /baseline-mmin/models/networks/textcnn.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class TextCNN(nn.Module): 7 | def __init__(self, input_dim, emb_size=128, in_channels=1, out_channels=128, kernel_heights=[3,4,5], dropout=0.5): 8 | super().__init__() 9 | ''' 10 | cat((conv1-relu+conv2-relu+conv3-relu)+maxpool) + dropout, and to trans 11 | ''' 12 | self.conv1 = nn.Conv2d(in_channels, out_channels, (kernel_heights[0], input_dim), stride=1, padding=0) 13 | self.conv2 = nn.Conv2d(in_channels, out_channels, (kernel_heights[1], input_dim), stride=1, padding=0) 14 | self.conv3 = nn.Conv2d(in_channels, out_channels, (kernel_heights[2], input_dim), stride=1, padding=0) 15 | self.dropout = nn.Dropout(dropout) 16 | self.embd = nn.Sequential( 17 | nn.Linear(len(kernel_heights)*out_channels, emb_size), 18 | nn.ReLU(inplace=True), 19 | ) 20 | 21 | def conv_block(self, input, conv_layer): 22 | conv_out = conv_layer(input)# conv_out.size() = (batch_size, out_channels, dim, 1) 23 | activation = F.relu(conv_out.squeeze(3))# activation.size() = (batch_size, out_channels, dim1) 24 | max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # maxpool_out.size() = (batch_size, out_channels) 25 | return max_out 26 | 27 | def forward(self, frame_x): 28 | batch_size, seq_len, feat_dim = frame_x.size() 29 | frame_x = frame_x.view(batch_size, 1, seq_len, feat_dim) 30 | max_out1 = self.conv_block(frame_x, self.conv1) 31 | max_out2 = self.conv_block(frame_x, self.conv2) 32 | max_out3 = self.conv_block(frame_x, self.conv3) 33 | all_out = torch.cat((max_out1, max_out2, max_out3), 1) 34 | fc_in = self.dropout(all_out) 35 | embd = self.embd(fc_in) 36 | return embd -------------------------------------------------------------------------------- /baseline-mmin/models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/utils/__init__.py -------------------------------------------------------------------------------- /baseline-mmin/models/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/utils/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/utils/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/models/utils/__pycache__/config.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/models/utils/__pycache__/config.cpython-38.pyc 
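The encoders above (FcEncoder, LSTMEncoder, TextCNN) are the building blocks used by the baseline models in this package (e.g., lstm_audio_model.py). As a quick shape check, the sketch below (not part of the repository) feeds random tensors through them; it assumes it is run from the baseline-mmin/ directory so that the models package is importable, and the feature sizes used here (130-d acoustic frames, 1024-d text token embeddings, 4 emotion classes) are assumptions for illustration only.

# Illustrative shape check -- not part of the repository.
import torch
from models.networks.fc import FcEncoder
from models.networks.lstm import LSTMEncoder
from models.networks.textcnn import TextCNN

audio = torch.randn(8, 60, 130)    # [batch, frames, acoustic feature dim]
text = torch.randn(8, 22, 1024)    # [batch, tokens, text feature dim]

a_enc = LSTMEncoder(input_size=130, hidden_size=256, embd_method='maxpool')
l_enc = TextCNN(input_dim=1024, emb_size=128)
clf = FcEncoder(256, [128, 4], dropout=0.3)  # last entry = number of classes, as in lstm_audio_model

print(a_enc(audio).shape)        # torch.Size([8, 256])  max-pooled LSTM states
print(l_enc(text).shape)         # torch.Size([8, 128])  TextCNN utterance embedding
print(clf(a_enc(audio)).shape)   # torch.Size([8, 4])    classifier output, fed to CrossEntropyLoss upstream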
-------------------------------------------------------------------------------- /baseline-mmin/models/utils/config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class OptConfig(object): 4 | def __init__(self): 5 | pass 6 | 7 | def load(self, config_dict): 8 | if sys.version > '3': 9 | for key, value in config_dict.items(): 10 | if not isinstance(value, dict): 11 | setattr(self, key, value) 12 | else: 13 | self.load(value) 14 | else: 15 | for key, value in config_dict.iteritems(): 16 | if not isinstance(value, dict): 17 | setattr(self, key, value) 18 | else: 19 | self.load(value) -------------------------------------------------------------------------------- /baseline-mmin/models/utils/load_pretrained.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from .config import OptConfig 4 | 5 | def load_from_opt_record(file_path): 6 | opt_content = json.load(open(file_path, 'r')) 7 | opt = OptConfig() 8 | opt.load(opt_content) 9 | return opt 10 | 11 | def load_pretrained_model(model_class, checkpoints_dir, cv, gpu_ids): 12 | path = os.path.join(checkpoints_dir, str(cv)) 13 | config_path = os.path.join(checkpoints_dir, 'train_opt.conf') 14 | config = load_from_opt_record(config_path) 15 | config.isTrain = False # teacher model should be in test mode 16 | config.gpu_ids = gpu_ids # set gpu to the same 17 | model = model_class(config) 18 | model.cuda() 19 | model.load_networks_cv(path) 20 | model.eval() 21 | return model 22 | -------------------------------------------------------------------------------- /baseline-mmin/opts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/opts/__init__.py -------------------------------------------------------------------------------- /baseline-mmin/opts/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/opts/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/opts/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/opts/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/opts/__pycache__/base_opts.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/opts/__pycache__/base_opts.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/opts/__pycache__/base_opts.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/opts/__pycache__/base_opts.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/opts/__pycache__/train_opts.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/opts/__pycache__/train_opts.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/opts/__pycache__/train_opts.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/opts/__pycache__/train_opts.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/opts/test_opts.py: -------------------------------------------------------------------------------- 1 | from .base_opts import BaseOptions 2 | 3 | 4 | class TestOptions(BaseOptions): 5 | """This class includes test options. 6 | 7 | It also includes shared options defined in BaseOptions. 8 | """ 9 | 10 | def initialize(self, parser): 11 | parser = BaseOptions.initialize(self, parser) 12 | parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc') 13 | parser.add_argument('--method', type=str, default='mean', help='How to calculate final test result, [concat, mean]') 14 | parser.add_argument('--simple', action='store_true', help='print simplified information') 15 | self.isTrain = False 16 | return parser 17 | -------------------------------------------------------------------------------- /baseline-mmin/opts/train_opts.py: -------------------------------------------------------------------------------- 1 | from .base_opts import BaseOptions 2 | 3 | 4 | class TrainOptions(BaseOptions): 5 | """This class includes training options. 6 | 7 | It also includes shared options defined in BaseOptions. 8 | """ 9 | 10 | def initialize(self, parser): 11 | parser = BaseOptions.initialize(self, parser) 12 | # network saving and loading parameters 13 | # parser.add_argument('--cvNo', type=int, default=5, help='which cross validation set') 14 | parser.add_argument('--print_freq', type=int, default=100, help='frequency of showing training results on console') 15 | parser.add_argument('--save_latest_freq', type=int, default=1000, help='frequency of saving the latest results') 16 | parser.add_argument('--save_epoch_freq', type=int, default=1, help='frequency of saving checkpoints at the end of epochs') 17 | parser.add_argument('--save_by_iter', action='store_true', help='whether to save the model by iteration') 18 | parser.add_argument('--continue_train', action='store_true', help='continue training: load the latest model') 19 | parser.add_argument('--epoch_count', type=int, default=1, help='the starting epoch count, we save the model by <epoch_count>, <epoch_count>+<save_latest_freq>, ...') 20 | parser.add_argument('--phase', type=str, default='train', help='train, val, test, etc') 21 | 22 | # training parameters 23 | parser.add_argument('--mask_rate', type=float, default=0.0, help='input mask rate, ranging from [0.0, 0.1, ..., 1.0]') 24 | parser.add_argument('--niter', type=int, default=20, help='# of iter at starting learning rate') 25 | parser.add_argument('--niter_decay', type=int, default=80, help='# of iter to linearly decay learning rate to zero') 26 | parser.add_argument('--beta1', type=float, default=0.5, help='momentum term of adam') 27 | parser.add_argument('--lr', type=float, default=2e-4, help='initial learning rate for adam') 28 | parser.add_argument('--lr_policy', type=str, default='linear', help='learning rate policy.
[linear | step | plateau | cosine]') 29 | parser.add_argument('--lr_decay_iters', type=int, default=50, help='multiply by a gamma every lr_decay_iters iterations') 30 | 31 | # test with predefined mask path 32 | parser.add_argument('--test_mask', type=str, default=None, help='test under same mask for fair comparison') 33 | 34 | # expr setting 35 | parser.add_argument('--run_idx', type=int, default=1, help='experiment number; for repeated experiments') 36 | self.isTrain = True 37 | return parser 38 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/IEMOCAP/make_comparE.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | import pandas as pd 6 | import scipy.signal as spsig 7 | from tqdm import tqdm 8 | 9 | 10 | class ComParEExtractor(object): 11 | ''' Extract ComParE features: input a wav path, output an npy array with 130 dims per frame 12 | ''' 13 | def __init__(self, opensmile_tool_dir=None, downsample=10, tmp_dir='.tmp', no_tmp=False): 14 | ''' Extract ComparE feature 15 | tmp_dir: where to save opensmile csv file 16 | no_tmp: if true, delete tmp file 17 | ''' 18 | if not os.path.exists(tmp_dir): 19 | os.makedirs(tmp_dir) 20 | if opensmile_tool_dir is None: 21 | opensmile_tool_dir = '/root/opensmile-2.3.0/' 22 | self.opensmile_tool_dir = opensmile_tool_dir 23 | self.tmp_dir = tmp_dir 24 | self.downsample = downsample 25 | self.no_tmp = no_tmp 26 | 27 | def __call__(self, wav): 28 | basename = os.path.basename(wav).split('.')[0] 29 | save_path = os.path.join(self.tmp_dir, basename+".csv") 30 | cmd = 'SMILExtract -C {}/config/ComParE_2016.conf \ 31 | -appendcsvlld 0 -timestampcsvlld 1 -headercsvlld 1 \ 32 | -I {} -lldcsvoutput {} -instname xx -O ?
-noconsoleoutput 1' 33 | os.system(cmd.format(self.opensmile_tool_dir, wav, save_path)) 34 | 35 | df = pd.read_csv(save_path, delimiter=';') 36 | wav_data = df.iloc[:, 2:] 37 | if len(wav_data) > self.downsample: 38 | wav_data = spsig.resample_poly(wav_data, up=1, down=self.downsample, axis=0) 39 | if self.no_tmp: 40 | os.remove(save_path) 41 | else: 42 | wav_data = None 43 | print(f'Error in {wav}, no feature extracted') 44 | 45 | return wav_data 46 | 47 | 48 | def get_trn_val_tst(target_root_dir, cv, setname): 49 | int2name = np.load(os.path.join(target_root_dir, str(cv), '{}_int2name.npy'.format(setname))) 50 | int2label = np.load(os.path.join(target_root_dir, str(cv), '{}_label.npy'.format(setname))) 51 | assert len(int2name) == len(int2label) 52 | return int2name, int2label 53 | 54 | def make_all_comparE(config): 55 | extractor = ComParEExtractor() 56 | trn_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'trn') 57 | val_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'val') 58 | tst_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'tst') 59 | trn_int2name = list(map(lambda x: x[0].decode(), trn_int2name)) 60 | val_int2name = list(map(lambda x: x[0].decode(), val_int2name)) 61 | tst_int2name = list(map(lambda x: x[0].decode(), tst_int2name)) 62 | all_utt_ids = trn_int2name + val_int2name + tst_int2name 63 | all_h5f = h5py.File(os.path.join(config['feature_root'], 'A', 'comparE.h5'), 'w') 64 | for utt_id in tqdm(all_utt_ids): 65 | ses_id = utt_id[4] 66 | dialog_id = '_'.join(utt_id.split('_')[:-1]) 67 | wav_path = os.path.join(config['data_root'], f'Session{ses_id}', 'sentences', 'wav', f'{dialog_id}', f'{utt_id}.wav') 68 | feat = extractor(wav_path) 69 | all_h5f[utt_id] = feat 70 | 71 | def normlize_on_trn(config, input_file, output_file): 72 | h5f = h5py.File(output_file, 'w') 73 | in_data = h5py.File(input_file, 'r') 74 | for cv in range(1, 11): 75 | trn_int2name, _ = get_trn_val_tst(config['target_root'], cv, 'trn') 76 | trn_int2name = list(map(lambda x: x[0].decode(), trn_int2name)) 77 | all_feat = [in_data[utt_id][()] for utt_id in trn_int2name] 78 | all_feat = np.concatenate(all_feat, axis=0) 79 | mean_f = np.mean(all_feat, axis=0) 80 | std_f = np.std(all_feat, axis=0) 81 | std_f[std_f == 0.0] = 1.0 82 | cv_group = h5f.create_group(str(cv)) 83 | cv_group['mean'] = mean_f 84 | cv_group['std'] = std_f 85 | print(cv) 86 | print("mean:", np.sum(mean_f)) 87 | print("std:", np.sum(std_f)) 88 | 89 | 90 | if __name__ == '__main__': 91 | pwd = os.path.abspath(__file__) 92 | pwd = os.path.dirname(pwd) 93 | config_path = os.path.join(pwd, '../', 'data/config', 'IEMOCAP_config.json') 94 | config = json.load(open(config_path)) 95 | # make_all_comparE(config) 96 | normlize_on_trn(config, os.path.join(config['feature_root'], 'A', 'comparE.h5'), os.path.join(config['feature_root'], 'A', 'comparE_mean_std.h5')) -------------------------------------------------------------------------------- /baseline-mmin/preprocess/IEMOCAP/make_melspec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import h5py 5 | from tqdm import tqdm 6 | from preprocess.melspec_extractor import MelSpecExtractor 7 | 8 | def get_trn_val_tst(target_root_dir, cv, setname): 9 | int2name = np.load(os.path.join(target_root_dir, str(cv), '{}_int2name.npy'.format(setname))) 10 | int2label = np.load(os.path.join(target_root_dir, str(cv), '{}_label.npy'.format(setname))) 11 | assert len(int2name) == len(int2label) 12
| return int2name, int2label 13 | 14 | def extract_all_melspec(config): 15 | extractor = MelSpecExtractor() 16 | trn_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'trn') 17 | val_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'val') 18 | tst_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'tst') 19 | trn_int2name = list(map(lambda x: x[0].decode(), trn_int2name)) 20 | val_int2name = list(map(lambda x: x[0].decode(), val_int2name)) 21 | tst_int2name = list(map(lambda x: x[0].decode(), tst_int2name)) 22 | all_utt_ids = trn_int2name + val_int2name + tst_int2name 23 | all_h5f = h5py.File(os.path.join(config['feature_root'], 'A', 'melspec.h5'), 'w') 24 | for utt_id in tqdm(all_utt_ids): 25 | ses_id = utt_id[4] 26 | dialog_id = '_'.join(utt_id.split('_')[:-1]) 27 | wav_path = os.path.join(config['data_root'], f'Session{ses_id}', 'sentences', 'wav', f'{dialog_id}', f'{utt_id}.wav') 28 | melspec = extractor.extract(wav_path) 29 | all_h5f[utt_id] = melspec 30 | 31 | if __name__ == '__main__': 32 | pwd = os.path.abspath(__file__) 33 | pwd = os.path.dirname(pwd) 34 | config_path = os.path.join(pwd, '../', 'data/config', 'IEMOCAP_config.json') 35 | config = json.load(open(config_path)) 36 | extract_all_melspec(config) 37 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/IEMOCAP/make_torch_denseface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | import pandas as pd 6 | import scipy.signal as spsig 7 | from tqdm import tqdm 8 | 9 | def get_trn_val_tst(target_root_dir, cv, setname): 10 | int2name = np.load(os.path.join(target_root_dir, str(cv), '{}_int2name.npy'.format(setname))) 11 | int2label = np.load(os.path.join(target_root_dir, str(cv), '{}_label.npy'.format(setname))) 12 | assert len(int2name) == len(int2label) 13 | return int2name, int2label 14 | 15 | def make_all_comparE(config): 16 | extractor = ComParEExtractor() 17 | trn_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'trn') 18 | val_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'val') 19 | tst_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'tst') 20 | trn_int2name = list(map(lambda x: x[0].decode(), trn_int2name)) 21 | val_int2name = list(map(lambda x: x[0].decode(), val_int2name)) 22 | tst_int2name = list(map(lambda x: x[0].decode(), tst_int2name)) 23 | all_utt_ids = trn_int2name + val_int2name + tst_int2name 24 | all_h5f = h5py.File(os.path.join(config['feature_root'], 'A', 'comparE.h5'), 'w') 25 | for utt_id in tqdm(all_utt_ids): 26 | ses_id = utt_id[4] 27 | dialog_id = '_'.join(utt_id.split('_')[:-1]) 28 | wav_path = os.path.join(config['data_root'], f'Session{ses_id}', 'sentences', 'wav', f'{dialog_id}', f'{utt_id}.wav') 29 | feat = extractor(wav_path) 30 | all_h5f[utt_id] = feat 31 | 32 | def normlize_on_trn(config, input_file, output_file): 33 | h5f = h5py.File(output_file, 'w') 34 | in_data = h5py.File(input_file, 'r') 35 | for cv in range(1, 11): 36 | trn_int2name, _ = get_trn_val_tst(config['target_root'], cv, 'trn') 37 | trn_int2name = list(map(lambda x: x[0].decode(), trn_int2name)) 38 | all_feat = [in_data[utt_id][()] for utt_id in trn_int2name] 39 | all_feat = np.concatenate(all_feat, axis=0) 40 | mean_f = np.mean(all_feat, axis=0) 41 | std_f = np.std(all_feat, axis=0) 42 | std_f[std_f == 0.0] = 1.0 43 | cv_group = h5f.create_group(str(cv)) 44 | cv_group['mean'] = mean_f 45 | cv_group['std'] = std_f 46 | 
print(cv) 47 | print("mean:", np.sum(mean_f)) 48 | print("std:", np.sum(std_f)) 49 | 50 | 51 | if __name__ == '__main__': 52 | pwd = os.path.abspath(__file__) 53 | pwd = os.path.dirname(pwd) 54 | config_path = os.path.join(pwd, '../', 'data/config', 'IEMOCAP_config.json') 55 | config = json.load(open(config_path)) 56 | # make_all_comparE(config) 57 | normlize_on_trn(config, os.path.join(config['feature_root'], 'A', 'comparE.h5'), os.path.join(config['feature_root'], 'A', 'comparE_mean_std.h5')) -------------------------------------------------------------------------------- /baseline-mmin/preprocess/IEMOCAP/melspec_extractor.py: -------------------------------------------------------------------------------- 1 | from re import M 2 | import numpy as np # linear algebra 3 | from tqdm import tqdm 4 | import PIL 5 | import os 6 | import librosa 7 | import random 8 | 9 | 10 | class default_config: 11 | sampling_rate = 16000 12 | duration = 2 # sec 13 | hop_length = 125 * duration # to make time steps 128 14 | fmin = 20 15 | fmax = sampling_rate // 2 16 | n_mels = 128 17 | n_fft = n_mels * 20 18 | padmode = 'constant' 19 | samples = sampling_rate * duration 20 | 21 | 22 | class MelSpecExtractor(object): 23 | def __init__(self, sampling_rate=None, duration=None, hop_length=None, \ 24 | fmin=None, fmax=None, n_mels=None, n_fft=None, padmode=None, max_samples=None): 25 | self.sampling_rate = sampling_rate or default_config.sampling_rate 26 | self.duration = duration or default_config.duration 27 | self.hop_length = hop_length or default_config.hop_length 28 | self.fmin = fmin or default_config.fmin 29 | self.fmax = fmax or default_config.fmax 30 | self.n_mels = n_mels or default_config.n_mels 31 | self.n_fft = n_fft or default_config.n_fft 32 | self.padmode = padmode or default_config.padmode 33 | self.max_samples = max_samples or default_config.samples 34 | assert self.max_samples > 0, 'max_samples parameters must be larger than zero' 35 | 36 | 37 | def read_audio(self, pathname, trim_long_data): 38 | y, _ = librosa.load(pathname, sr=self.sampling_rate) 39 | # trim silence 40 | if 0 < len(y): # workaround: 0 length causes error 41 | y, _ = librosa.effects.trim(y) # trim, top_db=default(60) 42 | else: 43 | print(f"found zero length audio {pathname}") 44 | y = np.zeros((self.max_samples,), np.float32) 45 | # make it unified length to self.samples 46 | if len(y) > self.max_samples: # long enough 47 | if trim_long_data: 48 | y = y[0 : self.max_samples] 49 | else: # pad blank 50 | leny = len(y) 51 | padding = self.max_samples - len(y) # add padding at both ends 52 | offset = padding // 2 53 | y = np.pad(y, (offset, self.max_samples - len(y) - offset), self.padmode) 54 | return y 55 | 56 | 57 | def audio_to_melspectrogram(self, audio): 58 | spectrogram = librosa.feature.melspectrogram(audio, 59 | sr=self.sampling_rate, 60 | n_mels=self.n_mels, 61 | hop_length=self.hop_length, 62 | n_fft=self.n_fft, 63 | fmin=self.fmin, 64 | fmax=self.fmax) 65 | spectrogram = librosa.power_to_db(spectrogram) 66 | spectrogram = spectrogram.astype(np.float32) 67 | return spectrogram 68 | 69 | 70 | def read_as_melspectrogram(self, pathname, trim_long_data=False): 71 | x = self.read_audio(pathname, trim_long_data) 72 | mels = self.audio_to_melspectrogram(x) 73 | return mels 74 | 75 | 76 | def mono_to_color(self, X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6): 77 | # Stack X as [X,X,X] 78 | X = np.stack([X, X, X], axis=-1) 79 | 80 | # Standardize 81 | mean = mean or X.mean() 82 | X = X - mean 83 | std = std 
or X.std() 84 | Xstd = X / (std + eps) 85 | _min, _max = Xstd.min(), Xstd.max() 86 | norm_max = norm_max or _max 87 | norm_min = norm_min or _min 88 | if (_max - _min) > eps: 89 | # Normalize to [0, 255] 90 | V = Xstd 91 | V[V < norm_min] = norm_min 92 | V[V > norm_max] = norm_max 93 | V = 255 * (V - norm_min) / (norm_max - norm_min) 94 | V = V.astype(np.uint8) 95 | else: 96 | # Just zero 97 | V = np.zeros_like(Xstd, dtype=np.uint8) 98 | return V 99 | 100 | 101 | def extract(self, wav_path): 102 | x = self.read_as_melspectrogram(wav_path, trim_long_data=False) 103 | x_color = self.mono_to_color(x) 104 | return x_color 105 | 106 | 107 | if __name__ == '__main__': 108 | extractor = MelSpecExtractor() 109 | # wav_path = '/data3/lrc/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_script03_2/Ses01F_script03_2_M001.wav' 110 | wav_path = '/data3/lrc/IEMOCAP_full_release/Session1/sentences/wav/Ses01F_script03_2/Ses01F_script03_2_M026.wav' 111 | melspec = extractor.extract(wav_path) 112 | print(melspec.shape) -------------------------------------------------------------------------------- /baseline-mmin/preprocess/IEMOCAP/migrate_VL_feat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | def get_trn_val_tst(target_root_dir, cv, setname): 8 | int2name = np.load(os.path.join(target_root_dir, str(cv), '{}_int2name.npy'.format(setname))) 9 | int2label = np.load(os.path.join(target_root_dir, str(cv), '{}_label.npy'.format(setname))) 10 | assert len(int2name) == len(int2label) 11 | return int2name, int2label 12 | 13 | 14 | def migrate_V(config): 15 | migrate_root = os.path.join('/data3/lrc/Iemocap_feature/cv_level/feature/denseface/', str(1)) 16 | src_v_trn = np.load(os.path.join(migrate_root, 'trn.npy')) 17 | src_v_val = np.load(os.path.join(migrate_root, 'val.npy')) 18 | src_v_tst = np.load(os.path.join(migrate_root, 'tst.npy')) 19 | src_v_feat = np.concatenate([src_v_trn, src_v_val, src_v_tst], axis=0) 20 | trn_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'trn') 21 | val_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'val') 22 | tst_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'tst') 23 | trn_int2name = list(map(lambda x: x[0].decode(), trn_int2name)) 24 | val_int2name = list(map(lambda x: x[0].decode(), val_int2name)) 25 | tst_int2name = list(map(lambda x: x[0].decode(), tst_int2name)) 26 | all_utt_ids = trn_int2name + val_int2name + tst_int2name 27 | all_h5f = h5py.File(os.path.join(config['feature_root'], 'V', 'denseface.h5'), 'w') 28 | for utt_id, v_feat in tqdm(zip(all_utt_ids, src_v_feat), total=len(all_utt_ids)): 29 | all_h5f[utt_id] = v_feat 30 | 31 | 32 | def migrate_L(config): 33 | migrate_root = os.path.join('/data3/lrc/Iemocap_feature/cv_level/feature/text/', str(1)) 34 | src_l_trn = np.load(os.path.join(migrate_root, 'trn.npy')) 35 | src_l_val = np.load(os.path.join(migrate_root, 'val.npy')) 36 | src_l_tst = np.load(os.path.join(migrate_root, 'tst.npy')) 37 | src_l_feat = np.concatenate([src_l_trn, src_l_val, src_l_tst], axis=0) 38 | trn_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'trn') 39 | val_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'val') 40 | tst_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'tst') 41 | trn_int2name = list(map(lambda x: x[0].decode(), trn_int2name)) 42 | val_int2name = list(map(lambda x: x[0].decode(), val_int2name)) 43 | tst_int2name = list(map(lambda x: 
x[0].decode(), tst_int2name)) 44 | all_utt_ids = trn_int2name + val_int2name + tst_int2name 45 | all_h5f = h5py.File(os.path.join(config['feature_root'], 'L', 'bert_large.h5'), 'w') 46 | for utt_id, l_feat in tqdm(zip(all_utt_ids, src_l_feat), total=len(all_utt_ids)): 47 | all_h5f[utt_id] = l_feat 48 | 49 | 50 | if __name__ == '__main__': 51 | pwd = os.path.abspath(__file__) 52 | pwd = os.path.dirname(pwd) 53 | config_path = os.path.join(pwd, '../', 'data/config', 'IEMOCAP_config.json') 54 | config = json.load(open(config_path)) 55 | # migrate_V(config) 56 | migrate_L(config) 57 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/IEMOCAP/migrate_compaeE_tonpy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import h5py 4 | import numpy as np 5 | 6 | def get_trn_val_tst(target_root_dir, cv, setname): 7 | int2name = np.load(os.path.join(target_root_dir, str(cv), '{}_int2name.npy'.format(setname))) 8 | int2label = np.load(os.path.join(target_root_dir, str(cv), '{}_label.npy'.format(setname))) 9 | assert len(int2name) == len(int2label) 10 | return int2name, int2label 11 | 12 | def padding_to_fixlen(feat, max_len): 13 | assert feat.ndim == 2 14 | if feat.shape[0] >= max_len: 15 | feat = feat[:max_len] 16 | else: 17 | feat = np.concatenate([feat, \ 18 | np.zeros((max_len-feat.shape[0], feat.shape[1]))], axis=0) 19 | return feat 20 | 21 | def migrate_comparE_to_npy(config): 22 | max_len = 60 23 | feat_path = os.path.join(config['feature_root'], 'A', 'comparE.h5') 24 | mean_std_path = os.path.join(config['feature_root'], 'A', 'comparE_mean_std.h5') 25 | feat_h5f = h5py.File(feat_path, 'r') 26 | mean_std = h5py.File(mean_std_path, 'r') 27 | for cv in range(1, 11): 28 | save_dir = f'/data3/lrc/Iemocap_feature/cv_level/feature/comparE/{cv}' 29 | if not os.path.exists(save_dir): 30 | os.makedirs(save_dir) 31 | mean = mean_std[str(cv)]['mean'][()] 32 | std = mean_std[str(cv)]['std'][()] 33 | for part in ['trn', 'val', 'tst']: 34 | part_feat = [] 35 | int2name, _ = get_trn_val_tst(config['target_root'], cv, part) 36 | int2name = [x[0].decode() for x in int2name] 37 | for utt_id in int2name: 38 | feat = feat_h5f[utt_id][()] 39 | feat = (feat-mean)/std 40 | feat = padding_to_fixlen(feat, max_len) 41 | part_feat.append(feat) 42 | part_feat = np.array(part_feat) 43 | print(f"cv: {cv} {part} {part_feat.shape}") 44 | save_path = os.path.join(save_dir, f"{part}.npy") 45 | np.save(save_path, part_feat) 46 | 47 | if __name__ == '__main__': 48 | pwd = os.path.abspath(__file__) 49 | pwd = os.path.dirname(pwd) 50 | config_path = os.path.join(pwd, '../', 'data/config', 'IEMOCAP_config.json') 51 | config = json.load(open(config_path)) 52 | migrate_comparE_to_npy(config) 53 | 54 | 55 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/IEMOCAP/statis_comparE.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | 5 | def statis_comparE(config): 6 | path = os.path.join(config['feature_root'], 'A', 'comparE.h5') 7 | h5f = h5py.File(path, 'r') 8 | lengths = [] 9 | for utt_id in h5f.keys(): 10 | lengths.append(h5f[utt_id][()].shape[0]) 11 | lengths = sorted(lengths) 12 | print('MIN:', min(lengths)) 13 | print('MAX:', max(lengths)) 14 | print('MEAN: {:.2f}'.format(sum(lengths) / len(lengths))) 15 | print('50%:', lengths[len(lengths)//2]) 16 | print('75%:', 
lengths[int(len(lengths)*0.75)]) 17 | print('90%:', lengths[int(len(lengths)*0.9)]) 18 | 19 | if __name__ == '__main__': 20 | pwd = os.path.abspath(__file__) 21 | pwd = os.path.dirname(pwd) 22 | config_path = os.path.join(pwd, '../', 'data/config', 'IEMOCAP_config.json') 23 | config = json.load(open(config_path)) 24 | statis_comparE(config) -------------------------------------------------------------------------------- /baseline-mmin/preprocess/MSP/make_aligned_info.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | import re 6 | import string 7 | 8 | def get_trn_val_tst(target_root_dir, cv, setname): 9 | int2name = np.load(os.path.join(target_root_dir, str(cv), '{}_int2name.npy'.format(setname))) 10 | int2label = np.load(os.path.join(target_root_dir, str(cv), '{}_label.npy'.format(setname))) 11 | assert len(int2name) == len(int2label) 12 | return int2name, int2label 13 | 14 | 15 | def get_all_utt_id(config): 16 | trn_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'trn') 17 | val_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'val') 18 | tst_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'tst') 19 | trn_int2name = trn_int2name.tolist() 20 | val_int2name = val_int2name.tolist() 21 | tst_int2name = tst_int2name.tolist() 22 | all_utt_ids = trn_int2name + val_int2name + tst_int2name 23 | return all_utt_ids 24 | 25 | def align_script(wav, config, out): 26 | _cmd = 'python /data6/p2fa-vislab/align.py {} {} {} '.format(wav, config, out) # >/dev/null 2>&1 27 | os.system(_cmd) 28 | 29 | def clean(text): 30 | punc = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~' 31 | text = re.sub(r"[%s]+" %punc, " ",text) 32 | text.replace(' ', ' ') 33 | return text 34 | 35 | def make_aligned_info(config): 36 | save_dir = os.path.join(config['feature_root'], 'aligned', 'word_aligned_info') 37 | tmp_dir = os.path.join(config['feature_root'], 'aligned', 'tmp') 38 | if not os.path.exists(save_dir): 39 | os.mkdir(save_dir) 40 | if not os.path.exists(tmp_dir): 41 | os.mkdir(tmp_dir) 42 | transcript_dir = os.path.join(config['data_root'], 'All_human_transcriptions') 43 | all_utt_ids = get_all_utt_id(config) 44 | for utt_id in tqdm(all_utt_ids): 45 | align_save_path = os.path.join(save_dir, utt_id + '.json') 46 | if os.path.exists(align_save_path): 47 | continue 48 | wav_dir = os.path.join(config['feature_root'], 'audio_11025') 49 | wav_path = os.path.join(wav_dir, '{}.wav'.format(utt_id)) 50 | transcript_path = os.path.join(transcript_dir, utt_id + '.txt') 51 | transcript = open(transcript_path).read().strip() 52 | transcript = clean(transcript) 53 | print('"' + transcript + '"') 54 | tmp_path = os.path.join(tmp_dir, utt_id + '.json') 55 | tmp_data = [{ 56 | "speaker": "Steve", 57 | "line": transcript, 58 | }] 59 | json.dump(tmp_data, open(tmp_path, 'w')) 60 | align_script(wav_path, tmp_path, align_save_path) 61 | 62 | def convert_sr(config): 63 | sampled_audio_dir = os.path.join(config['feature_root'], 'audio_11025') 64 | if not os.path.exists(sampled_audio_dir): 65 | os.mkdir(sampled_audio_dir) 66 | all_utt_ids = get_all_utt_id(config) 67 | for utt_id in tqdm(all_utt_ids): 68 | ses_id = int(utt_id.split('-')[3][-1]) 69 | dialog_id = utt_id.split('-')[2] 70 | wav_path = os.path.join(config['data_root'], 'Audio', \ 71 | 'session{}'.format(ses_id), dialog_id, 'S', '{}.wav'.format(utt_id)) 72 | cmd = 'sox {} -r 11025 {}' 73 | new_audio_path = os.path.join(sampled_audio_dir, utt_id + '.wav') 74 
| os.system(cmd.format(wav_path, new_audio_path)) 75 | 76 | if __name__ == '__main__': 77 | pwd = os.path.abspath(__file__) 78 | pwd = os.path.dirname(pwd) 79 | config_path = os.path.join(pwd, '../../', 'data/config', 'MSP_config.json') 80 | config = json.load(open(config_path)) 81 | make_aligned_info(config) 82 | # convert_sr(config) -------------------------------------------------------------------------------- /baseline-mmin/preprocess/MSP/make_comparE.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | import pandas as pd 6 | import scipy.signal as spsig 7 | from torch.nn.functional import normalize 8 | from tqdm import tqdm 9 | 10 | 11 | class ComParEExtractor(object): 12 | ''' 抽取comparE特征, 输入音频路径, 输出npy数组, 每帧130d 13 | ''' 14 | def __init__(self, opensmile_tool_dir=None, downsample=10, tmp_dir='.tmp', no_tmp=False): 15 | ''' Extract ComparE feature 16 | tmp_dir: where to save opensmile csv file 17 | no_tmp: if true, delete tmp file 18 | ''' 19 | if not os.path.exists(tmp_dir): 20 | os.makedirs(tmp_dir) 21 | if opensmile_tool_dir is None: 22 | opensmile_tool_dir = '/root/opensmile-2.3.0/' 23 | self.opensmile_tool_dir = opensmile_tool_dir 24 | self.tmp_dir = tmp_dir 25 | self.downsample = downsample 26 | self.no_tmp = no_tmp 27 | 28 | def __call__(self, wav): 29 | basename = os.path.basename(wav).split('.')[0] 30 | save_path = os.path.join(self.tmp_dir, basename+".csv") 31 | cmd = 'SMILExtract -C {}/config/ComParE_2016.conf \ 32 | -appendcsvlld 0 -timestampcsvlld 1 -headercsvlld 1 \ 33 | -I {} -lldcsvoutput {} -instname xx -O ? -noconsoleoutput 1' 34 | os.system(cmd.format(self.opensmile_tool_dir, wav, save_path)) 35 | 36 | df = pd.read_csv(save_path, delimiter=';') 37 | wav_data = df.iloc[:, 2:] 38 | if len(wav_data) > self.downsample: 39 | wav_data = spsig.resample_poly(wav_data, up=1, down=self.downsample, axis=0) 40 | if self.no_tmp: 41 | os.remove(save_path) 42 | else: 43 | wav_data = None 44 | self.print(f'Error in {wav}, no feature extracted') 45 | 46 | return wav_data 47 | 48 | 49 | def get_trn_val_tst(target_root_dir, cv, setname): 50 | int2name = np.load(os.path.join(target_root_dir, str(cv), '{}_int2name.npy'.format(setname))) 51 | int2label = np.load(os.path.join(target_root_dir, str(cv), '{}_label.npy'.format(setname))) 52 | assert len(int2name) == len(int2label) 53 | return int2name, int2label 54 | 55 | def padding_to_fixlen(feat, max_len): 56 | assert feat.ndim == 2 57 | if feat.shape[0] >= max_len: 58 | feat = feat[:max_len] 59 | else: 60 | feat = np.concatenate([feat, \ 61 | np.zeros((max_len-feat.shape[0], feat.shape[1]))], axis=0) 62 | return feat 63 | 64 | def make_all_comparE(config): 65 | max_len = 50 66 | extractor = ComParEExtractor() 67 | trn_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'trn') 68 | val_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'val') 69 | tst_int2name, _ = get_trn_val_tst(config['target_root'], 1, 'tst') 70 | trn_int2name = trn_int2name.tolist() 71 | val_int2name = val_int2name.tolist() 72 | tst_int2name = tst_int2name.tolist() 73 | all_utt_ids = trn_int2name + val_int2name + tst_int2name 74 | all_feat = {} 75 | for utt_id in tqdm(all_utt_ids): # MSP-IMPROV-S01A-F01-S-FM01 76 | ses_id = int(utt_id.split('-')[3][-1]) 77 | dialog_id = utt_id.split('-')[2] 78 | wav_path = os.path.join(config['data_root'], 'Audio', f'session{ses_id}', dialog_id, 'S', f'{utt_id}.wav') 79 | feat = extractor(wav_path) 80 | all_feat[utt_id] 
= padding_to_fixlen(feat, max_len) 81 | 82 | for cv in range(1, config['total_cv']+1): 83 | save_dir = os.path.join(config['feature_root'], 'A', str(cv)) 84 | if not os.path.exists(save_dir): 85 | os.makedirs(save_dir) 86 | for set_name in ['trn', 'val', 'tst']: 87 | int2name, _ = get_trn_val_tst(config['target_root'], cv, set_name) 88 | cv_feats = [] 89 | for utt_id in int2name: 90 | cv_feats.append(all_feat[utt_id]) 91 | cv_feats = np.array(cv_feats) 92 | cv_feats = normalize(cv_feats) 93 | save_path = os.path.join(save_dir, set_name + '.npy') 94 | print(f'fold:{cv} {set_name} {cv_feats.shape}') 95 | np.save(save_path, cv_feats) 96 | 97 | def normalize(feats): 98 | _feats = feats.reshape(-1, feats.shape[2]) 99 | mean = np.mean(_feats, axis=0) 100 | std = np.std(_feats, axis=0) 101 | std[std == 0.0] = 1.0 102 | ret = (feats-mean) / (std) 103 | return ret 104 | 105 | if __name__ == '__main__': 106 | pwd = os.path.abspath(__file__) 107 | pwd = os.path.dirname(pwd) 108 | config_path = os.path.join(pwd, '../../', 'data/config', 'MSP_config.json') 109 | config = json.load(open(config_path)) 110 | make_all_comparE(config) -------------------------------------------------------------------------------- /baseline-mmin/preprocess/debug.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | def show_wdseg(utt_id): 5 | root = '/data3/lrc/IEMOCAP_full_release' 6 | word_info_dir = os.path.join(root, 'Session{}/sentences/ForcedAlignment/{}') 7 | session_id = int(utt_id[4]) 8 | dialog_id = '_'.join(utt_id.split('_')[:-1]) 9 | word_info_path = os.path.join(word_info_dir.format(session_id, dialog_id), utt_id + '.wdseg') 10 | print(f'{utt_id} wdset info:') 11 | print(open(word_info_path, 'r').read()) 12 | 13 | def show_sentence(utt_id): 14 | root = '/data3/lrc/IEMOCAP_full_release' 15 | sentence_dir = os.path.join(root, 'Session{}/dialog/transcriptions/{}.txt') 16 | session_id = int(utt_id[4]) 17 | dialog_id = '_'.join(utt_id.split('_')[:-1]) 18 | transcript_path = sentence_dir.format(session_id, dialog_id) 19 | print(f'{utt_id} transcripts:') 20 | for line in open(transcript_path).readlines(): 21 | if line.startswith(utt_id): 22 | print(line) 23 | break 24 | 25 | if __name__ == '__main__': 26 | utt_id = sys.argv[1] 27 | show_wdseg(utt_id) 28 | show_sentence(utt_id) 29 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/tools/bert_extractor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import BertTokenizer, BertModel 4 | 5 | class BertExtractor(object): 6 | def __init__(self, cuda=False, cuda_num=None): 7 | self.tokenizer = BertTokenizer.from_pretrained('/data2/lrc/bert_cache/pytorch') 8 | self.model = BertModel.from_pretrained('/data2/lrc/bert_cache/pytorch') 9 | self.model.eval() 10 | 11 | if cuda: 12 | self.cuda = True 13 | self.cuda_num = cuda_num 14 | self.model = self.model.cuda(self.cuda_num) 15 | else: 16 | self.cuda = False 17 | 18 | def tokenize(self, word_lst): 19 | word_lst = ['[CLS]'] + word_lst + ['[SEP]'] 20 | word_idx = [] 21 | ids = [] 22 | for idx, word in enumerate(word_lst): 23 | ws = self.tokenizer.tokenize(word) 24 | if not ws: 25 | # some special char 26 | continue 27 | token_ids = self.tokenizer.convert_tokens_to_ids(ws) 28 | ids.extend(token_ids) 29 | if word not in ['[CLS]', '[SEP]']: 30 | word_idx += [idx-1] * len(token_ids) 31 | return ids, word_idx 32 | 
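    # Illustrative usage sketch (added for readability; not part of the original file,
    # and the sample words are hypothetical): `tokenize` returns BERT input ids plus one
    # `word_idx` entry per WordPiece pointing back to its source word, so word-level
    # features can be pooled from the subword outputs of `get_embd` below:
    #
    #   extractor = BertExtractor(cuda=False)
    #   ids, word_idx = extractor.tokenize(['i', 'really', 'enjoyed', 'it'])
    #   seq_out, _ = extractor.get_embd(ids)        # (1, num_tokens, hidden_size)
    #   token_feats = seq_out[0, 1:-1]              # drop the [CLS] / [SEP] positions
    #   word_feats = [token_feats[[i for i, w in enumerate(word_idx) if w == k]].mean(dim=0)
    #                 for k in range(max(word_idx) + 1)]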
33 | def get_embd(self, token_ids): 34 | # token_ids = torch.tensor(token_ids) 35 | # print('TOKENIZER:', [self.tokenizer._convert_id_to_token(_id) for _id in token_ids]) 36 | token_ids = torch.tensor(token_ids).unsqueeze(0) 37 | if self.cuda: 38 | token_ids = token_ids.to(self.cuda_num) 39 | 40 | with torch.no_grad(): 41 | outputs = self.model(token_ids) 42 | 43 | # last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple 44 | 45 | sequence_output = outputs[0] 46 | pooled_output = outputs[1] 47 | return sequence_output, pooled_output 48 | 49 | def extract(self, text): 50 | input_ids = torch.tensor(self.tokenizer.encode(text)).unsqueeze(0) # Batch size 1 51 | if self.cuda: 52 | input_ids = input_ids.cuda(self.cuda_num) 53 | 54 | with torch.no_grad(): 55 | outputs = self.model(input_ids) 56 | 57 | # last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple 58 | 59 | sequence_output = outputs[0] 60 | pooled_output = outputs[1] 61 | return sequence_output, pooled_output -------------------------------------------------------------------------------- /baseline-mmin/preprocess/tools/denseface/densenet_train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | 4 | import os 5 | import tensorflow as tf 6 | import framework.model.trntst 7 | 8 | class TrnTst(framework.model.trntst.TrnTst): 9 | 10 | def _construct_feed_dict_in_trn(self, data): 11 | raise NotImplementedError("""please customize construct_feed_dict_in_trn""") 12 | 13 | # return loss value 14 | def feed_data_and_run_loss_op_in_val(self, data, sess): 15 | raise NotImplementedError("""please customize feed_data_and_run_loss_op_in_val""") 16 | 17 | # add eval result to metrics dictionary, key is metric name, val is metric value 18 | def predict_and_eval_in_val(self, sess, tst_reader, metrics): 19 | raise NotImplementedError("""please customize predict_and_eval_in_val""") 20 | 21 | # write predict result to predict_file 22 | def predict_in_tst(self, sess, tst_reader, predict_file): 23 | raise NotImplementedError("""please customize predict_in_tst""") 24 | 25 | def _iterate_epoch(self, sess, trn_reader, tst_reader, 26 | summarywriter, step, total_step, epoch): 27 | 28 | trn_batch_size = self.model_cfg.trn_batch_size 29 | avg_trn_loss = 0.
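        # The loop below runs one training epoch: it pulls batches from trn_reader,
        # fires the monitor/summary hooks every `monitor_iter` / `summary_iter` steps,
        # runs validation every `val_iter` steps, checkpoints via `self.model.saver.save`,
        # and finally returns the updated step counter with the epoch's average training loss.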
30 | batches_per_epoch = 0 31 | 32 | for data in trn_reader.yield_trn_batch(trn_batch_size): 33 | if self.model_cfg.monitor_iter > 0 and step % self.model_cfg.monitor_iter == 0: 34 | self.feed_data_and_monitor(data, sess, step) 35 | 36 | loss_value = self.feed_data_and_trn(data, sess, summarywriter=summarywriter, step=step) 37 | # print('step', step, 'loss', loss_value) 38 | avg_trn_loss += loss_value 39 | batches_per_epoch += 1 40 | 41 | step += 1 42 | 43 | if self.model_cfg.summary_iter > 0 and step % self.model_cfg.summary_iter == 0: 44 | summarystr = self.feed_data_and_summary(data, sess) 45 | summarywriter.add_summary(summarystr, step) 46 | 47 | if self.model_cfg.val_iter > 0 and step % self.model_cfg.val_iter == 0: 48 | metrics = self._validation(sess, tst_reader) 49 | metrics_str = 'step (%d/%d) '%(step, total_step) 50 | for key in metrics: 51 | metrics_str += '%s:%.4f '%(key, metrics[key]) 52 | self._logger.info(metrics_str) 53 | 54 | self.model.saver.save( 55 | sess, os.path.join(self.path_cfg.model_dir, 'epoch'), global_step=epoch) 56 | 57 | avg_trn_loss /= batches_per_epoch 58 | return step, avg_trn_loss 59 | 60 | 61 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/tools/denseface/vision_network/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/preprocess/tools/denseface/vision_network/__init__.py -------------------------------------------------------------------------------- /baseline-mmin/preprocess/tools/denseface/vision_network/data_providers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/preprocess/tools/denseface/vision_network/data_providers/__init__.py -------------------------------------------------------------------------------- /baseline-mmin/preprocess/tools/denseface/vision_network/data_providers/base_provider.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | 4 | import numpy as np 5 | 6 | 7 | class DataSet: 8 | """Class to represent some dataset: train, validation, test""" 9 | @property 10 | def num_examples(self): 11 | """Return qtty of examples in dataset""" 12 | raise NotImplementedError 13 | 14 | def next_batch(self, batch_size): 15 | """Return batch of required size of data, labels""" 16 | raise NotImplementedError 17 | 18 | 19 | class ImagesDataSet(DataSet): 20 | """Dataset for images that provide some often used methods""" 21 | 22 | def _measure_mean_and_std(self): 23 | # for every channel in image 24 | means = [] 25 | stds = [] 26 | # for every channel in image(assume this is last dimension) 27 | for ch in range(self.images.shape[-1]): 28 | means.append(np.mean(self.images[:, :, :, ch])) 29 | stds.append(np.std(self.images[:, :, :, ch])) 30 | self._means = np.array(means, np.float32) 31 | self._stds = np.array(stds, np.float32) 32 | 33 | @property 34 | def images_means(self): 35 | if not hasattr(self, '_means'): 36 | self._measure_mean_and_std() 37 | return self._means 38 | 39 | @property 40 | def images_stds(self): 41 | if not hasattr(self, '_stds'): 42 | self._measure_mean_and_std() 43 | return self._stds 44 | 45 | def shuffle_images_and_labels(self, images, labels): 46 | 
rand_indexes = np.random.permutation(images.shape[0]) 47 | shuffled_images = images[rand_indexes] 48 | shuffled_labels = labels[rand_indexes] 49 | return shuffled_images, shuffled_labels 50 | 51 | def normalize_images(self, images, normalization_type): 52 | """ 53 | Args: 54 | images: numpy 4D array 55 | normalization_type: `str`, available choices: 56 | - divide_255 57 | - divide_256 58 | - by_chanels 59 | """ 60 | if normalization_type == 'divide_255': 61 | images = images / 255 62 | elif normalization_type == 'divide_256': 63 | images = images / 256 64 | elif normalization_type == 'by_chanels': 65 | images = images.astype('float64') 66 | # for every channel in image(assume this is last dimension) 67 | for i in range(images.shape[-1]): 68 | images[:, :, :, i] = ((images[:, :, :, i] - self.images_means[i]) / 69 | self.images_stds[i]) 70 | else: 71 | raise Exception("Unknown type of normalization") 72 | return images 73 | 74 | def normalize_all_images_by_chanels(self, initial_images): 75 | new_images = np.zeros(initial_images.shape) 76 | for i in range(initial_images.shape[0]): 77 | new_images[i] = self.normalize_image_by_chanel(initial_images[i]) 78 | return new_images 79 | 80 | def normalize_image_by_chanel(self, image): 81 | new_image = np.zeros(image.shape) 82 | for chanel in range(3): 83 | mean = np.mean(image[:, :, chanel]) 84 | std = np.std(image[:, :, chanel]) 85 | new_image[:, :, chanel] = (image[:, :, chanel] - mean) / std 86 | return new_image 87 | 88 | 89 | class DataProvider: 90 | @property 91 | def data_shape(self): 92 | """Return shape as python list of one data entry""" 93 | raise NotImplementedError 94 | 95 | @property 96 | def n_classes(self): 97 | """Return `int` of num classes""" 98 | raise NotImplementedError 99 | 100 | def labels_to_one_hot(self, labels): 101 | """Convert 1D array of labels to one hot representation 102 | 103 | Args: 104 | labels: 1D numpy array 105 | """ 106 | new_labels = np.zeros((labels.shape[0], self.n_classes)) 107 | new_labels[range(labels.shape[0]), labels] = np.ones(labels.shape) 108 | return new_labels 109 | 110 | def labels_from_one_hot(self, labels): 111 | """Convert 2D array of labels to 1D class based representation 112 | 113 | Args: 114 | labels: 2D numpy array 115 | """ 116 | return np.argmax(labels, axis=1) 117 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/tools/denseface/vision_network/data_providers/downloader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import urllib 4 | import tarfile 5 | import zipfile 6 | 7 | 8 | def report_download_progress(count, block_size, total_size): 9 | pct_complete = float(count * block_size) / total_size 10 | msg = "\r {0:.1%} already downloaded".format(pct_complete) 11 | sys.stdout.write(msg) 12 | sys.stdout.flush() 13 | 14 | 15 | def download_data_url(url, download_dir): 16 | filename = url.split('/')[-1] 17 | file_path = os.path.join(download_dir, filename) 18 | 19 | if not os.path.exists(file_path): 20 | os.makedirs(download_dir) 21 | 22 | print("Download %s to %s" % (url, file_path)) 23 | file_path, _ = urllib.urlretrieve( 24 | url=url, 25 | filename=file_path, 26 | reporthook=report_download_progress) 27 | 28 | print("\nExtracting files") 29 | if file_path.endswith(".zip"): 30 | zipfile.ZipFile(file=file_path, mode="r").extractall(download_dir) 31 | elif file_path.endswith((".tar.gz", ".tgz")): 32 | tarfile.open(name=file_path, 
mode="r:gz").extractall(download_dir) 33 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/tools/denseface/vision_network/data_providers/utils.py: -------------------------------------------------------------------------------- 1 | from .cifar import Cifar10DataProvider, Cifar100DataProvider, \ 2 | Cifar10AugmentedDataProvider, Cifar100AugmentedDataProvider 3 | from .svhn import SVHNDataProvider 4 | from .fer import FERPlusDataProvider, AVECDataProvider, MUSEDataProvider, VGGFACE2DataProvieder 5 | 6 | 7 | def get_data_provider_by_name(name, data_dir, train_params): 8 | """Return required data provider class""" 9 | if name == 'C10': 10 | return Cifar10DataProvider(save_path=data_dir, **train_params) 11 | if name == 'C10+': 12 | return Cifar10AugmentedDataProvider(save_path=data_dir, **train_params) 13 | if name == 'C100': 14 | return Cifar100DataProvider(save_path=data_dir, **train_params) 15 | if name == 'C100+': 16 | return Cifar100AugmentedDataProvider(save_path=data_dir, **train_params) 17 | if name == 'SVHN': 18 | return SVHNDataProvider(**train_params) 19 | if name == 'FER+': 20 | return FERPlusDataProvider(data_dir, **train_params) 21 | if name == 'AVEC': 22 | return AVECDataProvider(data_dir, **train_params) 23 | if name == 'MUSE': 24 | return MUSEDataProvider(data_dir, **train_params) 25 | if name == 'VGGFACE2': 26 | return VGGFACE2DataProvieder(data_dir, **train_params) 27 | else: 28 | print("Sorry, data provider for `%s` dataset " 29 | "was not implemented yet" % name) 30 | exit() 31 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/tools/denseface/vision_network/denseface_feature.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | 4 | import os 5 | import argparse 6 | import cv2 7 | import collections 8 | import numpy as np 9 | 10 | from models.dense_net import DenseNet 11 | 12 | img_size = 64 13 | 14 | # # FER+ MODEL 15 | # images_mean = 129 16 | # images_std = 63.58 17 | 18 | # FER+-MEC finetune MODEL 19 | images_mean = 106 20 | images_std = 58 21 | 22 | def parse_opts(): 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument( 25 | '--model_type', '-m', type=str, choices=['DenseNet', 'DenseNet-BC'], 26 | default='DenseNet', 27 | help='What type of model to use') 28 | parser.add_argument( 29 | '--growth_rate', '-k', type=int, choices=[12, 24, 40], 30 | default=12, 31 | help='Grows rate for every layer, ' 32 | 'choices were restricted to used in paper') 33 | parser.add_argument( 34 | '--depth', '-d', type=int, choices=[40, 100, 190, 250], 35 | default=40, 36 | help='Depth of whole network, restricted to paper choices') 37 | parser.add_argument( 38 | '--total_blocks', '-tb', type=int, default=3, metavar='', 39 | help='Total blocks of layers stack (default: %(default)s)') 40 | parser.add_argument( 41 | '--reduction', '-red', type=float, default=0.5, metavar='', 42 | help='reduction Theta at transition layer for DenseNets-BC models') 43 | parser.add_argument('--batch_size', dest='batch_size', type=int, 44 | default=32) 45 | 46 | parser.add_argument('--face_dir', dest='face_dir', help='face dir') 47 | parser.add_argument('--outft_dir', dest='outft_dir') 48 | parser.add_argument('--model_path', dest='model_path') 49 | 50 | args = parser.parse_args() 51 | 52 | args.keep_prob = 1.0 53 | if args.model_type == 'DenseNet': 54 | args.bc_mode = False 
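        # plain DenseNet applies no compression at the transition layers, so the
        # reduction factor (theta) is forced back to 1.0 below; the DenseNet-BC branch
        # keeps the user-supplied --reduction value instead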
55 | args.reduction = 1.0 56 | elif args.model_type == 'DenseNet-BC': 57 | args.bc_mode = True 58 | 59 | return args 60 | 61 | 62 | def extract_feature_batch(): 63 | 64 | args = parse_opts() 65 | model_params = vars(args) 66 | batch_size = args.batch_size 67 | 68 | print("Initialize the model..") 69 | # fake data_provider 70 | DataProvider = collections.namedtuple('DataProvider', ['data_shape', 'n_classes']) 71 | data_provider = DataProvider(data_shape=(img_size, img_size, 1), n_classes=10) 72 | model = DenseNet(data_provider=data_provider, **model_params) 73 | end_points = model.end_points 74 | # for key, value in end_points.iteritems(): 75 | # print(key, value.get_shape().as_list()) 76 | # restore model 77 | model.saver.restore(model.sess, args.model_path) 78 | print("Successfully load model from model path: %s" % args.model_path) 79 | 80 | video_names = [x for x in os.listdir(args.face_dir)] 81 | video_names.sort() 82 | avg_num_imgs = 0 83 | 84 | for vid, video_name in enumerate(video_names): 85 | video_dir = os.path.join(args.face_dir, video_name) 86 | img_paths = os.listdir(video_dir) 87 | if len(img_paths) == 0: 88 | continue 89 | 90 | output_subdir = os.path.join(args.outft_dir, video_name) 91 | if os.path.exists(output_subdir): 92 | continue 93 | else: 94 | os.makedirs(output_subdir) 95 | 96 | img_paths.sort(key=lambda x:int(x.split('.')[0])) 97 | avg_num_imgs += len(img_paths) 98 | 99 | imgs = [] 100 | for img_path in img_paths: 101 | img_path = os.path.join(video_dir, img_path) 102 | img = cv2.imread(img_path) 103 | img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 104 | img = cv2.resize(img, (img_size, img_size)) 105 | imgs.append(img) 106 | 107 | imgs = (np.array(imgs, np.float32) - images_mean) / images_std 108 | imgs = np.expand_dims(imgs, 3) 109 | # pool4.shape=(batch_size, 4, 4, 256) 110 | # fc5.shape=fc6.shape=(batch_size, 1, 1, 512) 111 | # prob.shape=(batch_size, num_classes) 112 | fcs, probs = [], [] 113 | for i in xrange(0, imgs.shape[0], batch_size): 114 | feed_dict = { 115 | model.images: imgs[i: i + batch_size], 116 | model.is_training: False 117 | } 118 | fc, prob = model.sess.run( 119 | [end_points['fc'], end_points['preds']], 120 | feed_dict=feed_dict) 121 | # prev_last_pools.extend(prev_last_pool) 122 | fcs.extend(fc) 123 | probs.extend(prob) 124 | 125 | # prev_last_pools = np.array(prev_last_pools, np.float32) 126 | fcs = np.array(fcs, np.float32) 127 | probs = np.array(probs, np.float32) 128 | 129 | 130 | # with open(os.path.join(output_subdir, 'pool.npy'), 'wb') as f: 131 | # np.save(f, prev_last_pools) 132 | with open(os.path.join(output_subdir, 'fc.npy'), 'wb') as f: 133 | np.save(f, fcs) 134 | with open(os.path.join(output_subdir, 'prob.npy'), 'wb') as f: 135 | np.save(f, probs) 136 | 137 | print(vid, video_name, len(img_paths), 138 | fcs.shape, probs.shape) 139 | 140 | avg_num_imgs /= float(len(video_names)) 141 | print('average faces per video', avg_num_imgs) 142 | 143 | 144 | if __name__ == '__main__': 145 | extract_feature_batch() 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /baseline-mmin/preprocess/tools/denseface/vision_network/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/preprocess/tools/denseface/vision_network/models/__init__.py -------------------------------------------------------------------------------- 
/baseline-mmin/preprocess/tools/denseface_extractor.py: -------------------------------------------------------------------------------- 1 | 2 | import os, glob 3 | import cv2 4 | import numpy as np 5 | import tensorflow as tf 6 | import collections 7 | 8 | from preprocess.tools.denseface.vision_network.models.dense_net import DenseNet 9 | 10 | class DensefaceExtractor(object): 11 | def __init__(self, restore_path=None, mean=131.0754, std=47.858177, device=0, smooth=False): 12 | """ extract densenet feature 13 | Parameters: 14 | ------------------------ 15 | model: model class returned by function 'load_model' 16 | """ 17 | if restore_path is None: 18 | restore_path = '/data2/zjm/tools/FER_models/denseface/DenseNet-BC_growth-rate12_depth100_FERPlus/model/epoch-200' 19 | self.model = self.load_model(restore_path) 20 | self.mean = mean 21 | self.std = std 22 | self.previous_img = None # smooth 的情况下, 如果没有人脸则用上一张人脸填充 23 | self.previous_img_path = None 24 | self.smooth = smooth 25 | self.dim = 342 # returned feature dim 26 | self.device = device 27 | 28 | def load_model(self, restore_path): 29 | print("Initialize the model..") 30 | # fake data_provider 31 | growth_rate = 12 32 | img_size = 64 33 | depth = 100 34 | total_blocks = 3 35 | reduction = 0.5 36 | keep_prob = 1.0 37 | bc_mode = True 38 | model_path = restore_path 39 | dataset = 'FER+' 40 | num_class = 8 41 | 42 | DataProvider = collections.namedtuple('DataProvider', ['data_shape', 'n_classes']) 43 | data_provider = DataProvider(data_shape=(img_size, img_size, 1), n_classes=num_class) 44 | model = DenseNet(data_provider=data_provider, growth_rate=growth_rate, depth=depth, 45 | total_blocks=total_blocks, keep_prob=keep_prob, reduction=reduction, 46 | bc_mode=bc_mode, dataset=dataset) 47 | 48 | model.saver.restore(model.sess, model_path) 49 | print("Successfully load model from model path: {}".format(model_path)) 50 | return model 51 | 52 | def __call__(self, img_path): 53 | if os.path.exists(img_path): 54 | img = cv2.imread(img_path) 55 | if not isinstance(img, np.ndarray): 56 | print(f'Warning: Error in {img_path}') 57 | return None 58 | 59 | img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 60 | img = cv2.resize(img, (64, 64)) 61 | if self.smooth: 62 | self.previous_img = img 63 | self.previous_img_path = img_path 64 | 65 | elif self.smooth and self.previous_img is not None: 66 | # print('Path {} does not exists. 
Use previous img: {}'.format(img_path, self.previous_img_path)) 67 | img = self.previous_img 68 | 69 | else: 70 | feat = np.zeros([1, self.dim]) # smooth的话第一张就是黑图的话就直接返回0特征, 不smooth缺图就返回0 71 | return feat 72 | 73 | img = (img - self.mean) / self.std 74 | img = np.expand_dims(img, -1) # channel = 1 75 | img = np.expand_dims(img, 0) # batch_size=1 76 | with tf.device('/gpu:{}'.format(self.device)): 77 | feed_dict = { 78 | self.model.images: img, 79 | self.model.is_training: False 80 | } 81 | 82 | # emo index 83 | # fer_idx_to_class = ['neu', 'hap', 'sur', 'sad', 'ang', 'dis', 'fea', 'con'] 84 | 85 | ft, soft_label = \ 86 | self.model.sess.run([self.model.end_points['fc'], 87 | self.model.end_points['preds']], feed_dict=feed_dict) 88 | return ft, soft_label -------------------------------------------------------------------------------- /baseline-mmin/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/utils/__init__.py -------------------------------------------------------------------------------- /baseline-mmin/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/utils/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/utils/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /baseline-mmin/utils/__pycache__/logger.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/baseline-mmin/utils/__pycache__/logger.cpython-38.pyc -------------------------------------------------------------------------------- /baseline-mmin/utils/image_pool.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | 5 | class ImagePool(): 6 | """This class implements an image buffer that stores previously generated images. 7 | 8 | This buffer enables us to update discriminators using a history of generated images 9 | rather than the ones produced by the latest generators. 10 | """ 11 | 12 | def __init__(self, pool_size): 13 | """Initialize the ImagePool class 14 | 15 | Parameters: 16 | pool_size (int) -- the size of image buffer, if pool_size=0, no buffer will be created 17 | """ 18 | self.pool_size = pool_size 19 | if self.pool_size > 0: # create an empty pool 20 | self.num_imgs = 0 21 | self.images = [] 22 | 23 | def query(self, images): 24 | """Return an image from the pool. 
25 | 26 | Parameters: 27 | images: the latest generated images from the generator 28 | 29 | Returns images from the buffer. 30 | 31 | With probability 50%, the buffer will return the input images. 32 | With probability 50%, the buffer will return images previously stored in the buffer, 33 | and insert the current images into the buffer. 34 | """ 35 | if self.pool_size == 0: # if the buffer size is 0, do nothing 36 | return images 37 | return_images = [] 38 | for image in images: 39 | image = torch.unsqueeze(image.data, 0) 40 | if self.num_imgs < self.pool_size: # if the buffer is not full; keep inserting current images to the buffer 41 | self.num_imgs = self.num_imgs + 1 42 | self.images.append(image) 43 | return_images.append(image) 44 | else: 45 | p = random.uniform(0, 1) 46 | if p > 0.5: # by 50% chance, the buffer will return a previously stored image, and insert the current image into the buffer 47 | random_id = random.randint(0, self.pool_size - 1) # randint is inclusive 48 | tmp = self.images[random_id].clone() 49 | self.images[random_id] = image 50 | return_images.append(tmp) 51 | else: # by another 50% chance, the buffer will return the current image 52 | return_images.append(image) 53 | return_images = torch.cat(return_images, 0) # collect all the images and return 54 | return return_images 55 | -------------------------------------------------------------------------------- /baseline-mmin/utils/logger.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import logging 4 | import fcntl 5 | 6 | def get_logger(path, suffix): 7 | cur_time = time.strftime('%Y-%m-%d-%H.%M.%S',time.localtime(time.time())) 8 | logger = logging.getLogger(__name__+cur_time) 9 | logger.setLevel(level = logging.INFO) 10 | handler = logging.FileHandler(os.path.join(path, f"{suffix}_{cur_time}.log")) 11 | handler.setLevel(logging.INFO) 12 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 13 | handler.setFormatter(formatter) 14 | 15 | console = logging.StreamHandler() 16 | console.setLevel(logging.INFO) 17 | 18 | logger.addHandler(handler) 19 | logger.addHandler(console) 20 | return logger 21 | 22 | class ResultRecorder(object): 23 | def __init__(self, path, total_cv=10): 24 | self.path = path # ./logs/utt_fusion_AVL_run2/result.tsv 25 | self.total_cv = total_cv 26 | if not os.path.exists(self.path): 27 | f = open(self.path, 'w') 28 | f.write('acc\tuar\tf1\n') 29 | f.close() 30 | 31 | def is_full(self, content): 32 | if len(content) < self.total_cv+1: 33 | return False 34 | 35 | for line in content: 36 | if not len(line.split('\t')) == 3: 37 | return False 38 | return True 39 | 40 | def calc_mean(self, content): 41 | acc = [float(line.split('\t')[0]) for line in content[1:]] 42 | uar = [float(line.split('\t')[1]) for line in content[1:]] 43 | f1 = [float(line.split('\t')[2]) for line in content[1:]] 44 | mean_acc = sum(acc) / len(acc) 45 | mean_uar = sum(uar) / len(uar) 46 | mean_f1 = sum(f1) / len(f1) 47 | return mean_acc, mean_uar, mean_f1 48 | 49 | def write_result_to_tsv(self, results, cvNo): 50 | # lock the file with fcntl so that multiple processes do not write to the same result file at the same time 51 | f_in = open(self.path) 52 | fcntl.flock(f_in.fileno(), fcntl.LOCK_EX) # acquire an exclusive lock 53 | content = f_in.readlines() 54 | if len(content) < self.total_cv+1: 55 | content += ['\n'] * (self.total_cv-len(content)+1) 56 | content[cvNo] = '{:.4f}\t{:.4f}\t{:.4f}\n'.format(results['acc'], results['uar'], results['f1']) 57 | if self.is_full(content): 58 | mean_acc, mean_uar, mean_f1 = self.calc_mean(content) 59 |
content.append('{:.4f}\t{:.4f}\t{:.4f}\n'.format(mean_acc, mean_uar, mean_f1)) 60 | 61 | f_out = open(self.path, 'w') 62 | f_out.writelines(content) 63 | f_out.close() 64 | f_in.close() # 释放锁 65 | 66 | 67 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import sys 4 | import socket 5 | 6 | ## gain linux ip 7 | def get_host_ip(): 8 | try: 9 | s = socket.socket(socket.AF_INET,socket.SOCK_DGRAM) 10 | s.connect(('10.0.0.1',8080)) 11 | ip= s.getsockname()[0] 12 | finally: 13 | s.close() 14 | return ip 15 | 16 | ############ For LINUX ############## 17 | # path 18 | DATA_DIR = { 19 | 'CMUMOSI': '/share/home/lianzheng/gcnet-master/dataset/CMUMOSI', # for nlpr 20 | 'CMUMOSEI': '/share/home/lianzheng/gcnet-master/dataset/CMUMOSEI',# for nlpr 21 | 'IEMOCAPSix': '/share/home/lianzheng/gcnet-master/dataset/IEMOCAP', # for nlpr 22 | 'IEMOCAPFour': '/share/home/lianzheng/gcnet-master/dataset/IEMOCAP', # for nlpr 23 | } 24 | PATH_TO_RAW_AUDIO = { 25 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'subaudio'), 26 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'subaudio'), 27 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'subaudio'), 28 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'subaudio'), 29 | } 30 | PATH_TO_RAW_FACE = { 31 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'openface_face'), 32 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'openface_face'), 33 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'subvideofaces'), # without openfac 34 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'subvideofaces'), 35 | } 36 | PATH_TO_TRANSCRIPTIONS = { 37 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'transcription.csv'), 38 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'transcription.csv'), 39 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'transcription.csv'), 40 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'transcription.csv'), 41 | } 42 | PATH_TO_FEATURES = { 43 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'features'), 44 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'features'), 45 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'features'), 46 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'features'), 47 | } 48 | PATH_TO_LABEL = { 49 | 'CMUMOSI': os.path.join(DATA_DIR['CMUMOSI'], 'CMUMOSI_features_raw_2way.pkl'), 50 | 'CMUMOSEI': os.path.join(DATA_DIR['CMUMOSEI'], 'CMUMOSEI_features_raw_2way.pkl'), 51 | 'IEMOCAPSix': os.path.join(DATA_DIR['IEMOCAPSix'], 'IEMOCAP_features_raw_6way.pkl'), 52 | 'IEMOCAPFour': os.path.join(DATA_DIR['IEMOCAPFour'], 'IEMOCAP_features_raw_4way.pkl'), 53 | } 54 | 55 | # pre-trained models, including supervised and unsupervised 56 | PATH_TO_PRETRAINED_MODELS = '/share/home/lianzheng/tools' 57 | PATH_TO_OPENSMILE = '/share/home/lianzheng/tools/opensmile-2.3.0/' 58 | PATH_TO_FFMPEG = '/share/home/lianzheng/tools/ffmpeg-4.4.1-i686-static/ffmpeg' 59 | 60 | # dir 61 | SAVED_ROOT = os.path.join('../saved') 62 | DATA_DIR = os.path.join(SAVED_ROOT, 'data') 63 | MODEL_DIR = os.path.join(SAVED_ROOT, 'model') 64 | LOG_DIR = os.path.join(SAVED_ROOT, 'log') 65 | 66 | 67 | 68 | ############ For Windows ############## 69 | DATA_DIR_Win = { 70 | 'CMUMOSI': 'E:\\Dataset\\CMU-MOSI\\Raw', 71 | 'CMUMOSEI1': 'E:\\Dataset\\CMU-MOSEI', # extract openface in five subprocess 72 | 'CMUMOSEI2': 'E:\\Dataset\\CMU-MOSEI', # extract openface in five subprocess 73 | 
'CMUMOSEI3': 'E:\\Dataset\\CMU-MOSEI', # extract openface in five subprocess 74 | 'CMUMOSEI4': 'E:\\Dataset\\CMU-MOSEI', # extract openface in five subprocess 75 | 'CMUMOSEI5': 'E:\\Dataset\\CMU-MOSEI', # extract openface in five subprocess 76 | } 77 | 78 | PATH_TO_RAW_FACE_Win = { 79 | 'CMUMOSI': os.path.join(DATA_DIR_Win['CMUMOSI'], 'Video\\Segmented'), 80 | 'CMUMOSEI1': os.path.join(DATA_DIR_Win['CMUMOSEI1'], 'subvideo1'), 81 | 'CMUMOSEI2': os.path.join(DATA_DIR_Win['CMUMOSEI2'], 'subvideo2'), 82 | 'CMUMOSEI3': os.path.join(DATA_DIR_Win['CMUMOSEI3'], 'subvideo3'), 83 | 'CMUMOSEI4': os.path.join(DATA_DIR_Win['CMUMOSEI4'], 'subvideo4'), 84 | 'CMUMOSEI5': os.path.join(DATA_DIR_Win['CMUMOSEI5'], 'subvideo5'), 85 | } 86 | 87 | PATH_TO_FEATURES_Win = { 88 | 'CMUMOSI': os.path.join(DATA_DIR_Win['CMUMOSI'], 'features'), 89 | 'CMUMOSEI1': os.path.join(DATA_DIR_Win['CMUMOSEI1'], 'features'), 90 | 'CMUMOSEI2': os.path.join(DATA_DIR_Win['CMUMOSEI2'], 'features'), 91 | 'CMUMOSEI3': os.path.join(DATA_DIR_Win['CMUMOSEI3'], 'features'), 92 | 'CMUMOSEI4': os.path.join(DATA_DIR_Win['CMUMOSEI4'], 'features'), 93 | 'CMUMOSEI5': os.path.join(DATA_DIR_Win['CMUMOSEI5'], 'features'), 94 | } 95 | 96 | PATH_TO_OPENFACE_Win = "H:\\desktop\\Multimedia-Transformer\\gcnet-master\\OpenFace_2.2.0_win_x64\\OpenFace_2.2.0_win_x64" 97 | PATH_TO_FFMPEG_Win = "H:\\desktop\\Multimedia-Transformer\\tools\\ffmpeg-3.4.1-win32-static\\bin\\ffmpeg" 98 | 99 | -------------------------------------------------------------------------------- /dataset/CMUMOSEI/CMUMOSEI_features_raw_2way.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/dataset/CMUMOSEI/CMUMOSEI_features_raw_2way.pkl -------------------------------------------------------------------------------- /dataset/CMUMOSI/CMUMOSI_features_raw_2way.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/dataset/CMUMOSI/CMUMOSI_features_raw_2way.pkl -------------------------------------------------------------------------------- /dataset/IEMOCAP/IEMOCAP_features_raw_4way.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/dataset/IEMOCAP/IEMOCAP_features_raw_4way.pkl -------------------------------------------------------------------------------- /dataset/IEMOCAP/IEMOCAP_features_raw_6way.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/dataset/IEMOCAP/IEMOCAP_features_raw_6way.pkl -------------------------------------------------------------------------------- /face_detection_yunet_2021sep.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/face_detection_yunet_2021sep.onnx -------------------------------------------------------------------------------- /feature_extraction/audio/__pycache__/config.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/__pycache__/config.cpython-38.pyc 
-------------------------------------------------------------------------------- /feature_extraction/audio/__pycache__/feature_extractor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/__pycache__/feature_extractor.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/audio/__pycache__/util.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/__pycache__/util.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/audio/panns/__pycache__/models.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/panns/__pycache__/models.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/audio/panns/__pycache__/pytorch_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/panns/__pycache__/pytorch_utils.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/audio/panns/evaluate.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | 3 | from pytorch_utils import forward 4 | 5 | 6 | class Evaluator(object): 7 | def __init__(self, model): 8 | """Evaluator. 9 | 10 | Args: 11 | model: object 12 | """ 13 | self.model = model 14 | 15 | def evaluate(self, data_loader): 16 | """Forward evaluation data and calculate statistics. 
17 | 18 | Args: 19 | data_loader: object 20 | 21 | Returns: 22 | statistics: dict, 23 | {'average_precision': (classes_num,), 'auc': (classes_num,)} 24 | """ 25 | 26 | # Forward 27 | output_dict = forward( 28 | model=self.model, 29 | generator=data_loader, 30 | return_target=True) 31 | 32 | clipwise_output = output_dict['clipwise_output'] # (audios_num, classes_num) 33 | target = output_dict['target'] # (audios_num, classes_num) 34 | 35 | average_precision = metrics.average_precision_score( 36 | target, clipwise_output, average=None) 37 | 38 | auc = metrics.roc_auc_score(target, clipwise_output, average=None) 39 | 40 | statistics = {'average_precision': average_precision, 'auc': auc} 41 | 42 | return statistics -------------------------------------------------------------------------------- /feature_extraction/audio/panns/finetune_template.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.insert(1, os.path.join(sys.path[0], '../utils')) 4 | import numpy as np 5 | import argparse 6 | import h5py 7 | import math 8 | import time 9 | import logging 10 | import matplotlib.pyplot as plt 11 | 12 | import torch 13 | torch.backends.cudnn.benchmark=True 14 | torch.manual_seed(0) 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | import torch.optim as optim 18 | import torch.utils.data 19 | 20 | from utilities import get_filename 21 | from models import * 22 | import config 23 | 24 | 25 | class Transfer_Cnn14(nn.Module): 26 | def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 27 | fmax, classes_num, freeze_base): 28 | """Classifier for a new task using pretrained Cnn14 as a sub module. 29 | """ 30 | super(Transfer_Cnn14, self).__init__() 31 | audioset_classes_num = 527 32 | 33 | self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin, 34 | fmax, audioset_classes_num) 35 | 36 | # Transfer to another task layer 37 | self.fc_transfer = nn.Linear(2048, classes_num, bias=True) 38 | 39 | if freeze_base: 40 | # Freeze AudioSet pretrained layers 41 | for param in self.base.parameters(): 42 | param.requires_grad = False 43 | 44 | self.init_weights() 45 | 46 | def init_weights(self): 47 | init_layer(self.fc_transfer) 48 | 49 | def load_from_pretrain(self, pretrained_checkpoint_path): 50 | checkpoint = torch.load(pretrained_checkpoint_path) 51 | self.base.load_state_dict(checkpoint['model']) 52 | 53 | def forward(self, input, mixup_lambda=None): 54 | """Input: (batch_size, data_length) 55 | """ 56 | output_dict = self.base(input, mixup_lambda) 57 | embedding = output_dict['embedding'] 58 | 59 | clipwise_output = torch.log_softmax(self.fc_transfer(embedding), dim=-1) 60 | output_dict['clipwise_output'] = clipwise_output 61 | 62 | return output_dict 63 | 64 | 65 | def train(args): 66 | 67 | # Arugments & parameters 68 | sample_rate = args.sample_rate 69 | window_size = args.window_size 70 | hop_size = args.hop_size 71 | mel_bins = args.mel_bins 72 | fmin = args.fmin 73 | fmax = args.fmax 74 | model_type = args.model_type 75 | pretrained_checkpoint_path = args.pretrained_checkpoint_path 76 | freeze_base = args.freeze_base 77 | device = 'cuda' if (args.cuda and torch.cuda.is_available()) else 'cpu' 78 | 79 | classes_num = config.classes_num 80 | pretrain = True if pretrained_checkpoint_path else False 81 | 82 | # Model 83 | Model = eval(model_type) 84 | model = Model(sample_rate, window_size, hop_size, mel_bins, fmin, fmax, 85 | classes_num, freeze_base) 86 | 87 | # Load pretrained model 
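    # (only the Cnn14 backbone, i.e. model.base, is restored by load_from_pretrain;
    # the new fc_transfer classification head keeps its random initialization and is
    # trained from scratch on the target task)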
88 | if pretrain: 89 | logging.info('Load pretrained model from {}'.format(pretrained_checkpoint_path)) 90 | model.load_from_pretrain(pretrained_checkpoint_path) 91 | 92 | # Parallel 93 | print('GPU number: {}'.format(torch.cuda.device_count())) 94 | model = torch.nn.DataParallel(model) 95 | 96 | if 'cuda' in device: 97 | model.to(device) 98 | 99 | print('Load pretrained model successfully!') 100 | 101 | 102 | if __name__ == '__main__': 103 | parser = argparse.ArgumentParser(description='Example of parser. ') 104 | subparsers = parser.add_subparsers(dest='mode') 105 | 106 | # Train 107 | parser_train = subparsers.add_parser('train') 108 | parser_train.add_argument('--sample_rate', type=int, required=True) 109 | parser_train.add_argument('--window_size', type=int, required=True) 110 | parser_train.add_argument('--hop_size', type=int, required=True) 111 | parser_train.add_argument('--mel_bins', type=int, required=True) 112 | parser_train.add_argument('--fmin', type=int, required=True) 113 | parser_train.add_argument('--fmax', type=int, required=True) 114 | parser_train.add_argument('--model_type', type=str, required=True) 115 | parser_train.add_argument('--pretrained_checkpoint_path', type=str) 116 | parser_train.add_argument('--freeze_base', action='store_true', default=False) 117 | parser_train.add_argument('--cuda', action='store_true', default=False) 118 | 119 | # Parse arguments 120 | args = parser.parse_args() 121 | args.filename = get_filename(__file__) 122 | 123 | if args.mode == 'train': 124 | train(args) 125 | 126 | else: 127 | raise Exception('Error argument!') -------------------------------------------------------------------------------- /feature_extraction/audio/panns/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def clip_bce(output_dict, target_dict): 6 | """Binary crossentropy loss. 
7 | """ 8 | return F.binary_cross_entropy( 9 | output_dict['clipwise_output'], target_dict['target']) 10 | 11 | 12 | def get_loss_func(loss_type): 13 | if loss_type == 'clip_bce': 14 | return clip_bce -------------------------------------------------------------------------------- /feature_extraction/audio/run.sh: -------------------------------------------------------------------------------- 1 | python extract_handcrafted_feature.py --dataset='CHEAVD' --feature_extractor='pyAudio' --feature_set='pyAudio' --feature_level='UTTERANCE' 2 | python extract_handcrafted_feature.py --dataset='CHEAVD' --feature_extractor='opensmile' --feature_set='IS09' --feature_level='UTTERANCE' 3 | python extract_handcrafted_feature.py --dataset='CHEAVD' --feature_extractor='opensmile' --feature_set='IS10' --feature_level='UTTERANCE' 4 | python extract_handcrafted_feature.py --dataset='CHEAVD' --feature_extractor='opensmile' --feature_set='IS13' --feature_level='UTTERANCE' 5 | python extract_handcrafted_feature.py --dataset='CHEAVD' --feature_extractor='opensmile' --feature_set='eGeMAPS' --feature_level='UTTERANCE' 6 | python extract_handcrafted_feature.py --dataset='CHEAVD' --feature_extractor='Librosa' --feature_set='mel_spec' --feature_level='UTTERANCE' 7 | python extract_handcrafted_feature.py --dataset='CHEAVD' --feature_extractor='Librosa' --feature_set='mfcc' --feature_level='UTTERANCE' 8 | python extract_wav2vec_embedding.py --dataset='CHEAVD' --feature_level='UTTERANCE' --gpu=0 9 | python extract_wav2vec2_embedding.py --dataset='CHEAVD' --model_name='wav2vec2-base' --feature_level='UTTERANCE' --gpu=0 10 | python extract_wav2vec2_embedding.py --dataset='CHEAVD' --model_name='wav2vec2-base-960h' --feature_level='UTTERANCE' --gpu=0 11 | python extract_wav2vec2_embedding.py --dataset='CHEAVD' --model_name='wav2vec2-large-960h' --feature_level='UTTERANCE' --gpu=0 12 | python extract_panns_embedding.py --dataset='CHEAVD' --feature_level='UTTERANCE' --gpu=0 13 | python extract_vggish_embedding.py --dataset='CHEAVD' --feature_level='UTTERANCE' --gpu=0 -------------------------------------------------------------------------------- /feature_extraction/audio/smile.log: -------------------------------------------------------------------------------- 1 | [ 13.01.2022 - 16:42:06 ] 2 | (MSG) [2] in SMILExtract : openSMILE starting! 3 | [ 13.01.2022 - 16:42:06 ] 4 | (MSG) [2] in SMILExtract : config file is: /share/home/lianzheng/tools/opensmile-2.3.0/config/gemaps/eGeMAPSv01a.conf 5 | [ 13.01.2022 - 16:42:06 ] 6 | (MSG) [2] in cComponentManager : successfully registered 96 component types. 7 | [ 13.01.2022 - 16:42:06 ] 8 | (MSG) [2] in instance 'gemapsv01a_logSpectral' : logSpecFloor = -140.00 (specFloor = 1.000000e-14) 9 | [ 13.01.2022 - 16:42:06 ] 10 | (MSG) [2] in instance 'egemapsv01a_logSpectral_flux' : logSpecFloor = -140.00 (specFloor = 1.000000e-14) 11 | [ 13.01.2022 - 16:42:06 ] 12 | (MSG) [2] in instance 'lldsink' : No filename given, disabling this sink component. 13 | [ 13.01.2022 - 16:42:06 ] 14 | (MSG) [2] in instance 'lldhtksink' : No filename given, disabling this sink component. 15 | [ 13.01.2022 - 16:42:06 ] 16 | (MSG) [2] in instance 'lldarffsink' : No filename given, disabling this sink component. 17 | [ 13.01.2022 - 16:42:06 ] 18 | (MSG) [2] in instance 'arffsink' : No filename given, disabling this sink component. 19 | [ 13.01.2022 - 16:42:06 ] 20 | (MSG) [2] in instance 'htksink' : No filename given, disabling this sink component. 
21 | [ 13.01.2022 - 16:42:06 ] 22 | (WARN) [1] in instance 'gemapsv01a_formantVoiced.reader' : Mismatch in input level buffer sizes (levelconf.nT). Level #0 has size 5 which is smaller than the max. input size of all input levels (150). This might cause the processing to hang unpredictably or cause incomplete processing. 23 | [ 13.01.2022 - 16:42:06 ] 24 | (WARN) [1] in instance 'gemapsv01a_logSpectralVoiced.reader' : Mismatch in input level buffer sizes (levelconf.nT). Level #0 has size 5 which is smaller than the max. input size of all input levels (150). This might cause the processing to hang unpredictably or cause incomplete processing. 25 | [ 13.01.2022 - 16:42:06 ] 26 | (WARN) [1] in instance 'gemapsv01a_logSpectralUnvoiced.reader' : Mismatch in input level buffer sizes (levelconf.nT). Level #0 has size 5 which is smaller than the max. input size of all input levels (150). This might cause the processing to hang unpredictably or cause incomplete processing. 27 | [ 13.01.2022 - 16:42:06 ] 28 | (WARN) [1] in instance 'egemapsv01a_logSpectralVoiced.reader' : Mismatch in input level buffer sizes (levelconf.nT). Level #0 has size 5 which is smaller than the max. input size of all input levels (150). This might cause the processing to hang unpredictably or cause incomplete processing. 29 | [ 13.01.2022 - 16:42:06 ] 30 | (WARN) [1] in instance 'egemapsv01a_logSpectralUnvoiced.reader' : Mismatch in input level buffer sizes (levelconf.nT). Level #0 has size 5 which is smaller than the max. input size of all input levels (150). This might cause the processing to hang unpredictably or cause incomplete processing. 31 | [ 13.01.2022 - 16:42:06 ] 32 | (MSG) [2] in cComponentManager : successfully finished createInstances 33 | (77 component instances were finalised, 1 data memories were finalised) 34 | [ 13.01.2022 - 16:42:06 ] 35 | (MSG) [2] in cComponentManager : starting single thread processing loop 36 | [ 13.01.2022 - 16:42:06 ] 37 | (MSG) [2] in cComponentManager : Processing finished! System ran for 603 ticks. 
38 | -------------------------------------------------------------------------------- /feature_extraction/audio/vggish/__pycache__/mel_features.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/vggish/__pycache__/mel_features.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/audio/vggish/__pycache__/vggish_input.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/vggish/__pycache__/vggish_input.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/audio/vggish/__pycache__/vggish_params.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/vggish/__pycache__/vggish_params.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/audio/vggish/__pycache__/vggish_slim.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/vggish/__pycache__/vggish_slim.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/audio/vggish/vggish_input.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Compute input examples for VGGish from audio waveform.""" 17 | 18 | import numpy as np 19 | import resampy # verison: 0.2.2, pip install resampy 20 | import math 21 | from vggish import mel_features 22 | from vggish import vggish_params 23 | 24 | try: 25 | import soundfile as sf 26 | 27 | def wav_read(wav_file): 28 | wav_data, sr = sf.read(wav_file, dtype='int16') 29 | return wav_data, sr 30 | 31 | except ImportError: 32 | 33 | def wav_read(wav_file): 34 | raise NotImplementedError('WAV file reading requires soundfile package.') 35 | 36 | 37 | def waveform_to_examples(data, sample_rate, hop_sec): 38 | """Converts audio waveform into an array of examples for VGGish. 39 | 40 | Args: 41 | data: np.array of either one dimension (mono) or two dimensions 42 | (multi-channel, with the outer dimension representing channels). 43 | Each sample is generally expected to lie in the range [-1.0, +1.0], 44 | although this is not required. 45 | sample_rate: Sample rate of data. 
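    hop_sec: Hop (in seconds) between successive examples; this argument replaces
      the fixed vggish_params.EXAMPLE_HOP_SECONDS used in the original VGGish code.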
46 | 47 | Returns: 48 | 3-D np.array of shape [num_examples, num_frames, num_bands] which represents 49 | a sequence of examples, each of which contains a patch of log mel 50 | spectrogram, covering num_frames frames of audio and num_bands mel frequency 51 | bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. 52 | """ 53 | # Convert to mono. 54 | if len(data.shape) > 1: 55 | data = np.mean(data, axis=1) 56 | # Resample to the rate assumed by VGGish. 57 | if sample_rate != vggish_params.SAMPLE_RATE: 58 | data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) 59 | 60 | # Compute log mel spectrogram features. 61 | log_mel = mel_features.log_mel_spectrogram( 62 | data, 63 | audio_sample_rate=vggish_params.SAMPLE_RATE, 64 | log_offset=vggish_params.LOG_OFFSET, 65 | window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS, 66 | hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS, 67 | num_mel_bins=vggish_params.NUM_MEL_BINS, 68 | lower_edge_hertz=vggish_params.MEL_MIN_HZ, 69 | upper_edge_hertz=vggish_params.MEL_MAX_HZ) 70 | 71 | # Frame features into examples. 72 | features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS 73 | example_window_length = int(round( 74 | vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) 75 | example_hop_length = int(round( 76 | hop_sec * features_sample_rate)) 77 | # vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) # orginal 78 | log_mel_examples = mel_features.frame( 79 | log_mel, 80 | window_length=example_window_length, 81 | hop_length=example_hop_length) 82 | return log_mel_examples 83 | 84 | 85 | def wavfile_to_examples(wav_file, hop_sec): 86 | """Convenience wrapper around waveform_to_examples() for a common WAV format. 87 | 88 | Args: 89 | wav_file: String path to a file, or a file-like object. The file 90 | is assumed to contain WAV audio data with signed 16-bit PCM samples. 91 | 92 | Returns: 93 | See waveform_to_examples. 94 | """ 95 | wav_data, sr = wav_read(wav_file) 96 | assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype 97 | samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] 98 | 99 | ### process for samples < 1000ms, pad to longer than 1000ms 100 | if len(samples) < sr: 101 | samples = samples.tolist() 102 | samples = samples * math.ceil(sr/len(samples)) 103 | samples = np.array(samples) 104 | 105 | return waveform_to_examples(samples, sr, hop_sec) 106 | -------------------------------------------------------------------------------- /feature_extraction/audio/vggish/vggish_params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Global parameters for the VGGish model. 17 | 18 | See vggish_slim.py for more information. 19 | """ 20 | 21 | # Architectural constants. 
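# Each input patch covers EXAMPLE_WINDOW_SECONDS of audio at STFT_HOP_LENGTH_SECONDS
# per frame, i.e. 0.96 / 0.010 = 96 frames, which is where NUM_FRAMES below comes from.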
22 | NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. 23 | NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. 24 | EMBEDDING_SIZE = 128 # Size of embedding layer. 25 | 26 | # Hyperparameters used in feature and example generation. 27 | SAMPLE_RATE = 16000 28 | STFT_WINDOW_LENGTH_SECONDS = 0.025 29 | STFT_HOP_LENGTH_SECONDS = 0.010 30 | NUM_MEL_BINS = NUM_BANDS 31 | MEL_MIN_HZ = 125 32 | MEL_MAX_HZ = 7500 33 | LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. 34 | EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames 35 | # Note: original value for EXAMPLE_HOP_SECONDS is 0.96, i.e. no overlapping between adjacent examples 36 | # EXAMPLE_HOP_SECONDS = 0.25 # with zero overlap. 37 | 38 | # Parameters used for embedding postprocessing. 39 | PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' 40 | PCA_MEANS_NAME = 'pca_means' 41 | QUANTIZE_MIN_VAL = -2.0 42 | QUANTIZE_MAX_VAL = +2.0 43 | 44 | # Hyperparameters used in training. 45 | INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. 46 | LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. 47 | ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. 48 | 49 | # Names of ops, tensors, and features. 50 | INPUT_OP_NAME = 'vggish/input_features' 51 | INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' 52 | OUTPUT_OP_NAME = 'vggish/embedding' 53 | OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' 54 | AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' 55 | -------------------------------------------------------------------------------- /feature_extraction/audio/vggish/vggish_pca_params.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/audio/vggish/vggish_pca_params.npz -------------------------------------------------------------------------------- /feature_extraction/audio/vggish/vggish_postprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Post-process embeddings from VGGish.""" 17 | 18 | import numpy as np 19 | 20 | import vggish_params 21 | 22 | 23 | class Postprocessor(object): 24 | """Post-processes VGGish embeddings. 25 | 26 | The initial release of AudioSet included 128-D VGGish embeddings for each 27 | segment of AudioSet. These released embeddings were produced by applying 28 | a PCA transformation (technically, a whitening transform is included as well) 29 | and 8-bit quantization to the raw embedding output from VGGish, in order to 30 | stay compatible with the YouTube-8M project which provides visual embeddings 31 | in the same format for a large set of YouTube videos. 
This class implements 32 | the same PCA (with whitening) and quantization transformations. 33 | """ 34 | 35 | def __init__(self, pca_params_npz_path): 36 | """Constructs a postprocessor. 37 | 38 | Args: 39 | pca_params_npz_path: Path to a NumPy-format .npz file that 40 | contains the PCA parameters used in postprocessing. 41 | """ 42 | params = np.load(pca_params_npz_path) 43 | self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME] 44 | # Load means into a column vector for easier broadcasting later. 45 | self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1) 46 | assert self._pca_matrix.shape == ( 47 | vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), ( 48 | 'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,)) 49 | assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), ( 50 | 'Bad PCA means shape: %r' % (self._pca_means.shape,)) 51 | 52 | def postprocess(self, embeddings_batch): 53 | """Applies postprocessing to a batch of embeddings. 54 | 55 | Args: 56 | embeddings_batch: An nparray of shape [batch_size, embedding_size] 57 | containing output from the embedding layer of VGGish. 58 | 59 | Returns: 60 | An nparray of the same shape as the input but of type uint8, 61 | containing the PCA-transformed and quantized version of the input. 62 | """ 63 | assert len(embeddings_batch.shape) == 2, ( 64 | 'Expected 2-d batch, got %r' % (embeddings_batch.shape,)) 65 | assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, ( 66 | 'Bad batch shape: %r' % (embeddings_batch.shape,)) 67 | 68 | # Apply PCA. 69 | # - Embeddings come in as [batch_size, embedding_size]. 70 | # - Transpose to [embedding_size, batch_size]. 71 | # - Subtract pca_means column vector from each column. 72 | # - Premultiply by PCA matrix of shape [output_dims, input_dims] 73 | # where both are are equal to embedding_size in our case. 74 | # - Transpose result back to [batch_size, embedding_size]. 75 | pca_applied = np.dot(self._pca_matrix, 76 | (embeddings_batch.T - self._pca_means)).T 77 | 78 | # Quantize by: 79 | # - clipping to [min, max] range 80 | clipped_embeddings = np.clip( 81 | pca_applied, vggish_params.QUANTIZE_MIN_VAL, 82 | vggish_params.QUANTIZE_MAX_VAL) 83 | # - convert to 8-bit in range [0.0, 255.0] 84 | quantized_embeddings = ( 85 | (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) * 86 | (255.0 / 87 | (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL))) 88 | # - cast 8-bit float to uint8 89 | quantized_embeddings = quantized_embeddings.astype(np.uint8) 90 | 91 | return quantized_embeddings 92 | -------------------------------------------------------------------------------- /feature_extraction/audio/vggish/vggish_smoke_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """A smoke test for VGGish. 17 | 18 | This is a simple smoke test of a local install of VGGish and its associated 19 | downloaded files. We create a synthetic sound, extract log mel spectrogram 20 | features, run them through VGGish, post-process the embedding ouputs, and 21 | check some simple statistics of the results, allowing for variations that 22 | might occur due to platform/version differences in the libraries we use. 23 | 24 | Usage: 25 | - Download the VGGish checkpoint and PCA parameters into the same directory as 26 | the VGGish source code. If you keep them elsewhere, update the checkpoint_path 27 | and pca_params_path variables below. 28 | - Run: 29 | $ python vggish_smoke_test.py 30 | """ 31 | 32 | from __future__ import print_function 33 | 34 | import numpy as np 35 | import tensorflow.compat.v1 as tf 36 | import os 37 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 38 | tf.disable_v2_behavior() 39 | 40 | import vggish_input 41 | import vggish_params 42 | import vggish_postprocess 43 | import vggish_slim 44 | 45 | print('\nTesting your install of VGGish\n') 46 | 47 | # Paths to downloaded VGGish files. 48 | checkpoint_path = 'vggish_model.ckpt' 49 | pca_params_path = 'vggish_pca_params.npz' 50 | 51 | # Relative tolerance of errors in mean and standard deviation of embeddings. 52 | rel_error = 0.1 # Up to 10% 53 | 54 | # Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate 55 | # to test resampling to 16 kHz during feature extraction). 56 | num_secs = 3 57 | freq = 1000 58 | sr = 44100 59 | t = np.linspace(0, num_secs, int(num_secs * sr)) 60 | x = np.sin(2 * np.pi * freq * t) 61 | 62 | # Produce a batch of log mel spectrogram examples. 63 | input_batch = vggish_input.waveform_to_examples(x, sr) 64 | print('Log Mel Spectrogram example: ', input_batch[0]) 65 | np.testing.assert_equal( 66 | input_batch.shape, 67 | [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS]) 68 | 69 | # Define VGGish, load the checkpoint, and run the batch through the model to 70 | # produce embeddings. 71 | with tf.Graph().as_default(), tf.Session() as sess: 72 | vggish_slim.define_vggish_slim() 73 | vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) 74 | 75 | features_tensor = sess.graph.get_tensor_by_name( 76 | vggish_params.INPUT_TENSOR_NAME) 77 | embedding_tensor = sess.graph.get_tensor_by_name( 78 | vggish_params.OUTPUT_TENSOR_NAME) 79 | [embedding_batch] = sess.run([embedding_tensor], 80 | feed_dict={features_tensor: input_batch}) 81 | print('VGGish embedding: ', embedding_batch[0]) 82 | expected_embedding_mean = 0.131 83 | expected_embedding_std = 0.238 84 | np.testing.assert_allclose( 85 | [np.mean(embedding_batch), np.std(embedding_batch)], 86 | [expected_embedding_mean, expected_embedding_std], 87 | rtol=rel_error) 88 | 89 | # Postprocess the results to produce whitened quantized embeddings. 
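# The Postprocessor applies the released AudioSet PCA (with whitening) and then
# quantizes each value: clip to [QUANTIZE_MIN_VAL, QUANTIZE_MAX_VAL] = [-2.0, +2.0],
# rescale linearly to [0, 255] and cast to uint8 (see vggish_postprocess.py above).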
90 | pproc = vggish_postprocess.Postprocessor(pca_params_path) 91 | postprocessed_batch = pproc.postprocess(embedding_batch) 92 | print('Postprocessed VGGish embedding: ', postprocessed_batch[0]) 93 | expected_postprocessed_mean = 123.0 94 | expected_postprocessed_std = 75.0 95 | np.testing.assert_allclose( 96 | [np.mean(postprocessed_batch), np.std(postprocessed_batch)], 97 | [expected_postprocessed_mean, expected_postprocessed_std], 98 | rtol=rel_error) 99 | 100 | print('\nLooks Good To Me!\n') 101 | -------------------------------------------------------------------------------- /feature_extraction/text/__pycache__/util.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/text/__pycache__/util.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/text/util.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import re 4 | import pandas as pd 5 | import numpy as np 6 | import unicodedata 7 | 8 | 9 | def write_feature_to_csv(embeddings, timestamps, words, csv_file, log_file=None, embedding_dim=None): 10 | # get label file 11 | vid = os.path.basename(os.path.splitext(csv_file)[0]) 12 | # label_dir = os.path.abspath(os.path.join(os.path.dirname(csv_file), '../../label_segments/arousal')) 13 | # assert os.path.exists(label_dir), f'Error: label dir "{label_dir}" does not exist!' 14 | save_dir = os.path.dirname(csv_file) 15 | task_id = int(re.search('c(\d)_muse_', save_dir).group(1)) # infer the task id from save_dir (naive/unelegant approach) 16 | if task_id == 2: # for task "c2" 17 | rel_path = '../au' # use csv file in "au" feature as reference beacause of there is no timestamp in the label file 18 | elif task_id == 4: # for task "c4" 19 | rel_path = '../../label_segments/anno12_EDA' # no arousal label for this task 20 | else: 21 | rel_path = '../../label_segments/arousal' 22 | label_dir = os.path.abspath(os.path.join(save_dir, rel_path)) 23 | assert os.path.exists(label_dir), f'Error: label dir "{label_dir}" does not exist!' 
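    # the label csv of this video supplies the reference frame timestamps onto which
    # the word-level embeddings are aligned below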
24 | label_file = os.path.join(label_dir, f'{vid}.csv') 25 | df_label = pd.read_csv(label_file) 26 | meta_columns = ['timestamp', 'segment_id'] 27 | metas = df_label[meta_columns].values 28 | label_timestamps = metas[:,0] 29 | # align word, timestamp & embedding 30 | # embedding_dim = len(embeddings[0]) # use the argument "embedding_dim" instead, in case of embeddings is [] 31 | n_frames = len(label_timestamps) 32 | aligned_embeddings = np.zeros((n_frames, embedding_dim)) 33 | aligned_timestamps = np.empty((n_frames, 2), dtype=np.object) 34 | aligned_words = np.empty((n_frames,), dtype=np.object) 35 | label_timestamp_idxs = np.arange(n_frames) 36 | hit_count = 0 37 | for i, (s_t, e_t) in enumerate(timestamps): 38 | idxs = label_timestamp_idxs[np.where((label_timestamps >= s_t) & (label_timestamps < e_t))] 39 | if len(idxs) > 0: 40 | aligned_embeddings[idxs] = embeddings[i] 41 | aligned_timestamps[idxs] = [int(s_t), int(e_t)] 42 | aligned_words[idxs] = words[i] 43 | hit_count += len(idxs) 44 | print(f'Video "{vid}" hit rate: {hit_count/n_frames:.1%}.') 45 | # write csv file 46 | columns = meta_columns + [str(i) for i in range(embedding_dim)] 47 | data = np.column_stack([metas, aligned_embeddings]) 48 | df = pd.DataFrame(data=data, columns=columns) 49 | df[meta_columns] = df[meta_columns].astype(np.int64) 50 | df.to_csv(csv_file, index=False) 51 | # write log file 52 | if log_file is not None: 53 | log_columns = meta_columns + ['start', 'end', 'word'] 54 | log_data = np.column_stack([metas, aligned_timestamps, aligned_words]) 55 | log_df = pd.DataFrame(data=log_data, columns=log_columns) 56 | log_df[meta_columns] = log_df[meta_columns].astype(np.int64) 57 | if not os.path.exists(os.path.dirname(log_file)): 58 | os.makedirs(os.path.dirname(log_file)) 59 | log_df.to_csv(log_file, index=False) 60 | return data 61 | 62 | 63 | 64 | 65 | def load_glove(embedding_file): 66 | embeddings = {} 67 | with open(embedding_file, 'r') as f: 68 | for line in f.readlines(): 69 | splited_line = line.split(' ') 70 | word = splited_line[0] 71 | embedding = np.array([float(val) for val in splited_line[1:]]) # to numpy 72 | embeddings[word] = embedding 73 | embedding_dim = len(embedding) 74 | return embeddings, embedding_dim 75 | 76 | 77 | def load_word2vec(embedding_file): 78 | import gensim 79 | model = gensim.models.KeyedVectors.load_word2vec_format(embedding_file, binary=True) 80 | # embeddings = dict(zip(model.vocab, model.vectors)) # for Gensim 3.x 81 | embedding_dim = model.vector_size 82 | return model, embedding_dim 83 | 84 | 85 | # strip accent in unicode string 86 | def strip_accent(string): 87 | return ''.join( 88 | character for character in unicodedata.normalize('NFD', string) 89 | if unicodedata.category(character) != 'Mn' 90 | ) 91 | 92 | 93 | 94 | 95 | if __name__ == '__main__': 96 | main() -------------------------------------------------------------------------------- /feature_extraction/visual/__pycache__/config.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/__pycache__/config.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/__pycache__/dataset.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/__pycache__/dataset.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/__pycache__/util.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/__pycache__/util.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/dataset.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import glob 4 | from PIL import Image 5 | from skimage import io 6 | import torch.utils.data as data 7 | 8 | 9 | class FaceDataset(data.Dataset): 10 | def __init__(self, vid, face_dir, transform=None): 11 | super(FaceDataset, self).__init__() 12 | self.vid = vid 13 | self.path = os.path.join(face_dir, vid) 14 | self.transform = transform 15 | self.frames = self.get_frames() 16 | 17 | def get_frames(self): 18 | frames = glob.glob(os.path.join(self.path, '*')) 19 | # if len(frames) == 0: 20 | # raise ValueError("number of frames of video {} should not be zero.".format(self.vid)) 21 | # frames = sorted(frames, key=lambda x: int(os.path.basename(os.path.splitext(x)[0]))) 22 | # frame_ids = [int(os.path.basename(os.path.splitext(file)[0])) for file in frames] 23 | 24 | return frames 25 | 26 | def __len__(self): 27 | return len(self.frames) 28 | 29 | def __getitem__(self, index): 30 | path = self.frames[index] 31 | img = Image.open(path) 32 | if self.transform is not None: 33 | img = self.transform(img) 34 | # fid = int(os.path.basename(os.path.splitext(path)[0])) 35 | name = os.path.basename(path)[:-4] 36 | return img, name 37 | 38 | 39 | 40 | class FaceDatasetForEmoNet(data.Dataset): 41 | def __init__(self, vid, face_dir, transform=None, augmentor=None): 42 | super(FaceDatasetForEmoNet, self).__init__() 43 | self.vid = vid 44 | self.path = os.path.join(face_dir, vid) 45 | self.augmentor = augmentor 46 | self.transform = transform 47 | self.frames = self.get_frames() 48 | 49 | def get_frames(self): 50 | frames = glob.glob(os.path.join(self.path, '*')) 51 | # frames = sorted(frames, key=lambda x: int(os.path.basename(os.path.splitext(x)[0]))) 52 | return frames 53 | 54 | def __len__(self): 55 | return len(self.frames) 56 | 57 | def __getitem__(self, index): 58 | path = self.frames[index] 59 | img = io.imread(path) 60 | if self.augmentor is not None: 61 | img = self.augmentor(img)[0] 62 | if self.transform is not None: 63 | img = self.transform(img) 64 | # fid = int(os.path.basename(os.path.splitext(path)[0])) 65 | # return img, fid 66 | name = os.path.basename(path)[:-4] 67 | return img, name -------------------------------------------------------------------------------- /feature_extraction/visual/emonet/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.0' 2 | -------------------------------------------------------------------------------- /feature_extraction/visual/emonet/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/emonet/__pycache__/__init__.cpython-38.pyc 
-------------------------------------------------------------------------------- /feature_extraction/visual/emonet/__pycache__/data_augmentation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/emonet/__pycache__/data_augmentation.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/emonet/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .affecnet import AffectNet 2 | -------------------------------------------------------------------------------- /feature_extraction/visual/emonet/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ACC(ground_truth, predictions): 5 | """Evaluates the mean accuracy 6 | """ 7 | return np.mean(ground_truth.astype(int) == predictions.astype(int)) 8 | 9 | def RMSE(ground_truth, predictions): 10 | """ 11 | Evaluates the RMSE between estimate and ground truth. 12 | """ 13 | return np.sqrt(np.mean((ground_truth-predictions)**2)) 14 | 15 | 16 | def SAGR(ground_truth, predictions): 17 | """ 18 | Evaluates the SAGR between estimate and ground truth. 19 | """ 20 | return np.mean(np.sign(ground_truth) == np.sign(predictions)) 21 | 22 | 23 | def PCC(ground_truth, predictions): 24 | """ 25 | Evaluates the Pearson Correlation Coefficient. 26 | Inputs are numpy arrays. 27 | Corr = Cov(GT, Est)/(std(GT)std(Est)) 28 | """ 29 | return np.corrcoef(ground_truth, predictions)[0,1] 30 | 31 | 32 | def CCC(ground_truth, predictions): 33 | """ 34 | Evaluates the Concordance Correlation Coefficient. 35 | Inputs are numpy arrays. 
36 | """ 37 | mean_pred = np.mean(predictions) 38 | mean_gt = np.mean(ground_truth) 39 | 40 | std_pred= np.std(predictions) 41 | std_gt = np.std(ground_truth) 42 | 43 | pearson = PCC(ground_truth, predictions) 44 | return 2.0*pearson*std_pred*std_gt/(std_pred**2+std_gt**2+(mean_pred-mean_gt)**2) 45 | 46 | def ICC(labels, predictions): 47 | """Evaluates the ICC(3, 1) 48 | """ 49 | naus = predictions.shape[1] 50 | icc = np.zeros(naus) 51 | 52 | n = predictions.shape[0] 53 | 54 | for i in range(0,naus): 55 | a = np.asmatrix(labels[:,i]).transpose() 56 | b = np.asmatrix(predictions[:,i]).transpose() 57 | dat = np.hstack((a, b)) 58 | mpt = np.mean(dat, axis=1) 59 | mpr = np.mean(dat, axis=0) 60 | tm = np.mean(mpt, axis=0) 61 | BSS = np.sum(np.square(mpt-tm))*2 62 | BMS = BSS/(n-1) 63 | RSS = np.sum(np.square(mpr-tm))*n 64 | tmp = np.square(dat - np.hstack((mpt,mpt))) 65 | WSS = np.sum(np.sum(tmp, axis=1)) 66 | ESS = WSS - RSS 67 | EMS = ESS/(n-1) 68 | icc[i] = (BMS - EMS)/(BMS + EMS) 69 | 70 | return icc 71 | -------------------------------------------------------------------------------- /feature_extraction/visual/emonet/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .emonet import EmoNet 2 | 3 | -------------------------------------------------------------------------------- /feature_extraction/visual/emonet/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/emonet/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/emonet/models/__pycache__/emonet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/emonet/models/__pycache__/emonet.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/extract_emonet_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import argparse 4 | from tqdm import tqdm 5 | import torch 6 | import torch.nn.parallel 7 | import torch.optim 8 | import torch.utils.data 9 | import torch.utils.data.distributed 10 | import torchvision.transforms as transforms 11 | import numpy as np 12 | 13 | from emonet.models.emonet import EmoNet 14 | from dataset import FaceDatasetForEmoNet 15 | from util import write_feature_to_csv, get_vids, write_feature_to_npy 16 | from emonet.data_augmentation import DataAugmentor 17 | 18 | # import config 19 | import sys 20 | sys.path.append('../../') 21 | import config 22 | 23 | def extract(data_loader, model): 24 | model.eval() 25 | with torch.no_grad(): 26 | features, timestamps = [], [] 27 | for images, names in tqdm(data_loader): 28 | images = images.cuda() 29 | embedding = model(images, return_embedding=True) 30 | features.append(embedding.cpu().detach().numpy()) 31 | timestamps.extend(names) 32 | features, timestamps = np.row_stack(features), np.array(timestamps) 33 | return features, timestamps 34 | 35 | 36 | 37 | def main(params): 38 | os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu 39 | 40 | print(f'==> Extracting emonet embedding...') 41 | # in: face dir 42 | face_dir = 
config.PATH_TO_RAW_FACE[params.dataset] 43 | # out: feature csv dir 44 | save_dir = os.path.join(config.PATH_TO_FEATURES[params.dataset], 'emonet') 45 | if not os.path.exists(save_dir): 46 | os.mkdir(save_dir) 47 | elif params.overwrite: 48 | print(f'==> Warning: overwrite save_dir "{save_dir}"!') 49 | else: 50 | raise Exception(f'==> Error: save_dir "{save_dir}" already exists, set overwrite=TRUE if needed!') 51 | 52 | # load model 53 | model = EmoNet().cuda() 54 | # model = torch.nn.DataParallel(model).cuda() 55 | checkpoint_file = os.path.join(config.PATH_TO_PRETRAINED_MODELS, 'emonet/emonet_8.pth') 56 | checkpoint = torch.load(checkpoint_file) 57 | pre_trained_dict = {k.replace('module.', ''): v for k,v in checkpoint.items()} 58 | model.load_state_dict(pre_trained_dict) 59 | 60 | # transform 61 | augmentor = DataAugmentor(256, 256) 62 | transform = transforms.Compose([transforms.ToTensor()]) 63 | 64 | # extract embedding video by video 65 | vids = get_vids(face_dir) 66 | print(f'Find total "{len(vids)}" videos.') 67 | for i, vid in enumerate(vids, 1): 68 | print(f"Processing video '{vid}' ({i}/{len(vids)})...") 69 | # forward 70 | dataset = FaceDatasetForEmoNet(vid, face_dir, transform=transform, augmentor=augmentor) 71 | if len(dataset) == 0: 72 | print("Warning: number of frames of video {} should not be zero.".format(vid)) 73 | features, timestamps = [], [] 74 | else: 75 | data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, num_workers=4, pin_memory=True) 76 | features, timestamps = extract(data_loader, model) 77 | 78 | # write 79 | # write_feature_to_csv(features, timestamps, save_dir, vid, feature_dim=feature_dim) 80 | write_feature_to_npy(features, timestamps, save_dir, vid) 81 | 82 | 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser(description='Run.') 86 | parser.add_argument('--gpu', type=str, default='5', help='gpu id') 87 | parser.add_argument('--overwrite', action='store_true', default=True, help='whether overwrite existed feature folder.') 88 | parser.add_argument('--dataset', type=str, default='BoxOfLies', help='input dataset') 89 | params = parser.parse_args() 90 | 91 | main(params) -------------------------------------------------------------------------------- /feature_extraction/visual/manet/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Zengqun Zhao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /feature_extraction/visual/manet/README.md: -------------------------------------------------------------------------------- 1 | # MA-Net 2 | 3 | PyTorch implementation of the paper *“Learning Deep Global Multi-scale and Local Attention Features 4 | for Facial Expression Recognition in the Wild”*, This work is under submission. 5 | 6 | ## Requirements 7 | - Python $\geq$3.6 8 | - PyTorch $\geq$1.2 9 | - torchvision $\geq$0.4.0 10 | - numpy 11 | - matplotlib 12 | - datetime 13 | - shutil 14 | - time 15 | - argparse 16 | - os 17 | 18 | ## Training 19 | 20 | - Step 1: download basic emotions dataset of [RAF-DB](http://www.whdeng.cn/raf/model1.html), and make sure it have the structure like following: 21 | 22 | ``` 23 | ./RAF-DB/ 24 | train/ 25 | 0/ 26 | train_09748.jpg 27 | ... 28 | train_12271.jpg 29 | 1/ 30 | ... 31 | 6/ 32 | test/ 33 | 0/ 34 | ... 35 | 6/ 36 | 37 | [Note] 0: Neutral; 1: Happiness; 2: Sadness; 3: Surprise; 4: Fear; 5: Disgust; 6: Anger 38 | ``` 39 | 40 | - Step 2: download pre-trained model from 41 | [Google Drive](https://drive.google.com/file/d/1tro_RCovLKNACt4MKYp3dmIvvxiOC2pi/view?usp=sharing), 42 | and put it into ***./checkpoint***. 43 | 44 | - Step 3: change the ***project_path*** and ***data_path*** in *main.py* to your path 45 | 46 | - Step 4: run ```python main.py ``` 47 | -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/AffectNet7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/AffectNet7.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/AffectNet8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/AffectNet8.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/CAER-S.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/CAER-S.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/FED-RO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/FED-RO.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/RAF-DB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/RAF-DB.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/SFEW.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/SFEW.png 
-------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/[02-08]-[16-22]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/[02-08]-[16-22]-cnn.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/[02-08]-[19-12]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/[02-08]-[19-12]-cnn.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/[02-08]-[21-19]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/[02-08]-[21-19]-cnn.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/[02-08]-[22-55]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/[02-08]-[22-55]-cnn.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/[02-12]-[19-11]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/[02-12]-[19-11]-cnn.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/[02-12]-[22-21]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/[02-12]-[22-21]-cnn.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/log/[05-28]-[13-07]-cnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/log/[05-28]-[13-07]-cnn.png -------------------------------------------------------------------------------- /feature_extraction/visual/manet/model/__pycache__/attention.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/model/__pycache__/attention.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/manet/model/__pycache__/manet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/manet/model/__pycache__/manet.cpython-38.pyc -------------------------------------------------------------------------------- 
/feature_extraction/visual/manet/model/attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class BasicConv(nn.Module): 7 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 8 | super(BasicConv, self).__init__() 9 | self.out_channels = out_planes 10 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 11 | self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None 12 | self.relu = nn.ReLU() if relu else None 13 | 14 | def forward(self, x): 15 | x = self.conv(x) 16 | if self.bn is not None: 17 | x = self.bn(x) 18 | if self.relu is not None: 19 | x = self.relu(x) 20 | return x 21 | 22 | 23 | class Flatten(nn.Module): 24 | def forward(self, x): 25 | return x.view(x.size(0), -1) 26 | 27 | 28 | class ChannelGate(nn.Module): 29 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']): 30 | super(ChannelGate, self).__init__() 31 | self.gate_channels = gate_channels 32 | self.mlp = nn.Sequential(Flatten(), 33 | nn.Linear(gate_channels, gate_channels // reduction_ratio), 34 | nn.ReLU(), 35 | nn.Linear(gate_channels // reduction_ratio, gate_channels)) 36 | self.pool_types = pool_types 37 | 38 | def forward(self, x): 39 | channel_att_sum = None 40 | for pool_type in self.pool_types: 41 | if pool_type == 'avg': 42 | avg_pool = F.avg_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 43 | channel_att_raw = self.mlp(avg_pool ) 44 | elif pool_type == 'max': 45 | max_pool = F.max_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3))) 46 | channel_att_raw = self.mlp(max_pool) 47 | if channel_att_sum is None: 48 | channel_att_sum = channel_att_raw 49 | else: 50 | channel_att_sum = channel_att_sum + channel_att_raw 51 | 52 | scale = torch.sigmoid(channel_att_sum).unsqueeze(2).unsqueeze(3).expand_as(x) 53 | return x * scale 54 | 55 | 56 | class ChannelPool(nn.Module): 57 | def forward(self, x): 58 | return torch.cat((torch.max(x, 1)[0].unsqueeze(1), torch.mean(x, 1).unsqueeze(1)), dim=1) 59 | 60 | 61 | class SpatialGate(nn.Module): 62 | def __init__(self): 63 | super(SpatialGate, self).__init__() 64 | kernel_size = 7 65 | self.compress = ChannelPool() 66 | self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=(kernel_size-1) // 2, relu=False) 67 | 68 | def forward(self, x): 69 | x_compress = self.compress(x) 70 | x_out = self.spatial(x_compress) 71 | scale = torch.sigmoid(x_out) 72 | return x * scale 73 | 74 | 75 | class CBAM(nn.Module): 76 | def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']): 77 | super(CBAM, self).__init__() 78 | self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types) 79 | self.SpatialGate = SpatialGate() 80 | 81 | def forward(self, x): 82 | x_out = self.ChannelGate(x) 83 | x_out = self.SpatialGate(x_out) 84 | 85 | return x_out 86 | -------------------------------------------------------------------------------- /feature_extraction/visual/manet/reorganize_rafdb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import pandas as pd 4 | import shutil 5 | 6 | 7 | rafdb_path = '/data1/sunlicai/Affective Computing/Dataset/RAF-DB/basic' 8 | src_path = os.path.join(rafdb_path, 
'Image/aligned') 9 | tgt_path = os.path.join(rafdb_path, 'Image/aligned_c') # split/class_id/img_file 10 | label_file = os.path.join(rafdb_path, 'EmoLabel/list_patition_label.txt') 11 | df = pd.read_csv(label_file, header=None, delimiter=' ') 12 | file_names, label_ids = df[0].values, df[1].values 13 | print(f'Number of images: {len(df)}.') 14 | name_to_label = dict(zip(file_names, label_ids)) 15 | img_files = glob.glob(os.path.join(src_path, '*.jpg')) 16 | 17 | for src_file in img_files: 18 | img_name = os.path.basename(src_file).replace('_aligned', '') 19 | label = name_to_label[img_name] 20 | split = img_name.split('_')[0] 21 | saved_path = os.path.join(tgt_path, split, str(label)) 22 | if not os.path.exists(saved_path): 23 | os.makedirs(saved_path) 24 | tgt_file = os.path.join(saved_path, img_name) 25 | shutil.copyfile(src_file, tgt_file) 26 | print(f'Copy "{src_file}" to "{tgt_file}".') -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | .nfs* 4 | scratch 5 | res_cache 6 | -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Samuel Albanie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/README.md: -------------------------------------------------------------------------------- 1 | ### pytorch-benchmark 2 | 3 | Some scripts for validating models on common benchmarks. Assumes at least Python3 and PyTorch 4.0. 4 | 5 | 6 | ### Supported datasets: 7 | 8 | * **ImageNet** (this is essentially just a cut-down version of the [official example](https://github.com/pytorch/examples/tree/master/imagenet)) 9 | * **Fer2013** - A dataset of greyscale faces labelled with emotions. 10 | 11 | 12 | 13 | ### References 14 | 15 | **ImageNet**: [paper](https://arxiv.org/abs/1409.0575) 16 | 17 | ``` 18 | @article{ILSVRC15, 19 | Author = {Olga Russakovsky and Jia Deng and Hao Su and Jonathan Krause and Sanjeev Satheesh and Sean Ma and Zhiheng Huang and Andrej Karpathy and Aditya Khosla and Michael Bernstein and Alexander C. 
Berg and Li Fei-Fei}, 20 | Title = {{ImageNet Large Scale Visual Recognition Challenge}}, 21 | Year = {2015}, 22 | journal = {International Journal of Computer Vision (IJCV)}, 23 | doi = {10.1007/s11263-015-0816-y}, 24 | volume={115}, 25 | number={3}, 26 | pages={211-252} 27 | } 28 | ``` 29 | 30 | **FER2013**: [paper](https://arxiv.org/abs/1307.0414) 31 | 32 | ``` 33 | @inproceedings{goodfellow2013challenges, 34 | title={Challenges in representation learning: A report on three machine learning contests}, 35 | author={Goodfellow, Ian J and Erhan, Dumitru and Carrier, Pierre Luc and Courville, Aaron and Mirza, Mehdi and Hamner, Ben and Cukierski, Will and Tang, Yichuan and Thaler, David and Lee, Dong-Hyun and others}, 36 | booktitle={International Conference on Neural Information Processing}, 37 | pages={117--124}, 38 | year={2013}, 39 | organization={Springer} 40 | } 41 | ``` 42 | 43 | -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/fer2013/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/pytorch-benchmarks/fer2013/__init__.py -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/fer2013/fer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Fer2013 benchmark 3 | 4 | The module evaluates the performance of a pytorch model on the FER2013 5 | benchmark. 6 | """ 7 | 8 | from __future__ import division 9 | 10 | import os 11 | import time 12 | 13 | import torch 14 | import numpy as np 15 | import torch.utils.data 16 | import torch.backends.cudnn as cudnn 17 | from fer2013.fer_loader import Fer2013Dataset, Fer2013PlusDataset 18 | from utils.benchmark_helpers import compose_transforms 19 | 20 | def fer2013_benchmark(model, data_dir, res_cache, refresh_cache, 21 | batch_size=256, num_workers=2, fer_plus=False): 22 | if not refresh_cache: # load result from cache, if available 23 | if os.path.isfile(res_cache): 24 | res = torch.load(res_cache) 25 | prec1_val, prec1_test = res['prec1_val'], res['prec1_test'] 26 | print("=> loaded results from '{}'".format(res_cache)) 27 | info = (prec1_val, prec1_test, res['speed']) 28 | msg = 'val acc: {:.2f}, test acc: {:.2f}, Speed: {:.1f}Hz' 29 | print(msg.format(*info)) 30 | return 31 | 32 | meta = model.meta 33 | cudnn.benchmark = True 34 | model = torch.nn.DataParallel(model).cuda() 35 | preproc_transforms = compose_transforms(meta, center_crop=False) 36 | if fer_plus: 37 | dataset = Fer2013PlusDataset 38 | else: 39 | dataset = Fer2013Dataset 40 | speeds = [] 41 | res = {} 42 | for mode in 'val', 'test': 43 | loader = torch.utils.data.DataLoader( 44 | dataset(data_dir, mode=mode, transform=preproc_transforms), 45 | batch_size=batch_size, shuffle=False, 46 | num_workers=num_workers, pin_memory=True) 47 | prec1, speed = validate(loader, model, mode) 48 | res['prec1_{}'.format(mode)] = prec1 49 | speeds.append(speed) 50 | res['speed'] = np.mean(speed) 51 | torch.save(res, res_cache) 52 | 53 | def validate(val_loader, model, mode): 54 | model.eval() 55 | top1 = AverageMeter() 56 | speed = WarmupAverageMeter() 57 | end = time.time() 58 | with torch.no_grad(): 59 | for ii, (ims, target) in enumerate(val_loader): 60 | # target = target.cuda(async=True) 61 | target = target.cuda() 62 | 
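            # forward pass; the model was wrapped in nn.DataParallel above, so the batch
            # is split across all visible GPUs and the outputs are gathered back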
output = model(ims) # compute output 63 | prec1, = accuracy(output.data, target, topk=(1,)) 64 | top1.update(prec1[0], ims.size(0)) 65 | speed.update(time.time() - end, ims.size(0)) 66 | end = time.time() 67 | if ii % 10 == 0: 68 | msg = ('{0}: [{1}/{2}]\tSpeed {speed.current:.1f}Hz\t' 69 | '({speed.avg:.1f})Hz\tPrec@1 {top1.avg:.3f}') 70 | print(msg.format(mode, ii, len(val_loader), 71 | speed=speed, top1=top1)) 72 | print(' * Accuracy {0:.3f}'.format(top1.avg)) 73 | return top1.avg, speed.avg 74 | 75 | class WarmupAverageMeter(object): 76 | """Computes and stores the average and current value, after a fixed 77 | warmup period (useful for approximate benchmarking) 78 | 79 | Args: 80 | warmup (int) [3]: The number of updates to be ignored before the 81 | average starts to be computed. 82 | """ 83 | def __init__(self, warmup=3): 84 | self.reset() 85 | self.warmup = warmup 86 | 87 | def reset(self): 88 | self.avg = 0 89 | self.current = 0 90 | self.delta_sum = 0 91 | self.count = 0 92 | self.warmup_count = 0 93 | 94 | def update(self, delta, n): 95 | self.warmup_count = self.warmup_count + 1 96 | if self.warmup_count >= self.warmup: 97 | self.current = n / delta 98 | self.delta_sum += delta 99 | self.count += n 100 | self.avg = self.count / self.delta_sum 101 | 102 | class AverageMeter(object): 103 | """Computes and stores the average and current value""" 104 | def __init__(self): 105 | self.reset() 106 | 107 | def reset(self): 108 | self.val = 0 109 | self.avg = 0 110 | self.sum = 0 111 | self.count = 0 112 | 113 | def update(self, val, n=1): 114 | self.val = val 115 | self.sum += val * n 116 | self.count += n 117 | self.avg = self.sum / self.count 118 | 119 | def accuracy(output, target, topk=(1,)): 120 | """Computes the precision@k for the specified values of k""" 121 | maxk = max(topk) 122 | batch_size = target.size(0) 123 | output = output.squeeze(-1).squeeze(-1) 124 | _, pred = output.topk(maxk, 1, True, True) 125 | pred = pred.t() 126 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 127 | 128 | res = [] 129 | for k in topk: 130 | correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) 131 | res.append(correct_k.mul_(100.0 / batch_size)) 132 | return res 133 | -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/imagenet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/pytorch-benchmarks/imagenet/__init__.py -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/model/alexnet_face_fer_bn_dag.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Alexnet_face_fer_bn_dag(nn.Module): 8 | 9 | def __init__(self): 10 | super(Alexnet_face_fer_bn_dag, self).__init__() 11 | self.meta = {'mean': [131.09375, 103.88607788085938, 91.47599792480469], 12 | 'std': [1, 1, 1], 13 | 'imageSize': [227, 227, 3]} 14 | self.conv1 = nn.Conv2d(3, 96, kernel_size=[11, 11], stride=(4, 4)) 15 | self.bn1 = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 16 | self.relu1 = nn.ReLU() 17 | self.pool1 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 18 | self.conv2 = nn.Conv2d(96, 256, kernel_size=[5, 5], 
stride=(1, 1), padding=(2, 2), groups=2) 19 | self.bn2 = nn.BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 20 | self.relu2 = nn.ReLU() 21 | self.pool2 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 22 | self.conv3 = nn.Conv2d(256, 384, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 23 | self.bn3 = nn.BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 24 | self.relu3 = nn.ReLU() 25 | self.conv4 = nn.Conv2d(384, 384, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1), groups=2) 26 | self.bn4 = nn.BatchNorm2d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 27 | self.relu4 = nn.ReLU() 28 | self.conv5 = nn.Conv2d(384, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1), groups=2) 29 | self.bn5 = nn.BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 30 | self.relu5 = nn.ReLU() 31 | self.pool5 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 32 | self.fc6 = nn.Conv2d(256, 4096, kernel_size=[6, 6], stride=(1, 1)) 33 | self.bn6 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | self.relu6 = nn.ReLU() 35 | self.fc7 = nn.Conv2d(4096, 4096, kernel_size=[1, 1], stride=(1, 1)) 36 | self.bn7 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 37 | self.relu7 = nn.ReLU() 38 | self.fc8 = nn.Linear(in_features=4096, out_features=7, bias=True) 39 | 40 | def forward(self, data): 41 | x1 = self.conv1(data) 42 | x2 = self.bn1(x1) 43 | x3 = self.relu1(x2) 44 | x4 = self.pool1(x3) 45 | x5 = self.conv2(x4) 46 | x6 = self.bn2(x5) 47 | x7 = self.relu2(x6) 48 | x8 = self.pool2(x7) 49 | x9 = self.conv3(x8) 50 | x10 = self.bn3(x9) 51 | x11 = self.relu3(x10) 52 | x12 = self.conv4(x11) 53 | x13 = self.bn4(x12) 54 | x14 = self.relu4(x13) 55 | x15 = self.conv5(x14) 56 | x16 = self.bn5(x15) 57 | x17 = self.relu5(x16) 58 | x18 = self.pool5(x17) 59 | x19 = self.fc6(x18) 60 | x20 = self.bn6(x19) 61 | x21 = self.relu6(x20) 62 | x22 = self.fc7(x21) 63 | x23 = self.bn7(x22) 64 | x24_preflatten = self.relu7(x23) 65 | x24 = x24_preflatten.view(x24_preflatten.size(0), -1) 66 | prediction = self.fc8(x24) 67 | return prediction 68 | 69 | def alexnet_face_fer_bn_dag(weights_path=None, **kwargs): 70 | """ 71 | load imported model instance 72 | 73 | Args: 74 | weights_path (str): If set, loads model weights from the given path 75 | """ 76 | model = Alexnet_face_fer_bn_dag() 77 | if weights_path: 78 | state_dict = torch.load(weights_path) 79 | model.load_state_dict(state_dict) 80 | return model -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/model/vgg_m_face_bn_fer_dag.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Vgg_m_face_bn_fer_dag(nn.Module): 8 | 9 | def __init__(self): 10 | super(Vgg_m_face_bn_fer_dag, self).__init__() 11 | self.meta = {'mean': [131.45376586914062, 103.98748016357422, 91.46234893798828], 12 | 'std': [1, 1, 1], 13 | 'imageSize': [224, 224, 3]} 14 | self.conv1 = nn.Conv2d(3, 96, kernel_size=[7, 7], stride=(2, 2)) 15 | self.bn49 = nn.BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 16 | self.relu1 = nn.ReLU() 17 | self.pool1 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, 
ceil_mode=False) 18 | self.conv2 = nn.Conv2d(96, 256, kernel_size=[5, 5], stride=(2, 2), padding=(1, 1)) 19 | self.bn50 = nn.BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 20 | self.relu2 = nn.ReLU() 21 | self.pool2 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=(0, 0), dilation=1, ceil_mode=True) 22 | self.conv3 = nn.Conv2d(256, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 23 | self.bn51 = nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 24 | self.relu3 = nn.ReLU() 25 | self.conv4 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 26 | self.bn52 = nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 27 | self.relu4 = nn.ReLU() 28 | self.conv5 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 29 | self.bn53 = nn.BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 30 | self.relu5 = nn.ReLU() 31 | self.pool5 = nn.MaxPool2d(kernel_size=[3, 3], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 32 | self.fc6 = nn.Conv2d(512, 4096, kernel_size=[6, 6], stride=(1, 1)) 33 | self.bn54 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 34 | self.relu6 = nn.ReLU() 35 | self.fc7 = nn.Conv2d(4096, 4096, kernel_size=[1, 1], stride=(1, 1)) 36 | self.bn55 = nn.BatchNorm2d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) 37 | self.relu7 = nn.ReLU() 38 | self.fc8 = nn.Linear(in_features=4096, out_features=7, bias=True) 39 | 40 | def forward(self, data): 41 | x1 = self.conv1(data) 42 | x2 = self.bn49(x1) 43 | x3 = self.relu1(x2) 44 | x4 = self.pool1(x3) 45 | x5 = self.conv2(x4) 46 | x6 = self.bn50(x5) 47 | x7 = self.relu2(x6) 48 | x8 = self.pool2(x7) 49 | x9 = self.conv3(x8) 50 | x10 = self.bn51(x9) 51 | x11 = self.relu3(x10) 52 | x12 = self.conv4(x11) 53 | x13 = self.bn52(x12) 54 | x14 = self.relu4(x13) 55 | x15 = self.conv5(x14) 56 | x16 = self.bn53(x15) 57 | x17 = self.relu5(x16) 58 | x18 = self.pool5(x17) 59 | x19 = self.fc6(x18) 60 | x20 = self.bn54(x19) 61 | x21 = self.relu6(x20) 62 | x22 = self.fc7(x21) 63 | x23 = self.bn55(x22) 64 | x24_preflatten = self.relu7(x23) 65 | x24 = x24_preflatten.view(x24_preflatten.size(0), -1) 66 | prediction = self.fc8(x24) 67 | return prediction 68 | 69 | def vgg_m_face_bn_fer_dag(weights_path=None, **kwargs): 70 | """ 71 | load imported model instance 72 | 73 | Args: 74 | weights_path (str): If set, loads model weights from the given path 75 | """ 76 | model = Vgg_m_face_bn_fer_dag() 77 | if weights_path: 78 | state_dict = torch.load(weights_path) 79 | model.load_state_dict(state_dict) 80 | return model -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/model/vgg_vd_face_fer_dag.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class Vgg_vd_face_fer_dag(nn.Module): 8 | 9 | def __init__(self): 10 | super(Vgg_vd_face_fer_dag, self).__init__() 11 | self.meta = {'mean': [129.186279296875, 104.76238250732422, 93.59396362304688], 12 | 'std': [1, 1, 1], 13 | 'imageSize': [224, 224, 3]} 14 | self.conv1_1 = nn.Conv2d(3, 64, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 15 | self.relu1_1 = nn.ReLU() 16 | self.conv1_2 = nn.Conv2d(64, 64, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 17 | self.relu1_2 = nn.ReLU() 18 | 
self.pool1 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 19 | self.conv2_1 = nn.Conv2d(64, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 20 | self.relu2_1 = nn.ReLU() 21 | self.conv2_2 = nn.Conv2d(128, 128, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 22 | self.relu2_2 = nn.ReLU() 23 | self.pool2 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 24 | self.conv3_1 = nn.Conv2d(128, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 25 | self.relu3_1 = nn.ReLU() 26 | self.conv3_2 = nn.Conv2d(256, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 27 | self.relu3_2 = nn.ReLU() 28 | self.conv3_3 = nn.Conv2d(256, 256, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 29 | self.relu3_3 = nn.ReLU() 30 | self.pool3 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 31 | self.conv4_1 = nn.Conv2d(256, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 32 | self.relu4_1 = nn.ReLU() 33 | self.conv4_2 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 34 | self.relu4_2 = nn.ReLU() 35 | self.conv4_3 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 36 | self.relu4_3 = nn.ReLU() 37 | self.pool4 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 38 | self.conv5_1 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 39 | self.relu5_1 = nn.ReLU() 40 | self.conv5_2 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 41 | self.relu5_2 = nn.ReLU() 42 | self.conv5_3 = nn.Conv2d(512, 512, kernel_size=[3, 3], stride=(1, 1), padding=(1, 1)) 43 | self.relu5_3 = nn.ReLU() 44 | self.pool5 = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2], padding=0, dilation=1, ceil_mode=False) 45 | self.fc6 = nn.Conv2d(512, 4096, kernel_size=[7, 7], stride=(1, 1)) 46 | self.relu6 = nn.ReLU() 47 | self.fc7 = nn.Linear(in_features=4096, out_features=4096, bias=True) 48 | self.relu7 = nn.ReLU() 49 | self.fc8 = nn.Linear(in_features=4096, out_features=7, bias=True) 50 | 51 | def forward(self, data): 52 | x1 = self.conv1_1(data) 53 | x2 = self.relu1_1(x1) 54 | x3 = self.conv1_2(x2) 55 | x4 = self.relu1_2(x3) 56 | x5 = self.pool1(x4) 57 | x6 = self.conv2_1(x5) 58 | x7 = self.relu2_1(x6) 59 | x8 = self.conv2_2(x7) 60 | x9 = self.relu2_2(x8) 61 | x10 = self.pool2(x9) 62 | x11 = self.conv3_1(x10) 63 | x12 = self.relu3_1(x11) 64 | x13 = self.conv3_2(x12) 65 | x14 = self.relu3_2(x13) 66 | x15 = self.conv3_3(x14) 67 | x16 = self.relu3_3(x15) 68 | x17 = self.pool3(x16) 69 | x18 = self.conv4_1(x17) 70 | x19 = self.relu4_1(x18) 71 | x20 = self.conv4_2(x19) 72 | x21 = self.relu4_2(x20) 73 | x22 = self.conv4_3(x21) 74 | x23 = self.relu4_3(x22) 75 | x24 = self.pool4(x23) 76 | x25 = self.conv5_1(x24) 77 | x26 = self.relu5_1(x25) 78 | x27 = self.conv5_2(x26) 79 | x28 = self.relu5_2(x27) 80 | x29 = self.conv5_3(x28) 81 | x30 = self.relu5_3(x29) 82 | x31 = self.pool5(x30) 83 | x32 = self.fc6(x31) 84 | x33_preflatten = self.relu6(x32) 85 | x33 = x33_preflatten.view(x33_preflatten.size(0), -1) 86 | x34 = self.fc7(x33) 87 | x35 = self.relu7(x34) 88 | prediction = self.fc8(x35) 89 | return prediction 90 | 91 | def vgg_vd_face_fer_dag(weights_path=None, **kwargs): 92 | """ 93 | load imported model instance 94 | 95 | Args: 96 | weights_path (str): If set, loads model weights from the given path 97 | """ 98 | model = Vgg_vd_face_fer_dag() 99 | if weights_path: 100 | state_dict = 
torch.load(weights_path) 101 | model.load_state_dict(state_dict) 102 | return model -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/run_fer_benchmarks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """This module evaluates imported PyTorch models on fer2013 3 | """ 4 | 5 | import os 6 | import argparse 7 | from os.path import join as pjoin 8 | from fer2013.fer import fer2013_benchmark 9 | from utils.benchmark_helpers import load_module_2or3 10 | 11 | # MODEL_DIR = os.path.expanduser('~/data/models/pytorch/mcn_imports') 12 | # FER_DIR = os.path.expanduser('~/data/datasets/fer2013+') 13 | MODEL_DIR = './pretrained/' 14 | FER_DIR = os.path.expanduser('~/Affective Computing/Dataset/FERPlus') 15 | 16 | CACHE_DIR = 'res_cache/fer2013+' 17 | 18 | def load_model(model_name): 19 | """Load imported PyTorch model by name 20 | 21 | Args: 22 | model_name (str): the name of the model to be loaded 23 | 24 | Return: 25 | nn.Module: the loaded network 26 | """ 27 | model_def_path = pjoin('model', model_name + '.py') 28 | weights_path = pjoin(MODEL_DIR, model_name + '.pth') 29 | mod = load_module_2or3(model_name, model_def_path) 30 | func = getattr(mod, model_name) 31 | net = func(weights_path=weights_path) 32 | return net 33 | 34 | def run_benchmarks(gpus, refresh, fer_plus): 35 | """Run benchmarks for imported models 36 | 37 | Args: 38 | gpus (str): comma separated gpu device identifiers 39 | refresh (bool): whether to overwrite the results of existing runs 40 | fer_plus (bool): whether to evaluate on the ferplus benchmark, 41 | rather than the standard fer benchmark. 42 | """ 43 | 44 | # Select models (and their batch sizes) to include in the benchmark.
45 | if fer_plus: 46 | model_list = [ 47 | ('resnet50_ferplus_dag', 32), 48 | ('senet50_ferplus_dag', 32), 49 | ] 50 | else: 51 | model_list = [ 52 | ('alexnet_face_fer_bn_dag', 32), 53 | ('vgg_m_face_bn_fer_dag', 32), 54 | ('vgg_vd_face_fer_dag', 32), 55 | ] 56 | 57 | if not os.path.exists(CACHE_DIR): 58 | os.makedirs(CACHE_DIR) 59 | os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 60 | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpus) 61 | 62 | opts = {'data_dir': FER_DIR, 'refresh_cache': refresh} 63 | 64 | for model_name, batch_size in model_list: 65 | cache_name = model_name 66 | if fer_plus: 67 | cache_name = cache_name + 'fer_plus' 68 | opts['res_cache'] = '{}/{}.pth'.format(CACHE_DIR, cache_name) 69 | opts['fer_plus'] = fer_plus 70 | model = load_model(model_name) 71 | print('benchmarking {}'.format(model_name)) 72 | fer2013_benchmark(model, batch_size=batch_size, **opts) 73 | 74 | parser = argparse.ArgumentParser(description='Run PyTorch benchmarks.') 75 | parser.add_argument('--gpus', nargs='?', dest='gpus', 76 | help='select gpu device id') 77 | parser.add_argument('--refresh', dest='refresh', action='store_true', 78 | help='refresh results cache') 79 | parser.add_argument('--ferplus', dest='ferplus', action='store_true', 80 | help='run ferplus (rather than fer) benchmarks') 81 | parser.set_defaults(gpus=None) 82 | parser.set_defaults(refresh=False) 83 | parsed = parser.parse_args() 84 | 85 | if __name__ == '__main__': 86 | run_benchmarks(parsed.gpus, parsed.refresh, parsed.ferplus) 87 | -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeroQiaoba/GCNet/8cce4a0c9a50172abfa79971e77c71c40c1d733d/feature_extraction/visual/pytorch-benchmarks/utils/__init__.py -------------------------------------------------------------------------------- /feature_extraction/visual/pytorch-benchmarks/utils/benchmark_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Utilities shared among the benchmarking protocols 3 | """ 4 | import os 5 | import sys 6 | import six 7 | 8 | import torchvision.transforms as transforms 9 | 10 | 11 | def compose_transforms(meta, resize=256, center_crop=True, 12 | override_meta_imsize=False): 13 | """Compose preprocessing transforms for model 14 | 15 | The imported models use a range of different preprocessing options, 16 | depending on how they were originally trained. Models trained in MatConvNet 17 | typically require input images that have been scaled to [0,255], rather 18 | than the [0,1] range favoured by PyTorch. 19 | 20 | Args: 21 | meta (dict): model preprocessing requirements 22 | resize (int) [256]: resize the input image to this size 23 | center_crop (bool) [True]: whether to center crop the image 24 | override_meta_imsize (bool) [False]: if true, use the value of `resize` 25 | to select the image input size, rather than the properties contained 26 | in meta (this option only applies when center cropping is not used).
27 | 28 | Return: 29 | (transforms.Compose): Composition of preprocessing transforms 30 | """ 31 | normalize = transforms.Normalize(mean=meta['mean'], std=meta['std']) 32 | im_size = meta['imageSize'] 33 | assert im_size[0] == im_size[1], 'expected square image size' 34 | if center_crop: 35 | transform_list = [transforms.Resize(resize), 36 | transforms.CenterCrop(size=(im_size[0], im_size[1]))] 37 | else: 38 | if override_meta_imsize: 39 | im_size = (resize, resize) 40 | transform_list = [transforms.Resize(size=(im_size[0], im_size[1]))] 41 | transform_list += [transforms.ToTensor()] 42 | if meta['std'] == [1, 1, 1]: # common amongst mcn models 43 | transform_list += [lambda x: x * 255.0] 44 | transform_list.append(normalize) 45 | return transforms.Compose(transform_list) 46 | 47 | 48 | def load_module_2or3(model_name, model_def_path): 49 | """Load model definition module in a manner that is compatible with 50 | both Python2 and Python3 51 | 52 | Args: 53 | model_name: The name of the model to be loaded 54 | model_def_path: The filepath of the module containing the definition 55 | 56 | Return: 57 | The loaded python module.""" 58 | if six.PY3: 59 | import importlib.util 60 | spec = importlib.util.spec_from_file_location(model_name, model_def_path) 61 | mod = importlib.util.module_from_spec(spec) 62 | spec.loader.exec_module(mod) 63 | else: 64 | import importlib 65 | dirname = os.path.dirname(model_def_path) 66 | sys.path.insert(0, dirname) 67 | module_name = os.path.splitext(os.path.basename(model_def_path))[0] 68 | mod = importlib.import_module(module_name) 69 | return mod 70 | -------------------------------------------------------------------------------- /gcnet/loss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import glob 4 | import pickle 5 | import random 6 | import argparse 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from torch.autograd import Variable 12 | from torch.nn.utils.rnn import pad_sequence 13 | from torch_geometric.nn import RGCNConv, GraphConv 14 | 15 | 16 | ## for reconstruction [only recon loss on miss part] 17 | class MaskedReconLoss(nn.Module): 18 | 19 | def __init__(self): 20 | super(MaskedReconLoss, self).__init__() 21 | self.loss = nn.MSELoss(reduction='none') 22 | 23 | def forward(self, recon_input, target_input, input_mask, umask, adim, tdim, vdim): 24 | """ ? => refer to spk and modality 25 | recon_input -> ? * [seqlen, batch, dim] 26 | target_input -> ? * [seqlen, batch, dim] 27 | input_mask -> ? 
* [seqlen, batch, dim] 28 | umask -> [batch, seqlen] 29 | """ 30 | assert len(recon_input) == 1 31 | recon = recon_input[0] # [seqlen, batch, dim] 32 | target = target_input[0] # [seqlen, batch, dim] 33 | mask = input_mask[0] # [seqlen, batch, 3] 34 | 35 | recon = torch.reshape(recon, (-1, recon.size(2))) # [seqlen*batch, dim] 36 | target = torch.reshape(target, (-1, target.size(2))) # [seqlen*batch, dim] 37 | mask = torch.reshape(mask, (-1, mask.size(2))) # [seqlen*batch, 3] 1(exist); 0(mask) 38 | umask = torch.reshape(umask, (-1, 1)) # [seqlen*batch, 1] 39 | 40 | A_rec = recon[:, :adim] 41 | L_rec = recon[:, adim:adim+tdim] 42 | V_rec = recon[:, adim+tdim:] 43 | A_full = target[:, :adim] 44 | L_full = target[:, adim:adim+tdim] 45 | V_full = target[:, adim+tdim:] 46 | A_miss_index = torch.reshape(mask[:, 0], (-1, 1)) 47 | L_miss_index = torch.reshape(mask[:, 1], (-1, 1)) 48 | V_miss_index = torch.reshape(mask[:, 2], (-1, 1)) 49 | 50 | loss_recon1 = self.loss(A_rec*umask, A_full*umask) * -1 * (A_miss_index - 1) 51 | loss_recon2 = self.loss(L_rec*umask, L_full*umask) * -1 * (L_miss_index - 1) 52 | loss_recon3 = self.loss(V_rec*umask, V_full*umask) * -1 * (V_miss_index - 1) 53 | loss_recon1 = torch.sum(loss_recon1) / adim 54 | loss_recon2 = torch.sum(loss_recon2) / tdim 55 | loss_recon3 = torch.sum(loss_recon3) / vdim 56 | loss_recon = (loss_recon1 + loss_recon2 + loss_recon3) / torch.sum(umask) 57 | 58 | return loss_recon 59 | 60 | 61 | ## iemocap loss function: same with CE loss 62 | class MaskedCELoss(nn.Module): 63 | 64 | def __init__(self): 65 | super(MaskedCELoss, self).__init__() 66 | self.loss = nn.NLLLoss(reduction='sum') 67 | 68 | def forward(self, pred, target, umask): 69 | """ 70 | pred -> [batch*seq_len, n_classes] 71 | target -> [batch*seq_len] 72 | umask -> [batch, seq_len] 73 | """ 74 | umask = umask.view(-1,1) # [batch*seq_len, 1] 75 | target = target.view(-1,1) # [batch*seq_len, 1] 76 | pred = F.log_softmax(pred, 1) # [batch*seqlen, n_classes] 77 | loss = self.loss(pred*umask, (target*umask).squeeze().long()) / torch.sum(umask) 78 | return loss 79 | 80 | 81 | ## for cmumosi and cmumosei loss calculation 82 | class MaskedMSELoss(nn.Module): 83 | 84 | def __init__(self): 85 | super(MaskedMSELoss, self).__init__() 86 | self.loss = nn.MSELoss(reduction='sum') 87 | 88 | def forward(self, pred, target, umask): 89 | """ 90 | pred -> [batch*seq_len] 91 | target -> [batch*seq_len] 92 | umask -> [batch*seq_len] 93 | """ 94 | pred = pred.view(-1, 1) # [batch*seq_len, 1] 95 | target = target.view(-1, 1) # [batch*seq_len, 1] 96 | umask = umask.view(-1, 1) # [batch*seq_len, 1] 97 | loss = self.loss(pred*umask, target*umask) / torch.sum(umask) 98 | return loss -------------------------------------------------------------------------------- /gcnet/module.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import glob 4 | import pickle 5 | import random 6 | import argparse 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | from torch.nn.utils.rnn import pad_sequence 15 | from torch_geometric.nn import RGCNConv, GraphConv 16 | 17 | 18 | class MatchingAttention(nn.Module): 19 | 20 | def __init__(self, mem_dim, cand_dim, alpha_dim=None, att_type='general'): 21 | super(MatchingAttention, self).__init__() 22 | assert att_type!='concat' or alpha_dim!=None 23 | assert att_type!='dot' or mem_dim==cand_dim 
24 | self.mem_dim = mem_dim 25 | self.cand_dim = cand_dim 26 | self.att_type = att_type 27 | if att_type=='general': 28 | self.transform = nn.Linear(cand_dim, mem_dim, bias=False) 29 | if att_type=='general2': 30 | self.transform = nn.Linear(cand_dim, mem_dim, bias=True) 31 | elif att_type=='concat': 32 | self.transform = nn.Linear(cand_dim+mem_dim, alpha_dim, bias=False) 33 | self.vector_prod = nn.Linear(alpha_dim, 1, bias=False) 34 | 35 | def forward(self, M, x, mask=None): 36 | """ 37 | M -> (seq_len, batch, mem_dim) 38 | x -> (batch, cand_dim) 39 | mask -> (batch, seq_len) 40 | """ 41 | if type(mask)==type(None): 42 | mask = torch.ones(M.size(1), M.size(0)).type(M.type()) # [batch, seq_len] 43 | 44 | if self.att_type=='dot': 45 | M_ = M.permute(1,2,0) # batch, vector, seqlen 46 | x_ = x.unsqueeze(1) # batch, 1, vector 47 | alpha = F.softmax(torch.bmm(x_, M_), dim=2) # batch, 1, seqlen 48 | elif self.att_type=='general': 49 | M_ = M.permute(1,2,0) # batch, mem_dim, seqlen 50 | x_ = self.transform(x).unsqueeze(1) # batch, 1, mem_dim 51 | alpha = F.softmax(torch.bmm(x_, M_), dim=2) # batch, 1, seqlen 52 | elif self.att_type=='general2': 53 | M_ = M.permute(1,2,0) # [batch, mem_dim, seqlen] 54 | x_ = self.transform(x).unsqueeze(1) # [batch, 1, mem_dim] 55 | mask_ = mask.unsqueeze(2).repeat(1, 1, self.mem_dim).transpose(1, 2) # [batch, mem_dim, seq_len] 56 | M_ = M_ * mask_ # [batch, mem_dim, seqlen] 57 | alpha_ = torch.bmm(x_, M_)*mask.unsqueeze(1) # attention value: [batch, 1, seqlen] 58 | alpha_ = torch.tanh(alpha_) 59 | alpha_ = F.softmax(alpha_, dim=2) # [batch, 1, seqlen] 60 | alpha_masked = alpha_*mask.unsqueeze(1) # [batch, 1, seqlen] 61 | alpha_sum = torch.sum(alpha_masked, dim=2, keepdim=True) # [batch, 1, 1] 62 | alpha = alpha_masked/alpha_sum # normalized attention: [batch, 1, seqlen] 63 | # alpha = torch.where(alpha.isnan(), alpha_masked, alpha) 64 | else: 65 | M_ = M.transpose(0,1) # batch, seqlen, mem_dim 66 | x_ = x.unsqueeze(1).expand(-1,M.size()[0],-1) # batch, seqlen, cand_dim 67 | M_x_ = torch.cat([M_,x_],2) # batch, seqlen, mem_dim+cand_dim 68 | mx_a = F.tanh(self.transform(M_x_)) # batch, seqlen, alpha_dim 69 | alpha = F.softmax(self.vector_prod(mx_a),1).transpose(1,2) # [batch, 1, seqlen] 70 | 71 | attn_pool = torch.bmm(alpha, M.transpose(0,1))[:,0,:] # [batch, mem_dim] 72 | return attn_pool, alpha 73 | 74 | 75 | # change [num_utterance, dim] => [seqlen, batch, dim] 76 | def utterance_to_conversation(outputs, seq_lengths, umask, no_cuda): 77 | input_conversation_length = torch.tensor(seq_lengths) # [6, 24, 13, 9] 78 | start_zero = input_conversation_length.data.new(1).zero_() # [0] 79 | 80 | if not no_cuda: 81 | input_conversation_length = input_conversation_length.cuda() 82 | start_zero = start_zero.cuda() 83 | 84 | max_len = max(seq_lengths) # [int] 85 | start = torch.cumsum(torch.cat((start_zero, input_conversation_length[:-1])), 0) # [0, 6, 30, 43] 86 | 87 | outputs = torch.stack([pad(outputs.narrow(0, s, l), max_len, no_cuda) # [seqlen, batch, dim] 88 | for s, l in zip(start.data.tolist(), 89 | input_conversation_length.data.tolist())], 0).transpose(0, 1) 90 | return outputs 91 | 92 | 93 | def pad(tensor, length, no_cuda): 94 | if isinstance(tensor, Variable): 95 | var = tensor 96 | if length > var.size(0): 97 | if not no_cuda: 98 | return torch.cat([var, torch.zeros(length - var.size(0), *var.size()[1:]).cuda()]) 99 | else: 100 | return torch.cat([var, torch.zeros(length - var.size(0), *var.size()[1:])]) 101 | else: 102 | return var 103 | else: 104 | if length > 
tensor.size(0): 105 | if not no_cuda: 106 | return torch.cat([tensor, torch.zeros(length - tensor.size(0), *tensor.size()[1:]).cuda()]) 107 | else: 108 | return torch.cat([tensor, torch.zeros(length - tensor.size(0), *tensor.size()[1:])]) 109 | else: 110 | return tensor 111 | -------------------------------------------------------------------------------- /requirements-cpmnet.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.0.0 2 | astor==0.8.1 3 | backcall==0.2.0 4 | cached-property==1.5.2 5 | certifi==2021.10.8 6 | decorator==5.1.1 7 | gast==0.2.2 8 | google-pasta==0.2.0 9 | grpcio==1.43.0 10 | h5py==3.6.0 11 | importlib-metadata==4.10.1 12 | ipython==7.20.0 13 | joblib==1.1.0 14 | Keras-Applications==1.0.8 15 | Keras-Preprocessing==1.1.2 16 | Markdown==3.3.6 17 | mkl-fft==1.3.1 18 | mkl-random @ file:///tmp/build/80754af9/mkl_random_1626179032232/work 19 | mkl-service==2.4.0 20 | numpy @ file:///tmp/build/80754af9/numpy_and_numpy_base_1634106693478/work 21 | olefile==0.46 22 | opt-einsum==3.3.0 23 | pexpect==4.8.0 24 | pickleshare==0.7.5 25 | Pillow==8.4.0 26 | prompt-toolkit==3.0.29 27 | protobuf==3.19.3 28 | ptyprocess==0.7.0 29 | Pygments==2.12.0 30 | scikit-learn==1.0.2 31 | scipy==1.1.0 32 | six @ file:///tmp/build/80754af9/six_1623709665295/work 33 | tensorboard==1.15.0 34 | tensorflow-estimator==1.15.1 35 | tensorflow-gpu==1.15.0 36 | termcolor==1.1.0 37 | threadpoolctl==3.0.0 38 | torch==1.4.0 39 | torchvision==0.5.0 40 | tqdm==4.62.3 41 | traitlets==5.1.0 42 | typing_extensions==4.0.1 43 | wcwidth==0.2.5 44 | Werkzeug==2.0.2 45 | wrapt==1.13.3 46 | zipp==3.7.0 47 | --------------------------------------------------------------------------------
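The masked reconstruction loss in `gcnet/loss.py` above only penalizes the modalities that were dropped for each utterance: the `-1 * (miss_index - 1)` factor keeps the MSE terms where the modality mask is 0 (missing) and zeroes out the terms where it is 1 (observed), before normalizing by the number of valid utterances. Below is a minimal usage sketch with dummy tensors; the import path, feature dimensions, and random inputs are illustrative assumptions, not values taken from the repository.

```python
# Minimal sketch of driving MaskedReconLoss with dummy data (illustrative only).
# Assumes the repository root is on PYTHONPATH and its dependencies
# (torch, torch_geometric) are installed; the feature sizes below are hypothetical.
import torch

from gcnet.loss import MaskedReconLoss

seqlen, batch = 4, 2
adim, tdim, vdim = 100, 768, 342                 # hypothetical audio/text/visual dims
feat_dim = adim + tdim + vdim

recon  = torch.randn(seqlen, batch, feat_dim)    # reconstructed features
target = torch.randn(seqlen, batch, feat_dim)    # complete (ground-truth) features

# Modality mask: 1 = modality observed, 0 = modality missing, per utterance.
modality_mask = torch.randint(0, 2, (seqlen, batch, 3)).float()
# Utterance mask: 1 = real utterance, 0 = padding.
umask = torch.ones(batch, seqlen)

criterion = MaskedReconLoss()
loss = criterion([recon], [target], [modality_mask], umask, adim, tdim, vdim)
print(loss.item())  # scalar; only missing-modality positions contribute
```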