├── DepressionCollected ├── Classification │ ├── AudioModelChecking.py │ ├── AudioTraditionalClassifiers.py │ ├── FuseModelChecking.py │ ├── TextModelChecking.py │ ├── TextTraditionalClassifiers.py │ ├── audio_features_whole.py │ ├── audio_gru_whole.py │ ├── fuse_net_whole.py │ ├── text_bilstm_whole.py │ └── text_features_whole.py ├── DAICFeatureExtarction │ ├── feature_extraction.py │ └── queries.txt └── Regression │ ├── AudioModelChecking.py │ ├── audio_bilstm_perm.py │ ├── fuse_net.py │ └── text_bilstm_perm.py └── README.md /DepressionCollected/Classification/AudioModelChecking.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | import torch.optim as optim 6 | from sklearn.metrics import confusion_matrix 7 | import numpy as np 8 | import pandas as pd 9 | import wave 10 | import re 11 | import os 12 | import tensorflow.compat.v1 as tf 13 | import random 14 | import itertools 15 | from audio_gru_whole import AudioBiLSTM 16 | 17 | from sklearn.preprocessing import StandardScaler 18 | import pickle 19 | 20 | class BiLSTM(nn.Module): 21 | def __init__(self, rnn_layers, dropout, num_classes, audio_hidden_dims, audio_embed_size): 22 | super(BiLSTM, self).__init__() 23 | 24 | self.lstm_net_audio = nn.GRU(audio_embed_size, audio_hidden_dims, 25 | num_layers=rnn_layers, dropout=dropout, batch_first=True) 26 | 27 | self.fc_audio = nn.Sequential( 28 | nn.Dropout(dropout), 29 | nn.Linear(audio_hidden_dims, audio_hidden_dims), 30 | nn.ReLU(), 31 | nn.Dropout(dropout), 32 | nn.Linear(audio_hidden_dims, num_classes), 33 | # nn.ReLU(), 34 | nn.Softmax(dim=1) 35 | ) 36 | 37 | def forward(self, x): 38 | x, _ = self.lstm_net_audio(x) 39 | # x = self.bn(x) 40 | x = x.sum(dim=1) 41 | out = self.fc_audio(x) 42 | return out 43 | 44 | # prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 45 | # audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/whole_samples_clf_avid256.npz'))['arr_0'], axis=2) 46 | # audio_targets = np.load(os.path.join(prefix, 'Features/Audio/whole_labels_clf_avid256.npz'))['arr_0'] 47 | 48 | prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 49 | audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) 50 | audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] 51 | 52 | audio_dep_idxs = np.where(audio_targets == 1)[0] 53 | audio_non_idxs = np.where(audio_targets == 0)[0] 54 | 55 | def standard_confusion_matrix(y_test, y_test_pred): 56 | """ 57 | Make confusion matrix with format: 58 | ----------- 59 | | TP | FP | 60 | ----------- 61 | | FN | TN | 62 | ----------- 63 | Parameters 64 | ---------- 65 | y_true : ndarray - 1D 66 | y_pred : ndarray - 1D 67 | 68 | Returns 69 | ------- 70 | ndarray - 2D 71 | """ 72 | [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) 73 | return np.array([[tp, fp], [fn, tn]]) 74 | 75 | def model_performance(y_test, y_test_pred_proba): 76 | """ 77 | Evaluation metrics for network performance. 78 | """ 79 | # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] 80 | y_test_pred = y_test_pred_proba 81 | 82 | # Computing confusion matrix for test dataset 83 | conf_matrix = standard_confusion_matrix(y_test, y_test_pred) 84 | print("Confusion Matrix:") 85 | print(conf_matrix) 86 | 87 | return y_test_pred, conf_matrix 88 | 89 | config = { 90 | 'num_classes': 2, 91 | 'dropout': 0.5, 92 | 'rnn_layers': 2, 93 | 'embedding_size': 256, 94 | 'batch_size': 4, 95 | 'epochs': 100, 96 | 'learning_rate': 1e-5, 97 | 'hidden_dims': 256, 98 | 'bidirectional': False, 99 | 'cuda': False 100 | } 101 | 102 | # audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio/BiLSTM_gru_vlad256_256_0.80.pt')) 103 | # audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio3/BiLSTM_gru_vlad256_256_0.89.pt')) 104 | # audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio2/BiLSTM_gru_vlad256_256_0.65.pt')) 105 | 106 | # model = BiLSTM(config['rnn_layers'], config['dropout'], config['num_classes'], \ 107 | # config['hidden_dims'], config['embedding_size']) 108 | 109 | # model_state_dict = {} 110 | # model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] 111 | # model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] 112 | # model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] 113 | # model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] 114 | 115 | # model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] 116 | # model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] 117 | # model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] 118 | # model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] 119 | 120 | # model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] 121 | # model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] 122 | # model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] 123 | # model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] 124 | # model_state_dict = audio_lstm_model.state_dict() 125 | # model.load_state_dict(model_state_dict, strict=False) 126 | 127 | def evaluate(model, test_idxs): 128 | model.eval() 129 | batch_idx = 1 130 | total_loss = 0 131 | pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor) 132 | # X_test = audio_features[test_dep_idxs+test_non_idxs] 133 | # Y_test = audio_targets[test_dep_idxs+test_non_idxs] 134 | X_test = audio_features[test_idxs] 135 | Y_test = audio_targets[test_idxs] 136 | global max_train_acc, max_acc,max_f1 137 | for i in range(0, X_test.shape[0], config['batch_size']): 138 | if i + config['batch_size'] > X_test.shape[0]: 139 | x, y = X_test[i:], Y_test[i:] 140 | else: 141 | x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])] 142 | if config['cuda']: 143 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() 144 | else: 145 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), Variable(torch.from_numpy(y)) 146 | with torch.no_grad(): 147 | output = model(x.squeeze(2)) 148 | pred = torch.cat((pred, output.data.max(1, keepdim=True)[1])) 149 | 150 | y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:]) 151 | print('Calculating additional test metrics...') 152 | accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) 153 | precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) 154 | recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) 155 | f1_score = 2 * (precision * recall) / (precision + recall) 156 | print("Accuracy: {}".format(accuracy)) 157 | print("Precision: {}".format(precision)) 158 | print("Recall: {}".format(recall)) 159 | print("F1-Score: {}\n".format(f1_score)) 160 | print('='*89) 161 | return precision, recall, f1_score 162 | 163 | 164 | # evaluate(audio_features_test, fuse_targets_test, audio_lstm_model) 165 | # evaluate(model) 166 | 167 | idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy'] 168 | audio_model_paths = ['BiLSTM_gru_vlad256_256_0.67_1.pt', 'BiLSTM_gru_vlad256_256_0.67_2.pt', 'BiLSTM_gru_vlad256_256_0.63_3.pt'] 169 | ps, rs, fs = [], [], [] 170 | for fold in range(3): 171 | train_idxs_tmp = np.load(os.path.join(prefix, 'Features/TextWhole/{}'.format(idxs_paths[fold])), allow_pickle=True) 172 | test_idxs_tmp = list(set(list(audio_dep_idxs)+list(audio_non_idxs)) - set(train_idxs_tmp)) 173 | audio_lstm_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Audio/{}'.format(audio_model_paths[fold]))) 174 | 175 | train_idxs, test_idxs = [], [] 176 | for idx in train_idxs_tmp: 177 | if idx in audio_dep_idxs: 178 | feat = audio_features[idx] 179 | count = 0 180 | resample_idxs = [0,1,2,3,4,5] 181 | for i in itertools.permutations(feat, feat.shape[0]): 182 | if count in resample_idxs: 183 | audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 184 | audio_targets = np.hstack((audio_targets, 1)) 185 | train_idxs.append(len(audio_features)-1) 186 | count += 1 187 | else: 188 | train_idxs.append(idx) 189 | 190 | for idx in test_idxs_tmp: 191 | if idx in audio_dep_idxs: 192 | feat = audio_features[idx] 193 | count = 0 194 | # resample_idxs = random.sample(range(6), 4) 195 | resample_idxs = [0,1,4,5] 196 | for i in itertools.permutations(feat, feat.shape[0]): 197 | if count in resample_idxs: 198 | audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 199 | audio_targets = np.hstack((audio_targets, 1)) 200 | test_idxs.append(len(audio_features)-1) 201 | count += 1 202 | else: 203 | test_idxs.append(idx) 204 | p, r, f = evaluate(audio_lstm_model, test_idxs) 205 | ps.append(p) 206 | rs.append(r) 207 | fs.append(f) 208 | print('precison: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs))) 209 | 210 | 211 | -------------------------------------------------------------------------------- /DepressionCollected/Classification/AudioTraditionalClassifiers.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import KFold 2 | import numpy as np 3 | import pandas as pd 4 | import os 5 | import pickle 6 | import random 7 | import itertools 8 | from sklearn.metrics import confusion_matrix 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 13 | audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) 14 | audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] 15 | audio_dep_idxs_tmp = np.where(audio_targets == 1)[0] 16 | audio_non_idxs = np.where(audio_targets == 0)[0] 17 | 18 | def model_performance(y_test, y_test_pred_proba): 19 | """ 20 | Evaluation metrics for network performance. 21 | """ 22 | # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] 23 | y_test_pred = y_test_pred_proba 24 | 25 | # Computing confusion matrix for test dataset 26 | conf_matrix = standard_confusion_matrix(y_test, y_test_pred) 27 | print("Confusion Matrix:") 28 | print(conf_matrix) 29 | 30 | return y_test_pred, conf_matrix 31 | 32 | def standard_confusion_matrix(y_test, y_test_pred): 33 | [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) 34 | return np.array([[tp, fp], [fn, tn]]) 35 | 36 | train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), 37 | np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.65_2.npy'), allow_pickle=True), 38 | np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] 39 | precs, recs, f1s = [], [], [] 40 | for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): 41 | test_idxs_tmp = list(set(list(audio_dep_idxs_tmp)+list(audio_non_idxs)) - set(train_idxs_tmp)) 42 | train_idxs, test_idxs = [], [] 43 | # depression data augmentation 44 | for idx in train_idxs_tmp: 45 | if idx in audio_dep_idxs_tmp: 46 | feat = audio_features[idx] 47 | count = 0 48 | resample_idxs = [0,1,2,3,4,5] 49 | for i in itertools.permutations(feat, feat.shape[0]): 50 | if count in resample_idxs: 51 | audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 52 | audio_targets = np.hstack((audio_targets, 1)) 53 | train_idxs.append(len(audio_features)-1) 54 | count += 1 55 | else: 56 | train_idxs.append(idx) 57 | 58 | for idx in test_idxs_tmp: 59 | if idx in audio_dep_idxs_tmp: 60 | feat = audio_features[idx] 61 | count = 0 62 | # resample_idxs = random.sample(range(6), 4) 63 | resample_idxs = [0,1,4,5] 64 | for i in itertools.permutations(feat, feat.shape[0]): 65 | if count in resample_idxs: 66 | audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 67 | audio_targets = np.hstack((audio_targets, 1)) 68 | test_idxs.append(len(audio_features)-1) 69 | count += 1 70 | else: 71 | test_idxs.append(idx) 72 | 73 | X_train = audio_features[train_idxs] 74 | Y_train = audio_targets[train_idxs] 75 | X_test = audio_features[test_idxs] 76 | Y_test = audio_targets[test_idxs] 77 | 78 | # Decision Tree 79 | # from sklearn import tree 80 | # clf = tree.DecisionTreeClassifier(max_depth=20) 81 | 82 | # svm 83 | # from sklearn.svm import SVC 84 | # clf = SVC(kernel='sigmoid') 85 | 86 | # rf 87 | from sklearn.ensemble import RandomForestClassifier 88 | clf = RandomForestClassifier(n_estimators=50) 89 | 90 | # lr 91 | # from sklearn.linear_model import LogisticRegression 92 | # clf = LogisticRegression(solver='newton-cg') 93 | 94 | clf.fit([f.flatten() for f in X_train], Y_train) 95 | pred = clf.predict([f.flatten() for f in X_test]) 96 | # clf.fit([f.sum(axis=0) for f in X_train], Y_train) 97 | # pred = clf.predict([f.sum(axis=0) for f in X_test]) 98 | 99 | y_test_pred, conf_matrix = model_performance(Y_test, pred) 100 | 101 | # custom evaluation metrics 102 | print('Calculating additional test metrics...') 103 | accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) 104 | precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) 105 | recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) 106 | f1_score = 2 * (precision * recall) / (precision + recall) 107 | print("Accuracy: {}".format(accuracy)) 108 | print("Precision: {}".format(precision)) 109 | print("Recall: {}".format(recall)) 110 | print("F1-Score: {}\n".format(f1_score)) 111 | print('='*89) 112 | precs.append(0 if np.isnan(precision) else precision) 113 | recs.append(0 if np.isnan(recall) else recall) 114 | f1s.append(0 if np.isnan(f1_score) else f1_score) 115 | # precs.append(precision) 116 | # recs.append(recall) 117 | # f1s.append(f1_score) 118 | print(np.mean(precs), np.mean(recs), np.mean(f1s)) -------------------------------------------------------------------------------- /DepressionCollected/Classification/FuseModelChecking.py: -------------------------------------------------------------------------------- 1 | from fuse_net_whole import fusion_net, config, model_performance 2 | import os 3 | import numpy as np 4 | import torch 5 | from torch.autograd import Variable 6 | import itertools 7 | 8 | prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) 9 | idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy'] 10 | text_model_paths = ['BiLSTM_128_0.67_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.66_3.pt'] 11 | audio_model_paths = ['BiLSTM_gru_vlad256_256_0.63_1.pt', 'BiLSTM_gru_vlad256_256_0.65_2.pt', 'BiLSTM_gru_vlad256_256_0.60_3.pt'] 12 | fuse_model_paths = ['fuse_0.69_1.pt', 'fuse_0.68_2.pt', 'fuse_0.62_3.pt'] 13 | text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] 14 | text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] 15 | audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) 16 | audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] 17 | fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])] 18 | fuse_targets = text_targets 19 | fuse_dep_idxs = np.where(text_targets == 1)[0] 20 | fuse_non_idxs = np.where(text_targets == 0)[0] 21 | 22 | def evaluate(model, test_idxs): 23 | model.eval() 24 | pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor) 25 | X_test = [] 26 | Y_test = [] 27 | for idx in test_idxs: 28 | X_test.append(fuse_features[idx]) 29 | Y_test.append(fuse_targets[idx]) 30 | global max_train_acc, max_acc,max_f1 31 | for i in range(0, len(X_test), config['batch_size']): 32 | if i + config['batch_size'] > len(X_test): 33 | x, y = X_test[i:], Y_test[i:] 34 | else: 35 | x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])] 36 | if config['cuda']: 37 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() 38 | text_feature, audio_feature = model.pretrained_feature(x) 39 | with torch.no_grad(): 40 | # concat_x = torch.cat((audio_feature, text_feature), dim=1) 41 | audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std() 42 | text_feature_norm = (text_feature - text_feature.mean())/text_feature.std() 43 | concat_x = torch.cat((text_feature, audio_feature), dim=1) 44 | output = model(concat_x) 45 | pred = torch.cat((pred, output.data.max(1, keepdim=True)[1])) 46 | 47 | y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:]) 48 | # custom evaluation metrics 49 | print('Calculating additional test metrics...') 50 | accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) 51 | precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) 52 | recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) 53 | f1_score = 2 * (precision * recall) / (precision + recall) 54 | print("Accuracy: {}".format(accuracy)) 55 | print("Precision: {}".format(precision)) 56 | print("Recall: {}".format(recall)) 57 | print("F1-Score: {}\n".format(f1_score)) 58 | print('='*89) 59 | 60 | return precision, recall, f1_score 61 | 62 | ps, rs, fs = [], [], [] 63 | for fold in range(3): 64 | train_idxs_tmp = np.load(os.path.join(prefix, 'Features/TextWhole/{}'.format(idxs_paths[fold])), allow_pickle=True) 65 | test_idxs_tmp = list(set(list(fuse_dep_idxs)+list(fuse_non_idxs)) - set(train_idxs_tmp)) 66 | resample_idxs = list(range(6)) 67 | train_idxs, test_idxs = [], [] 68 | # depression data augmentation 69 | for idx in train_idxs_tmp: 70 | if idx in fuse_dep_idxs: 71 | feat = fuse_features[idx] 72 | audio_perm = itertools.permutations(feat[0], 3) 73 | text_perm = itertools.permutations(feat[1], 3) 74 | count = 0 75 | for fuse_perm in zip(audio_perm, text_perm): 76 | if count in resample_idxs: 77 | fuse_features.append(fuse_perm) 78 | fuse_targets = np.hstack((fuse_targets, 1)) 79 | train_idxs.append(len(fuse_features)-1) 80 | count += 1 81 | else: 82 | train_idxs.append(idx) 83 | 84 | for idx in test_idxs_tmp: 85 | if idx in fuse_dep_idxs: 86 | feat = fuse_features[idx] 87 | audio_perm = itertools.permutations(feat[0], 3) 88 | text_perm = itertools.permutations(feat[1], 3) 89 | count = 0 90 | resample_idxs = [0,1,4,5] 91 | for fuse_perm in zip(audio_perm, text_perm): 92 | if count in resample_idxs: 93 | fuse_features.append(fuse_perm) 94 | fuse_targets = np.hstack((fuse_targets, 1)) 95 | test_idxs.append(len(fuse_features)-1) 96 | count += 1 97 | else: 98 | test_idxs.append(idx) 99 | 100 | fuse_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Fuse/{}'.format(fuse_model_paths[fold]))) 101 | p, r, f = evaluate(fuse_model, test_idxs) 102 | ps.append(p) 103 | rs.append(r) 104 | fs.append(f) 105 | print('precison: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs))) 106 | -------------------------------------------------------------------------------- /DepressionCollected/Classification/TextModelChecking.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | from torch.nn import functional as F 6 | import torch.optim as optim 7 | from sklearn.metrics import confusion_matrix 8 | import numpy as np 9 | import pandas as pd 10 | import wave 11 | import re 12 | import os 13 | import tensorflow.compat.v1 as tf 14 | import random 15 | import itertools 16 | 17 | from sklearn.preprocessing import StandardScaler 18 | import pickle 19 | 20 | # prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 21 | # text_features = np.load(os.path.join(prefix, 'Features/Text/whole_samples_clf_avg.npz'))['arr_0'] 22 | # text_targets = np.load(os.path.join(prefix, 'Features/Text/whole_labels_clf_avg.npz'))['arr_0'] 23 | 24 | # audio_dep_idxs = np.where(text_targets == 1)[0] 25 | # audio_non_idxs = np.where(text_targets == 0)[0] 26 | # # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.80.npy'), allow_pickle=True) 27 | # # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.80.npy'), allow_pickle=True)) 28 | # # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.65_2.npy'), allow_pickle=True) 29 | # # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.65_2.npy'), allow_pickle=True)) 30 | # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.89_3.npy'), allow_pickle=True) 31 | # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.89_3.npy'), allow_pickle=True)) 32 | 33 | # test_dep_idxs_tmp = list(set(audio_dep_idxs) - set(train_dep_idxs_tmp)) 34 | # test_non_idxs = list(set(audio_non_idxs) - set(train_non_idxs)) 35 | 36 | prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 37 | text_features = np.load(os.path.join( 38 | prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] 39 | text_targets = np.load(os.path.join( 40 | prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] 41 | text_dep_idxs_tmp = np.where(text_targets == 1)[0] 42 | text_non_idxs = np.where(text_targets == 0)[0] 43 | 44 | 45 | 46 | 47 | # # training data augmentation 48 | # train_dep_idxs = [] 49 | # for idx in train_dep_idxs_tmp: 50 | # feat = text_features[idx] 51 | # for i in itertools.permutations(feat, feat.shape[0]): 52 | # text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) 53 | # text_targets = np.hstack((text_targets, 1)) 54 | # train_dep_idxs.append(len(text_features)-1) 55 | 56 | # text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) 57 | # text_targets = np.hstack((text_targets, 1)) 58 | # train_dep_idxs.append(len(text_features)-1) 59 | 60 | # # test data augmentation 61 | # test_dep_idxs = [] 62 | # for idx in test_dep_idxs_tmp: 63 | # feat = text_features[idx] 64 | # for i in itertools.permutations(feat, feat.shape[0]): 65 | # text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) 66 | # text_targets = np.hstack((text_targets, 1)) 67 | # test_dep_idxs.append(len(text_features)-1) 68 | 69 | def standard_confusion_matrix(y_test, y_test_pred): 70 | """ 71 | Make confusion matrix with format: 72 | ----------- 73 | | TP | FP | 74 | ----------- 75 | | FN | TN | 76 | ----------- 77 | Parameters 78 | ---------- 79 | y_true : ndarray - 1D 80 | y_pred : ndarray - 1D 81 | 82 | Returns 83 | ------- 84 | ndarray - 2D 85 | """ 86 | [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) 87 | return np.array([[tp, fp], [fn, tn]]) 88 | 89 | 90 | def model_performance(y_test, y_test_pred_proba): 91 | """ 92 | Evaluation metrics for network performance. 93 | """ 94 | # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] 95 | y_test_pred = y_test_pred_proba 96 | 97 | # Computing confusion matrix for test dataset 98 | conf_matrix = standard_confusion_matrix(y_test, y_test_pred) 99 | print("Confusion Matrix:") 100 | print(conf_matrix) 101 | 102 | return y_test_pred, conf_matrix 103 | 104 | 105 | class TextBiLSTM(nn.Module): 106 | def __init__(self, config): 107 | super(TextBiLSTM, self).__init__() 108 | self.num_classes = config['num_classes'] 109 | self.learning_rate = config['learning_rate'] 110 | self.dropout = config['dropout'] 111 | self.hidden_dims = config['hidden_dims'] 112 | self.rnn_layers = config['rnn_layers'] 113 | self.embedding_size = config['embedding_size'] 114 | self.bidirectional = config['bidirectional'] 115 | 116 | self.build_model() 117 | self.init_weight() 118 | 119 | def init_weight(net): 120 | for name, param in net.named_parameters(): 121 | if 'bias' in name: 122 | nn.init.constant_(param, 0.0) 123 | elif 'weight' in name: 124 | nn.init.xavier_uniform_(param) 125 | 126 | def build_model(self): 127 | # attention layer 128 | self.attention_layer = nn.Sequential( 129 | nn.Linear(self.hidden_dims, self.hidden_dims), 130 | nn.ReLU(inplace=True) 131 | ) 132 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 133 | 134 | # 双层lstm 135 | self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, 136 | num_layers=self.rnn_layers, dropout=self.dropout, 137 | bidirectional=self.bidirectional) 138 | 139 | # self.init_weight() 140 | 141 | # FC层 142 | # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) 143 | self.fc_out = nn.Sequential( 144 | nn.Dropout(self.dropout), 145 | nn.Linear(self.hidden_dims, self.hidden_dims), 146 | nn.ReLU(), 147 | nn.Dropout(self.dropout), 148 | nn.Linear(self.hidden_dims, self.num_classes), 149 | # nn.ReLU(), 150 | nn.Softmax(dim=1), 151 | ) 152 | 153 | def attention_net_with_w(self, lstm_out, lstm_hidden): 154 | ''' 155 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 156 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 157 | :return: [batch_size, n_hidden] 158 | ''' 159 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 160 | # h [batch_size, time_step, hidden_dims] 161 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 162 | # h = lstm_out 163 | # [batch_size, num_layers * num_directions, n_hidden] 164 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 165 | # [batch_size, 1, n_hidden] 166 | lstm_hidden = lstm_hidden.unsqueeze(1) 167 | # atten_w [batch_size, 1, hidden_dims] 168 | atten_w = self.attention_layer(lstm_hidden) 169 | # m [batch_size, time_step, hidden_dims] 170 | m = nn.Tanh()(h) 171 | # atten_context [batch_size, 1, time_step] 172 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 173 | # softmax_w [batch_size, 1, time_step] 174 | softmax_w = F.softmax(atten_context, dim=-1) 175 | # context [batch_size, 1, hidden_dims] 176 | context = torch.bmm(softmax_w, h) 177 | result = context.squeeze(1) 178 | return result 179 | 180 | def forward(self, x): 181 | 182 | # x : [len_seq, batch_size, embedding_dim] 183 | x = x.permute(1, 0, 2) 184 | output, (final_hidden_state, final_cell_state) = self.lstm_net(x) 185 | # output : [batch_size, len_seq, n_hidden * 2] 186 | output = output.permute(1, 0, 2) 187 | # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] 188 | final_hidden_state = final_hidden_state.permute(1, 0, 2) 189 | # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) 190 | # atten_out = self.attention_net(output, final_hidden_state) 191 | atten_out = self.attention_net_with_w(output, final_hidden_state) 192 | return self.fc_out(atten_out) 193 | 194 | class BiLSTM(nn.Module): 195 | def __init__(self, rnn_layers, dropout, num_classes, text_hidden_dims, text_embed_size): 196 | super(BiLSTM, self).__init__() 197 | 198 | self.text_embed_size = text_embed_size 199 | self.text_hidden_dims = text_hidden_dims 200 | self.rnn_layers = rnn_layers 201 | self.dropout = dropout 202 | self.num_classes = num_classes 203 | 204 | # attention layer 205 | self.attention_layer = nn.Sequential( 206 | nn.Linear(self.text_hidden_dims, self.text_hidden_dims), 207 | nn.ReLU(inplace=True) 208 | ) 209 | 210 | # 双层lstm 211 | self.lstm_net = nn.LSTM(self.text_embed_size, self.text_hidden_dims, 212 | num_layers=self.rnn_layers, dropout=self.dropout, 213 | bidirectional=True) 214 | # FC层 215 | self.fc_out = nn.Sequential( 216 | nn.Dropout(self.dropout), 217 | nn.Linear(self.text_hidden_dims, self.text_hidden_dims), 218 | nn.ReLU(), 219 | nn.Dropout(self.dropout), 220 | nn.Linear(self.text_hidden_dims, self.num_classes), 221 | # nn.ReLU(), 222 | nn.Softmax(dim=1), 223 | ) 224 | 225 | def attention_net_with_w(self, lstm_out, lstm_hidden): 226 | ''' 227 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 228 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 229 | :return: [batch_size, n_hidden] 230 | ''' 231 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 232 | # h [batch_size, time_step, hidden_dims] 233 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 234 | # [batch_size, num_layers * num_directions, n_hidden] 235 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 236 | # [batch_size, 1, n_hidden] 237 | lstm_hidden = lstm_hidden.unsqueeze(1) 238 | # atten_w [batch_size, 1, hidden_dims] 239 | atten_w = self.attention_layer(lstm_hidden) 240 | # m [batch_size, time_step, hidden_dims] 241 | m = nn.Tanh()(h) 242 | # atten_context [batch_size, 1, time_step] 243 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 244 | # softmax_w [batch_size, 1, time_step] 245 | softmax_w = F.softmax(atten_context, dim=-1) 246 | # context [batch_size, 1, hidden_dims] 247 | context = torch.bmm(softmax_w, h) 248 | result = context.squeeze(1) 249 | return result 250 | 251 | def forward(self, x_text): 252 | # x : [len_seq, batch_size, embedding_dim] 253 | x_text = x_text.permute(1, 0, 2) 254 | output, (final_hidden_state, _) = self.lstm_net(x_text) 255 | # output : [batch_size, len_seq, n_hidden * 2] 256 | output = output.permute(1, 0, 2) 257 | # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] 258 | final_hidden_state = final_hidden_state.permute(1, 0, 2) 259 | # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) 260 | # atten_out = self.attention_net(output, final_hidden_state) 261 | atten_out = self.attention_net_with_w(output, final_hidden_state) 262 | text_feature = self.fc_out(atten_out) 263 | 264 | return text_feature 265 | 266 | def evaluate(model, test_idxs): 267 | model.eval() 268 | batch_idx = 1 269 | total_loss = 0 270 | pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor) 271 | # X_test = text_features[test_dep_idxs+test_non_idxs] 272 | # Y_test = text_targets[test_dep_idxs+test_non_idxs] 273 | X_test = text_features[test_idxs] 274 | Y_test = text_targets[test_idxs] 275 | global max_train_acc, max_acc, max_f1 276 | for i in range(0, X_test.shape[0], config['batch_size']): 277 | if i + config['batch_size'] > X_test.shape[0]: 278 | x, y = X_test[i:], Y_test[i:] 279 | else: 280 | x, y = X_test[i:(i+config['batch_size']) 281 | ], Y_test[i:(i+config['batch_size'])] 282 | if config['cuda']: 283 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda( 284 | ), Variable(torch.from_numpy(y)).cuda() 285 | else: 286 | x, y = Variable(torch.from_numpy(x).type( 287 | torch.FloatTensor), requires_grad=True), Variable(torch.from_numpy(y)) 288 | with torch.no_grad(): 289 | output = model(x.squeeze(2)) 290 | pred = torch.cat((pred, output.data.max(1, keepdim=True)[1])) 291 | 292 | y_test_pred, conf_matrix = model_performance( 293 | Y_test, pred[config['batch_size']:]) 294 | print('Calculating additional test metrics...') 295 | accuracy = float(conf_matrix[0][0] + 296 | conf_matrix[1][1]) / np.sum(conf_matrix) 297 | precision = float(conf_matrix[0][0]) / \ 298 | (conf_matrix[0][0] + conf_matrix[0][1]) 299 | recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) 300 | f1_score = 2 * (precision * recall) / (precision + recall) 301 | print("Accuracy: {}".format(accuracy)) 302 | print("Precision: {}".format(precision)) 303 | print("Recall: {}".format(recall)) 304 | print("F1-Score: {}\n".format(f1_score)) 305 | print('='*89) 306 | return precision, recall, f1_score 307 | 308 | text_model_paths = ['BiLSTM_128_0.64_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.66_3.pt'] 309 | train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), 310 | np.load(os.path.join( 311 | prefix, 'Features/TextWhole/train_idxs_0.60_2.npy'), allow_pickle=True), 312 | np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] 313 | resample_idxs = [0, 1, 2, 3, 4, 5] 314 | fold = 1 315 | ps, rs, fs = [], [], [] 316 | for idx_i, train_idxs_tmp in enumerate(train_idxs_tmps): 317 | test_idxs_tmp = list( 318 | set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp)) 319 | train_idxs, test_idxs = [], [] 320 | # depression data augmentation 321 | for idx in train_idxs_tmp: 322 | if idx in text_dep_idxs_tmp: 323 | feat = text_features[idx] 324 | count = 0 325 | for i in itertools.permutations(feat, feat.shape[0]): 326 | if count in resample_idxs: 327 | text_features = np.vstack( 328 | (text_features, np.expand_dims(list(i), 0))) 329 | text_targets = np.hstack((text_targets, 1)) 330 | train_idxs.append(len(text_features)-1) 331 | count += 1 332 | else: 333 | train_idxs.append(idx) 334 | 335 | for idx in test_idxs_tmp: 336 | if idx in text_dep_idxs_tmp: 337 | feat = text_features[idx] 338 | count = 0 339 | # resample_idxs = random.sample(range(6), 4) 340 | resample_idxs = [0,1,4,5] 341 | for i in itertools.permutations(feat, feat.shape[0]): 342 | if count in resample_idxs: 343 | text_features = np.vstack( 344 | (text_features, np.expand_dims(list(i), 0))) 345 | text_targets = np.hstack((text_targets, 1)) 346 | test_idxs.append(len(text_features)-1) 347 | count += 1 348 | else: 349 | test_idxs.append(idx) 350 | 351 | config = { 352 | 'num_classes': 2, 353 | 'dropout': 0.5, 354 | 'rnn_layers': 2, 355 | 'embedding_size': 1024, 356 | 'batch_size': 4, 357 | 'epochs': 100, 358 | 'learning_rate': 2e-5, 359 | 'hidden_dims': 128, 360 | 'bidirectional': True, 361 | 'cuda': False, 362 | } 363 | 364 | text_lstm_model = torch.load(os.path.join( 365 | prefix, 'Model/ClassificationWhole/Text/{}'.format(text_model_paths[idx_i]))) 366 | 367 | model = BiLSTM(config['rnn_layers'], config['dropout'], config['num_classes'], 368 | config['hidden_dims'], config['embedding_size']) 369 | 370 | # model_state_dict = {} 371 | # model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] 372 | # model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] 373 | # model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] 374 | # model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] 375 | 376 | # model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] 377 | # model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] 378 | # model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] 379 | # model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] 380 | 381 | # model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] 382 | # model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] 383 | # model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] 384 | # model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] 385 | # model_state_dict = text_lstm_model.state_dict() 386 | # model.load_state_dict(model_state_dict) 387 | 388 | # evaluate(text_features_test, fuse_targets_test, audio_lstm_model) 389 | # evaluate(model, test_idxs) 390 | 391 | p, r, f = evaluate(text_lstm_model, test_idxs) 392 | ps.append(p) 393 | rs.append(r) 394 | fs.append(f) 395 | print('precison: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs))) 396 | -------------------------------------------------------------------------------- /DepressionCollected/Classification/TextTraditionalClassifiers.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import KFold 2 | import numpy as np 3 | import pandas as pd 4 | import os 5 | import pickle 6 | import random 7 | import itertools 8 | from sklearn.metrics import confusion_matrix 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 13 | text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] 14 | text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] 15 | text_dep_idxs_tmp = np.where(text_targets == 1)[0] 16 | text_non_idxs = np.where(text_targets == 0)[0] 17 | 18 | def model_performance(y_test, y_test_pred_proba): 19 | """ 20 | Evaluation metrics for network performance. 21 | """ 22 | # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] 23 | y_test_pred = y_test_pred_proba 24 | 25 | # Computing confusion matrix for test dataset 26 | conf_matrix = standard_confusion_matrix(y_test, y_test_pred) 27 | print("Confusion Matrix:") 28 | print(conf_matrix) 29 | 30 | return y_test_pred, conf_matrix 31 | 32 | def standard_confusion_matrix(y_test, y_test_pred): 33 | [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) 34 | return np.array([[tp, fp], [fn, tn]]) 35 | 36 | train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), 37 | np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.65_2.npy'), allow_pickle=True), 38 | np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] 39 | precs, recs, f1s = [], [], [] 40 | 41 | for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): 42 | test_idxs_tmp = list(set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp)) 43 | train_idxs, test_idxs = [], [] 44 | 45 | # depression data augmentation 46 | for idx in train_idxs_tmp: 47 | if idx in text_dep_idxs_tmp: 48 | feat = text_features[idx] 49 | count = 0 50 | resample_idxs = [0,1,2,3,4,5] 51 | for i in itertools.permutations(feat, feat.shape[0]): 52 | if count in resample_idxs: 53 | text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) 54 | text_targets = np.hstack((text_targets, 1)) 55 | train_idxs.append(len(text_features)-1) 56 | count += 1 57 | else: 58 | train_idxs.append(idx) 59 | 60 | for idx in test_idxs_tmp: 61 | if idx in text_dep_idxs_tmp: 62 | feat = text_features[idx] 63 | count = 0 64 | # resample_idxs = random.sample(range(6), 4) 65 | resample_idxs = [0,1,4,5] 66 | for i in itertools.permutations(feat, feat.shape[0]): 67 | if count in resample_idxs: 68 | text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) 69 | text_targets = np.hstack((text_targets, 1)) 70 | test_idxs.append(len(text_features)-1) 71 | count += 1 72 | else: 73 | test_idxs.append(idx) 74 | # train_idxs = train_idxs_tmp 75 | # test_idxs = test_idxs_tmp 76 | 77 | X_train = text_features[train_idxs] 78 | Y_train = text_targets[train_idxs] 79 | X_test = text_features[test_idxs] 80 | Y_test = text_targets[test_idxs] 81 | 82 | # Decision Tree 83 | from sklearn import tree 84 | clf = tree.DecisionTreeClassifier(max_depth=20) 85 | 86 | # svm 87 | # from sklearn.svm import SVC 88 | # clf = SVC(kernel='rbf', gamma='auto') 89 | 90 | # rf 91 | # from sklearn.ensemble import RandomForestClassifier 92 | # clf = RandomForestClassifier(n_estimators=10, max_depth=20) 93 | 94 | # lr 95 | # from sklearn.linear_model import LogisticRegression 96 | # clf = LogisticRegression() 97 | 98 | clf.fit([f.flatten() for f in X_train], Y_train) 99 | pred = clf.predict([f.flatten() for f in X_test]) 100 | # clf.fit([f.sum(axis=0) for f in X_train], Y_train) 101 | # pred = clf.predict([f.sum(axis=0) for f in X_test]) 102 | 103 | y_test_pred, conf_matrix = model_performance(Y_test, pred) 104 | 105 | # custom evaluation metrics 106 | print('Calculating additional test metrics...') 107 | accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) 108 | precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) 109 | recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) 110 | f1_score = 2 * (precision * recall) / (precision + recall) 111 | print("Accuracy: {}".format(accuracy)) 112 | print("Precision: {}".format(precision)) 113 | print("Recall: {}".format(recall)) 114 | print("F1-Score: {}\n".format(f1_score)) 115 | print('='*89) 116 | # precs.append(0 if np.isnan(precision) else precision) 117 | # recs.append(0 if np.isnan(recall) else recall) 118 | # f1s.append(0 if np.isnan(f1_score) else f1_score) 119 | precs.append(precision) 120 | recs.append(recall) 121 | f1s.append(f1_score) 122 | print(np.mean(precs), np.mean(recs), np.mean(f1s)) -------------------------------------------------------------------------------- /DepressionCollected/Classification/audio_features_whole.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import wave 5 | import librosa 6 | from python_speech_features import * 7 | import sys 8 | import pickle 9 | sys.path.append('/Users/linlin/Desktop/depression/classfication') 10 | 11 | import tensorflow.compat.v1 as tf 12 | 13 | import vggish.vggish_input as vggish_input 14 | import vggish.vggish_params as vggish_params 15 | import vggish.vggish_postprocess as vggish_postprocess 16 | import vggish.vggish_slim as vggish_slim 17 | 18 | import loupe_keras as lpk 19 | 20 | from allennlp.commands.elmo import ElmoEmbedder 21 | 22 | tf.enable_eager_execution() 23 | 24 | elmo = ElmoEmbedder() 25 | 26 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" 27 | 28 | prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 29 | 30 | # Paths to downloaded VGGish files. 31 | checkpoint_path =os.path.join(os.getcwd(), 'vggish/vggish_model.ckpt') 32 | pca_params_path = os.path.join(os.getcwd(), 'vggish/vggish_pca_params.npz') 33 | 34 | cluster_size = 16 35 | 36 | min_len = 100 37 | max_len = -1 38 | 39 | def to_vggish_embedds(x, sr): 40 | # x为输入的音频,sr为sample_rate 41 | input_batch = vggish_input.waveform_to_examples(x, sr) 42 | with tf.Graph().as_default(), tf.Session() as sess: 43 | vggish_slim.define_vggish_slim() 44 | vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) 45 | 46 | features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME) 47 | embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME) 48 | [embedding_batch] = sess.run([embedding_tensor], 49 | feed_dict={features_tensor: input_batch}) 50 | 51 | # Postprocess the results to produce whitened quantized embeddings. 52 | pproc = vggish_postprocess.Postprocessor(pca_params_path) 53 | postprocessed_batch = pproc.postprocess(embedding_batch) 54 | 55 | return tf.cast(postprocessed_batch, dtype='float32') 56 | 57 | def wav2vlad(wave_data, sr): 58 | global cluster_size 59 | signal = wave_data 60 | melspec = librosa.feature.melspectrogram(signal, n_mels=80,sr=sr).astype(np.float32).T 61 | melspec = np.log(np.maximum(1e-6, melspec)) 62 | feature_size = melspec.shape[1] 63 | max_samples = melspec.shape[0] 64 | output_dim = cluster_size * 16 65 | feat = lpk.NetVLAD(feature_size=feature_size, max_samples=max_samples, \ 66 | cluster_size=cluster_size, output_dim=output_dim) \ 67 | (tf.convert_to_tensor(melspec)) 68 | with tf.Session() as sess: 69 | init = tf.global_variables_initializer() 70 | sess.run(init) 71 | r = feat.numpy() 72 | return r 73 | 74 | def extract_features(number, audio_features, targets, path): 75 | global max_len, min_len 76 | if not os.path.exists(os.path.join(prefix, '{1}/{0}/positive_out.wav'.format(number, path))): 77 | return 78 | positive_file = wave.open(os.path.join(prefix, '{1}/{0}/positive_out.wav'.format(number, path))) 79 | sr1 = positive_file.getframerate() 80 | nframes1 = positive_file.getnframes() 81 | wave_data1 = np.frombuffer(positive_file.readframes(nframes1), dtype=np.short).astype(np.float) 82 | len1 = nframes1 / sr1 83 | 84 | neutral_file = wave.open(os.path.join(prefix, '{1}/{0}/neutral_out.wav'.format(number, path))) 85 | sr2 = neutral_file.getframerate() 86 | nframes2 = neutral_file.getnframes() 87 | wave_data2 = np.frombuffer(neutral_file.readframes(nframes2), dtype=np.short).astype(np.float) 88 | len2 = nframes2 / sr2 89 | 90 | negative_file = wave.open(os.path.join(prefix, '{1}/{0}/negative_out.wav'.format(number, path))) 91 | sr3 = negative_file.getframerate() 92 | nframes3 = negative_file.getnframes() 93 | wave_data3 = np.frombuffer(negative_file.readframes(nframes3), dtype=np.short).astype(np.float) 94 | len3 = nframes3/sr3 95 | 96 | for l in [len1, len2, len3]: 97 | if l > max_len: 98 | max_len = l 99 | if l < min_len: 100 | min_len = l 101 | 102 | with open(os.path.join(prefix, '{1}/{0}/new_label.txt'.format(number, path))) as fli: 103 | target = float(fli.readline()) 104 | 105 | if wave_data1.shape[0] < 1: 106 | wave_data1 = np.array([1e-4]*sr1*5) 107 | if wave_data2.shape[0] < 1: 108 | wave_data2 = np.array([1e-4]*sr2*5) 109 | if wave_data3.shape[0] < 1: 110 | wave_data3 = np.array([1e-4]*sr3*5) 111 | audio_features.append([wav2vlad(wave_data1, sr1), wav2vlad(wave_data2, sr2), \ 112 | wav2vlad(wave_data3, sr3)]) 113 | # targets.append(1 if target >= 53 else 0) 114 | targets.append(target) 115 | 116 | 117 | audio_features = [] 118 | audio_targets = [] 119 | 120 | for index in range(114): 121 | extract_features(index+1, audio_features, audio_targets, 'Data') 122 | 123 | for index in range(114): 124 | extract_features(index+1, audio_features, audio_targets, 'ValidationData') 125 | 126 | 127 | print("Saving npz file locally...") 128 | np.savez(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_%d.npz'%(cluster_size*16)), audio_features) 129 | np.savez(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_%d.npz')%(cluster_size*16), audio_targets) 130 | 131 | print(max_len, min_len) -------------------------------------------------------------------------------- /DepressionCollected/Classification/audio_gru_whole.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | import torch.optim as optim 6 | from sklearn.metrics import confusion_matrix 7 | from sklearn.metrics import mean_absolute_error, mean_squared_error 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.model_selection import KFold 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import os 14 | import pickle 15 | import random 16 | import itertools 17 | 18 | prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 19 | audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) 20 | audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] 21 | audio_dep_idxs_tmp = np.where(audio_targets == 1)[0] 22 | audio_non_idxs = np.where(audio_targets == 0)[0] 23 | 24 | class AudioBiLSTM(nn.Module): 25 | def __init__(self, config): 26 | super(AudioBiLSTM, self).__init__() 27 | self.num_classes = config['num_classes'] 28 | self.learning_rate = config['learning_rate'] 29 | self.dropout = config['dropout'] 30 | self.hidden_dims = config['hidden_dims'] 31 | self.rnn_layers = config['rnn_layers'] 32 | self.embedding_size = config['embedding_size'] 33 | self.bidirectional = config['bidirectional'] 34 | 35 | self.build_model() 36 | # self.init_weight() 37 | 38 | def init_weight(net): 39 | for name, param in net.named_parameters(): 40 | if not 'ln' in name: 41 | if 'bias' in name: 42 | nn.init.constant_(param, 0.0) 43 | elif 'weight' in name: 44 | nn.init.xavier_uniform_(param) 45 | 46 | def build_model(self): 47 | # attention layer 48 | self.attention_layer = nn.Sequential( 49 | nn.Linear(self.hidden_dims, self.hidden_dims), 50 | nn.ReLU(inplace=True)) 51 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 52 | 53 | # self.lstm_net_audio = nn.LSTM(self.embedding_size, 54 | # self.hidden_dims, 55 | # num_layers=self.rnn_layers, 56 | # dropout=self.dropout, 57 | # bidirectional=self.bidirectional, 58 | # batch_first=True) 59 | self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, 60 | num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) 61 | 62 | self.ln = nn.LayerNorm(self.embedding_size) 63 | 64 | # FC层 65 | self.fc_audio = nn.Sequential( 66 | nn.Dropout(self.dropout), 67 | nn.Linear(self.hidden_dims, self.hidden_dims), 68 | nn.ReLU(), 69 | nn.Dropout(self.dropout), 70 | nn.Linear(self.hidden_dims, self.num_classes), 71 | # nn.ReLU(), 72 | nn.Softmax(dim=1) 73 | ) 74 | 75 | def attention_net_with_w(self, lstm_out, lstm_hidden): 76 | ''' 77 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 78 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 79 | :return: [batch_size, n_hidden] 80 | ''' 81 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 82 | # h [batch_size, time_step, hidden_dims] 83 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 84 | # h = lstm_out 85 | # [batch_size, num_layers * num_directions, n_hidden] 86 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 87 | # [batch_size, 1, n_hidden] 88 | lstm_hidden = lstm_hidden.unsqueeze(1) 89 | # atten_w [batch_size, 1, hidden_dims] 90 | atten_w = self.attention_layer(lstm_hidden) 91 | # m [batch_size, time_step, hidden_dims] 92 | m = nn.Tanh()(h) 93 | # atten_context [batch_size, 1, time_step] 94 | # print(atten_w.shape, m.transpose(1, 2).shape) 95 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 96 | # softmax_w [batch_size, 1, time_step] 97 | softmax_w = F.softmax(atten_context, dim=-1) 98 | # context [batch_size, 1, hidden_dims] 99 | context = torch.bmm(softmax_w, h) 100 | result = context.squeeze(1) 101 | return result 102 | 103 | def forward(self, x): 104 | x = self.ln(x) 105 | x, _ = self.lstm_net_audio(x) 106 | x = x.mean(dim=1) 107 | out = self.fc_audio(x) 108 | return out 109 | 110 | config = { 111 | 'num_classes': 2, 112 | 'dropout': 0.5, 113 | 'rnn_layers': 2, 114 | 'embedding_size': 256, 115 | 'batch_size': 8, 116 | 'epochs': 170, 117 | 'learning_rate': 6e-6, 118 | 'hidden_dims': 256, 119 | 'bidirectional': False, 120 | 'cuda': False 121 | } 122 | 123 | def save(model, filename): 124 | save_filename = '{}.pt'.format(filename) 125 | torch.save(model, save_filename) 126 | print('Saved as %s' % save_filename) 127 | 128 | def standard_confusion_matrix(y_test, y_test_pred): 129 | """ 130 | Make confusion matrix with format: 131 | ----------- 132 | | TP | FP | 133 | ----------- 134 | | FN | TN | 135 | ----------- 136 | Parameters 137 | ---------- 138 | y_true : ndarray - 1D 139 | y_pred : ndarray - 1D 140 | 141 | Returns 142 | ------- 143 | ndarray - 2D 144 | """ 145 | [[tn, fp], [fn, tp]] = confusion_matrix(y_test.cpu().numpy(), y_test_pred) 146 | return np.array([[tp, fp], [fn, tn]]) 147 | 148 | def model_performance(y_test, y_test_pred_proba): 149 | """ 150 | Evaluation metrics for network performance. 151 | """ 152 | y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] 153 | 154 | # Computing confusion matrix for test dataset 155 | conf_matrix = standard_confusion_matrix(y_test, y_test_pred.numpy()) 156 | print("Confusion Matrix:") 157 | print(conf_matrix) 158 | 159 | return y_test_pred, conf_matrix 160 | 161 | def train(epoch, train_idxs): 162 | global lr, train_acc 163 | model.train() 164 | batch_idx = 1 165 | total_loss = 0 166 | correct = 0 167 | pred = np.array([]) 168 | X_train = audio_features[train_idxs] 169 | Y_train = audio_targets[train_idxs] 170 | for i in range(0, X_train.shape[0], config['batch_size']): 171 | if i + config['batch_size'] > X_train.shape[0]: 172 | x, y = X_train[i:], Y_train[i:] 173 | else: 174 | x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( 175 | i + config['batch_size'])] 176 | if config['cuda']: 177 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() 178 | else: 179 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ 180 | Variable(torch.from_numpy(y)) 181 | 182 | # 将模型的参数梯度设置为0 183 | optimizer.zero_grad() 184 | output = model(x) 185 | pred = output.data.max(1, keepdim=True)[1] 186 | #print(pred.shape, y.shape) 187 | correct += pred.eq(y.data.view_as(pred)).cpu().sum() 188 | loss = criterion(output, y) 189 | # 后向传播调整参数 190 | loss.backward() 191 | # 根据梯度更新网络参数 192 | optimizer.step() 193 | batch_idx += 1 194 | # loss.item()能够得到张量中的元素值 195 | total_loss += loss.item() 196 | 197 | train_acc = correct 198 | print( 199 | 'Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n ' 200 | .format(epoch + 1, config['learning_rate'], total_loss, correct, 201 | X_train.shape[0], 100. * correct / X_train.shape[0])) 202 | 203 | 204 | def evaluate(model, test_idxs, fold, train_idxs_tmp, train_idxs): 205 | model.eval() 206 | batch_idx = 1 207 | total_loss = 0 208 | global max_f1, max_acc, min_mae, X_test_lens, max_prec, max_rec 209 | pred = np.array([]) 210 | with torch.no_grad(): 211 | if config['cuda']: 212 | x, y = Variable(torch.from_numpy(audio_features[test_idxs]).type(torch.FloatTensor), requires_grad=True).cuda(),\ 213 | Variable(torch.from_numpy(audio_targets[test_idxs])).cuda() 214 | else: 215 | x, y = Variable(torch.from_numpy(audio_features[test_idxs]).type(torch.FloatTensor), requires_grad=True), \ 216 | Variable(torch.from_numpy(audio_targets[test_idxs])).type(torch.LongTensor) 217 | 218 | optimizer.zero_grad() 219 | output = model(x) 220 | loss = criterion(output, y) 221 | total_loss += loss.item() 222 | y_test_pred, conf_matrix = model_performance(y, output.cpu()) 223 | accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) 224 | precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) 225 | recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) 226 | f1_score = 2 * (precision * recall) / (precision + recall) 227 | print("Accuracy: {}".format(accuracy)) 228 | print("Precision: {}".format(precision)) 229 | print("Recall: {}".format(recall)) 230 | print("F1-Score: {}\n".format(f1_score)) 231 | print('=' * 89) 232 | 233 | if max_f1 <= f1_score and train_acc > len(train_idxs)*0.90 and f1_score > 0.5: 234 | max_f1 = f1_score 235 | max_acc = accuracy 236 | max_rec = recall 237 | max_prec = precision 238 | mode ='gru' 239 | save(model, os.path.join(prefix, 'Model/ClassificationWhole/Audio/BiLSTM_{}_vlad{}_{}_{:.2f}_{}'.format(mode, config['embedding_size'], config['hidden_dims'], max_f1, fold))) 240 | np.save(os.path.join(prefix, 'Features/TextWhole/train_idxs_{:.2f}_{}.npy'.format(f1_score, fold)), train_idxs_tmp) 241 | print('*' * 64) 242 | print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc)) 243 | print('*' * 64) 244 | 245 | return total_loss 246 | 247 | def get_param_group(model): 248 | nd_list = [] 249 | param_list = [] 250 | for name, param in model.named_parameters(): 251 | if 'ln' in name: 252 | nd_list.append(param) 253 | else: 254 | param_list.append(param) 255 | return [{'params': param_list, 'weight_decay': 1e-5}, {'params': nd_list, 'weight_decay': 0}] 256 | 257 | if __name__ == '__main__': 258 | # kf = KFold(n_splits=3, shuffle=True) 259 | # fold = 1 260 | # for train_idxs_tmp, test_idxs_tmp in kf.split(audio_features): 261 | train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), 262 | np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_2.npy'), allow_pickle=True), 263 | np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] 264 | for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): 265 | fold = idx_idx + 1 266 | # if idx_idx != 1: 267 | # continue 268 | test_idxs_tmp = list(set(list(audio_dep_idxs_tmp)+list(audio_non_idxs)) - set(train_idxs_tmp)) 269 | train_idxs, test_idxs = [], [] 270 | resample_idxs = [0,1,2,3,4,5] 271 | # depression data augmentation 272 | for idx in train_idxs_tmp: 273 | if idx in audio_dep_idxs_tmp: 274 | feat = audio_features[idx] 275 | count = 0 276 | for i in itertools.permutations(feat, feat.shape[0]): 277 | if count in resample_idxs: 278 | audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 279 | audio_targets = np.hstack((audio_targets, 1)) 280 | train_idxs.append(len(audio_features)-1) 281 | count += 1 282 | else: 283 | train_idxs.append(idx) 284 | 285 | for idx in test_idxs_tmp: 286 | if idx in audio_dep_idxs_tmp: 287 | feat = audio_features[idx] 288 | count = 0 289 | # resample_idxs = random.sample(range(6), 4) 290 | resample_idxs = [0,1,4,5] 291 | for i in itertools.permutations(feat, feat.shape[0]): 292 | if count in resample_idxs: 293 | audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 294 | audio_targets = np.hstack((audio_targets, 1)) 295 | test_idxs.append(len(audio_features)-1) 296 | count += 1 297 | else: 298 | test_idxs.append(idx) 299 | # test_idxs.append(idx) 300 | 301 | model = AudioBiLSTM(config) 302 | 303 | if config['cuda']: 304 | model = model.cuda() 305 | 306 | param_group = get_param_group(model) 307 | optimizer = optim.AdamW(param_group, lr=config['learning_rate']) 308 | criterion = nn.CrossEntropyLoss() 309 | # criterion = FocalLoss(class_num=2) 310 | max_f1 = -1 311 | max_acc = -1 312 | max_rec = -1 313 | max_prec = -1 314 | train_acc = -1 315 | 316 | for ep in range(1, config['epochs']): 317 | train(ep, train_idxs) 318 | tloss = evaluate(model, test_idxs, fold, train_idxs_tmp, train_idxs) 319 | fold += 1 -------------------------------------------------------------------------------- /DepressionCollected/Classification/fuse_net_whole.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | from torch.nn import functional as F 6 | import torch.optim as optim 7 | from sklearn.metrics import confusion_matrix 8 | import numpy as np 9 | import pandas as pd 10 | import wave 11 | import librosa 12 | from python_speech_features import * 13 | import re 14 | from allennlp.commands.elmo import ElmoEmbedder 15 | import os 16 | import tensorflow.compat.v1 as tf 17 | import itertools 18 | 19 | prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) 20 | 21 | text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] 22 | text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] 23 | audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) 24 | audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] 25 | fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])] 26 | fuse_targets = text_targets 27 | 28 | fuse_dep_idxs = np.where(text_targets == 1)[0] 29 | fuse_non_idxs = np.where(text_targets == 0)[0] 30 | 31 | def save(model, filename): 32 | save_filename = '{}.pt'.format(filename) 33 | torch.save(model, save_filename) 34 | print('Saved as %s' % save_filename) 35 | 36 | def standard_confusion_matrix(y_test, y_test_pred): 37 | """ 38 | Make confusion matrix with format: 39 | ----------- 40 | | TP | FP | 41 | ----------- 42 | | FN | TN | 43 | ----------- 44 | Parameters 45 | ---------- 46 | y_true : ndarray - 1D 47 | y_pred : ndarray - 1D 48 | 49 | Returns 50 | ------- 51 | ndarray - 2D 52 | """ 53 | [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) 54 | return np.array([[tp, fp], [fn, tn]]) 55 | 56 | def model_performance(y_test, y_test_pred_proba): 57 | """ 58 | Evaluation metrics for network performance. 59 | """ 60 | # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] 61 | y_test_pred = y_test_pred_proba 62 | 63 | # Computing confusion matrix for test dataset 64 | conf_matrix = standard_confusion_matrix(y_test, y_test_pred) 65 | print("Confusion Matrix:") 66 | print(conf_matrix) 67 | 68 | return y_test_pred, conf_matrix 69 | 70 | class TextBiLSTM(nn.Module): 71 | def __init__(self, config): 72 | super(TextBiLSTM, self).__init__() 73 | self.num_classes = config['num_classes'] 74 | self.learning_rate = config['learning_rate'] 75 | self.dropout = config['dropout'] 76 | self.hidden_dims = config['hidden_dims'] 77 | self.rnn_layers = config['rnn_layers'] 78 | self.embedding_size = config['embedding_size'] 79 | self.bidirectional = config['bidirectional'] 80 | 81 | self.build_model() 82 | self.init_weight() 83 | 84 | def init_weight(net): 85 | for name, param in net.named_parameters(): 86 | if 'bias' in name: 87 | nn.init.constant_(param, 0.0) 88 | elif 'weight' in name: 89 | nn.init.xavier_uniform_(param) 90 | 91 | def build_model(self): 92 | # attention layer 93 | self.attention_layer = nn.Sequential( 94 | nn.Linear(self.hidden_dims, self.hidden_dims), 95 | nn.ReLU(inplace=True) 96 | ) 97 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 98 | 99 | # 双层lstm 100 | self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, 101 | num_layers=self.rnn_layers, dropout=self.dropout, 102 | bidirectional=self.bidirectional) 103 | 104 | # self.init_weight() 105 | 106 | # FC层 107 | # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) 108 | self.fc_out = nn.Sequential( 109 | nn.Dropout(self.dropout), 110 | nn.Linear(self.hidden_dims, self.hidden_dims), 111 | nn.ReLU(), 112 | nn.Dropout(self.dropout), 113 | nn.Linear(self.hidden_dims, self.num_classes), 114 | # nn.ReLU(), 115 | nn.Softmax(dim=1), 116 | ) 117 | 118 | def attention_net_with_w(self, lstm_out, lstm_hidden): 119 | ''' 120 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 121 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 122 | :return: [batch_size, n_hidden] 123 | ''' 124 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 125 | # h [batch_size, time_step, hidden_dims] 126 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 127 | # h = lstm_out 128 | # [batch_size, num_layers * num_directions, n_hidden] 129 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 130 | # [batch_size, 1, n_hidden] 131 | lstm_hidden = lstm_hidden.unsqueeze(1) 132 | # atten_w [batch_size, 1, hidden_dims] 133 | atten_w = self.attention_layer(lstm_hidden) 134 | # m [batch_size, time_step, hidden_dims] 135 | m = nn.Tanh()(h) 136 | # atten_context [batch_size, 1, time_step] 137 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 138 | # softmax_w [batch_size, 1, time_step] 139 | softmax_w = F.softmax(atten_context, dim=-1) 140 | # context [batch_size, 1, hidden_dims] 141 | context = torch.bmm(softmax_w, h) 142 | result = context.squeeze(1) 143 | return result 144 | 145 | def forward(self, x): 146 | 147 | # x : [len_seq, batch_size, embedding_dim] 148 | x = x.permute(1, 0, 2) 149 | output, (final_hidden_state, final_cell_state) = self.lstm_net(x) 150 | # output : [batch_size, len_seq, n_hidden * 2] 151 | output = output.permute(1, 0, 2) 152 | # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] 153 | final_hidden_state = final_hidden_state.permute(1, 0, 2) 154 | # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) 155 | # atten_out = self.attention_net(output, final_hidden_state) 156 | atten_out = self.attention_net_with_w(output, final_hidden_state) 157 | return self.fc_out(atten_out) 158 | 159 | class AudioBiLSTM(nn.Module): 160 | def __init__(self, config): 161 | super(AudioBiLSTM, self).__init__() 162 | self.num_classes = config['num_classes'] 163 | self.learning_rate = config['learning_rate'] 164 | self.dropout = config['dropout'] 165 | self.hidden_dims = config['hidden_dims'] 166 | self.rnn_layers = config['rnn_layers'] 167 | self.embedding_size = config['embedding_size'] 168 | self.bidirectional = config['bidirectional'] 169 | 170 | self.build_model() 171 | # self.init_weight() 172 | 173 | def init_weight(net): 174 | for name, param in net.named_parameters(): 175 | if not 'ln' in name: 176 | if 'bias' in name: 177 | nn.init.constant_(param, 0.0) 178 | elif 'weight' in name: 179 | nn.init.xavier_uniform_(param) 180 | 181 | def build_model(self): 182 | # attention layer 183 | self.attention_layer = nn.Sequential( 184 | nn.Linear(self.hidden_dims, self.hidden_dims), 185 | nn.ReLU(inplace=True)) 186 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 187 | 188 | # self.lstm_net_audio = nn.LSTM(self.embedding_size, 189 | # self.hidden_dims, 190 | # num_layers=self.rnn_layers, 191 | # dropout=self.dropout, 192 | # bidirectional=self.bidirectional, 193 | # batch_first=True) 194 | self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, 195 | num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) 196 | 197 | self.ln = nn.LayerNorm(self.embedding_size) 198 | 199 | # FC层 200 | self.fc_audio = nn.Sequential( 201 | nn.Dropout(self.dropout), 202 | nn.Linear(self.hidden_dims, self.hidden_dims), 203 | nn.ReLU(), 204 | nn.Dropout(self.dropout), 205 | nn.Linear(self.hidden_dims, self.num_classes), 206 | # nn.ReLU(), 207 | nn.Softmax(dim=1) 208 | ) 209 | 210 | def attention_net_with_w(self, lstm_out, lstm_hidden): 211 | ''' 212 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 213 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 214 | :return: [batch_size, n_hidden] 215 | ''' 216 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 217 | # h [batch_size, time_step, hidden_dims] 218 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 219 | # h = lstm_out 220 | # [batch_size, num_layers * num_directions, n_hidden] 221 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 222 | # [batch_size, 1, n_hidden] 223 | lstm_hidden = lstm_hidden.unsqueeze(1) 224 | # atten_w [batch_size, 1, hidden_dims] 225 | atten_w = self.attention_layer(lstm_hidden) 226 | # m [batch_size, time_step, hidden_dims] 227 | m = nn.Tanh()(h) 228 | # atten_context [batch_size, 1, time_step] 229 | # print(atten_w.shape, m.transpose(1, 2).shape) 230 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 231 | # softmax_w [batch_size, 1, time_step] 232 | softmax_w = F.softmax(atten_context, dim=-1) 233 | # context [batch_size, 1, hidden_dims] 234 | context = torch.bmm(softmax_w, h) 235 | result = context.squeeze(1) 236 | return result 237 | 238 | def forward(self, x): 239 | x = self.ln(x) 240 | x, _ = self.lstm_net_audio(x) 241 | x = x.mean(dim=1) 242 | out = self.fc_audio(x) 243 | return out 244 | 245 | class fusion_net(nn.Module): 246 | def __init__(self, text_embed_size, text_hidden_dims, rnn_layers, dropout, num_classes, \ 247 | audio_hidden_dims, audio_embed_size): 248 | super(fusion_net, self).__init__() 249 | self.text_embed_size = text_embed_size 250 | self.audio_embed_size = audio_embed_size 251 | self.text_hidden_dims = text_hidden_dims 252 | self.audio_hidden_dims = audio_hidden_dims 253 | self.rnn_layers = rnn_layers 254 | self.dropout = dropout 255 | self.num_classes = num_classes 256 | 257 | # ============================= TextBiLSTM ================================= 258 | 259 | # attention layer 260 | self.attention_layer = nn.Sequential( 261 | nn.Linear(self.text_hidden_dims, self.text_hidden_dims), 262 | nn.ReLU(inplace=True) 263 | ) 264 | 265 | # 双层lstm 266 | self.lstm_net = nn.LSTM(self.text_embed_size, self.text_hidden_dims, 267 | num_layers=self.rnn_layers, dropout=self.dropout, 268 | bidirectional=True) 269 | # FC层 270 | self.fc_out = nn.Sequential( 271 | nn.Dropout(self.dropout), 272 | nn.Linear(self.text_hidden_dims, self.text_hidden_dims), 273 | nn.ReLU(), 274 | nn.Dropout(self.dropout) 275 | ) 276 | 277 | # ============================= TextBiLSTM ================================= 278 | 279 | # ============================= AudioBiLSTM ============================= 280 | 281 | self.lstm_net_audio = nn.GRU(self.audio_embed_size, 282 | self.audio_hidden_dims, 283 | num_layers=self.rnn_layers, 284 | dropout=self.dropout, 285 | bidirectional=False, 286 | batch_first=True) 287 | 288 | self.fc_audio = nn.Sequential( 289 | nn.Dropout(self.dropout), 290 | nn.Linear(self.audio_hidden_dims, self.audio_hidden_dims), 291 | nn.ReLU(), 292 | nn.Dropout(self.dropout) 293 | ) 294 | 295 | self.ln = nn.LayerNorm(self.audio_embed_size) 296 | 297 | # ============================= AudioBiLSTM ============================= 298 | 299 | # ============================= last fc layer ============================= 300 | # self.bn = nn.BatchNorm1d(self.text_hidden_dims + self.audio_hidden_dims) 301 | # modal attention 302 | self.modal_attn = nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.text_hidden_dims + self.audio_hidden_dims, bias=False) 303 | self.fc_final = nn.Sequential( 304 | nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.num_classes, bias=False), 305 | # nn.ReLU(), 306 | nn.Softmax(dim=1), 307 | # nn.Sigmoid() 308 | ) 309 | 310 | def attention_net_with_w(self, lstm_out, lstm_hidden): 311 | ''' 312 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 313 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 314 | :return: [batch_size, n_hidden] 315 | ''' 316 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 317 | # h [batch_size, time_step, hidden_dims] 318 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 319 | # [batch_size, num_layers * num_directions, n_hidden] 320 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 321 | # [batch_size, 1, n_hidden] 322 | lstm_hidden = lstm_hidden.unsqueeze(1) 323 | # atten_w [batch_size, 1, hidden_dims] 324 | atten_w = self.attention_layer(lstm_hidden) 325 | # m [batch_size, time_step, hidden_dims] 326 | m = nn.Tanh()(h) 327 | # atten_context [batch_size, 1, time_step] 328 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 329 | # softmax_w [batch_size, 1, time_step] 330 | softmax_w = F.softmax(atten_context, dim=-1) 331 | # context [batch_size, 1, hidden_dims] 332 | context = torch.bmm(softmax_w, h) 333 | result = context.squeeze(1) 334 | return result 335 | 336 | def pretrained_feature(self, x): 337 | with torch.no_grad(): 338 | x_text = [] 339 | x_audio = [] 340 | for ele in x: 341 | x_text.append(ele[1]) 342 | x_audio.append(ele[0]) 343 | x_text, x_audio = Variable(torch.tensor(x_text).type(torch.FloatTensor), requires_grad=False), Variable(torch.tensor(x_audio).type(torch.FloatTensor), requires_grad=False) 344 | # ============================= TextBiLSTM ================================= 345 | # x : [len_seq, batch_size, embedding_dim] 346 | x_text = x_text.permute(1, 0, 2) 347 | output, (final_hidden_state, _) = self.lstm_net(x_text) 348 | # output : [batch_size, len_seq, n_hidden * 2] 349 | output = output.permute(1, 0, 2) 350 | # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] 351 | final_hidden_state = final_hidden_state.permute(1, 0, 2) 352 | # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) 353 | # atten_out = self.attention_net(output, final_hidden_state) 354 | atten_out = self.attention_net_with_w(output, final_hidden_state) 355 | text_feature = self.fc_out(atten_out) 356 | 357 | # ============================= TextBiLSTM ================================= 358 | 359 | # ============================= AudioBiLSTM ============================= 360 | x_audio = self.ln(x_audio) 361 | x_audio, _ = self.lstm_net_audio(x_audio) 362 | x_audio = x_audio.sum(dim=1) 363 | audio_feature = self.fc_audio(x_audio) 364 | 365 | # ============================= AudioBiLSTM ============================= 366 | return (text_feature, audio_feature) 367 | 368 | def forward(self, x): 369 | # x = self.bn(x) 370 | # modal_weights = torch.softmax(self.modal_attn(x), dim=1) 371 | # modal_weights = self.modal_attn(x) 372 | # x = (modal_weights * x) 373 | output = self.fc_final(x) 374 | return output 375 | 376 | class MyLoss(nn.Module): 377 | def __init__(self): 378 | super(MyLoss, self).__init__() 379 | 380 | def forward(self, text_feature, audio_feature, target, model): 381 | weight = model.fc_final[0].weight 382 | # bias = model.fc_final[0].bias 383 | # print(weight, bias) 384 | pred_text = F.linear(text_feature, weight[:, :config['text_hidden_dims']]) 385 | pred_audio = F.linear(audio_feature, weight[:, config['text_hidden_dims']:]) 386 | l = nn.CrossEntropyLoss() 387 | target = torch.tensor(target) 388 | # l = nn.BCEWithLogitsLoss() 389 | # target = F.one_hot(target, num_classes=2).type(torch.FloatTensor) 390 | # print('y: {}\npred_audio: {}\npred_text: {}\n'.format(target, pred_audio.data.max(1, keepdim=True)[1], pred_text.data.max(1, keepdim=True)[1])) 391 | # return l(pred_text, target) + l(pred_audio, target) + \ 392 | # config['lambda']*torch.norm(weight[:, :config['text_hidden_dims']]) + \ 393 | # config['lambda']*torch.norm(weight[:, config['text_hidden_dims']:]) 394 | # a = F.softmax(pred_text, dim=1) + F.softmax(pred_audio, dim=1) 395 | return l(pred_text, target) + l(pred_audio, target) 396 | 397 | 398 | config = { 399 | 'num_classes': 2, 400 | 'dropout': 0.3, 401 | 'rnn_layers': 2, 402 | 'audio_embed_size': 256, 403 | 'text_embed_size': 1024, 404 | 'batch_size': 2, 405 | 'epochs': 100, 406 | 'learning_rate': 8e-6, 407 | 'audio_hidden_dims': 256, 408 | 'text_hidden_dims': 128, 409 | 'cuda': False, 410 | 'lambda': 1e-5, 411 | } 412 | 413 | model = fusion_net(config['text_embed_size'], config['text_hidden_dims'], config['rnn_layers'], \ 414 | config['dropout'], config['num_classes'], config['audio_hidden_dims'], config['audio_embed_size']) 415 | 416 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 417 | # optimizer = optim.Adam(model.parameters()) 418 | # criterion = nn.CrossEntropyLoss() 419 | criterion = MyLoss() 420 | 421 | def train(epoch, train_idxs): 422 | global max_train_acc, train_acc 423 | model.train() 424 | batch_idx = 1 425 | total_loss = 0 426 | correct = 0 427 | X_train = [] 428 | Y_train = [] 429 | for idx in train_idxs: 430 | X_train.append(fuse_features[idx]) 431 | Y_train.append(fuse_targets[idx]) 432 | for i in range(0, len(X_train), config['batch_size']): 433 | if i + config['batch_size'] > len(X_train): 434 | x, y = X_train[i:], Y_train[i:] 435 | else: 436 | x, y = X_train[i:(i+config['batch_size'])], Y_train[i:(i+config['batch_size'])] 437 | if config['cuda']: 438 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() 439 | # 将模型的参数梯度设置为0 440 | optimizer.zero_grad() 441 | text_feature, audio_feature = model.pretrained_feature(x) 442 | # text_feature = torch.from_numpy(ss.fit_transform(text_feature.numpy())) 443 | # audio_feature = torch.from_numpy(ss.fit_transform(audio_feature.numpy())) 444 | # concat_x = torch.cat((audio_feature, text_feature), dim=1) 445 | concat_x = torch.cat((text_feature, audio_feature), dim=1) 446 | # dot_x = text_feature.mul(audio_feature) 447 | # add_x = text_feature.add(audio_feature) 448 | output = model(concat_x) 449 | pred = output.data.max(1, keepdim=True)[1] 450 | correct += pred.eq(torch.tensor(y).data.view_as(pred)).cpu().sum() 451 | # loss = criterion(output, torch.tensor(y)) 452 | loss = criterion(text_feature, audio_feature, y, model) 453 | # 后向传播调整参数 454 | loss.backward() 455 | # 根据梯度更新网络参数 456 | optimizer.step() 457 | batch_idx += 1 458 | # loss.item()能够得到张量中的元素值 459 | total_loss += loss.item() 460 | cur_loss = total_loss 461 | max_train_acc = correct 462 | train_acc = correct 463 | print('Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n '.format( 464 | epoch, config['learning_rate'], cur_loss/len(X_train), correct, len(X_train), 465 | 100. * correct / len(X_train))) 466 | 467 | 468 | def evaluate(model, test_idxs, fold, train_idxs): 469 | model.eval() 470 | batch_idx = 1 471 | total_loss = 0 472 | pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor) 473 | X_test = [] 474 | Y_test = [] 475 | for idx in test_idxs: 476 | X_test.append(fuse_features[idx]) 477 | Y_test.append(fuse_targets[idx]) 478 | global max_train_acc, max_acc,max_f1 479 | for i in range(0, len(X_test), config['batch_size']): 480 | if i + config['batch_size'] > len(X_test): 481 | x, y = X_test[i:], Y_test[i:] 482 | else: 483 | x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])] 484 | if config['cuda']: 485 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() 486 | text_feature, audio_feature = model.pretrained_feature(x) 487 | with torch.no_grad(): 488 | # concat_x = torch.cat((audio_feature, text_feature), dim=1) 489 | audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std() 490 | text_feature_norm = (text_feature - text_feature.mean())/text_feature.std() 491 | concat_x = torch.cat((text_feature, audio_feature), dim=1) 492 | output = model(concat_x) 493 | # loss = criterion(output, torch.tensor(y)) 494 | loss = criterion(text_feature, audio_feature, y, model) 495 | pred = torch.cat((pred, output.data.max(1, keepdim=True)[1])) 496 | total_loss += loss.item() 497 | 498 | y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:]) 499 | 500 | print('\nTest set: Average loss: {:.4f}'.format(total_loss/len(X_test))) 501 | # custom evaluation metrics 502 | print('Calculating additional test metrics...') 503 | accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) 504 | precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) 505 | recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) 506 | f1_score = 2 * (precision * recall) / (precision + recall) 507 | print("Accuracy: {}".format(accuracy)) 508 | print("Precision: {}".format(precision)) 509 | print("Recall: {}".format(recall)) 510 | print("F1-Score: {}\n".format(f1_score)) 511 | print('='*89) 512 | 513 | if max_f1 < f1_score and max_train_acc >= len(train_idxs)*0.9 and f1_score > 0.61: 514 | max_f1 = f1_score 515 | max_acc = accuracy 516 | save(model, os.path.join(prefix, 'Model/ClassificationWhole/Fuse/fuse_{:.2f}_{}'.format(max_f1, fold))) 517 | print('*'*64) 518 | print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc)) 519 | print('*'*64) 520 | return total_loss 521 | 522 | if __name__ == '__main__': 523 | idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy'] 524 | text_model_paths = ['BiLSTM_128_0.64_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.62_3.pt'] 525 | audio_model_paths = ['BiLSTM_gru_vlad256_256_0.67_1.pt', 'BiLSTM_gru_vlad256_256_0.67_2.pt', 'BiLSTM_gru_vlad256_256_0.63_3.pt'] 526 | for fold in range(1, 4): 527 | # if fold != 2: 528 | # continue 529 | train_idxs_tmp = np.load(os.path.join(prefix, 'Features/TextWhole/{}'.format(idxs_paths[fold-1])), allow_pickle=True) 530 | test_idxs_tmp = list(set(list(fuse_dep_idxs)+list(fuse_non_idxs)) - set(train_idxs_tmp)) 531 | resample_idxs = list(range(6)) 532 | 533 | train_idxs, test_idxs = [], [] 534 | # depression data augmentation 535 | for idx in train_idxs_tmp: 536 | if idx in fuse_dep_idxs: 537 | feat = fuse_features[idx] 538 | audio_perm = itertools.permutations(feat[0], 3) 539 | text_perm = itertools.permutations(feat[1], 3) 540 | count = 0 541 | for fuse_perm in zip(audio_perm, text_perm): 542 | if count in resample_idxs: 543 | fuse_features.append(fuse_perm) 544 | fuse_targets = np.hstack((fuse_targets, 1)) 545 | train_idxs.append(len(fuse_features)-1) 546 | count += 1 547 | else: 548 | train_idxs.append(idx) 549 | 550 | for idx in test_idxs_tmp: 551 | if idx in fuse_dep_idxs: 552 | feat = fuse_features[idx] 553 | audio_perm = itertools.permutations(feat[0], 3) 554 | text_perm = itertools.permutations(feat[1], 3) 555 | count = 0 556 | resample_idxs = [0,1,4,5] 557 | for fuse_perm in zip(audio_perm, text_perm): 558 | if count in resample_idxs: 559 | fuse_features.append(fuse_perm) 560 | fuse_targets = np.hstack((fuse_targets, 1)) 561 | test_idxs.append(len(fuse_features)-1) 562 | count += 1 563 | else: 564 | test_idxs.append(idx) 565 | 566 | text_lstm_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Text/{}'.format(text_model_paths[fold-1]))) 567 | audio_lstm_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Audio/{}'.format(audio_model_paths[fold-1]))) 568 | model_state_dict = {} 569 | model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] 570 | model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] 571 | model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] 572 | model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] 573 | 574 | model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] 575 | model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] 576 | model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] 577 | model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] 578 | 579 | model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] 580 | model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] 581 | model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] 582 | model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] 583 | 584 | model_state_dict['ln.weight'] = audio_lstm_model.state_dict()['ln.weight'] 585 | model_state_dict['ln.bias'] = audio_lstm_model.state_dict()['ln.bias'] 586 | model.load_state_dict(text_lstm_model.state_dict(), strict=False) 587 | # model.load_state_dict(audio_lstm_model.state_dict(), strict=False) 588 | model.load_state_dict(model_state_dict, strict=False) 589 | 590 | for param in model.parameters(): 591 | param.requires_grad = False 592 | 593 | model.fc_final[0].weight.requires_grad = True 594 | # model.fc_final[0].bias.requires_grad = True 595 | # model.modal_attn.weight.requires_grad = True 596 | 597 | max_f1 = -1 598 | max_acc = -1 599 | max_train_acc = -1 600 | 601 | for ep in range(1, config['epochs']): 602 | train(ep, train_idxs) 603 | tloss = evaluate(model, test_idxs, fold, train_idxs) -------------------------------------------------------------------------------- /DepressionCollected/Classification/text_bilstm_whole.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | import torch.optim as optim 6 | from sklearn.metrics import confusion_matrix 7 | from sklearn.metrics import mean_absolute_error, mean_squared_error 8 | from sklearn.model_selection import train_test_split 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import os 13 | import pickle 14 | import random 15 | import itertools 16 | 17 | prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 18 | text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] 19 | text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] 20 | text_dep_idxs_tmp = np.where(text_targets == 1)[0] 21 | text_non_idxs = np.where(text_targets == 0)[0] 22 | 23 | class TextBiLSTM(nn.Module): 24 | def __init__(self, config): 25 | super(TextBiLSTM, self).__init__() 26 | self.num_classes = config['num_classes'] 27 | self.learning_rate = config['learning_rate'] 28 | self.dropout = config['dropout'] 29 | self.hidden_dims = config['hidden_dims'] 30 | self.rnn_layers = config['rnn_layers'] 31 | self.embedding_size = config['embedding_size'] 32 | self.bidirectional = config['bidirectional'] 33 | 34 | self.build_model() 35 | self.init_weight() 36 | 37 | def init_weight(net): 38 | for name, param in net.named_parameters(): 39 | if 'ln' not in name: 40 | if 'bias' in name: 41 | nn.init.constant_(param, 0.0) 42 | elif 'weight' in name: 43 | nn.init.xavier_uniform_(param) 44 | 45 | def build_model(self): 46 | # attention layer 47 | self.attention_layer = nn.Sequential( 48 | nn.Linear(self.hidden_dims, self.hidden_dims), 49 | nn.ReLU(inplace=True) 50 | ) 51 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 52 | 53 | # 双层lstm 54 | self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, 55 | num_layers=self.rnn_layers, dropout=self.dropout, 56 | bidirectional=self.bidirectional) 57 | 58 | # FC层 59 | # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) 60 | self.fc_out = nn.Sequential( 61 | # nn.Dropout(self.dropout), 62 | nn.Linear(self.hidden_dims, self.hidden_dims), 63 | nn.ReLU(), 64 | nn.Dropout(self.dropout), 65 | nn.Linear(self.hidden_dims, self.num_classes), 66 | # nn.ReLU(), 67 | nn.Softmax(dim=1), 68 | ) 69 | 70 | self.ln1 = nn.LayerNorm(self.embedding_size) 71 | self.ln2 = nn.LayerNorm(self.hidden_dims) 72 | 73 | 74 | def attention_net_with_w(self, lstm_out, lstm_hidden): 75 | ''' 76 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 77 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 78 | :return: [batch_size, n_hidden] 79 | ''' 80 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 81 | # h [batch_size, time_step, hidden_dims] 82 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 83 | # h = lstm_out 84 | # [batch_size, num_layers * num_directions, n_hidden] 85 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 86 | # [batch_size, 1, n_hidden] 87 | lstm_hidden = lstm_hidden.unsqueeze(1) 88 | # atten_w [batch_size, 1, hidden_dims] 89 | atten_w = self.attention_layer(lstm_hidden) 90 | # m [batch_size, time_step, hidden_dims] 91 | m = nn.Tanh()(h) 92 | # atten_context [batch_size, 1, time_step] 93 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 94 | # softmax_w [batch_size, 1, time_step] 95 | softmax_w = F.softmax(atten_context, dim=-1) 96 | # context [batch_size, 1, hidden_dims] 97 | context = torch.bmm(softmax_w, h) 98 | result = context.squeeze(1) 99 | return result 100 | 101 | def forward(self, x): 102 | # x : [len_seq, batch_size, embedding_dim] 103 | x = x.permute(1, 0, 2) 104 | # x = self.ln1(x) 105 | output, (final_hidden_state, _) = self.lstm_net(x) 106 | # output : [batch_size, len_seq, n_hidden * 2] 107 | output = output.permute(1, 0, 2) 108 | # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] 109 | final_hidden_state = final_hidden_state.permute(1, 0, 2) 110 | # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) 111 | # atten_out = self.attention_net(output, final_hidden_state) 112 | atten_out = self.attention_net_with_w(output, final_hidden_state) 113 | # atten_out = self.ln2(atten_out) 114 | return self.fc_out(atten_out) 115 | 116 | def save(model, filename): 117 | save_filename = '{}.pt'.format(filename) 118 | torch.save(model, save_filename) 119 | print('Saved as %s' % save_filename) 120 | 121 | def standard_confusion_matrix(y_test, y_test_pred): 122 | """ 123 | Make confusion matrix with format: 124 | ----------- 125 | | TP | FP | 126 | ----------- 127 | | FN | TN | 128 | ----------- 129 | Parameters 130 | ---------- 131 | y_true : ndarray - 1D 132 | y_pred : ndarray - 1D 133 | 134 | Returns 135 | ------- 136 | ndarray - 2D 137 | """ 138 | [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) 139 | return np.array([[tp, fp], [fn, tn]]) 140 | 141 | def model_performance(y_test, y_test_pred_proba): 142 | """ 143 | Evaluation metrics for network performance. 144 | """ 145 | y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] 146 | 147 | # Computing confusion matrix for test dataset 148 | conf_matrix = standard_confusion_matrix(y_test, y_test_pred) 149 | print("Confusion Matrix:") 150 | print(conf_matrix) 151 | 152 | return y_test_pred, conf_matrix 153 | 154 | def train(epoch, train_idxs): 155 | global lr, train_acc 156 | model.train() 157 | batch_idx = 1 158 | total_loss = 0 159 | correct = 0 160 | X_train = text_features[train_idxs] 161 | Y_train = text_targets[train_idxs] 162 | for i in range(0, X_train.shape[0], config['batch_size']): 163 | if i + config['batch_size'] > X_train.shape[0]: 164 | x, y = X_train[i:], Y_train[i:] 165 | else: 166 | x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( 167 | i + config['batch_size'])] 168 | if config['cuda']: 169 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() 170 | else: 171 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ 172 | Variable(torch.from_numpy(y)) 173 | 174 | # 将模型的参数梯度设置为0 175 | optimizer.zero_grad() 176 | output = model(x) 177 | pred = output.data.max(1, keepdim=True)[1] 178 | #print(pred.shape, y.shape) 179 | correct += pred.eq(y.data.view_as(pred)).cpu().sum() 180 | loss = criterion(output, y) 181 | # 后向传播调整参数 182 | loss.backward() 183 | # 根据梯度更新网络参数 184 | optimizer.step() 185 | batch_idx += 1 186 | # loss.item()能够得到张量中的元素值 187 | total_loss += loss.item() 188 | 189 | train_acc = correct 190 | print( 191 | 'Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n ' 192 | .format(epoch + 1, config['learning_rate'], total_loss, correct, 193 | X_train.shape[0], 100. * correct / X_train.shape[0])) 194 | 195 | 196 | def evaluate(model, test_idxs, fold, train_idxs): 197 | model.eval() 198 | batch_idx = 1 199 | total_loss = 0 200 | global max_f1, max_acc, min_mae, X_test_lens, max_prec, max_rec 201 | pred = np.array([]) 202 | with torch.no_grad(): 203 | if config['cuda']: 204 | x, y = Variable(torch.from_numpy(text_features[test_idxs]).type(torch.FloatTensor), requires_grad=True).cuda(),\ 205 | Variable(torch.from_numpy(text_targets[test_idxs])).cuda() 206 | else: 207 | x, y = Variable(torch.from_numpy(text_features[test_idxs]).type(torch.FloatTensor), requires_grad=True), \ 208 | Variable(torch.from_numpy(text_targets[test_idxs])).type(torch.LongTensor) 209 | 210 | optimizer.zero_grad() 211 | output = model(x) 212 | loss = criterion(output, y) 213 | total_loss += loss.item() 214 | y_test_pred, conf_matrix = model_performance(y, output.cpu()) 215 | accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) 216 | precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) 217 | recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) 218 | f1_score = 2 * (precision * recall) / (precision + recall) 219 | print("Accuracy: {}".format(accuracy)) 220 | print("Precision: {}".format(precision)) 221 | print("Recall: {}".format(recall)) 222 | print("F1-Score: {}\n".format(f1_score)) 223 | print('=' * 89) 224 | 225 | if max_f1 <= f1_score and train_acc > len(train_idxs)*0.9 and f1_score > 0.5: 226 | max_f1 = f1_score 227 | max_acc = accuracy 228 | max_rec = recall 229 | max_prec = precision 230 | save(model, os.path.join(prefix, 'Model/ClassificationWhole/Text/BiLSTM_{}_{:.2f}_{}'.format(config['hidden_dims'], max_f1, fold))) 231 | print('*' * 64) 232 | print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc)) 233 | print('*' * 64) 234 | 235 | return total_loss 236 | 237 | def get_param_group(model): 238 | nd_list = [] 239 | param_list = [] 240 | for name, param in model.named_parameters(): 241 | if 'ln' in name: 242 | nd_list.append(param) 243 | else: 244 | param_list.append(param) 245 | return [{'params': param_list, 'weight_decay': 1e-5}, {'params': nd_list, 'weight_decay': 0}] 246 | 247 | config = { 248 | 'num_classes': 2, 249 | 'dropout': 0.5, 250 | 'rnn_layers': 2, 251 | 'embedding_size': 1024, 252 | 'batch_size': 4, 253 | 'epochs': 150, 254 | 'learning_rate': 1e-5, 255 | 'hidden_dims': 128, 256 | 'bidirectional': True, 257 | 'cuda': False, 258 | } 259 | 260 | train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), 261 | np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_2.npy'), allow_pickle=True), 262 | np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] 263 | fold = 1 264 | 265 | for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): 266 | # if idx_idx != 2: 267 | # continue 268 | test_idxs_tmp = list(set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp)) 269 | train_idxs, test_idxs = [], [] 270 | # depression data augmentation 271 | for idx in train_idxs_tmp: 272 | if idx in text_dep_idxs_tmp: 273 | feat = text_features[idx] 274 | count = 0 275 | resample_idxs = [0,1,2,3,4,5] 276 | for i in itertools.permutations(feat, feat.shape[0]): 277 | if count in resample_idxs: 278 | text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) 279 | text_targets = np.hstack((text_targets, 1)) 280 | train_idxs.append(len(text_features)-1) 281 | count += 1 282 | else: 283 | train_idxs.append(idx) 284 | 285 | for idx in test_idxs_tmp: 286 | if idx in text_dep_idxs_tmp: 287 | feat = text_features[idx] 288 | count = 0 289 | # resample_idxs = random.sample(range(6), 4) 290 | resample_idxs = [0,1,4,5] 291 | for i in itertools.permutations(feat, feat.shape[0]): 292 | if count in resample_idxs: 293 | text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) 294 | text_targets = np.hstack((text_targets, 1)) 295 | test_idxs.append(len(text_features)-1) 296 | count += 1 297 | else: 298 | test_idxs.append(idx) 299 | 300 | model = TextBiLSTM(config) 301 | 302 | param_group = get_param_group(model) 303 | optimizer = optim.AdamW(param_group, lr=config['learning_rate']) 304 | criterion = nn.CrossEntropyLoss() 305 | max_f1 = -1 306 | max_acc = -1 307 | max_rec = -1 308 | max_prec = -1 309 | train_acc = -1 310 | 311 | for ep in range(1, config['epochs']): 312 | train(ep, train_idxs) 313 | tloss = evaluate(model, test_idxs, fold, train_idxs) 314 | fold += 1 -------------------------------------------------------------------------------- /DepressionCollected/Classification/text_features_whole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import wave 4 | import librosa 5 | import re 6 | # from allennlp.commands.elmo import ElmoEmbedder 7 | import os 8 | prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) 9 | from elmoformanylangs import Embedder 10 | import pkuseg 11 | import thulac 12 | # from pyhanlp import HanLP 13 | import jieba 14 | # seg = pkuseg.pkuseg() 15 | # thu1 = thulac.thulac(seg_only=True) 16 | elmo = Embedder('/Users/linlin/Desktop/SpeechRecognition/DepressionCode/ELMoForManyLangs/zhs.model') 17 | 18 | topics = ['positive', 'neutral', 'negative'] 19 | answers = {} 20 | text_features = [] 21 | text_targets = [] 22 | 23 | def extract_features(text_features, text_targets, path): 24 | for index in range(114): 25 | if os.path.isdir(os.path.join(prefix, path, str(index+1))): 26 | answers[index+1] = [] 27 | for topic in topics: 28 | with open(os.path.join(prefix, path, str(index+1), '%s.txt'%(topic)) ,'r') as f: 29 | lines = f.readlines()[0] 30 | # seg_text = seg.cut(lines) 31 | # seg_text = thu1.cut(lines) 32 | # seg_text_iter = HanLP.segment(lines) 33 | seg_text_iter = jieba.cut(lines, cut_all=False) 34 | answers[index+1].append([item for item in seg_text_iter]) 35 | # answers[dir].append(seg_text) 36 | with open(os.path.join(prefix, '{1}/{0}/new_label.txt'.format(index+1, path))) as fli: 37 | target = float(fli.readline()) 38 | # text_targets.append(1 if target >= 53 else 0) 39 | text_targets.append(target) 40 | text_features.append([np.array(item).mean(axis=0) for item in elmo.sents2elmo(answers[index+1])]) 41 | 42 | extract_features(text_features, text_targets, 'Data') 43 | extract_features(text_features, text_targets, 'ValidationData') 44 | 45 | print("Saving npz file locally...") 46 | np.savez(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'), text_features) 47 | np.savez(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'), text_targets) 48 | -------------------------------------------------------------------------------- /DepressionCollected/DAICFeatureExtarction/feature_extraction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append('/Users/linlin/Desktop/DepressionCollected') 4 | from Classification.audio_features_whole import wav2vlad 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import wave 9 | 10 | prefix = os.getcwd() 11 | train_split_df = pd.read_csv(os.path.join(prefix, 'DAIC/train_split_Depression_AVEC2017.csv')) 12 | test_split_df = pd.read_csv(os.path.join(prefix, 'DAIC/dev_split_Depression_AVEC2017.csv')) 13 | train_split_num = train_split_df[['Participant_ID']]['Participant_ID'].tolist() 14 | test_split_num = test_split_df[['Participant_ID']]['Participant_ID'].tolist() 15 | train_split_clabel = train_split_df[['PHQ8_Binary']]['PHQ8_Binary'].tolist() 16 | test_split_clabel = test_split_df[['PHQ8_Binary']]['PHQ8_Binary'].tolist() 17 | train_split_rlabel = train_split_df[['PHQ8_Score']]['PHQ8_Score'].tolist() 18 | test_split_rlabel = test_split_df[['PHQ8_Score']]['PHQ8_Score'].tolist() 19 | 20 | with open('./queries.txt') as f: 21 | queries = f.readlines() 22 | 23 | def identify_topics(sentence): 24 | for query in queries: 25 | query = query.strip('\n') 26 | sentence = sentence.strip('\n') 27 | if query == sentence: 28 | return True 29 | return False 30 | 31 | def extract_features(number): 32 | transcript = pd.read_csv(os.path.join(prefix, 'DAIC/{0}_P/{0}_TRANSCRIPT.csv'.format(number)), sep='\t').fillna('') 33 | 34 | wavefile = wave.open(os.path.join(prefix, 'DAIC/{0}_P/{0}_AUDIO.wav'.format(number, 'r'))) 35 | sr = wavefile.getframerate() 36 | nframes = wavefile.getnframes() 37 | wave_data = np.frombuffer(wavefile.readframes(nframes), dtype=np.short) 38 | 39 | response = '' 40 | start_time = 0 41 | stop_time = 0 42 | feats = [] 43 | signal = [] 44 | 45 | for t in transcript.itertuples(): 46 | # 问题开始 47 | if getattr(t,'speaker') == 'Ellie' and (identify_topics(getattr(t,'value')) or 'i think i have asked everything' in getattr(t,'value')): 48 | # 初始化 49 | response = '' 50 | if len(signal) == 0: 51 | continue 52 | feats.append(wav2vlad(signal, sr)) 53 | signal = [] 54 | elif getattr(t,'speaker') == 'Participant': 55 | if 'scrubbed_entry' in getattr(t,'value'): 56 | continue 57 | start_time = int(getattr(t,'start_time')*sr) 58 | stop_time = int(getattr(t,'stop_time')*sr) 59 | response += (' ' + getattr(t,'value')) 60 | signal = np.hstack((signal, wave_data[start_time:stop_time].astype(np.float))) 61 | 62 | print(np.shape(feats)) 63 | print('{}_P feature done'.format(number)) 64 | return feats 65 | 66 | # training set 67 | audio_features_train = [] 68 | audio_ctargets_train = [] 69 | audio_rtargets_train = [] 70 | 71 | # test set 72 | audio_features_test = [] 73 | audio_ctargets_test = [] 74 | audio_rtargets_test = [] 75 | 76 | # training set 77 | for index in range(len(train_split_num)): 78 | feat = extract_features(train_split_num[index]) 79 | audio_features_train.append(feat) 80 | audio_ctargets_train.append(train_split_clabel[index]) 81 | audio_rtargets_train.append(train_split_rlabel[index]) 82 | 83 | print("Saving npz file locally...") 84 | np.savez(os.path.join(prefix, 'DAICCode/Features/train_samples_clf.npz'), audio_features_train) 85 | np.savez(os.path.join(prefix, 'DAICCode/Features/train_samples_reg.npz'), audio_features_train) 86 | np.savez(os.path.join(prefix, 'DAICCode/Features/train_labels_clf.npz'), audio_ctargets_train) 87 | np.savez(os.path.join(prefix, 'DAICCode/Features/train_labels_reg.npz'), audio_rtargets_train) 88 | 89 | # test set 90 | for index in range(len(test_split_num)): 91 | feat = extract_features(test_split_num[index]) 92 | audio_features_test.append(feat) 93 | audio_ctargets_test.append(test_split_clabel[index]) 94 | audio_rtargets_test.append(test_split_rlabel[index]) 95 | 96 | print("Saving npz file locally...") 97 | np.savez(os.path.join(prefix, 'DAICCode/Features/test_samples_clf.npz'), audio_features_test) 98 | np.savez(os.path.join(prefix, 'DAICCode/Features/test_samples_reg.npz'), audio_features_test) 99 | np.savez(os.path.join(prefix, 'DAICCode/Features/test_labels_clf.npz'), audio_ctargets_test) 100 | np.savez(os.path.join(prefix, 'DAICCode/Features/test_labels_reg.npz'), audio_rtargets_test) 101 | -------------------------------------------------------------------------------- /DepressionCollected/DAICFeatureExtarction/queries.txt: -------------------------------------------------------------------------------- 1 | how are you doing today 2 | where are you from originally 3 | why'd you move to l_a 4 | how do you like l_a 5 | what are some things you really like about l_a 6 | how easy was it for you to get used to living in l_a 7 | what are some things you don't really like about l_a 8 | what'd you study at school 9 | are you still doing that 10 | what's your dream job 11 | do you travel a lot 12 | why 13 | how often do you go back to your hometown 14 | do you consider yourself an introvert 15 | what do you do to relax 16 | how are you at controlling your temper 17 | when was the last time you argued with someone and what was it about 18 | how did you feel in that moment 19 | tell me more about that 20 | how close are you to them 21 | how do you know them 22 | what are some things you like to do for fun 23 | who's someone that's been a positive influence in your life 24 | can you tell me about that 25 | how close are you to your family 26 | is there anything you regret 27 | what made you decide to do that 28 | could you have done anything to avoid it 29 | what's one of your most memorable experiences 30 | what's it like for you living with them 31 | how do you like your living situation 32 | do you have roommates 33 | how easy is it for you to get a good night's sleep 34 | do you feel that way often 35 | what are you like when you don't sleep well 36 | do you feel down 37 | have you been diagnosed with depression 38 | have you ever been diagnosed with p_t_s_d 39 | have you ever served in the military 40 | when was the last time you felt really happy 41 | what do you think of today's kids 42 | can you give me an example of that 43 | what do you do when you're annoyed 44 | when was the last time that happened 45 | how would your best friend describe you 46 | where do you live 47 | how hard is that 48 | what do you do now 49 | are you happy you did that 50 | what are some things that make you really mad 51 | what do you do to relax 52 | like what 53 | are you still working in that 54 | can you give me an example of that 55 | do you feel down 56 | like what 57 | how do you cope with them 58 | have you noticed any changes in your behavior or thoughts lately 59 | do you have disturbing thoughts 60 | how easy is it for you to get a good night sleep 61 | what do you enjoy about traveling 62 | i'd love to hear about one of your trips 63 | what advice would you give yourself ten or twenty years ago 64 | what are some things you really like about l_a 65 | how are you at controlling your temper 66 | has that gotten you in trouble 67 | do you find it easy to be a parent 68 | what's the hardest thing about being a parent 69 | tell me about your kids 70 | what's one of your most memorable experiences 71 | how did you feel in that moment 72 | have you ever served in the military 73 | have you been diagnosed with depression 74 | how would you best friend describe you 75 | what'd you study at school 76 | nice are you still doing that 77 | what are some things that make you really mad 78 | could you have done anything to avoid it 79 | could you say a little more about that 80 | when was the last time you argued with someone and what was it about 81 | do you travel a lot 82 | when was the last time that happened 83 | have you ever been diagnosed with p_t_s_d 84 | how would your best friend describe you 85 | when was the last time you felt really happy 86 | how did you decide to do that 87 | okay could you have done anything to avoid it 88 | do you feel like therapy is useful 89 | did you think you had a problem before you found out 90 | how has seeing a therapist affected you 91 | what sort of changes have you noticed since you've been going to therapy 92 | why did you stop 93 | who's someone that's been a positive influence in your life 94 | when did you move to l_a 95 | how often do you go back to your home town 96 | what got you to seek help 97 | what were your symptoms 98 | yeah what do you enjoy about traveling 99 | okay what's the best thing about being a parent 100 | when was the last time you argued with someone and what was it about 101 | could you say a little more about that 102 | how long ago were you diagnosed 103 | so how are you doing today 104 | could you say a little more about that 105 | do you still go to therapy now 106 | do you feel like therapy's useful 107 | have you noticed any changes in your behavior or thoughts lately 108 | tell me about that 109 | what would you say are some of your best qualities 110 | what are some things that usually put you in a good mood 111 | what are you most proud of in your life 112 | how does it compare to l_a 113 | tell me about something you did recently that you really enjoyed 114 | is going to a therapist helping you 115 | how have you been feeling lately 116 | are they triggered by something 117 | what's the best thing about being a parent 118 | why'd you decide to enlist in the military 119 | how old were you when you joined the military 120 | how did serving in the military change you 121 | what did you do after the military 122 | when'd you move to l_a 123 | how has seeing a therapist affected you 124 | who's someone that's been a positive influence in your life 125 | what are some things you like to do for fun who's someone that's been a positive influence in your life 126 | what was it about 127 | do you think that maybe you're being a little hard on yourself 128 | so how are you doing today 129 | where are you from originally 130 | how easy was it for you to get used to living in l_a 131 | what are some things you don't really like about l_a 132 | how often to you go back to your home town 133 | why 134 | how close are you to your family 135 | do you travel a lot 136 | what do you enjoy about traveling 137 | i'd love to hear about one of your trips 138 | do you consider yourself an introvert 139 | can you give me an example of that 140 | what do you do when you're annoyed 141 | what do you do to relax 142 | what's your dream job 143 | how long ago were you diagnosed 144 | what got you to seek help 145 | do you feel like therapy's useful 146 | do you still go to therapy now 147 | what sort of changes have you noticed since you've been going to therapy 148 | how have you been feeling lately 149 | tell me more about that 150 | what would you say are some of your best qualities 151 | what are some things that usually put you in a good mood 152 | when was the last time you felt really happy 153 | who's someone that's been a positive influence in your life 154 | how do you know them 155 | how close are you to them 156 | what are you most proud of in your life 157 | are you still doing that 158 | do you consider yourself an introvert 159 | do you feel that way often 160 | how do you like your living situation 161 | do you have roommates 162 | how easy is it for you to get a good night's sleep 163 | what are you like when you don't sleep well 164 | what advice would you give yourself ten or twenty years ago 165 | how close are you to your family 166 | tell me about something you did recently that you really enjoyed 167 | what are some things that usually put you in a good mood 168 | why why 169 | what made you decide to go and see someone 170 | okay so how are you doing today 171 | why'd you move to l_a 172 | how often do you go back to your hometown 173 | how did you decide to do that 174 | is there anything you regret 175 | could you have done anything to avoid it 176 | how easy is it for you to get a good night's sleep 177 | do you find it easy to be a parent 178 | what's the best thing about being a parent 179 | what's the hardest thing about being a parent 180 | and please feel free to tell me anything you answers are totally confidential 181 | and please feel free to tell me anything you're answers are totally confidential 182 | what made you decide to do that 183 | what advice would you give yourself ten or twenty years ago 184 | what do you think of today's kids 185 | tell me about that 186 | how hard is that 187 | can you tell me about that 188 | so how are you doing today 189 | are you still working in that 190 | what are some things you like to do for fun 191 | that's good where are you from originally 192 | when was the last time you argued with someone and what was it about 193 | where do you live 194 | did you think you had a problem before you found out 195 | what were your symptoms 196 | why did you stop 197 | okay so how are you doing today 198 | what do you do now 199 | are you happy you did that 200 | are they triggered by something 201 | how do you cope with them 202 | has that gotten you in trouble 203 | what are you 204 | what are some things that make you really mad 205 | how has seeing a therapist affected you 206 | yeah how hard is that 207 | mhm what are some things you don't really like about l_a 208 | mhm how did you decide to do that 209 | how close are you to your family do you find it easy to be a parent 210 | that's good what do you think of today's kids 211 | awesome how did you decide to do that 212 | uh huh uh huh uh huh is there anything you regret is there anything you regret 213 | how old were you when you joined the military 214 | did you ever see combat 215 | how did serving in the military change you 216 | what did you do after the military 217 | how easy was it for you to go back to civilian life 218 | is going to a therapist helping you 219 | that's good where are you from originally 220 | tell me about your kids 221 | yeah how hard is that 222 | do you think that maybe you're being a little hard on yourself 223 | do you consider yourself and introvert 224 | how often do you go back to your home town 225 | how_doingV (so how are you doing today) 226 | where_originally (where are you from originally) 227 | like_about_LA (what are some things you really like about l_a) 228 | dont_like_LA (what are some things you don't really like about l_a) 229 | study (what did you study at school) 230 | still_doing_X (are you still doing that) 231 | change_directions (what made you decide to do that) 232 | happy_didthat (are you happy you did that) 233 | job_virtually (i love my job you could almost say it's virtually made for me what's your dream job) 234 | shyoutgoing (do you consider yourself more shy or outgoing) 235 | tell_about_that (can you tell me about that) 236 | relax_fishtank (sometimes when i'm feeling tense i turn on the fish tank screensaver hey i know it's not hawaii but it's the best i've got what do you do to relax) 237 | control_temper (how are you at controlling your temper) 238 | last_argument (when was the last time you argued with someone and what was it about) 239 | hard_decisionB (tell me about the hardest decision you've ever had to make) 240 | family_relationship (tell me about your relationship with your family) 241 | feelguilty (what's something you feel guilty about) 242 | give_example (can you give me an example of that) 243 | describe_felt (how did you feel in that moment) 244 | ptsd_diagnosed (have you ever been diagnosed with p_t_s_d) 245 | depression_diagnosed (have you been diagnosed with depression) 246 | easy_sleep (how easy is it for you to get a good night's sleep) 247 | feel_down (do you feel down) 248 | behavior_changes (have you noticed any changes in your behavior or thoughts lately) 249 | happy_lasttime (tell me about the last time you felt really happy) 250 | self_change (what are some things you wish you could change about yourself) 251 | symptoms_cope (how do you cope with them) 252 | regret (is there anything you regret) 253 | advice_back (what advice would you give to yourself ten or twenty years ago) 254 | Ellie17Dec2012_08 (what are you most proud of in your life) 255 | difficult (how hard is that) 256 | BF_describe (how would your best friend describe you) 257 | ideal_weekendC (tell me how you spend your ideal weekend) 258 | asked_everything (okay i think i have asked everything i need to) 259 | travel_shoes (i'm sure you can tell by my shoes i'm not much of a world explorer do you travel a lot) 260 | like_what (like what) 261 | travel_trips (i'd love to hear about one of your trips) 262 | still_working_on_X (are you still working in that) 263 | dream_job (what's your dream job) 264 | situation_handled (tell me about a situation that you wish you had handled differently) 265 | why_enlist (why'd you decide to enlist in the military) 266 | old (how old were you when you joined the military) 267 | combat (did you ever see combat) 268 | why2 (why) 269 | effectB (how did serving in the military change you) 270 | after (what did you do after the military) 271 | civilian_life (how easy was it for you to go back to civilian life) 272 | feel_lately (how have you been feeling lately) 273 | therapy_useful (do you feel like therapy is useful) 274 | why_seek_help (what got you to seek help) 275 | therapy_going (do you still go to therapy now) 276 | therapist_affect (how has seeing a therapist affected you) 277 | landed_trouble (has that gotten you in trouble) 278 | when_LA (when did you move to l_a) 279 | often_backB (how often do you go back to your hometown) 280 | compares_LA (how does it compare to l_a) 281 | why_LA (why did you move to l_a) 282 | adapted_LA (how easy was it for you to get used to living in l_a) 283 | hard_decision (how did you decide to do that) 284 | easy_parent (do you find it easy to be a parent) 285 | parent_hardest (what's the hardest thing about being a parent) 286 | parent_best (what's the best thing about being a parent) 287 | parent_differences (what are some ways that you're different as a parent than your parents) 288 | military (have you ever served in the military) 289 | too_hard (do you think that maybe you're being a little hard on yourself) 290 | Ellie17Dec2012_07 (what would you say are some of your best qualities) 291 | memorableB (what's one of your most memorable experiences) 292 | travel_changed (what do you enjoy about traveling) 293 | memory_erase (tell me about an event or something that you wish you could erase from your memory) 294 | bouts_symptoms (when was the last time that happened) 295 | argument_about (what was it about) 296 | avoid (could you have done anything to avoid it) 297 | trigger (are they triggered by something) 298 | sleep_affects (what are you like when you don't sleep well) 299 | when_diagnosed (how long ago were you diagnosed) 300 | therapy_changes (what sort of changes have you noticed since you've been going to therapy) 301 | feelbadly (tell me about a time when someone made you feel really badly about yourself) 302 | more (tell me more about that) 303 | disturbing_thoughts (do you have disturbing thoughts) 304 | Ellie17Dec2012_10 (tell me about something you did recently that you really enjoyed) 305 | Ellie17Dec2012_09 (what are some things that usually put you in a good mood) 306 | do_fun (what are some things you like to do for fun) 307 | influence_positive (who's someone that's been a positive influence in your life) 308 | how_close (how close are you to them) 309 | tell_me_about (tell me about that) 310 | suspect_problem (did you think you had a problem before you found out) 311 | symptoms_what (what were your symptoms) 312 | how_know (how do you know them) 313 | therapist_useful (is going to a therapist helping you) 314 | stop_going (why did you stop) 315 | mad_makeyou (what are some things that make you really mad) 316 | where_live (where do you live) 317 | roommates (do you have roommates) 318 | living_situation (how do you like your living situation) 319 | what_do_when_annoyed (what do you do when you are annoyed) 320 | elaborate (could you say a little more about that) 321 | family_roleB (how close are you to your family) 322 | todays_kids (what do you think of today's kids) 323 | tell_me_moreV2 (can you tell me more about that) 324 | kids_elaborate (tell me about your kids) -------------------------------------------------------------------------------- /DepressionCollected/Regression/AudioModelChecking.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | import torch.optim as optim 6 | from sklearn.metrics import confusion_matrix 7 | from sklearn.metrics import mean_absolute_error, mean_squared_error 8 | from sklearn.model_selection import train_test_split 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import os 13 | import pickle 14 | import random 15 | import itertools 16 | 17 | 18 | prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) 19 | audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_256.npz'))['arr_0'], axis=2) 20 | audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_256.npz'))['arr_0'] 21 | 22 | audio_dep_idxs = np.where(audio_targets >= 53)[0] 23 | audio_non_idxs = np.where(audio_targets < 53)[0] 24 | dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) 25 | non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) 26 | 27 | config = { 28 | 'num_classes': 1, 29 | 'dropout': 0.5, 30 | 'rnn_layers': 2, 31 | 'embedding_size': 256, 32 | 'batch_size': 4, 33 | 'epochs': 100, 34 | 'learning_rate': 5e-5, 35 | 'hidden_dims': 256, 36 | 'bidirectional': False, 37 | 'cuda': False 38 | } 39 | 40 | class AudioBiLSTM(nn.Module): 41 | def __init__(self, config): 42 | super(AudioBiLSTM, self).__init__() 43 | self.num_classes = config['num_classes'] 44 | self.learning_rate = config['learning_rate'] 45 | self.dropout = config['dropout'] 46 | self.hidden_dims = config['hidden_dims'] 47 | self.rnn_layers = config['rnn_layers'] 48 | self.embedding_size = config['embedding_size'] 49 | self.bidirectional = config['bidirectional'] 50 | 51 | self.build_model() 52 | 53 | def init_weight(net): 54 | for name, param in net.named_parameters(): 55 | if 'bias' in name: 56 | nn.init.constant_(param, 0.0) 57 | elif 'weight' in name: 58 | nn.init.xavier_uniform_(param) 59 | 60 | def build_model(self): 61 | # attention layer 62 | self.attention_layer = nn.Sequential( 63 | nn.Linear(self.hidden_dims, self.hidden_dims), 64 | nn.ReLU(inplace=True)) 65 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 66 | 67 | self.lstm_net_audio = nn.GRU(self.embedding_size, 68 | self.hidden_dims, 69 | num_layers=self.rnn_layers, 70 | dropout=self.dropout, 71 | bidirectional=self.bidirectional, 72 | batch_first=True) 73 | # self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, 74 | # num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) 75 | 76 | self.bn = nn.BatchNorm1d(3) 77 | 78 | # FC层 79 | self.fc_audio = nn.Sequential( 80 | nn.Dropout(self.dropout), 81 | nn.Linear(self.hidden_dims, self.hidden_dims), 82 | nn.ReLU(), 83 | nn.Dropout(self.dropout), 84 | nn.Linear(self.hidden_dims, self.num_classes), 85 | nn.ReLU(), 86 | # nn.Softmax(dim=1) 87 | ) 88 | 89 | def attention_net_with_w(self, lstm_out, lstm_hidden): 90 | ''' 91 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 92 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 93 | :return: [batch_size, n_hidden] 94 | ''' 95 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 96 | # h [batch_size, time_step, hidden_dims] 97 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 98 | # h = lstm_out 99 | # [batch_size, num_layers * num_directions, n_hidden] 100 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 101 | # [batch_size, 1, n_hidden] 102 | lstm_hidden = lstm_hidden.unsqueeze(1) 103 | # atten_w [batch_size, 1, hidden_dims] 104 | atten_w = self.attention_layer(lstm_hidden) 105 | # m [batch_size, time_step, hidden_dims] 106 | m = nn.Tanh()(h) 107 | # atten_context [batch_size, 1, time_step] 108 | # print(atten_w.shape, m.transpose(1, 2).shape) 109 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 110 | # softmax_w [batch_size, 1, time_step] 111 | softmax_w = F.softmax(atten_context, dim=-1) 112 | # context [batch_size, 1, hidden_dims] 113 | context = torch.bmm(softmax_w, h) 114 | result = context.squeeze(1) 115 | return result 116 | 117 | def forward(self, x): 118 | x, _ = self.lstm_net_audio(x) 119 | # x = self.bn(x) 120 | x = x.sum(dim=1) 121 | out = self.fc_audio(x) 122 | return out 123 | 124 | def save(model, filename): 125 | save_filename = '{}.pt'.format(filename) 126 | torch.save(model, save_filename) 127 | print('Saved as %s' % save_filename) 128 | 129 | def evaluate(fold, model): 130 | model.eval() 131 | batch_idx = 1 132 | total_loss = 0 133 | global min_mae, min_rmse, test_dep_idxs, test_non_idxs 134 | pred = np.array([]) 135 | X_test = audio_features[list(test_dep_idxs)+list(test_non_idxs)] 136 | Y_test = audio_targets[list(test_dep_idxs)+list(test_non_idxs)] 137 | with torch.no_grad(): 138 | if config['cuda']: 139 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ 140 | Variable(torch.from_numpy(Y_test)).cuda() 141 | else: 142 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ 143 | Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) 144 | 145 | optimizer.zero_grad() 146 | output = model(x) 147 | loss = criterion(output, y.view_as(output)) 148 | total_loss += loss.item() 149 | pred = output.flatten().detach().numpy() 150 | 151 | mae = mean_absolute_error(Y_test, pred) 152 | rmse = np.sqrt(mean_squared_error(Y_test, pred)) 153 | 154 | print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 155 | print('='*89) 156 | fold = 2 157 | audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Regression/Audio%d/gru_vlad256_256_8.25.pt'%(fold+1))) 158 | model = AudioBiLSTM(config) 159 | # model_state_dict = {} 160 | # model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] 161 | # model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] 162 | # model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] 163 | # model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] 164 | 165 | # model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] 166 | # model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] 167 | # model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] 168 | # model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] 169 | 170 | # model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] 171 | # model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] 172 | # model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] 173 | # model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] 174 | model_state_dict = audio_lstm_model.state_dict() 175 | model.load_state_dict(model_state_dict, strict=True) 176 | 177 | test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] 178 | test_non_idxs = non_idxs[fold*44:(fold+1)*44] 179 | train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) 180 | train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) 181 | 182 | # training data augmentation 183 | train_dep_idxs = [] 184 | for (i, idx) in enumerate(train_dep_idxs_tmp): 185 | feat = audio_features[idx] 186 | if i < 14: 187 | for i in itertools.permutations(feat, feat.shape[0]): 188 | audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 189 | audio_targets = np.hstack((audio_targets, audio_targets[idx])) 190 | train_dep_idxs.append(len(audio_features)-1) 191 | else: 192 | train_dep_idxs.append(idx) 193 | 194 | # test data augmentation 195 | # test_dep_idxs = [] 196 | # for idx in test_dep_idxs_tmp: 197 | # feat = audio_features[idx] 198 | # for i in itertools.permutations(feat, feat.shape[0]): 199 | # audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 200 | # audio_targets = np.hstack((audio_targets, audio_targets[idx])) 201 | # test_dep_idxs.append(len(audio_features)-1) 202 | test_dep_idxs = test_dep_idxs_tmp 203 | 204 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 205 | criterion = nn.SmoothL1Loss() 206 | # criterion = FocalLoss(class_num=2) 207 | # evaluate(fold, model) 208 | evaluate(fold, model) 209 | -------------------------------------------------------------------------------- /DepressionCollected/Regression/audio_bilstm_perm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | import torch.optim as optim 6 | from sklearn.metrics import confusion_matrix 7 | from sklearn.metrics import mean_absolute_error, mean_squared_error 8 | from sklearn.model_selection import train_test_split 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import os 13 | import pickle 14 | import random 15 | import itertools 16 | 17 | prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) 18 | audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_256.npz'))['arr_0'], axis=2) 19 | audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_256.npz'))['arr_0'] 20 | 21 | # audio_dep_idxs = np.where(audio_targets >= 53)[0] 22 | # audio_non_idxs = np.where(audio_targets < 53)[0] 23 | # dep_orders = random.sample(range(len(audio_dep_idxs)), len(audio_dep_idxs)) 24 | # non_orders = random.sample(range(len(audio_non_idxs)), len(audio_non_idxs)) 25 | # dep_idxs = audio_dep_idxs[dep_orders] 26 | # non_idxs = audio_non_idxs[non_orders] 27 | # np.save(os.path.join(prefix, 'Features/AudioWhole/dep_idxs'), dep_idxs) 28 | # np.save(os.path.join(prefix, 'Features/AudioWhole/non_idxs'), non_idxs) 29 | dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) 30 | non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) 31 | 32 | config = { 33 | 'num_classes': 1, 34 | 'dropout': 0.5, 35 | 'rnn_layers': 2, 36 | 'embedding_size': 256, 37 | 'batch_size': 2, 38 | 'epochs': 120, 39 | 'learning_rate': 1e-5, 40 | 'hidden_dims': 256, 41 | 'bidirectional': False, 42 | 'cuda': False 43 | } 44 | 45 | class AudioBiLSTM(nn.Module): 46 | def __init__(self, config): 47 | super(AudioBiLSTM, self).__init__() 48 | self.num_classes = config['num_classes'] 49 | self.learning_rate = config['learning_rate'] 50 | self.dropout = config['dropout'] 51 | self.hidden_dims = config['hidden_dims'] 52 | self.rnn_layers = config['rnn_layers'] 53 | self.embedding_size = config['embedding_size'] 54 | self.bidirectional = config['bidirectional'] 55 | 56 | self.build_model() 57 | 58 | def init_weight(net): 59 | for name, param in net.named_parameters(): 60 | if 'bias' in name: 61 | nn.init.constant_(param, 0.0) 62 | elif 'weight' in name: 63 | nn.init.xavier_uniform_(param) 64 | 65 | def build_model(self): 66 | # attention layer 67 | self.attention_layer = nn.Sequential( 68 | nn.Linear(self.hidden_dims, self.hidden_dims), 69 | nn.ReLU(inplace=True)) 70 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 71 | 72 | self.lstm_net_audio = nn.GRU(self.embedding_size, 73 | self.hidden_dims, 74 | num_layers=self.rnn_layers, 75 | dropout=self.dropout, 76 | bidirectional=self.bidirectional, 77 | batch_first=True) 78 | # self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, 79 | # num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) 80 | 81 | self.bn = nn.BatchNorm1d(3) 82 | 83 | # FC层 84 | self.fc_audio = nn.Sequential( 85 | nn.Dropout(self.dropout), 86 | nn.Linear(self.hidden_dims, self.hidden_dims), 87 | nn.ReLU(), 88 | nn.Dropout(self.dropout), 89 | nn.Linear(self.hidden_dims, self.num_classes), 90 | nn.ReLU(), 91 | # nn.Softmax(dim=1) 92 | ) 93 | 94 | def attention_net_with_w(self, lstm_out, lstm_hidden): 95 | ''' 96 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 97 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 98 | :return: [batch_size, n_hidden] 99 | ''' 100 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 101 | # h [batch_size, time_step, hidden_dims] 102 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 103 | # h = lstm_out 104 | # [batch_size, num_layers * num_directions, n_hidden] 105 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 106 | # [batch_size, 1, n_hidden] 107 | lstm_hidden = lstm_hidden.unsqueeze(1) 108 | # atten_w [batch_size, 1, hidden_dims] 109 | atten_w = self.attention_layer(lstm_hidden) 110 | # m [batch_size, time_step, hidden_dims] 111 | m = nn.Tanh()(h) 112 | # atten_context [batch_size, 1, time_step] 113 | # print(atten_w.shape, m.transpose(1, 2).shape) 114 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 115 | # softmax_w [batch_size, 1, time_step] 116 | softmax_w = F.softmax(atten_context, dim=-1) 117 | # context [batch_size, 1, hidden_dims] 118 | context = torch.bmm(softmax_w, h) 119 | result = context.squeeze(1) 120 | return result 121 | 122 | def forward(self, x): 123 | x, _ = self.lstm_net_audio(x) 124 | # x = self.bn(x) 125 | x = x.sum(dim=1) 126 | out = self.fc_audio(x) 127 | return out 128 | 129 | def save(model, filename): 130 | save_filename = '{}.pt'.format(filename) 131 | torch.save(model, save_filename) 132 | print('Saved as %s' % save_filename) 133 | 134 | def train(epoch): 135 | global lr, train_acc 136 | model.train() 137 | batch_idx = 1 138 | total_loss = 0 139 | correct = 0 140 | pred = np.array([]) 141 | X_train = audio_features[train_dep_idxs+train_non_idxs] 142 | Y_train = audio_targets[train_dep_idxs+train_non_idxs] 143 | for i in range(0, X_train.shape[0], config['batch_size']): 144 | if i + config['batch_size'] > X_train.shape[0]: 145 | x, y = X_train[i:], Y_train[i:] 146 | else: 147 | x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( 148 | i + config['batch_size'])] 149 | if config['cuda']: 150 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() 151 | else: 152 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ 153 | Variable(torch.from_numpy(y)).type(torch.FloatTensor) 154 | 155 | # 将模型的参数梯度设置为0 156 | optimizer.zero_grad() 157 | output = model(x) 158 | loss = criterion(output, y.view_as(output)) 159 | # 后向传播调整参数 160 | loss.backward() 161 | # 根据梯度更新网络参数 162 | optimizer.step() 163 | batch_idx += 1 164 | # loss.item()能够得到张量中的元素值 165 | pred = np.hstack((pred, output.flatten().detach().numpy())) 166 | total_loss += loss.item() 167 | train_mae = mean_absolute_error(Y_train, pred) 168 | 169 | print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.4f}\t MAE: {:.4f}\t RMSE: {:.4f}\n ' 170 | .format(epoch + 1, config['learning_rate'], total_loss, train_mae, \ 171 | np.sqrt(mean_squared_error(Y_train, pred)))) 172 | return train_mae 173 | 174 | 175 | def evaluate(fold, model, train_mae): 176 | model.eval() 177 | batch_idx = 1 178 | total_loss = 0 179 | global min_mae, min_rmse, test_dep_idxs, test_non_idxs 180 | pred = np.array([]) 181 | X_test = audio_features[list(test_dep_idxs)+list(test_non_idxs)] 182 | Y_test = audio_targets[list(test_dep_idxs)+list(test_non_idxs)] 183 | with torch.no_grad(): 184 | if config['cuda']: 185 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ 186 | Variable(torch.from_numpy(Y_test)).cuda() 187 | else: 188 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ 189 | Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) 190 | 191 | optimizer.zero_grad() 192 | output = model(x) 193 | loss = criterion(output, y.view_as(output)) 194 | total_loss += loss.item() 195 | pred = output.flatten().detach().numpy() 196 | 197 | mae = mean_absolute_error(Y_test, pred) 198 | rmse = np.sqrt(mean_squared_error(Y_test, pred)) 199 | 200 | print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 201 | print('='*89) 202 | 203 | if mae <= min_mae and mae < 8.5 and train_mae < 13: 204 | min_mae = mae 205 | min_rmse = rmse 206 | mode = 'bi' if config['bidirectional'] else 'norm' 207 | mode ='gru' 208 | save(model, os.path.join(prefix, 'Model/Regression/Audio{}/{}_vlad{}_{}_{:.2f}'.format(fold+1,mode, config['embedding_size'], config['hidden_dims'], min_mae))) 209 | print('*' * 64) 210 | print('model saved: mae: {}\t rmse: {}'.format(min_mae, min_rmse)) 211 | print('*' * 64) 212 | 213 | return total_loss 214 | 215 | for fold in range(3): 216 | test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] 217 | test_non_idxs = non_idxs[fold*44:(fold+1)*44] 218 | train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) 219 | train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) 220 | 221 | # training data augmentation 222 | train_dep_idxs = [] 223 | for (i, idx) in enumerate(train_dep_idxs_tmp): 224 | feat = audio_features[idx] 225 | if i < 14: 226 | for i in itertools.permutations(feat, feat.shape[0]): 227 | audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 228 | audio_targets = np.hstack((audio_targets, audio_targets[idx])) 229 | train_dep_idxs.append(len(audio_features)-1) 230 | else: 231 | train_dep_idxs.append(idx) 232 | 233 | # test data augmentation 234 | # test_dep_idxs = [] 235 | # for idx in test_dep_idxs_tmp: 236 | # feat = audio_features[idx] 237 | # for i in itertools.permutations(feat, feat.shape[0]): 238 | # audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) 239 | # audio_targets = np.hstack((audio_targets, audio_targets[idx])) 240 | # test_dep_idxs.append(len(audio_features)-1) 241 | test_dep_idxs = test_dep_idxs_tmp 242 | 243 | 244 | model = AudioBiLSTM(config) 245 | 246 | if config['cuda']: 247 | model = model.cuda() 248 | 249 | # optimizer = optim.Adam(model.parameters()) 250 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 251 | criterion = nn.L1Loss() 252 | # criterion = FocalLoss(class_num=2) 253 | min_mae = 100 254 | min_rmse = 100 255 | train_mae = 100 256 | 257 | 258 | for ep in range(1, config['epochs']): 259 | train_mae = train(ep) 260 | tloss = evaluate(fold, model, train_mae) 261 | 262 | # ============== prep ============== 263 | # X_test = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/val_samples_reg_avid256.npz'))['arr_0'], axis=2) 264 | # Y_test = np.load(os.path.join(prefix, 'Features/Audio/val_labels_reg_avid256.npz'))['arr_0'] 265 | # ============== prep ============== 266 | 267 | 268 | # ============== SVM ============== 269 | 270 | # from sklearn.svm import SVR 271 | # from sklearn.model_selection import KFold 272 | 273 | # X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 274 | # Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 275 | # kf = KFold(n_splits=3) 276 | # regr = SVR(kernel='linear', gamma='auto') 277 | # maes, rmses = [], [] 278 | # for train_index, test_index in kf.split(X): 279 | # # X_train, X_test = X[train_index], X[test_index] 280 | # # Y_train, Y_test = Y[train_index], Y[test_index] 281 | # X_train, Y_train = X[train_index], Y[train_index] 282 | # regr.fit([f.flatten() for f in X_train], Y_train) 283 | # pred = regr.predict([f.flatten() for f in X_test]) 284 | 285 | # mae = mean_absolute_error(Y_test, pred) 286 | # rmse = np.sqrt(mean_squared_error(Y_test, pred)) 287 | # maes.append(mae) 288 | # rmses.append(rmse) 289 | 290 | # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 291 | # print('='*89) 292 | # # break 293 | 294 | # print(np.mean(maes), np.mean(rmses)) 295 | # ============== SVM ============== 296 | 297 | # # ============== DT ============== 298 | # from sklearn.tree import DecisionTreeRegressor 299 | # from sklearn.model_selection import KFold 300 | 301 | # X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 302 | # Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 303 | # kf = KFold(n_splits=3) 304 | # regr = DecisionTreeRegressor(max_depth=100, random_state=0, criterion="mse") 305 | # maes, rmses = [], [] 306 | # for train_index, test_index in kf.split(X): 307 | # # X_train, X_test = X[train_index], X[test_index] 308 | # # Y_train, Y_test = Y[train_index], Y[test_index] 309 | # X_train, Y_train = X[train_index], Y[train_index] 310 | # regr.fit([f.flatten() for f in X_train], Y_train) 311 | # pred = regr.predict([f.flatten() for f in X_test]) 312 | 313 | # mae = mean_absolute_error(Y_test, pred) 314 | # rmse = np.sqrt(mean_squared_error(Y_test, pred)) 315 | # maes.append(mae) 316 | # rmses.append(rmse) 317 | 318 | # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 319 | # print('='*89) 320 | 321 | # print(np.mean(maes), np.mean(rmses)) 322 | # # ============== DT ============== 323 | 324 | # # ============== RF ============== 325 | # from sklearn.ensemble import RandomForestRegressor 326 | # from sklearn.model_selection import KFold 327 | 328 | # X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 329 | # Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 330 | # kf = KFold(n_splits=3) 331 | # regr = RandomForestRegressor(max_depth=100, random_state=0, criterion="mse") 332 | # maes, rmses = [], [] 333 | # for train_index, test_index in kf.split(X): 334 | # # X_train, X_test = X[train_index], X[test_index] 335 | # # Y_train, Y_test = Y[train_index], Y[test_index] 336 | # X_train, Y_train = X[train_index], Y[train_index] 337 | # regr.fit([f.flatten() for f in X_train], Y_train) 338 | # pred = regr.predict([f.flatten() for f in X_test]) 339 | 340 | # mae = mean_absolute_error(Y_test, pred) 341 | # rmse = np.sqrt(mean_squared_error(Y_test, pred)) 342 | # maes.append(mae) 343 | # rmses.append(rmse) 344 | 345 | # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 346 | # print('='*89) 347 | 348 | # print(np.mean(maes), np.mean(rmses)) 349 | # # ============== RF ============== 350 | 351 | # ============== ada ============== 352 | # from sklearn.ensemble import AdaBoostRegressor 353 | # from sklearn.model_selection import KFold 354 | 355 | # X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 356 | # Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 357 | # kf = KFold(n_splits=3) 358 | # regr = AdaBoostRegressor(n_estimators=50) 359 | # maes, rmses = [], [] 360 | # for train_index, test_index in kf.split(X): 361 | # # X_train, X_test = X[train_index], X[test_index] 362 | # # Y_train, Y_test = Y[train_index], Y[test_index] 363 | # X_train, Y_train = X[train_index], Y[train_index] 364 | # regr.fit([f.flatten() for f in X_train], Y_train) 365 | # pred = regr.predict([f.flatten() for f in X_test]) 366 | 367 | # mae = mean_absolute_error(Y_test, pred) 368 | # rmse = np.sqrt(mean_squared_error(Y_test, pred)) 369 | # maes.append(mae) 370 | # rmses.append(rmse) 371 | 372 | # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 373 | # print('='*89) 374 | 375 | # print(np.mean(maes), np.mean(rmses)) 376 | # ============== ada ============== 377 | -------------------------------------------------------------------------------- /DepressionCollected/Regression/fuse_net.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | from torch.nn import functional as F 6 | import torch.optim as optim 7 | from sklearn.metrics import mean_absolute_error, mean_squared_error 8 | import numpy as np 9 | import pandas as pd 10 | import wave 11 | import librosa 12 | from python_speech_features import * 13 | import re 14 | from allennlp.commands.elmo import ElmoEmbedder 15 | import os 16 | import tensorflow.compat.v1 as tf 17 | import itertools 18 | 19 | prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) 20 | 21 | text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'))['arr_0'] 22 | text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'))['arr_0'] 23 | audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_256.npz'))['arr_0'], axis=2) 24 | audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_256.npz'))['arr_0'] 25 | fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])] 26 | fuse_targets = text_targets 27 | 28 | fuse_dep_idxs = np.where(text_targets >= 53)[0] 29 | fuse_non_idxs = np.where(text_targets < 53)[0] 30 | dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) 31 | non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) 32 | 33 | text_model_paths = ['Model/Regression/Text1/BiLSTM_128_7.75.pt', 'Model/Regression/Text2/BiLSTM_128_8.46.pt', 'Model/Regression/Text3/BiLSTM_128_8.01.pt'] 34 | audio_model_paths = ['Model/Regression/Audio1/gru_vlad256_256_7.60.pt', 'Model/Regression/Audio2/gru_vlad256_256_8.38.pt', 'Model/Regression/Audio3/gru_vlad256_256_8.25.pt'] 35 | 36 | config = { 37 | 'num_classes': 1, 38 | 'dropout': 0.5, 39 | 'rnn_layers': 2, 40 | 'audio_embed_size': 256, 41 | 'text_embed_size': 1024, 42 | 'batch_size': 4, 43 | 'epochs': 150, 44 | 'learning_rate': 8e-5, 45 | 'audio_hidden_dims': 256, 46 | 'text_hidden_dims': 128, 47 | 'cuda': False, 48 | 'lambda': 1e-2, 49 | } 50 | 51 | class TextBiLSTM(nn.Module): 52 | def __init__(self, config): 53 | super(TextBiLSTM, self).__init__() 54 | self.num_classes = config['num_classes'] 55 | self.learning_rate = config['learning_rate'] 56 | self.dropout = config['dropout'] 57 | self.hidden_dims = config['hidden_dims'] 58 | self.rnn_layers = config['rnn_layers'] 59 | self.embedding_size = config['embedding_size'] 60 | self.bidirectional = config['bidirectional'] 61 | 62 | self.build_model() 63 | self.init_weight() 64 | 65 | def init_weight(net): 66 | for name, param in net.named_parameters(): 67 | if 'bias' in name: 68 | nn.init.constant_(param, 0.0) 69 | elif 'weight' in name: 70 | nn.init.xavier_uniform_(param) 71 | 72 | def build_model(self): 73 | # attention layer 74 | self.attention_layer = nn.Sequential( 75 | nn.Linear(self.hidden_dims, self.hidden_dims), 76 | nn.ReLU(inplace=True) 77 | ) 78 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 79 | 80 | # 双层lstm 81 | self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, 82 | num_layers=self.rnn_layers, dropout=self.dropout, 83 | bidirectional=self.bidirectional) 84 | 85 | # self.init_weight() 86 | 87 | # FC层 88 | # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) 89 | self.fc_out = nn.Sequential( 90 | nn.Dropout(self.dropout), 91 | nn.Linear(self.hidden_dims, self.hidden_dims), 92 | nn.ReLU(), 93 | nn.Dropout(self.dropout), 94 | nn.Linear(self.hidden_dims, self.num_classes), 95 | nn.ReLU(), 96 | # nn.Softmax(dim=1), 97 | ) 98 | 99 | def attention_net_with_w(self, lstm_out, lstm_hidden): 100 | ''' 101 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 102 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 103 | :return: [batch_size, n_hidden] 104 | ''' 105 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 106 | # h [batch_size, time_step, hidden_dims] 107 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 108 | # h = lstm_out 109 | # [batch_size, num_layers * num_directions, n_hidden] 110 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 111 | # [batch_size, 1, n_hidden] 112 | lstm_hidden = lstm_hidden.unsqueeze(1) 113 | # atten_w [batch_size, 1, hidden_dims] 114 | atten_w = self.attention_layer(lstm_hidden) 115 | # m [batch_size, time_step, hidden_dims] 116 | m = nn.Tanh()(h) 117 | # atten_context [batch_size, 1, time_step] 118 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 119 | # softmax_w [batch_size, 1, time_step] 120 | softmax_w = F.softmax(atten_context, dim=-1) 121 | # context [batch_size, 1, hidden_dims] 122 | context = torch.bmm(softmax_w, h) 123 | result = context.squeeze(1) 124 | return result 125 | 126 | def forward(self, x): 127 | 128 | # x : [len_seq, batch_size, embedding_dim] 129 | x = x.permute(1, 0, 2) 130 | output, (final_hidden_state, final_cell_state) = self.lstm_net(x) 131 | # output : [batch_size, len_seq, n_hidden * 2] 132 | output = output.permute(1, 0, 2) 133 | # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] 134 | final_hidden_state = final_hidden_state.permute(1, 0, 2) 135 | # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) 136 | # atten_out = self.attention_net(output, final_hidden_state) 137 | atten_out = self.attention_net_with_w(output, final_hidden_state) 138 | return self.fc_out(atten_out) 139 | 140 | class AudioBiLSTM(nn.Module): 141 | def __init__(self, config): 142 | super(AudioBiLSTM, self).__init__() 143 | self.num_classes = config['num_classes'] 144 | self.learning_rate = config['learning_rate'] 145 | self.dropout = config['dropout'] 146 | self.hidden_dims = config['hidden_dims'] 147 | self.rnn_layers = config['rnn_layers'] 148 | self.embedding_size = config['embedding_size'] 149 | self.bidirectional = config['bidirectional'] 150 | 151 | self.build_model() 152 | 153 | def init_weight(net): 154 | for name, param in net.named_parameters(): 155 | if 'bias' in name: 156 | nn.init.constant_(param, 0.0) 157 | elif 'weight' in name: 158 | nn.init.xavier_uniform_(param) 159 | 160 | def build_model(self): 161 | # attention layer 162 | self.attention_layer = nn.Sequential( 163 | nn.Linear(self.hidden_dims, self.hidden_dims), 164 | nn.ReLU(inplace=True)) 165 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 166 | 167 | self.lstm_net_audio = nn.GRU(self.embedding_size, 168 | self.hidden_dims, 169 | num_layers=self.rnn_layers, 170 | dropout=self.dropout, 171 | bidirectional=self.bidirectional, 172 | batch_first=True) 173 | # self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, 174 | # num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) 175 | 176 | self.bn = nn.BatchNorm1d(3) 177 | 178 | # FC层 179 | self.fc_audio = nn.Sequential( 180 | nn.Dropout(self.dropout), 181 | nn.Linear(self.hidden_dims, self.hidden_dims), 182 | nn.ReLU(), 183 | nn.Dropout(self.dropout), 184 | nn.Linear(self.hidden_dims, self.num_classes), 185 | nn.ReLU(), 186 | # nn.Softmax(dim=1) 187 | ) 188 | 189 | def attention_net_with_w(self, lstm_out, lstm_hidden): 190 | ''' 191 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 192 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 193 | :return: [batch_size, n_hidden] 194 | ''' 195 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 196 | # h [batch_size, time_step, hidden_dims] 197 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 198 | # h = lstm_out 199 | # [batch_size, num_layers * num_directions, n_hidden] 200 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 201 | # [batch_size, 1, n_hidden] 202 | lstm_hidden = lstm_hidden.unsqueeze(1) 203 | # atten_w [batch_size, 1, hidden_dims] 204 | atten_w = self.attention_layer(lstm_hidden) 205 | # m [batch_size, time_step, hidden_dims] 206 | m = nn.Tanh()(h) 207 | # atten_context [batch_size, 1, time_step] 208 | # print(atten_w.shape, m.transpose(1, 2).shape) 209 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 210 | # softmax_w [batch_size, 1, time_step] 211 | softmax_w = F.softmax(atten_context, dim=-1) 212 | # context [batch_size, 1, hidden_dims] 213 | context = torch.bmm(softmax_w, h) 214 | result = context.squeeze(1) 215 | return result 216 | 217 | def forward(self, x): 218 | x, _ = self.lstm_net_audio(x) 219 | # x = self.bn(x) 220 | x = x.sum(dim=1) 221 | out = self.fc_audio(x) 222 | return out 223 | 224 | class fusion_net(nn.Module): 225 | def __init__(self, text_embed_size, text_hidden_dims, rnn_layers, dropout, num_classes, \ 226 | audio_hidden_dims, audio_embed_size): 227 | super(fusion_net, self).__init__() 228 | self.text_embed_size = text_embed_size 229 | self.audio_embed_size = audio_embed_size 230 | self.text_hidden_dims = text_hidden_dims 231 | self.audio_hidden_dims = audio_hidden_dims 232 | self.rnn_layers = rnn_layers 233 | self.dropout = dropout 234 | self.num_classes = num_classes 235 | 236 | # ============================= TextBiLSTM ================================= 237 | 238 | # attention layer 239 | self.attention_layer = nn.Sequential( 240 | nn.Linear(self.text_hidden_dims, self.text_hidden_dims), 241 | nn.ReLU(inplace=True) 242 | ) 243 | 244 | # 双层lstm 245 | self.lstm_net = nn.LSTM(self.text_embed_size, self.text_hidden_dims, 246 | num_layers=self.rnn_layers, dropout=self.dropout, 247 | bidirectional=True) 248 | # FC层 249 | self.fc_out = nn.Sequential( 250 | nn.Dropout(self.dropout), 251 | nn.Linear(self.text_hidden_dims, self.text_hidden_dims), 252 | nn.ReLU(), 253 | nn.Dropout(self.dropout) 254 | ) 255 | 256 | # ============================= TextBiLSTM ================================= 257 | 258 | # ============================= AudioBiLSTM ============================= 259 | 260 | self.lstm_net_audio = nn.GRU(self.audio_embed_size, 261 | self.audio_hidden_dims, 262 | num_layers=self.rnn_layers, 263 | dropout=self.dropout, 264 | bidirectional=False, 265 | batch_first=True) 266 | 267 | self.fc_audio = nn.Sequential( 268 | nn.Dropout(self.dropout), 269 | nn.Linear(self.audio_hidden_dims, self.audio_hidden_dims), 270 | nn.ReLU(), 271 | nn.Dropout(self.dropout) 272 | ) 273 | 274 | # ============================= AudioBiLSTM ============================= 275 | 276 | # ============================= last fc layer ============================= 277 | # self.bn = nn.BatchNorm1d(self.text_hidden_dims + self.audio_hidden_dims) 278 | # modal attention 279 | self.modal_attn = nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.text_hidden_dims + self.audio_hidden_dims, bias=False) 280 | self.fc_final = nn.Sequential( 281 | nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.num_classes, bias=False), 282 | nn.ReLU(), 283 | # nn.Softmax(dim=1), 284 | # nn.Sigmoid() 285 | ) 286 | 287 | def attention_net_with_w(self, lstm_out, lstm_hidden): 288 | ''' 289 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 290 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 291 | :return: [batch_size, n_hidden] 292 | ''' 293 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 294 | # h [batch_size, time_step, hidden_dims] 295 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 296 | # [batch_size, num_layers * num_directions, n_hidden] 297 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 298 | # [batch_size, 1, n_hidden] 299 | lstm_hidden = lstm_hidden.unsqueeze(1) 300 | # atten_w [batch_size, 1, hidden_dims] 301 | atten_w = self.attention_layer(lstm_hidden) 302 | # m [batch_size, time_step, hidden_dims] 303 | m = nn.Tanh()(h) 304 | # atten_context [batch_size, 1, time_step] 305 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 306 | # softmax_w [batch_size, 1, time_step] 307 | softmax_w = F.softmax(atten_context, dim=-1) 308 | # context [batch_size, 1, hidden_dims] 309 | context = torch.bmm(softmax_w, h) 310 | result = context.squeeze(1) 311 | return result 312 | 313 | def pretrained_feature(self, x): 314 | with torch.no_grad(): 315 | x_text = [] 316 | x_audio = [] 317 | for ele in x: 318 | x_text.append(ele[1]) 319 | x_audio.append(ele[0]) 320 | x_text, x_audio = Variable(torch.tensor(x_text).type(torch.FloatTensor), requires_grad=False), Variable(torch.tensor(x_audio).type(torch.FloatTensor), requires_grad=False) 321 | # ============================= TextBiLSTM ================================= 322 | # x : [len_seq, batch_size, embedding_dim] 323 | x_text = x_text.permute(1, 0, 2) 324 | output, (final_hidden_state, _) = self.lstm_net(x_text) 325 | # output : [batch_size, len_seq, n_hidden * 2] 326 | output = output.permute(1, 0, 2) 327 | # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] 328 | final_hidden_state = final_hidden_state.permute(1, 0, 2) 329 | # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) 330 | # atten_out = self.attention_net(output, final_hidden_state) 331 | atten_out = self.attention_net_with_w(output, final_hidden_state) 332 | text_feature = self.fc_out(atten_out) 333 | 334 | # ============================= TextBiLSTM ================================= 335 | 336 | # ============================= AudioBiLSTM ============================= 337 | 338 | x_audio, _ = self.lstm_net_audio(x_audio) 339 | x_audio = x_audio.sum(dim=1) 340 | audio_feature = self.fc_audio(x_audio) 341 | 342 | # ============================= AudioBiLSTM ============================= 343 | return (text_feature, audio_feature) 344 | 345 | def forward(self, x): 346 | # x = self.bn(x) 347 | modal_weights = torch.sigmoid(self.modal_attn(x)) 348 | # modal_weights = self.modal_attn(x) 349 | x = (modal_weights * x) 350 | output = self.fc_final(x) 351 | return output 352 | 353 | class MyLoss(nn.Module): 354 | def __init__(self): 355 | super(MyLoss, self).__init__() 356 | 357 | def forward(self, text_feature, audio_feature, target, model): 358 | weight = model.fc_final[0].weight 359 | # bias = model.fc_final[0].bias 360 | # print(weight, bias) 361 | pred_text = F.linear(text_feature, weight[:, :config['text_hidden_dims']]) 362 | pred_audio = F.linear(audio_feature, weight[:, config['text_hidden_dims']:]) 363 | # l = nn.CrossEntropyLoss() 364 | l = nn.SmoothL1Loss() 365 | target = torch.tensor(target).view_as(pred_text).float() 366 | return l(pred_text, target) + l(pred_audio, target) 367 | 368 | def save(model, filename): 369 | save_filename = '{}.pt'.format(filename) 370 | torch.save(model, save_filename) 371 | print('Saved as %s' % save_filename) 372 | 373 | def train(model, epoch): 374 | global max_train_acc, train_acc 375 | model.train() 376 | batch_idx = 1 377 | total_loss = 0 378 | correct = 0 379 | pred = np.array([]) 380 | X_train = [] 381 | Y_train = [] 382 | for idx in train_dep_idxs+train_non_idxs: 383 | X_train.append(fuse_features[idx]) 384 | Y_train.append(fuse_targets[idx]) 385 | for i in range(0, len(X_train), config['batch_size']): 386 | if i + config['batch_size'] > len(X_train): 387 | x, y = X_train[i:], Y_train[i:] 388 | else: 389 | x, y = X_train[i:(i+config['batch_size'])], Y_train[i:(i+config['batch_size'])] 390 | # 将模型的参数梯度设置为0 391 | optimizer.zero_grad() 392 | text_feature, audio_feature = model.pretrained_feature(x) 393 | audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std() 394 | text_feature_norm = (text_feature - text_feature.mean())/text_feature.std() 395 | # concat_x = torch.cat((text_feature_norm, audio_feature_norm), dim=1) 396 | concat_x = torch.cat((text_feature, audio_feature), dim=1) 397 | output = model(concat_x) 398 | # loss = criterion(output, torch.tensor(y).float()) 399 | loss = criterion(text_feature, audio_feature, y, model) 400 | # 后向传播调整参数 401 | loss.backward() 402 | # 根据梯度更新网络参数 403 | optimizer.step() 404 | batch_idx += 1 405 | # loss.item()能够得到张量中的元素值 406 | pred = np.hstack((pred, output.flatten().detach().numpy())) 407 | total_loss += loss.item() 408 | train_mae = mean_absolute_error(Y_train, pred) 409 | print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.4f}\t MAE: {:.4f}\t RMSE: {:.4f}\n ' 410 | .format(epoch + 1, config['learning_rate'], total_loss, train_mae, \ 411 | np.sqrt(mean_squared_error(Y_train, pred)))) 412 | return train_mae 413 | 414 | def evaluate(model, fold, train_mae): 415 | model.eval() 416 | batch_idx = 1 417 | total_loss = 0 418 | global min_mae, min_rmse, test_dep_idxs, test_non_idxs 419 | pred = np.array([]) 420 | X_test = [] 421 | Y_test = [] 422 | for idx in list(test_dep_idxs)+list(test_non_idxs): 423 | X_test.append(fuse_features[idx]) 424 | Y_test.append(fuse_targets[idx]) 425 | for i in range(0, len(X_test), config['batch_size']): 426 | if i + config['batch_size'] > len(X_test): 427 | x, y = X_test[i:], Y_test[i:] 428 | else: 429 | x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])] 430 | text_feature, audio_feature = model.pretrained_feature(x) 431 | with torch.no_grad(): 432 | audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std() 433 | text_feature_norm = (text_feature - text_feature.mean())/text_feature.std() 434 | concat_x = torch.cat((text_feature, audio_feature), dim=1) 435 | # concat_x = torch.cat((text_feature_norm, audio_feature_norm), dim=1) 436 | output = model(concat_x) 437 | # loss = criterion(output, torch.tensor(y).float()) 438 | loss = criterion(text_feature, audio_feature, y, model) 439 | pred = np.hstack((pred, output.flatten().detach().numpy())) 440 | total_loss += loss.item() 441 | 442 | mae = mean_absolute_error(Y_test, pred) 443 | rmse = np.sqrt(mean_squared_error(Y_test, pred)) 444 | 445 | print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 446 | print('='*89) 447 | 448 | if mae <= min_mae and mae < 8.2 and train_mae < 13: 449 | min_mae = mae 450 | min_rmse = rmse 451 | save(model, os.path.join(prefix, 'Model/Regression/Fuse{}/fuse_{:.2f}'.format(fold+1, min_mae))) 452 | print('*' * 64) 453 | print('model saved: mae: {}\t rmse: {}'.format(min_mae, min_rmse)) 454 | print('*' * 64) 455 | 456 | return total_loss 457 | 458 | def evaluate_audio(model): 459 | model.eval() 460 | batch_idx = 1 461 | total_loss = 0 462 | global min_mae, min_rmse, test_dep_idxs, test_non_idxs 463 | pred = np.array([]) 464 | X_test = [] 465 | Y_test = [] 466 | for idx in list(test_dep_idxs)+list(test_non_idxs): 467 | X_test.append(fuse_features[idx][0]) 468 | Y_test.append(fuse_targets[idx]) 469 | X_test = np.array(X_test) 470 | Y_test = np.array(Y_test) 471 | 472 | with torch.no_grad(): 473 | if config['cuda']: 474 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ 475 | Variable(torch.from_numpy(Y_test)).cuda() 476 | else: 477 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ 478 | Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) 479 | 480 | optimizer.zero_grad() 481 | output = model(x) 482 | loss = criterion(output, y.view_as(output)) 483 | total_loss += loss.item() 484 | pred = output.flatten().detach().numpy() 485 | 486 | mae = mean_absolute_error(Y_test, pred) 487 | rmse = np.sqrt(mean_squared_error(Y_test, pred)) 488 | 489 | print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 490 | print('='*89) 491 | 492 | def evaluate_text(model): 493 | model.eval() 494 | batch_idx = 1 495 | total_loss = 0 496 | global min_mae, min_rmse, test_dep_idxs, test_non_idxs 497 | pred = np.array([]) 498 | X_test = [] 499 | Y_test = [] 500 | for idx in list(test_dep_idxs)+list(test_non_idxs): 501 | X_test.append(fuse_features[idx][1]) 502 | Y_test.append(fuse_targets[idx]) 503 | X_test = np.array(X_test) 504 | Y_test = np.array(Y_test) 505 | criterion = nn.SmoothL1Loss() 506 | with torch.no_grad(): 507 | if config['cuda']: 508 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ 509 | Variable(torch.from_numpy(Y_test)).cuda() 510 | else: 511 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ 512 | Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) 513 | 514 | optimizer.zero_grad() 515 | output = model(x) 516 | loss = criterion(output, y.view_as(output)) 517 | total_loss += loss.item() 518 | pred = output.flatten().detach().numpy() 519 | 520 | mae = mean_absolute_error(Y_test, pred) 521 | rmse = np.sqrt(mean_squared_error(Y_test, pred)) 522 | 523 | print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 524 | print('='*89) 525 | 526 | for fold in range(3): 527 | test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] 528 | test_non_idxs = non_idxs[fold*44:(fold+1)*44] 529 | train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) 530 | train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) 531 | 532 | train_dep_idxs = [] 533 | test_dep_idxs = [] 534 | # depression data augmentation 535 | for (i, idx) in enumerate(train_dep_idxs_tmp): 536 | feat = fuse_features[idx] 537 | audio_perm = itertools.permutations(feat[0], 3) 538 | text_perm = itertools.permutations(feat[1], 3) 539 | if i < 14: 540 | for fuse_perm in zip(audio_perm, text_perm): 541 | fuse_features.append(fuse_perm) 542 | fuse_targets = np.hstack((fuse_targets, fuse_targets[idx])) 543 | train_dep_idxs.append(len(fuse_features)-1) 544 | else: 545 | train_dep_idxs.append(idx) 546 | 547 | test_dep_idxs = test_dep_idxs_tmp 548 | 549 | model = fusion_net(config['text_embed_size'], config['text_hidden_dims'], config['rnn_layers'], \ 550 | config['dropout'], config['num_classes'], config['audio_hidden_dims'], config['audio_embed_size']) 551 | 552 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 553 | # optimizer = optim.Adam(model.parameters()) 554 | # criterion = nn.SmoothL1Loss() 555 | criterion = MyLoss() 556 | 557 | text_lstm_model = torch.load(os.path.join(prefix, text_model_paths[fold])) 558 | audio_lstm_model = torch.load(os.path.join(prefix, audio_model_paths[fold])) 559 | model_state_dict = {} 560 | model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] 561 | model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] 562 | model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] 563 | model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] 564 | 565 | model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] 566 | model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] 567 | model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] 568 | model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] 569 | 570 | model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] 571 | model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] 572 | model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] 573 | model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] 574 | model.load_state_dict(text_lstm_model.state_dict(), strict=False) 575 | # model.load_state_dict(audio_lstm_model.state_dict(), strict=False) 576 | model.load_state_dict(model_state_dict, strict=False) 577 | 578 | for param in model.parameters(): 579 | param.requires_grad = True 580 | 581 | model.fc_final[0].weight.requires_grad = True 582 | # model.fc_final[0].bias.requires_grad = True 583 | model.modal_attn.weight.requires_grad = True 584 | min_mae = 100 585 | min_rmse = 100 586 | train_mae = 100 587 | 588 | for ep in range(1, config['epochs']): 589 | train_mae = train(model, ep) 590 | tloss = evaluate(model, fold, train_mae) 591 | # evaluate_audio(audio_lstm_model) 592 | # evaluate_text(text_lstm_model) -------------------------------------------------------------------------------- /DepressionCollected/Regression/text_bilstm_perm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn import functional as F 5 | import torch.optim as optim 6 | from sklearn.metrics import confusion_matrix 7 | from sklearn.metrics import mean_absolute_error, mean_squared_error 8 | from sklearn.model_selection import train_test_split 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import os 13 | import pickle 14 | import random 15 | import itertools 16 | 17 | prefix = os.path.abspath(os.path.join(os.getcwd(), "../")) 18 | text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'))['arr_0'] 19 | text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'))['arr_0'] 20 | 21 | dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) 22 | non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) 23 | 24 | config = { 25 | 'num_classes': 1, 26 | 'dropout': 0.5, 27 | 'rnn_layers': 2, 28 | 'embedding_size': 1024, 29 | 'batch_size': 2, 30 | 'epochs': 110, 31 | 'learning_rate': 1e-5, 32 | 'hidden_dims': 128, 33 | 'bidirectional': True, 34 | 'cuda': False, 35 | } 36 | 37 | class TextBiLSTM(nn.Module): 38 | def __init__(self, config): 39 | super(TextBiLSTM, self).__init__() 40 | self.num_classes = config['num_classes'] 41 | self.learning_rate = config['learning_rate'] 42 | self.dropout = config['dropout'] 43 | self.hidden_dims = config['hidden_dims'] 44 | self.rnn_layers = config['rnn_layers'] 45 | self.embedding_size = config['embedding_size'] 46 | self.bidirectional = config['bidirectional'] 47 | 48 | self.build_model() 49 | self.init_weight() 50 | 51 | def init_weight(net): 52 | for name, param in net.named_parameters(): 53 | if 'bias' in name: 54 | nn.init.constant_(param, 0.0) 55 | elif 'weight' in name: 56 | nn.init.xavier_uniform_(param) 57 | 58 | def build_model(self): 59 | # attention layer 60 | self.attention_layer = nn.Sequential( 61 | nn.Linear(self.hidden_dims, self.hidden_dims), 62 | nn.ReLU(inplace=True) 63 | ) 64 | # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) 65 | 66 | # 双层lstm 67 | self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, 68 | num_layers=self.rnn_layers, dropout=self.dropout, 69 | bidirectional=self.bidirectional) 70 | 71 | # self.init_weight() 72 | 73 | # FC层 74 | # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) 75 | self.fc_out = nn.Sequential( 76 | nn.Dropout(self.dropout), 77 | nn.Linear(self.hidden_dims, self.hidden_dims), 78 | nn.ReLU(), 79 | nn.Dropout(self.dropout), 80 | nn.Linear(self.hidden_dims, self.num_classes), 81 | nn.ReLU(), 82 | # nn.Softmax(dim=1), 83 | ) 84 | 85 | def attention_net_with_w(self, lstm_out, lstm_hidden): 86 | ''' 87 | :param lstm_out: [batch_size, len_seq, n_hidden * 2] 88 | :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] 89 | :return: [batch_size, n_hidden] 90 | ''' 91 | lstm_tmp_out = torch.chunk(lstm_out, 2, -1) 92 | # h [batch_size, time_step, hidden_dims] 93 | h = lstm_tmp_out[0] + lstm_tmp_out[1] 94 | # h = lstm_out 95 | # [batch_size, num_layers * num_directions, n_hidden] 96 | lstm_hidden = torch.sum(lstm_hidden, dim=1) 97 | # [batch_size, 1, n_hidden] 98 | lstm_hidden = lstm_hidden.unsqueeze(1) 99 | # atten_w [batch_size, 1, hidden_dims] 100 | atten_w = self.attention_layer(lstm_hidden) 101 | # m [batch_size, time_step, hidden_dims] 102 | m = nn.Tanh()(h) 103 | # atten_context [batch_size, 1, time_step] 104 | atten_context = torch.bmm(atten_w, m.transpose(1, 2)) 105 | # softmax_w [batch_size, 1, time_step] 106 | softmax_w = F.softmax(atten_context, dim=-1) 107 | # context [batch_size, 1, hidden_dims] 108 | context = torch.bmm(softmax_w, h) 109 | result = context.squeeze(1) 110 | return result 111 | 112 | def forward(self, x): 113 | 114 | # x : [len_seq, batch_size, embedding_dim] 115 | x = x.permute(1, 0, 2) 116 | output, (final_hidden_state, final_cell_state) = self.lstm_net(x) 117 | # output : [batch_size, len_seq, n_hidden * 2] 118 | output = output.permute(1, 0, 2) 119 | # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] 120 | final_hidden_state = final_hidden_state.permute(1, 0, 2) 121 | # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) 122 | # atten_out = self.attention_net(output, final_hidden_state) 123 | atten_out = self.attention_net_with_w(output, final_hidden_state) 124 | return self.fc_out(atten_out) 125 | 126 | def save(model, filename): 127 | save_filename = '{}.pt'.format(filename) 128 | torch.save(model, save_filename) 129 | print('Saved as %s' % save_filename) 130 | 131 | def train(epoch): 132 | global lr, train_acc 133 | model.train() 134 | batch_idx = 1 135 | total_loss = 0 136 | correct = 0 137 | pred = np.array([]) 138 | X_train = text_features[train_dep_idxs+train_non_idxs] 139 | Y_train = text_targets[train_dep_idxs+train_non_idxs] 140 | for i in range(0, X_train.shape[0], config['batch_size']): 141 | if i + config['batch_size'] > X_train.shape[0]: 142 | x, y = X_train[i:], Y_train[i:] 143 | else: 144 | x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( 145 | i + config['batch_size'])] 146 | if config['cuda']: 147 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() 148 | else: 149 | x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ 150 | Variable(torch.from_numpy(y)).type(torch.FloatTensor) 151 | 152 | # 将模型的参数梯度设置为0 153 | optimizer.zero_grad() 154 | output = model(x) 155 | loss = criterion(output, y.view_as(output)) 156 | # 后向传播调整参数 157 | loss.backward() 158 | # 根据梯度更新网络参数 159 | optimizer.step() 160 | batch_idx += 1 161 | # loss.item()能够得到张量中的元素值 162 | pred = np.hstack((pred, output.flatten().detach().numpy())) 163 | total_loss += loss.item() 164 | train_mae = mean_absolute_error(Y_train, pred) 165 | 166 | print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.4f}\t MAE: {:.4f}\t RMSE: {:.4f}\n ' 167 | .format(epoch + 1, config['learning_rate'], total_loss, train_mae, \ 168 | np.sqrt(mean_squared_error(Y_train, pred)))) 169 | return train_mae 170 | 171 | 172 | def evaluate(fold, model, train_mae): 173 | model.eval() 174 | batch_idx = 1 175 | total_loss = 0 176 | global min_mae, min_rmse, test_dep_idxs, test_non_idxs 177 | pred = np.array([]) 178 | X_test = text_features[list(test_dep_idxs)+list(test_non_idxs)] 179 | Y_test = text_targets[list(test_dep_idxs)+list(test_non_idxs)] 180 | with torch.no_grad(): 181 | if config['cuda']: 182 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ 183 | Variable(torch.from_numpy(Y_test)).cuda() 184 | else: 185 | x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ 186 | Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) 187 | 188 | optimizer.zero_grad() 189 | output = model(x) 190 | loss = criterion(output, y.view_as(output)) 191 | total_loss += loss.item() 192 | pred = output.flatten().detach().numpy() 193 | 194 | mae = mean_absolute_error(Y_test, pred) 195 | rmse = np.sqrt(mean_squared_error(Y_test, pred)) 196 | 197 | print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 198 | print('='*89) 199 | 200 | if mae <= min_mae and mae < 8.5 and train_mae < 13: 201 | min_mae = mae 202 | min_rmse = rmse 203 | mode = 'bi' if config['bidirectional'] else 'norm' 204 | mode ='gru' 205 | save(model, os.path.join(prefix, 'Model/Regression/Text{}/BiLSTM_{}_{:.2f}'.format(fold+1, config['hidden_dims'], min_mae))) 206 | print('*' * 64) 207 | print('model saved: mae: {}\t rmse: {}'.format(min_mae, min_rmse)) 208 | print('*' * 64) 209 | 210 | return total_loss 211 | 212 | for fold in range(3): 213 | test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] 214 | test_non_idxs = non_idxs[fold*44:(fold+1)*44] 215 | train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) 216 | train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) 217 | 218 | # training data augmentation 219 | train_dep_idxs = [] 220 | for (i, idx) in enumerate(train_dep_idxs_tmp): 221 | feat = text_features[idx] 222 | if i < 14: 223 | for i in itertools.permutations(feat, feat.shape[0]): 224 | text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) 225 | text_targets = np.hstack((text_targets, text_targets[idx])) 226 | train_dep_idxs.append(len(text_features)-1) 227 | else: 228 | train_dep_idxs.append(idx) 229 | 230 | # test data augmentation 231 | # test_dep_idxs = [] 232 | # for idx in test_dep_idxs_tmp: 233 | # feat = text_features[idx] 234 | # for i in itertools.permutations(feat, feat.shape[0]): 235 | # text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) 236 | # text_targets = np.hstack((text_targets, text_targets[idx])) 237 | # test_dep_idxs.append(len(text_features)-1) 238 | test_dep_idxs = test_dep_idxs_tmp 239 | 240 | 241 | model = TextBiLSTM(config) 242 | 243 | if config['cuda']: 244 | model = model.cuda() 245 | 246 | optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) 247 | criterion = nn.SmoothL1Loss() 248 | # criterion = FocalLoss(class_num=2) 249 | min_mae = 100 250 | min_rmse = 100 251 | train_mae = 100 252 | 253 | 254 | for ep in range(1, config['epochs']): 255 | train_mae = train(ep) 256 | tloss = evaluate(fold, model, train_mae) 257 | 258 | # ============== prep ============== 259 | # X_test = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/val_samples_reg_avid256.npz'))['arr_0'], axis=2) 260 | # Y_test = np.load(os.path.join(prefix, 'Features/Audio/val_labels_reg_avid256.npz'))['arr_0'] 261 | # ============== prep ============== 262 | 263 | 264 | # ============== SVM ============== 265 | 266 | # from sklearn.svm import SVR 267 | # from sklearn.model_selection import KFold 268 | 269 | # X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 270 | # Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 271 | # kf = KFold(n_splits=3) 272 | # regr = SVR(kernel='linear', gamma='auto') 273 | # maes, rmses = [], [] 274 | # for train_index, test_index in kf.split(X): 275 | # # X_train, X_test = X[train_index], X[test_index] 276 | # # Y_train, Y_test = Y[train_index], Y[test_index] 277 | # X_train, Y_train = X[train_index], Y[train_index] 278 | # regr.fit([f.flatten() for f in X_train], Y_train) 279 | # pred = regr.predict([f.flatten() for f in X_test]) 280 | 281 | # mae = mean_absolute_error(Y_test, pred) 282 | # rmse = np.sqrt(mean_squared_error(Y_test, pred)) 283 | # maes.append(mae) 284 | # rmses.append(rmse) 285 | 286 | # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 287 | # print('='*89) 288 | # # break 289 | 290 | # print(np.mean(maes), np.mean(rmses)) 291 | # ============== SVM ============== 292 | 293 | # # ============== DT ============== 294 | # from sklearn.tree import DecisionTreeRegressor 295 | # from sklearn.model_selection import KFold 296 | 297 | # X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 298 | # Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 299 | # kf = KFold(n_splits=3) 300 | # regr = DecisionTreeRegressor(max_depth=100, random_state=0, criterion="mse") 301 | # maes, rmses = [], [] 302 | # for train_index, test_index in kf.split(X): 303 | # # X_train, X_test = X[train_index], X[test_index] 304 | # # Y_train, Y_test = Y[train_index], Y[test_index] 305 | # X_train, Y_train = X[train_index], Y[train_index] 306 | # regr.fit([f.flatten() for f in X_train], Y_train) 307 | # pred = regr.predict([f.flatten() for f in X_test]) 308 | 309 | # mae = mean_absolute_error(Y_test, pred) 310 | # rmse = np.sqrt(mean_squared_error(Y_test, pred)) 311 | # maes.append(mae) 312 | # rmses.append(rmse) 313 | 314 | # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 315 | # print('='*89) 316 | 317 | # print(np.mean(maes), np.mean(rmses)) 318 | # # ============== DT ============== 319 | 320 | # # ============== RF ============== 321 | # from sklearn.ensemble import RandomForestRegressor 322 | # from sklearn.model_selection import KFold 323 | 324 | # X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 325 | # Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 326 | # kf = KFold(n_splits=3) 327 | # regr = RandomForestRegressor(max_depth=100, random_state=0, criterion="mse") 328 | # maes, rmses = [], [] 329 | # for train_index, test_index in kf.split(X): 330 | # # X_train, X_test = X[train_index], X[test_index] 331 | # # Y_train, Y_test = Y[train_index], Y[test_index] 332 | # X_train, Y_train = X[train_index], Y[train_index] 333 | # regr.fit([f.flatten() for f in X_train], Y_train) 334 | # pred = regr.predict([f.flatten() for f in X_test]) 335 | 336 | # mae = mean_absolute_error(Y_test, pred) 337 | # rmse = np.sqrt(mean_squared_error(Y_test, pred)) 338 | # maes.append(mae) 339 | # rmses.append(rmse) 340 | 341 | # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 342 | # print('='*89) 343 | 344 | # print(np.mean(maes), np.mean(rmses)) 345 | # # ============== RF ============== 346 | 347 | # ============== ada ============== 348 | # from sklearn.ensemble import AdaBoostRegressor 349 | # from sklearn.model_selection import KFold 350 | 351 | # X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 352 | # Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] 353 | # kf = KFold(n_splits=3) 354 | # regr = AdaBoostRegressor(n_estimators=50) 355 | # maes, rmses = [], [] 356 | # for train_index, test_index in kf.split(X): 357 | # # X_train, X_test = X[train_index], X[test_index] 358 | # # Y_train, Y_test = Y[train_index], Y[test_index] 359 | # X_train, Y_train = X[train_index], Y[train_index] 360 | # regr.fit([f.flatten() for f in X_train], Y_train) 361 | # pred = regr.predict([f.flatten() for f in X_test]) 362 | 363 | # mae = mean_absolute_error(Y_test, pred) 364 | # rmse = np.sqrt(mean_squared_error(Y_test, pred)) 365 | # maes.append(mae) 366 | # rmses.append(rmse) 367 | 368 | # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) 369 | # print('='*89) 370 | 371 | # print(np.mean(maes), np.mean(rmses)) 372 | # ============== ada ============== 373 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ICASSP2022-Depression 2 | Automatic Depression Detection: a GRU/ BiLSTM-based Model and An Emotional Audio-Textual Corpus 3 | 4 | https://arxiv.org/pdf/2202.08210.pdf 5 | 6 | https://ieeexplore.ieee.org/abstract/document/9746569/ 7 | 8 | ## Code 9 | 10 | - Regression 11 | - audio_bilstm_perm.py: train audio network 12 | - text_bilstm_perm.py: train text network 13 | - fuse_net.py: train multi-modal network 14 | - Classification 15 | - audio_features_whole.py: extract audio features 16 | - text_features_whole.py: extract text features 17 | - audio_gru_whole.py: train audio network 18 | - text_bilstm_whole.py: train text network 19 | - fuse_net_whole.py: train fuse network 20 | 21 | 22 | ## Dataset: EATD-Corpus 23 | 24 | The EATD-Corpus is a dataset consist of audio and text files of 162 volunteers who received counseling. 25 | 26 | ### How to download 27 | The EATD-Corpus can be downloaded at https://1drv.ms/u/s!AsGVGqImbOwYhHUHcodFC3xmKZKK?e=mCT5oN. Password: Ymj26Uv5 28 | 29 | ### How to use 30 | 31 | Training set contains data from 83 volunteers (19 depressed and 64 non-depressed). 32 | 33 | Validation set contains data from 79 volunteers (11 depressed and 68 non-depressed). 34 | 35 | Each folder contains depression data for one volunteer. 36 | 37 | - {positive/negative/neutral}.wav: Raw audio in wav 38 | - {positive/negative/neutral}_out.wav: Preprocessed audio. Preprocessing operations include denoising and de-muting 39 | - {positive/negative/neutral}.txt: Audio translation 40 | - label.txt: Raw SDS score 41 | - new_label.txt: Standard SDS score (Raw SDS score multiplied by 1.25) 42 | --------------------------------------------------------------------------------