├── .idea ├── .gitignore ├── code_20250102.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── MPDD Dataset License Agreementt.pdf ├── README.md ├── __pycache__ ├── config.cpython-311.pyc ├── config.cpython-38.pyc ├── dataset.cpython-311.pyc ├── dataset.cpython-38.pyc ├── train.cpython-311.pyc └── train_val_split.cpython-311.pyc ├── config.json ├── dataset.py ├── feature_extraction ├── audio │ ├── extra_mfcc64.py │ ├── extra_opensmile.py │ └── extract_wav2vec_embedding.py ├── feature_personalized │ ├── .idea │ │ ├── .gitignore │ │ ├── inspectionProfiles │ │ │ └── profiles_settings.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── 生成个性化特征+嵌入.iml │ ├── extrapersonality.py │ └── gen_describtion.py └── visual │ ├── __pycache__ │ ├── dataset.cpython-311.pyc │ ├── dataset.cpython-38.pyc │ ├── util.cpython-311.pyc │ └── util.cpython-38.pyc │ ├── extract_openface.py │ ├── extract_resnet+densnet.py │ └── util.py ├── models ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-38.pyc │ ├── base_model.cpython-310.pyc │ ├── base_model.cpython-311.pyc │ ├── base_model.cpython-38.pyc │ ├── pretrain_model.cpython-310.pyc │ ├── pretrain_model.cpython-311.pyc │ └── pretrain_model.cpython-38.pyc ├── base_model.py ├── networks │ ├── ContextEncoder.py │ ├── LightWeightTrans.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── ContextEncoder.cpython-310.pyc │ │ ├── ContextEncoder.cpython-311.pyc │ │ ├── ContextEncoder.cpython-38.pyc │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── autoencoder.cpython-310.pyc │ │ ├── autoencoder.cpython-311.pyc │ │ ├── autoencoder.cpython-38.pyc │ │ ├── classifier.cpython-310.pyc │ │ ├── classifier.cpython-311.pyc │ │ ├── classifier.cpython-38.pyc │ │ ├── fc.cpython-310.pyc │ │ ├── fc.cpython-311.pyc │ │ ├── fc.cpython-38.pyc │ │ ├── interact_model.cpython-310.pyc │ │ ├── interact_model.cpython-311.pyc │ │ ├── interact_model.cpython-38.pyc │ │ ├── lstm.cpython-310.pyc │ │ ├── lstm.cpython-311.pyc │ │ ├── lstm.cpython-38.pyc │ │ ├── multihead_attention.cpython-310.pyc │ │ ├── multihead_attention.cpython-311.pyc │ │ ├── multihead_attention.cpython-38.pyc │ │ ├── tools.cpython-310.pyc │ │ ├── tools.cpython-311.pyc │ │ └── tools.cpython-38.pyc │ ├── autoencoder.py │ ├── classifier.py │ ├── cnn.py │ ├── fc.py │ ├── interact_model.py │ ├── lstm.py │ ├── multihead_attention.py │ └── tools.py ├── our │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── __init__.cpython-311.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── our_model.cpython-311.pyc │ │ ├── our_model.cpython-38.pyc │ │ ├── our_model_ablation.cpython-311.pyc │ │ └── zelin_our_model.cpython-311.pyc │ └── our_model.py └── utils │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-38.pyc │ ├── config.cpython-310.pyc │ ├── config.cpython-311.pyc │ ├── config.cpython-38.pyc │ ├── convert.cpython-310.pyc │ ├── convert.cpython-311.pyc │ ├── convert.cpython-38.pyc │ ├── functions.cpython-310.pyc │ ├── functions.cpython-311.pyc │ ├── functions.cpython-38.pyc │ ├── time_track.cpython-310.pyc │ ├── time_track.cpython-311.pyc │ └── time_track.cpython-38.pyc │ ├── config-orin.py │ ├── config.py │ ├── convert.py │ ├── functions.py │ ├── load_pretrained.py │ └── time_track.py ├── requirements.txt ├── scripts ├── Track1 │ ├── train_1s_binary.sh │ ├── train_1s_quinary.sh │ ├── train_1s_ternary.sh 
│ ├── train_5s_binary.sh │ ├── train_5s_quinary.sh │ └── train_5s_ternary.sh ├── Track2 │ ├── train_1s_binary.sh │ ├── train_1s_ternary.sh │ ├── train_5s_binary.sh │ └── train_5s_ternary.sh └── test.sh ├── test.py ├── train.py ├── train_val_split.py └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-311.pyc ├── __init__.cpython-38.pyc ├── logger.cpython-311.pyc └── logger.cpython-38.pyc ├── image_pool.py └── logger.py /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | # 基于编辑器的 HTTP 客户端请求 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/code_20250102.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /MPDD Dataset License Agreementt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/MPDD Dataset License Agreementt.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MPDD Baseline Code 2 | The baseline system provided for the MM 2025 MPDD Challenge serves as a starting point for participants to develop their solutions for the Multimodal Personalized Depression Detection tasks. The baseline system is designed to be straightforward yet effective, providing participants with a solid foundation upon which they can build and improve. 3 | 4 | # Results 5 | The metrics reported are weighted/unweighted F1-score (W_F1/U_F1) and weighted/unweighted accuracy (W_Acc./U_Acc.) with and without personalized features (PF) for the MPDD-Young and MPDD-Elderly datasets. Each value represents the best-performing feature combination for each experiment, using default hyper-parameters. 6 | 7 | #### MPDD-Elderly (Track1) 8 | 9 | | Length | Task Type | Audio Feature | Visual Feature | w/ PF (W_F1/U_F1) | w/ PF (W_Acc./U_Acc.) | w/o PF (W_F1/U_F1) | w/o PF (W_Acc./U_Acc.) 
| 10 | |--------|-----------|---------------|----------------|-------------------|-----------------------|--------------------|------------------------| 11 | | 1s | Binary | mfcc | openface | 85.71 / 79.13 | 85.40 / 84.62 | 82.60 / 70.89 | 69.37 / 83.33 | 12 | | 1s | Ternary | opensmile | resnet | 56.48 / 55.64 | 55.49 / 56.41 | 54.35 / 49.14 | 48.93 / 55.13 | 13 | | 1s | Quinary | opensmile | densenet | 66.26 / 46.66 | 45.79 / 69.23 | 63.85 / 44.00 | 42.45 / 66.67 | 14 | | 5s | Binary | opensmile | resnet | 81.75 / 72.37 | 75.40 / 80.77 | 77.90 / 66.15 | 67.94 / 76.92 | 15 | | 5s | Ternary | wav2vec | openface | 58.22 / 59.37 | 59.62 / 57.69 | 50.88 / 47.59 | 46.58 / 50.00 | 16 | | 5s | Quinary | mfcc | densenet | 75.62 / 58.40 | 57.71 / 78.21 | 73.49 / 56.83 | 56.98 / 75.64 | 17 | 18 | 19 | #### MPDD-Young (Track2) 20 | 21 | | Length | Task Type | Audio Feature | Visual Feature | w/ PF (W_F1/U_F1) | w/ PF (W_Acc./U_Acc.) | w/o PF (W_F1/U_F1) | w/o PF (W_Acc./U_Acc.) | 22 | |--------|-----------|---------------|----------------|-------------------|-----------------------|--------------------|------------------------| 23 | | 1s | Binary | wav2vec | openface | 59.96 / 59.96 | 63.64 / 63.64 | 55.23 / 55.23 | 56.06 / 56.06 | 24 | | 1s | Ternary | mfcc | densenet | 51.86 / 51.62 | 49.66 / 51.52 | 47.95 / 43.72 | 42.63 / 48.48 | 25 | | 5s | Binary | opensmile | resnet | 62.11 / 62.11 | 62.12 / 62.12 | 60.02 / 60.02 | 60.61 / 60.61 | 26 | | 5s | Ternary | mfcc | densenet | 48.18 / 41.31 | 41.71 / 50.00 | 42.82 / 39.38 | 41.29 / 42.42 | 27 | 28 | # Environment 29 | 30 | python 3.10.0 31 | pytorch 2.3.0 32 | scikit-learn 1.5.1 33 | pandas 2.2.2 34 | 35 | Given `requirements.txt`, we recommend users to configure their environment via conda with the following steps: 36 | 37 | conda create -n mpdd python=3.10 -y 38 | conda activate mpdd 39 | pip install -r requirements.txt 40 | 41 | # Features 42 | 43 | In our baseline, we use the following features: 44 | 45 | ### Acoustic Feature: 46 | **Wav2vec:** We extract utterance-level acoustic features using the wav2vec model pre-trained on large-scale audio data. The embedding size of the acoustic features is 512. 47 | The link of the pre-trained model is: [wav2vec model](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec) 48 | 49 | **MFCCs:** We extract Mel-frequency cepstral coefficients (MFCCs). The embedding size of MFCCs is 64. 50 | 51 | **OpenSmile:** We extract utterance-level acoustic features using opensmile. The embedding size of OpenSMILE features is 6373. 52 | 53 | ### Visual Feature: 54 | **Resnet-50 and Densenet-121:** We employ OpenCV tool to extract scene pictures from each video, capturing frames at a 10-frame interval. Subsequently, we utilize the Resnet-50 and Densenet-121 model to generate utterance-level features for the extracted scene pictures in the videos. The embedding size of the visual features is 1000 for Resnet, and 1024 (Track1) or 1000 (Track2) for Densenet. 55 | The links of the pre-trained models are: 56 | [ResNet-50](https://huggingface.co/microsoft/resnet-50) 57 | [DenseNet-121](https://huggingface.co/pytorch/vision/v0.10.0/densenet121) 58 | 59 | **OpenFace:** We extract csv visual features using the pretrained OpenFace model. The embedding size of OpenFace features is 709. 
You can download the executable file and model files for OpenFace from the following link: [OpenFace Toolkit](https://github.com/TadasBaltrusaitis/OpenFace) 60 | 61 | ### Personalized Feature: 62 | We generate personalized features by loading the GLM3 model, creating personalized descriptions, and embedding these descriptions using the `roberta-large` model. The embedding size of the personalized features is 1024. 63 | The link of the `roberta-large` model is: [RoBERTa Large](https://huggingface.co/roberta-large) 64 | 65 | # Usage 66 | ## Dataset Download 67 | Given the potential ethical risks and privacy concerns associated with this dataset, we place the highest priority on the protection and lawful use of the data. To this end, we have established and implemented a series of stringent access and authorization management measures to ensure compliance with relevant laws, regulations, and ethical standards, while making every effort to prevent potential ethical disputes arising from improper data use. 68 | 69 | To further safeguard the security and compliance of the data, please complete the following steps before contacting us to request access to the dataset labels and extracted features: 70 | 71 | - **1. Download the [MPDD Dataset License Agreement PDF](https://github.com/hacilab/MPDD/blob/main/MPDD%20Dataset%20License%20Agreementt.pdf)**. 72 | 73 | - **2. Carefully review the agreement**: The agreement outlines in detail the usage specifications, restrictions, and the responsibilities and obligations of the licensee. Please read the document thoroughly to ensure complete understanding of the terms and conditions. 74 | 75 | - **3. Manually sign the agreement**: After confirming your full understanding and agreement with the terms, fill in the required fields and sign the agreement by hand as formal acknowledgment of your acceptance (should be signed with a full-time faculty or researcher). 76 | 77 | Once you have completed the above steps, please submit the required materials to us through the following channels: 78 | 79 | - **Primary contact email**: sstcneu@163.com 80 | - **CC email**: fuchangzeng@qhd.neu.edu.cn 81 | 82 | We will review your submission to verify that you meet the access requirements. Upon approval, we will grant you the corresponding data access permissions. Please note that all materials submitted will be used solely for identity verification and access management and will not be used for any other purpose. 83 | 84 | We sincerely appreciate your cooperation in protecting data privacy and ensuring compliant use. If you have any questions or require further guidance, please feel free to contact us via the emails provided above. 85 | 86 | After obtaining the dataset, users should modify `data_rootpath` in the scripts during training and testing. Notice that testing data will be made public in the later stage of the competition. 
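Before launching training, it can help to confirm that the downloaded data actually matches the layout shown below. The sketch here is only an illustration (the root path is a placeholder; `data_rootpath` is whatever value you configure in the training/testing scripts):

```python
import os

data_rootpath = "/path/to/MPDD_data"  # placeholder: the root you set in the scripts

# Check the sub-folders documented in the directory layout below.
for split in ("Training", "Testing"):
    for sub in ("1s", "5s", "individualEmbedding", "labels"):
        path = os.path.join(data_rootpath, split, sub)
        print(path, "OK" if os.path.isdir(path) else "MISSING")
```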
87 | 88 | `data_rootpath`: 89 | 90 | ├── Training/ 91 | │ ├──1s 92 | │ ├──5s 93 | │ ├──individualEmbedding 94 | │ ├──labels 95 | ├── Testing/ 96 | │ ├──1s 97 | │ ├──5s 98 | │ ├──individualEmbedding 99 | │ ├──labels 100 | 101 | 102 | ## Training 103 | To train the model with default parameters, taking MPDD-Young for example, simply run: 104 | 105 | ```bash 106 | cd path/to/MPDD # replace with actual path 107 | ``` 108 | ```bash 109 | bash scripts/Track2/train_1s_binary.sh 110 | ``` 111 | 112 | You can also modify parameters such as feature types, split window time, classification dimensions, or learning rate directly through the command line: 113 | ```bash 114 | bash scripts/Track2/train_1s_binary.sh --audiofeature_method=wav2vec --videofeature_method=resnet --splitwindow_time=5s --labelcount=3 --batch_size=32 --lr=0.001 --num_epochs=500 115 | ``` 116 | Refer to `config.json` for more parameters. 117 | 118 | The specific dimensions of each feature are shown in the table below: 119 | | Feature | Dimension | 120 | |--------------------------|---------------------------------| 121 | | Wav2vec | 512 | 122 | | MFCCs | 64 | 123 | | OpenSmile | 6373 | 124 | | ResNet-50 | 1000 | 125 | | DenseNet-121 | 1024 for Track1, 1000 for Track2 | 126 | | OpenFace | 709 | 127 | | Personalized Feature | 1024 | 128 | 129 | 130 | ## Testing 131 | To predict the labels for the testing set with your obtained model, first modify the default parameters in `test.sh` to match the current task, and run: 132 | 133 | ```bash 134 | cd path/to/MPDD # replace with actual path 135 | ``` 136 | ```bash 137 | bash scripts/test.sh 138 | ``` 139 | After testing 6 tasks in Track1 or 4 tasks in Track2, the results will be merged into the `submission.csv` file in `./answer_Track2/`. 140 | 141 | # Acknowledgements 142 | The benchmark of MPDD is developed based on the work of MEIJU 2025. The Github URL of MEIJU 2025 is: https://github.com/AI-S2-Lab/MEIJU2025-baseline. 
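As a quick cross-check of the feature dimensions listed in the table above, the extracted `.npy` files and the personalized-embedding file can be inspected directly. The personalized file stores a list of dictionaries, each holding an `id` and a 1024-dimensional `roberta-large` embedding. A minimal sketch (both paths are placeholders for your local files):

```python
import numpy as np

# Placeholder path: any extracted utterance-level feature file.
audio_feat = np.load("/path/to/features/wav2vec/example.npy")
print("audio feature shape:", audio_feat.shape)  # last dim should match the table, e.g. 512 for wav2vec

# Placeholder path: the personalized-embedding file under individualEmbedding/.
entries = np.load("/path/to/individualEmbedding/descriptions_embeddings_with_ids.npy", allow_pickle=True)
print(len(entries), "entries;", entries[0]["id"], entries[0]["embedding"].shape)  # embedding shape (1024,)
```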
143 | -------------------------------------------------------------------------------- /__pycache__/config.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/config.cpython-311.pyc -------------------------------------------------------------------------------- /__pycache__/config.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/config.cpython-38.pyc -------------------------------------------------------------------------------- /__pycache__/dataset.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/dataset.cpython-311.pyc -------------------------------------------------------------------------------- /__pycache__/dataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/dataset.cpython-38.pyc -------------------------------------------------------------------------------- /__pycache__/train.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/train.cpython-311.pyc -------------------------------------------------------------------------------- /__pycache__/train_val_split.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/train_val_split.cpython-311.pyc -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_dim_a": 512, 3 | "embd_size_a": 128, 4 | "embd_method_a": "maxpool", 5 | 6 | "input_dim_v": 1000, 7 | "embd_size_v": 128, 8 | "embd_method_v": "maxpool", 9 | 10 | "emo_output_dim": 5, 11 | 12 | "cls_layers": "128,64", 13 | "dropout_rate": 0.2, 14 | "bn": false, 15 | "ce_weight": 1.0, 16 | "focal_weight": 1.0, 17 | "temperature": 0.007, 18 | 19 | 20 | "Transformer_head": 2, 21 | "Transformer_layers": 1, 22 | "hidden_size": 256, 23 | 24 | "attention_head": 1, 25 | "attention_dropout": 0.0, 26 | 27 | "activate_fun": "relu", 28 | "ablation": "normal", 29 | "use_ICL": true, 30 | "drop_last": false, 31 | "cvNo": 1, 32 | 33 | "gpu_ids": [0], 34 | "isTrain": true, 35 | "checkpoints_dir": "./checkpoints", 36 | "name": "MDPP", 37 | 38 | "cuda_benchmark": true, 39 | 40 | "lr": 2e-5, 41 | "beta1": 0.9, 42 | 43 | "log_dir": "./logs", 44 | "feature_max_len": 5 45 | } 46 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import numpy as np 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class AudioVisualDataset(Dataset): 8 | def __init__(self, json_data, label_count, personalized_feature_file, max_len=10, batch_size=32, audio_path='', video_path='', isTest=False): 9 | self.data = json_data 10 | self.max_len = max_len # Expected 
sequence length 11 | self.batch_size = batch_size 12 | self.isTest = isTest 13 | 14 | # Load personalized features 15 | self.personalized_features = self.load_personalized_features(personalized_feature_file) 16 | self.audio_path = audio_path 17 | self.video_path = video_path 18 | self.label_count = label_count 19 | 20 | def __len__(self): 21 | return len(self.data) 22 | 23 | def fixed_windows(self, features: torch.Tensor, fixLen=4): 24 | """ 25 | Divides 2D features into fixLen fixed windows and aggregates them into fixed-size results (Tensor version). 26 | 27 | Parameters. 28 | - features: the input feature tensor of (timesteps, feature_dim) 29 | 30 | Returns. 31 | - Tensor of (4, feature_dim), each row represents a window of aggregated features 32 | """ 33 | timesteps, feature_dim = features.shape 34 | window_size = int(torch.ceil(torch.tensor(timesteps / fixLen))) 35 | windows = [] 36 | for i in range(fixLen): 37 | start = i * window_size 38 | end = min(start + window_size, timesteps) 39 | window = features[start:end] 40 | if window.size(0) > 0: 41 | window_aggregated = torch.mean(window, dim=0) 42 | windows.append(window_aggregated) 43 | else: 44 | windows.append(torch.zeros(feature_dim)) 45 | 46 | return torch.stack(windows, dim=0) 47 | 48 | def pad_or_truncate(self, feature, max_len): 49 | """Fill or truncate the input feature sequence""" 50 | if feature.shape[0] < max_len: 51 | padding = torch.zeros((max_len - feature.shape[0], feature.shape[1])) 52 | feature = torch.cat((feature, padding), dim=0) 53 | else: 54 | feature = feature[:max_len] 55 | return feature 56 | 57 | def load_personalized_features(self, file_path): 58 | """ 59 | Load personalized features from the .npy file. 60 | """ 61 | 62 | data = np.load(file_path, allow_pickle=True) 63 | if isinstance(data, np.ndarray) and isinstance(data[0], dict): 64 | return {entry["id"]: entry["embedding"] for entry in data} 65 | else: 66 | raise ValueError("Unexpected data format in the .npy file. 
Ensure it contains a list of dictionaries.") 67 | 68 | def __getitem__(self, idx): 69 | entry = self.data[idx] 70 | 71 | # Load audio and video features 72 | audio_feature = np.load(self.audio_path + '/' + entry['audio_feature_path']) 73 | video_feature = np.load(self.video_path + '/' + entry['video_feature_path']) 74 | audio_feature = torch.tensor(audio_feature, dtype=torch.float32) 75 | video_feature = torch.tensor(video_feature, dtype=torch.float32) 76 | 77 | audio_feature = self.pad_or_truncate(audio_feature, self.max_len) 78 | video_feature = self.pad_or_truncate(video_feature, self.max_len) 79 | 80 | # Load label 81 | if self.isTest == False: 82 | if self.label_count == 2: 83 | label = torch.tensor(entry['bin_category'], dtype=torch.long) 84 | elif self.label_count == 3: 85 | label = torch.tensor(entry['tri_category'], dtype=torch.long) 86 | elif self.label_count == 5: 87 | label = torch.tensor(entry['pen_category'], dtype=torch.long) 88 | else: 89 | label = 0 90 | 91 | import os 92 | 93 | filepath = entry['audio_feature_path'] # the filename containing path to features 94 | filename = os.path.basename(filepath) 95 | # Extract the person id and convert it to an integer 96 | person_id = int(filename.split('_')[0]) 97 | personalized_id = str(person_id) 98 | 99 | if personalized_id in self.personalized_features: 100 | personalized_feature = torch.tensor(self.personalized_features[personalized_id], dtype=torch.float32) 101 | else: 102 | # If no personalized feature is found, use a zero vector 103 | personalized_feature = torch.zeros(1024, dtype=torch.float32) 104 | print(f"❗Personalized feature not found for id: {personalized_id}") 105 | 106 | return { 107 | 'A_feat': audio_feature, 108 | 'V_feat': video_feature, 109 | 'emo_label': label, 110 | 'personalized_feat': personalized_feature 111 | } 112 | -------------------------------------------------------------------------------- /feature_extraction/audio/extra_mfcc64.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import os 4 | 5 | 6 | def extract_and_save_mfcc(input_dir, output_dir, n_mfcc=64, frame_length=2048, hop_length=512): 7 | """ 8 | Batch-extract 64-dimensional MFCC features from the audio files in the given directory and save them as .npy files. 9 | 10 | Parameters: 11 | input_dir (str): path to the input directory containing all .wav files. 12 | output_dir (str): path to the output directory where the .npy files are saved. 13 | n_mfcc (int): MFCC feature dimension, 64 by default. 14 | frame_length (int): length of each frame (default 2048, ~46 ms). 15 | hop_length (int): hop length between frames (default 512, ~11 ms). 16 | """ 17 | # Make sure the output directory exists 18 | os.makedirs(output_dir, exist_ok=True) 19 | 20 | # Iterate over all .wav files in the input directory 21 | for file_name in os.listdir(input_dir): 22 | if file_name.endswith('.wav'): 23 | input_path = os.path.join(input_dir, file_name) 24 | output_path = os.path.join(output_dir, file_name.replace('.wav', '.npy')) 25 | 26 | try: 27 | y, sr = librosa.load(input_path, sr=None) 28 | mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=frame_length, hop_length=hop_length) 29 | mfcc_transposed = mfcc.T # transpose to (n_frames, n_mfcc) 30 | 31 | # Save as a .npy file 32 | np.save(output_path, mfcc_transposed) 33 | 34 | # Print the file name and feature shape 35 | print(f"Processed {file_name}: MFCC shape {mfcc_transposed.shape}") 36 | except Exception as e: 37 | print(f"Error processing {file_name}: {e}") 38 | 39 | 40 | # Input and output directory paths 41 | input_dir = r"D:\HACI\MMchallenge\Audio_split1\Audio_split_16k" # replace with your input directory path 42 | output_dir = r"D:\HACI\MMchallenge\Audio_split1\features\mfccs" # replace with your output directory path 43 | 44 | # Batch-extract and save the features 45 | extract_and_save_mfcc(input_dir, output_dir)
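# (Added illustration -- not part of the original script.)
# A quick sanity check on the saved features: load one generated .npy file and
# confirm it has the documented (n_frames, 64) MFCC layout before training.
# The glob pattern below is a placeholder for wherever output_dir points on your machine.
import glob
import numpy as np
example = glob.glob(r"D:\HACI\MMchallenge\Audio_split1\features\mfccs\*.npy")[0]
mfcc_check = np.load(example)
print(example, mfcc_check.shape)
assert mfcc_check.ndim == 2 and mfcc_check.shape[1] == 64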
-------------------------------------------------------------------------------- /feature_extraction/audio/extra_opensmile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import opensmile 4 | 5 | input_audio_dir = r"D:\HACI\MMchallenge\Audio_split1\Audio_split_16k" # Directory containing audio files 6 | output_feature_dir = r"D:\HACI\MMchallenge\Audio_split1\features\opensmile" # Directory to save .npy feature files 7 | 8 | os.makedirs(output_feature_dir, exist_ok=True) 9 | 10 | smile = opensmile.Smile( 11 | feature_set=opensmile.FeatureSet.ComParE_2016, 12 | feature_level=opensmile.FeatureLevel.Functionals, 13 | ) 14 | 15 | for audio_file in os.listdir(input_audio_dir): 16 | if audio_file.endswith((".wav", ".mp3")): 17 | audio_path = os.path.join(input_audio_dir, audio_file) 18 | feature_file = os.path.splitext(audio_file)[0] + ".npy" 19 | output_path = os.path.join(output_feature_dir, feature_file) 20 | 21 | try: 22 | features = smile.process_file(audio_path) 23 | 24 | feature_array = features.to_numpy().flatten() 25 | 26 | print(f"Shape of features for {audio_file}: {feature_array.shape}") 27 | 28 | np.save(output_path, feature_array) 29 | print(f"Features saved for {audio_file} as {output_path}") 30 | except Exception as e: 31 | print(f"Error processing file {audio_file}: {e}") 32 | 33 | print("Feature extraction completed.") -------------------------------------------------------------------------------- /feature_extraction/audio/extract_wav2vec_embedding.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | """ 3 | wav2vec: https://arxiv.org/abs/1904.05862 4 | official github repo: https://github.com/pytorch/fairseq/tree/master/examples/wav2vec 5 | """ 6 | import os 7 | import time 8 | import glob 9 | import torch 10 | import numpy as np 11 | import soundfile as sf 12 | from fairseq.models.wav2vec import Wav2VecModel # Note: use fairseq version of 0.10.1 (pip install fairseq==0.10.1) 13 | 14 | def write_feature_to_npy(feature, csv_file, feature_level): 15 | if feature_level == 'UTTERANCE': 16 | feature = np.array(feature).squeeze() # [C,] 17 | if len(feature.shape) != 1: # change [T, C] => [C,] 18 | feature = np.mean(feature, axis=0) 19 | np.save(csv_file, feature) 20 | else: 21 | np.save(csv_file, feature) 22 | 23 | def extract(audio_files, feature_level, model, save_dir, overwrite=False, gpu=None): 24 | start_time = time.time() 25 | device = torch.device(f'cuda:{gpu}' if gpu is not None and torch.cuda.is_available() else 'cpu') 26 | 27 | dir_name = 'wav2vec-large' 28 | out_dir_z = os.path.join(save_dir, f'{dir_name}-z-{feature_level[:3]}') # features output by feature encoder 29 | if not os.path.exists(out_dir_z): 30 | os.makedirs(out_dir_z) 31 | elif overwrite or len(os.listdir(save_dir)) == 0: 32 | print(f'==> Warning: overwrite save_dir "{save_dir}"!') 33 | else: 34 | raise Exception(f'==> Error: save_dir "{save_dir}" already exists, set overwrite=True if needed!') 35 | 36 | out_dir_c = os.path.join(save_dir, f'{dir_name}-c-{feature_level[:3]}') # features output by context network 37 | if not os.path.exists(out_dir_c): 38 | os.makedirs(out_dir_c) 39 | elif overwrite or len(os.listdir(save_dir)) == 0: 40 | print(f'==> Warning: overwrite save_dir "{save_dir}"!') 41 | else: 42 | raise Exception(f'==> Error: save_dir "{save_dir}" already exists, set overwrite=True if needed!') 43 | 44 | for idx, wav_file in enumerate(audio_files, 1): 45 | 
file_name = os.path.basename(wav_file) 46 | vid = file_name[:-4] 47 | print(f'Processing "{file_name}" ({idx}/{len(audio_files)})...') 48 | # load audio 49 | audio, sampling_rate = sf.read(wav_file) 50 | audio = audio.astype('float32')[np.newaxis, :] 51 | audio = torch.from_numpy(audio) 52 | audio = audio.to(device) 53 | assert sampling_rate == 16000, f'Error: sampling rate ({sampling_rate}) != 16k!' 54 | with torch.no_grad(): 55 | z = model.feature_extractor(audio) # (1, C, T), stride: 10ms (100Hz), receptive field: 30ms 56 | c = model.feature_aggregator(z) # (1, C, T), stride: 10ms (100Hz), receptive field: 801ms (for large version) 57 | 58 | z_feature = z.detach().squeeze().t().cpu().numpy() 59 | c_feature = c.detach().squeeze().t().cpu().numpy() 60 | z_csv_file = os.path.join(out_dir_z, f'{vid}.npy') 61 | c_csv_file = os.path.join(out_dir_c, f'{vid}.npy') 62 | write_feature_to_npy(z_feature, z_csv_file, feature_level) 63 | write_feature_to_npy(c_feature, c_csv_file, feature_level) 64 | 65 | end_time = time.time() 66 | print(f'Total time used: {end_time - start_time:.1f}s.') 67 | 68 | if __name__ == '__main__': 69 | gpu = 0 70 | feature_level = 'UTTERANCE' 71 | overwrite = True 72 | audio_dir = '/path/to/audio' # Replace with your audio directory 73 | save_dir = '/path/to/save' # Replace with your save directory 74 | model_path = '/path/to/model/wav2vec_large.pt' # Replace with your model path 75 | 76 | # in: get audios (assert file extension is '.wav') 77 | audio_files = glob.glob(os.path.join(audio_dir, '*.wav')) 78 | print(f'Find total "{len(audio_files)}" audio files.') 79 | 80 | device = torch.device(f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu') 81 | cp = torch.load(model_path, map_location=device) 82 | model = Wav2VecModel.build_model(cp['args'], task=None) 83 | model.load_state_dict(cp['model']) 84 | model.to(device) 85 | model.eval() 86 | 87 | # extract features 88 | extract(audio_files, feature_level=feature_level, model=model, save_dir=save_dir, overwrite=overwrite, gpu=gpu) 89 | -------------------------------------------------------------------------------- /feature_extraction/feature_personalized/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | # 基于编辑器的 HTTP 客户端请求 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /feature_extraction/feature_personalized/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /feature_extraction/feature_personalized/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /feature_extraction/feature_personalized/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /feature_extraction/feature_personalized/.idea/生成个性化特征+嵌入.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- 
/feature_extraction/feature_personalized/extrapersonality.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["HF_ENDPOINT"]="https://hf-mirror.com" 3 | 4 | import json 5 | import numpy as np 6 | import torch 7 | from transformers import RobertaTokenizer, RobertaModel 8 | 9 | 10 | def load_data(json_file): 11 | with open(json_file, "r") as f: 12 | data = json.load(f) 13 | return data 14 | 15 | 16 | def generate_embeddings(descriptions, model, tokenizer, output_file): 17 | """ 18 | Generate embeddings for each description and save them along with their IDs. 19 | """ 20 | embeddings_with_ids = [] 21 | 22 | model.eval() # Set model to evaluation mode 23 | 24 | with torch.no_grad(): # Disable gradient computation 25 | for id_, description in descriptions.items(): 26 | print(f"Processing ID: {id_}") 27 | # Tokenize the description 28 | encoded_input = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512) 29 | 30 | # Get the model output 31 | output = model(**encoded_input) 32 | 33 | # Extract the CLS token representation 34 | embedding = output.last_hidden_state[:, 0, :].squeeze().numpy() 35 | print(embedding.shape) 36 | 37 | # Append the ID and its embedding as a dictionary entry 38 | embeddings_with_ids.append({"id": id_, "embedding": embedding}) 39 | 40 | # Save the embeddings and IDs as a numpy array 41 | np.save(output_file, embeddings_with_ids, allow_pickle=True) 42 | print(f"Embeddings and IDs saved to {output_file}") 43 | 44 | 45 | def main(): 46 | # Path to the input JSON file 47 | json_file = "./GLM_data/personalized_descriptions.json" 48 | 49 | # Path to save the output embeddings 50 | output_file = "./GLM_data/descriptions_embeddings_with_ids.npy" 51 | # Load Roberta model and tokenizer from Hugging Face 52 | model_name = "roberta-large" 53 | tokenizer = RobertaTokenizer.from_pretrained(model_name) 54 | model = RobertaModel.from_pretrained(model_name) 55 | 56 | # Load the personalized descriptions 57 | descriptions = load_data(json_file) 58 | print(f"Loaded {len(descriptions)} descriptions.") 59 | 60 | # Generate and save embeddings with IDs 61 | generate_embeddings(descriptions, model, tokenizer, output_file) 62 | 63 | 64 | if __name__ == "__main__": 65 | main() 66 | -------------------------------------------------------------------------------- /feature_extraction/feature_personalized/gen_describtion.py: -------------------------------------------------------------------------------- 1 | ################################################################################################################ 2 | # NOTE: # 3 | # The variable `big5_scores` used in this script corresponds to the field `big5_traits` in actual use. # 4 | # Please replace `big5_scores` with `big5_traits` when running this script on our data. 
# 5 | ################################################################################################################ 6 | 7 | import os 8 | os.environ["HF_ENDPOINT"]="https://hf-mirror.com" 9 | 10 | import json 11 | from transformers import AutoTokenizer, AutoModel 12 | 13 | # Set GPU device 14 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 15 | 16 | # ----------------加载模型-----------------# 17 | tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True) 18 | model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, device='cuda') 19 | model = model.eval() 20 | # ---------------------------------# 21 | 22 | 23 | def generate_patient_prompt(patient_data): 24 | """ 25 | Generate a structured prompt for personalized description based on patient data. 26 | """ 27 | # Extract relevant information 28 | big5_scores = patient_data.get("big5_scores", {}) 29 | age = patient_data.get("age", "unknown") 30 | gender = patient_data.get("gender", "unknown") 31 | native_place = patient_data.get("native_place", "unknown") 32 | # financial_stress = patient_data.get("family_factors", {}).get("Financial_Stress", "unknown") 33 | # family_members = patient_data.get("family_factors", {}).get("Family_Members", "unknown") 34 | # disease = patient_data.get("disease", "unknown") 35 | 36 | # Interpret scores 37 | extroversion = big5_scores.get("Extraversion", "unknown") 38 | agreeableness = big5_scores.get("Agreeableness", "unknown") 39 | openness = big5_scores.get("Openness", "unknown") 40 | neuroticism = big5_scores.get("Neuroticism", "unknown") 41 | conscientiousness = big5_scores.get("Conscientiousness", "unknown") 42 | 43 | # Explain financial stress and disease 44 | # financial_stress_desc = { 45 | # 0: "no financial stress", 46 | # 1: "mild financial stress", 47 | # 2: "moderate financial stress", 48 | # 3: "severe financial stress" 49 | # }.get(financial_stress, "unknown financial stress level") 50 | 51 | # disease_desc = { 52 | # "0": "the patient is healthy", 53 | # "1": "the patient has other diseases", 54 | # "2": "the patient has endocrine diseases", 55 | # "3": "the patient has circulatory system diseases", 56 | # "4": "the patient has neurological diseases" 57 | # }.get(disease, "unknown disease status") 58 | 59 | # Construct the prompt 60 | prompt = ( 61 | f"The patient is a {age}-year-old {gender} from {native_place}. " 62 | f"The patient's Extraversion score is {extroversion}. " 63 | f"The Agreeableness score is {agreeableness}. " 64 | f"The Openness score is {openness}. " 65 | f"The Neuroticism score is {neuroticism}. " 66 | f"The Conscientiousness score is {conscientiousness}. " 67 | # f"Their financial stress is categorized as {financial_stress_desc}, and they live with {family_members} family members. " 68 | # f"Based on the disease classification, {disease_desc}. " 69 | "Please generate a concise, fluent English description summarizing the patient's key personality traits, family environment, and other notable characteristics. " 70 | "Avoid mentioning depression or related terminology. " 71 | "Output the response as a single paragraph." 72 | ) 73 | 74 | return prompt 75 | 76 | 77 | def process_dataset(json_file, output_file): 78 | """ 79 | Process the JSON dataset and generate personalized descriptions. 
80 | """ 81 | with open(json_file, "r") as f: 82 | dataset = json.load(f) 83 | 84 | # Initialize the dictionary to store results 85 | results = {} 86 | 87 | # Open the output file in write mode 88 | with open(output_file, "w") as f: 89 | for patient_id, patient_data in dataset.items(): 90 | print(f"Processing patient ID: {patient_id}") 91 | patient_prompt = generate_patient_prompt(patient_data) 92 | print(f"Generated prompt for patient {patient_id}: {patient_prompt}") 93 | 94 | # Use model.chat to generate personalized response 95 | response, history = model.chat(tokenizer, patient_prompt, history=[], temperature=0.1) 96 | print(f"Generated description for patient {patient_id}: {response}") 97 | 98 | # Store the result in the dictionary 99 | results[patient_id] = response 100 | 101 | # Write the current results to the JSON file 102 | json.dump(results, f, ensure_ascii=False, indent=4) 103 | f.write("\n") # Add a newline for better readability in the JSON file 104 | 105 | print(f"All patient descriptions saved to {output_file}.") 106 | 107 | 108 | if __name__ == "__main__": 109 | # Path to the dataset 110 | json_file = "./GLM_data/label_data.json" 111 | # Output description file 112 | output_file = "./GLM_data/personalized_descriptions.json" 113 | 114 | process_dataset(json_file, output_file) 115 | -------------------------------------------------------------------------------- /feature_extraction/visual/__pycache__/dataset.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/feature_extraction/visual/__pycache__/dataset.cpython-311.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/__pycache__/dataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/feature_extraction/visual/__pycache__/dataset.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/__pycache__/util.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/feature_extraction/visual/__pycache__/util.cpython-311.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/__pycache__/util.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/feature_extraction/visual/__pycache__/util.cpython-38.pyc -------------------------------------------------------------------------------- /feature_extraction/visual/extract_openface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import shutil 4 | import argparse 5 | import numpy as np 6 | from util import read_hog, read_csv 7 | 8 | import sys 9 | sys.path.append('../../') 10 | 11 | 12 | def generate_face_faceDir(input_root, save_root): 13 | for dir_path in glob.glob(input_root + '/*_aligned'): # 'xx/xx/000100_guest_aligned' 14 | frame_names = os.listdir(dir_path) # ['xxx.bmp'] 15 | # assert len(frame_names) <= 1 16 | if len(frame_names) == 1: # move frame to face_root 17 | frame_path = os.path.join(dir_path, 
frame_names[0]) # 'xx/xx/000100_guest_aligned/xxx.bmp' 18 | name = os.path.basename(dir_path)[:-len('_aligned')] # '000100_guest' 19 | save_path = os.path.join(save_root, name + '.bmp') 20 | shutil.copy(frame_path, save_path) 21 | 22 | 23 | def generate_face_videoOne(input_root, save_root): 24 | for dir_path in glob.glob(input_root + '/*_aligned'): # 'xx/xx/000100_guest_aligned' 25 | frame_names = os.listdir(dir_path) # ['xxx.bmp'] 26 | for ii in range(len(frame_names)): 27 | frame_path = os.path.join(dir_path, frame_names[ii]) # 'xx/xx/000100_guest_aligned/xxx.bmp' 28 | frame_name = os.path.basename(frame_path) 29 | save_path = os.path.join(save_root, frame_name) 30 | shutil.copy(frame_path, save_path) 31 | 32 | 33 | def generate_hog(input_root, save_root): 34 | for hog_path in glob.glob(input_root + '/*.hog'): 35 | csv_path = hog_path[:-4] + '.csv' 36 | if os.path.exists(csv_path): 37 | hog_name = os.path.basename(hog_path)[:-4] 38 | _, feature = read_hog(hog_path) 39 | save_path = os.path.join(save_root, hog_name + '.npy') 40 | np.save(save_path, feature) 41 | 42 | 43 | def generate_csv(input_root, save_root, startIdx): 44 | for csv_path in glob.glob(input_root + '/*.csv'): 45 | csv_name = os.path.basename(csv_path)[:-4] 46 | feature = read_csv(csv_path, startIdx) 47 | save_path = os.path.join(save_root, csv_name + '.npy') 48 | np.save(save_path, feature) 49 | 50 | 51 | def extract(input_dir, process_type, save_dir, face_dir, hog_dir, pose_dir): 52 | # process folders 53 | vids = os.listdir(input_dir) 54 | print(f'Find total "{len(vids)}" videos.') 55 | for i, vid in enumerate(vids, 1): 56 | print(vid) 57 | # if vid > '011_003_088': continue 58 | saveVid = vid ## for folder 59 | if vid.endswith('.mp4') or vid.endswith('.avi'): saveVid = vid[:-4] # for mp4 or avi files 60 | 61 | print(f"Processing video '{vid}' ({i}/{len(vids)})...") 62 | input_root = os.path.join(input_dir, vid) # exists 63 | save_root = os.path.join(save_dir, saveVid) 64 | face_root = os.path.join(face_dir, saveVid) 65 | hog_root = os.path.join(hog_dir, saveVid) 66 | pose_root = os.path.join(pose_dir, saveVid) 67 | # if os.path.exists(face_root): continue 68 | if not os.path.exists(save_root): os.makedirs(save_root) 69 | if not os.path.exists(face_root): os.makedirs(face_root) 70 | if not os.path.exists(hog_root): os.makedirs(hog_root) 71 | if not os.path.exists(pose_root): os.makedirs(pose_root) 72 | if process_type == 'faceDir': 73 | exe_path = os.path.join(r'.\tools\OpenFace_2.2.0_win_x64', # 指向OpenFace工具的安装路径 74 | 'FaceLandmarkImg.exe') 75 | commond = '%s -fdir \"%s\" -out_dir \"%s\"' % (exe_path, input_root, save_root) 76 | os.system(commond) 77 | generate_face_faceDir(save_root, face_root) 78 | generate_hog(save_root, hog_root) 79 | generate_csv(save_root, pose_root, startIdx=2) 80 | elif process_type == 'videoOne': 81 | exe_path = os.path.join(r'.\tools\OpenFace_2.2.0_win_x64', 82 | 'FeatureExtraction.exe') 83 | commond = '%s -f \"%s\" -out_dir \"%s\"' % (exe_path, input_root, save_root) 84 | os.system(commond) 85 | generate_face_videoOne(save_root, face_root) 86 | generate_hog(save_root, hog_root) 87 | generate_csv(save_root, pose_root, startIdx=5) 88 | 89 | 90 | if __name__ == '__main__': 91 | parser = argparse.ArgumentParser(description='Run.') 92 | parser.add_argument('--overwrite', action='store_true', default=True, 93 | help='whether overwrite existed feature folder.') 94 | parser.add_argument('--dataset', type=str, default='BoxOfLies', help='input dataset') 95 | parser.add_argument('--type', type=str, 
default='faceDir', choices=['faceDir', 'videoOne'], 96 | help='faceDir: process on facedirs; videoOne: process on one video') 97 | params = parser.parse_args() 98 | 99 | print(f'==> Extracting openface features...') 100 | 101 | # in: face dir 102 | dataset = params.dataset 103 | process_type = params.type 104 | input_dir = r"E:\MEIJU_data20241229\frame_5s" 105 | 106 | # out: feature csv dir 107 | 108 | save_dir = os.path.join(r"\features\openface\frame_5s", 'openface_all') 109 | hog_dir = os.path.join(r"\features\openface\frame_5s", 'openface_hog') 110 | pose_dir = os.path.join(r"\features\openface\frame_5s", 'openface_pose') 111 | face_dir = os.path.join(r"\features\openface\frame_5s", 'openface_face') 112 | 113 | if not os.path.exists(save_dir): 114 | os.makedirs(save_dir) 115 | elif params.overwrite: 116 | print(f'==> Warning: overwrite save_dir "{save_dir}"!') 117 | else: 118 | raise Exception(f'==> Error: save_dir "{save_dir}" already exists, set overwrite=TRUE if needed!') 119 | 120 | if not os.path.exists(hog_dir): 121 | os.makedirs(hog_dir) 122 | elif params.overwrite: 123 | print(f'==> Warning: overwrite save_dir "{hog_dir}"!') 124 | else: 125 | raise Exception(f'==> Error: save_dir "{hog_dir}" already exists, set overwrite=TRUE if needed!') 126 | 127 | if not os.path.exists(pose_dir): 128 | os.makedirs(pose_dir) 129 | elif params.overwrite: 130 | print(f'==> Warning: overwrite save_dir "{pose_dir}"!') 131 | else: 132 | raise Exception(f'==> Error: save_dir "{pose_dir}" already exists, set overwrite=TRUE if needed!') 133 | 134 | if not os.path.exists(face_dir): 135 | os.makedirs(face_dir) 136 | elif params.overwrite: 137 | print(f'==> Warning: overwrite save_dir "{face_dir}"!') 138 | else: 139 | raise Exception(f'==> Error: save_dir "{face_dir}" already exists, set overwrite=TRUE if needed!') 140 | 141 | # process 142 | extract(input_dir, process_type, save_dir, face_dir, hog_dir, pose_dir) 143 | 144 | print(f'==> Finish') 145 | -------------------------------------------------------------------------------- /feature_extraction/visual/extract_resnet+densnet.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | from torchvision.models import resnet50, densenet121 4 | from torchvision.transforms import transforms 5 | import os 6 | import tqdm 7 | import torch.utils.data as data 8 | import glob 9 | import argparse 10 | import numpy as np 11 | from PIL import Image 12 | 13 | class FrameDataset(data.Dataset): 14 | def __init__(self, vid, face_dir, transform=None): 15 | super(FrameDataset, self).__init__() 16 | self.vid = vid 17 | self.path = os.path.join(face_dir, vid) 18 | self.transform = transform 19 | self.frames = self.get_frames() 20 | 21 | def get_frames(self): 22 | frames = glob.glob(os.path.join(self.path, '*')) 23 | return frames 24 | 25 | def __len__(self): 26 | return len(self.frames) 27 | 28 | def __getitem__(self, index): 29 | path = self.frames[index] 30 | img = Image.open(path) 31 | if self.transform is not None: 32 | img = self.transform(img) 33 | name = os.path.basename(path)[:-4] 34 | return img, name 35 | 36 | def frame_extract(video_path, root_save_path, sample_rate=2): 37 | video_name = os.path.basename(video_path)[:-4] 38 | save_dir = os.path.join(root_save_path, video_name) 39 | if not os.path.exists(save_dir): 40 | os.mkdir(save_dir) 41 | 42 | video = cv2.VideoCapture(video_path) 43 | 44 | count = 0 45 | while video.isOpened(): 46 | ret, frame = video.read() 47 | if not ret: 48 | break 49 | 50 | 
if count % sample_rate == 0: 51 | save_path = os.path.join(root_save_path, video_name, f'frame{count:04d}.jpg') 52 | cv2.imwrite(save_path, frame) 53 | # break 54 | count += 1 55 | 56 | video.release() 57 | cv2.destroyAllWindows() 58 | 59 | 60 | def extract(data_loader, model): 61 | model.eval() 62 | with torch.no_grad(): 63 | features, timestamps = [], [] 64 | for images, names in data_loader: 65 | # images = images.cuda() 66 | embedding = model(images) 67 | features.append(embedding.cpu().detach().numpy()) 68 | timestamps.extend(names) 69 | features, timestamps = np.row_stack(features), np.array(timestamps) 70 | return features, timestamps 71 | 72 | 73 | def feature_extract(frame_dir, save_dir, feature_level='UTT'): 74 | if not os.path.exists(save_dir): 75 | os.mkdir(save_dir) 76 | 77 | model = resnet50(pretrained=True)#.cuda() 78 | transform = transforms.Compose([ 79 | # transforms.ToPILImage(), 80 | transforms.Resize((224, 224)), 81 | transforms.ToTensor(), 82 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 83 | ]) 84 | vids = os.listdir(frame_dir) 85 | EMBEDDING_DIM = -1 86 | print(f'Find total "{len(vids)}" videos.') 87 | for i, vid in enumerate(vids, 1): 88 | print(f"Processing video '{vid}' ({i}/{len(vids)})...") 89 | csv_file = os.path.join(save_dir, f'{vid}.npy') 90 | if os.path.exists(csv_file): 91 | continue 92 | 93 | # forward 94 | dataset = FrameDataset(vid, frame_dir, transform=transform) 95 | if len(dataset) == 0: 96 | print("Warning: number of frames of video {} should not be zero.".format(vid)) 97 | embeddings, framenames = [], [] 98 | else: 99 | data_loader = torch.utils.data.DataLoader(dataset, 100 | batch_size=32, 101 | num_workers=4, 102 | pin_memory=True) 103 | embeddings, framenames = extract(data_loader, model) 104 | 105 | # save results 106 | indexes = np.argsort(framenames) 107 | embeddings = embeddings[indexes] 108 | EMBEDDING_DIM = max(EMBEDDING_DIM, np.shape(embeddings)[-1]) 109 | 110 | if feature_level == 'FRAME': 111 | embeddings = np.array(embeddings).squeeze() 112 | if len(embeddings) == 0: 113 | embeddings = np.zeros((1, EMBEDDING_DIM)) 114 | elif len(embeddings.shape) == 1: 115 | embeddings = embeddings[np.newaxis, :] 116 | np.save(csv_file, embeddings) # shape = (frame_num, 1000) 117 | else: 118 | embeddings = np.array(embeddings).squeeze() 119 | if len(embeddings) == 0: 120 | embeddings = np.zeros((EMBEDDING_DIM, )) 121 | elif len(embeddings.shape) == 2: 122 | embeddings = np.mean(embeddings, axis=0) 123 | np.save(csv_file, embeddings) 124 | 125 | 126 | 127 | def visual_extraction(): 128 | sample_rate = 10 129 | video_path = 'D:/HACI/MMchallenge/Video_split1/Video_split1' 130 | video_name = os.listdir(video_path) 131 | for video in tqdm.tqdm(video_name): 132 | if 'mp4' in video: 133 | video_path = os.path.join(video_path, video) 134 | if not os.path.exists('D:/HACI/MMchallenge/Video_split1/frame'): 135 | os.mkdir('D:/HACI/MMchallenge/Video_split1/frame') 136 | frame_extract(video_path, r'D:/HACI/MMchallenge/Video_split1/frame', sample_rate=sample_rate) 137 | 138 | print('Finished extracting frame!') 139 | 140 | 141 | video_frame_dir = 'D:/HACI/MMchallenge/Video_split1/frame' 142 | save_dir = 'D:/HACI/MMchallenge/Video_split1/features' 143 | feature_extract(video_frame_dir, save_dir, feature_level='UTT') 144 | 145 | 146 | if __name__ == '__main__': 147 | parser = argparse.ArgumentParser(description='Run.') 148 | parser.add_argument('--overwrite', action='store_true', default=True, help='whether overwrite existed feature 
folder.') 149 | parser.add_argument('--dataset', type=str, default='BoxOfLies', help='input dataset') 150 | params = parser.parse_args() 151 | 152 | print(f'==> Extracting resnet features...') 153 | 154 | dataset = params.dataset 155 | input_dir = 'D:/HACI/MMchallenge/Video_split1/frame' 156 | 157 | # out: feature csv dir 158 | save_dir = 'D:/HACI/MMchallenge/Video_split1/features' 159 | 160 | 161 | visual_extraction() 162 | 163 | pass 164 | -------------------------------------------------------------------------------- /feature_extraction/visual/util.py: -------------------------------------------------------------------------------- 1 | # *_*coding:utf-8 *_* 2 | import os 3 | import re 4 | import pandas as pd 5 | import numpy as np 6 | import struct 7 | 8 | ## for OPENFACE 9 | ## reference: https://gist.github.com/btlorch/6d259bfe6b753a7a88490c0607f07ff8 10 | def read_hog(filename, batch_size=5000): 11 | """ 12 | Read HoG features file created by OpenFace. 13 | For each frame, OpenFace extracts 12 * 12 * 31 HoG features, i.e., num_features = 4464. These features are stored in row-major order. 14 | :param filename: path to .hog file created by OpenFace 15 | :param batch_size: how many rows to read at a time 16 | :return: is_valid, hog_features 17 | is_valid: ndarray of shape [num_frames] 18 | hog_features: ndarray of shape [num_frames, num_features] 19 | """ 20 | all_feature_vectors = [] 21 | with open(filename, "rb") as f: 22 | num_cols, = struct.unpack("i", f.read(4)) # 12 23 | num_rows, = struct.unpack("i", f.read(4)) # 12 24 | num_channels, = struct.unpack("i", f.read(4)) # 31 25 | 26 | # The first four bytes encode a boolean value whether the frame is valid 27 | num_features = 1 + num_rows * num_cols * num_channels 28 | feature_vector = struct.unpack("{}f".format(num_features), f.read(num_features * 4)) 29 | feature_vector = np.array(feature_vector).reshape((1, num_features)) # [1, 4464+1] 30 | all_feature_vectors.append(feature_vector) 31 | 32 | # Every frame contains a header of four float values: num_cols, num_rows, num_channels, is_valid 33 | num_floats_per_feature_vector = 4 + num_rows * num_cols * num_channels 34 | # Read in batches of given batch_size 35 | num_floats_to_read = num_floats_per_feature_vector * batch_size 36 | # Multiply by 4 because of float32 37 | num_bytes_to_read = num_floats_to_read * 4 38 | 39 | while True: 40 | bytes = f.read(num_bytes_to_read) 41 | # For comparison how many bytes were actually read 42 | num_bytes_read = len(bytes) 43 | assert num_bytes_read % 4 == 0, "Number of bytes read does not match with float size" 44 | num_floats_read = num_bytes_read // 4 45 | assert num_floats_read % num_floats_per_feature_vector == 0, "Number of bytes read does not match with feature vector size" 46 | num_feature_vectors_read = num_floats_read // num_floats_per_feature_vector 47 | 48 | feature_vectors = struct.unpack("{}f".format(num_floats_read), bytes) 49 | # Convert to array 50 | feature_vectors = np.array(feature_vectors).reshape((num_feature_vectors_read, num_floats_per_feature_vector)) 51 | # Discard the first three values in each row (num_cols, num_rows, num_channels) 52 | feature_vectors = feature_vectors[:, 3:] 53 | # Append to list of all feature vectors that have been read so far 54 | all_feature_vectors.append(feature_vectors) 55 | 56 | if num_bytes_read < num_bytes_to_read: 57 | break 58 | 59 | # Concatenate batches 60 | all_feature_vectors = np.concatenate(all_feature_vectors, axis=0) 61 | 62 | # Split into is-valid and feature vectors 63 | 
is_valid = all_feature_vectors[:, 0] 64 | feature_vectors = all_feature_vectors[:, 1:] 65 | 66 | return is_valid, feature_vectors 67 | 68 | 69 | ## for OPENFACE 70 | def read_csv(filename, startIdx): 71 | data = pd.read_csv(filename) 72 | all_feature_vectors = [] 73 | for index in data.index: 74 | features = np.array(data.iloc[index][startIdx:]) 75 | all_feature_vectors.append(features) 76 | all_feature_vectors = np.array(all_feature_vectors) 77 | return all_feature_vectors 78 | 79 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | """This package contains modules related to objective functions, optimizations, and network architectures. 2 | 3 | To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel. 4 | You need to implement the following five functions: 5 | -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). 6 | -- <set_input>: unpack data from dataset and apply preprocessing. 7 | -- <forward>: produce intermediate results. 8 | -- <optimize_parameters>: calculate loss, gradients, and update network weights. 9 | -- <modify_commandline_options>: (optionally) add model-specific options and set default options. 10 | 11 | In the function <__init__>, you need to define four lists: 12 | -- self.loss_names (str list): specify the training losses that you want to plot and save. 13 | -- self.model_names (str list): define networks used in our training. 14 | -- self.visual_names (str list): specify the images that you want to display and save. 15 | -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for a usage example. 16 | 17 | Now you can use the model class by specifying flag '--model dummy'. 18 | See our template model class 'template_model.py' for more details. 19 | """ 20 | 21 | import importlib 22 | from models.base_model import BaseModel 23 | 24 | 25 | def find_model_using_name(model_name): 26 | """Import the module "models/[model_name]_model.py". 27 | 28 | In the file, the class called DatasetNameModel() will 29 | be instantiated. It has to be a subclass of BaseModel, 30 | and it is case-insensitive. 31 | """ 32 | if 'MISA' in model_name: 33 | model_filename = "models." + model_name + "_model" 34 | print(model_filename) 35 | elif 'our' in model_name: 36 | model_filename = "models.our." + model_name + "_model" 37 | else: 38 | model_filename = "models." + model_name + "_model" 39 | modellib = importlib.import_module(model_filename) 40 | model = None 41 | target_model_name = model_name.replace('_', '') + 'model' 42 | for name, cls in modellib.__dict__.items(): 43 | if name.lower() == target_model_name.lower() \ 44 | and issubclass(cls, BaseModel): 45 | model = cls 46 | 47 | if model is None: 48 | print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name)) 49 | exit(0) 50 | 51 | return model 52 | 53 | 54 | def get_option_setter(model_name): 55 | """Return the static method <modify_commandline_options> of the model class.""" 56 | model_class = find_model_using_name(model_name) 57 | return model_class.modify_commandline_options 58 | 59 | 60 | def create_model(opt): 61 | """Create a model given the option. 62 | 63 | This function wraps the class CustomDatasetDataLoader.
64 | This is the main interface between this package and 'train.py'/'test.py' 65 | 66 | Example: 67 | >>> from models import create_model 68 | >>> model = create_model(opt) 69 | """ 70 | model = find_model_using_name(opt.model) 71 | instance = model(opt) 72 | print("model [%s] was created" % type(instance).__name__) 73 | return instance 74 | 75 | 76 | -------------------------------------------------------------------------------- /models/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /models/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/base_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/base_model.cpython-310.pyc -------------------------------------------------------------------------------- /models/__pycache__/base_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/base_model.cpython-311.pyc -------------------------------------------------------------------------------- /models/__pycache__/base_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/base_model.cpython-38.pyc -------------------------------------------------------------------------------- /models/__pycache__/pretrain_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/pretrain_model.cpython-310.pyc -------------------------------------------------------------------------------- /models/__pycache__/pretrain_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/pretrain_model.cpython-311.pyc -------------------------------------------------------------------------------- /models/__pycache__/pretrain_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/pretrain_model.cpython-38.pyc -------------------------------------------------------------------------------- 
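A quick usage sketch of the model factory above (the model name 'our' follows the branch in find_model_using_name; everything else here is an illustrative assumption, not code taken from this repo):

from models import find_model_using_name, get_option_setter, create_model

ModelClass = find_model_using_name('our')    # imports models.our.our_model and returns the BaseModel subclass whose lower-cased name is 'ourmodel'
option_setter = get_option_setter('our')     # that class's modify_commandline_options staticmethod
# model = create_model(opt)                  # instantiate it once opt carries the flags the class expects (model, gpu_ids, isTrain, checkpoints_dir, name, ...)
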
/models/base_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from collections import OrderedDict 4 | from abc import ABC, abstractmethod 5 | from .networks import tools 6 | 7 | 8 | class BaseModel(ABC): 9 | """This class is an abstract base class (ABC) for models. 10 | To create a subclass, you need to implement the following five functions: 11 | -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). 12 | -- <set_input>: unpack data from dataset and apply preprocessing. 13 | -- <forward>: produce intermediate results. 14 | -- <optimize_parameters>: calculate losses, gradients, and update network weights. 15 | -- <modify_commandline_options>: (optionally) add model-specific options and set default options. 16 | """ 17 | 18 | def __init__(self, opt): 19 | """Initialize the BaseModel class. 20 | 21 | Parameters: 22 | opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions 23 | 24 | When creating your custom class, you need to implement your own initialization. 25 | In this function, you should first call <BaseModel.__init__(self, opt)>. 26 | Then, you need to define four lists: 27 | -- self.loss_names (str list): specify the training losses that you want to plot and save. 28 | -- self.model_names (str list): define networks used in our training. 29 | -- self.visual_names (str list): specify the images that you want to display and save. 30 | -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example. 31 | """ 32 | self.opt = opt 33 | self.gpu_ids = opt.gpu_ids 34 | self.isTrain = opt.isTrain 35 | # self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu') # get device name: CPU or GPU 36 | self.device = None 37 | self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) # save all the checkpoints to save_dir 38 | # self.image_save_dir = os.path.join(opt.image_dir, opt.name) 39 | # self.save_shared_dir = os.path.join(opt.shared_dir, opt.name) 40 | if opt.cuda_benchmark: # with [scale_width], input images might have different sizes, which hurts the performance of cudnn.benchmark. 41 | torch.backends.cudnn.benchmark = True 42 | self.loss_names = [] 43 | self.model_names = [] 44 | self.optimizers = [] 45 | self.metric = 0 # used for learning rate policy 'plateau' 46 | 47 | 48 | @staticmethod 49 | def modify_commandline_options(parser, is_train): 50 | """Add new model-specific options, and rewrite default values for existing options. 51 | 52 | Parameters: 53 | parser -- original option parser 54 | is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options. 55 | 56 | Returns: 57 | the modified parser. 58 | """ 59 | return parser 60 | 61 | @abstractmethod 62 | def set_input(self, input): 63 | """Unpack input data from the dataloader and perform necessary pre-processing steps. 64 | 65 | Parameters: 66 | input (dict): includes the data itself and its metadata information.
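Example (a hedged sketch only; the dict keys and attribute names below are illustrative assumptions, and they presume the subclass has set self.device):
    def set_input(self, input):
        self.a_feat = input['a_feat'].float().to(self.device)   # acoustic features
        self.v_feat = input['v_feat'].float().to(self.device)   # visual features
        self.label = input['label'].to(self.device)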
67 | """ 68 | pass 69 | 70 | @abstractmethod 71 | def forward(self): 72 | """Run forward pass; called by both functions <optimize_parameters> and <test>.""" 73 | pass 74 | 75 | @abstractmethod 76 | def optimize_parameters(self): 77 | """Calculate losses, gradients, and update network weights; called in every training iteration""" 78 | pass 79 | 80 | def setup(self, opt): 81 | """Load and print networks; create schedulers 82 | 83 | Parameters: 84 | opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions 85 | """ 86 | if self.isTrain: 87 | self.schedulers = [tools.get_scheduler(optimizer, opt) for optimizer in self.optimizers] 88 | for name in self.model_names: 89 | net = getattr(self, 'net' + name) 90 | net = tools.init_net(net, opt.init_type, opt.init_gain, opt.gpu_ids) 91 | setattr(self, 'net' + name, net) 92 | else: 93 | self.eval() 94 | 95 | self.print_networks(opt.verbose) 96 | self.post_process() 97 | 98 | def cuda(self): 99 | assert(torch.cuda.is_available()) 100 | for name in self.model_names: 101 | net = getattr(self, 'net' + name) 102 | net.to(self.gpu_ids[0]) 103 | net = torch.nn.DataParallel(net, self.gpu_ids) # multi-GPUs 104 | 105 | def eval(self): 106 | """Make models eval mode during test time""" 107 | self.isTrain = False 108 | for name in self.model_names: 109 | if isinstance(name, str): 110 | net = getattr(self, 'net' + name) 111 | net.eval() 112 | 113 | def train(self, mode: bool = False): 114 | # """Make models back to train mode after test time""" 115 | # self.isTrain = True 116 | # for name in self.model_names: 117 | # if isinstance(name, str): 118 | # net = getattr(self, 'net' + name) 119 | # net.train() 120 | """Make models back to train mode after test time (fzl 1029)""" 121 | self.isTrain = mode # update the isTrain flag 122 | for name in self.model_names: 123 | if isinstance(name, str): 124 | net = getattr(self, 'net' + name) 125 | net.train(mode) # make sure every sub-network switches to the corresponding mode as well 126 | 127 | def test(self): 128 | """Forward function used in test time. 129 | 130 | This function wraps the <forward> function in no_grad() so we don't save intermediate steps for backprop 131 | It also calls <compute_visuals> to produce additional visualization results 132 | """ 133 | # print('entered the evaluation function') # kept to confirm this branch is reached - Yes 134 | with torch.no_grad(): 135 | self.forward() 136 | 137 | def compute_visuals(self): 138 | """Calculate additional output images for visdom and HTML visualization""" 139 | pass 140 | 141 | def update_learning_rate(self, logger): 142 | """Update learning rates for all the networks; called at the end of every epoch""" 143 | for scheduler in self.schedulers: 144 | if self.opt.lr_policy == 'plateau': 145 | scheduler.step(self.metric) 146 | else: 147 | # print(scheduler) 148 | scheduler.step() 149 | 150 | lr = self.optimizers[0].param_groups[0]['lr'] 151 | # print('learning rate = %.7f' % lr) 152 | logger.info('learning rate = %.7f' % lr) 153 | 154 | def get_current_visuals(self): 155 | """Return visualization images. train.py will display these images with visdom, and save the images to an HTML file""" 156 | visual_ret = OrderedDict() 157 | for name in self.visual_names: 158 | if isinstance(name, str): 159 | visual_ret[name] = getattr(self, name) 160 | return visual_ret 161 | 162 | def get_current_losses(self): 163 | """Return training losses / errors.
train.py will print out these errors on console, and save them to a file""" 164 | errors_ret = OrderedDict() 165 | # print('here') 为了确定有没有走到这儿-Yes 166 | for name in self.loss_names: 167 | if isinstance(name, str): 168 | errors_ret[name] = float(getattr(self, 'loss_' + name)) # float(...) works for both scalar tensor and float number 169 | return errors_ret 170 | 171 | def save_networks(self, epoch): 172 | """Save all the networks to the disk. 173 | 174 | Parameters: 175 | epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name) 176 | """ 177 | for name in self.model_names: 178 | # print("model_name is:",name) 179 | if isinstance(name, str): 180 | save_filename = '%s_net_%s.pth' % (epoch, name) 181 | save_path = os.path.join(self.save_dir, save_filename) 182 | # print('save_path is:', save_path) 183 | net = getattr(self, 'net' + name) 184 | 185 | if len(self.gpu_ids) > 0 and torch.cuda.is_available(): 186 | torch.save(net.module.cpu().state_dict(), save_path) 187 | # print(type(self.gpu_ids),self.gpu_ids) 188 | net.cuda(self.gpu_ids[0]) 189 | else: 190 | torch.save(net.cpu().state_dict(), save_path) 191 | 192 | def load_networks(self, epoch): 193 | """Load all the networks from the disk. 194 | 195 | Parameters: 196 | epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name) 197 | """ 198 | for name in self.model_names: 199 | if isinstance(name, str): 200 | load_filename = '%s_net_%s.pth' % (epoch, name) 201 | load_path = os.path.join(self.save_dir, load_filename) 202 | net = getattr(self, 'net' + name) 203 | if isinstance(net, torch.nn.DataParallel): 204 | net = net.module 205 | print('loading the model from %s' % load_path) 206 | state_dict = torch.load(load_path, map_location=self.device) 207 | if hasattr(state_dict, '_metadata'): 208 | del state_dict._metadata 209 | 210 | net.load_state_dict(state_dict) 211 | 212 | def load_networks_cv(self, folder_path): 213 | """Load all the networks from cv folder. 
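Checkpoints inside folder_path are matched by the '*_net_<name>.pth' suffix written by save_networks; e.g. a (hypothetical) file '2_net_A.pth' would be loaded into self.netA, while names without exactly one match are skipped.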
214 | 215 | Parameters: 216 | epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name) 217 | """ 218 | checkpoints = list(filter(lambda x: x.endswith('.pth'), os.listdir(folder_path))) 219 | for name in self.model_names: 220 | if isinstance(name, str): 221 | load_filename = list(filter(lambda x: x.split('.')[0].endswith('net_'+name), checkpoints)) 222 | print('load_filename is:', load_filename) 223 | # assert len(load_filename) == 1, 'In folder: {}, Exists file {}'.format(folder_path, load_filename) 224 | if len(load_filename) == 1: 225 | load_filename = load_filename[0] 226 | load_path = os.path.join(folder_path, load_filename) 227 | net = getattr(self, 'net' + name) 228 | if isinstance(net, torch.nn.DataParallel): 229 | net = net.module 230 | print('loading the model from %s' % load_path) 231 | state_dict = torch.load(load_path, map_location=self.device) 232 | if hasattr(state_dict, '_metadata'): 233 | del state_dict._metadata 234 | 235 | net.load_state_dict(state_dict) 236 | else: 237 | continue 238 | 239 | def print_networks(self, verbose): 240 | """Print the total number of parameters in the network and (if verbose) network architecture 241 | 242 | Parameters: 243 | verbose (bool) -- if verbose: print the network architecture 244 | """ 245 | print('---------- Networks initialized -------------') 246 | for name in self.model_names: 247 | if isinstance(name, str): 248 | net = getattr(self, 'net' + name) 249 | num_params = 0 250 | for param in net.parameters(): 251 | num_params += param.numel() 252 | if verbose: 253 | print(net) 254 | print('[Network %s] Total number of parameters : %.3f M' % (name, num_params / 1e6)) 255 | print('-----------------------------------------------') 256 | 257 | def set_requires_grad(self, nets, requires_grad=False): 258 | """Set requies_grad=Fasle for all the networks to avoid unnecessary computations 259 | Parameters: 260 | nets (network list) -- a list of networks 261 | requires_grad (bool) -- whether the networks require gradients or not 262 | """ 263 | if not isinstance(nets, list): 264 | nets = [nets] 265 | for net in nets: 266 | if net is not None: 267 | for param in net.parameters(): 268 | param.requires_grad = requires_grad 269 | 270 | def post_process(self): 271 | pass 272 | -------------------------------------------------------------------------------- /models/networks/ContextEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import os 4 | import json 5 | import numpy as np 6 | 7 | 8 | class ConversationalContextEncoder(nn.Module): 9 | """ Conversational Context Encoder """ 10 | 11 | def __init__(self, preprocess_config=None, model_config=None): 12 | super(ConversationalContextEncoder, self).__init__() 13 | d_model = model_config.hidden_size # ["transformer"]["encoder_hidden"] # 注意力层的隐藏层大小 14 | d_cont_enc = model_config.hidden_size # ["history_encoder"]["context_hidden"] # 上下文编码器隐藏层大小? 
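        # Shape sketch (inferred from how forward() uses these layers below; not a documented API):
        #   text_emb / visual_emb / audio_emb:            [batch, cur_len, input_dim_l / _v / _a]  (current utterance)
        #   history_text_emb / _visual_emb / _audio_emb:  [batch, hist_len, input_dim_*]           (dialogue history)
        #   speaker / history_speaker:                    LongTensor of speaker ids in {0, 1}, concatenated along dim=1
        #   Each per-modality GRU output is reduced to its last time step, giving one [batch, hidden_size] vector per modality.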
15 | num_layers = model_config.ContextEncoder_layers # ["history_encoder"]["context_layer"] # number of context encoder layers 16 | dropout = model_config.ContextEncoder_dropout # ["history_encoder"]["context_dropout"] # context encoder dropout 17 | self.text_emb_size = model_config.input_dim_l # ["history_encoder"]["text_emb_size"] # text embedding size 18 | self.visual_emb_size = model_config.input_dim_v # ["history_encoder"]["visual_emb_size"] # visual embedding size 19 | self.audio_emb_size = model_config.input_dim_a # ["history_encoder"]["audio_emb_size"] # audio embedding size 20 | self.max_history_len = model_config.ContextEncoder_max_history_len # ["history_encoder"]["max_history_len"] # maximum history length 21 | 22 | self.text_emb_linear = nn.Linear(self.text_emb_size, d_cont_enc) 23 | self.visual_emb_linear = nn.Linear(self.visual_emb_size, d_cont_enc) 24 | self.audio_emb_linear = nn.Linear(self.audio_emb_size, d_cont_enc) 25 | self.speaker_linear = nn.Linear(d_model, d_cont_enc) 26 | n_speaker = 2 27 | self.speaker_embedding = nn.Embedding( 28 | n_speaker, 29 | model_config.hidden_size, 30 | ) 31 | 32 | self.text_gru = nn.GRU( 33 | input_size=d_cont_enc, 34 | hidden_size=d_cont_enc, 35 | num_layers=num_layers, 36 | batch_first=True, 37 | dropout=dropout, 38 | bidirectional=True 39 | ) 40 | # The GRU output has size 2*hidden_size, so a linear layer maps it back to hidden_size 41 | self.text_gru_linear = nn.Sequential( 42 | nn.Linear(2 * d_cont_enc, d_cont_enc), 43 | nn.ReLU() 44 | ) 45 | self.visual_gru = nn.GRU( 46 | input_size=d_cont_enc, 47 | hidden_size=d_cont_enc, 48 | num_layers=num_layers, 49 | batch_first=True, 50 | dropout=dropout, 51 | bidirectional=True 52 | ) 53 | # The GRU output has size 2*hidden_size, so a linear layer maps it back to hidden_size 54 | self.visual_gru_linear = nn.Sequential( 55 | nn.Linear(2 * d_cont_enc, d_cont_enc), 56 | nn.ReLU() 57 | ) 58 | self.audio_gru = nn.GRU( 59 | input_size=d_cont_enc, 60 | hidden_size=d_cont_enc, 61 | num_layers=num_layers, 62 | batch_first=True, 63 | dropout=dropout, 64 | bidirectional=True 65 | ) 66 | # The GRU output has size 2*hidden_size, so a linear layer maps it back to hidden_size 67 | self.audio_gru_linear = nn.Sequential( 68 | nn.Linear(2 * d_cont_enc, d_cont_enc), 69 | nn.ReLU() 70 | ) 71 | 72 | self.context_linear = nn.Linear(d_cont_enc, d_model) 73 | self.context_attention = SLA(d_model) 74 | 75 | def forward(self, text_emb, visual_emb, audio_emb, speaker, 76 | history_text_emb, history_visual_emb, history_audio_emb, history_speaker, modal='val'): 77 | # history_masks = get_mask_from_lengths(history_lens, self.max_history_len) 78 | 79 | # Embedding 80 | # concatenate the text embedding of the current utterance with the dialogue-history embedding 81 | if 'l' in modal: 82 | history_text_emb = torch.cat([history_text_emb, text_emb], dim=1) 83 | history_text_emb = self.text_emb_linear(history_text_emb) 84 | if 'v' in modal: 85 | history_visual_emb = torch.cat([history_visual_emb, visual_emb], dim=1) 86 | history_visual_emb = self.visual_emb_linear(history_visual_emb) 87 | if 'a' in modal: 88 | history_audio_emb = torch.cat([history_audio_emb, audio_emb], dim=1) 89 | history_audio_emb = self.audio_emb_linear(history_audio_emb) 90 | 91 | # # concatenate the current speaker with the history speakers 92 | history_speaker = torch.cat([history_speaker, speaker], dim=1) 93 | # # reduce the dimensionality 94 | history_speaker = self.speaker_linear(self.speaker_embedding(history_speaker)) 95 | 96 | # # concatenate the dialogue text history with the speaker history and encode them; the reason we drop this part is the same as above 97 | if 'l' in modal: 98 | history_text_enc = torch.cat([history_text_emb, history_speaker], dim=1) 99 | history_text_con = self.text_gru_linear(self.text_gru(history_text_enc)[0][:, -1, :]) 100 | if 'v' in modal: 101 | history_visual_enc =
torch.cat([history_visual_emb, history_speaker], dim=1) 102 | history_visual_con = self.visual_gru_linear(self.visual_gru(history_visual_enc)[0][:, -1, :]) 103 | if 'a' in modal: 104 | history_audio_emb = torch.cat([history_audio_emb, history_speaker], dim=1) 105 | history_audio_con = self.audio_gru_linear(self.audio_gru(history_audio_emb)[0][:, -1, :]) 106 | 107 | # context_enc = torch.cat([history_visual_con, history_audio_con, history_text_con], dim=-1) 108 | if modal == 'val': 109 | context_enc = torch.stack([history_visual_con, history_audio_con, history_text_con], dim=0) 110 | elif modal == 'va': 111 | context_enc = torch.stack([history_visual_con, history_audio_con], dim=0) 112 | elif modal == 'vl': 113 | context_enc = torch.stack([history_visual_con, history_text_con], dim=0) 114 | elif modal == 'al': 115 | context_enc = torch.stack([history_audio_con, history_text_con], dim=0) 116 | elif modal == 'v': 117 | # context_enc = torch.stack([history_visual_con], dim=0) 118 | context_enc = history_visual_con.unsqueeze(0) 119 | elif modal == 'a': 120 | # context_enc = torch.stack([history_audio_con], dim=0) 121 | context_enc = history_audio_con.unsqueeze(0) 122 | elif modal == 'l': 123 | # context_enc = torch.stack([history_text_con], dim=0) 124 | context_enc = history_text_con.unsqueeze(0) 125 | else: 126 | context_enc = None 127 | 128 | # Split, 按照最大历史长度将历史编码切分成当前编码和过去编码,we don't have history, so we just use history_text_emb only 129 | # enc_current, enc_past = torch.split(history_enc, self.max_history_len, dim=1) 130 | # enc_current, enc_past = torch.split(history_text_emb, self.max_history_len, dim=1) 131 | 132 | # GRU,对当前编码进行编码,并使用掩码将填充部分置为0。 133 | # enc_current = self.gru_linear(self.gru(enc_current)[0]) 134 | # enc_current = enc_current.masked_fill(history_masks.unsqueeze(-1), 0) 135 | 136 | # Encoding 137 | # context_enc = torch.cat([enc_current, enc_past], dim=1) 138 | # context_enc = self.context_attention(self.context_linear(context_enc)) # [B, d] 139 | 140 | return context_enc 141 | 142 | 143 | class SLA(nn.Module): 144 | """ Sequence Level Attention """ 145 | 146 | def __init__(self, d_enc): 147 | super(SLA, self).__init__() 148 | self.linear = nn.Linear(d_enc, 1) 149 | self.softmax = nn.Softmax(dim=1) 150 | 151 | def forward(self, encoding, mask=None): 152 | attn = self.linear(encoding) 153 | if mask is not None: 154 | attn = attn.masked_fill(mask.unsqueeze(-1), -np.inf) 155 | aux_mask = (attn == -np.inf).all(self.softmax.dim).unsqueeze(self.softmax.dim) 156 | attn = attn.masked_fill(aux_mask, 0) # Remove all -inf along softmax.dim 157 | score = self.softmax(attn).transpose(-2, -1) # [B, 1, T] 158 | fused_rep = torch.matmul(score, encoding).squeeze(1) # [B, d] 159 | 160 | return fused_rep 161 | 162 | 163 | def get_mask_from_lengths(lengths, max_len=None): 164 | batch_size = lengths.shape[0] 165 | if max_len is None: 166 | max_len = torch.max(lengths).item() 167 | 168 | ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device) 169 | mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) 170 | 171 | return mask 172 | -------------------------------------------------------------------------------- /models/networks/LightWeightTrans.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.nn import LayerNorm, Linear, Dropout, Module 6 | from torch.nn.modules import ModuleList 7 | from 
models.networks.multihead_attention import MultiheadAttention 8 | 9 | 10 | def _get_clones(module, n): 11 | return ModuleList([copy.deepcopy(module) for _ in range(n)]) 12 | 13 | 14 | class TransEncoderLayer(Module): 15 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1): 16 | super(TransEncoderLayer, self).__init__() 17 | self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) 18 | 19 | self.linear1 = Linear(d_model, dim_feedforward) 20 | self.dropout = Dropout(dropout) 21 | self.linear2 = Linear(dim_feedforward, d_model) 22 | 23 | self.norm = LayerNorm(d_model) 24 | self.dropout1 = Dropout(dropout) 25 | self.dropout2 = Dropout(dropout) 26 | 27 | def forward(self, src, src_mask=None, src_key_padding_mask=None): 28 | src2 = self.self_attn(src, src, src, attn_mask=src_mask, 29 | key_padding_mask=src_key_padding_mask)[0] 30 | src = src + self.dropout1(src2) 31 | src = self.norm(src) 32 | src2 = self.linear2(self.dropout(F.relu(self.linear1(src)))) 33 | src = src + self.dropout2(src2) 34 | src = self.norm(src) 35 | return src 36 | 37 | 38 | class TransEncoder(Module): 39 | def __init__(self, d_dual, d_model=512, nhead=8, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1): 40 | super(TransEncoder, self).__init__() 41 | self.d_model = d_model 42 | self.num_layers = num_encoder_layers 43 | self.linear1 = Linear(d_dual[0], d_model) 44 | self.linear2 = Linear(d_model, d_dual[1]) 45 | self.dropout = Dropout(dropout) 46 | 47 | encoder_layer = TransEncoderLayer(d_model, nhead, dim_feedforward, dropout) 48 | self.layers = _get_clones(encoder_layer, num_encoder_layers) 49 | 50 | self.norm = LayerNorm(d_model) 51 | 52 | def forward(self, src, mask=None, src_key_padding_mask=None): 53 | res = list() 54 | output = self.dropout(F.relu(self.linear1(src))) 55 | res.append(output) 56 | for i in range(self.num_layers): 57 | output = self.layers[i](output, src_mask=mask, src_key_padding_mask=src_key_padding_mask) 58 | res.append(output) 59 | if self.norm: 60 | output = self.norm(output) 61 | res.append(output) 62 | return self.linear2(output), res 63 | 64 | 65 | class EmotionClassifier(nn.Module): 66 | def __init__(self, config): 67 | super(EmotionClassifier, self).__init__() 68 | self.gpu_ids = config.gpu_ids 69 | self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu') 70 | 71 | self.output_dim = config.output_dim 72 | self.rnn_dropout = nn.Dropout(p=0.3, inplace=True) 73 | self.rnn_text = nn.LSTM(input_size=config.input_dim_l, hidden_size=config.gru_units, # text_dim->input_dim_l 74 | num_layers=1, bidirectional=False, dropout=0.0, batch_first=True) 75 | self.rnn_audio = nn.LSTM(input_size=config.input_dim_a, hidden_size=config.gru_units, # audio_dim->input_dim_a 76 | num_layers=1, bidirectional=False, dropout=0.0, batch_first=True) 77 | 78 | self.dense_text = nn.Linear(in_features=config.gru_units * 1, out_features=config.dense_units) 79 | self.dense_audio = nn.Linear(in_features=config.gru_units * 1, out_features=config.dense_units) 80 | self.dense_dropout = nn.Dropout(p=0.3, inplace=True) 81 | 82 | cat_dims = config.a_d_model + config.t_d_model + config.dense_units * 2 83 | self.out_layer_1 = nn.Linear(in_features=cat_dims, out_features=config.dense_units) 84 | self.out_layer_2 = nn.Linear(in_features=config.dense_units, out_features=config.output_dim) 85 | self.out_dropout = nn.Dropout(p=0.3, inplace=True) 86 | 87 | def forward(self, audio, text, uni_fusion): 88 | rnn_t, _ = self.rnn_text(text) 89 | encoded_text = 
torch.relu(self.dense_dropout(self.dense_text(torch.relu(rnn_t)))) 90 | rnn_a, _ = self.rnn_audio(audio) 91 | encoded_audio = torch.relu(self.dense_dropout(self.dense_audio(torch.relu(rnn_a)))) 92 | 93 | encoded_text = encoded_text.view(encoded_text.size(0), encoded_text.size(-1), encoded_text.size(1)) 94 | encoded_audio = encoded_audio.view(encoded_audio.size(0), encoded_audio.size(-1), encoded_audio.size(1)) 95 | 96 | layer_3 = torch.nn.Linear(in_features=encoded_text.size(-1), out_features=64).to(self.device) 97 | encoded_text = layer_3(encoded_text) 98 | layer_4 = torch.nn.Linear(in_features=encoded_audio.size(-1), out_features=64).to(self.device) 99 | encoded_audio = layer_4(encoded_audio) 100 | 101 | encoded_text = encoded_text.view(encoded_text.size(0), encoded_text.size(-1), encoded_text.size(1)) 102 | encoded_audio = encoded_audio.view(encoded_audio.size(0), encoded_audio.size(-1), encoded_audio.size(1)) 103 | 104 | encoded_feature = torch.cat((encoded_text, encoded_audio, uni_fusion[0], uni_fusion[1]), dim=-1) 105 | out1 = self.out_dropout(torch.relu(self.out_layer_1(encoded_feature))) 106 | out2 = self.out_layer_2(out1) 107 | in_feat = out2.transpose(1, 2) 108 | embd = F.max_pool1d(in_feat, in_feat.size(2), in_feat.size(2)) 109 | return embd.squeeze(-1) 110 | # return self.out_layer_2(out1) # mode = sentiment 111 | 112 | 113 | ''' 114 | D1 = Discriminator(feature_dims=128, conv_dim=20) 115 | dat = torch.randn(13, 1, 20, 128) 116 | output = D1(dat) 117 | print(output.shape) 118 | torch.Size([13, 20, 10, 64]) 119 | torch.Size([13, 40, 5, 32]) 120 | torch.Size([13, 80, 2, 16]) 121 | torch.Size([13, 1, 1, 1]) 122 | Translation = TransEncoder(d_dual=(300, 5), d_model=128, nhead=4, num_encoder_layers=2, 123 | dim_feedforward=512, dropout=0.5) 124 | dat = torch.empty(20, 11, 300) 125 | res = Translation(dat) 126 | print(res.shape) 127 | print(res[0, 1]) 128 | ''' 129 | -------------------------------------------------------------------------------- /models/networks/__init__.py: -------------------------------------------------------------------------------- 1 | ''' Contains network files. 
''' -------------------------------------------------------------------------------- /models/networks/__pycache__/ContextEncoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/ContextEncoder.cpython-310.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/ContextEncoder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/ContextEncoder.cpython-311.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/ContextEncoder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/ContextEncoder.cpython-38.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/autoencoder.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/autoencoder.cpython-310.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/autoencoder.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/autoencoder.cpython-311.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/autoencoder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/autoencoder.cpython-38.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/classifier.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/classifier.cpython-310.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/classifier.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/classifier.cpython-311.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/classifier.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/classifier.cpython-38.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/fc.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/fc.cpython-310.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/fc.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/fc.cpython-311.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/fc.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/fc.cpython-38.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/interact_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/interact_model.cpython-310.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/interact_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/interact_model.cpython-311.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/interact_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/interact_model.cpython-38.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/lstm.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/lstm.cpython-310.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/lstm.cpython-311.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/lstm.cpython-311.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/lstm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/lstm.cpython-38.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/multihead_attention.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/multihead_attention.cpython-310.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/multihead_attention.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/multihead_attention.cpython-311.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/multihead_attention.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/multihead_attention.cpython-38.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/tools.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/tools.cpython-310.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/tools.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/tools.cpython-311.pyc -------------------------------------------------------------------------------- /models/networks/__pycache__/tools.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/tools.cpython-38.pyc -------------------------------------------------------------------------------- /models/networks/classifier.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence 4 | 5 | class LSTMClassifier(nn.Module): 6 | def __init__(self, input_size, hidden_size, fc1_size, output_size, dropout_rate): 7 | super(LSTMClassifier, self).__init__() 8 | self.input_size = input_size 9 | self.hidden_size = hidden_size 10 | self.fc1_size = fc1_size 11 | self.output_size = output_size 12 | self.dropout_rate = dropout_rate 13 | 14 | # defining modules - two layer bidirectional LSTM with layer norm in between 15 | self.rnn1 = nn.LSTM(input_size, 
hidden_size, bidirectional=True, batch_first=True) 16 | self.rnn2 = nn.LSTM(2 * hidden_size, hidden_size, bidirectional=True, batch_first=True) 17 | self.fc1 = nn.Linear(hidden_size * 4, fc1_size) 18 | self.fc2 = nn.Linear(fc1_size, output_size) 19 | self.relu = nn.ReLU() 20 | self.dropout = nn.Dropout(dropout_rate) 21 | self.layer_norm = nn.LayerNorm((hidden_size * 2, )) 22 | self.bn = nn.BatchNorm1d(hidden_size * 4) 23 | 24 | def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm): 25 | packed_sequence = pack_padded_sequence(sequence, lengths, batch_first=True, enforce_sorted=False) 26 | packed_h1, (final_h1, _) = rnn1(packed_sequence) 27 | padded_h1, _ = pad_packed_sequence(packed_h1, batch_first=True) 28 | normed_h1 = layer_norm(padded_h1) 29 | packed_normed_h1 = pack_padded_sequence(normed_h1, lengths, batch_first=True, enforce_sorted=False) 30 | _, (final_h2, _) = rnn2(packed_normed_h1) 31 | return final_h1, final_h2 32 | 33 | def rnn_flow(self, x, lengths): 34 | batch_size = lengths.size(0) 35 | h1, h2 = self.extract_features(x, lengths, self.rnn1, self.rnn2, self.layer_norm) 36 | h = torch.cat((h1, h2), dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1) 37 | return self.bn(h) 38 | 39 | def mask2length(self, mask): 40 | ''' mask [batch_size, seq_length, feat_size] 41 | ''' 42 | _mask = torch.mean(mask, dim=-1).long() # [batch_size, seq_len] 43 | length = torch.sum(_mask, dim=-1) # [batch_size,] 44 | return length 45 | 46 | def forward(self, x, mask): 47 | lengths = self.mask2length(mask) 48 | h = self.rnn_flow(x, lengths) 49 | h = self.fc1(h) 50 | h = self.dropout(h) 51 | h = self.relu(h) 52 | o = self.fc2(h) 53 | return o, h 54 | 55 | class SimpleClassifier(nn.Module): 56 | ''' Linear classifier, use embedding as input 57 | Linear approximation, should append with softmax 58 | ''' 59 | def __init__(self, embd_size, output_dim, dropout): 60 | super(SimpleClassifier, self).__init__() 61 | self.dropout = dropout 62 | self.C = nn.Linear(embd_size, output_dim) 63 | self.dropout_op = nn.Dropout(dropout) 64 | 65 | def forward(self, x): 66 | if self.dropout > 0: 67 | x = self.dropout_op(x) 68 | return self.C(x) 69 | 70 | class Identity(nn.Module): 71 | def __init__(self): 72 | super().__init__() 73 | 74 | def forward(self, x): 75 | return x 76 | 77 | class FcClassifier(nn.Module): 78 | def __init__(self, input_dim, layers, output_dim, dropout=0.3, use_bn=False): 79 | ''' Fully Connect classifier 80 | Parameters: 81 | -------------------------- 82 | input_dim: input feature dim 83 | layers: [x1, x2, x3] will create 3 layers with x1, x2, x3 hidden nodes respectively. 
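For example, FcClassifier(256, [128], 4) (the configuration exercised in this file's __main__ check) stacks Linear(256, 128) + ReLU (+ Dropout), followed by a final Linear(128, 4) output layer, and forward() returns (logits, last_hidden_feat).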
84 | output_dim: output feature dim 85 | activation: activation function 86 | dropout: dropout rate 87 | ''' 88 | super().__init__() 89 | self.all_layers = [] 90 | for i in range(0, len(layers)): 91 | self.all_layers.append(nn.Linear(input_dim, layers[i])) 92 | self.all_layers.append(nn.ReLU()) 93 | if use_bn: 94 | self.all_layers.append(nn.BatchNorm1d(layers[i])) 95 | if dropout > 0: 96 | self.all_layers.append(nn.Dropout(dropout)) 97 | input_dim = layers[i] 98 | 99 | if len(layers) == 0: 100 | layers.append(input_dim) 101 | self.all_layers.append(Identity()) 102 | 103 | self.fc_out = nn.Linear(layers[-1], output_dim) 104 | self.module = nn.Sequential(*self.all_layers) 105 | 106 | def forward(self, x): 107 | feat = self.module(x) 108 | out = self.fc_out(feat) 109 | return out, feat 110 | 111 | class EF_model_AL(nn.Module): 112 | def __init__(self, fc_classifier, lstm_classifier, out_dim_a, out_dim_v, fusion_size, num_class, dropout): 113 | ''' Early fusion model classifier 114 | Parameters: 115 | -------------------------- 116 | fc_classifier: acoustic classifier 117 | lstm_classifier: lexical classifier 118 | out_dim_a: fc_classifier output dim 119 | out_dim_v: lstm_classifier output dim 120 | fusion_size: output_size for fusion model 121 | num_class: class number 122 | dropout: dropout rate 123 | ''' 124 | super(EF_model_AL, self).__init__() 125 | self.fc_classifier = fc_classifier 126 | self.lstm_classifier = lstm_classifier 127 | self.out_dim = out_dim_a + out_dim_v 128 | self.dropout = nn.Dropout(dropout) 129 | self.num_class = num_class 130 | self.fusion_size = fusion_size 131 | # self.out = nn.Sequential( 132 | # nn.Linear(self.out_dim, self.fusion_size), 133 | # nn.ReLU(), 134 | # nn.Linear(self.fusion_size, self.num_class), 135 | # ) 136 | self.out1 = nn.Linear(self.out_dim, self.fusion_size) 137 | self.relu = nn.ReLU() 138 | self.out2 = nn.Linear(self.fusion_size, self.num_class) 139 | 140 | def forward(self, A_feat, L_feat, L_mask): 141 | _, A_out = self.fc_classifier(A_feat) 142 | _, L_out = self.lstm_classifier(L_feat, L_mask) 143 | feat = torch.cat([A_out, L_out], dim=-1) 144 | feat = self.dropout(feat) 145 | feat = self.relu(self.out1(feat)) 146 | out = self.out2(self.dropout(feat)) 147 | return out, feat 148 | 149 | 150 | class MaxPoolFc(nn.Module): 151 | def __init__(self, hidden_size, num_class=4): 152 | super(MaxPoolFc, self).__init__() 153 | self.hidden_size = hidden_size 154 | self.fc = nn.Sequential( 155 | nn.Linear(hidden_size, num_class), 156 | nn.ReLU() 157 | ) 158 | 159 | def forward(self, x): 160 | ''' x shape => [batch_size, seq_len, hidden_size] 161 | ''' 162 | batch_size, seq_len, hidden_size = x.size() 163 | x = x.view(batch_size, hidden_size, seq_len) 164 | # print(x.size()) 165 | out = torch.max_pool1d(x, kernel_size=seq_len) 166 | out = out.squeeze() 167 | out = self.fc(out) 168 | 169 | return out 170 | 171 | 172 | class Fusion(nn.Module): 173 | def __init__(self, input_dim, layers, output_dim, dropout=0.3): 174 | super().__init__() 175 | self.fusion = nn.Sequential() 176 | for i in range(len(layers)): 177 | self.fusion.add_module(f'fusion_layer_{i}', nn.Linear(in_features=input_dim, 178 | out_features=layers[i])) 179 | self.fusion.add_module(f'fusion_layer_{i}_dropout', nn.Dropout(dropout)) 180 | self.fusion.add_module(f'fusion_layer_{i}_activation', nn.ReLU()) 181 | input_dim = layers[i] 182 | 183 | self.fusion.add_module('fusion_layer_final', 184 | nn.Linear(in_features=layers[-1], out_features=output_dim)) 185 | 186 | def forward(self, x): 187 | feat 
= [] 188 | out = self.fusion(x) 189 | return out, feat 190 | 191 | 192 | if __name__ == '__main__': 193 | a = FcClassifier(256, [128], 4) 194 | print(a) -------------------------------------------------------------------------------- /models/networks/cnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class CNN(nn.Module): 7 | def __init__(self, input_dim, emb_size=128, in_channels=1, out_channels=128, kernel_heights=[2,3,4], dropout=0.5): 8 | super().__init__() 9 | ''' 10 | cat((conv1-relu+conv2-relu+conv3-relu)+maxpool) + dropout, and to trans 11 | ''' 12 | self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_heights[0], padding=0) 13 | self.conv2 = nn.Conv1d(in_channels, out_channels, kernel_heights[1], padding=0) 14 | self.conv3 = nn.Conv1d(in_channels, out_channels, kernel_heights[2], padding=0) 15 | self.dropout = nn.Dropout(dropout) 16 | self.embd = nn.Sequential( 17 | nn.Linear(len(kernel_heights)*out_channels, emb_size), 18 | nn.ReLU(inplace=True), 19 | ) 20 | 21 | def conv_block(self, input, conv_layer): 22 | conv_out = conv_layer(input)# conv_out.size() = (batch_size, out_channels, dim, 1) 23 | activation = F.relu(conv_out.squeeze(-1))# activation.size() = (batch_size, out_channels, dim1) 24 | max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2) # maxpool_out.size() = (batch_size, out_channels) 25 | return max_out 26 | 27 | def forward(self, utterance_x): 28 | batch_size, feat_dim = utterance_x.size() 29 | utterance_x = utterance_x.view(batch_size, 1, feat_dim) 30 | max_out1 = self.conv_block(utterance_x, self.conv1) 31 | max_out2 = self.conv_block(utterance_x, self.conv2) 32 | max_out3 = self.conv_block(utterance_x, self.conv3) 33 | all_out = torch.cat((max_out1, max_out2, max_out3), 1) 34 | fc_in = self.dropout(all_out) 35 | embd = self.embd(fc_in) 36 | # out = self.conv1(frame_x) # embd.shape: [batch_size, out_channels, dim, 1] 37 | # embd = out.view(frame_x.size(0), -1) 38 | return embd -------------------------------------------------------------------------------- /models/networks/fc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class FcEncoder(nn.Module): 5 | def __init__(self, input_dim, layers, dropout=0.5, use_bn=False): 6 | ''' Fully Connect classifier 7 | fc+relu+bn+dropout, 最后分类128-4层是直接fc的 8 | Parameters: 9 | -------------------------- 10 | input_dim: input feature dim 11 | layers: [x1, x2, x3] will create 3 layers with x1, x2, x3 hidden nodes respectively. 
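For example (an illustrative configuration, not one taken from the repo's configs), FcEncoder(512, [256, 128]) stacks Linear(512, 256) + ReLU + Dropout and Linear(256, 128) + ReLU + Dropout, so forward() returns a [batch_size, 128] feature.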
12 | dropout: dropout rate 13 | use_bn: use batchnorm or not 14 | ''' 15 | super().__init__() 16 | self.all_layers = [] 17 | for i in range(0, len(layers)): 18 | self.all_layers.append(nn.Linear(input_dim, layers[i])) 19 | self.all_layers.append(nn.ReLU()) 20 | if use_bn: 21 | self.all_layers.append(nn.BatchNorm1d(layers[i])) 22 | if dropout > 0: 23 | self.all_layers.append(nn.Dropout(dropout)) 24 | input_dim = layers[i] 25 | 26 | self.module = nn.Sequential(*self.all_layers) 27 | 28 | def forward(self, x): 29 | ## make layers to a whole module 30 | feat = self.module(x) 31 | return feat -------------------------------------------------------------------------------- /models/networks/interact_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import warnings 4 | from torch.nn import Parameter 5 | import torch.nn.functional as F 6 | from torch.nn import Linear, Module 7 | from torch.nn.init import xavier_normal_, xavier_uniform_, constant_ 8 | from models.networks.multihead_attention import MultiheadAttention 9 | from models.networks.multihead_attention import CrossAttention 10 | 11 | 12 | class InteractModule(Module): 13 | def __init__(self, opt): 14 | super(InteractModule, self).__init__() 15 | self.inter_attention = MultiheadAttention(embed_dim=opt.hidden_size, num_heads=opt.attention_head, 16 | dropout=opt.attention_dropout) 17 | self.hence_attention = MultiheadAttention(embed_dim=opt.hidden_size, num_heads=opt.attention_head, 18 | dropout=opt.attention_dropout) 19 | # self.inter_attention = CrossAttention(in_dim1=opt.hidden_size, in_dim2=opt.hidden_size, k_dim=opt.hidden_size, v_dim=opt.hidden_size, num_heads=opt.attention_head) 20 | # self.hence_attention = CrossAttention(in_dim1=opt.hidden_size, in_dim2=opt.hidden_size, k_dim=opt.hidden_size, v_dim=opt.hidden_size, num_heads=opt.attention_head) 21 | self.opt = opt 22 | 23 | def forward(self, query, key, value, activation='sigmoid'): 24 | # print(f'query.size is {query.size()}') 25 | inter_output, _ = self.inter_attention(query, key, value) 26 | # print(f'inter_output.shape is {inter_output.shape}') 27 | hence_output, _ = self.hence_attention(query, inter_output, inter_output) 28 | # print(f'hence_output.shape is {hence_output.shape}') 29 | 30 | # Gate machine 31 | inter_fusion = inter_output + hence_output 32 | if activation == 'sigmoid': 33 | act_function = torch.sigmoid 34 | elif activation == 'relu': 35 | act_function = F.relu 36 | else: 37 | raise ValueError(f'activation must be Sigmoid or ReLu, but got {activation}') 38 | 39 | assert self.opt.ablation in ['normal', 'gate', 40 | 'hence'], f'opt.ablation must be normal, gate, or hence, not be {self.opt.ablation}' 41 | 42 | if self.opt.ablation == 'normal': # no ablation 43 | inter_weight = act_function(inter_fusion) 44 | inter_result = torch.multiply(hence_output, inter_weight) 45 | 46 | # residual.shape = [3, bsz, hidden_size] 47 | residual = query + inter_result 48 | # change shape to [bsz, 3 * hidden_size] 49 | # residual = torch.cat((residual[0], residual[1], residual[2]), dim=1) 50 | 51 | elif self.opt.ablation == 'gate': # ablation of gate machine 52 | residual = query + hence_output 53 | # residual = torch.cat((residual[0], residual[1], residual[2]), dim=1) 54 | 55 | else: # ablation of hence_attention 56 | inter_weight = act_function(inter_output) 57 | inter_result = torch.multiply(inter_output, inter_weight) 58 | 59 | # residual.shape = [3, bsz, hidden_size] 60 | residual = query + 
inter_result 61 | # change shape to [bsz, 3 * hidden_size] 62 | 63 | 64 | result = [] 65 | for i in range(residual.shape[0]): 66 | result.append(residual[i]) 67 | residual = torch.cat(result, dim=1) 68 | return residual 69 | -------------------------------------------------------------------------------- /models/networks/lstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class LSTMEncoder(nn.Module): 5 | ''' one directional LSTM encoder 6 | ''' 7 | def __init__(self, input_size, hidden_size, embd_method='last', bidirectional=False): 8 | super(LSTMEncoder, self).__init__() 9 | 10 | self.input_size = input_size 11 | self.hidden_size = hidden_size 12 | self.rnn = nn.LSTM(self.input_size, self.hidden_size, batch_first=True, bidirectional=bidirectional) 13 | assert embd_method in ['maxpool', 'attention', 'last', 'dense'] 14 | self.embd_method = embd_method 15 | 16 | if self.embd_method == 'attention': 17 | self.attention_vector_weight = nn.Parameter(torch.Tensor(hidden_size, 1)) 18 | self.attention_layer = nn.Sequential( 19 | nn.Linear(self.hidden_size, self.hidden_size), 20 | nn.Tanh(), 21 | ) 22 | self.softmax = nn.Softmax(dim=-1) 23 | elif self.embd_method == 'dense': 24 | self.dense_layer = nn.Sequential() 25 | self.bidirectional = bidirectional 26 | if bidirectional: 27 | self.dense_layer.add_module('linear', nn.Linear(2 * self.hidden_size, self.hidden_size)) 28 | else: 29 | self.dense_layer.add_module('linear', nn.Linear(self.hidden_size, self.hidden_size)) 30 | self.dense_layer.add_module('activate', nn.Tanh()) 31 | self.softmax = nn.Softmax(dim=-1) 32 | 33 | def embd_attention(self, r_out, h_n): 34 | ''' 35 | Implementation based on these blog posts: 36 | https://blog.csdn.net/dendi_hust/article/details/94435919 37 | https://blog.csdn.net/fkyyly/article/details/82501126 38 | Paper: Hierarchical Attention Networks for Document Classification 39 | formulation: lstm_output * softmax(u * tanh(W * lstm_output + Bias)) 40 | W and Bias form the mapping function; the Bias term is optional 41 | u is the attention vector, whose size equals the hidden size 42 | ''' 43 | hidden_reps = self.attention_layer(r_out) # [batch_size, seq_len, hidden_size] 44 | atten_weight = (hidden_reps @ self.attention_vector_weight) # [batch_size, seq_len, 1] 45 | atten_weight = self.softmax(atten_weight) # [batch_size, seq_len, 1] 46 | # [batch_size, seq_len, hidden_size] * [batch_size, seq_len, 1] = [batch_size, seq_len, hidden_size] 47 | # sentence_vector = torch.sum(r_out * atten_weight, dim=1) # [batch_size, hidden_size] 48 | 49 | # return sentence_vector 50 | '''edit here (zelin)''' 51 | attended_r_out = r_out * atten_weight # keep shape [batch_size, seq_len, hidden_size] 52 | return attended_r_out # No sum over time dimension 53 | 54 | def embd_maxpool(self, r_out, h_n): 55 | 56 | """Keep the time dimension: torch.max with keepdim=True, then expand the result""" 57 | pooled_out, _ = torch.max(r_out, dim=1, keepdim=True) # Keeps time dim 58 | return pooled_out.expand_as(r_out) # Duplicate across time dimension 59 | 60 | def embd_last(self, r_out, h_n): 61 | 62 | return r_out # Returns [batch_size, seq_len, hidden_size] 63 | 64 | def embd_dense(self, r_out, h_n): 65 | ''' 66 | Apply dense_layer at every time step, then reshape back to the original 3-D format [batch_size, seq_len, hidden_size]. 67 | ''' 68 | r_out = r_out.view(-1, r_out.size(2)) # Flatten to [batch_size * seq_len, hidden_size] 69 | dense_out = self.dense_layer(r_out) 70 | return dense_out.view(h_n.size(1), -1, self.hidden_size) # Reshape back to [batch_size, seq_len, hidden_size]; h_n is [num_layers * num_directions, batch_size, hidden_size], so h_n.size(1) gives the batch size 71 | 72 | def forward(self, x): 73 | ''' 74 | r_out shape:
seq_len, batch, num_directions * hidden_size 75 | hn and hc shape: num_layers * num_directions, batch, hidden_size 76 | ''' 77 | r_out, (h_n, h_c) = self.rnn(x) 78 | embd = getattr(self, 'embd_' + self.embd_method)(r_out, h_n) 79 | return embd -------------------------------------------------------------------------------- /models/networks/multihead_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import warnings 3 | from torch.nn import Parameter 4 | import torch.nn.functional as F 5 | from torch.nn import Linear, Module 6 | from torch.nn.init import xavier_normal_, xavier_uniform_, constant_ 7 | import torch.nn as nn 8 | 9 | 10 | class MultiheadAttention(Module): 11 | def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None): 12 | super(MultiheadAttention, self).__init__() 13 | self.embed_dim = embed_dim 14 | self.kdim = kdim if kdim is not None else embed_dim 15 | self.vdim = vdim if vdim is not None else embed_dim 16 | self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim 17 | 18 | self.num_heads = num_heads 19 | self.dropout = dropout 20 | self.head_dim = embed_dim // num_heads 21 | assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" 22 | 23 | self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim)) 24 | 25 | if self._qkv_same_embed_dim is False: 26 | self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) 27 | self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) 28 | self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) 29 | 30 | if bias: 31 | self.in_proj_bias = Parameter(torch.empty(3 * embed_dim)) 32 | else: 33 | self.register_parameter('in_proj_bias', None) 34 | self.out_proj = Linear(embed_dim, embed_dim, bias=bias) 35 | 36 | if add_bias_kv: 37 | self.bias_k = Parameter(torch.empty(1, 1, embed_dim)) 38 | self.bias_v = Parameter(torch.empty(1, 1, embed_dim)) 39 | else: 40 | self.bias_k = self.bias_v = None 41 | 42 | self.add_zero_attn = add_zero_attn 43 | 44 | self._reset_parameters() 45 | 46 | def _reset_parameters(self): 47 | if self._qkv_same_embed_dim: 48 | xavier_uniform_(self.in_proj_weight) 49 | else: 50 | xavier_uniform_(self.q_proj_weight) 51 | xavier_uniform_(self.k_proj_weight) 52 | xavier_uniform_(self.v_proj_weight) 53 | 54 | if self.in_proj_bias is not None: 55 | constant_(self.in_proj_bias, 0.) 56 | constant_(self.out_proj.bias, 0.) 57 | if self.bias_k is not None: 58 | xavier_normal_(self.bias_k) 59 | if self.bias_v is not None: 60 | xavier_normal_(self.bias_v) 61 | 62 | def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None): 63 | if hasattr(self, '_qkv_same_embed_dim') and self._qkv_same_embed_dim is False: 64 | return F.multi_head_attention_forward( 65 | query, key, value, self.embed_dim, self.num_heads, 66 | self.in_proj_weight, self.in_proj_bias, 67 | self.bias_k, self.bias_v, self.add_zero_attn, 68 | self.dropout, self.out_proj.weight, self.out_proj.bias, 69 | training=self.training, 70 | key_padding_mask=key_padding_mask, need_weights=need_weights, 71 | attn_mask=attn_mask, use_separate_proj_weight=True, 72 | q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, 73 | v_proj_weight=self.v_proj_weight) 74 | else: 75 | if not hasattr(self, '_qkv_same_embed_dim'): 76 | warnings.warn('A new version of MultiheadAttention module has been implemented. 
\ 77 | Please re-train your model with the new module', 78 | UserWarning) 79 | 80 | return F.multi_head_attention_forward( 81 | query, key, value, self.embed_dim, self.num_heads, 82 | self.in_proj_weight, self.in_proj_bias, 83 | self.bias_k, self.bias_v, self.add_zero_attn, 84 | self.dropout, self.out_proj.weight, self.out_proj.bias, 85 | training=self.training, 86 | key_padding_mask=key_padding_mask, need_weights=need_weights, 87 | attn_mask=attn_mask) 88 | 89 | 90 | class CrossAttention(nn.Module): 91 | def __init__(self, in_dim1, in_dim2, k_dim, v_dim, num_heads): 92 | super(CrossAttention, self).__init__() 93 | self.num_heads = num_heads 94 | self.k_dim = k_dim 95 | self.v_dim = v_dim 96 | 97 | self.proj_q1 = nn.Linear(in_dim1, k_dim * num_heads, bias=False) 98 | self.proj_k2 = nn.Linear(in_dim2, k_dim * num_heads, bias=False) 99 | self.proj_v2 = nn.Linear(in_dim2, v_dim * num_heads, bias=False) 100 | self.proj_o = nn.Linear(v_dim * num_heads, in_dim1) 101 | 102 | def forward(self, x1, x2, _, mask=None): 103 | batch_size, seq_len1, in_dim1 = x1.size() 104 | seq_len2 = x2.size(1) 105 | 106 | q1 = self.proj_q1(x1).view(batch_size, seq_len1, self.num_heads, self.k_dim).permute(0, 2, 1, 3) 107 | k2 = self.proj_k2(x2).view(batch_size, seq_len2, self.num_heads, self.k_dim).permute(0, 2, 3, 1) 108 | v2 = self.proj_v2(x2).view(batch_size, seq_len2, self.num_heads, self.v_dim).permute(0, 2, 1, 3) 109 | 110 | attn = torch.matmul(q1, k2) / self.k_dim ** 0.5 111 | 112 | if mask is not None: 113 | attn = attn.masked_fill(mask == 0, -1e9) 114 | 115 | attn = F.softmax(attn, dim=-1) 116 | output = torch.matmul(attn, v2).permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len1, -1) 117 | output = self.proj_o(output) 118 | 119 | return output, 0 -------------------------------------------------------------------------------- /models/networks/tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import init 4 | import numpy as np 5 | import functools 6 | from torch.optim import lr_scheduler 7 | 8 | 9 | class Identity(nn.Module): 10 | def forward(self, x): 11 | return x 12 | 13 | 14 | def get_norm_layer(norm_type='instance'): 15 | """Return a normalization layer 16 | 17 | Parameters: 18 | norm_type (str) -- the name of the normalization layer: batch | instance | none 19 | 20 | For BatchNorm, we use learnable affine parameters and track running statistics (mean/stddev). 21 | For InstanceNorm, we do not use learnable affine parameters. We do not track running statistics. 22 | """ 23 | if norm_type == 'batch': 24 | norm_layer = functools.partial(nn.BatchNorm2d, affine=True, track_running_stats=True) 25 | elif norm_type == 'instance': 26 | norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False) 27 | elif norm_type == 'layer': 28 | norm_layer = functools.partial(nn.LayerNorm, eps=1e-6, elementwise_affine=True) 29 | elif norm_type == 'none': 30 | norm_layer = lambda x: Identity() 31 | else: 32 | raise NotImplementedError('normalization layer [%s] is not found' % norm_type) 33 | return norm_layer 34 | 35 | 36 | def get_scheduler(optimizer, opt): 37 | """Return a learning rate scheduler 38 | 39 | Parameters: 40 | optimizer -- the optimizer of the network 41 | opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.  
42 | opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine 43 | 44 | For 'linear', we keep the same learning rate for the first epochs 45 | and linearly decay the rate to zero over the next epochs. 46 | For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers. 47 | See https://pytorch.org/docs/stable/optim.html for more details. 48 | """ 49 | if opt.lr_policy == 'linear': 50 | def lambda_rule(epoch): 51 | lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.niter) / float(opt.niter_decay + 1) 52 | return lr_l 53 | 54 | scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule) 55 | elif opt.lr_policy == 'step': 56 | scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1) 57 | elif opt.lr_policy == 'plateau': 58 | scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, threshold=0.01, patience=5) 59 | elif opt.lr_policy == 'cosine': 60 | scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.niter, eta_min=0) 61 | else: 62 | return NotImplementedError('learning rate policy [%s] is not implemented', opt.lr_policy) 63 | return scheduler 64 | 65 | 66 | def init_weights(net, init_type='normal', init_gain=0.02): 67 | """Initialize network weights. 68 | 69 | Parameters: 70 | net (network) -- network to be initialized 71 | init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal 72 | init_gain (float) -- scaling factor for normal, xavier and orthogonal. 73 | 74 | We use 'normal' in the original pix2pix and CycleGAN paper. But xavier and kaiming might 75 | work better for some applications. Feel free to try yourself. 76 | """ 77 | 78 | def init_func(m): # define the initialization function 79 | classname = m.__class__.__name__ 80 | if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1): 81 | if init_type == 'normal': 82 | init.normal_(m.weight.data, 0.0, init_gain) 83 | elif init_type == 'xavier': 84 | init.xavier_normal_(m.weight.data, gain=init_gain) 85 | elif init_type == 'kaiming': 86 | init.kaiming_normal_(m.weight.data, a=0, mode='fan_in') 87 | elif init_type == 'orthogonal': 88 | init.orthogonal_(m.weight.data, gain=init_gain) 89 | else: 90 | raise NotImplementedError('initialization method [%s] is not implemented' % init_type) 91 | if hasattr(m, 'bias') and m.bias is not None: 92 | init.constant_(m.bias.data, 0.0) 93 | elif classname.find( 94 | 'BatchNorm2d') != -1: # BatchNorm Layer's weight is not a matrix; only normal distribution applies. 95 | init.normal_(m.weight.data, 1.0, init_gain) 96 | init.constant_(m.bias.data, 0.0) 97 | 98 | print('initialize network with %s' % init_type) 99 | net.apply(init_func) # apply the initialization function 100 | 101 | 102 | def init_net(net, init_type='normal', init_gain=0.02, gpu_ids=[]): 103 | """Initialize a network: 1. register CPU/GPU device (with multi-GPU support); 2. initialize the network weights 104 | Parameters: 105 | net (network) -- the network to be initialized 106 | init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal 107 | gain (float) -- scaling factor for normal, xavier and orthogonal. 108 | gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2 109 | 110 | Return an initialized network. 
111 | """ 112 | if len(gpu_ids) > 0: 113 | assert (torch.cuda.is_available()) 114 | net.to(gpu_ids[0]) 115 | net = torch.nn.DataParallel(net, gpu_ids) # multi-GPUs 116 | init_weights(net, init_type, init_gain=init_gain) 117 | return net 118 | 119 | 120 | def diagnose_network(net, name='network'): 121 | """Calculate and print the mean of average absolute(gradients) 122 | 123 | Parameters: 124 | net (torch network) -- Torch network 125 | name (str) -- the name of the network 126 | """ 127 | mean = 0.0 128 | count = 0 129 | for param in net.parameters(): 130 | if param.grad is not None: 131 | mean += torch.mean(torch.abs(param.grad.data)) 132 | count += 1 133 | if count > 0: 134 | mean = mean / count 135 | print(name) 136 | print(mean) 137 | 138 | 139 | class MidLayerFeatureExtractor(object): 140 | def __init__(self, layer): 141 | self.layer = layer 142 | self.feature = None 143 | self.layer.register_forward_hook(self.hook) 144 | self.device = None 145 | 146 | def hook(self, module, input, output): 147 | # default tensor on cpu 148 | self.is_empty = True 149 | self.feature = output.clone() 150 | self.is_empty = False 151 | # self.is_empty = True 152 | # self.feature = output 153 | # self.is_empty = False 154 | 155 | def extract(self): 156 | assert not self.is_empty, 'Synic Error in MidLayerFeatureExtractor, \ 157 | this may caused by calling extract method before the hooked module has execute forward method' 158 | return self.feature 159 | 160 | 161 | class MultiLayerFeatureExtractor(object): 162 | def __init__(self, net, layers): 163 | ''' 164 | Parameter: 165 | ----------------- 166 | net: torch.nn.Modules 167 | layers: str, something like "C.fc[0], module[1]" 168 | which will get mid layer features in net.C.fc[0] and net.module[1] respectively 169 | ''' 170 | self.net = net 171 | self.layer_names = layers.strip().split(',') 172 | self.layers = [self.str2layer(layer_name) for layer_name in self.layer_names] 173 | self.extractors = [MidLayerFeatureExtractor(layer) for layer in self.layers] 174 | 175 | def str2layer(self, name): 176 | modules = name.split('.') 177 | layer = self.net 178 | for module in modules: 179 | if '[' and ']' in module: 180 | sequential_name = module[:module.find('[')] 181 | target_module_num = int(module[module.find('[') + 1:module.find(']')]) 182 | layer = getattr(layer, sequential_name) 183 | layer = layer[target_module_num] 184 | else: 185 | layer = getattr(layer, module) 186 | 187 | return layer 188 | 189 | def extract(self): 190 | ans = [extractor.extract() for extractor in self.extractors] 191 | return ans 192 | 193 | 194 | def get_mask_from_lengths(lengths, max_len=None): 195 | batch_size = lengths.shape[0] 196 | if max_len is None: 197 | max_len = torch.max(lengths).item() 198 | 199 | ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device) 200 | mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) 201 | 202 | return mask 203 | -------------------------------------------------------------------------------- /models/our/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__init__.py -------------------------------------------------------------------------------- /models/our/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /models/our/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /models/our/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /models/our/__pycache__/our_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/our_model.cpython-311.pyc -------------------------------------------------------------------------------- /models/our/__pycache__/our_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/our_model.cpython-38.pyc -------------------------------------------------------------------------------- /models/our/__pycache__/our_model_ablation.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/our_model_ablation.cpython-311.pyc -------------------------------------------------------------------------------- /models/our/__pycache__/zelin_our_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/zelin_our_model.cpython-311.pyc -------------------------------------------------------------------------------- /models/our/our_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import json 4 | from collections import OrderedDict 5 | import torch.nn.functional as F 6 | from models.base_model import BaseModel 7 | from models.networks.lstm import LSTMEncoder 8 | from models.networks.classifier import FcClassifier 9 | from models.utils.config import OptConfig 10 | import math 11 | import torch.nn as nn 12 | 13 | 14 | class ourModel(BaseModel, nn.Module): 15 | 16 | def __init__(self, opt): 17 | """Initialize the LSTM autoencoder class 18 | Parameters: 19 | opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions 20 | """ 21 | nn.Module.__init__(self) 22 | super().__init__(opt) 23 | 24 | 25 | self.loss_names = [] 26 | self.model_names = [] 27 | 28 | # acoustic model 29 | self.netEmoA = LSTMEncoder(opt.input_dim_a, opt.embd_size_a, embd_method=opt.embd_method_a) 30 | self.model_names.append('EmoA') 31 | 32 | # visual model 33 | self.netEmoV = LSTMEncoder(opt.input_dim_v, opt.embd_size_v, opt.embd_method_v) 34 | self.model_names.append('EmoV') 35 | 36 | # Transformer Fusion model 37 | emo_encoder_layer = 
torch.nn.TransformerEncoderLayer(d_model=opt.hidden_size, nhead=int(opt.Transformer_head), batch_first=True)
38 |         self.netEmoFusion = torch.nn.TransformerEncoder(emo_encoder_layer, num_layers=opt.Transformer_layers)
39 |         self.model_names.append('EmoFusion')
40 |
41 |         # Classifier
42 |         cls_layers = list(map(lambda x: int(x), opt.cls_layers.split(',')))
43 |
44 |         # cls_input_size = opt.feature_max_len * opt.hidden_size, i.e. one fused hidden vector per time step
45 |         cls_input_size = opt.feature_max_len * opt.hidden_size + 1024  # with personalized feature
46 |
47 |
48 |         self.netEmoC = FcClassifier(cls_input_size, cls_layers, output_dim=opt.emo_output_dim, dropout=opt.dropout_rate)
49 |         self.model_names.append('EmoC')
50 |         self.loss_names.append('emo_CE')
51 |
52 |         self.netEmoCF = FcClassifier(cls_input_size, cls_layers, output_dim=opt.emo_output_dim, dropout=opt.dropout_rate)
53 |         self.model_names.append('EmoCF')
54 |         self.loss_names.append('EmoF_CE')
55 |
56 |         self.temperature = opt.temperature
57 |
58 |
59 |         # self.device = 'cpu'
60 |         # self.netEmoA = self.netEmoA.to(self.device)
61 |         # self.netEmoV = self.netEmoV.to(self.device)
62 |         # self.netEmoFusion = self.netEmoFusion.to(self.device)
63 |         # self.netEmoC = self.netEmoC.to(self.device)
64 |         # self.netEmoCF = self.netEmoCF.to(self.device)
65 |
66 |         self.criterion_ce = torch.nn.CrossEntropyLoss()
67 |
68 |         if self.isTrain:
69 |             if not opt.use_ICL:
70 |                 self.criterion_ce = torch.nn.CrossEntropyLoss()
71 |                 self.criterion_focal = torch.nn.CrossEntropyLoss()
72 |             else:
73 |                 self.criterion_ce = torch.nn.CrossEntropyLoss()
74 |                 self.criterion_focal = Focal_Loss()
75 |             # initialize optimizers; schedulers are created automatically later by the base model's setup routine
76 |             parameters = [{'params': getattr(self, 'net' + net).parameters()} for net in self.model_names]
77 |             self.optimizer = torch.optim.Adam(parameters, lr=opt.lr, betas=(opt.beta1, 0.999))
78 |             self.optimizers.append(self.optimizer)
79 |             self.ce_weight = opt.ce_weight
80 |             self.focal_weight = opt.focal_weight
81 |
82 |         # modify save_dir
83 |         self.save_dir = os.path.join(opt.checkpoints_dir, opt.name)
84 |         if not os.path.exists(self.save_dir):
85 |             os.makedirs(self.save_dir)
86 |
87 |
88 |     def post_process(self):
89 |         # called after model.setup()
90 |         def transform_key_for_parallel(state_dict):
91 |             return OrderedDict([('module.' 
+ key, value) for key, value in state_dict.items()])
92 |
93 |         if self.isTrain:
94 |             print('[ Init ] Load parameters from pretrained encoder network')
95 |             f = lambda x: transform_key_for_parallel(x)
96 |             self.netEmoA.load_state_dict(f(self.pretrained_encoder.netEmoA.state_dict()))
97 |             self.netEmoV.load_state_dict(f(self.pretrained_encoder.netEmoV.state_dict()))
98 |             self.netEmoFusion.load_state_dict(f(self.pretrained_encoder.netEmoFusion.state_dict()))
99 |
100 |     def load_from_opt_record(self, file_path):
101 |         opt_content = json.load(open(file_path, 'r'))
102 |         opt = OptConfig()
103 |         opt.load(opt_content)
104 |         return opt
105 |
106 |     def set_input(self, input):
107 |
108 |         self.acoustic = input['A_feat'].float().to(self.device)
109 |         self.visual = input['V_feat'].float().to(self.device)
110 |
111 |         self.emo_label = input['emo_label'].to(self.device)
112 |
113 |         if 'personalized_feat' in input:
114 |             self.personalized = input['personalized_feat'].float().to(self.device)
115 |         else:
116 |             self.personalized = None  # if no personalized features are given
117 |
118 |
119 |     def forward(self, acoustic_feat=None, visual_feat=None):
120 |         if acoustic_feat is not None:
121 |             self.acoustic = acoustic_feat.float().to(self.device)
122 |             self.visual = visual_feat.float().to(self.device)
123 |
124 |         """Run forward pass; called during both training and evaluation."""
125 |
126 |         emo_feat_A = self.netEmoA(self.acoustic)
127 |         emo_feat_V = self.netEmoV(self.visual)
128 |
129 |         '''ensure the time dimension is preserved'''
130 |         emo_fusion_feat = torch.cat((emo_feat_V, emo_feat_A), dim=-1)  # (batch_size, seq_len, 2 * embd_size)
131 |
132 |         emo_fusion_feat = self.netEmoFusion(emo_fusion_feat)
133 |
134 |         '''dynamically obtain the batch size'''
135 |         batch_size = emo_fusion_feat.size(0)
136 |
137 |         emo_fusion_feat = emo_fusion_feat.permute(1, 0, 2).reshape(batch_size, -1)  # flatten into [batch_size, seq_len * hidden_size]
138 |
139 |         if self.personalized is not None:
140 |             emo_fusion_feat = torch.cat((emo_fusion_feat, self.personalized), dim=-1)  # [batch_size, seq_len * hidden_size + 1024]
141 |
142 |         '''for back prop'''
143 |         self.emo_logits_fusion, _ = self.netEmoCF(emo_fusion_feat)
144 |         """-----------"""
145 |
146 |         self.emo_logits, _ = self.netEmoC(emo_fusion_feat)
147 |         self.emo_pred = F.softmax(self.emo_logits, dim=-1)
148 |
149 |     def backward(self):
150 |         """Calculate the loss for back propagation"""
151 |         self.loss_emo_CE = self.criterion_ce(self.emo_logits, self.emo_label)
152 |         self.loss_EmoF_CE = self.focal_weight * self.criterion_focal(self.emo_logits_fusion, self.emo_label)
153 |         loss = self.loss_emo_CE + self.loss_EmoF_CE
154 |
155 |         loss.backward()
156 |
157 |         for model in self.model_names:
158 |             torch.nn.utils.clip_grad_norm_(getattr(self, 'net' + model).parameters(), 1.0)
159 |
160 |     def optimize_parameters(self, epoch):
161 |         """Calculate losses, gradients, and update network weights; called in every training iteration"""
162 |         # forward
163 |         self.forward()
164 |         # backward
165 |         self.optimizer.zero_grad()
166 |         self.backward()
167 |
168 |         self.optimizer.step()
169 |
170 |
171 | class ActivateFun(torch.nn.Module):
172 |     def __init__(self, opt):
173 |         super(ActivateFun, self).__init__()
174 |         self.activate_fun = opt.activate_fun
175 |
176 |     def _gelu(self, x):
177 |         return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
178 |
179 |     def forward(self, x):
180 |         if self.activate_fun == 'relu':
181 |             return torch.relu(x)
182 |         elif self.activate_fun == 'gelu':
183 |             return self._gelu(x)
184 |
185 |
186 | class 
Focal_Loss(torch.nn.Module):
187 |     def __init__(self, weight=0.5, gamma=3, reduction='mean'):
188 |         super(Focal_Loss, self).__init__()
189 |         self.gamma = gamma
190 |         self.alpha = weight
191 |         self.reduction = reduction
192 |
193 |     def forward(self, preds, targets):
194 |         """
195 |         preds: raw (unnormalized) logits, shape [batch_size, num_classes]; passed directly to F.cross_entropy
196 |         targets: ground-truth class indices, shape [batch_size]
197 |         """
198 |         ce_loss = F.cross_entropy(preds, targets, reduction='none')  # per-sample CE so that pt is computed per sample and the reduction branches below are meaningful
199 |         pt = torch.exp(-ce_loss)
200 |         focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
201 |
202 |         if self.reduction == 'none':
203 |             return focal_loss
204 |         elif self.reduction == 'mean':
205 |             return torch.mean(focal_loss)
206 |         elif self.reduction == 'sum':
207 |             return torch.sum(focal_loss)
208 |         else:
209 |             raise NotImplementedError("Invalid reduction mode. Please choose 'none', 'mean', or 'sum'.")
210 |
-------------------------------------------------------------------------------- /models/utils/__init__.py: --------------------------------------------------------------------------------
1 | from .convert import *
2 | from .time_track import time_desc_decorator
3 | from .functions import *
-------------------------------------------------------------------------------- /models/utils/__pycache__/__init__.cpython-310.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/__init__.cpython-310.pyc
-------------------------------------------------------------------------------- /models/utils/__pycache__/__init__.cpython-311.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/__init__.cpython-311.pyc
-------------------------------------------------------------------------------- /models/utils/__pycache__/__init__.cpython-38.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/__init__.cpython-38.pyc
-------------------------------------------------------------------------------- /models/utils/__pycache__/config.cpython-310.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/config.cpython-310.pyc
-------------------------------------------------------------------------------- /models/utils/__pycache__/config.cpython-311.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/config.cpython-311.pyc
-------------------------------------------------------------------------------- /models/utils/__pycache__/config.cpython-38.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/config.cpython-38.pyc
-------------------------------------------------------------------------------- /models/utils/__pycache__/convert.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/convert.cpython-310.pyc -------------------------------------------------------------------------------- /models/utils/__pycache__/convert.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/convert.cpython-311.pyc -------------------------------------------------------------------------------- /models/utils/__pycache__/convert.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/convert.cpython-38.pyc -------------------------------------------------------------------------------- /models/utils/__pycache__/functions.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/functions.cpython-310.pyc -------------------------------------------------------------------------------- /models/utils/__pycache__/functions.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/functions.cpython-311.pyc -------------------------------------------------------------------------------- /models/utils/__pycache__/functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/functions.cpython-38.pyc -------------------------------------------------------------------------------- /models/utils/__pycache__/time_track.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/time_track.cpython-310.pyc -------------------------------------------------------------------------------- /models/utils/__pycache__/time_track.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/time_track.cpython-311.pyc -------------------------------------------------------------------------------- /models/utils/__pycache__/time_track.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/time_track.cpython-38.pyc -------------------------------------------------------------------------------- /models/utils/config-orin.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class OptConfig(object): 4 | def __init__(self): 5 | pass 6 | 7 | def load(self, config_dict): 8 | if sys.version > '3': 9 | for key, value in config_dict.items(): 10 | if not isinstance(value, dict): 11 | setattr(self, key, value) 12 | else: 13 | self.load(value) 14 | else: 15 | for key, value in config_dict.iteritems(): 16 | if not isinstance(value, dict): 17 | 
setattr(self, key, value) 18 | else: 19 | self.load(value) -------------------------------------------------------------------------------- /models/utils/config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class OptConfig(object): 4 | def __init__(self): 5 | pass 6 | 7 | def load(self, config_dict): 8 | if sys.version > '3': 9 | for key, value in config_dict.items(): 10 | if not isinstance(value, dict): 11 | setattr(self, key, value) 12 | else: 13 | self.load(value) 14 | else: 15 | for key, value in config_dict.iteritems(): 16 | if not isinstance(value, dict): 17 | setattr(self, key, value) 18 | else: 19 | self.load(value) -------------------------------------------------------------------------------- /models/utils/convert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def to_gpu(x, on_cpu=False, gpu_id=None): 4 | """Tensor => Variable""" 5 | if torch.cuda.is_available() and not on_cpu: 6 | x = x.cuda(gpu_id) 7 | return x 8 | 9 | def to_cpu(x): 10 | """Variable => Tensor""" 11 | if torch.cuda.is_available(): 12 | x = x.cpu() 13 | return x.data -------------------------------------------------------------------------------- /models/utils/functions.py: -------------------------------------------------------------------------------- 1 | from torch.autograd import Function 2 | import torch.nn as nn 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | """ 7 | Adapted from https://github.com/fungtion/DSN/blob/master/functions.py 8 | """ 9 | 10 | class ReverseLayerF(Function): 11 | 12 | @staticmethod 13 | def forward(ctx, x, p): 14 | ctx.p = p 15 | 16 | return x.view_as(x) 17 | 18 | @staticmethod 19 | def backward(ctx, grad_output): 20 | output = grad_output.neg() * ctx.p 21 | 22 | return output, None 23 | 24 | 25 | class MSE(nn.Module): 26 | def __init__(self): 27 | super(MSE, self).__init__() 28 | 29 | def forward(self, pred, real): 30 | diffs = torch.add(real, -pred) 31 | n = torch.numel(diffs.data) 32 | mse = torch.sum(diffs.pow(2)) / n 33 | 34 | return mse 35 | 36 | 37 | class SIMSE(nn.Module): 38 | 39 | def __init__(self): 40 | super(SIMSE, self).__init__() 41 | 42 | def forward(self, pred, real): 43 | diffs = torch.add(real, - pred) 44 | n = torch.numel(diffs.data) 45 | simse = torch.sum(diffs).pow(2) / (n ** 2) 46 | 47 | return simse 48 | 49 | 50 | class DiffLoss(nn.Module): 51 | 52 | def __init__(self): 53 | super(DiffLoss, self).__init__() 54 | 55 | def forward(self, input1, input2): 56 | 57 | batch_size = input1.size(0) 58 | input1 = input1.view(batch_size, -1) 59 | input2 = input2.view(batch_size, -1) 60 | 61 | # Zero mean 62 | input1_mean = torch.mean(input1, dim=0, keepdims=True) # 按维度求均值,keepdims=True保持转换后维度不变 63 | input2_mean = torch.mean(input2, dim=0, keepdims=True) 64 | input1 = input1 - input1_mean 65 | input2 = input2 - input2_mean 66 | 67 | input1_l2_norm = torch.norm(input1, p=2, dim=1, keepdim=True).detach() # 求范数 68 | input1_l2 = input1.div(input1_l2_norm.expand_as(input1) + 1e-6) 69 | 70 | input2_l2_norm = torch.norm(input2, p=2, dim=1, keepdim=True).detach() 71 | input2_l2 = input2.div(input2_l2_norm.expand_as(input2) + 1e-6) 72 | 73 | diff_loss = torch.mean((input1_l2.t().mm(input2_l2)).pow(2)) 74 | 75 | return diff_loss 76 | 77 | class CMD(nn.Module): 78 | """ 79 | Adapted from https://github.com/wzell/cmd/blob/master/models/domain_regularizer.py 80 | """ 81 | 82 | def __init__(self): 83 | super(CMD, self).__init__() 84 | 85 | def 
forward(self, x1, x2, n_moments):
86 |         mx1 = torch.mean(x1, 0)
87 |         mx2 = torch.mean(x2, 0)
88 |         sx1 = x1 - mx1
89 |         sx2 = x2 - mx2
90 |         dm = self.matchnorm(mx1, mx2)
91 |         scms = dm
92 |         for i in range(n_moments - 1):
93 |             scms += self.scm(sx1, sx2, i + 2)
94 |         return scms
95 |
96 |     def matchnorm(self, x1, x2):
97 |         power = torch.pow(x1 - x2, 2)
98 |         summed = torch.sum(power)
99 |         sqrt = summed ** (0.5)
100 |         return sqrt
101 |         # return ((x1-x2)**2).sum().sqrt()
102 |
103 |     def scm(self, sx1, sx2, k):
104 |         ss1 = torch.mean(torch.pow(sx1, k), 0)
105 |         ss2 = torch.mean(torch.pow(sx2, k), 0)
106 |         return self.matchnorm(ss1, ss2)
107 |
108 |
109 | class SupConLoss(nn.Module):
110 |
111 |     def __init__(self, temperature=0.5, scale_by_temperature=True):
112 |         super(SupConLoss, self).__init__()
113 |         self.temperature = temperature
114 |         self.scale_by_temperature = scale_by_temperature
115 |
116 |     def forward(self, features, labels=None, mask=None):
117 |         """
118 |         Inputs:
119 |             features: feature embeddings of the input samples, shape [batch_size, hidden_dim].
120 |             labels: ground-truth label of each sample, shape [batch_size].
121 |             mask: contrastive-learning mask, shape [batch_size, batch_size]; mask_{i,j}=1 if samples i and j share the same label.
122 |         Output:
123 |             the loss value
124 |         """
125 |         device = (torch.device('cuda')
126 |                   if features.is_cuda
127 |                   else torch.device('cpu'))
128 |         features = F.normalize(features, p=2, dim=1)
129 |         batch_size = features.shape[0]
130 |         # handling of the labels argument
131 |         if labels is not None and mask is not None:  # labels and mask cannot both be given, because when labels exist the mask must be derived from them
132 |             raise ValueError('Cannot define both `labels` and `mask`')
133 |         elif labels is None and mask is None:  # with neither labels nor mask this is the unsupervised case: the mask is an identity matrix, i.e. only (i, i) belong to the same class
134 |             mask = torch.eye(batch_size, dtype=torch.float32).to(device)
135 |         elif labels is not None:  # if labels are given, build the mask from them: mask_{i,j}=1 when samples i and j have the same label
136 |             labels = labels.contiguous().view(-1, 1)
137 |             if labels.shape[0] != batch_size:
138 |                 raise ValueError('Num of labels does not match num of features')
139 |             mask = torch.eq(labels, labels.T).float().to(device)
140 |         else:
141 |             mask = mask.float().to(device)
142 |         '''
143 |         Example:
144 |         labels:
145 |             tensor([[1.],
146 |                     [2.],
147 |                     [1.],
148 |                     [1.]])
149 |         mask:  # mask_{i,j}=1 when samples i and j have the same label
150 |             tensor([[1., 0., 1., 1.],
151 |                     [0., 1., 0., 0.],
152 |                     [1., 0., 1., 1.],
153 |                     [1., 0., 1., 1.]])
154 |         '''
155 |         # compute logits
156 |         anchor_dot_contrast = torch.div(
157 |             torch.matmul(features, features.T),
158 |             self.temperature)  # pairwise dot-product similarity between samples
159 |         # for numerical stability
160 |         logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
161 |         logits = anchor_dot_contrast - logits_max.detach()
162 |         exp_logits = torch.exp(logits)
163 |         '''
164 |         logits is anchor_dot_contrast with the maximum of each row subtracted, i.e. the final similarity
165 |         Example: logits: torch.size([4,4])
166 |         logits:
167 |             tensor([[ 0.0000, -0.0471, -0.3352, -0.2156],
168 |                     [-1.2576,  0.0000, -0.3367, -0.0725],
169 |                     [-1.3500, -0.1409, -0.1420,  0.0000],
170 |                     [-1.4312, -0.0776, -0.2009,  0.0000]])
171 |         '''
172 |         # build the masks
173 |         logits_mask = torch.ones_like(mask) - torch.eye(batch_size)
174 |         positives_mask = mask * logits_mask
175 |         negatives_mask = 1. - mask
176 |         '''
177 |         For the loss, position (i, i) is a sample's similarity with itself, which is useless, so it is masked out
178 |         # the entry at row ind, column ind is set to 0
179 |         resulting logits_mask:
180 |             tensor([[0., 1., 1., 1.],
181 |                     [1., 0., 1., 1.],
182 |                     [1., 1., 0., 1.],
183 |                     [1., 1., 1., 0.]])
184 |         positives_mask:
185 |             tensor([[0., 0., 1., 1.],
186 |                     [0., 0., 0., 0.],
187 |                     [1., 0., 0., 1.],
188 |                     [1., 0., 1., 0.]])
189 |         negatives_mask:
190 |             tensor([[0., 1., 0., 0.],
191 |                     [1., 0., 1., 1.],
192 |                     [0., 1., 0., 0.],
193 |                     [0., 1., 0., 0.]])
194 |         '''
195 |         num_positives_per_row = torch.sum(positives_mask, axis=1)  # number of positives per row, excluding the sample itself, e.g. [2 0 2 2]
196 |         denominator = torch.sum(
197 |             exp_logits * negatives_mask, axis=1, keepdims=True) + torch.sum(
198 |             exp_logits * positives_mask, axis=1, keepdims=True)
199 |
200 |         log_probs = logits - torch.log(denominator)
201 |         if torch.any(torch.isnan(log_probs)):
202 |             raise ValueError("Log_prob has nan!")
203 |
204 |         log_probs = torch.sum(
205 |             log_probs * positives_mask, axis=1)[num_positives_per_row > 0] / num_positives_per_row[
206 |             num_positives_per_row > 0]
207 |         '''
208 |         Compute the average log-likelihood over the positive samples.
209 |         A class may contain only one sample and therefore have no positives, e.g. the second entry of labels=[1, 2, 1, 1],
210 |         so only rows with more than zero positives are included here.
211 |         '''
212 |         # loss
213 |         loss = -log_probs
214 |         if self.scale_by_temperature:
215 |             loss *= self.temperature
216 |         loss = loss.mean()
217 |         return loss
-------------------------------------------------------------------------------- /models/utils/load_pretrained.py: --------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from .config import OptConfig
4 |
5 | def load_from_opt_record(file_path):
6 |     opt_content = json.load(open(file_path, 'r'))
7 |     opt = OptConfig()
8 |     opt.load(opt_content)
9 |     return opt
10 |
11 | def load_pretrained_model(model_class, checkpoints_dir, cv, gpu_ids):
12 |     path = os.path.join(checkpoints_dir, str(cv))
13 |     config_path = os.path.join(checkpoints_dir, 'train_opt.conf')
14 |     config = load_from_opt_record(config_path)
15 |     config.isTrain = False  # teacher model should be in test mode
16 |     config.gpu_ids = gpu_ids  # set gpu to the same
17 |     model = model_class(config)
18 |     model.cuda()
19 |     model.load_networks_cv(path)
20 |     model.eval()
21 |     return model
22 |
-------------------------------------------------------------------------------- /models/utils/time_track.py: --------------------------------------------------------------------------------
1 | import time
2 | from functools import partial
3 |
4 |
5 | def base_time_desc_decorator(method, desc='test_description'):
6 |     def timed(*args, **kwargs):
7 |
8 |         # Print Description
9 |         # print('#' * 50)
10 |         print(desc)
11 |         # print('#' * 50 + '\n')
12 |
13 |         # Calculation Runtime
14 |         start = time.time()
15 |
16 |         # Run Method
17 |         try:
18 |             result = method(*args, **kwargs)
19 |         except TypeError:
20 |             result = method(**kwargs)
21 |
22 |         # Print Runtime
23 |         print('Done! 
It took {:.2} secs\n'.format(time.time() - start)) 24 | 25 | if result is not None: 26 | return result 27 | 28 | return timed 29 | 30 | 31 | def time_desc_decorator(desc): return partial(base_time_desc_decorator, desc=desc) 32 | 33 | 34 | @time_desc_decorator('this is description') 35 | def time_test(arg, kwarg='this is kwarg'): 36 | time.sleep(3) 37 | print('Inside of time_test') 38 | print('printing arg: ', arg) 39 | print('printing kwarg: ', kwarg) 40 | 41 | 42 | @time_desc_decorator('this is second description') 43 | def no_arg_method(): 44 | print('this method has no argument') 45 | 46 | 47 | if __name__ == '__main__': 48 | time_test('hello', kwarg=3) 49 | time_test(3) 50 | no_arg_method() 51 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==1.5.1 2 | torch 3 | pandas 4 | -------------------------------------------------------------------------------- /scripts/Track1/train_1s_binary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory 6 | AUDIOFEATURE_METHOD="mfccs" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="openface" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"} 9 | LABELCOUNT=2 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track1" 11 | FEATURE_MAX_LEN=26 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding 12 | BATCH_SIZE=1 13 | LR=0.00002 14 | NUM_EPOCHS=200 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 | for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo "\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done -------------------------------------------------------------------------------- /scripts/Track1/train_1s_quinary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root 
directory 6 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="densenet" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"} 9 | LABELCOUNT=5 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track1" 11 | FEATURE_MAX_LEN=26 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding 12 | BATCH_SIZE=1 13 | LR=1.65701813672055e-5 14 | NUM_EPOCHS=400 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 | for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo "\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done 54 | -------------------------------------------------------------------------------- /scripts/Track1/train_1s_ternary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory 6 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="resnet" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"} 9 | LABELCOUNT=3 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track1" 11 | FEATURE_MAX_LEN=26 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding 12 | BATCH_SIZE=2 13 | LR=4.58358993791005e-06 14 | NUM_EPOCHS=400 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 
| for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo "\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done 54 | -------------------------------------------------------------------------------- /scripts/Track1/train_5s_binary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory 6 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="resnet" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="5s" # Window duration, options {"1s", "5s"} 9 | LABELCOUNT=2 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track1" 11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding 12 | BATCH_SIZE=2 13 | LR=0.000018 14 | NUM_EPOCHS=200 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 | for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo "\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done -------------------------------------------------------------------------------- /scripts/Track1/train_5s_quinary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory 6 | AUDIOFEATURE_METHOD="mfccs" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="resnet" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="5s" # 
Window duration, options {"1s", "5s"} 9 | LABELCOUNT=5 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track1" 11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding 12 | BATCH_SIZE=16 13 | LR=0.000098 14 | NUM_EPOCHS=200 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 | for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo "\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done -------------------------------------------------------------------------------- /scripts/Track1/train_5s_ternary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory 6 | AUDIOFEATURE_METHOD="wav2vec" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="openface" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="5s" # Window duration, options {"1s", "5s"} 9 | LABELCOUNT=3 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track1" 11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding 12 | BATCH_SIZE=16 13 | LR=0.000147971824491024 14 | NUM_EPOCHS=400 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 | for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | 
--labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo "\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done 54 | -------------------------------------------------------------------------------- /scripts/Track2/train_1s_binary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="D:/HACI/MMchallenge/NEUQdata" # Dataset root directory 6 | AUDIOFEATURE_METHOD="wav2vec" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="openface" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"} 9 | LABELCOUNT=2 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track2" 11 | FEATURE_MAX_LEN=25 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding 12 | BATCH_SIZE=16 13 | LR=0.00006 14 | NUM_EPOCHS=500 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 | for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo "\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done -------------------------------------------------------------------------------- /scripts/Track2/train_1s_ternary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="D:/HACI/MMchallenge/NEUQdata" # Dataset root directory 6 | AUDIOFEATURE_METHOD="mfccs" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="densenet" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"} 9 | LABELCOUNT=3 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track2" 11 | FEATURE_MAX_LEN=25 # Set maximum feature length; pad with zeros if insufficient, truncate if 
exceeding 12 | BATCH_SIZE=8 13 | LR=0.00026 14 | NUM_EPOCHS=500 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 | for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo "\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done -------------------------------------------------------------------------------- /scripts/Track2/train_5s_binary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="D:/HACI/MMchallenge/NEUQdata" # Dataset root directory 6 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="resnet" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="5s" # Window duration, options {"1s", "5s"} 9 | LABELCOUNT=2 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track2" 11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding 12 | BATCH_SIZE=24 13 | LR=0.00005 14 | NUM_EPOCHS=500 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 | for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo 
"\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done -------------------------------------------------------------------------------- /scripts/Track2/train_5s_ternary.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | data_rootpath="D:/HACI/MMchallenge/NEUQdata" # Dataset root directory 6 | AUDIOFEATURE_METHOD="mfccs" # Audio feature type, options {wav2vec, opensmile, mfccs} 7 | VIDEOLFEATURE_METHOD="densenet" # Video feature type, options {openface, resnet, densenet} 8 | SPLITWINDOW="5s" # Window duration, options {"1s", "5s"} 9 | LABELCOUNT=3 # Number of label categories, options {2, 3, 5} 10 | TRACK_OPTION="Track2" 11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding 12 | BATCH_SIZE=8 13 | LR=0.0004 14 | NUM_EPOCHS=500 15 | DEVICE="cpu" # Options {cuda, cpu} 16 | 17 | 18 | for arg in "$@"; do 19 | case $arg in 20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;; 21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 28 | --lr=*) LR="${arg#*=}" ;; 29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 30 | --device=*) DEVICE="${arg#*=}" ;; 31 | *) echo "Unknown option: $arg"; exit 1 ;; 32 | esac 33 | done 34 | 35 | for i in `seq 1 1 1`; do 36 | cmd="python train.py \ 37 | --data_rootpath=$data_rootpath \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --lr=$LR \ 46 | --num_epochs=$NUM_EPOCHS \ 47 | --device=$DEVICE" 48 | 49 | echo "\n-------------------------------------------------------------------------------------" 50 | echo "Execute command: $cmd" 51 | echo "-------------------------------------------------------------------------------------\n" 52 | echo $cmd | sh 53 | done -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | # Default Training Parameters 5 | DATA_ROOTPATH="E:/MDPP_data/MPDD-Elderly" 6 | TRAIN_MODEL="D:/HACI/MMchallenge/MEIJU2025-baseline-master/MPDD/checkpoints/1s_5labels_opensmile+densenet/best_model_2025-02-13-21.12.01.pth" 7 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs} 8 | VIDEOLFEATURE_METHOD="densenet" # Video feature type, options {openface, resnet, densenet} 9 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"} 10 | LABELCOUNT=5 # Number of label categories, options {2, 3, 5} 11 | TRACK_OPTION="Track1" 12 | FEATURE_MAX_LEN=26 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding. 
For Track1, options {26, 5}; for Track2, options {25, 5} 13 | BATCH_SIZE=1 14 | DEVICE="cpu" 15 | 16 | for arg in "$@"; do 17 | case $arg in 18 | --data_rootpath=*) DATA_ROOTPATH="${arg#*=}" ;; 19 | --train_model=*) TRAIN_MODEL="${arg#*=}" ;; 20 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;; 21 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;; 22 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;; 23 | --labelcount=*) LABELCOUNT="${arg#*=}" ;; 24 | --track_option=*) TRACK_OPTION="${arg#*=}" ;; 25 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;; 26 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;; 27 | --lr=*) LR="${arg#*=}" ;; 28 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;; 29 | --device=*) DEVICE="${arg#*=}" ;; 30 | *) echo "Unknown option: $arg"; exit 1 ;; 31 | esac 32 | done 33 | 34 | for i in `seq 1 1 1`; do 35 | cmd="python test.py \ 36 | --data_rootpath=$DATA_ROOTPATH \ 37 | --train_model=$TRAIN_MODEL \ 38 | --audiofeature_method=$AUDIOFEATURE_METHOD \ 39 | --videofeature_method=$VIDEOLFEATURE_METHOD \ 40 | --splitwindow_time=$SPLITWINDOW \ 41 | --labelcount=$LABELCOUNT \ 42 | --track_option=$TRACK_OPTION \ 43 | --feature_max_len=$FEATURE_MAX_LEN \ 44 | --batch_size=$BATCH_SIZE \ 45 | --device=$DEVICE" 46 | 47 | echo "\n-------------------------------------------------------------------------------------" 48 | echo "Execute command: $cmd" 49 | echo "-------------------------------------------------------------------------------------\n" 50 | echo $cmd | sh 51 | done 52 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | from models.our.our_model import ourModel 5 | from train import eval 6 | import argparse 7 | from utils.logger import get_logger 8 | import numpy as np 9 | import pandas as pd 10 | import time 11 | from torch.utils.data import DataLoader 12 | from dataset import * 13 | 14 | class Opt: 15 | def __init__(self, config_dict): 16 | self.__dict__.update(config_dict) 17 | 18 | def load_config(config_file): 19 | with open(config_file, 'r') as f: 20 | return json.load(f) 21 | 22 | if __name__ == '__main__': 23 | parser = argparse.ArgumentParser(description="Test MDPP Model") 24 | parser.add_argument('--labelcount', type=int, default=2, 25 | help="Number of data categories (2, 3, or 5).") 26 | parser.add_argument('--track_option', type=str, required=True, 27 | help="Track1 or Track2") 28 | parser.add_argument('--feature_max_len', type=int, required=True, 29 | help="Max length of feature.") 30 | parser.add_argument('--data_rootpath', type=str, required=True, 31 | help="Root path to the program dataset") 32 | parser.add_argument('--train_model', type=str, required=True, 33 | help="Path to the training model") 34 | 35 | parser.add_argument('--test_json', type=str, required=False, 36 | help="File name of the testing JSON file") 37 | parser.add_argument('--personalized_features_file', type=str, 38 | help="File name of the personalized features file") 39 | 40 | parser.add_argument('--audiofeature_method', type=str, default='wav2vec', 41 | choices=['mfccs', 'opensmile', 'wav2vec'], 42 | help="Method for extracting audio features.") 43 | parser.add_argument('--videofeature_method', type=str, default='openface', 44 | choices=['openface', 'resnet', 'densenet'], 45 | help="Method for extracting video features.") 46 | parser.add_argument('--splitwindow_time', type=str, default='1s', 47 | 
help="Time window for splitted features. e.g. '1s' or '5s'") 48 | 49 | parser.add_argument('--batch_size', type=int, default=24, 50 | help="Batch size for testing") 51 | parser.add_argument('--lr', type=float, default=1e-4, 52 | help="Learning rate") 53 | parser.add_argument('--device', type=str, default='cpu', 54 | help="Device to test the model on, e.g. 'cuda' or 'cpu'") 55 | 56 | args = parser.parse_args() 57 | 58 | args.test_json = os.path.join(args.data_rootpath, 'Testing', 'labels', 'Testing_files.json') 59 | args.personalized_features_file = os.path.join(args.data_rootpath, 'Testing', 'individualEmbedding', 'descriptions_embeddings_with_ids.npy') 60 | 61 | 62 | config = load_config('config.json') 63 | opt = Opt(config) 64 | 65 | # Modify individual dynamic parameters in opt according to task category 66 | opt.emo_output_dim = args.labelcount 67 | opt.feature_max_len = args.feature_max_len 68 | opt.lr = args.lr 69 | 70 | # Splice out feature folder paths according to incoming audio and video feature types 71 | audio_path = os.path.join(args.data_rootpath, 'Testing', f"{args.splitwindow_time}", 'Audio', f"{args.audiofeature_method}") + '/' 72 | video_path = os.path.join(args.data_rootpath, 'Testing', f"{args.splitwindow_time}", 'Visual', f"{args.videofeature_method}") + '/' 73 | 74 | # Obtain input_dim_a, input_dim_v 75 | for filename in os.listdir(audio_path): 76 | if filename.endswith('.npy'): 77 | opt.input_dim_a = np.load(audio_path + filename).shape[1] 78 | break 79 | 80 | for filename in os.listdir(video_path): 81 | if filename.endswith('.npy'): 82 | opt.input_dim_v = np.load(video_path + filename).shape[1] 83 | break 84 | 85 | opt.name = f'{args.splitwindow_time}_{args.labelcount}labels_{args.audiofeature_method}+{args.videofeature_method}' 86 | logger_path = os.path.join(opt.log_dir, opt.name) 87 | if not os.path.exists(opt.log_dir): 88 | os.mkdir(opt.log_dir) 89 | if not os.path.exists(logger_path): 90 | os.mkdir(logger_path) 91 | logger = get_logger(logger_path, 'result') 92 | 93 | cur_time = time.strftime('%Y-%m-%d-%H.%M.%S', time.localtime(time.time())) 94 | best_model_name = f"best_model_{cur_time}.pth" 95 | 96 | logger.info(f"splitwindow_time={args.splitwindow_time}, audiofeature_method={args.audiofeature_method}, " 97 | f"videofeature_method={args.videofeature_method}") 98 | logger.info(f"batch_size={args.batch_size}, , " 99 | f"labels={opt.emo_output_dim}, feature_max_len={opt.feature_max_len}") 100 | 101 | 102 | model = ourModel(opt) 103 | model.load_state_dict(torch.load(args.train_model, map_location=torch.device(args.device))) 104 | model.to(args.device) 105 | test_data = json.load(open(args.test_json, 'r')) 106 | test_loader = DataLoader( 107 | AudioVisualDataset(test_data, args.labelcount, args.personalized_features_file, opt.feature_max_len, 108 | batch_size=args.batch_size, 109 | audio_path=audio_path, video_path=video_path,isTest=True), batch_size=args.batch_size, shuffle=False) 110 | logger.info('The number of testing samples = %d' % len(test_loader.dataset)) 111 | 112 | # testing 113 | _, pred, *_ = eval(model, test_loader, args.device) 114 | 115 | filenames = [item["audio_feature_path"] for item in test_data if "audio_feature_path" in item] 116 | IDs = [path[:path.find('.')] for path in filenames] 117 | 118 | if args.labelcount==2: 119 | label="bin" 120 | elif args.labelcount==3: 121 | label="tri" 122 | elif args.labelcount==5: 123 | label="pen" 124 | 125 | 126 | # output results to CSV 127 | pred_col_name = f"{args.splitwindow_time}_{label}" 128 | 129 | 
result_dir = f"./answer_{args.track_option}" 130 | if not os.path.exists(result_dir): 131 | os.makedirs(result_dir) 132 | 133 | csv_file = f"{result_dir}/submission.csv" 134 | 135 | # Get the order of the IDs in the test data to ensure consistency 136 | test_ids = [np.int64(item["audio_feature_path"].split('_')[0])for item in test_data] 137 | 138 | if os.path.exists(csv_file): 139 | df = pd.read_csv(csv_file) 140 | else: 141 | df = pd.DataFrame(columns=["ID"]) 142 | 143 | pred = np.array(pred) 144 | 145 | if len(pred) != len(test_data): 146 | logger.error(f"Prediction length {len(pred)} does not match test data length {len(test_data)}") 147 | raise ValueError("Mismatch between predictions and test data") 148 | 149 | # zelin: ID 直接使用 audio_feature_path 去除 .npy 后缀 150 | id_list = [item["audio_feature_path"].replace(".npy", "") for item in test_data] 151 | 152 | # 构建预测结果 DataFrame 153 | result_df = pd.DataFrame({ 154 | "ID": id_list, 155 | pred_col_name: pred 156 | }) 157 | 158 | # 如果已有 CSV 文件,按 ID 合并;否则新建 159 | if os.path.exists(csv_file): 160 | existing_df = pd.read_csv(csv_file) 161 | 162 | # 合并已有 CSV 和本轮结果(保留所有 ID,自动对齐列) 163 | merged_df = pd.merge(existing_df, result_df, on="ID", how="outer") 164 | else: 165 | merged_df = result_df 166 | 167 | # 保存更新后的结果(覆盖写入,但保留所有旧列 + 本轮预测列) 168 | merged_df.to_csv(csv_file, index=False) 169 | logger.info(f"Testing complete. Results saved to: {csv_file}. Shape={merged_df.shape}") 170 | 171 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | import json 4 | import time 5 | import argparse 6 | import torch 7 | from sklearn.metrics import f1_score, confusion_matrix, accuracy_score 8 | from torch.utils.data import DataLoader 9 | from train_val_split import train_val_split1, train_val_split2 10 | from models.our.our_model import ourModel 11 | from dataset import * 12 | from utils.logger import get_logger 13 | import numpy as np 14 | 15 | class Opt: 16 | def __init__(self, config_dict): 17 | self.__dict__.update(config_dict) 18 | 19 | def load_config(config_file): 20 | with open(config_file, 'r') as f: 21 | return json.load(f) 22 | 23 | def eval(model, val_loader, device): 24 | model.eval() 25 | total_emo_pred = [] 26 | total_emo_label = [] 27 | 28 | with torch.no_grad(): 29 | for data in val_loader: 30 | for k, v in data.items(): 31 | data[k] = v.to(device) 32 | model.set_input(data) 33 | model.test() 34 | emo_pred = model.emo_pred.argmax(dim=1).cpu().numpy() 35 | emo_label = data['emo_label'].cpu().numpy() 36 | total_emo_pred.append(emo_pred) 37 | total_emo_label.append(emo_label) 38 | 39 | total_emo_pred = np.concatenate(total_emo_pred) 40 | total_emo_label = np.concatenate(total_emo_label) 41 | 42 | emo_acc_unweighted = accuracy_score(total_emo_label, total_emo_pred, sample_weight=None) 43 | class_counts = np.bincount(total_emo_label) # Get the sample size for each category 44 | sample_weights = 1 / (class_counts[total_emo_label] + 1e-6) # Calculate weights for each sample to avoid division by zero errors 45 | emo_acc_weighted = accuracy_score(total_emo_label, total_emo_pred, sample_weight=sample_weights) 46 | 47 | emo_f1_weighted = f1_score(total_emo_label, total_emo_pred, average='weighted') 48 | emo_f1_unweighted = f1_score(total_emo_label, total_emo_pred, average='macro') 49 | emo_cm = confusion_matrix(total_emo_label, total_emo_pred) 50 | 51 | return 
total_emo_label,total_emo_pred,emo_acc_weighted, emo_acc_unweighted, emo_f1_weighted, emo_f1_unweighted, emo_cm 52 | 53 | 54 | def train_model(train_json, model, audio_path='', video_path='', max_len=5, 55 | best_model_name='best_model.pth', seed=None): 56 | """ 57 | Train the model and evaluate it on the validation split after every epoch; the checkpoint with the best weighted F1 is saved. 58 | """ 59 | logger.info(f'personalized features used: {args.personalized_features_file}') 60 | num_epochs = args.num_epochs 61 | device = args.device 62 | print(f"device: {device}") 63 | model.to(device) 64 | 65 | # split training and validation set 66 | # data = json.load(open(train_json, 'r')) 67 | if args.track_option=='Track1': 68 | train_data, val_data, train_category_count, val_category_count = train_val_split1(train_json, val_ratio=0.1, random_seed=seed) 69 | elif args.track_option=='Track2': 70 | train_data, val_data, train_category_count, val_category_count = train_val_split2(train_json, val_percentage=0.1, 71 | seed=seed) 72 | 73 | train_loader = DataLoader( 74 | AudioVisualDataset(train_data, args.labelcount, args.personalized_features_file, max_len, 75 | batch_size=args.batch_size, 76 | audio_path=audio_path, video_path=video_path), batch_size=args.batch_size, shuffle=True) 77 | val_loader = DataLoader( 78 | AudioVisualDataset(val_data, args.labelcount, args.personalized_features_file, max_len, 79 | batch_size=args.batch_size, 80 | audio_path=audio_path, video_path=video_path), batch_size=args.batch_size, shuffle=False) 81 | 82 | logger.info('The number of training samples = %d' % len(train_loader.dataset)) 83 | logger.info('The number of val samples = %d' % len(val_loader.dataset)) 84 | 85 | best_emo_acc = 0.0 86 | best_emo_f1 = 0.0 87 | best_emo_epoch = 1 88 | best_emo_cm = [] 89 | 90 | for epoch in range(num_epochs): 91 | model.train(True) 92 | total_loss = 0 93 | 94 | for i, data in enumerate(train_loader): 95 | for k, v in data.items(): 96 | data[k] = v.to(device) 97 | model.set_input(data) 98 | model.optimize_parameters(epoch) 99 | 100 | losses = model.get_current_losses() 101 | total_loss += losses['emo_CE'] 102 | 103 | avg_loss = total_loss / len(train_loader) 104 | 105 | # evaluation 106 | label, pred, emo_acc_weighted, emo_acc_unweighted, emo_f1_weighted, emo_f1_unweighted, emo_cm = eval(model, val_loader, 107 | device) 108 | 109 | logger.info(f"Epoch {epoch + 1}/{num_epochs}, Avg Loss: {avg_loss:.10f}, " 110 | f"Weighted F1: {emo_f1_weighted:.10f}, Unweighted F1: {emo_f1_unweighted:.10f}, " 111 | f"Weighted Acc: {emo_acc_weighted:.10f}, Unweighted Acc: {emo_acc_unweighted:.10f}") 112 | logger.info('Confusion Matrix:\n{}'.format(emo_cm)) 113 | 114 | if emo_f1_weighted > best_emo_f1: 115 | cur_time = time.strftime('%Y-%m-%d-%H.%M.%S', time.localtime(time.time())) 116 | best_emo_f1 = emo_f1_weighted 117 | best_emo_f1_unweighted = emo_f1_unweighted 118 | best_emo_acc = emo_acc_weighted 119 | best_emo_acc_unweighted = emo_acc_unweighted 120 | best_emo_cm = emo_cm 121 | best_emo_epoch = epoch + 1 122 | best_model = model 123 | save_path = os.path.join(os.path.join(opt.checkpoints_dir, opt.name), best_model_name) 124 | torch.save(model.state_dict(), save_path) 125 | print("Saved best model.") 126 | 127 | logger.info(f"Training complete. Random seed: {seed}.
Best epoch: {best_emo_epoch}.") 128 | logger.info(f"Best Weighted F1: {best_emo_f1:.4f}, Best Unweighted F1: {best_emo_f1_unweighted:.4f}, " 129 | f"Best Weighted Acc: {best_emo_acc:.4f}, Best Unweighted Acc: {best_emo_acc_unweighted:.4f}.") 130 | logger.info('Confusion Matrix:\n{}'.format(best_emo_cm)) 131 | 132 | # output results to CSV 133 | csv_file = f'{opt.log_dir}/{opt.name}.csv' 134 | formatted_best_emo_cm = ' '.join([f"[{' '.join(map(str, row))}]" for row in best_emo_cm]) 135 | header = f"Time,random seed,splitwindow_time,labelcount,audiofeature_method,videofeature_method," \ 136 | f"batch_size,num_epochs,feature_max_len,lr," \ 137 | f"Weighted_F1,Unweighted_F1,Weighted_Acc,Unweighted_Acc,Confusion_Matrix" 138 | result_value = f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')},{seed},{args.splitwindow_time},{args.labelcount},{args.audiofeature_method},{args.videofeature_method}," \ 139 | f"{args.batch_size},{args.num_epochs},{opt.feature_max_len},{opt.lr:.6f}," \ 140 | f"{best_emo_f1:.4f},{best_emo_f1_unweighted:.4f},{best_emo_acc:.4f},{best_emo_acc_unweighted:.4f},{formatted_best_emo_cm}" 141 | file_exists = os.path.exists(csv_file) 142 | # Open file (append if file exists, create if it doesn't) 143 | with open(csv_file, mode='a') as file: 144 | if not file_exists: 145 | file.write(header + '\n') 146 | file.write(result_value + '\n') 147 | 148 | return best_emo_f1, best_emo_f1_unweighted, best_emo_acc, best_emo_acc_unweighted, best_emo_cm 149 | 150 | 151 | if __name__ == '__main__': 152 | parser = argparse.ArgumentParser(description="Train MDPP Model") 153 | parser.add_argument('--labelcount', type=int, default=3, 154 | help="Number of data categories (2, 3, or 5).") 155 | parser.add_argument('--track_option', type=str, required=True, 156 | help="Track1 or Track2") 157 | parser.add_argument('--feature_max_len', type=int, required=True, 158 | help="Max length of feature.") 159 | parser.add_argument('--data_rootpath', type=str, required=True, 160 | help="Root path to the program dataset") 161 | parser.add_argument('--train_json', type=str, required=False, 162 | help="File name of the training JSON file") 163 | parser.add_argument('--personalized_features_file', type=str, 164 | help="File name of the personalized features file") 165 | parser.add_argument('--audiofeature_method', type=str, default='mfccs', 166 | choices=['mfccs', 'opensmile', 'wav2vec'], 167 | help="Method for extracting audio features.") 168 | parser.add_argument('--videofeature_method', type=str, default='densenet', 169 | choices=['openface', 'resnet', 'densenet'], 170 | help="Method for extracting video features.") 171 | parser.add_argument('--splitwindow_time', type=str, default='1s', 172 | help="Time window for splitted features. e.g. '1s' or '5s'") 173 | 174 | parser.add_argument('--batch_size', type=int, default=32, 175 | help="Batch size for training") 176 | parser.add_argument('--lr', type=float, default=1e-4, 177 | help="Learning rate") 178 | parser.add_argument('--num_epochs', type=int, default=10, 179 | help="Number of epochs to train the model") 180 | parser.add_argument('--device', type=str, default='cpu', 181 | help="Device to train the model on, e.g. 
'cuda' or 'cpu'") 182 | 183 | args = parser.parse_args() 184 | 185 | args.train_json = os.path.join(args.data_rootpath, 'Training', 'labels', 'Training_Validation_files.json') 186 | args.personalized_features_file = os.path.join(args.data_rootpath, 'Training', 'individualEmbedding', 'descriptions_embeddings_with_ids.npy') 187 | 188 | config = load_config('config.json') 189 | opt = Opt(config) 190 | 191 | # Modify individual dynamic parameters in opt according to task category 192 | opt.emo_output_dim = args.labelcount 193 | opt.feature_max_len = args.feature_max_len 194 | opt.lr = args.lr 195 | 196 | # Splice out feature folder paths according to incoming audio and video feature types 197 | audio_path = os.path.join(args.data_rootpath, 'Training', f"{args.splitwindow_time}", 'Audio', f"{args.audiofeature_method}") + '/' 198 | video_path = os.path.join(args.data_rootpath, 'Training', f"{args.splitwindow_time}", 'Visual', f"{args.videofeature_method}") + '/' 199 | 200 | # Obtain input_dim_a, input_dim_v 201 | for filename in os.listdir(audio_path): 202 | if filename.endswith('.npy'): 203 | opt.input_dim_a = np.load(audio_path + filename).shape[1] 204 | break 205 | 206 | for filename in os.listdir(video_path): 207 | if filename.endswith('.npy'): 208 | opt.input_dim_v = np.load(video_path + filename).shape[1] 209 | break 210 | 211 | 212 | opt.name = f'{args.splitwindow_time}_{args.labelcount}labels_{args.audiofeature_method}+{args.videofeature_method}' 213 | logger_path = os.path.join(opt.log_dir, opt.name) 214 | if not os.path.exists(opt.log_dir): 215 | os.mkdir(opt.log_dir) 216 | if not os.path.exists(logger_path): 217 | os.mkdir(logger_path) 218 | logger = get_logger(logger_path, 'result') 219 | 220 | model = ourModel(opt) 221 | 222 | cur_time = time.strftime('%Y-%m-%d-%H.%M.%S', time.localtime(time.time())) 223 | best_model_name = f"best_model_{cur_time}.pth" 224 | 225 | logger.info(f"splitwindow_time={args.splitwindow_time}, audiofeature_method={args.audiofeature_method}, " 226 | f"videofeature_method={args.videofeature_method}") 227 | logger.info(f"batch_size={args.batch_size}, num_epochs={args.num_epochs}, " 228 | f"labels={opt.emo_output_dim}, feature_max_len={opt.feature_max_len}, lr={opt.lr}") 229 | 230 | # set random seed 231 | # seed = np.random.randint(0, 10000) 232 | seed = 3407 233 | np.random.seed(seed) 234 | torch.manual_seed(seed) 235 | torch.cuda.manual_seed_all(seed) 236 | 237 | logger.info(f"Using random seed: {seed}") 238 | 239 | # training 240 | train_model( 241 | train_json=args.train_json, 242 | model=model, 243 | max_len=opt.feature_max_len, 244 | best_model_name=best_model_name, 245 | audio_path=audio_path, 246 | video_path=video_path, 247 | seed=seed 248 | ) 249 | -------------------------------------------------------------------------------- /train_val_split.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from collections import defaultdict 4 | import math 5 | import numpy as np 6 | import torch 7 | 8 | 9 | def train_val_split2(file_path, val_percentage=0.10, seed=None): 10 | """ 11 | Track2 dataset split 12 | 13 | Group data by person ID from the given JSON file and select 10% of the person IDs, ensuring that the proportion of each label remains consistent with the original data. For each person, either all their data is selected or none is selected. 
14 | 15 | Parameters: 16 | - file_path: path to the JSON file 17 | - val_percentage: proportion of data to select (based on the number of person IDs, default is 10%) 18 | - seed: random seed 19 | 20 | Returns: 21 | - train_data: training set 22 | - val_data: validation set 23 | - train_category_count: total count of tri_category labels in the training set 24 | - val_category_count: total count of tri_category labels in the validation set 25 | """ 26 | 27 | if seed is not None: 28 | random.seed(seed) 29 | 30 | with open(file_path, 'r') as file: 31 | data = json.load(file) 32 | 33 | grouped_by_person = defaultdict(list) 34 | for entry in data: 35 | # Extract person ID (assumes format "personID_topicID.npy") 36 | person_id = entry['audio_feature_path'].split('_')[0] 37 | grouped_by_person[person_id].append(entry) 38 | 39 | # Evenly distribute persons based on label category (young dataset is split according to tri_category) 40 | tri_category_person = defaultdict(list) 41 | for person_id, entries in grouped_by_person.items(): 42 | tri_category = entries[0]['tri_category'] 43 | tri_category_person[tri_category].append(person_id) 44 | 45 | total_person_count = len(grouped_by_person) 46 | num_persons_to_select = round(total_person_count * val_percentage) 47 | 48 | selected_person_ids = set() 49 | 50 | # Calculate the number of persons per category and the number to be selected 51 | selected_per_category = defaultdict(int) 52 | for category, person_ids in tri_category_person.items(): 53 | num_category_persons = len(person_ids) 54 | num_category_to_select = round(num_category_persons * val_percentage + 0.001) 55 | selected_per_category[category] = num_category_to_select 56 | 57 | for category, person_ids in tri_category_person.items(): 58 | num_category_to_select = selected_per_category[category] 59 | selected_person_ids.update(random.sample(person_ids, num_category_to_select)) 60 | 61 | # Build the validation set data 62 | val_data = [] 63 | for entry in data: 64 | person_id = entry['audio_feature_path'].split('_')[0] 65 | if person_id in selected_person_ids: 66 | val_data.append(entry) 67 | 68 | # Training set 69 | train_data = [entry for entry in data if entry not in val_data] 70 | 71 | # Count the total number of tri_category labels in train_data and val_data 72 | train_category_count = defaultdict(int) 73 | val_category_count = defaultdict(int) 74 | 75 | for entry in train_data: 76 | train_category_count[entry['tri_category']] += 1 77 | 78 | for entry in val_data: 79 | val_category_count[entry['tri_category']] += 1 80 | # Save train_data and val_data to JSON file (if needed) 81 | 82 | 83 | return train_data, val_data, train_category_count, val_category_count 84 | 85 | import json 86 | import random 87 | from collections import defaultdict 88 | 89 | def train_val_split1(file_path, val_ratio=0.1, random_seed=3407): 90 | """ 91 | Track1 dataset split 92 | 93 | Read the JSON file and split it into training and validation sets according to the specified rules: 94 | - Data with label=4 are split in a 2:1 ratio; 95 | - Data with label=3 and id=69 are placed directly into the validation set; 96 | - The remaining data are split according to val_ratio. 97 | 98 | Ensure that: 99 | - Samples with the same ID do not appear in both the training and validation sets. 100 | - The return format is consistent with train_val_split. 
101 | 102 | Parameters: 103 | file_path (str): path to the JSON data file 104 | val_ratio (float): proportion of the validation set, default is 0.1 105 | random_seed (int): random seed, default is 3407 106 | 107 | Returns: 108 | tuple: (training data list, validation data list, training set category counts, validation set category counts) 109 | """ 110 | random.seed(random_seed) 111 | 112 | with open(file_path, 'r') as file: 113 | data = json.load(file) 114 | 115 | train_data, val_data = [], [] 116 | label_to_ids = defaultdict(set) 117 | id_to_samples = defaultdict(list) 118 | 119 | for item in data: 120 | pen_category = item["pen_category"] 121 | id_ = item["id"] 122 | label_to_ids[pen_category].add(id_) 123 | id_to_samples[id_].append(item) 124 | 125 | train_ids, val_ids = set(), set() 126 | 127 | for pen_category, ids in label_to_ids.items(): 128 | ids = list(ids) 129 | 130 | # Process label=4 (split in a 2:1 ratio) 131 | if pen_category == 4: 132 | for id_ in ids: 133 | samples = id_to_samples[id_] 134 | if len(samples) >= 3: 135 | random.shuffle(samples) 136 | train_data.extend(samples[:2]) 137 | val_data.extend(samples[2:3]) 138 | else: 139 | train_data.extend(samples) 140 | continue 141 | 142 | # Process the case of label=3 and id=69 143 | if pen_category == 3: 144 | for id_ in ids: 145 | if id_ == "69": # ID 69 is placed directly into the validation set 146 | val_data.extend(id_to_samples[id_]) 147 | else: 148 | train_data.extend(id_to_samples[id_]) 149 | continue 150 | 151 | # Other categories are randomly split according to the proportion 152 | random.shuffle(ids) 153 | split_index = int(len(ids) * (1 - val_ratio)) 154 | train_ids.update(ids[:split_index]) 155 | val_ids.update(ids[split_index:]) 156 | 157 | # Split data based on ID 158 | for id_ in train_ids: 159 | train_data.extend(id_to_samples[id_]) 160 | for id_ in val_ids: 161 | val_data.extend(id_to_samples[id_]) 162 | 163 | # Calculate category statistics 164 | train_category_count = defaultdict(int) 165 | val_category_count = defaultdict(int) 166 | 167 | for entry in train_data: 168 | train_category_count[entry['pen_category']] += 1 169 | for entry in val_data: 170 | val_category_count[entry['pen_category']] += 1 171 | 172 | return train_data, val_data, train_category_count, val_category_count 173 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__init__.py -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/logger.cpython-311.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__pycache__/logger.cpython-311.pyc -------------------------------------------------------------------------------- /utils/__pycache__/logger.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__pycache__/logger.cpython-38.pyc -------------------------------------------------------------------------------- /utils/image_pool.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | 5 | class ImagePool(): 6 | """This class implements an image buffer that stores previously generated images. 7 | 8 | This buffer enables us to update discriminators using a history of generated images 9 | rather than the ones produced by the latest generators. 10 | """ 11 | 12 | def __init__(self, pool_size): 13 | """Initialize the ImagePool class 14 | 15 | Parameters: 16 | pool_size (int) -- the size of image buffer, if pool_size=0, no buffer will be created 17 | """ 18 | self.pool_size = pool_size 19 | if self.pool_size > 0: # create an empty pool 20 | self.num_imgs = 0 21 | self.images = [] 22 | 23 | def query(self, images): 24 | """Return an image from the pool. 25 | 26 | Parameters: 27 | images: the latest generated images from the generator 28 | 29 | Returns images from the buffer. 30 | 31 | By 50/100, the buffer will return input images. 32 | By 50/100, the buffer will return images previously stored in the buffer, 33 | and insert the current images to the buffer. 34 | """ 35 | if self.pool_size == 0: # if the buffer size is 0, do nothing 36 | return images 37 | return_images = [] 38 | for image in images: 39 | image = torch.unsqueeze(image.data, 0) 40 | if self.num_imgs < self.pool_size: # if the buffer is not full; keep inserting current images to the buffer 41 | self.num_imgs = self.num_imgs + 1 42 | self.images.append(image) 43 | return_images.append(image) 44 | else: 45 | p = random.uniform(0, 1) 46 | if p > 0.5: # by 50% chance, the buffer will return a previously stored image, and insert the current image into the buffer 47 | random_id = random.randint(0, self.pool_size - 1) # randint is inclusive 48 | tmp = self.images[random_id].clone() 49 | self.images[random_id] = image 50 | return_images.append(tmp) 51 | else: # by another 50% chance, the buffer will return the current image 52 | return_images.append(image) 53 | return_images = torch.cat(return_images, 0) # collect all the images and return 54 | return return_images 55 | -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import logging 4 | # import fcntl 5 | 6 | def get_logger(path, suffix): 7 | cur_time = time.strftime('%Y-%m-%d-%H.%M.%S',time.localtime(time.time())) 8 | logger = logging.getLogger(__name__+cur_time) 9 | logger.setLevel(level = logging.INFO) 10 | handler = logging.FileHandler(os.path.join(path, f"{suffix}_{cur_time}.log")) 11 | handler.setLevel(logging.INFO) 12 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 13 | handler.setFormatter(formatter) 14 | 15 | console = logging.StreamHandler() 16 | console.setLevel(logging.INFO) 17 | 18 | logger.addHandler(handler) 19 | logger.addHandler(console) 20 | return logger 21 | 22 | class 
ResultRecorder(object): 23 | def __init__(self, path, total_cv=10): 24 | self.path = path 25 | self.total_cv = total_cv 26 | if not os.path.exists(self.path): 27 | f = open(self.path, 'w') 28 | # f.write('acc\tuar\tf1\n') 29 | f.write('emo_metric\tint_metric\tjoint_metric\n') 30 | f.close() 31 | 32 | def is_full(self, content): 33 | if len(content) < self.total_cv+1: 34 | return False 35 | 36 | for line in content: 37 | if not len(line.split('\t')) == 3: 38 | return False 39 | return True 40 | 41 | def calc_mean(self, content): 42 | # acc = [float(line.split('\t')[0]) for line in content[1:]] 43 | # uar = [float(line.split('\t')[1]) for line in content[1:]] 44 | # f1 = [float(line.split('\t')[2]) for line in content[1:]] 45 | # mean_acc = sum(acc) / len(acc) 46 | # mean_uar = sum(uar) / len(uar) 47 | # mean_f1 = sum(f1) / len(f1) 48 | # return mean_acc, mean_uar, mean_f1 49 | emo_metric = [float(line.split('\t')[0]) for line in content[1:]] 50 | int_metric = [float(line.split('\t')[1]) for line in content[1:]] 51 | joint_metric = [float(line.split('\t')[2]) for line in content[1:]] 52 | mean_emo_metric = sum(emo_metric) / len(emo_metric) 53 | mean_int_metric = sum(int_metric) / len(int_metric) 54 | mean_joint_metric = sum(joint_metric) / len(joint_metric) 55 | return mean_emo_metric, mean_int_metric, mean_joint_metric 56 | 57 | def write_result_to_tsv(self, results, cvNo): 58 | # Use fcntl to lock the file so that multiple processes do not write to it at the same time 59 | f_in = open(self.path) 60 | # fcntl.flock(f_in.fileno(), fcntl.LOCK_EX) # acquire the lock 61 | content = f_in.readlines() 62 | if len(content) < self.total_cv+1: 63 | content += ['\n'] * (self.total_cv-len(content)+1) 64 | keys = [item for item in results.keys()] 65 | # content[cvNo] = '{:.4f}\t{:.4f}\t{:.4f}\n'.format(results[keys[0]], results[keys[1]], results[keys[2]]) 66 | content[cvNo] = '{:.4f}\n'.format(results[keys[0]]) 67 | 68 | if self.is_full(content): 69 | mean_emo_metric, mean_int_metric, mean_joint_metric = self.calc_mean(content) 70 | # content.append('{:.4f}\t{:.4f}\t{:.4f}\n'.format(mean_emo_metric, mean_int_metric, mean_joint_metric)) 71 | content.append('{:.4f}\n'.format(mean_emo_metric)) 72 | 73 | f_out = open(self.path, 'w') 74 | f_out.writelines(content) 75 | f_out.close() 76 | f_in.close() # release the lock 77 | 78 | 79 | class LossRecorder(object): 80 | def __init__(self, path, total_cv=10, total_epoch=40): 81 | self.path = path 82 | self.total_epoch = total_epoch 83 | self.total_cv = total_cv 84 | if not os.path.exists(self.path): 85 | f = open(self.path, 'w') 86 | f.close() 87 | 88 | def is_full(self, content): 89 | if len(content) < self.total_cv + 1: 90 | return False 91 | 92 | for line in content: 93 | if not len(line.split('\t')) == 3: 94 | return False 95 | return True 96 | 97 | def calc_mean(self, content): 98 | loss_list = [[] * self.total_cv] * self.total_epoch 99 | mean_list = [[] * self.total_cv] * self.total_epoch 100 | for i in range(0, self.total_epoch): 101 | loss_list[i] = [float(line.split('\t')[i]) for line in content[1:]] 102 | for i in range(0, self.total_epoch): 103 | mean_list[i] = sum(loss_list[i]) / len(loss_list[i]) 104 | return mean_list 105 | 106 | def write_result_to_tsv(self, results, cvNo): 107 | # Use fcntl to lock the file so that multiple processes do not write to it at the same time 108 | f_in = open(self.path) 109 | # fcntl.flock(f_in.fileno(), fcntl.LOCK_EX) # acquire the lock 110 | content = f_in.readlines() 111 | if len(content) < self.total_cv + 1: 112 | content += ['\n'] * (self.total_cv - len(content) + 1) 113 | string = '' 114 | for i in results: 115 | string += str(i.numpy())[:8] 116 | string += '\t' 117 |
content[cvNo] = string + '\n' 118 | 119 | f_out = open(self.path, 'w') 120 | f_out.writelines(content) 121 | f_out.close() 122 | f_in.close() # release the lock 123 | 124 | def read_result_from_tsv(self,): 125 | f_out = open(self.path) 126 | # fcntl.flock(f_out.fileno(), fcntl.LOCK_EX) 127 | content = f_out.readlines() 128 | loss_list = [[] * self.total_cv] * self.total_epoch 129 | for i in range(0, self.total_epoch): 130 | loss_list[i] = [float(line.split('\t')[i]) for line in content[1:]] 131 | mean = self.calc_mean(content) 132 | return mean 133 | --------------------------------------------------------------------------------
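
Usage sketch (illustrative only; the paths, checkpoint name, and values below are placeholders rather than settings shipped with the repository): every training and testing script above parses "$@" with the same case statement, so any of the capitalized defaults can be overridden as --key=value without editing the script. For example, with the dataset root under ./data and a GPU available:

bash scripts/Track2/train_1s_binary.sh --data_rootpath=./data --device=cuda --batch_size=16 --num_epochs=200
bash scripts/test.sh --data_rootpath=./data --train_model=./checkpoints/1s_2labels_wav2vec+openface/best_model_TIMESTAMP.pth --track_option=Track2 --splitwindow_time=1s --labelcount=2 --feature_max_len=25 --device=cuda

test.py writes one prediction column per run, named from the window and label count (e.g. 1s_bin or 5s_pen), into answer_<track_option>/submission.csv and merges on ID, so running it once per configuration accumulates all columns in a single submission file.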