├── .idea
│   ├── .gitignore
│   ├── code_20250102.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── MPDD Dataset License Agreementt.pdf
├── README.md
├── __pycache__
│   ├── config.cpython-311.pyc
│   ├── config.cpython-38.pyc
│   ├── dataset.cpython-311.pyc
│   ├── dataset.cpython-38.pyc
│   ├── train.cpython-311.pyc
│   └── train_val_split.cpython-311.pyc
├── config.json
├── dataset.py
├── feature_extraction
│   ├── audio
│   │   ├── extra_mfcc64.py
│   │   ├── extra_opensmile.py
│   │   └── extract_wav2vec_embedding.py
│   ├── feature_personalized
│   │   ├── .idea
│   │   │   ├── .gitignore
│   │   │   ├── inspectionProfiles
│   │   │   │   └── profiles_settings.xml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   └── 生成个性化特征+嵌入.iml
│   │   ├── extrapersonality.py
│   │   └── gen_describtion.py
│   └── visual
│       ├── __pycache__
│       │   ├── dataset.cpython-311.pyc
│       │   ├── dataset.cpython-38.pyc
│       │   ├── util.cpython-311.pyc
│       │   └── util.cpython-38.pyc
│       ├── extract_openface.py
│       ├── extract_resnet+densnet.py
│       └── util.py
├── models
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-310.pyc
│   │   ├── __init__.cpython-311.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── base_model.cpython-310.pyc
│   │   ├── base_model.cpython-311.pyc
│   │   ├── base_model.cpython-38.pyc
│   │   ├── pretrain_model.cpython-310.pyc
│   │   ├── pretrain_model.cpython-311.pyc
│   │   └── pretrain_model.cpython-38.pyc
│   ├── base_model.py
│   ├── networks
│   │   ├── ContextEncoder.py
│   │   ├── LightWeightTrans.py
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── ContextEncoder.cpython-310.pyc
│   │   │   ├── ContextEncoder.cpython-311.pyc
│   │   │   ├── ContextEncoder.cpython-38.pyc
│   │   │   ├── __init__.cpython-310.pyc
│   │   │   ├── __init__.cpython-311.pyc
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── autoencoder.cpython-310.pyc
│   │   │   ├── autoencoder.cpython-311.pyc
│   │   │   ├── autoencoder.cpython-38.pyc
│   │   │   ├── classifier.cpython-310.pyc
│   │   │   ├── classifier.cpython-311.pyc
│   │   │   ├── classifier.cpython-38.pyc
│   │   │   ├── fc.cpython-310.pyc
│   │   │   ├── fc.cpython-311.pyc
│   │   │   ├── fc.cpython-38.pyc
│   │   │   ├── interact_model.cpython-310.pyc
│   │   │   ├── interact_model.cpython-311.pyc
│   │   │   ├── interact_model.cpython-38.pyc
│   │   │   ├── lstm.cpython-310.pyc
│   │   │   ├── lstm.cpython-311.pyc
│   │   │   ├── lstm.cpython-38.pyc
│   │   │   ├── multihead_attention.cpython-310.pyc
│   │   │   ├── multihead_attention.cpython-311.pyc
│   │   │   ├── multihead_attention.cpython-38.pyc
│   │   │   ├── tools.cpython-310.pyc
│   │   │   ├── tools.cpython-311.pyc
│   │   │   └── tools.cpython-38.pyc
│   │   ├── autoencoder.py
│   │   ├── classifier.py
│   │   ├── cnn.py
│   │   ├── fc.py
│   │   ├── interact_model.py
│   │   ├── lstm.py
│   │   ├── multihead_attention.py
│   │   └── tools.py
│   ├── our
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-310.pyc
│   │   │   ├── __init__.cpython-311.pyc
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── our_model.cpython-311.pyc
│   │   │   ├── our_model.cpython-38.pyc
│   │   │   ├── our_model_ablation.cpython-311.pyc
│   │   │   └── zelin_our_model.cpython-311.pyc
│   │   └── our_model.py
│   └── utils
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-310.pyc
│       │   ├── __init__.cpython-311.pyc
│       │   ├── __init__.cpython-38.pyc
│       │   ├── config.cpython-310.pyc
│       │   ├── config.cpython-311.pyc
│       │   ├── config.cpython-38.pyc
│       │   ├── convert.cpython-310.pyc
│       │   ├── convert.cpython-311.pyc
│       │   ├── convert.cpython-38.pyc
│       │   ├── functions.cpython-310.pyc
│       │   ├── functions.cpython-311.pyc
│       │   ├── functions.cpython-38.pyc
│       │   ├── time_track.cpython-310.pyc
│       │   ├── time_track.cpython-311.pyc
│       │   └── time_track.cpython-38.pyc
│       ├── config-orin.py
│       ├── config.py
│       ├── convert.py
│       ├── functions.py
│       ├── load_pretrained.py
│       └── time_track.py
├── requirements.txt
├── scripts
│   ├── Track1
│   │   ├── train_1s_binary.sh
│   │   ├── train_1s_quinary.sh
│   │   ├── train_1s_ternary.sh
│   │   ├── train_5s_binary.sh
│   │   ├── train_5s_quinary.sh
│   │   └── train_5s_ternary.sh
│   ├── Track2
│   │   ├── train_1s_binary.sh
│   │   ├── train_1s_ternary.sh
│   │   ├── train_5s_binary.sh
│   │   └── train_5s_ternary.sh
│   └── test.sh
├── test.py
├── train.py
├── train_val_split.py
└── utils
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-311.pyc
    │   ├── __init__.cpython-38.pyc
    │   ├── logger.cpython-311.pyc
    │   └── logger.cpython-38.pyc
    ├── image_pool.py
    └── logger.py
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/.idea/code_20250102.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/MPDD Dataset License Agreementt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/MPDD Dataset License Agreementt.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MPDD Baseline Code
2 | The baseline system provided for the MM 2025 MPDD Challenge serves as a starting point for participants to develop their solutions for the Multimodal Personalized Depression Detection tasks. The baseline system is designed to be straightforward yet effective, providing participants with a solid foundation upon which they can build and improve.
3 |
4 | # Results
5 | The metrics reported are weighted/unweighted F1-score (W_F1/U_F1) and weighted/unweighted accuracy (W_Acc./U_Acc.) with and without personalized features (PF) for the MPDD-Young and MPDD-Elderly datasets. Each value represents the best-performing feature combination for each experiment, using default hyper-parameters.
6 |
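For reference, the sketch below shows one common way to compute these four metrics with scikit-learn. It is an illustration only: the labels are made up, and the exact weighted/unweighted definitions used by the official MPDD evaluation may differ.

```python
# Illustrative only: one common convention for "weighted" vs. "unweighted" metrics,
# not necessarily the official MPDD scoring code.
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score

y_true = [0, 0, 1, 1, 1, 2]   # hypothetical ground-truth labels
y_pred = [0, 1, 1, 1, 2, 2]   # hypothetical predictions

w_f1 = f1_score(y_true, y_pred, average="weighted")     # class-frequency weighted F1
u_f1 = f1_score(y_true, y_pred, average="macro")        # unweighted (macro) F1
w_acc = accuracy_score(y_true, y_pred)                   # overall accuracy
u_acc = balanced_accuracy_score(y_true, y_pred)          # mean per-class recall
print(f"W_F1={w_f1:.4f}  U_F1={u_f1:.4f}  W_Acc={w_acc:.4f}  U_Acc={u_acc:.4f}")
```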
7 | #### MPDD-Elderly (Track1)
8 |
9 | | Length | Task Type | Audio Feature | Visual Feature | w/ PF (W_F1/U_F1) | w/ PF (W_Acc./U_Acc.) | w/o PF (W_F1/U_F1) | w/o PF (W_Acc./U_Acc.) |
10 | |--------|-----------|---------------|----------------|-------------------|-----------------------|--------------------|------------------------|
11 | | 1s | Binary | mfcc | openface | 85.71 / 79.13 | 85.40 / 84.62 | 82.60 / 70.89 | 69.37 / 83.33 |
12 | | 1s | Ternary | opensmile | resnet | 56.48 / 55.64 | 55.49 / 56.41 | 54.35 / 49.14 | 48.93 / 55.13 |
13 | | 1s | Quinary | opensmile | densenet | 66.26 / 46.66 | 45.79 / 69.23 | 63.85 / 44.00 | 42.45 / 66.67 |
14 | | 5s | Binary | opensmile | resnet | 81.75 / 72.37 | 75.40 / 80.77 | 77.90 / 66.15 | 67.94 / 76.92 |
15 | | 5s | Ternary | wav2vec | openface | 58.22 / 59.37 | 59.62 / 57.69 | 50.88 / 47.59 | 46.58 / 50.00 |
16 | | 5s | Quinary | mfcc | densenet | 75.62 / 58.40 | 57.71 / 78.21 | 73.49 / 56.83 | 56.98 / 75.64 |
17 |
18 |
19 | #### MPDD-Young (Track2)
20 |
21 | | Length | Task Type | Audio Feature | Visual Feature | w/ PF (W_F1/U_F1) | w/ PF (W_Acc./U_Acc.) | w/o PF (W_F1/U_F1) | w/o PF (W_Acc./U_Acc.) |
22 | |--------|-----------|---------------|----------------|-------------------|-----------------------|--------------------|------------------------|
23 | | 1s | Binary | wav2vec | openface | 59.96 / 59.96 | 63.64 / 63.64 | 55.23 / 55.23 | 56.06 / 56.06 |
24 | | 1s | Ternary | mfcc | densenet | 51.86 / 51.62 | 49.66 / 51.52 | 47.95 / 43.72 | 42.63 / 48.48 |
25 | | 5s | Binary | opensmile | resnet | 62.11 / 62.11 | 62.12 / 62.12 | 60.02 / 60.02 | 60.61 / 60.61 |
26 | | 5s | Ternary | mfcc | densenet | 48.18 / 41.31 | 41.71 / 50.00 | 42.82 / 39.38 | 41.29 / 42.42 |
27 |
28 | # Environment
29 |
30 | python 3.10.0
31 | pytorch 2.3.0
32 | scikit-learn 1.5.1
33 | pandas 2.2.2
34 |
35 | Given `requirements.txt`, we recommend that users configure their environment via conda with the following steps:
36 |
37 | conda create -n mpdd python=3.10 -y
38 | conda activate mpdd
39 | pip install -r requirements.txt
40 |
41 | # Features
42 |
43 | In our baseline, we use the following features:
44 |
45 | ### Acoustic Feature:
46 | **Wav2vec:** We extract utterance-level acoustic features using the wav2vec model pre-trained on large-scale audio data. The embedding size of the acoustic features is 512.
47 | The link of the pre-trained model is: [wav2vec model](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec)
48 |
49 | **MFCCs:** We extract Mel-frequency cepstral coefficients (MFCCs). The embedding size of MFCCs is 64.
50 |
51 | **OpenSmile:** We extract utterance-level acoustic features using opensmile. The embedding size of OpenSMILE features is 6373.
52 |
53 | ### Visual Feature:
54 | **ResNet-50 and DenseNet-121:** We employ the OpenCV toolkit to extract scene pictures from each video, capturing frames at a 10-frame interval. Subsequently, we use the ResNet-50 and DenseNet-121 models to generate utterance-level features for the extracted frames. The embedding size of the visual features is 1000 for ResNet, and 1024 (Track1) or 1000 (Track2) for DenseNet.
55 | The links of the pre-trained models are:
56 | [ResNet-50](https://huggingface.co/microsoft/resnet-50)
57 | [DenseNet-121](https://huggingface.co/pytorch/vision/v0.10.0/densenet121)
58 |
59 | **OpenFace:** We extract frame-level visual features (saved as CSV files) using the pretrained OpenFace toolkit. The embedding size of OpenFace features is 709. You can download the executable file and model files for OpenFace from the following link: [OpenFace Toolkit](https://github.com/TadasBaltrusaitis/OpenFace)
60 |
61 | ### Personalized Feature:
62 | We generate personalized features by loading the GLM3 model, creating personalized descriptions, and embedding these descriptions using the `roberta-large` model. The embedding size of the personalized features is 1024.
63 | The link of the `roberta-large` model is: [RoBERTa Large](https://huggingface.co/roberta-large)
64 |
65 | # Usage
66 | ## Dataset Download
67 | Given the potential ethical risks and privacy concerns associated with this dataset, we place the highest priority on the protection and lawful use of the data. To this end, we have established and implemented a series of stringent access and authorization management measures to ensure compliance with relevant laws, regulations, and ethical standards, while making every effort to prevent potential ethical disputes arising from improper data use.
68 |
69 | To further safeguard the security and compliance of the data, please complete the following steps before contacting us to request access to the dataset labels and extracted features:
70 |
71 | - **1. Download the [MPDD Dataset License Agreement PDF](https://github.com/hacilab/MPDD/blob/main/MPDD%20Dataset%20License%20Agreementt.pdf)**.
72 |
73 | - **2. Carefully review the agreement**: The agreement outlines in detail the usage specifications, restrictions, and the responsibilities and obligations of the licensee. Please read the document thoroughly to ensure complete understanding of the terms and conditions.
74 |
75 | - **3. Manually sign the agreement**: After confirming your full understanding of and agreement with the terms, fill in the required fields and sign the agreement by hand as formal acknowledgment of your acceptance (the agreement should be co-signed by a full-time faculty member or researcher).
76 |
77 | Once you have completed the above steps, please submit the required materials to us through the following channels:
78 |
79 | - **Primary contact email**: sstcneu@163.com
80 | - **CC email**: fuchangzeng@qhd.neu.edu.cn
81 |
82 | We will review your submission to verify that you meet the access requirements. Upon approval, we will grant you the corresponding data access permissions. Please note that all materials submitted will be used solely for identity verification and access management and will not be used for any other purpose.
83 |
84 | We sincerely appreciate your cooperation in protecting data privacy and ensuring compliant use. If you have any questions or require further guidance, please feel free to contact us via the emails provided above.
85 |
86 | After obtaining the dataset, users should modify `data_rootpath` in the scripts for both training and testing. Note that the testing data will be made public at a later stage of the competition.
87 |
88 | `data_rootpath`:
89 |
90 | ├── Training/
91 | │ ├──1s
92 | │ ├──5s
93 | │ ├──individualEmbedding
94 | │ ├──labels
95 | ├── Testing/
96 | │ ├──1s
97 | │ ├──5s
98 | │ ├──individualEmbedding
99 | │ ├──labels
100 |
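As a quick sanity check after downloading, a sketch like the following (the root path is a placeholder) can confirm that `data_rootpath` contains the folders listed above:

```python
import os

data_rootpath = "/path/to/MPDD_data"  # placeholder; point this at your local copy
for split in ("Training", "Testing"):
    for sub in ("1s", "5s", "individualEmbedding", "labels"):
        folder = os.path.join(data_rootpath, split, sub)
        print(f"{folder}: {'OK' if os.path.isdir(folder) else 'MISSING'}")
```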
101 |
102 | ## Training
103 | To train the model with default parameters, taking MPDD-Young for example, simply run:
104 |
105 | ```bash
106 | cd path/to/MPDD # replace with actual path
107 | ```
108 | ```bash
109 | bash scripts/Track2/train_1s_binary.sh
110 | ```
111 |
112 | You can also modify parameters such as feature types, split window time, classification dimensions, or learning rate directly through the command line:
113 | ```bash
114 | bash scripts/Track2/train_1s_binary.sh --audiofeature_method=wav2vec --videofeature_method=resnet --splitwindow_time=5s --labelcount=3 --batch_size=32 --lr=0.001 --num_epochs=500
115 | ```
116 | Refer to `config.json` for more parameters.
117 |
118 | The specific dimensions of each feature are shown in the table below:
119 | | Feature | Dimension |
120 | |--------------------------|---------------------------------|
121 | | Wav2vec | 512 |
122 | | MFCCs | 64 |
123 | | OpenSmile | 6373 |
124 | | ResNet-50 | 1000 |
125 | | DenseNet-121 | 1024 for Track1, 1000 for Track2 |
126 | | OpenFace | 709 |
127 | | Personalized Feature | 1024 |
128 |
129 |
130 | ## Testing
131 | To predict the labels for the testing set with your obtained model, first modify the default parameters in `test.sh` to match the current task, and run:
132 |
133 | ```bash
134 | cd path/to/MPDD # replace with actual path
135 | ```
136 | ```bash
137 | bash scripts/test.sh
138 | ```
139 | After testing 6 tasks in Track1 or 4 tasks in Track2, the results will be merged into the `submission.csv` file in `./answer_Track2/`.
140 |
141 | # Acknowledgements
142 | The benchmark of MPDD is developed based on the work of MEIJU 2025. The Github URL of MEIJU 2025 is: https://github.com/AI-S2-Lab/MEIJU2025-baseline.
143 |
--------------------------------------------------------------------------------
/__pycache__/config.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/config.cpython-311.pyc
--------------------------------------------------------------------------------
/__pycache__/config.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/config.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/dataset.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/dataset.cpython-311.pyc
--------------------------------------------------------------------------------
/__pycache__/dataset.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/dataset.cpython-38.pyc
--------------------------------------------------------------------------------
/__pycache__/train.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/train.cpython-311.pyc
--------------------------------------------------------------------------------
/__pycache__/train_val_split.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/__pycache__/train_val_split.cpython-311.pyc
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "input_dim_a": 512,
3 | "embd_size_a": 128,
4 | "embd_method_a": "maxpool",
5 |
6 | "input_dim_v": 1000,
7 | "embd_size_v": 128,
8 | "embd_method_v": "maxpool",
9 |
10 | "emo_output_dim": 5,
11 |
12 | "cls_layers": "128,64",
13 | "dropout_rate": 0.2,
14 | "bn": false,
15 | "ce_weight": 1.0,
16 | "focal_weight": 1.0,
17 | "temperature": 0.007,
18 |
19 |
20 | "Transformer_head": 2,
21 | "Transformer_layers": 1,
22 | "hidden_size": 256,
23 |
24 | "attention_head": 1,
25 | "attention_dropout": 0.0,
26 |
27 | "activate_fun": "relu",
28 | "ablation": "normal",
29 | "use_ICL": true,
30 | "drop_last": false,
31 | "cvNo": 1,
32 |
33 | "gpu_ids": [0],
34 | "isTrain": true,
35 | "checkpoints_dir": "./checkpoints",
36 | "name": "MDPP",
37 |
38 | "cuda_benchmark": true,
39 |
40 | "lr": 2e-5,
41 | "beta1": 0.9,
42 |
43 | "log_dir": "./logs",
44 | "feature_max_len": 5
45 | }
46 |
--------------------------------------------------------------------------------
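How `train.py` consumes this file is not shown in this excerpt; as a rough sketch, the JSON can be loaded into an attribute-style options object, with `input_dim_a` / `input_dim_v` kept consistent with the dimensions of the selected features (see the feature table in the README).

```python
# Sketch only: the actual option handling in train.py may differ.
import json
from types import SimpleNamespace

with open("config.json", "r") as f:
    opt = SimpleNamespace(**json.load(f))

# input_dim_a / input_dim_v must match the chosen features, e.g. wav2vec=512,
# mfcc=64, opensmile=6373 for audio; resnet=1000, openface=709 for video.
print(opt.input_dim_a, opt.input_dim_v, opt.lr)
```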
/dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 | import torch
3 | import numpy as np
4 | from torch.utils.data import Dataset
5 |
6 |
7 | class AudioVisualDataset(Dataset):
8 | def __init__(self, json_data, label_count, personalized_feature_file, max_len=10, batch_size=32, audio_path='', video_path='', isTest=False):
9 | self.data = json_data
10 | self.max_len = max_len # Expected sequence length
11 | self.batch_size = batch_size
12 | self.isTest = isTest
13 |
14 | # Load personalized features
15 | self.personalized_features = self.load_personalized_features(personalized_feature_file)
16 | self.audio_path = audio_path
17 | self.video_path = video_path
18 | self.label_count = label_count
19 |
20 | def __len__(self):
21 | return len(self.data)
22 |
23 | def fixed_windows(self, features: torch.Tensor, fixLen=4):
24 | """
25 | Divides 2D features into fixLen fixed windows and aggregates them into fixed-size results (Tensor version).
26 |
27 | Parameters:
28 | - features: input feature tensor of shape (timesteps, feature_dim)
29 | - fixLen: number of windows to aggregate into (default 4)
30 | Returns:
31 | - Tensor of shape (fixLen, feature_dim); each row is the mean-pooled feature of one window
32 | """
33 | timesteps, feature_dim = features.shape
34 | window_size = int(torch.ceil(torch.tensor(timesteps / fixLen)))
35 | windows = []
36 | for i in range(fixLen):
37 | start = i * window_size
38 | end = min(start + window_size, timesteps)
39 | window = features[start:end]
40 | if window.size(0) > 0:
41 | window_aggregated = torch.mean(window, dim=0)
42 | windows.append(window_aggregated)
43 | else:
44 | windows.append(torch.zeros(feature_dim))
45 |
46 | return torch.stack(windows, dim=0)
47 |
48 | def pad_or_truncate(self, feature, max_len):
49 | """Fill or truncate the input feature sequence"""
50 | if feature.shape[0] < max_len:
51 | padding = torch.zeros((max_len - feature.shape[0], feature.shape[1]))
52 | feature = torch.cat((feature, padding), dim=0)
53 | else:
54 | feature = feature[:max_len]
55 | return feature
56 |
57 | def load_personalized_features(self, file_path):
58 | """
59 | Load personalized features from the .npy file.
60 | """
61 |
62 | data = np.load(file_path, allow_pickle=True)
63 | if isinstance(data, np.ndarray) and isinstance(data[0], dict):
64 | return {entry["id"]: entry["embedding"] for entry in data}
65 | else:
66 | raise ValueError("Unexpected data format in the .npy file. Ensure it contains a list of dictionaries.")
67 |
68 | def __getitem__(self, idx):
69 | entry = self.data[idx]
70 |
71 | # Load audio and video features
72 | audio_feature = np.load(self.audio_path + '/' + entry['audio_feature_path'])
73 | video_feature = np.load(self.video_path + '/' + entry['video_feature_path'])
74 | audio_feature = torch.tensor(audio_feature, dtype=torch.float32)
75 | video_feature = torch.tensor(video_feature, dtype=torch.float32)
76 |
77 | audio_feature = self.pad_or_truncate(audio_feature, self.max_len)
78 | video_feature = self.pad_or_truncate(video_feature, self.max_len)
79 |
80 | # Load label
81 | if self.isTest == False:
82 | if self.label_count == 2:
83 | label = torch.tensor(entry['bin_category'], dtype=torch.long)
84 | elif self.label_count == 3:
85 | label = torch.tensor(entry['tri_category'], dtype=torch.long)
86 | elif self.label_count == 5:
87 | label = torch.tensor(entry['pen_category'], dtype=torch.long)
88 | else:
89 | label = 0
90 |
91 | import os
92 |
93 | filepath = entry['audio_feature_path'] # the filename containing path to features
94 | filename = os.path.basename(filepath)
95 | # Extract person ids and convert to integers
96 | person_id = int(filename.split('_')[0])
97 | personalized_id = str(person_id)
98 |
99 | if personalized_id in self.personalized_features:
100 | personalized_feature = torch.tensor(self.personalized_features[personalized_id], dtype=torch.float32)
101 | else:
102 | # If no personalized feature found, use a zero vector
103 | personalized_feature = torch.zeros(1024, dtype=torch.float32)
104 | print(f"❗Personalized feature not found for id: {personalized_id}")
105 |
106 | return {
107 | 'A_feat': audio_feature,
108 | 'V_feat': video_feature,
109 | 'emo_label': label,
110 | 'personalized_feat': personalized_feature
111 | }
112 |
--------------------------------------------------------------------------------
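A minimal usage sketch for the `AudioVisualDataset` defined above; the label JSON, feature folders, and file names below are placeholders inferred from the class itself, not paths shipped with the dataset.

```python
# Sketch only: replace the placeholder paths with your local dataset layout.
import json
from torch.utils.data import DataLoader
from dataset import AudioVisualDataset

with open("path/to/Training/labels/train_labels.json", "r") as f:  # hypothetical label file
    json_data = json.load(f)

dataset = AudioVisualDataset(
    json_data,
    label_count=2,                     # 2 / 3 / 5-way classification
    personalized_feature_file="path/to/Training/individualEmbedding/descriptions_embeddings_with_ids.npy",
    max_len=5,
    audio_path="path/to/Training/5s/Audio/wav2vec",   # hypothetical feature folder
    video_path="path/to/Training/5s/Visual/resnet",   # hypothetical feature folder
)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
batch = next(iter(loader))
print(batch["A_feat"].shape, batch["V_feat"].shape,
      batch["emo_label"].shape, batch["personalized_feat"].shape)
```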
/feature_extraction/audio/extra_mfcc64.py:
--------------------------------------------------------------------------------
1 | import librosa
2 | import numpy as np
3 | import os
4 |
5 |
6 | def extract_and_save_mfcc(input_dir, output_dir, n_mfcc=64, frame_length=2048, hop_length=512):
7 | """
8 | Batch-extract 64-dimensional MFCC features from all audio files in a directory and save them as .npy files.
9 | 
10 | Parameters:
11 | input_dir (str): Path to the input directory containing the .wav files.
12 | output_dir (str): Path to the output directory used to save the .npy files.
13 | n_mfcc (int): Dimensionality of the MFCC features, 64 by default.
14 | frame_length (int): Length of each frame (default 2048, ~46 ms).
15 | hop_length (int): Hop length between frames (default 512, ~11 ms).
16 | """
17 | # Make sure the output directory exists
18 | os.makedirs(output_dir, exist_ok=True)
19 |
20 | # Iterate over all .wav files in the input directory
21 | for file_name in os.listdir(input_dir):
22 | if file_name.endswith('.wav'):
23 | input_path = os.path.join(input_dir, file_name)
24 | output_path = os.path.join(output_dir, file_name.replace('.wav', '.npy'))
25 |
26 | try:
27 | y, sr = librosa.load(input_path, sr=None)
28 | mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=frame_length, hop_length=hop_length)
29 | mfcc_transposed = mfcc.T # transpose to (n_frames, n_mfcc)
30 | 
31 | # Save as a .npy file
32 | np.save(output_path, mfcc_transposed)
33 | 
34 | # Print the file name and the feature shape
35 | print(f"Processed {file_name}: MFCC shape {mfcc_transposed.shape}")
36 | except Exception as e:
37 | print(f"Error processing {file_name}: {e}")
38 |
39 |
40 | # Input and output directory paths
41 | input_dir = r"D:\HACI\MMchallenge\Audio_split1\Audio_split_16k" # replace with your input directory path
42 | output_dir = r"D:\HACI\MMchallenge\Audio_split1\features\mfccs" # replace with your output directory path
43 | 
44 | # Batch-extract and save the features
45 | extract_and_save_mfcc(input_dir, output_dir)
--------------------------------------------------------------------------------
/feature_extraction/audio/extra_opensmile.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import opensmile
4 |
5 | input_audio_dir = r"D:\HACI\MMchallenge\Audio_split1\Audio_split_16k" # Directory containing audio files
6 | output_feature_dir = r"D:\HACI\MMchallenge\Audio_split1\features\opensmile" # Directory to save .npy feature files
7 |
8 | os.makedirs(output_feature_dir, exist_ok=True)
9 |
10 | smile = opensmile.Smile(
11 | feature_set=opensmile.FeatureSet.ComParE_2016,
12 | feature_level=opensmile.FeatureLevel.Functionals,
13 | )
14 |
15 | for audio_file in os.listdir(input_audio_dir):
16 | if audio_file.endswith((".wav", ".mp3")):
17 | audio_path = os.path.join(input_audio_dir, audio_file)
18 | feature_file = os.path.splitext(audio_file)[0] + ".npy"
19 | output_path = os.path.join(output_feature_dir, feature_file)
20 |
21 | try:
22 | features = smile.process_file(audio_path)
23 |
24 | feature_array = features.to_numpy().flatten()
25 |
26 | print(f"Shape of features for {audio_file}: {feature_array.shape}")
27 |
28 | np.save(output_path, feature_array)
29 | print(f"Features saved for {audio_file} as {output_path}")
30 | except Exception as e:
31 | print(f"Error processing file {audio_file}: {e}")
32 |
33 | print("Feature extraction completed.")
--------------------------------------------------------------------------------
/feature_extraction/audio/extract_wav2vec_embedding.py:
--------------------------------------------------------------------------------
1 | # *_*coding:utf-8 *_*
2 | """
3 | wav2vec: https://arxiv.org/abs/1904.05862
4 | official github repo: https://github.com/pytorch/fairseq/tree/master/examples/wav2vec
5 | """
6 | import os
7 | import time
8 | import glob
9 | import torch
10 | import numpy as np
11 | import soundfile as sf
12 | from fairseq.models.wav2vec import Wav2VecModel # Note: use fairseq version of 0.10.1 (pip install fairseq==0.10.1)
13 |
14 | def write_feature_to_npy(feature, csv_file, feature_level):
15 | if feature_level == 'UTTERANCE':
16 | feature = np.array(feature).squeeze() # [C,]
17 | if len(feature.shape) != 1: # change [T, C] => [C,]
18 | feature = np.mean(feature, axis=0)
19 | np.save(csv_file, feature)
20 | else:
21 | np.save(csv_file, feature)
22 |
23 | def extract(audio_files, feature_level, model, save_dir, overwrite=False, gpu=None):
24 | start_time = time.time()
25 | device = torch.device(f'cuda:{gpu}' if gpu is not None and torch.cuda.is_available() else 'cpu')
26 |
27 | dir_name = 'wav2vec-large'
28 | out_dir_z = os.path.join(save_dir, f'{dir_name}-z-{feature_level[:3]}') # features output by feature encoder
29 | if not os.path.exists(out_dir_z):
30 | os.makedirs(out_dir_z)
31 | elif overwrite or len(os.listdir(save_dir)) == 0:
32 | print(f'==> Warning: overwrite save_dir "{save_dir}"!')
33 | else:
34 | raise Exception(f'==> Error: save_dir "{save_dir}" already exists, set overwrite=True if needed!')
35 |
36 | out_dir_c = os.path.join(save_dir, f'{dir_name}-c-{feature_level[:3]}') # features output by context network
37 | if not os.path.exists(out_dir_c):
38 | os.makedirs(out_dir_c)
39 | elif overwrite or len(os.listdir(save_dir)) == 0:
40 | print(f'==> Warning: overwrite save_dir "{save_dir}"!')
41 | else:
42 | raise Exception(f'==> Error: save_dir "{save_dir}" already exists, set overwrite=True if needed!')
43 |
44 | for idx, wav_file in enumerate(audio_files, 1):
45 | file_name = os.path.basename(wav_file)
46 | vid = file_name[:-4]
47 | print(f'Processing "{file_name}" ({idx}/{len(audio_files)})...')
48 | # load audio
49 | audio, sampling_rate = sf.read(wav_file)
50 | audio = audio.astype('float32')[np.newaxis, :]
51 | audio = torch.from_numpy(audio)
52 | audio = audio.to(device)
53 | assert sampling_rate == 16000, f'Error: sampling rate ({sampling_rate}) != 16k!'
54 | with torch.no_grad():
55 | z = model.feature_extractor(audio) # (1, C, T), stride: 10ms (100Hz), receptive field: 30ms
56 | c = model.feature_aggregator(z) # (1, C, T), stride: 10ms (100Hz), receptive field: 801ms (for large version)
57 |
58 | z_feature = z.detach().squeeze().t().cpu().numpy()
59 | c_feature = c.detach().squeeze().t().cpu().numpy()
60 | z_csv_file = os.path.join(out_dir_z, f'{vid}.npy')
61 | c_csv_file = os.path.join(out_dir_c, f'{vid}.npy')
62 | write_feature_to_npy(z_feature, z_csv_file, feature_level)
63 | write_feature_to_npy(c_feature, c_csv_file, feature_level)
64 |
65 | end_time = time.time()
66 | print(f'Total time used: {end_time - start_time:.1f}s.')
67 |
68 | if __name__ == '__main__':
69 | gpu = 0
70 | feature_level = 'UTTERANCE'
71 | overwrite = True
72 | audio_dir = '/path/to/audio' # Replace with your audio directory
73 | save_dir = '/path/to/save' # Replace with your save directory
74 | model_path = '/path/to/model/wav2vec_large.pt' # Replace with your model path
75 |
76 | # in: get audios (assert file extension is '.wav')
77 | audio_files = glob.glob(os.path.join(audio_dir, '*.wav'))
78 | print(f'Find total "{len(audio_files)}" audio files.')
79 |
80 | device = torch.device(f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu')
81 | cp = torch.load(model_path, map_location=device)
82 | model = Wav2VecModel.build_model(cp['args'], task=None)
83 | model.load_state_dict(cp['model'])
84 | model.to(device)
85 | model.eval()
86 |
87 | # extract features
88 | extract(audio_files, feature_level=feature_level, model=model, save_dir=save_dir, overwrite=overwrite, gpu=gpu)
89 |
--------------------------------------------------------------------------------
/feature_extraction/feature_personalized/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/feature_extraction/feature_personalized/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/feature_extraction/feature_personalized/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/feature_extraction/feature_personalized/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/feature_extraction/feature_personalized/.idea/生成个性化特征+嵌入.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/feature_extraction/feature_personalized/extrapersonality.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ["HF_ENDPOINT"]="https://hf-mirror.com"
3 |
4 | import json
5 | import numpy as np
6 | import torch
7 | from transformers import RobertaTokenizer, RobertaModel
8 |
9 |
10 | def load_data(json_file):
11 | with open(json_file, "r") as f:
12 | data = json.load(f)
13 | return data
14 |
15 |
16 | def generate_embeddings(descriptions, model, tokenizer, output_file):
17 | """
18 | Generate embeddings for each description and save them along with their IDs.
19 | """
20 | embeddings_with_ids = []
21 |
22 | model.eval() # Set model to evaluation mode
23 |
24 | with torch.no_grad(): # Disable gradient computation
25 | for id_, description in descriptions.items():
26 | print(f"Processing ID: {id_}")
27 | # Tokenize the description
28 | encoded_input = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
29 |
30 | # Get the model output
31 | output = model(**encoded_input)
32 |
33 | # Extract the CLS token representation
34 | embedding = output.last_hidden_state[:, 0, :].squeeze().numpy()
35 | print(embedding.shape)
36 |
37 | # Append the ID and its embedding as a dictionary entry
38 | embeddings_with_ids.append({"id": id_, "embedding": embedding})
39 |
40 | # Save the embeddings and IDs as a numpy array
41 | np.save(output_file, embeddings_with_ids, allow_pickle=True)
42 | print(f"Embeddings and IDs saved to {output_file}")
43 |
44 |
45 | def main():
46 | # Path to the input JSON file
47 | json_file = "./GLM_data/personalized_descriptions.json"
48 |
49 | # Path to save the output embeddings
50 | output_file = "./GLM_data/descriptions_embeddings_with_ids.npy"
51 | # Load Roberta model and tokenizer from Hugging Face
52 | model_name = "roberta-large"
53 | tokenizer = RobertaTokenizer.from_pretrained(model_name)
54 | model = RobertaModel.from_pretrained(model_name)
55 |
56 | # Load the personalized descriptions
57 | descriptions = load_data(json_file)
58 | print(f"Loaded {len(descriptions)} descriptions.")
59 |
60 | # Generate and save embeddings with IDs
61 | generate_embeddings(descriptions, model, tokenizer, output_file)
62 |
63 |
64 | if __name__ == "__main__":
65 | main()
66 |
--------------------------------------------------------------------------------
/feature_extraction/feature_personalized/gen_describtion.py:
--------------------------------------------------------------------------------
1 | ################################################################################################################
2 | # NOTE: #
3 | # The variable `big5_scores` used in this script corresponds to the field `big5_traits` in actual use. #
4 | # Please replace `big5_scores` with `big5_traits` when running this script on our data. #
5 | ################################################################################################################
6 |
7 | import os
8 | os.environ["HF_ENDPOINT"]="https://hf-mirror.com"
9 |
10 | import json
11 | from transformers import AutoTokenizer, AutoModel
12 |
13 | # Set GPU device
14 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
15 |
16 | # ---------------- Load the model -----------------#
17 | tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
18 | model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True, device='cuda')
19 | model = model.eval()
20 | # ---------------------------------#
21 |
22 |
23 | def generate_patient_prompt(patient_data):
24 | """
25 | Generate a structured prompt for personalized description based on patient data.
26 | """
27 | # Extract relevant information
28 | big5_scores = patient_data.get("big5_scores", {})
29 | age = patient_data.get("age", "unknown")
30 | gender = patient_data.get("gender", "unknown")
31 | native_place = patient_data.get("native_place", "unknown")
32 | # financial_stress = patient_data.get("family_factors", {}).get("Financial_Stress", "unknown")
33 | # family_members = patient_data.get("family_factors", {}).get("Family_Members", "unknown")
34 | # disease = patient_data.get("disease", "unknown")
35 |
36 | # Interpret scores
37 | extroversion = big5_scores.get("Extraversion", "unknown")
38 | agreeableness = big5_scores.get("Agreeableness", "unknown")
39 | openness = big5_scores.get("Openness", "unknown")
40 | neuroticism = big5_scores.get("Neuroticism", "unknown")
41 | conscientiousness = big5_scores.get("Conscientiousness", "unknown")
42 |
43 | # Explain financial stress and disease
44 | # financial_stress_desc = {
45 | # 0: "no financial stress",
46 | # 1: "mild financial stress",
47 | # 2: "moderate financial stress",
48 | # 3: "severe financial stress"
49 | # }.get(financial_stress, "unknown financial stress level")
50 |
51 | # disease_desc = {
52 | # "0": "the patient is healthy",
53 | # "1": "the patient has other diseases",
54 | # "2": "the patient has endocrine diseases",
55 | # "3": "the patient has circulatory system diseases",
56 | # "4": "the patient has neurological diseases"
57 | # }.get(disease, "unknown disease status")
58 |
59 | # Construct the prompt
60 | prompt = (
61 | f"The patient is a {age}-year-old {gender} from {native_place}. "
62 | f"The patient's Extraversion score is {extroversion}. "
63 | f"The Agreeableness score is {agreeableness}. "
64 | f"The Openness score is {openness}. "
65 | f"The Neuroticism score is {neuroticism}. "
66 | f"The Conscientiousness score is {conscientiousness}. "
67 | # f"Their financial stress is categorized as {financial_stress_desc}, and they live with {family_members} family members. "
68 | # f"Based on the disease classification, {disease_desc}. "
69 | "Please generate a concise, fluent English description summarizing the patient's key personality traits, family environment, and other notable characteristics. "
70 | "Avoid mentioning depression or related terminology. "
71 | "Output the response as a single paragraph."
72 | )
73 |
74 | return prompt
75 |
76 |
77 | def process_dataset(json_file, output_file):
78 | """
79 | Process the JSON dataset and generate personalized descriptions.
80 | """
81 | with open(json_file, "r") as f:
82 | dataset = json.load(f)
83 |
84 | # Initialize the dictionary to store results
85 | results = {}
86 |
87 | # Open the output file in write mode
88 | with open(output_file, "w") as f:
89 | for patient_id, patient_data in dataset.items():
90 | print(f"Processing patient ID: {patient_id}")
91 | patient_prompt = generate_patient_prompt(patient_data)
92 | print(f"Generated prompt for patient {patient_id}: {patient_prompt}")
93 |
94 | # Use model.chat to generate personalized response
95 | response, history = model.chat(tokenizer, patient_prompt, history=[], temperature=0.1)
96 | print(f"Generated description for patient {patient_id}: {response}")
97 |
98 | # Store the result in the dictionary
99 | results[patient_id] = response
100 |
101 | # Rewrite the accumulated results from the start of the file so it always holds a single valid JSON object
102 | f.seek(0); f.truncate()
103 | json.dump(results, f, ensure_ascii=False, indent=4)
104 |
105 | print(f"All patient descriptions saved to {output_file}.")
106 |
107 |
108 | if __name__ == "__main__":
109 | # Path to the dataset
110 | json_file = "./GLM_data/label_data.json"
111 | # Output description file
112 | output_file = "./GLM_data/personalized_descriptions.json"
113 |
114 | process_dataset(json_file, output_file)
115 |
--------------------------------------------------------------------------------
/feature_extraction/visual/__pycache__/dataset.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/feature_extraction/visual/__pycache__/dataset.cpython-311.pyc
--------------------------------------------------------------------------------
/feature_extraction/visual/__pycache__/dataset.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/feature_extraction/visual/__pycache__/dataset.cpython-38.pyc
--------------------------------------------------------------------------------
/feature_extraction/visual/__pycache__/util.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/feature_extraction/visual/__pycache__/util.cpython-311.pyc
--------------------------------------------------------------------------------
/feature_extraction/visual/__pycache__/util.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/feature_extraction/visual/__pycache__/util.cpython-38.pyc
--------------------------------------------------------------------------------
/feature_extraction/visual/extract_openface.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import shutil
4 | import argparse
5 | import numpy as np
6 | from util import read_hog, read_csv
7 |
8 | import sys
9 | sys.path.append('../../')
10 |
11 |
12 | def generate_face_faceDir(input_root, save_root):
13 | for dir_path in glob.glob(input_root + '/*_aligned'): # 'xx/xx/000100_guest_aligned'
14 | frame_names = os.listdir(dir_path) # ['xxx.bmp']
15 | # assert len(frame_names) <= 1
16 | if len(frame_names) == 1: # move frame to face_root
17 | frame_path = os.path.join(dir_path, frame_names[0]) # 'xx/xx/000100_guest_aligned/xxx.bmp'
18 | name = os.path.basename(dir_path)[:-len('_aligned')] # '000100_guest'
19 | save_path = os.path.join(save_root, name + '.bmp')
20 | shutil.copy(frame_path, save_path)
21 |
22 |
23 | def generate_face_videoOne(input_root, save_root):
24 | for dir_path in glob.glob(input_root + '/*_aligned'): # 'xx/xx/000100_guest_aligned'
25 | frame_names = os.listdir(dir_path) # ['xxx.bmp']
26 | for ii in range(len(frame_names)):
27 | frame_path = os.path.join(dir_path, frame_names[ii]) # 'xx/xx/000100_guest_aligned/xxx.bmp'
28 | frame_name = os.path.basename(frame_path)
29 | save_path = os.path.join(save_root, frame_name)
30 | shutil.copy(frame_path, save_path)
31 |
32 |
33 | def generate_hog(input_root, save_root):
34 | for hog_path in glob.glob(input_root + '/*.hog'):
35 | csv_path = hog_path[:-4] + '.csv'
36 | if os.path.exists(csv_path):
37 | hog_name = os.path.basename(hog_path)[:-4]
38 | _, feature = read_hog(hog_path)
39 | save_path = os.path.join(save_root, hog_name + '.npy')
40 | np.save(save_path, feature)
41 |
42 |
43 | def generate_csv(input_root, save_root, startIdx):
44 | for csv_path in glob.glob(input_root + '/*.csv'):
45 | csv_name = os.path.basename(csv_path)[:-4]
46 | feature = read_csv(csv_path, startIdx)
47 | save_path = os.path.join(save_root, csv_name + '.npy')
48 | np.save(save_path, feature)
49 |
50 |
51 | def extract(input_dir, process_type, save_dir, face_dir, hog_dir, pose_dir):
52 | # process folders
53 | vids = os.listdir(input_dir)
54 | print(f'Find total "{len(vids)}" videos.')
55 | for i, vid in enumerate(vids, 1):
56 | print(vid)
57 | # if vid > '011_003_088': continue
58 | saveVid = vid ## for folder
59 | if vid.endswith('.mp4') or vid.endswith('.avi'): saveVid = vid[:-4] # for mp4 or avi files
60 |
61 | print(f"Processing video '{vid}' ({i}/{len(vids)})...")
62 | input_root = os.path.join(input_dir, vid) # exists
63 | save_root = os.path.join(save_dir, saveVid)
64 | face_root = os.path.join(face_dir, saveVid)
65 | hog_root = os.path.join(hog_dir, saveVid)
66 | pose_root = os.path.join(pose_dir, saveVid)
67 | # if os.path.exists(face_root): continue
68 | if not os.path.exists(save_root): os.makedirs(save_root)
69 | if not os.path.exists(face_root): os.makedirs(face_root)
70 | if not os.path.exists(hog_root): os.makedirs(hog_root)
71 | if not os.path.exists(pose_root): os.makedirs(pose_root)
72 | if process_type == 'faceDir':
73 | exe_path = os.path.join(r'.\tools\OpenFace_2.2.0_win_x64', # path to the OpenFace tool installation
74 | 'FaceLandmarkImg.exe')
75 | command = '%s -fdir \"%s\" -out_dir \"%s\"' % (exe_path, input_root, save_root)
76 | os.system(command)
77 | generate_face_faceDir(save_root, face_root)
78 | generate_hog(save_root, hog_root)
79 | generate_csv(save_root, pose_root, startIdx=2)
80 | elif process_type == 'videoOne':
81 | exe_path = os.path.join(r'.\tools\OpenFace_2.2.0_win_x64',
82 | 'FeatureExtraction.exe')
83 | command = '%s -f \"%s\" -out_dir \"%s\"' % (exe_path, input_root, save_root)
84 | os.system(command)
85 | generate_face_videoOne(save_root, face_root)
86 | generate_hog(save_root, hog_root)
87 | generate_csv(save_root, pose_root, startIdx=5)
88 |
89 |
90 | if __name__ == '__main__':
91 | parser = argparse.ArgumentParser(description='Run.')
92 | parser.add_argument('--overwrite', action='store_true', default=True,
93 | help='whether overwrite existed feature folder.')
94 | parser.add_argument('--dataset', type=str, default='BoxOfLies', help='input dataset')
95 | parser.add_argument('--type', type=str, default='faceDir', choices=['faceDir', 'videoOne'],
96 | help='faceDir: process on facedirs; videoOne: process on one video')
97 | params = parser.parse_args()
98 |
99 | print(f'==> Extracting openface features...')
100 |
101 | # in: face dir
102 | dataset = params.dataset
103 | process_type = params.type
104 | input_dir = r"E:\MEIJU_data20241229\frame_5s"
105 |
106 | # out: feature csv dir
107 |
108 | save_dir = os.path.join(r"\features\openface\frame_5s", 'openface_all')
109 | hog_dir = os.path.join(r"\features\openface\frame_5s", 'openface_hog')
110 | pose_dir = os.path.join(r"\features\openface\frame_5s", 'openface_pose')
111 | face_dir = os.path.join(r"\features\openface\frame_5s", 'openface_face')
112 |
113 | if not os.path.exists(save_dir):
114 | os.makedirs(save_dir)
115 | elif params.overwrite:
116 | print(f'==> Warning: overwrite save_dir "{save_dir}"!')
117 | else:
118 | raise Exception(f'==> Error: save_dir "{save_dir}" already exists, set overwrite=TRUE if needed!')
119 |
120 | if not os.path.exists(hog_dir):
121 | os.makedirs(hog_dir)
122 | elif params.overwrite:
123 | print(f'==> Warning: overwrite save_dir "{hog_dir}"!')
124 | else:
125 | raise Exception(f'==> Error: save_dir "{hog_dir}" already exists, set overwrite=TRUE if needed!')
126 |
127 | if not os.path.exists(pose_dir):
128 | os.makedirs(pose_dir)
129 | elif params.overwrite:
130 | print(f'==> Warning: overwrite save_dir "{pose_dir}"!')
131 | else:
132 | raise Exception(f'==> Error: save_dir "{pose_dir}" already exists, set overwrite=TRUE if needed!')
133 |
134 | if not os.path.exists(face_dir):
135 | os.makedirs(face_dir)
136 | elif params.overwrite:
137 | print(f'==> Warning: overwrite save_dir "{face_dir}"!')
138 | else:
139 | raise Exception(f'==> Error: save_dir "{face_dir}" already exists, set overwrite=TRUE if needed!')
140 |
141 | # process
142 | extract(input_dir, process_type, save_dir, face_dir, hog_dir, pose_dir)
143 |
144 | print(f'==> Finish')
145 |
--------------------------------------------------------------------------------
/feature_extraction/visual/extract_resnet+densnet.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import torch
3 | from torchvision.models import resnet50, densenet121
4 | from torchvision.transforms import transforms
5 | import os
6 | import tqdm
7 | import torch.utils.data as data
8 | import glob
9 | import argparse
10 | import numpy as np
11 | from PIL import Image
12 |
13 | class FrameDataset(data.Dataset):
14 | def __init__(self, vid, face_dir, transform=None):
15 | super(FrameDataset, self).__init__()
16 | self.vid = vid
17 | self.path = os.path.join(face_dir, vid)
18 | self.transform = transform
19 | self.frames = self.get_frames()
20 |
21 | def get_frames(self):
22 | frames = glob.glob(os.path.join(self.path, '*'))
23 | return frames
24 |
25 | def __len__(self):
26 | return len(self.frames)
27 |
28 | def __getitem__(self, index):
29 | path = self.frames[index]
30 | img = Image.open(path)
31 | if self.transform is not None:
32 | img = self.transform(img)
33 | name = os.path.basename(path)[:-4]
34 | return img, name
35 |
36 | def frame_extract(video_path, root_save_path, sample_rate=2):
37 | video_name = os.path.basename(video_path)[:-4]
38 | save_dir = os.path.join(root_save_path, video_name)
39 | if not os.path.exists(save_dir):
40 | os.mkdir(save_dir)
41 |
42 | video = cv2.VideoCapture(video_path)
43 |
44 | count = 0
45 | while video.isOpened():
46 | ret, frame = video.read()
47 | if not ret:
48 | break
49 |
50 | if count % sample_rate == 0:
51 | save_path = os.path.join(root_save_path, video_name, f'frame{count:04d}.jpg')
52 | cv2.imwrite(save_path, frame)
53 | # break
54 | count += 1
55 |
56 | video.release()
57 | cv2.destroyAllWindows()
58 |
59 |
60 | def extract(data_loader, model):
61 | model.eval()
62 | with torch.no_grad():
63 | features, timestamps = [], []
64 | for images, names in data_loader:
65 | # images = images.cuda()
66 | embedding = model(images)
67 | features.append(embedding.cpu().detach().numpy())
68 | timestamps.extend(names)
69 | features, timestamps = np.row_stack(features), np.array(timestamps)
70 | return features, timestamps
71 |
72 |
73 | def feature_extract(frame_dir, save_dir, feature_level='UTT'):
74 | if not os.path.exists(save_dir):
75 | os.mkdir(save_dir)
76 |
77 | model = resnet50(pretrained=True) # .cuda() to run on GPU; swap in densenet121(pretrained=True) to extract DenseNet-121 features instead
78 | transform = transforms.Compose([
79 | # transforms.ToPILImage(),
80 | transforms.Resize((224, 224)),
81 | transforms.ToTensor(),
82 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
83 | ])
84 | vids = os.listdir(frame_dir)
85 | EMBEDDING_DIM = -1
86 | print(f'Find total "{len(vids)}" videos.')
87 | for i, vid in enumerate(vids, 1):
88 | print(f"Processing video '{vid}' ({i}/{len(vids)})...")
89 | csv_file = os.path.join(save_dir, f'{vid}.npy')
90 | if os.path.exists(csv_file):
91 | continue
92 |
93 | # forward
94 | dataset = FrameDataset(vid, frame_dir, transform=transform)
95 | if len(dataset) == 0:
96 | print("Warning: number of frames of video {} should not be zero.".format(vid))
97 | embeddings, framenames = [], []
98 | else:
99 | data_loader = torch.utils.data.DataLoader(dataset,
100 | batch_size=32,
101 | num_workers=4,
102 | pin_memory=True)
103 | embeddings, framenames = extract(data_loader, model)
104 |
105 | # save results
106 | indexes = np.argsort(framenames)
107 | embeddings = embeddings[indexes]
108 | EMBEDDING_DIM = max(EMBEDDING_DIM, np.shape(embeddings)[-1])
109 |
110 | if feature_level == 'FRAME':
111 | embeddings = np.array(embeddings).squeeze()
112 | if len(embeddings) == 0:
113 | embeddings = np.zeros((1, EMBEDDING_DIM))
114 | elif len(embeddings.shape) == 1:
115 | embeddings = embeddings[np.newaxis, :]
116 | np.save(csv_file, embeddings) # shape = (frame_num, 1000)
117 | else:
118 | embeddings = np.array(embeddings).squeeze()
119 | if len(embeddings) == 0:
120 | embeddings = np.zeros((EMBEDDING_DIM, ))
121 | elif len(embeddings.shape) == 2:
122 | embeddings = np.mean(embeddings, axis=0)
123 | np.save(csv_file, embeddings)
124 |
125 |
126 |
127 | def visual_extraction():
128 | sample_rate = 10
129 | video_path = 'D:/HACI/MMchallenge/Video_split1/Video_split1'
130 | video_name = os.listdir(video_path)
131 | for video in tqdm.tqdm(video_name):
132 | if 'mp4' in video:
133 | file_path = os.path.join(video_path, video) # use a new variable so video_path is not overwritten inside the loop
134 | if not os.path.exists('D:/HACI/MMchallenge/Video_split1/frame'):
135 | os.mkdir('D:/HACI/MMchallenge/Video_split1/frame')
136 | frame_extract(file_path, r'D:/HACI/MMchallenge/Video_split1/frame', sample_rate=sample_rate)
137 |
138 | print('Finished extracting frame!')
139 |
140 |
141 | video_frame_dir = 'D:/HACI/MMchallenge/Video_split1/frame'
142 | save_dir = 'D:/HACI/MMchallenge/Video_split1/features'
143 | feature_extract(video_frame_dir, save_dir, feature_level='UTT')
144 |
145 |
146 | if __name__ == '__main__':
147 | parser = argparse.ArgumentParser(description='Run.')
148 | parser.add_argument('--overwrite', action='store_true', default=True, help='whether overwrite existed feature folder.')
149 | parser.add_argument('--dataset', type=str, default='BoxOfLies', help='input dataset')
150 | params = parser.parse_args()
151 |
152 | print(f'==> Extracting resnet features...')
153 |
154 | dataset = params.dataset
155 | input_dir = 'D:/HACI/MMchallenge/Video_split1/frame'
156 |
157 | # out: feature csv dir
158 | save_dir = 'D:/HACI/MMchallenge/Video_split1/features'
159 |
160 |
161 | visual_extraction()
162 |
163 | pass
164 |
--------------------------------------------------------------------------------
/feature_extraction/visual/util.py:
--------------------------------------------------------------------------------
1 | # *_*coding:utf-8 *_*
2 | import os
3 | import re
4 | import pandas as pd
5 | import numpy as np
6 | import struct
7 |
8 | ## for OPENFACE
9 | ## reference: https://gist.github.com/btlorch/6d259bfe6b753a7a88490c0607f07ff8
10 | def read_hog(filename, batch_size=5000):
11 | """
12 | Read HoG features file created by OpenFace.
13 | For each frame, OpenFace extracts 12 * 12 * 31 HoG features, i.e., num_features = 4464. These features are stored in row-major order.
14 | :param filename: path to .hog file created by OpenFace
15 | :param batch_size: how many rows to read at a time
16 | :return: is_valid, hog_features
17 | is_valid: ndarray of shape [num_frames]
18 | hog_features: ndarray of shape [num_frames, num_features]
19 | """
20 | all_feature_vectors = []
21 | with open(filename, "rb") as f:
22 | num_cols, = struct.unpack("i", f.read(4)) # 12
23 | num_rows, = struct.unpack("i", f.read(4)) # 12
24 | num_channels, = struct.unpack("i", f.read(4)) # 31
25 |
26 | # The first four bytes encode a boolean value whether the frame is valid
27 | num_features = 1 + num_rows * num_cols * num_channels
28 | feature_vector = struct.unpack("{}f".format(num_features), f.read(num_features * 4))
29 | feature_vector = np.array(feature_vector).reshape((1, num_features)) # [1, 4464+1]
30 | all_feature_vectors.append(feature_vector)
31 |
32 | # Every frame contains a header of four float values: num_cols, num_rows, num_channels, is_valid
33 | num_floats_per_feature_vector = 4 + num_rows * num_cols * num_channels
34 | # Read in batches of given batch_size
35 | num_floats_to_read = num_floats_per_feature_vector * batch_size
36 | # Multiply by 4 because of float32
37 | num_bytes_to_read = num_floats_to_read * 4
38 |
39 | while True:
40 | bytes = f.read(num_bytes_to_read)
41 | # For comparison how many bytes were actually read
42 | num_bytes_read = len(bytes)
43 | assert num_bytes_read % 4 == 0, "Number of bytes read does not match with float size"
44 | num_floats_read = num_bytes_read // 4
45 | assert num_floats_read % num_floats_per_feature_vector == 0, "Number of bytes read does not match with feature vector size"
46 | num_feature_vectors_read = num_floats_read // num_floats_per_feature_vector
47 |
48 | feature_vectors = struct.unpack("{}f".format(num_floats_read), bytes)
49 | # Convert to array
50 | feature_vectors = np.array(feature_vectors).reshape((num_feature_vectors_read, num_floats_per_feature_vector))
51 | # Discard the first three values in each row (num_cols, num_rows, num_channels)
52 | feature_vectors = feature_vectors[:, 3:]
53 | # Append to list of all feature vectors that have been read so far
54 | all_feature_vectors.append(feature_vectors)
55 |
56 | if num_bytes_read < num_bytes_to_read:
57 | break
58 |
59 | # Concatenate batches
60 | all_feature_vectors = np.concatenate(all_feature_vectors, axis=0)
61 |
62 | # Split into is-valid and feature vectors
63 | is_valid = all_feature_vectors[:, 0]
64 | feature_vectors = all_feature_vectors[:, 1:]
65 |
66 | return is_valid, feature_vectors
67 |
68 |
69 | ## for OPENFACE
70 | def read_csv(filename, startIdx):
71 | data = pd.read_csv(filename)
72 | all_feature_vectors = []
73 | for index in data.index:
74 | features = np.array(data.iloc[index][startIdx:])
75 | all_feature_vectors.append(features)
76 | all_feature_vectors = np.array(all_feature_vectors)
77 | return all_feature_vectors
78 |
79 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | """This package contains modules related to objective functions, optimizations, and network architectures.
2 |
3 | To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel.
4 | You need to implement the following five functions:
5 | -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
6 | -- <set_input>: unpack data from dataset and apply preprocessing.
7 | -- <forward>: produce intermediate results.
8 | -- <optimize_parameters>: calculate loss, gradients, and update network weights.
9 | -- <modify_commandline_options>: (optionally) add model-specific options and set default options.
10 |
11 | In the function <__init__>, you need to define four lists:
12 | -- self.loss_names (str list): specify the training losses that you want to plot and save.
13 | -- self.model_names (str list): define networks used in our training.
14 | -- self.visual_names (str list): specify the images that you want to display and save.
15 | -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example.
16 |
17 | Now you can use the model class by specifying flag '--model dummy'.
18 | See our template model class 'template_model.py' for more details.
19 | """
20 |
21 | import importlib
22 | from models.base_model import BaseModel
23 |
24 |
25 | def find_model_using_name(model_name):
26 | """Import the module "models/[model_name]_model.py".
27 |
28 | In the file, the class called [ModelName]Model() will
29 | be instantiated. It has to be a subclass of BaseModel,
30 | and it is case-insensitive.
31 | """
32 | if 'MISA' in model_name:
33 | model_filename = "models." + model_name + "_model"
34 | print(model_filename)
35 | elif 'our' in model_name:
36 | model_filename = "models.our." + model_name + "_model"
37 | else:
38 | model_filename = "models." + model_name + "_model"
39 | modellib = importlib.import_module(model_filename)
40 | model = None
41 | target_model_name = model_name.replace('_', '') + 'model'
42 | for name, cls in modellib.__dict__.items():
43 | if name.lower() == target_model_name.lower() \
44 | and issubclass(cls, BaseModel):
45 | model = cls
46 |
47 | if model is None:
48 | print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name))
49 | exit(0)
50 |
51 | return model
52 |
53 |
54 | def get_option_setter(model_name):
55 | """Return the static method of the model class."""
56 | model_class = find_model_using_name(model_name)
57 | return model_class.modify_commandline_options
58 |
59 |
60 | def create_model(opt):
61 | """Create a model given the option.
62 |
63 | This function wraps the model class specified by opt.model.
64 | This is the main interface between this package and 'train.py'/'test.py'
65 |
66 | Example:
67 | >>> from models import create_model
68 | >>> model = create_model(opt)
69 | """
70 | model = find_model_using_name(opt.model)
71 | instance = model(opt)
72 | print("model [%s] was created" % type(instance).__name__)
73 | return instance
74 |
75 |
76 |
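To make the module docstring above concrete, here is a minimal, hypothetical skeleton of a 'dummy' model. The file name, class name, the dummy_dim option and the data keys are illustrative assumptions, not part of this repository; they only follow the naming convention that find_model_using_name expects.

    # models/dummy_model.py (hypothetical sketch)
    import torch
    from models.base_model import BaseModel


    class DummyModel(BaseModel):
        @staticmethod
        def modify_commandline_options(parser, is_train=True):
            parser.add_argument('--dummy_dim', type=int, default=128)  # example model-specific option
            return parser

        def __init__(self, opt):
            super().__init__(opt)                       # BaseModel.__init__(self, opt) must be called first
            self.loss_names = ['CE']                    # reported as self.loss_CE
            self.model_names = ['C']                    # network attribute must be named self.netC
            self.netC = torch.nn.Linear(opt.dummy_dim, 2)
            if self.isTrain:
                self.criterion = torch.nn.CrossEntropyLoss()
                self.optimizer = torch.optim.Adam(self.netC.parameters(), lr=1e-4)
                self.optimizers = [self.optimizer]

        def set_input(self, input):
            self.feat = input['feat']                   # assumed dataset keys
            self.label = input['label']

        def forward(self):
            self.logits = self.netC(self.feat)

        def optimize_parameters(self):
            self.forward()
            self.loss_CE = self.criterion(self.logits, self.label)
            self.optimizer.zero_grad()
            self.loss_CE.backward()
            self.optimizer.step()

Such a model would then be selected with '--model dummy'.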
--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/models/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/models/__pycache__/base_model.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/base_model.cpython-310.pyc
--------------------------------------------------------------------------------
/models/__pycache__/base_model.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/base_model.cpython-311.pyc
--------------------------------------------------------------------------------
/models/__pycache__/base_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/base_model.cpython-38.pyc
--------------------------------------------------------------------------------
/models/__pycache__/pretrain_model.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/pretrain_model.cpython-310.pyc
--------------------------------------------------------------------------------
/models/__pycache__/pretrain_model.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/pretrain_model.cpython-311.pyc
--------------------------------------------------------------------------------
/models/__pycache__/pretrain_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/__pycache__/pretrain_model.cpython-38.pyc
--------------------------------------------------------------------------------
/models/base_model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from collections import OrderedDict
4 | from abc import ABC, abstractmethod
5 | from .networks import tools
6 |
7 |
8 | class BaseModel(ABC):
9 | """This class is an abstract base class (ABC) for models.
10 | To create a subclass, you need to implement the following five functions:
11 | -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
12 | -- <set_input>: unpack data from dataset and apply preprocessing.
13 | -- <forward>: produce intermediate results.
14 | -- <optimize_parameters>: calculate losses, gradients, and update network weights.
15 | -- <modify_commandline_options>: (optionally) add model-specific options and set default options.
16 | """
17 |
18 | def __init__(self, opt):
19 | """Initialize the BaseModel class.
20 |
21 | Parameters:
22 | opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions
23 |
24 | When creating your custom class, you need to implement your own initialization.
25 | In this function, you should first call <BaseModel.__init__(self, opt)>.
26 | Then, you need to define four lists:
27 | -- self.loss_names (str list): specify the training losses that you want to plot and save.
28 | -- self.model_names (str list): define networks used in our training.
29 | -- self.visual_names (str list): specify the images that you want to display and save.
30 | -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example.
31 | """
32 | self.opt = opt
33 | self.gpu_ids = opt.gpu_ids
34 | self.isTrain = opt.isTrain
35 | # self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu') # get device name: CPU or GPU
36 | self.device = None
37 | self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) # save all the checkpoints to save_dir
38 | # self.image_save_dir = os.path.join(opt.image_dir, opt.name)
39 | # self.save_shared_dir = os.path.join(opt.shared_dir, opt.name)
40 | if opt.cuda_benchmark: # with [scale_width], input images might have different sizes, which hurts the performance of cudnn.benchmark.
41 | torch.backends.cudnn.benchmark = True
42 | self.loss_names = []
43 | self.model_names = []
44 | self.optimizers = []
45 | self.metric = 0 # used for learning rate policy 'plateau'
46 |
47 |
48 | @staticmethod
49 | def modify_commandline_options(parser, is_train):
50 | """Add new model-specific options, and rewrite default values for existing options.
51 |
52 | Parameters:
53 | parser -- original option parser
54 | is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options.
55 |
56 | Returns:
57 | the modified parser.
58 | """
59 | return parser
60 |
61 | @abstractmethod
62 | def set_input(self, input):
63 | """Unpack input data from the dataloader and perform necessary pre-processing steps.
64 |
65 | Parameters:
66 | input (dict): includes the data itself and its metadata information.
67 | """
68 | pass
69 |
70 | @abstractmethod
71 | def forward(self):
72 | """Run forward pass; called by both functions and ."""
73 | pass
74 |
75 | @abstractmethod
76 | def optimize_parameters(self):
77 | """Calculate losses, gradients, and update network weights; called in every training iteration"""
78 | pass
79 |
80 | def setup(self, opt):
81 | """Load and print networks; create schedulers
82 |
83 | Parameters:
84 | opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
85 | """
86 | if self.isTrain:
87 | self.schedulers = [tools.get_scheduler(optimizer, opt) for optimizer in self.optimizers]
88 | for name in self.model_names:
89 | net = getattr(self, 'net' + name)
90 | net = tools.init_net(net, opt.init_type, opt.init_gain, opt.gpu_ids)
91 | setattr(self, 'net' + name, net)
92 | else:
93 | self.eval()
94 |
95 | self.print_networks(opt.verbose)
96 | self.post_process()
97 |
98 | def cuda(self):
99 | assert(torch.cuda.is_available())
100 | for name in self.model_names:
101 | net = getattr(self, 'net' + name)
102 | net.to(self.gpu_ids[0])
103 | net = torch.nn.DataParallel(net, self.gpu_ids) # multi-GPUs
104 |
105 | def eval(self):
106 | """Make models eval mode during test time"""
107 | self.isTrain = False
108 | for name in self.model_names:
109 | if isinstance(name, str):
110 | net = getattr(self, 'net' + name)
111 | net.eval()
112 |
113 | def train(self, mode: bool = False):
114 | # """Make models back to train mode after test time"""
115 | # self.isTrain = True
116 | # for name in self.model_names:
117 | # if isinstance(name, str):
118 | # net = getattr(self, 'net' + name)
119 | # net.train()
120 | """Make models back to train mode after test time (fzl 1029"""
121 | self.isTrain = mode # 更新 isTrain 标志位
122 | for name in self.model_names:
123 | if isinstance(name, str):
124 | net = getattr(self, 'net' + name)
125 | net.train(mode)  # make sure submodules also switch to the corresponding mode
126 |
127 | def test(self):
128 | """Forward function used in test time.
129 |
130 | This function wraps <forward> function in no_grad() so we don't save intermediate steps for backprop
131 | It also calls <compute_visuals> to produce additional visualization results
132 | """
133 | # print('entered the evaluation function')  # debug print to confirm this point is reached - Yes
134 | with torch.no_grad():
135 | self.forward()
136 |
137 | def compute_visuals(self):
138 | """Calculate additional output images for visdom and HTML visualization"""
139 | pass
140 |
141 | def update_learning_rate(self, logger):
142 | """Update learning rates for all the networks; called at the end of every epoch"""
143 | for scheduler in self.schedulers:
144 | if self.opt.lr_policy == 'plateau':
145 | scheduler.step(self.metric)
146 | else:
147 | # print(scheduler)
148 | scheduler.step()
149 |
150 | lr = self.optimizers[0].param_groups[0]['lr']
151 | # print('learning rate = %.7f' % lr)
152 | logger.info('learning rate = %.7f' % lr)
153 |
154 | def get_current_visuals(self):
155 | """Return visualization images. train.py will display these images with visdom, and save the images to a HTML"""
156 | visual_ret = OrderedDict()
157 | for name in self.visual_names:
158 | if isinstance(name, str):
159 | visual_ret[name] = getattr(self, name)
160 | return visual_ret
161 |
162 | def get_current_losses(self):
163 | """Return traning losses / errors. train.py will print out these errors on console, and save them to a file"""
164 | errors_ret = OrderedDict()
165 | # print('here')  # debug print to confirm this point is reached - Yes
166 | for name in self.loss_names:
167 | if isinstance(name, str):
168 | errors_ret[name] = float(getattr(self, 'loss_' + name)) # float(...) works for both scalar tensor and float number
169 | return errors_ret
170 |
171 | def save_networks(self, epoch):
172 | """Save all the networks to the disk.
173 |
174 | Parameters:
175 | epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name)
176 | """
177 | for name in self.model_names:
178 | # print("model_name is:",name)
179 | if isinstance(name, str):
180 | save_filename = '%s_net_%s.pth' % (epoch, name)
181 | save_path = os.path.join(self.save_dir, save_filename)
182 | # print('save_path is:', save_path)
183 | net = getattr(self, 'net' + name)
184 |
185 | if len(self.gpu_ids) > 0 and torch.cuda.is_available():
186 | torch.save(net.module.cpu().state_dict(), save_path)
187 | # print(type(self.gpu_ids),self.gpu_ids)
188 | net.cuda(self.gpu_ids[0])
189 | else:
190 | torch.save(net.cpu().state_dict(), save_path)
191 |
192 | def load_networks(self, epoch):
193 | """Load all the networks from the disk.
194 |
195 | Parameters:
196 | epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name)
197 | """
198 | for name in self.model_names:
199 | if isinstance(name, str):
200 | load_filename = '%s_net_%s.pth' % (epoch, name)
201 | load_path = os.path.join(self.save_dir, load_filename)
202 | net = getattr(self, 'net' + name)
203 | if isinstance(net, torch.nn.DataParallel):
204 | net = net.module
205 | print('loading the model from %s' % load_path)
206 | state_dict = torch.load(load_path, map_location=self.device)
207 | if hasattr(state_dict, '_metadata'):
208 | del state_dict._metadata
209 |
210 | net.load_state_dict(state_dict)
211 |
212 | def load_networks_cv(self, folder_path):
213 | """Load all the networks from cv folder.
214 |
215 | Parameters:
216 | folder_path (str) -- folder containing the cross-validation checkpoints, named '%s_net_%s.pth' % (epoch, name)
217 | """
218 | checkpoints = list(filter(lambda x: x.endswith('.pth'), os.listdir(folder_path)))
219 | for name in self.model_names:
220 | if isinstance(name, str):
221 | load_filename = list(filter(lambda x: x.split('.')[0].endswith('net_'+name), checkpoints))
222 | print('load_filename is:', load_filename)
223 | # assert len(load_filename) == 1, 'In folder: {}, Exists file {}'.format(folder_path, load_filename)
224 | if len(load_filename) == 1:
225 | load_filename = load_filename[0]
226 | load_path = os.path.join(folder_path, load_filename)
227 | net = getattr(self, 'net' + name)
228 | if isinstance(net, torch.nn.DataParallel):
229 | net = net.module
230 | print('loading the model from %s' % load_path)
231 | state_dict = torch.load(load_path, map_location=self.device)
232 | if hasattr(state_dict, '_metadata'):
233 | del state_dict._metadata
234 |
235 | net.load_state_dict(state_dict)
236 | else:
237 | continue
238 |
239 | def print_networks(self, verbose):
240 | """Print the total number of parameters in the network and (if verbose) network architecture
241 |
242 | Parameters:
243 | verbose (bool) -- if verbose: print the network architecture
244 | """
245 | print('---------- Networks initialized -------------')
246 | for name in self.model_names:
247 | if isinstance(name, str):
248 | net = getattr(self, 'net' + name)
249 | num_params = 0
250 | for param in net.parameters():
251 | num_params += param.numel()
252 | if verbose:
253 | print(net)
254 | print('[Network %s] Total number of parameters : %.3f M' % (name, num_params / 1e6))
255 | print('-----------------------------------------------')
256 |
257 | def set_requires_grad(self, nets, requires_grad=False):
258 | """Set requies_grad=Fasle for all the networks to avoid unnecessary computations
259 | Parameters:
260 | nets (network list) -- a list of networks
261 | requires_grad (bool) -- whether the networks require gradients or not
262 | """
263 | if not isinstance(nets, list):
264 | nets = [nets]
265 | for net in nets:
266 | if net is not None:
267 | for param in net.parameters():
268 | param.requires_grad = requires_grad
269 |
270 | def post_process(self):
271 | pass
272 |
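For orientation, a schematic sketch of how BaseModel is typically driven from a training script; opt, dataset and logger are hypothetical stand-ins (including the opt.niter name), and only the method names come from the class above:

    model = create_model(opt)            # from models/__init__.py
    model.setup(opt)                     # init_net on each net + schedulers (train) or eval() (test)
    for epoch in range(opt.niter):       # opt.niter is an assumed option name
        for data in dataset:             # data is the dict expected by set_input
            model.set_input(data)
            model.optimize_parameters()  # forward + backward + optimizer step
        losses = model.get_current_losses()   # OrderedDict of float losses for logging
        model.update_learning_rate(logger)
        model.save_networks(epoch)       # writes '%s_net_%s.pth' % (epoch, name) under save_dir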
--------------------------------------------------------------------------------
/models/networks/ContextEncoder.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import os
4 | import json
5 | import numpy as np
6 |
7 |
8 | class ConversationalContextEncoder(nn.Module):
9 | """ Conversational Context Encoder """
10 |
11 | def __init__(self, preprocess_config=None, model_config=None):
12 | super(ConversationalContextEncoder, self).__init__()
13 | d_model = model_config.hidden_size # ["transformer"]["encoder_hidden"] # hidden size of the attention layers
14 | d_cont_enc = model_config.hidden_size # ["history_encoder"]["context_hidden"] # hidden size of the context encoder
15 | num_layers = model_config.ContextEncoder_layers # ["history_encoder"]["context_layer"] # number of context encoder layers
16 | dropout = model_config.ContextEncoder_dropout # ["history_encoder"]["context_dropout"] # context encoder dropout
17 | self.text_emb_size = model_config.input_dim_l # ["history_encoder"]["text_emb_size"] # text embedding size
18 | self.visual_emb_size = model_config.input_dim_v # ["history_encoder"]["visual_emb_size"] # visual embedding size
19 | self.audio_emb_size = model_config.input_dim_a # ["history_encoder"]["audio_emb_size"] # audio embedding size
20 | self.max_history_len = model_config.ContextEncoder_max_history_len # ["history_encoder"]["max_history_len"] # maximum history length
21 |
22 | self.text_emb_linear = nn.Linear(self.text_emb_size, d_cont_enc)
23 | self.visual_emb_linear = nn.Linear(self.visual_emb_size, d_cont_enc)
24 | self.audio_emb_linear = nn.Linear(self.audio_emb_size, d_cont_enc)
25 | self.speaker_linear = nn.Linear(d_model, d_cont_enc)
26 | n_speaker = 2
27 | self.speaker_embedding = nn.Embedding(
28 | n_speaker,
29 | model_config.hidden_size,
30 | )
31 |
32 | self.text_gru = nn.GRU(
33 | input_size=d_cont_enc,
34 | hidden_size=d_cont_enc,
35 | num_layers=num_layers,
36 | batch_first=True,
37 | dropout=dropout,
38 | bidirectional=True
39 | )
40 | # The GRU output has size 2*hidden_size, so a linear layer maps it back to hidden_size
41 | self.text_gru_linear = nn.Sequential(
42 | nn.Linear(2 * d_cont_enc, d_cont_enc),
43 | nn.ReLU()
44 | )
45 | self.visual_gru = nn.GRU(
46 | input_size=d_cont_enc,
47 | hidden_size=d_cont_enc,
48 | num_layers=num_layers,
49 | batch_first=True,
50 | dropout=dropout,
51 | bidirectional=True
52 | )
53 | # The GRU output has size 2*hidden_size, so a linear layer maps it back to hidden_size
54 | self.visual_gru_linear = nn.Sequential(
55 | nn.Linear(2 * d_cont_enc, d_cont_enc),
56 | nn.ReLU()
57 | )
58 | self.audio_gru = nn.GRU(
59 | input_size=d_cont_enc,
60 | hidden_size=d_cont_enc,
61 | num_layers=num_layers,
62 | batch_first=True,
63 | dropout=dropout,
64 | bidirectional=True
65 | )
66 | # The GRU output has size 2*hidden_size, so a linear layer maps it back to hidden_size
67 | self.audio_gru_linear = nn.Sequential(
68 | nn.Linear(2 * d_cont_enc, d_cont_enc),
69 | nn.ReLU()
70 | )
71 |
72 | self.context_linear = nn.Linear(d_cont_enc, d_model)
73 | self.context_attention = SLA(d_model)
74 |
75 | def forward(self, text_emb, visual_emb, audio_emb, speaker,
76 | history_text_emb, history_visual_emb, history_audio_emb, history_speaker, modal='val'):
77 | # history_masks = get_mask_from_lengths(history_lens, self.max_history_len)
78 |
79 | # Embedding
80 | # Concatenate the current utterance's text embedding with the dialogue-history embeddings
81 | if 'l' in modal:
82 | history_text_emb = torch.cat([history_text_emb, text_emb], dim=1)
83 | history_text_emb = self.text_emb_linear(history_text_emb)
84 | if 'v' in modal:
85 | history_visual_emb = torch.cat([history_visual_emb, visual_emb], dim=1)
86 | history_visual_emb = self.visual_emb_linear(history_visual_emb)
87 | if 'a' in modal:
88 | history_audio_emb = torch.cat([history_audio_emb, audio_emb], dim=1)
89 | history_audio_emb = self.audio_emb_linear(history_audio_emb)
90 |
91 | # # Concatenate the current speaker with the history speakers
92 | history_speaker = torch.cat([history_speaker, speaker], dim=1)
93 | # # Reduce dimensionality
94 | history_speaker = self.speaker_linear(self.speaker_embedding(history_speaker))
95 |
96 | # # Concatenate the dialogue text history with the speaker history and encode them; the reason we drop this part is the same as above
97 | if 'l' in modal:
98 | history_text_enc = torch.cat([history_text_emb, history_speaker], dim=1)
99 | history_text_con = self.text_gru_linear(self.text_gru(history_text_enc)[0][:, -1, :])
100 | if 'v' in modal:
101 | history_visual_enc = torch.cat([history_visual_emb, history_speaker], dim=1)
102 | history_visual_con = self.visual_gru_linear(self.visual_gru(history_visual_enc)[0][:, -1, :])
103 | if 'a' in modal:
104 | history_audio_emb = torch.cat([history_audio_emb, history_speaker], dim=1)
105 | history_audio_con = self.audio_gru_linear(self.audio_gru(history_audio_emb)[0][:, -1, :])
106 |
107 | # context_enc = torch.cat([history_visual_con, history_audio_con, history_text_con], dim=-1)
108 | if modal == 'val':
109 | context_enc = torch.stack([history_visual_con, history_audio_con, history_text_con], dim=0)
110 | elif modal == 'va':
111 | context_enc = torch.stack([history_visual_con, history_audio_con], dim=0)
112 | elif modal == 'vl':
113 | context_enc = torch.stack([history_visual_con, history_text_con], dim=0)
114 | elif modal == 'al':
115 | context_enc = torch.stack([history_audio_con, history_text_con], dim=0)
116 | elif modal == 'v':
117 | # context_enc = torch.stack([history_visual_con], dim=0)
118 | context_enc = history_visual_con.unsqueeze(0)
119 | elif modal == 'a':
120 | # context_enc = torch.stack([history_audio_con], dim=0)
121 | context_enc = history_audio_con.unsqueeze(0)
122 | elif modal == 'l':
123 | # context_enc = torch.stack([history_text_con], dim=0)
124 | context_enc = history_text_con.unsqueeze(0)
125 | else:
126 | context_enc = None
127 |
128 | # Split: cut the history encoding into current and past encodings by the max history length; we don't have history, so we just use history_text_emb
129 | # enc_current, enc_past = torch.split(history_enc, self.max_history_len, dim=1)
130 | # enc_current, enc_past = torch.split(history_text_emb, self.max_history_len, dim=1)
131 |
132 | # GRU: encode the current encoding and use the mask to zero out the padded positions.
133 | # enc_current = self.gru_linear(self.gru(enc_current)[0])
134 | # enc_current = enc_current.masked_fill(history_masks.unsqueeze(-1), 0)
135 |
136 | # Encoding
137 | # context_enc = torch.cat([enc_current, enc_past], dim=1)
138 | # context_enc = self.context_attention(self.context_linear(context_enc)) # [B, d]
139 |
140 | return context_enc
141 |
142 |
143 | class SLA(nn.Module):
144 | """ Sequence Level Attention """
145 |
146 | def __init__(self, d_enc):
147 | super(SLA, self).__init__()
148 | self.linear = nn.Linear(d_enc, 1)
149 | self.softmax = nn.Softmax(dim=1)
150 |
151 | def forward(self, encoding, mask=None):
152 | attn = self.linear(encoding)
153 | if mask is not None:
154 | attn = attn.masked_fill(mask.unsqueeze(-1), -np.inf)
155 | aux_mask = (attn == -np.inf).all(self.softmax.dim).unsqueeze(self.softmax.dim)
156 | attn = attn.masked_fill(aux_mask, 0) # Remove all -inf along softmax.dim
157 | score = self.softmax(attn).transpose(-2, -1) # [B, 1, T]
158 | fused_rep = torch.matmul(score, encoding).squeeze(1) # [B, d]
159 |
160 | return fused_rep
161 |
162 |
163 | def get_mask_from_lengths(lengths, max_len=None):
164 | batch_size = lengths.shape[0]
165 | if max_len is None:
166 | max_len = torch.max(lengths).item()
167 |
168 | ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device)
169 | mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
170 |
171 | return mask
172 |
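A quick usage sketch for the padding-mask helper above (lengths values are arbitrary examples):

    import torch
    lengths = torch.tensor([3, 1, 2])
    mask = get_mask_from_lengths(lengths, max_len=4)
    # mask[i, t] is True where position t is padding for sequence i:
    # tensor([[False, False, False,  True],
    #         [False,  True,  True,  True],
    #         [False, False,  True,  True]])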
--------------------------------------------------------------------------------
/models/networks/LightWeightTrans.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import copy
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.nn import LayerNorm, Linear, Dropout, Module
6 | from torch.nn.modules import ModuleList
7 | from models.networks.multihead_attention import MultiheadAttention
8 |
9 |
10 | def _get_clones(module, n):
11 | return ModuleList([copy.deepcopy(module) for _ in range(n)])
12 |
13 |
14 | class TransEncoderLayer(Module):
15 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
16 | super(TransEncoderLayer, self).__init__()
17 | self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
18 |
19 | self.linear1 = Linear(d_model, dim_feedforward)
20 | self.dropout = Dropout(dropout)
21 | self.linear2 = Linear(dim_feedforward, d_model)
22 |
23 | self.norm = LayerNorm(d_model)
24 | self.dropout1 = Dropout(dropout)
25 | self.dropout2 = Dropout(dropout)
26 |
27 | def forward(self, src, src_mask=None, src_key_padding_mask=None):
28 | src2 = self.self_attn(src, src, src, attn_mask=src_mask,
29 | key_padding_mask=src_key_padding_mask)[0]
30 | src = src + self.dropout1(src2)
31 | src = self.norm(src)
32 | src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
33 | src = src + self.dropout2(src2)
34 | src = self.norm(src)
35 | return src
36 |
37 |
38 | class TransEncoder(Module):
39 | def __init__(self, d_dual, d_model=512, nhead=8, num_encoder_layers=6, dim_feedforward=2048, dropout=0.1):
40 | super(TransEncoder, self).__init__()
41 | self.d_model = d_model
42 | self.num_layers = num_encoder_layers
43 | self.linear1 = Linear(d_dual[0], d_model)
44 | self.linear2 = Linear(d_model, d_dual[1])
45 | self.dropout = Dropout(dropout)
46 |
47 | encoder_layer = TransEncoderLayer(d_model, nhead, dim_feedforward, dropout)
48 | self.layers = _get_clones(encoder_layer, num_encoder_layers)
49 |
50 | self.norm = LayerNorm(d_model)
51 |
52 | def forward(self, src, mask=None, src_key_padding_mask=None):
53 | res = list()
54 | output = self.dropout(F.relu(self.linear1(src)))
55 | res.append(output)
56 | for i in range(self.num_layers):
57 | output = self.layers[i](output, src_mask=mask, src_key_padding_mask=src_key_padding_mask)
58 | res.append(output)
59 | if self.norm:
60 | output = self.norm(output)
61 | res.append(output)
62 | return self.linear2(output), res
63 |
64 |
65 | class EmotionClassifier(nn.Module):
66 | def __init__(self, config):
67 | super(EmotionClassifier, self).__init__()
68 | self.gpu_ids = config.gpu_ids
69 | self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu')
70 |
71 | self.output_dim = config.output_dim
72 | self.rnn_dropout = nn.Dropout(p=0.3, inplace=True)
73 | self.rnn_text = nn.LSTM(input_size=config.input_dim_l, hidden_size=config.gru_units, # text_dim->input_dim_l
74 | num_layers=1, bidirectional=False, dropout=0.0, batch_first=True)
75 | self.rnn_audio = nn.LSTM(input_size=config.input_dim_a, hidden_size=config.gru_units, # audio_dim->input_dim_a
76 | num_layers=1, bidirectional=False, dropout=0.0, batch_first=True)
77 |
78 | self.dense_text = nn.Linear(in_features=config.gru_units * 1, out_features=config.dense_units)
79 | self.dense_audio = nn.Linear(in_features=config.gru_units * 1, out_features=config.dense_units)
80 | self.dense_dropout = nn.Dropout(p=0.3, inplace=True)
81 |
82 | cat_dims = config.a_d_model + config.t_d_model + config.dense_units * 2
83 | self.out_layer_1 = nn.Linear(in_features=cat_dims, out_features=config.dense_units)
84 | self.out_layer_2 = nn.Linear(in_features=config.dense_units, out_features=config.output_dim)
85 | self.out_dropout = nn.Dropout(p=0.3, inplace=True)
86 |
87 | def forward(self, audio, text, uni_fusion):
88 | rnn_t, _ = self.rnn_text(text)
89 | encoded_text = torch.relu(self.dense_dropout(self.dense_text(torch.relu(rnn_t))))
90 | rnn_a, _ = self.rnn_audio(audio)
91 | encoded_audio = torch.relu(self.dense_dropout(self.dense_audio(torch.relu(rnn_a))))
92 |
93 | encoded_text = encoded_text.view(encoded_text.size(0), encoded_text.size(-1), encoded_text.size(1))
94 | encoded_audio = encoded_audio.view(encoded_audio.size(0), encoded_audio.size(-1), encoded_audio.size(1))
95 |
96 | layer_3 = torch.nn.Linear(in_features=encoded_text.size(-1), out_features=64).to(self.device)  # note: layer_3/layer_4 are re-created on every forward call, so their weights are not trained
97 | encoded_text = layer_3(encoded_text)
98 | layer_4 = torch.nn.Linear(in_features=encoded_audio.size(-1), out_features=64).to(self.device)
99 | encoded_audio = layer_4(encoded_audio)
100 |
101 | encoded_text = encoded_text.view(encoded_text.size(0), encoded_text.size(-1), encoded_text.size(1))
102 | encoded_audio = encoded_audio.view(encoded_audio.size(0), encoded_audio.size(-1), encoded_audio.size(1))
103 |
104 | encoded_feature = torch.cat((encoded_text, encoded_audio, uni_fusion[0], uni_fusion[1]), dim=-1)
105 | out1 = self.out_dropout(torch.relu(self.out_layer_1(encoded_feature)))
106 | out2 = self.out_layer_2(out1)
107 | in_feat = out2.transpose(1, 2)
108 | embd = F.max_pool1d(in_feat, in_feat.size(2), in_feat.size(2))
109 | return embd.squeeze(-1)
110 | # return self.out_layer_2(out1) # mode = sentiment
111 |
112 |
113 | '''
114 | D1 = Discriminator(feature_dims=128, conv_dim=20)
115 | dat = torch.randn(13, 1, 20, 128)
116 | output = D1(dat)
117 | print(output.shape)
118 | torch.Size([13, 20, 10, 64])
119 | torch.Size([13, 40, 5, 32])
120 | torch.Size([13, 80, 2, 16])
121 | torch.Size([13, 1, 1, 1])
122 | Translation = TransEncoder(d_dual=(300, 5), d_model=128, nhead=4, num_encoder_layers=2,
123 | dim_feedforward=512, dropout=0.5)
124 | dat = torch.empty(20, 11, 300)
125 | out, res = Translation(dat)  # forward returns (projected output, list of intermediate states)
126 | print(out.shape)
127 | print(out[0, 1])
128 | '''
129 |
--------------------------------------------------------------------------------
/models/networks/__init__.py:
--------------------------------------------------------------------------------
1 | ''' Contains network files. '''
--------------------------------------------------------------------------------
/models/networks/__pycache__/ContextEncoder.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/ContextEncoder.cpython-310.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/ContextEncoder.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/ContextEncoder.cpython-311.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/ContextEncoder.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/ContextEncoder.cpython-38.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/autoencoder.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/autoencoder.cpython-310.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/autoencoder.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/autoencoder.cpython-311.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/autoencoder.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/autoencoder.cpython-38.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/classifier.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/classifier.cpython-310.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/classifier.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/classifier.cpython-311.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/classifier.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/classifier.cpython-38.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/fc.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/fc.cpython-310.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/fc.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/fc.cpython-311.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/fc.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/fc.cpython-38.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/interact_model.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/interact_model.cpython-310.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/interact_model.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/interact_model.cpython-311.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/interact_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/interact_model.cpython-38.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/lstm.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/lstm.cpython-310.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/lstm.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/lstm.cpython-311.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/lstm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/lstm.cpython-38.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/multihead_attention.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/multihead_attention.cpython-310.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/multihead_attention.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/multihead_attention.cpython-311.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/multihead_attention.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/multihead_attention.cpython-38.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/tools.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/tools.cpython-310.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/tools.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/tools.cpython-311.pyc
--------------------------------------------------------------------------------
/models/networks/__pycache__/tools.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/networks/__pycache__/tools.cpython-38.pyc
--------------------------------------------------------------------------------
/models/networks/classifier.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
4 |
5 | class LSTMClassifier(nn.Module):
6 | def __init__(self, input_size, hidden_size, fc1_size, output_size, dropout_rate):
7 | super(LSTMClassifier, self).__init__()
8 | self.input_size = input_size
9 | self.hidden_size = hidden_size
10 | self.fc1_size = fc1_size
11 | self.output_size = output_size
12 | self.dropout_rate = dropout_rate
13 |
14 | # defining modules - two layer bidirectional LSTM with layer norm in between
15 | self.rnn1 = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True)
16 | self.rnn2 = nn.LSTM(2 * hidden_size, hidden_size, bidirectional=True, batch_first=True)
17 | self.fc1 = nn.Linear(hidden_size * 4, fc1_size)
18 | self.fc2 = nn.Linear(fc1_size, output_size)
19 | self.relu = nn.ReLU()
20 | self.dropout = nn.Dropout(dropout_rate)
21 | self.layer_norm = nn.LayerNorm((hidden_size * 2, ))
22 | self.bn = nn.BatchNorm1d(hidden_size * 4)
23 |
24 | def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm):
25 | packed_sequence = pack_padded_sequence(sequence, lengths, batch_first=True, enforce_sorted=False)
26 | packed_h1, (final_h1, _) = rnn1(packed_sequence)
27 | padded_h1, _ = pad_packed_sequence(packed_h1, batch_first=True)
28 | normed_h1 = layer_norm(padded_h1)
29 | packed_normed_h1 = pack_padded_sequence(normed_h1, lengths, batch_first=True, enforce_sorted=False)
30 | _, (final_h2, _) = rnn2(packed_normed_h1)
31 | return final_h1, final_h2
32 |
33 | def rnn_flow(self, x, lengths):
34 | batch_size = lengths.size(0)
35 | h1, h2 = self.extract_features(x, lengths, self.rnn1, self.rnn2, self.layer_norm)
36 | h = torch.cat((h1, h2), dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)
37 | return self.bn(h)
38 |
39 | def mask2length(self, mask):
40 | ''' mask [batch_size, seq_length, feat_size]
41 | '''
42 | _mask = torch.mean(mask, dim=-1).long() # [batch_size, seq_len]
43 | length = torch.sum(_mask, dim=-1) # [batch_size,]
44 | return length
45 |
46 | def forward(self, x, mask):
47 | lengths = self.mask2length(mask)
48 | h = self.rnn_flow(x, lengths)
49 | h = self.fc1(h)
50 | h = self.dropout(h)
51 | h = self.relu(h)
52 | o = self.fc2(h)
53 | return o, h
54 |
55 | class SimpleClassifier(nn.Module):
56 | ''' Linear classifier, use embedding as input
57 | Linear approximation; should be followed by a softmax
58 | '''
59 | def __init__(self, embd_size, output_dim, dropout):
60 | super(SimpleClassifier, self).__init__()
61 | self.dropout = dropout
62 | self.C = nn.Linear(embd_size, output_dim)
63 | self.dropout_op = nn.Dropout(dropout)
64 |
65 | def forward(self, x):
66 | if self.dropout > 0:
67 | x = self.dropout_op(x)
68 | return self.C(x)
69 |
70 | class Identity(nn.Module):
71 | def __init__(self):
72 | super().__init__()
73 |
74 | def forward(self, x):
75 | return x
76 |
77 | class FcClassifier(nn.Module):
78 | def __init__(self, input_dim, layers, output_dim, dropout=0.3, use_bn=False):
79 | ''' Fully Connect classifier
80 | Parameters:
81 | --------------------------
82 | input_dim: input feature dim
83 | layers: [x1, x2, x3] will create 3 layers with x1, x2, x3 hidden nodes respectively.
84 | output_dim: output feature dim
85 | activation: activation function
86 | dropout: dropout rate
87 | '''
88 | super().__init__()
89 | self.all_layers = []
90 | for i in range(0, len(layers)):
91 | self.all_layers.append(nn.Linear(input_dim, layers[i]))
92 | self.all_layers.append(nn.ReLU())
93 | if use_bn:
94 | self.all_layers.append(nn.BatchNorm1d(layers[i]))
95 | if dropout > 0:
96 | self.all_layers.append(nn.Dropout(dropout))
97 | input_dim = layers[i]
98 |
99 | if len(layers) == 0:
100 | layers.append(input_dim)
101 | self.all_layers.append(Identity())
102 |
103 | self.fc_out = nn.Linear(layers[-1], output_dim)
104 | self.module = nn.Sequential(*self.all_layers)
105 |
106 | def forward(self, x):
107 | feat = self.module(x)
108 | out = self.fc_out(feat)
109 | return out, feat
110 |
111 | class EF_model_AL(nn.Module):
112 | def __init__(self, fc_classifier, lstm_classifier, out_dim_a, out_dim_v, fusion_size, num_class, dropout):
113 | ''' Early fusion model classifier
114 | Parameters:
115 | --------------------------
116 | fc_classifier: acoustic classifier
117 | lstm_classifier: lexical classifier
118 | out_dim_a: fc_classifier output dim
119 | out_dim_v: lstm_classifier output dim
120 | fusion_size: output_size for fusion model
121 | num_class: class number
122 | dropout: dropout rate
123 | '''
124 | super(EF_model_AL, self).__init__()
125 | self.fc_classifier = fc_classifier
126 | self.lstm_classifier = lstm_classifier
127 | self.out_dim = out_dim_a + out_dim_v
128 | self.dropout = nn.Dropout(dropout)
129 | self.num_class = num_class
130 | self.fusion_size = fusion_size
131 | # self.out = nn.Sequential(
132 | # nn.Linear(self.out_dim, self.fusion_size),
133 | # nn.ReLU(),
134 | # nn.Linear(self.fusion_size, self.num_class),
135 | # )
136 | self.out1 = nn.Linear(self.out_dim, self.fusion_size)
137 | self.relu = nn.ReLU()
138 | self.out2 = nn.Linear(self.fusion_size, self.num_class)
139 |
140 | def forward(self, A_feat, L_feat, L_mask):
141 | _, A_out = self.fc_classifier(A_feat)
142 | _, L_out = self.lstm_classifier(L_feat, L_mask)
143 | feat = torch.cat([A_out, L_out], dim=-1)
144 | feat = self.dropout(feat)
145 | feat = self.relu(self.out1(feat))
146 | out = self.out2(self.dropout(feat))
147 | return out, feat
148 |
149 |
150 | class MaxPoolFc(nn.Module):
151 | def __init__(self, hidden_size, num_class=4):
152 | super(MaxPoolFc, self).__init__()
153 | self.hidden_size = hidden_size
154 | self.fc = nn.Sequential(
155 | nn.Linear(hidden_size, num_class),
156 | nn.ReLU()
157 | )
158 |
159 | def forward(self, x):
160 | ''' x shape => [batch_size, seq_len, hidden_size]
161 | '''
162 | batch_size, seq_len, hidden_size = x.size()
163 | x = x.view(batch_size, hidden_size, seq_len)
164 | # print(x.size())
165 | out = torch.max_pool1d(x, kernel_size=seq_len)
166 | out = out.squeeze()
167 | out = self.fc(out)
168 |
169 | return out
170 |
171 |
172 | class Fusion(nn.Module):
173 | def __init__(self, input_dim, layers, output_dim, dropout=0.3):
174 | super().__init__()
175 | self.fusion = nn.Sequential()
176 | for i in range(len(layers)):
177 | self.fusion.add_module(f'fusion_layer_{i}', nn.Linear(in_features=input_dim,
178 | out_features=layers[i]))
179 | self.fusion.add_module(f'fusion_layer_{i}_dropout', nn.Dropout(dropout))
180 | self.fusion.add_module(f'fusion_layer_{i}_activation', nn.ReLU())
181 | input_dim = layers[i]
182 |
183 | self.fusion.add_module('fusion_layer_final',
184 | nn.Linear(in_features=layers[-1], out_features=output_dim))
185 |
186 | def forward(self, x):
187 | feat = []
188 | out = self.fusion(x)
189 | return out, feat
190 |
191 |
192 | if __name__ == '__main__':
193 | a = FcClassifier(256, [128], 4)
194 | print(a)
--------------------------------------------------------------------------------
/models/networks/cnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class CNN(nn.Module):
7 | def __init__(self, input_dim, emb_size=128, in_channels=1, out_channels=128, kernel_heights=[2,3,4], dropout=0.5):
8 | super().__init__()
9 | '''
10 | Three parallel Conv1d + ReLU branches, each max-pooled over time, concatenated, then dropout and a linear embedding (fed to the downstream transformer)
11 | '''
12 | self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_heights[0], padding=0)
13 | self.conv2 = nn.Conv1d(in_channels, out_channels, kernel_heights[1], padding=0)
14 | self.conv3 = nn.Conv1d(in_channels, out_channels, kernel_heights[2], padding=0)
15 | self.dropout = nn.Dropout(dropout)
16 | self.embd = nn.Sequential(
17 | nn.Linear(len(kernel_heights)*out_channels, emb_size),
18 | nn.ReLU(inplace=True),
19 | )
20 |
21 | def conv_block(self, input, conv_layer):
22 | conv_out = conv_layer(input)  # conv_out.size() = (batch_size, out_channels, out_len)
23 | activation = F.relu(conv_out.squeeze(-1))  # activation.size() = (batch_size, out_channels, out_len)
24 | max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)  # max_out.size() = (batch_size, out_channels)
25 | return max_out
26 |
27 | def forward(self, utterance_x):
28 | batch_size, feat_dim = utterance_x.size()
29 | utterance_x = utterance_x.view(batch_size, 1, feat_dim)
30 | max_out1 = self.conv_block(utterance_x, self.conv1)
31 | max_out2 = self.conv_block(utterance_x, self.conv2)
32 | max_out3 = self.conv_block(utterance_x, self.conv3)
33 | all_out = torch.cat((max_out1, max_out2, max_out3), 1)
34 | fc_in = self.dropout(all_out)
35 | embd = self.embd(fc_in)
36 | # out = self.conv1(frame_x) # embd.shape: [batch_size, out_channels, dim, 1]
37 | # embd = out.view(frame_x.size(0), -1)
38 | return embd
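A small usage sketch for the utterance-level CNN above (sizes are arbitrary examples; note that the input_dim argument is accepted but not used by any layer):

    import torch
    net = CNN(input_dim=512, emb_size=128)
    x = torch.randn(8, 512)   # [batch_size, feat_dim]
    print(net(x).shape)       # torch.Size([8, 128])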
--------------------------------------------------------------------------------
/models/networks/fc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class FcEncoder(nn.Module):
5 | def __init__(self, input_dim, layers, dropout=0.5, use_bn=False):
6 | ''' Fully Connect classifier
7 | fc+relu+bn+dropout; the final 128->4 classification layer is a plain fc layer
8 | Parameters:
9 | --------------------------
10 | input_dim: input feature dim
11 | layers: [x1, x2, x3] will create 3 layers with x1, x2, x3 hidden nodes respectively.
12 | dropout: dropout rate
13 | use_bn: use batchnorm or not
14 | '''
15 | super().__init__()
16 | self.all_layers = []
17 | for i in range(0, len(layers)):
18 | self.all_layers.append(nn.Linear(input_dim, layers[i]))
19 | self.all_layers.append(nn.ReLU())
20 | if use_bn:
21 | self.all_layers.append(nn.BatchNorm1d(layers[i]))
22 | if dropout > 0:
23 | self.all_layers.append(nn.Dropout(dropout))
24 | input_dim = layers[i]
25 |
26 | self.module = nn.Sequential(*self.all_layers)
27 |
28 | def forward(self, x):
29 | ## make layers to a whole module
30 | feat = self.module(x)
31 | return feat
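A small usage sketch (sizes are arbitrary examples):

    import torch
    enc = FcEncoder(input_dim=342, layers=[256, 128], dropout=0.3)
    x = torch.randn(16, 342)
    print(enc(x).shape)       # torch.Size([16, 128]) -- width of the last entry in `layers`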
--------------------------------------------------------------------------------
/models/networks/interact_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import warnings
4 | from torch.nn import Parameter
5 | import torch.nn.functional as F
6 | from torch.nn import Linear, Module
7 | from torch.nn.init import xavier_normal_, xavier_uniform_, constant_
8 | from models.networks.multihead_attention import MultiheadAttention
9 | from models.networks.multihead_attention import CrossAttention
10 |
11 |
12 | class InteractModule(Module):
13 | def __init__(self, opt):
14 | super(InteractModule, self).__init__()
15 | self.inter_attention = MultiheadAttention(embed_dim=opt.hidden_size, num_heads=opt.attention_head,
16 | dropout=opt.attention_dropout)
17 | self.hence_attention = MultiheadAttention(embed_dim=opt.hidden_size, num_heads=opt.attention_head,
18 | dropout=opt.attention_dropout)
19 | # self.inter_attention = CrossAttention(in_dim1=opt.hidden_size, in_dim2=opt.hidden_size, k_dim=opt.hidden_size, v_dim=opt.hidden_size, num_heads=opt.attention_head)
20 | # self.hence_attention = CrossAttention(in_dim1=opt.hidden_size, in_dim2=opt.hidden_size, k_dim=opt.hidden_size, v_dim=opt.hidden_size, num_heads=opt.attention_head)
21 | self.opt = opt
22 |
23 | def forward(self, query, key, value, activation='sigmoid'):
24 | # print(f'query.size is {query.size()}')
25 | inter_output, _ = self.inter_attention(query, key, value)
26 | # print(f'inter_output.shape is {inter_output.shape}')
27 | hence_output, _ = self.hence_attention(query, inter_output, inter_output)
28 | # print(f'hence_output.shape is {hence_output.shape}')
29 |
30 | # Gate machine
31 | inter_fusion = inter_output + hence_output
32 | if activation == 'sigmoid':
33 | act_function = torch.sigmoid
34 | elif activation == 'relu':
35 | act_function = F.relu
36 | else:
37 | raise ValueError(f'activation must be Sigmoid or ReLu, but got {activation}')
38 |
39 | assert self.opt.ablation in ['normal', 'gate',
40 | 'hence'], f'opt.ablation must be normal, gate, or hence, not be {self.opt.ablation}'
41 |
42 | if self.opt.ablation == 'normal': # no ablation
43 | inter_weight = act_function(inter_fusion)
44 | inter_result = torch.multiply(hence_output, inter_weight)
45 |
46 | # residual.shape = [3, bsz, hidden_size]
47 | residual = query + inter_result
48 | # change shape to [bsz, 3 * hidden_size]
49 | # residual = torch.cat((residual[0], residual[1], residual[2]), dim=1)
50 |
51 | elif self.opt.ablation == 'gate': # ablation of gate machine
52 | residual = query + hence_output
53 | # residual = torch.cat((residual[0], residual[1], residual[2]), dim=1)
54 |
55 | else: # ablation of hence_attention
56 | inter_weight = act_function(inter_output)
57 | inter_result = torch.multiply(inter_output, inter_weight)
58 |
59 | # residual.shape = [3, bsz, hidden_size]
60 | residual = query + inter_result
61 | # change shape to [bsz, 3 * hidden_size]
62 |
63 |
64 | result = []
65 | for i in range(residual.shape[0]):
66 | result.append(residual[i])
67 | residual = torch.cat(result, dim=1)
68 | return residual
69 |
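A hedged usage sketch for the gated cross-modal fusion above; the opt namespace here is a stand-in carrying only the fields the module reads, and the [3, batch, hidden] layout follows the (seq, batch, embed) convention of MultiheadAttention with one position per modality:

    import torch
    from argparse import Namespace

    opt = Namespace(hidden_size=128, attention_head=4, attention_dropout=0.1, ablation='normal')
    fusion = InteractModule(opt)
    q = torch.randn(3, 8, 128)   # [num_modalities, batch_size, hidden_size]
    out = fusion(q, q, q)
    print(out.shape)             # torch.Size([8, 384]) -- modalities concatenated per sample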
--------------------------------------------------------------------------------
/models/networks/lstm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class LSTMEncoder(nn.Module):
5 | ''' one directional LSTM encoder
6 | '''
7 | def __init__(self, input_size, hidden_size, embd_method='last', bidirectional=False):
8 | super(LSTMEncoder, self).__init__()
9 |
10 | self.input_size = input_size
11 | self.hidden_size = hidden_size
12 | self.rnn = nn.LSTM(self.input_size, self.hidden_size, batch_first=True, bidirectional=bidirectional)
13 | assert embd_method in ['maxpool', 'attention', 'last', 'dense']
14 | self.embd_method = embd_method
15 |
16 | if self.embd_method == 'attention':
17 | self.attention_vector_weight = nn.Parameter(torch.Tensor(hidden_size, 1))
18 | self.attention_layer = nn.Sequential(
19 | nn.Linear(self.hidden_size, self.hidden_size),
20 | nn.Tanh(),
21 | )
22 | self.softmax = nn.Softmax(dim=-1)
23 | elif self.embd_method == 'dense':
24 | self.dense_layer = nn.Sequential()
25 | self.bidirectional = bidirectional
26 | if bidirectional:
27 | self.dense_layer.add_module('linear', nn.Linear(2 * self.hidden_size, self.hidden_size))
28 | else:
29 | self.dense_layer.add_module('linear', nn.Linear(self.hidden_size, self.hidden_size))
30 | self.dense_layer.add_module('activate', nn.Tanh())
31 | self.softmax = nn.Softmax(dim=-1)
32 |
33 | def embd_attention(self, r_out, h_n):
34 | '''
35 | Implementation references these blog posts:
36 | https://blog.csdn.net/dendi_hust/article/details/94435919
37 | https://blog.csdn.net/fkyyly/article/details/82501126
38 | Paper: Hierarchical Attention Networks for Document Classification
39 | formulation: lstm_output * softmax(u * tanh(W * lstm_output + Bias))
40 | W and Bias are the projection parameters; the Bias term is optional
41 | u is the attention vector, whose size equals the hidden size
42 | '''
43 | hidden_reps = self.attention_layer(r_out) # [batch_size, seq_len, hidden_size]
44 | atten_weight = (hidden_reps @ self.attention_vector_weight) # [batch_size, seq_len, 1]
45 | atten_weight = self.softmax(atten_weight) # [batch_size, seq_len, 1]
46 | # [batch_size, seq_len, hidden_size] * [batch_size, seq_len, 1] = [batch_size, seq_len, hidden_size]
47 | # sentence_vector = torch.sum(r_out * atten_weight, dim=1) # [batch_size, hidden_size]
48 |
49 | # return sentence_vector
50 | '''edit here (zelin)'''
51 | attended_r_out = r_out * atten_weight # keep shape [batch_size, seq_len, hidden_size]
52 | return attended_r_out # No sum over time dimension
53 |
54 | def embd_maxpool(self, r_out, h_n):
55 |
56 | """保留了时间维度,使用 torch.max 时增加 keepdim=True,并扩展结果"""
57 | pooled_out, _ = torch.max(r_out, dim=1, keepdim=True) # Keeps time dim
58 | return pooled_out.expand_as(r_out) # Duplicate across time dimension
59 |
60 | def embd_last(self, r_out, h_n):
61 |
62 | return r_out # Returns [batch_size, seq_len, hidden_size]
63 |
64 | def embd_dense(self, r_out, h_n):
65 | '''
66 | Apply dense_layer at every time step, then restore the original 3-D shape [batch_size, seq_len, hidden_size].
67 | '''
68 | batch_size, seq_len, feat_dim = r_out.size()
69 | dense_out = self.dense_layer(r_out.contiguous().view(-1, feat_dim)) # Flatten to [batch_size * seq_len, feat_dim]
70 | return dense_out.view(batch_size, seq_len, self.hidden_size) # Reshape back to [batch_size, seq_len, hidden_size]
71 |
72 | def forward(self, x):
73 | '''
74 | r_out shape: batch, seq_len, num_directions * hidden_size (batch_first=True)
75 | h_n and h_c shape: num_layers * num_directions, batch, hidden_size
76 | '''
77 | r_out, (h_n, h_c) = self.rnn(x)
78 | embd = getattr(self, 'embd_' + self.embd_method)(r_out, h_n)
79 | return embd
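A small usage sketch (sizes are arbitrary examples; with the edits noted in the comments above, every embd method returns a sequence-shaped tensor):

    import torch
    enc = LSTMEncoder(input_size=40, hidden_size=64, embd_method='maxpool')
    x = torch.randn(4, 10, 40)   # [batch_size, seq_len, input_size]
    print(enc(x).shape)          # torch.Size([4, 10, 64])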
--------------------------------------------------------------------------------
/models/networks/multihead_attention.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import warnings
3 | from torch.nn import Parameter
4 | import torch.nn.functional as F
5 | from torch.nn import Linear, Module
6 | from torch.nn.init import xavier_normal_, xavier_uniform_, constant_
7 | import torch.nn as nn
8 |
9 |
10 | class MultiheadAttention(Module):
11 | def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
12 | super(MultiheadAttention, self).__init__()
13 | self.embed_dim = embed_dim
14 | self.kdim = kdim if kdim is not None else embed_dim
15 | self.vdim = vdim if vdim is not None else embed_dim
16 | self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
17 |
18 | self.num_heads = num_heads
19 | self.dropout = dropout
20 | self.head_dim = embed_dim // num_heads
21 | assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
22 |
23 | self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
24 |
25 | if self._qkv_same_embed_dim is False:
26 | self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
27 | self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
28 | self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
29 |
30 | if bias:
31 | self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
32 | else:
33 | self.register_parameter('in_proj_bias', None)
34 | self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
35 |
36 | if add_bias_kv:
37 | self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
38 | self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
39 | else:
40 | self.bias_k = self.bias_v = None
41 |
42 | self.add_zero_attn = add_zero_attn
43 |
44 | self._reset_parameters()
45 |
46 | def _reset_parameters(self):
47 | if self._qkv_same_embed_dim:
48 | xavier_uniform_(self.in_proj_weight)
49 | else:
50 | xavier_uniform_(self.q_proj_weight)
51 | xavier_uniform_(self.k_proj_weight)
52 | xavier_uniform_(self.v_proj_weight)
53 |
54 | if self.in_proj_bias is not None:
55 | constant_(self.in_proj_bias, 0.)
56 | constant_(self.out_proj.bias, 0.)
57 | if self.bias_k is not None:
58 | xavier_normal_(self.bias_k)
59 | if self.bias_v is not None:
60 | xavier_normal_(self.bias_v)
61 |
62 | def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None):
63 | if hasattr(self, '_qkv_same_embed_dim') and self._qkv_same_embed_dim is False:
64 | return F.multi_head_attention_forward(
65 | query, key, value, self.embed_dim, self.num_heads,
66 | self.in_proj_weight, self.in_proj_bias,
67 | self.bias_k, self.bias_v, self.add_zero_attn,
68 | self.dropout, self.out_proj.weight, self.out_proj.bias,
69 | training=self.training,
70 | key_padding_mask=key_padding_mask, need_weights=need_weights,
71 | attn_mask=attn_mask, use_separate_proj_weight=True,
72 | q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
73 | v_proj_weight=self.v_proj_weight)
74 | else:
75 | if not hasattr(self, '_qkv_same_embed_dim'):
76 | warnings.warn('A new version of MultiheadAttention module has been implemented. \
77 | Please re-train your model with the new module',
78 | UserWarning)
79 |
80 | return F.multi_head_attention_forward(
81 | query, key, value, self.embed_dim, self.num_heads,
82 | self.in_proj_weight, self.in_proj_bias,
83 | self.bias_k, self.bias_v, self.add_zero_attn,
84 | self.dropout, self.out_proj.weight, self.out_proj.bias,
85 | training=self.training,
86 | key_padding_mask=key_padding_mask, need_weights=need_weights,
87 | attn_mask=attn_mask)
88 |
89 |
90 | class CrossAttention(nn.Module):
91 | def __init__(self, in_dim1, in_dim2, k_dim, v_dim, num_heads):
92 | super(CrossAttention, self).__init__()
93 | self.num_heads = num_heads
94 | self.k_dim = k_dim
95 | self.v_dim = v_dim
96 |
97 | self.proj_q1 = nn.Linear(in_dim1, k_dim * num_heads, bias=False)
98 | self.proj_k2 = nn.Linear(in_dim2, k_dim * num_heads, bias=False)
99 | self.proj_v2 = nn.Linear(in_dim2, v_dim * num_heads, bias=False)
100 | self.proj_o = nn.Linear(v_dim * num_heads, in_dim1)
101 |
102 | def forward(self, x1, x2, _, mask=None):
103 | batch_size, seq_len1, in_dim1 = x1.size()
104 | seq_len2 = x2.size(1)
105 |
106 | q1 = self.proj_q1(x1).view(batch_size, seq_len1, self.num_heads, self.k_dim).permute(0, 2, 1, 3)
107 | k2 = self.proj_k2(x2).view(batch_size, seq_len2, self.num_heads, self.k_dim).permute(0, 2, 3, 1)
108 | v2 = self.proj_v2(x2).view(batch_size, seq_len2, self.num_heads, self.v_dim).permute(0, 2, 1, 3)
109 |
110 | attn = torch.matmul(q1, k2) / self.k_dim ** 0.5
111 |
112 | if mask is not None:
113 | attn = attn.masked_fill(mask == 0, -1e9)
114 |
115 | attn = F.softmax(attn, dim=-1)
116 | output = torch.matmul(attn, v2).permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len1, -1)
117 | output = self.proj_o(output)
118 |
119 | return output, 0
--------------------------------------------------------------------------------
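[Note] A minimal shape check for CrossAttention above, not part of the repository; all sizes below are made up. The third positional argument of forward() is unused, so None is passed:

    import torch
    from models.networks.multihead_attention import CrossAttention

    attn = CrossAttention(in_dim1=128, in_dim2=256, k_dim=32, v_dim=32, num_heads=4)
    x1 = torch.randn(2, 10, 128)   # query sequence    [batch, seq_len1, in_dim1]
    x2 = torch.randn(2, 20, 256)   # key/value sequence [batch, seq_len2, in_dim2]
    out, _ = attn(x1, x2, None)    # second return value is always 0
    print(out.shape)               # torch.Size([2, 10, 128]) -- projected back to in_dim1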
/models/networks/tools.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn import init
4 | import numpy as np
5 | import functools
6 | from torch.optim import lr_scheduler
7 |
8 |
9 | class Identity(nn.Module):
10 | def forward(self, x):
11 | return x
12 |
13 |
14 | def get_norm_layer(norm_type='instance'):
15 | """Return a normalization layer
16 |
17 | Parameters:
18 | norm_type (str) -- the name of the normalization layer: batch | instance | none
19 |
20 | For BatchNorm, we use learnable affine parameters and track running statistics (mean/stddev).
21 | For InstanceNorm, we do not use learnable affine parameters. We do not track running statistics.
22 | """
23 | if norm_type == 'batch':
24 | norm_layer = functools.partial(nn.BatchNorm2d, affine=True, track_running_stats=True)
25 | elif norm_type == 'instance':
26 | norm_layer = functools.partial(nn.InstanceNorm2d, affine=False, track_running_stats=False)
27 | elif norm_type == 'layer':
28 | norm_layer = functools.partial(nn.LayerNorm, eps=1e-6, elementwise_affine=True)
29 | elif norm_type == 'none':
30 | norm_layer = lambda x: Identity()
31 | else:
32 | raise NotImplementedError('normalization layer [%s] is not found' % norm_type)
33 | return norm_layer
34 |
35 |
36 | def get_scheduler(optimizer, opt):
37 | """Return a learning rate scheduler
38 |
39 | Parameters:
40 | optimizer -- the optimizer of the network
41 | opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.
42 | opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine
43 |
44 | For 'linear', we keep the same learning rate for the first epochs
45 | and linearly decay the rate to zero over the next epochs.
46 | For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers.
47 | See https://pytorch.org/docs/stable/optim.html for more details.
48 | """
49 | if opt.lr_policy == 'linear':
50 | def lambda_rule(epoch):
51 | lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.niter) / float(opt.niter_decay + 1)
52 | return lr_l
53 |
54 | scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
55 | elif opt.lr_policy == 'step':
56 | scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
57 | elif opt.lr_policy == 'plateau':
58 | scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, threshold=0.01, patience=5)
59 | elif opt.lr_policy == 'cosine':
60 | scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.niter, eta_min=0)
61 | else:
62 |         raise NotImplementedError('learning rate policy [%s] is not implemented' % opt.lr_policy)
63 | return scheduler
64 |
65 |
66 | def init_weights(net, init_type='normal', init_gain=0.02):
67 | """Initialize network weights.
68 |
69 | Parameters:
70 | net (network) -- network to be initialized
71 | init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal
72 | init_gain (float) -- scaling factor for normal, xavier and orthogonal.
73 |
74 | We use 'normal' in the original pix2pix and CycleGAN paper. But xavier and kaiming might
75 | work better for some applications. Feel free to try yourself.
76 | """
77 |
78 | def init_func(m): # define the initialization function
79 | classname = m.__class__.__name__
80 | if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
81 | if init_type == 'normal':
82 | init.normal_(m.weight.data, 0.0, init_gain)
83 | elif init_type == 'xavier':
84 | init.xavier_normal_(m.weight.data, gain=init_gain)
85 | elif init_type == 'kaiming':
86 | init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
87 | elif init_type == 'orthogonal':
88 | init.orthogonal_(m.weight.data, gain=init_gain)
89 | else:
90 | raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
91 | if hasattr(m, 'bias') and m.bias is not None:
92 | init.constant_(m.bias.data, 0.0)
93 | elif classname.find(
94 | 'BatchNorm2d') != -1: # BatchNorm Layer's weight is not a matrix; only normal distribution applies.
95 | init.normal_(m.weight.data, 1.0, init_gain)
96 | init.constant_(m.bias.data, 0.0)
97 |
98 | print('initialize network with %s' % init_type)
99 | net.apply(init_func) # apply the initialization function
100 |
101 |
102 | def init_net(net, init_type='normal', init_gain=0.02, gpu_ids=[]):
103 | """Initialize a network: 1. register CPU/GPU device (with multi-GPU support); 2. initialize the network weights
104 | Parameters:
105 | net (network) -- the network to be initialized
106 | init_type (str) -- the name of an initialization method: normal | xavier | kaiming | orthogonal
107 |         init_gain (float)  -- scaling factor for normal, xavier and orthogonal.
108 | gpu_ids (int list) -- which GPUs the network runs on: e.g., 0,1,2
109 |
110 | Return an initialized network.
111 | """
112 | if len(gpu_ids) > 0:
113 | assert (torch.cuda.is_available())
114 | net.to(gpu_ids[0])
115 | net = torch.nn.DataParallel(net, gpu_ids) # multi-GPUs
116 | init_weights(net, init_type, init_gain=init_gain)
117 | return net
118 |
119 |
120 | def diagnose_network(net, name='network'):
121 | """Calculate and print the mean of average absolute(gradients)
122 |
123 | Parameters:
124 | net (torch network) -- Torch network
125 | name (str) -- the name of the network
126 | """
127 | mean = 0.0
128 | count = 0
129 | for param in net.parameters():
130 | if param.grad is not None:
131 | mean += torch.mean(torch.abs(param.grad.data))
132 | count += 1
133 | if count > 0:
134 | mean = mean / count
135 | print(name)
136 | print(mean)
137 |
138 |
139 | class MidLayerFeatureExtractor(object):
140 | def __init__(self, layer):
141 | self.layer = layer
142 | self.feature = None
143 | self.layer.register_forward_hook(self.hook)
144 | self.device = None
145 |
146 | def hook(self, module, input, output):
147 | # default tensor on cpu
148 | self.is_empty = True
149 | self.feature = output.clone()
150 | self.is_empty = False
151 | # self.is_empty = True
152 | # self.feature = output
153 | # self.is_empty = False
154 |
155 | def extract(self):
156 |         assert not self.is_empty, 'Sync error in MidLayerFeatureExtractor: \
157 |             this may be caused by calling the extract method before the hooked module has executed its forward method'
158 | return self.feature
159 |
160 |
161 | class MultiLayerFeatureExtractor(object):
162 | def __init__(self, net, layers):
163 | '''
164 | Parameter:
165 | -----------------
166 | net: torch.nn.Modules
167 | layers: str, something like "C.fc[0], module[1]"
168 | which will get mid layer features in net.C.fc[0] and net.module[1] respectively
169 | '''
170 | self.net = net
171 | self.layer_names = layers.strip().split(',')
172 | self.layers = [self.str2layer(layer_name) for layer_name in self.layer_names]
173 | self.extractors = [MidLayerFeatureExtractor(layer) for layer in self.layers]
174 |
175 | def str2layer(self, name):
176 | modules = name.split('.')
177 | layer = self.net
178 | for module in modules:
179 |             if '[' in module and ']' in module:
180 | sequential_name = module[:module.find('[')]
181 | target_module_num = int(module[module.find('[') + 1:module.find(']')])
182 | layer = getattr(layer, sequential_name)
183 | layer = layer[target_module_num]
184 | else:
185 | layer = getattr(layer, module)
186 |
187 | return layer
188 |
189 | def extract(self):
190 | ans = [extractor.extract() for extractor in self.extractors]
191 | return ans
192 |
193 |
194 | def get_mask_from_lengths(lengths, max_len=None):
195 | batch_size = lengths.shape[0]
196 | if max_len is None:
197 | max_len = torch.max(lengths).item()
198 |
199 | ids = torch.arange(0, max_len).unsqueeze(0).expand(batch_size, -1).to(lengths.device)
200 | mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
201 |
202 | return mask
203 |
--------------------------------------------------------------------------------
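[Note] A quick sketch of get_mask_from_lengths above (not part of the repository): True marks padded positions beyond each sequence length.

    import torch
    from models.networks.tools import get_mask_from_lengths

    lengths = torch.tensor([3, 5])
    mask = get_mask_from_lengths(lengths)   # max_len defaults to lengths.max() == 5
    # tensor([[False, False, False,  True,  True],
    #         [False, False, False, False, False]])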
/models/our/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__init__.py
--------------------------------------------------------------------------------
/models/our/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/models/our/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/models/our/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/models/our/__pycache__/our_model.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/our_model.cpython-311.pyc
--------------------------------------------------------------------------------
/models/our/__pycache__/our_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/our_model.cpython-38.pyc
--------------------------------------------------------------------------------
/models/our/__pycache__/our_model_ablation.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/our_model_ablation.cpython-311.pyc
--------------------------------------------------------------------------------
/models/our/__pycache__/zelin_our_model.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/our/__pycache__/zelin_our_model.cpython-311.pyc
--------------------------------------------------------------------------------
/models/our/our_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 | import json
4 | from collections import OrderedDict
5 | import torch.nn.functional as F
6 | from models.base_model import BaseModel
7 | from models.networks.lstm import LSTMEncoder
8 | from models.networks.classifier import FcClassifier
9 | from models.utils.config import OptConfig
10 | import math
11 | import torch.nn as nn
12 |
13 |
14 | class ourModel(BaseModel, nn.Module):
15 |
16 | def __init__(self, opt):
17 | """Initialize the LSTM autoencoder class
18 | Parameters:
19 | opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions
20 | """
21 | nn.Module.__init__(self)
22 | super().__init__(opt)
23 |
24 |
25 | self.loss_names = []
26 | self.model_names = []
27 |
28 | # acoustic model
29 | self.netEmoA = LSTMEncoder(opt.input_dim_a, opt.embd_size_a, embd_method=opt.embd_method_a)
30 | self.model_names.append('EmoA')
31 |
32 | # visual model
33 | self.netEmoV = LSTMEncoder(opt.input_dim_v, opt.embd_size_v, opt.embd_method_v)
34 | self.model_names.append('EmoV')
35 |
36 | # Transformer Fusion model
37 | emo_encoder_layer = torch.nn.TransformerEncoderLayer(d_model=opt.hidden_size, nhead=int(opt.Transformer_head), batch_first=True)
38 | self.netEmoFusion = torch.nn.TransformerEncoder(emo_encoder_layer, num_layers=opt.Transformer_layers)
39 | self.model_names.append('EmoFusion')
40 |
41 | # Classifier
42 | cls_layers = list(map(lambda x: int(x), opt.cls_layers.split(',')))
43 |
44 |         # cls_input_size = feature_max_len * hidden_size (e.g. 5 * hidden_size when feature_max_len is 5)
45 | cls_input_size = opt.feature_max_len * opt.hidden_size + 1024 # with personalized feature
46 |
47 |
48 | self.netEmoC = FcClassifier(cls_input_size, cls_layers, output_dim=opt.emo_output_dim, dropout=opt.dropout_rate)
49 | self.model_names.append('EmoC')
50 | self.loss_names.append('emo_CE')
51 |
52 | self.netEmoCF = FcClassifier(cls_input_size, cls_layers, output_dim=opt.emo_output_dim, dropout=opt.dropout_rate)
53 | self.model_names.append('EmoCF')
54 | self.loss_names.append('EmoF_CE')
55 |
56 | self.temperature = opt.temperature
57 |
58 |
59 | # self.device = 'cpu'
60 | # self.netEmoA = self.netEmoA.to(self.device)
61 | # self.netEmoV = self.netEmoV.to(self.device)
62 | # self.netEmoFusion = self.netEmoFusion.to(self.device)
63 | # self.netEmoC = self.netEmoC.to(self.device)
64 | # self.netEmoCF = self.netEmoCF.to(self.device)
65 |
66 | self.criterion_ce = torch.nn.CrossEntropyLoss()
67 |
68 | if self.isTrain:
69 | if not opt.use_ICL:
70 | self.criterion_ce = torch.nn.CrossEntropyLoss()
71 | self.criterion_focal = torch.nn.CrossEntropyLoss()
72 | else:
73 | self.criterion_ce = torch.nn.CrossEntropyLoss()
74 | self.criterion_focal = Focal_Loss()
75 |             # initialize optimizers; schedulers will be automatically created later by BaseModel.setup()
76 |             parameters = [{'params': getattr(self, 'net' + net).parameters()} for net in self.model_names]
77 |             self.optimizer = torch.optim.Adam(parameters, lr=opt.lr, betas=(opt.beta1, 0.999))
78 | self.optimizers.append(self.optimizer)
79 | self.ce_weight = opt.ce_weight
80 | self.focal_weight = opt.focal_weight
81 |
82 | # modify save_dir
83 | self.save_dir = os.path.join(opt.checkpoints_dir, opt.name)
84 | if not os.path.exists(self.save_dir):
85 | os.makedirs(self.save_dir)
86 |
87 |
88 | def post_process(self):
89 | # called after model.setup()
90 | def transform_key_for_parallel(state_dict):
91 | return OrderedDict([('module.' + key, value) for key, value in state_dict.items()])
92 |
93 | if self.isTrain:
94 | print('[ Init ] Load parameters from pretrained encoder network')
95 | f = lambda x: transform_key_for_parallel(x)
96 | self.netEmoA.load_state_dict(f(self.pretrained_encoder.netEmoA.state_dict()))
97 | self.netEmoV.load_state_dict(f(self.pretrained_encoder.netEmoV.state_dict()))
98 | self.netEmoFusion.load_state_dict(f(self.pretrained_encoder.netEmoFusion.state_dict()))
99 |
100 | def load_from_opt_record(self, file_path):
101 | opt_content = json.load(open(file_path, 'r'))
102 | opt = OptConfig()
103 | opt.load(opt_content)
104 | return opt
105 |
106 | def set_input(self, input):
107 |
108 | self.acoustic = input['A_feat'].float().to(self.device)
109 | self.visual = input['V_feat'].float().to(self.device)
110 |
111 | self.emo_label = input['emo_label'].to(self.device)
112 |
113 | if 'personalized_feat' in input:
114 | self.personalized = input['personalized_feat'].float().to(self.device)
115 | else:
116 | self.personalized = None # if no personalized features given
117 |
118 |
119 | def forward(self, acoustic_feat=None, visual_feat=None):
120 | if acoustic_feat is not None:
121 | self.acoustic = acoustic_feat.float().to(self.device)
122 | self.visual = visual_feat.float().to(self.device)
123 |
124 | """Run forward pass; called by both functions and ."""
125 |
126 | emo_feat_A = self.netEmoA(self.acoustic)
127 | emo_feat_V = self.netEmoV(self.visual)
128 |
129 |         '''ensure the time dimension is preserved'''
130 | emo_fusion_feat = torch.cat((emo_feat_V, emo_feat_A), dim=-1) # (batch_size, seq_len, 2 * embd_size)
131 |
132 | emo_fusion_feat = self.netEmoFusion(emo_fusion_feat)
133 |
134 |         '''dynamically obtain the batch size'''
135 | batch_size = emo_fusion_feat.size(0)
136 |
137 |         emo_fusion_feat = emo_fusion_feat.permute(1, 0, 2).reshape(batch_size, -1)  # flatten to [batch_size, seq_len * hidden_size]
138 |
139 | if self.personalized is not None:
140 | emo_fusion_feat = torch.cat((emo_fusion_feat, self.personalized), dim=-1) # [batch_size, seq_len * feature_dim + 1024]
141 |
142 | '''for back prop'''
143 | self.emo_logits_fusion, _ = self.netEmoCF(emo_fusion_feat)
144 | """-----------"""
145 |
146 | self.emo_logits, _ = self.netEmoC(emo_fusion_feat)
147 | self.emo_pred = F.softmax(self.emo_logits, dim=-1)
148 |
149 | def backward(self):
150 | """Calculate the loss for back propagation"""
151 | self.loss_emo_CE = self.criterion_ce(self.emo_logits, self.emo_label)
152 | self.loss_EmoF_CE = self.focal_weight * self.criterion_focal(self.emo_logits_fusion, self.emo_label)
153 | loss = self.loss_emo_CE + self.loss_EmoF_CE
154 |
155 | loss.backward()
156 |
157 | for model in self.model_names:
158 | torch.nn.utils.clip_grad_norm_(getattr(self, 'net' + model).parameters(), 1.0)
159 |
160 | def optimize_parameters(self, epoch):
161 | """Calculate losses, gradients, and update network weights; called in every training iteration"""
162 | # forward
163 | self.forward()
164 | # backward
165 | self.optimizer.zero_grad()
166 | self.backward()
167 |
168 | self.optimizer.step()
169 |
170 |
171 | class ActivateFun(torch.nn.Module):
172 | def __init__(self, opt):
173 | super(ActivateFun, self).__init__()
174 | self.activate_fun = opt.activate_fun
175 |
176 | def _gelu(self, x):
177 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
178 |
179 | def forward(self, x):
180 | if self.activate_fun == 'relu':
181 | return torch.relu(x)
182 | elif self.activate_fun == 'gelu':
183 | return self._gelu(x)
184 |
185 |
186 | class Focal_Loss(torch.nn.Module):
187 | def __init__(self, weight=0.5, gamma=3, reduction='mean'):
188 | super(Focal_Loss, self).__init__()
189 | self.gamma = gamma
190 | self.alpha = weight
191 | self.reduction = reduction
192 |
193 | def forward(self, preds, targets):
194 | """
195 |         preds: raw logits, shape [batch_size, num_classes] (cross_entropy applies log-softmax internally)
196 |         targets: ground-truth class indices, shape [batch_size]
197 | """
198 | ce_loss = F.cross_entropy(preds, targets, reduction='mean')
199 | pt = torch.exp(-ce_loss)
200 | focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
201 |
202 | if self.reduction == 'none':
203 | return focal_loss
204 | elif self.reduction == 'mean':
205 | return torch.mean(focal_loss)
206 | elif self.reduction == 'sum':
207 | return torch.sum(focal_loss)
208 | else:
209 | raise NotImplementedError("Invalid reduction mode. Please choose 'none', 'mean', or 'sum'.")
210 |
--------------------------------------------------------------------------------
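[Note] Focal_Loss above feeds its input straight into F.cross_entropy, so it expects raw logits rather than softmax outputs. A small sanity-check sketch (not part of the repository; logits and targets are made up):

    import torch
    from models.our.our_model import Focal_Loss

    logits = torch.tensor([[2.0, 0.5], [0.1, 1.5]])   # [batch_size, num_classes]
    targets = torch.tensor([0, 1])
    fl = Focal_Loss(weight=0.5, gamma=3)
    print(fl(logits, targets))   # alpha * (1 - exp(-CE))**gamma * CE, where CE is the mean cross-entropy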
/models/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .convert import *
2 | from .time_track import time_desc_decorator
3 | from .functions import *
--------------------------------------------------------------------------------
/models/utils/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/config.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/config.cpython-310.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/config.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/config.cpython-311.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/config.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/config.cpython-38.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/convert.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/convert.cpython-310.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/convert.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/convert.cpython-311.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/convert.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/convert.cpython-38.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/functions.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/functions.cpython-310.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/functions.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/functions.cpython-311.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/functions.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/functions.cpython-38.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/time_track.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/time_track.cpython-310.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/time_track.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/time_track.cpython-311.pyc
--------------------------------------------------------------------------------
/models/utils/__pycache__/time_track.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/models/utils/__pycache__/time_track.cpython-38.pyc
--------------------------------------------------------------------------------
/models/utils/config-orin.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | class OptConfig(object):
4 | def __init__(self):
5 | pass
6 |
7 | def load(self, config_dict):
8 | if sys.version > '3':
9 | for key, value in config_dict.items():
10 | if not isinstance(value, dict):
11 | setattr(self, key, value)
12 | else:
13 | self.load(value)
14 | else:
15 | for key, value in config_dict.iteritems():
16 | if not isinstance(value, dict):
17 | setattr(self, key, value)
18 | else:
19 | self.load(value)
--------------------------------------------------------------------------------
/models/utils/config.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | class OptConfig(object):
4 | def __init__(self):
5 | pass
6 |
7 | def load(self, config_dict):
8 | if sys.version > '3':
9 | for key, value in config_dict.items():
10 | if not isinstance(value, dict):
11 | setattr(self, key, value)
12 | else:
13 | self.load(value)
14 | else:
15 | for key, value in config_dict.iteritems():
16 | if not isinstance(value, dict):
17 | setattr(self, key, value)
18 | else:
19 | self.load(value)
--------------------------------------------------------------------------------
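[Note] OptConfig.load recursively flattens nested dictionaries onto a single object, so nested keys become top-level attributes while the enclosing key itself is dropped. A minimal sketch (not part of the repository):

    from models.utils.config import OptConfig

    opt = OptConfig()
    opt.load({'lr': 1e-4, 'model': {'hidden_size': 128}})
    print(opt.lr, opt.hidden_size)   # 0.0001 128  -- there is no opt.model attribute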
/models/utils/convert.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def to_gpu(x, on_cpu=False, gpu_id=None):
4 | """Tensor => Variable"""
5 | if torch.cuda.is_available() and not on_cpu:
6 | x = x.cuda(gpu_id)
7 | return x
8 |
9 | def to_cpu(x):
10 | """Variable => Tensor"""
11 | if torch.cuda.is_available():
12 | x = x.cpu()
13 | return x.data
--------------------------------------------------------------------------------
/models/utils/functions.py:
--------------------------------------------------------------------------------
1 | from torch.autograd import Function
2 | import torch.nn as nn
3 | import torch
4 | import torch.nn.functional as F
5 |
6 | """
7 | Adapted from https://github.com/fungtion/DSN/blob/master/functions.py
8 | """
9 |
10 | class ReverseLayerF(Function):
11 |
12 | @staticmethod
13 | def forward(ctx, x, p):
14 | ctx.p = p
15 |
16 | return x.view_as(x)
17 |
18 | @staticmethod
19 | def backward(ctx, grad_output):
20 | output = grad_output.neg() * ctx.p
21 |
22 | return output, None
23 |
24 |
25 | class MSE(nn.Module):
26 | def __init__(self):
27 | super(MSE, self).__init__()
28 |
29 | def forward(self, pred, real):
30 | diffs = torch.add(real, -pred)
31 | n = torch.numel(diffs.data)
32 | mse = torch.sum(diffs.pow(2)) / n
33 |
34 | return mse
35 |
36 |
37 | class SIMSE(nn.Module):
38 |
39 | def __init__(self):
40 | super(SIMSE, self).__init__()
41 |
42 | def forward(self, pred, real):
43 | diffs = torch.add(real, - pred)
44 | n = torch.numel(diffs.data)
45 | simse = torch.sum(diffs).pow(2) / (n ** 2)
46 |
47 | return simse
48 |
49 |
50 | class DiffLoss(nn.Module):
51 |
52 | def __init__(self):
53 | super(DiffLoss, self).__init__()
54 |
55 | def forward(self, input1, input2):
56 |
57 | batch_size = input1.size(0)
58 | input1 = input1.view(batch_size, -1)
59 | input2 = input2.view(batch_size, -1)
60 |
61 | # Zero mean
62 |         input1_mean = torch.mean(input1, dim=0, keepdims=True)  # mean over the batch dimension; keepdims=True keeps the reduced dim
63 | input2_mean = torch.mean(input2, dim=0, keepdims=True)
64 | input1 = input1 - input1_mean
65 | input2 = input2 - input2_mean
66 |
67 |         input1_l2_norm = torch.norm(input1, p=2, dim=1, keepdim=True).detach()  # per-sample L2 norm
68 | input1_l2 = input1.div(input1_l2_norm.expand_as(input1) + 1e-6)
69 |
70 | input2_l2_norm = torch.norm(input2, p=2, dim=1, keepdim=True).detach()
71 | input2_l2 = input2.div(input2_l2_norm.expand_as(input2) + 1e-6)
72 |
73 | diff_loss = torch.mean((input1_l2.t().mm(input2_l2)).pow(2))
74 |
75 | return diff_loss
76 |
77 | class CMD(nn.Module):
78 | """
79 | Adapted from https://github.com/wzell/cmd/blob/master/models/domain_regularizer.py
80 | """
81 |
82 | def __init__(self):
83 | super(CMD, self).__init__()
84 |
85 | def forward(self, x1, x2, n_moments):
86 | mx1 = torch.mean(x1, 0)
87 | mx2 = torch.mean(x2, 0)
88 | sx1 = x1-mx1
89 | sx2 = x2-mx2
90 | dm = self.matchnorm(mx1, mx2)
91 | scms = dm
92 | for i in range(n_moments - 1):
93 | scms += self.scm(sx1, sx2, i + 2)
94 | return scms
95 |
96 | def matchnorm(self, x1, x2):
97 | power = torch.pow(x1-x2,2)
98 | summed = torch.sum(power)
99 | sqrt = summed**(0.5)
100 | return sqrt
101 | # return ((x1-x2)**2).sum().sqrt()
102 |
103 | def scm(self, sx1, sx2, k):
104 | ss1 = torch.mean(torch.pow(sx1, k), 0)
105 | ss2 = torch.mean(torch.pow(sx2, k), 0)
106 | return self.matchnorm(ss1, ss2)
107 |
108 |
109 | class SupConLoss(nn.Module):
110 |
111 | def __init__(self, temperature=0.5, scale_by_temperature=True):
112 | super(SupConLoss, self).__init__()
113 | self.temperature = temperature
114 | self.scale_by_temperature = scale_by_temperature
115 |
116 | def forward(self, features, labels=None, mask=None):
117 | """
118 |         Inputs:
119 |             features: features of the input samples, shape [batch_size, hidden_dim].
120 |             labels: ground-truth label of each sample, shape [batch_size].
121 |             mask: mask for contrastive learning, shape [batch_size, batch_size]; mask_{i,j}=1 if samples i and j share the same label
122 |         Output:
123 |             loss value
124 | """
125 | device = (torch.device('cuda')
126 | if features.is_cuda
127 | else torch.device('cpu'))
128 | features = F.normalize(features, p=2, dim=1)
129 | batch_size = features.shape[0]
130 |         # handling of the labels argument
131 |         if labels is not None and mask is not None:  # labels and mask cannot both be given: when labels exist, the mask has to be derived from them
132 |             raise ValueError('Cannot define both `labels` and `mask`')
133 |         elif labels is None and mask is None:  # neither labels nor mask: the unsupervised case, so the mask is an identity matrix, i.e. only (i, i) pairs share a class
134 |             mask = torch.eye(batch_size, dtype=torch.float32).to(device)
135 |         elif labels is not None:  # labels given: build the mask from them, mask_{i,j}=1 when samples i and j have the same label
136 | labels = labels.contiguous().view(-1, 1)
137 | if labels.shape[0] != batch_size:
138 | raise ValueError('Num of labels does not match num of features')
139 | mask = torch.eq(labels, labels.T).float().to(device)
140 | else:
141 | mask = mask.float().to(device)
142 | '''
143 |         Example:
144 | labels:
145 | tensor([[1.],
146 | [2.],
147 | [1.],
148 | [1.]])
149 |         mask:  # mask_{i,j}=1 when samples i and j have the same label
150 | tensor([[1., 0., 1., 1.],
151 | [0., 1., 0., 0.],
152 | [1., 0., 1., 1.],
153 | [1., 0., 1., 1.]])
154 | '''
155 | # compute logits
156 | anchor_dot_contrast = torch.div(
157 | torch.matmul(features, features.T),
158 |             self.temperature)  # pairwise dot-product similarity between samples
159 | # for numerical stability
160 | logits_max, _ = torch.max(anchor_dot_contrast, dim=1, keepdim=True)
161 | logits = anchor_dot_contrast - logits_max.detach()
162 | exp_logits = torch.exp(logits)
163 | '''
164 |         logits is the final similarity: anchor_dot_contrast with the row-wise maximum subtracted
165 |         Example: logits: torch.Size([4, 4])
166 | logits:
167 | tensor([[ 0.0000, -0.0471, -0.3352, -0.2156],
168 | [-1.2576, 0.0000, -0.3367, -0.0725],
169 | [-1.3500, -0.1409, -0.1420, 0.0000],
170 | [-1.4312, -0.0776, -0.2009, 0.0000]])
171 | '''
172 |         # build the masks
173 |         logits_mask = torch.ones_like(mask) - torch.eye(batch_size, device=mask.device)  # zero out the diagonal (self-similarity)
174 | positives_mask = mask * logits_mask
175 | negatives_mask = 1. - mask
176 | '''
177 |         For the loss, position (i, i) is a sample's similarity with itself, which is useless and must be masked out
178 |         # position (ind, ind) of each row is set to 0
179 |         the resulting logits_mask:
180 | tensor([[0., 1., 1., 1.],
181 | [1., 0., 1., 1.],
182 | [1., 1., 0., 1.],
183 | [1., 1., 1., 0.]])
184 | positives_mask:
185 | tensor([[0., 0., 1., 1.],
186 | [0., 0., 0., 0.],
187 | [1., 0., 0., 1.],
188 | [1., 0., 1., 0.]])
189 | negatives_mask:
190 | tensor([[0., 1., 0., 0.],
191 | [1., 0., 1., 1.],
192 | [0., 1., 0., 0.],
193 | [0., 1., 0., 0.]])
194 | '''
195 |         num_positives_per_row = torch.sum(positives_mask, axis=1)  # number of positives per anchor, excluding itself, e.g. [2, 0, 2, 2]
196 | denominator = torch.sum(
197 | exp_logits * negatives_mask, axis=1, keepdims=True) + torch.sum(
198 | exp_logits * positives_mask, axis=1, keepdims=True)
199 |
200 | log_probs = logits - torch.log(denominator)
201 | if torch.any(torch.isnan(log_probs)):
202 | raise ValueError("Log_prob has nan!")
203 |
204 | log_probs = torch.sum(
205 | log_probs * positives_mask, axis=1)[num_positives_per_row > 0] / num_positives_per_row[
206 | num_positives_per_row > 0]
207 | '''
208 |         Compute the average log-likelihood over positive samples.
209 |         A class may contain only one sample and thus have no positives, e.g. the second class in labels [1, 2, 1, 1],
210 |         so only rows with at least one positive are used here.
211 | '''
212 | # loss
213 | loss = -log_probs
214 | if self.scale_by_temperature:
215 | loss *= self.temperature
216 | loss = loss.mean()
217 | return loss
--------------------------------------------------------------------------------
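[Note] A minimal SupConLoss sketch (not part of the repository; the feature dimension is made up and the labels mirror the docstring example). Features are L2-normalized inside forward():

    import torch
    from models.utils.functions import SupConLoss

    criterion = SupConLoss(temperature=0.5)
    features = torch.randn(4, 16)          # [batch_size, hidden_dim]
    labels = torch.tensor([1, 2, 1, 1])    # sample 1 has no positives and is skipped in the loss
    loss = criterion(features, labels)     # scalar tensor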
/models/utils/load_pretrained.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from .config import OptConfig
4 |
5 | def load_from_opt_record(file_path):
6 | opt_content = json.load(open(file_path, 'r'))
7 | opt = OptConfig()
8 | opt.load(opt_content)
9 | return opt
10 |
11 | def load_pretrained_model(model_class, checkpoints_dir, cv, gpu_ids):
12 | path = os.path.join(checkpoints_dir, str(cv))
13 | config_path = os.path.join(checkpoints_dir, 'train_opt.conf')
14 | config = load_from_opt_record(config_path)
15 | config.isTrain = False # teacher model should be in test mode
16 | config.gpu_ids = gpu_ids # set gpu to the same
17 | model = model_class(config)
18 | model.cuda()
19 | model.load_networks_cv(path)
20 | model.eval()
21 | return model
22 |
--------------------------------------------------------------------------------
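[Note] A hedged sketch of calling load_pretrained_model (not part of the repository). The checkpoint directory, fold index, and model class below are assumptions; the directory must already contain train_opt.conf and the saved networks, and a GPU is required because the helper calls model.cuda():

    from models.our.our_model import ourModel
    from models.utils.load_pretrained import load_pretrained_model

    # hypothetical layout: checkpoints/pretrain/train_opt.conf and checkpoints/pretrain/1/...
    teacher = load_pretrained_model(ourModel, 'checkpoints/pretrain', cv=1, gpu_ids=[0])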
/models/utils/time_track.py:
--------------------------------------------------------------------------------
1 | import time
2 | from functools import partial
3 |
4 |
5 | def base_time_desc_decorator(method, desc='test_description'):
6 | def timed(*args, **kwargs):
7 |
8 | # Print Description
9 | # print('#' * 50)
10 | print(desc)
11 | # print('#' * 50 + '\n')
12 |
13 | # Calculation Runtime
14 | start = time.time()
15 |
16 | # Run Method
17 | try:
18 | result = method(*args, **kwargs)
19 | except TypeError:
20 | result = method(**kwargs)
21 |
22 | # Print Runtime
23 |         print('Done! It took {:.2f} secs\n'.format(time.time() - start))
24 |
25 | if result is not None:
26 | return result
27 |
28 | return timed
29 |
30 |
31 | def time_desc_decorator(desc): return partial(base_time_desc_decorator, desc=desc)
32 |
33 |
34 | @time_desc_decorator('this is description')
35 | def time_test(arg, kwarg='this is kwarg'):
36 | time.sleep(3)
37 | print('Inside of time_test')
38 | print('printing arg: ', arg)
39 | print('printing kwarg: ', kwarg)
40 |
41 |
42 | @time_desc_decorator('this is second description')
43 | def no_arg_method():
44 | print('this method has no argument')
45 |
46 |
47 | if __name__ == '__main__':
48 | time_test('hello', kwarg=3)
49 | time_test(3)
50 | no_arg_method()
51 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn==1.5.1
2 | torch
3 | pandas
4 |
--------------------------------------------------------------------------------
/scripts/Track1/train_1s_binary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory
6 | AUDIOFEATURE_METHOD="mfccs" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="openface" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=2 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track1"
11 | FEATURE_MAX_LEN=26 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=1
13 | LR=0.00002
14 | NUM_EPOCHS=200
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
--------------------------------------------------------------------------------
/scripts/Track1/train_1s_quinary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory
6 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="densenet" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=5 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track1"
11 | FEATURE_MAX_LEN=26 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=1
13 | LR=1.65701813672055e-5
14 | NUM_EPOCHS=400
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
54 |
--------------------------------------------------------------------------------
/scripts/Track1/train_1s_ternary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory
6 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="resnet" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=3 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track1"
11 | FEATURE_MAX_LEN=26 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=2
13 | LR=4.58358993791005e-06
14 | NUM_EPOCHS=400
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
54 |
--------------------------------------------------------------------------------
/scripts/Track1/train_5s_binary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory
6 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="resnet" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="5s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=2 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track1"
11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=2
13 | LR=0.000018
14 | NUM_EPOCHS=200
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
--------------------------------------------------------------------------------
/scripts/Track1/train_5s_quinary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory
6 | AUDIOFEATURE_METHOD="mfccs" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="resnet" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="5s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=5 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track1"
11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=16
13 | LR=0.000098
14 | NUM_EPOCHS=200
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
--------------------------------------------------------------------------------
/scripts/Track1/train_5s_ternary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="E:/MDPP_data/MPDD-Elderly" # Dataset root directory
6 | AUDIOFEATURE_METHOD="wav2vec" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="openface" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="5s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=3 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track1"
11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=16
13 | LR=0.000147971824491024
14 | NUM_EPOCHS=400
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
54 |
--------------------------------------------------------------------------------
/scripts/Track2/train_1s_binary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="D:/HACI/MMchallenge/NEUQdata" # Dataset root directory
6 | AUDIOFEATURE_METHOD="wav2vec" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="openface" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=2 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track2"
11 | FEATURE_MAX_LEN=25 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=16
13 | LR=0.00006
14 | NUM_EPOCHS=500
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
--------------------------------------------------------------------------------
/scripts/Track2/train_1s_ternary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="D:/HACI/MMchallenge/NEUQdata" # Dataset root directory
6 | AUDIOFEATURE_METHOD="mfccs" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="densenet" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=3 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track2"
11 | FEATURE_MAX_LEN=25 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=8
13 | LR=0.00026
14 | NUM_EPOCHS=500
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
--------------------------------------------------------------------------------
/scripts/Track2/train_5s_binary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="D:/HACI/MMchallenge/NEUQdata" # Dataset root directory
6 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="resnet" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="5s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=2 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track2"
11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=24
13 | LR=0.00005
14 | NUM_EPOCHS=500
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
--------------------------------------------------------------------------------
/scripts/Track2/train_5s_ternary.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | data_rootpath="D:/HACI/MMchallenge/NEUQdata" # Dataset root directory
6 | AUDIOFEATURE_METHOD="mfccs" # Audio feature type, options {wav2vec, opensmile, mfccs}
7 | VIDEOLFEATURE_METHOD="densenet" # Video feature type, options {openface, resnet, densenet}
8 | SPLITWINDOW="5s" # Window duration, options {"1s", "5s"}
9 | LABELCOUNT=3 # Number of label categories, options {2, 3, 5}
10 | TRACK_OPTION="Track2"
11 | FEATURE_MAX_LEN=5 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding
12 | BATCH_SIZE=8
13 | LR=0.0004
14 | NUM_EPOCHS=500
15 | DEVICE="cpu" # Options {cuda, cpu}
16 |
17 |
18 | for arg in "$@"; do
19 | case $arg in
20 | --data_rootpath=*) data_rootpath="${arg#*=}" ;;
21 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
22 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
23 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
24 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
25 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
26 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
27 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
28 | --lr=*) LR="${arg#*=}" ;;
29 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
30 | --device=*) DEVICE="${arg#*=}" ;;
31 | *) echo "Unknown option: $arg"; exit 1 ;;
32 | esac
33 | done
34 |
35 | for i in `seq 1 1 1`; do
36 | cmd="python train.py \
37 | --data_rootpath=$data_rootpath \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --lr=$LR \
46 | --num_epochs=$NUM_EPOCHS \
47 | --device=$DEVICE"
48 |
49 | echo "\n-------------------------------------------------------------------------------------"
50 | echo "Execute command: $cmd"
51 | echo "-------------------------------------------------------------------------------------\n"
52 | echo $cmd | sh
53 | done
--------------------------------------------------------------------------------
/scripts/test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 |
4 | # Default Training Parameters
5 | DATA_ROOTPATH="E:/MDPP_data/MPDD-Elderly"
6 | TRAIN_MODEL="D:/HACI/MMchallenge/MEIJU2025-baseline-master/MPDD/checkpoints/1s_5labels_opensmile+densenet/best_model_2025-02-13-21.12.01.pth"
7 | AUDIOFEATURE_METHOD="opensmile" # Audio feature type, options {wav2vec, opensmile, mfccs}
8 | VIDEOLFEATURE_METHOD="densenet" # Video feature type, options {openface, resnet, densenet}
9 | SPLITWINDOW="1s" # Window duration, options {"1s", "5s"}
10 | LABELCOUNT=5 # Number of label categories, options {2, 3, 5}
11 | TRACK_OPTION="Track1"
12 | FEATURE_MAX_LEN=26 # Set maximum feature length; pad with zeros if insufficient, truncate if exceeding. For Track1, options {26, 5}; for Track2, options {25, 5}
13 | BATCH_SIZE=1
14 | DEVICE="cpu"
15 |
16 | for arg in "$@"; do
17 | case $arg in
18 | --data_rootpath=*) DATA_ROOTPATH="${arg#*=}" ;;
19 | --train_model=*) TRAIN_MODEL="${arg#*=}" ;;
20 | --audiofeature_method=*) AUDIOFEATURE_METHOD="${arg#*=}" ;;
21 | --videofeature_method=*) VIDEOLFEATURE_METHOD="${arg#*=}" ;;
22 | --splitwindow_time=*) SPLITWINDOW="${arg#*=}" ;;
23 | --labelcount=*) LABELCOUNT="${arg#*=}" ;;
24 | --track_option=*) TRACK_OPTION="${arg#*=}" ;;
25 | --feature_max_len=*) FEATURE_MAX_LEN="${arg#*=}" ;;
26 | --batch_size=*) BATCH_SIZE="${arg#*=}" ;;
27 | --lr=*) LR="${arg#*=}" ;;
28 | --num_epochs=*) NUM_EPOCHS="${arg#*=}" ;;
29 | --device=*) DEVICE="${arg#*=}" ;;
30 | *) echo "Unknown option: $arg"; exit 1 ;;
31 | esac
32 | done
33 |
34 | for i in `seq 1 1 1`; do
35 | cmd="python test.py \
36 | --data_rootpath=$DATA_ROOTPATH \
37 | --train_model=$TRAIN_MODEL \
38 | --audiofeature_method=$AUDIOFEATURE_METHOD \
39 | --videofeature_method=$VIDEOLFEATURE_METHOD \
40 | --splitwindow_time=$SPLITWINDOW \
41 | --labelcount=$LABELCOUNT \
42 | --track_option=$TRACK_OPTION \
43 | --feature_max_len=$FEATURE_MAX_LEN \
44 | --batch_size=$BATCH_SIZE \
45 | --device=$DEVICE"
46 |
47 | echo "\n-------------------------------------------------------------------------------------"
48 | echo "Execute command: $cmd"
49 | echo "-------------------------------------------------------------------------------------\n"
50 | echo $cmd | sh
51 | done
52 |
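
Before running this script it can help to confirm that TRAIN_MODEL points at a loadable checkpoint: train.py saves model.state_dict(), so the file should deserialize to an ordered dict of parameter tensors. A minimal sanity-check sketch (the path below is illustrative):

import torch

ckpt_path = "checkpoints/1s_5labels_opensmile+densenet/best_model_example.pth"  # illustrative path
state = torch.load(ckpt_path, map_location="cpu")

print(type(state))              # expected: an OrderedDict of parameter tensors
print(list(state.keys())[:5])   # first few parameter names, useful for spotting architecture mismatches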
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import json
4 | from models.our.our_model import ourModel
5 | from train import eval
6 | import argparse
7 | from utils.logger import get_logger
8 | import numpy as np
9 | import pandas as pd
10 | import time
11 | from torch.utils.data import DataLoader
12 | from dataset import *
13 |
14 | class Opt:
15 | def __init__(self, config_dict):
16 | self.__dict__.update(config_dict)
17 |
18 | def load_config(config_file):
19 | with open(config_file, 'r') as f:
20 | return json.load(f)
21 |
22 | if __name__ == '__main__':
23 |     parser = argparse.ArgumentParser(description="Test MPDD Model")
24 | parser.add_argument('--labelcount', type=int, default=2,
25 | help="Number of data categories (2, 3, or 5).")
26 | parser.add_argument('--track_option', type=str, required=True,
27 | help="Track1 or Track2")
28 | parser.add_argument('--feature_max_len', type=int, required=True,
29 | help="Max length of feature.")
30 | parser.add_argument('--data_rootpath', type=str, required=True,
31 | help="Root path to the program dataset")
32 | parser.add_argument('--train_model', type=str, required=True,
33 | help="Path to the training model")
34 |
35 | parser.add_argument('--test_json', type=str, required=False,
36 | help="File name of the testing JSON file")
37 | parser.add_argument('--personalized_features_file', type=str,
38 | help="File name of the personalized features file")
39 |
40 | parser.add_argument('--audiofeature_method', type=str, default='wav2vec',
41 | choices=['mfccs', 'opensmile', 'wav2vec'],
42 | help="Method for extracting audio features.")
43 | parser.add_argument('--videofeature_method', type=str, default='openface',
44 | choices=['openface', 'resnet', 'densenet'],
45 | help="Method for extracting video features.")
46 | parser.add_argument('--splitwindow_time', type=str, default='1s',
47 |                         help="Time window for split features, e.g. '1s' or '5s'")
48 |
49 | parser.add_argument('--batch_size', type=int, default=24,
50 | help="Batch size for testing")
51 | parser.add_argument('--lr', type=float, default=1e-4,
52 | help="Learning rate")
53 | parser.add_argument('--device', type=str, default='cpu',
54 | help="Device to test the model on, e.g. 'cuda' or 'cpu'")
55 |
56 | args = parser.parse_args()
57 |
58 | args.test_json = os.path.join(args.data_rootpath, 'Testing', 'labels', 'Testing_files.json')
59 | args.personalized_features_file = os.path.join(args.data_rootpath, 'Testing', 'individualEmbedding', 'descriptions_embeddings_with_ids.npy')
60 |
61 |
62 | config = load_config('config.json')
63 | opt = Opt(config)
64 |
65 | # Modify individual dynamic parameters in opt according to task category
66 | opt.emo_output_dim = args.labelcount
67 | opt.feature_max_len = args.feature_max_len
68 | opt.lr = args.lr
69 |
70 |     # Construct the feature folder paths from the selected audio and video feature types
71 | audio_path = os.path.join(args.data_rootpath, 'Testing', f"{args.splitwindow_time}", 'Audio', f"{args.audiofeature_method}") + '/'
72 | video_path = os.path.join(args.data_rootpath, 'Testing', f"{args.splitwindow_time}", 'Visual', f"{args.videofeature_method}") + '/'
73 |
74 | # Obtain input_dim_a, input_dim_v
75 | for filename in os.listdir(audio_path):
76 | if filename.endswith('.npy'):
77 | opt.input_dim_a = np.load(audio_path + filename).shape[1]
78 | break
79 |
80 | for filename in os.listdir(video_path):
81 | if filename.endswith('.npy'):
82 | opt.input_dim_v = np.load(video_path + filename).shape[1]
83 | break
84 |
85 | opt.name = f'{args.splitwindow_time}_{args.labelcount}labels_{args.audiofeature_method}+{args.videofeature_method}'
86 | logger_path = os.path.join(opt.log_dir, opt.name)
87 | if not os.path.exists(opt.log_dir):
88 | os.mkdir(opt.log_dir)
89 | if not os.path.exists(logger_path):
90 | os.mkdir(logger_path)
91 | logger = get_logger(logger_path, 'result')
92 |
93 | cur_time = time.strftime('%Y-%m-%d-%H.%M.%S', time.localtime(time.time()))
94 | best_model_name = f"best_model_{cur_time}.pth"
95 |
96 | logger.info(f"splitwindow_time={args.splitwindow_time}, audiofeature_method={args.audiofeature_method}, "
97 | f"videofeature_method={args.videofeature_method}")
98 |     logger.info(f"batch_size={args.batch_size}, "
99 | f"labels={opt.emo_output_dim}, feature_max_len={opt.feature_max_len}")
100 |
101 |
102 | model = ourModel(opt)
103 | model.load_state_dict(torch.load(args.train_model, map_location=torch.device(args.device)))
104 | model.to(args.device)
105 | test_data = json.load(open(args.test_json, 'r'))
106 | test_loader = DataLoader(
107 | AudioVisualDataset(test_data, args.labelcount, args.personalized_features_file, opt.feature_max_len,
108 | batch_size=args.batch_size,
109 | audio_path=audio_path, video_path=video_path,isTest=True), batch_size=args.batch_size, shuffle=False)
110 | logger.info('The number of testing samples = %d' % len(test_loader.dataset))
111 |
112 | # testing
113 | _, pred, *_ = eval(model, test_loader, args.device)
114 |
115 | filenames = [item["audio_feature_path"] for item in test_data if "audio_feature_path" in item]
116 | IDs = [path[:path.find('.')] for path in filenames]
117 |
118 | if args.labelcount==2:
119 | label="bin"
120 | elif args.labelcount==3:
121 | label="tri"
122 | elif args.labelcount==5:
123 | label="pen"
124 |
125 |
126 | # output results to CSV
127 | pred_col_name = f"{args.splitwindow_time}_{label}"
128 |
129 | result_dir = f"./answer_{args.track_option}"
130 | if not os.path.exists(result_dir):
131 | os.makedirs(result_dir)
132 |
133 | csv_file = f"{result_dir}/submission.csv"
134 |
135 | # Get the order of the IDs in the test data to ensure consistency
136 | test_ids = [np.int64(item["audio_feature_path"].split('_')[0])for item in test_data]
137 |
138 | if os.path.exists(csv_file):
139 | df = pd.read_csv(csv_file)
140 | else:
141 | df = pd.DataFrame(columns=["ID"])
142 |
143 | pred = np.array(pred)
144 |
145 | if len(pred) != len(test_data):
146 | logger.error(f"Prediction length {len(pred)} does not match test data length {len(test_data)}")
147 | raise ValueError("Mismatch between predictions and test data")
148 |
149 |     # zelin: use audio_feature_path with the .npy suffix stripped as the ID
150 | id_list = [item["audio_feature_path"].replace(".npy", "") for item in test_data]
151 |
152 |     # Build the prediction result DataFrame
153 | result_df = pd.DataFrame({
154 | "ID": id_list,
155 | pred_col_name: pred
156 | })
157 |
158 |     # If the CSV file already exists, merge on ID; otherwise create a new one
159 | if os.path.exists(csv_file):
160 | existing_df = pd.read_csv(csv_file)
161 |
162 |         # Merge the existing CSV with this run's results (keep all IDs, align columns automatically)
163 | merged_df = pd.merge(existing_df, result_df, on="ID", how="outer")
164 | else:
165 | merged_df = result_df
166 |
167 |     # Save the updated results (overwrite the file, keeping all previous columns plus this run's prediction column)
168 | merged_df.to_csv(csv_file, index=False)
169 | logger.info(f"Testing complete. Results saved to: {csv_file}. Shape={merged_df.shape}")
170 |
171 |
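
Each run of test.py writes a single prediction column named {splitwindow_time}_{bin|tri|pen} and outer-merges it into submission.csv on ID, so running the script once per setting accumulates all columns in one file. A toy illustration of that merge behaviour (IDs and values are made up):

import pandas as pd

existing = pd.DataFrame({"ID": ["001_01", "002_01"], "1s_bin": [0, 1]})  # result of an earlier run
new_run = pd.DataFrame({"ID": ["001_01", "002_01"], "5s_bin": [1, 1]})   # predictions from the current run

merged = pd.merge(existing, new_run, on="ID", how="outer")  # keep all IDs, add the new column
print(merged)
#        ID  1s_bin  5s_bin
# 0  001_01       0       1
# 1  002_01       1       1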
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import os
3 | import json
4 | import time
5 | import argparse
6 | import torch
7 | from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
8 | from torch.utils.data import DataLoader
9 | from train_val_split import train_val_split1, train_val_split2
10 | from models.our.our_model import ourModel
11 | from dataset import *
12 | from utils.logger import get_logger
13 | import numpy as np
14 |
15 | class Opt:
16 | def __init__(self, config_dict):
17 | self.__dict__.update(config_dict)
18 |
19 | def load_config(config_file):
20 | with open(config_file, 'r') as f:
21 | return json.load(f)
22 |
23 | def eval(model, val_loader, device):
24 | model.eval()
25 | total_emo_pred = []
26 | total_emo_label = []
27 |
28 | with torch.no_grad():
29 | for data in val_loader:
30 | for k, v in data.items():
31 | data[k] = v.to(device)
32 | model.set_input(data)
33 | model.test()
34 | emo_pred = model.emo_pred.argmax(dim=1).cpu().numpy()
35 | emo_label = data['emo_label'].cpu().numpy()
36 | total_emo_pred.append(emo_pred)
37 | total_emo_label.append(emo_label)
38 |
39 | total_emo_pred = np.concatenate(total_emo_pred)
40 | total_emo_label = np.concatenate(total_emo_label)
41 |
42 | emo_acc_unweighted = accuracy_score(total_emo_label, total_emo_pred, sample_weight=None)
43 | class_counts = np.bincount(total_emo_label) # Get the sample size for each category
44 | sample_weights = 1 / (class_counts[total_emo_label] + 1e-6) # Calculate weights for each sample to avoid division by zero errors
45 | emo_acc_weighted = accuracy_score(total_emo_label, total_emo_pred, sample_weight=sample_weights)
46 |
47 | emo_f1_weighted = f1_score(total_emo_label, total_emo_pred, average='weighted')
48 | emo_f1_unweighted = f1_score(total_emo_label, total_emo_pred, average='macro')
49 | emo_cm = confusion_matrix(total_emo_label, total_emo_pred)
50 |
51 | return total_emo_label,total_emo_pred,emo_acc_weighted, emo_acc_unweighted, emo_f1_weighted, emo_f1_unweighted, emo_cm
52 |
53 |
54 | def train_model(train_json, model, audio_path='', video_path='', max_len=5,
55 | best_model_name='best_model.pth', seed=None):
56 | """
57 |     This is the training function
58 | """
59 | logger.info(f'personalized features used:{args.personalized_features_file}')
60 | num_epochs = args.num_epochs
61 | device = args.device
62 | print(f"device: {device}")
63 | model.to(device)
64 |
65 | # split training and validation set
66 | # data = json.load(open(train_json, 'r'))
67 | if args.track_option=='Track1':
68 | train_data, val_data, train_category_count, val_category_count = train_val_split1(train_json, val_ratio=0.1, random_seed=seed)
69 | elif args.track_option=='Track2':
70 | train_data, val_data, train_category_count, val_category_count = train_val_split2(train_json, val_percentage=0.1,
71 | seed=seed)
72 |
73 | train_loader = DataLoader(
74 | AudioVisualDataset(train_data, args.labelcount, args.personalized_features_file, max_len,
75 | batch_size=args.batch_size,
76 | audio_path=audio_path, video_path=video_path), batch_size=args.batch_size, shuffle=True)
77 | val_loader = DataLoader(
78 | AudioVisualDataset(val_data, args.labelcount, args.personalized_features_file, max_len,
79 | batch_size=args.batch_size,
80 | audio_path=audio_path, video_path=video_path), batch_size=args.batch_size, shuffle=False)
81 |
82 | logger.info('The number of training samples = %d' % len(train_loader.dataset))
83 | logger.info('The number of val samples = %d' % len(val_loader.dataset))
84 |
85 | best_emo_acc = 0.0
86 | best_emo_f1 = 0.0
87 | best_emo_epoch = 1
88 | best_emo_cm = []
89 |
90 | for epoch in range(num_epochs):
91 | model.train(True)
92 | total_loss = 0
93 |
94 | for i, data in enumerate(train_loader):
95 | for k, v in data.items():
96 | data[k] = v.to(device)
97 | model.set_input(data)
98 | model.optimize_parameters(epoch)
99 |
100 | losses = model.get_current_losses()
101 | total_loss += losses['emo_CE']
102 |
103 | avg_loss = total_loss / len(train_loader)
104 |
105 | # evaluation
106 | label, pred, emo_acc_weighted, emo_acc_unweighted, emo_f1_weighted, emo_f1_unweighted, emo_cm = eval(model, val_loader,
107 | device)
108 |
109 | logger.info(f"Epoch {epoch + 1}/{num_epochs}, Avg Loss: {avg_loss:.10f}, "
110 | f"Weighted F1: {emo_f1_weighted:.10f}, Unweighted F1: {emo_f1_unweighted:.10f}, "
111 | f"Weighted Acc: {emo_acc_weighted:.10f}, Unweighted Acc: {emo_acc_unweighted:.10f}")
112 | logger.info('Confusion Matrix:\n{}'.format(emo_cm))
113 |
114 | if emo_f1_weighted > best_emo_f1:
115 | cur_time = time.strftime('%Y-%m-%d-%H.%M.%S', time.localtime(time.time()))
116 | best_emo_f1 = emo_f1_weighted
117 | best_emo_f1_unweighted = emo_f1_unweighted
118 | best_emo_acc = emo_acc_weighted
119 | best_emo_acc_unweighted = emo_acc_unweighted
120 | best_emo_cm = emo_cm
121 | best_emo_epoch = epoch + 1
122 | best_model = model
123 | save_path = os.path.join(os.path.join(opt.checkpoints_dir, opt.name), best_model_name)
124 | torch.save(model.state_dict(), save_path)
125 | print("Saved best model.")
126 |
127 | logger.info(f"Training complete. Random seed: {seed}. Best epoch: {best_emo_epoch}.")
128 | logger.info(f"Best Weighted F1: {best_emo_f1:.4f}, Best Unweighted F1: {best_emo_f1_unweighted:.4f}, "
129 | f"Best Weighted Acc: {best_emo_acc:.4f}, Best Unweighted Acc: {best_emo_acc_unweighted:.4f}.")
130 | logger.info('Confusion Matrix:\n{}'.format(best_emo_cm))
131 |
132 | # output results to CSV
133 | csv_file = f'{opt.log_dir}/{opt.name}.csv'
134 | formatted_best_emo_cm = ' '.join([f"[{' '.join(map(str, row))}]" for row in best_emo_cm])
135 | header = f"Time,random seed,splitwindow_time,labelcount,audiofeature_method,videofeature_method," \
136 | f"batch_size,num_epochs,feature_max_len,lr," \
137 | f"Weighted_F1,Unweighted_F1,Weighted_Acc,Unweighted_Acc,Confusion_Matrix"
138 | result_value = f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')},{seed},{args.splitwindow_time},{args.labelcount},{args.audiofeature_method},{args.videofeature_method}," \
139 | f"{args.batch_size},{args.num_epochs},{opt.feature_max_len},{opt.lr:.6f}," \
140 | f"{best_emo_f1:.4f},{best_emo_f1_unweighted:.4f},{best_emo_acc:.4f},{best_emo_acc_unweighted:.4f},{formatted_best_emo_cm}"
141 | file_exists = os.path.exists(csv_file)
142 | # Open file (append if file exists, create if it doesn't)
143 | with open(csv_file, mode='a') as file:
144 | if not file_exists:
145 | file.write(header + '\n')
146 | file.write(result_value + '\n')
147 |
148 | return best_emo_f1, best_emo_f1_unweighted, best_emo_acc, best_emo_acc_unweighted, best_emo_cm
149 |
150 |
151 | if __name__ == '__main__':
152 |     parser = argparse.ArgumentParser(description="Train MPDD Model")
153 | parser.add_argument('--labelcount', type=int, default=3,
154 | help="Number of data categories (2, 3, or 5).")
155 | parser.add_argument('--track_option', type=str, required=True,
156 | help="Track1 or Track2")
157 | parser.add_argument('--feature_max_len', type=int, required=True,
158 | help="Max length of feature.")
159 | parser.add_argument('--data_rootpath', type=str, required=True,
160 | help="Root path to the program dataset")
161 | parser.add_argument('--train_json', type=str, required=False,
162 | help="File name of the training JSON file")
163 | parser.add_argument('--personalized_features_file', type=str,
164 | help="File name of the personalized features file")
165 | parser.add_argument('--audiofeature_method', type=str, default='mfccs',
166 | choices=['mfccs', 'opensmile', 'wav2vec'],
167 | help="Method for extracting audio features.")
168 | parser.add_argument('--videofeature_method', type=str, default='densenet',
169 | choices=['openface', 'resnet', 'densenet'],
170 | help="Method for extracting video features.")
171 | parser.add_argument('--splitwindow_time', type=str, default='1s',
172 |                         help="Time window for split features, e.g. '1s' or '5s'")
173 |
174 | parser.add_argument('--batch_size', type=int, default=32,
175 | help="Batch size for training")
176 | parser.add_argument('--lr', type=float, default=1e-4,
177 | help="Learning rate")
178 | parser.add_argument('--num_epochs', type=int, default=10,
179 | help="Number of epochs to train the model")
180 | parser.add_argument('--device', type=str, default='cpu',
181 | help="Device to train the model on, e.g. 'cuda' or 'cpu'")
182 |
183 | args = parser.parse_args()
184 |
185 | args.train_json = os.path.join(args.data_rootpath, 'Training', 'labels', 'Training_Validation_files.json')
186 | args.personalized_features_file = os.path.join(args.data_rootpath, 'Training', 'individualEmbedding', 'descriptions_embeddings_with_ids.npy')
187 |
188 | config = load_config('config.json')
189 | opt = Opt(config)
190 |
191 | # Modify individual dynamic parameters in opt according to task category
192 | opt.emo_output_dim = args.labelcount
193 | opt.feature_max_len = args.feature_max_len
194 | opt.lr = args.lr
195 |
196 |     # Construct the feature folder paths from the selected audio and video feature types
197 | audio_path = os.path.join(args.data_rootpath, 'Training', f"{args.splitwindow_time}", 'Audio', f"{args.audiofeature_method}") + '/'
198 | video_path = os.path.join(args.data_rootpath, 'Training', f"{args.splitwindow_time}", 'Visual', f"{args.videofeature_method}") + '/'
199 |
200 | # Obtain input_dim_a, input_dim_v
201 | for filename in os.listdir(audio_path):
202 | if filename.endswith('.npy'):
203 | opt.input_dim_a = np.load(audio_path + filename).shape[1]
204 | break
205 |
206 | for filename in os.listdir(video_path):
207 | if filename.endswith('.npy'):
208 | opt.input_dim_v = np.load(video_path + filename).shape[1]
209 | break
210 |
211 |
212 | opt.name = f'{args.splitwindow_time}_{args.labelcount}labels_{args.audiofeature_method}+{args.videofeature_method}'
213 | logger_path = os.path.join(opt.log_dir, opt.name)
214 | if not os.path.exists(opt.log_dir):
215 | os.mkdir(opt.log_dir)
216 | if not os.path.exists(logger_path):
217 | os.mkdir(logger_path)
218 | logger = get_logger(logger_path, 'result')
219 |
220 | model = ourModel(opt)
221 |
222 | cur_time = time.strftime('%Y-%m-%d-%H.%M.%S', time.localtime(time.time()))
223 | best_model_name = f"best_model_{cur_time}.pth"
224 |
225 | logger.info(f"splitwindow_time={args.splitwindow_time}, audiofeature_method={args.audiofeature_method}, "
226 | f"videofeature_method={args.videofeature_method}")
227 | logger.info(f"batch_size={args.batch_size}, num_epochs={args.num_epochs}, "
228 | f"labels={opt.emo_output_dim}, feature_max_len={opt.feature_max_len}, lr={opt.lr}")
229 |
230 | # set random seed
231 | # seed = np.random.randint(0, 10000)
232 | seed = 3407
233 | np.random.seed(seed)
234 | torch.manual_seed(seed)
235 | torch.cuda.manual_seed_all(seed)
236 |
237 | logger.info(f"Using random seed: {seed}")
238 |
239 | # training
240 | train_model(
241 | train_json=args.train_json,
242 | model=model,
243 | max_len=opt.feature_max_len,
244 | best_model_name=best_model_name,
245 | audio_path=audio_path,
246 | video_path=video_path,
247 | seed=seed
248 | )
249 |
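
The weighted accuracy reported by eval() weights each validation sample by the inverse frequency of its class, so every class contributes equally regardless of how many samples it has. A small worked example of the same computation, with made-up labels:

import numpy as np
from sklearn.metrics import accuracy_score

labels = np.array([0, 0, 0, 1])   # imbalanced toy set: three samples of class 0, one of class 1
preds = np.array([0, 0, 0, 0])    # the single minority sample is misclassified

class_counts = np.bincount(labels)           # [3, 1]
weights = 1 / (class_counts[labels] + 1e-6)  # each sample weighted by 1 / its class count

print(accuracy_score(labels, preds))                         # 0.75: plain (unweighted) accuracy
print(accuracy_score(labels, preds, sample_weight=weights))  # ~0.50: both classes count equally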
--------------------------------------------------------------------------------
/train_val_split.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | from collections import defaultdict
4 | import math
5 | import numpy as np
6 | import torch
7 |
8 |
9 | def train_val_split2(file_path, val_percentage=0.10, seed=None):
10 | """
11 | Track2 dataset split
12 |
13 | Group data by person ID from the given JSON file and select 10% of the person IDs, ensuring that the proportion of each label remains consistent with the original data. For each person, either all their data is selected or none is selected.
14 |
15 | Parameters:
16 | - file_path: path to the JSON file
17 | - val_percentage: proportion of data to select (based on the number of person IDs, default is 10%)
18 | - seed: random seed
19 |
20 | Returns:
21 | - train_data: training set
22 | - val_data: validation set
23 | - train_category_count: total count of tri_category labels in the training set
24 | - val_category_count: total count of tri_category labels in the validation set
25 | """
26 |
27 | if seed is not None:
28 | random.seed(seed)
29 |
30 | with open(file_path, 'r') as file:
31 | data = json.load(file)
32 |
33 | grouped_by_person = defaultdict(list)
34 | for entry in data:
35 | # Extract person ID (assumes format "personID_topicID.npy")
36 | person_id = entry['audio_feature_path'].split('_')[0]
37 | grouped_by_person[person_id].append(entry)
38 |
39 | # Evenly distribute persons based on label category (young dataset is split according to tri_category)
40 | tri_category_person = defaultdict(list)
41 | for person_id, entries in grouped_by_person.items():
42 | tri_category = entries[0]['tri_category']
43 | tri_category_person[tri_category].append(person_id)
44 |
45 | total_person_count = len(grouped_by_person)
46 | num_persons_to_select = round(total_person_count * val_percentage)
47 |
48 | selected_person_ids = set()
49 |
50 | # Calculate the number of persons per category and the number to be selected
51 | selected_per_category = defaultdict(int)
52 | for category, person_ids in tri_category_person.items():
53 | num_category_persons = len(person_ids)
54 | num_category_to_select = round(num_category_persons * val_percentage + 0.001)
55 | selected_per_category[category] = num_category_to_select
56 |
57 | for category, person_ids in tri_category_person.items():
58 | num_category_to_select = selected_per_category[category]
59 | selected_person_ids.update(random.sample(person_ids, num_category_to_select))
60 |
61 | # Build the validation set data
62 | val_data = []
63 | for entry in data:
64 | person_id = entry['audio_feature_path'].split('_')[0]
65 | if person_id in selected_person_ids:
66 | val_data.append(entry)
67 |
68 | # Training set
69 | train_data = [entry for entry in data if entry not in val_data]
70 |
71 | # Count the total number of tri_category labels in train_data and val_data
72 | train_category_count = defaultdict(int)
73 | val_category_count = defaultdict(int)
74 |
75 | for entry in train_data:
76 | train_category_count[entry['tri_category']] += 1
77 |
78 | for entry in val_data:
79 | val_category_count[entry['tri_category']] += 1
80 | # Save train_data and val_data to JSON file (if needed)
81 |
82 |
83 | return train_data, val_data, train_category_count, val_category_count
84 |
85 | import json
86 | import random
87 | from collections import defaultdict
88 |
89 | def train_val_split1(file_path, val_ratio=0.1, random_seed=3407):
90 | """
91 | Track1 dataset split
92 |
93 | Read the JSON file and split it into training and validation sets according to the specified rules:
94 | - Data with label=4 are split in a 2:1 ratio;
95 | - Data with label=3 and id=69 are placed directly into the validation set;
96 | - The remaining data are split according to val_ratio.
97 |
98 | Ensure that:
99 | - Samples with the same ID do not appear in both the training and validation sets.
100 | - The return format is consistent with train_val_split.
101 |
102 | Parameters:
103 | file_path (str): path to the JSON data file
104 | val_ratio (float): proportion of the validation set, default is 0.1
105 | random_seed (int): random seed, default is 3407
106 |
107 | Returns:
108 | tuple: (training data list, validation data list, training set category counts, validation set category counts)
109 | """
110 | random.seed(random_seed)
111 |
112 | with open(file_path, 'r') as file:
113 | data = json.load(file)
114 |
115 | train_data, val_data = [], []
116 | label_to_ids = defaultdict(set)
117 | id_to_samples = defaultdict(list)
118 |
119 | for item in data:
120 | pen_category = item["pen_category"]
121 | id_ = item["id"]
122 | label_to_ids[pen_category].add(id_)
123 | id_to_samples[id_].append(item)
124 |
125 | train_ids, val_ids = set(), set()
126 |
127 | for pen_category, ids in label_to_ids.items():
128 | ids = list(ids)
129 |
130 | # Process label=4 (split in a 2:1 ratio)
131 | if pen_category == 4:
132 | for id_ in ids:
133 | samples = id_to_samples[id_]
134 | if len(samples) >= 3:
135 | random.shuffle(samples)
136 | train_data.extend(samples[:2])
137 | val_data.extend(samples[2:3])
138 | else:
139 | train_data.extend(samples)
140 | continue
141 |
142 | # Process the case of label=3 and id=69
143 | if pen_category == 3:
144 | for id_ in ids:
145 |                 if id_ == "69":  # ID 69 is placed directly into the validation set
146 | val_data.extend(id_to_samples[id_])
147 | else:
148 | train_data.extend(id_to_samples[id_])
149 | continue
150 |
151 | # Other categories are randomly split according to the proportion
152 | random.shuffle(ids)
153 | split_index = int(len(ids) * (1 - val_ratio))
154 | train_ids.update(ids[:split_index])
155 | val_ids.update(ids[split_index:])
156 |
157 | # Split data based on ID
158 | for id_ in train_ids:
159 | train_data.extend(id_to_samples[id_])
160 | for id_ in val_ids:
161 | val_data.extend(id_to_samples[id_])
162 |
163 | # Calculate category statistics
164 | train_category_count = defaultdict(int)
165 | val_category_count = defaultdict(int)
166 |
167 | for entry in train_data:
168 | train_category_count[entry['pen_category']] += 1
169 | for entry in val_data:
170 | val_category_count[entry['pen_category']] += 1
171 |
172 | return train_data, val_data, train_category_count, val_category_count
173 |
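
A minimal sketch of calling train_val_split2 on a toy label file; the entries only need the 'audio_feature_path' and 'tri_category' fields that the function reads (person IDs and labels below are made up):

import json
import tempfile
from train_val_split import train_val_split2

# Toy data: two clips for each of ten fictitious persons, all with tri_category 0.
toy = [{"audio_feature_path": f"{pid:03d}_{clip}.npy", "tri_category": 0}
       for pid in range(1, 11) for clip in (1, 2)]

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(toy, f)
    label_file = f.name

train, val, train_counts, val_counts = train_val_split2(label_file, val_percentage=0.1, seed=3407)
print(len(train), len(val), dict(train_counts), dict(val_counts))  # e.g. 18 2 {0: 18} {0: 2}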
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__init__.py
--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/logger.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__pycache__/logger.cpython-311.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/logger.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hacilab/MPDD/bf2d4f33dbe58d1600686d6440e8ef3b4b53ca1d/utils/__pycache__/logger.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/image_pool.py:
--------------------------------------------------------------------------------
1 | import random
2 | import torch
3 |
4 |
5 | class ImagePool():
6 | """This class implements an image buffer that stores previously generated images.
7 |
8 | This buffer enables us to update discriminators using a history of generated images
9 | rather than the ones produced by the latest generators.
10 | """
11 |
12 | def __init__(self, pool_size):
13 | """Initialize the ImagePool class
14 |
15 | Parameters:
16 | pool_size (int) -- the size of image buffer, if pool_size=0, no buffer will be created
17 | """
18 | self.pool_size = pool_size
19 | if self.pool_size > 0: # create an empty pool
20 | self.num_imgs = 0
21 | self.images = []
22 |
23 | def query(self, images):
24 | """Return an image from the pool.
25 |
26 | Parameters:
27 | images: the latest generated images from the generator
28 |
29 | Returns images from the buffer.
30 |
31 | By 50/100, the buffer will return input images.
32 | By 50/100, the buffer will return images previously stored in the buffer,
33 | and insert the current images to the buffer.
34 | """
35 | if self.pool_size == 0: # if the buffer size is 0, do nothing
36 | return images
37 | return_images = []
38 | for image in images:
39 | image = torch.unsqueeze(image.data, 0)
40 | if self.num_imgs < self.pool_size: # if the buffer is not full; keep inserting current images to the buffer
41 | self.num_imgs = self.num_imgs + 1
42 | self.images.append(image)
43 | return_images.append(image)
44 | else:
45 | p = random.uniform(0, 1)
46 | if p > 0.5: # by 50% chance, the buffer will return a previously stored image, and insert the current image into the buffer
47 | random_id = random.randint(0, self.pool_size - 1) # randint is inclusive
48 | tmp = self.images[random_id].clone()
49 | self.images[random_id] = image
50 | return_images.append(tmp)
51 | else: # by another 50% chance, the buffer will return the current image
52 | return_images.append(image)
53 | return_images = torch.cat(return_images, 0) # collect all the images and return
54 | return return_images
55 |
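
A minimal sketch of the buffer in use, with random tensors standing in for generated images (shapes are arbitrary):

import torch
from utils.image_pool import ImagePool

pool = ImagePool(pool_size=4)
fake = torch.randn(2, 3, 8, 8)   # a batch of two "generated images"

out = pool.query(fake)           # while the pool is still filling, inputs pass straight through
print(out.shape)                 # torch.Size([2, 3, 8, 8])

pool.query(torch.randn(2, 3, 8, 8))
print(len(pool.images))          # 4: at most pool_size images are kept for replay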
--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
1 | import time
2 | import os
3 | import logging
4 | # import fcntl
5 |
6 | def get_logger(path, suffix):
7 | cur_time = time.strftime('%Y-%m-%d-%H.%M.%S',time.localtime(time.time()))
8 | logger = logging.getLogger(__name__+cur_time)
9 | logger.setLevel(level = logging.INFO)
10 | handler = logging.FileHandler(os.path.join(path, f"{suffix}_{cur_time}.log"))
11 | handler.setLevel(logging.INFO)
12 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
13 | handler.setFormatter(formatter)
14 |
15 | console = logging.StreamHandler()
16 | console.setLevel(logging.INFO)
17 |
18 | logger.addHandler(handler)
19 | logger.addHandler(console)
20 | return logger
21 |
22 | class ResultRecorder(object):
23 | def __init__(self, path, total_cv=10):
24 | self.path = path
25 | self.total_cv = total_cv
26 | if not os.path.exists(self.path):
27 | f = open(self.path, 'w')
28 | # f.write('acc\tuar\tf1\n')
29 | f.write('emo_metric\tint_metric\tjoint_metric\n')
30 | f.close()
31 |
32 | def is_full(self, content):
33 | if len(content) < self.total_cv+1:
34 | return False
35 |
36 | for line in content:
37 | if not len(line.split('\t')) == 3:
38 | return False
39 | return True
40 |
41 | def calc_mean(self, content):
42 | # acc = [float(line.split('\t')[0]) for line in content[1:]]
43 | # uar = [float(line.split('\t')[1]) for line in content[1:]]
44 | # f1 = [float(line.split('\t')[2]) for line in content[1:]]
45 | # mean_acc = sum(acc) / len(acc)
46 | # mean_uar = sum(uar) / len(uar)
47 | # mean_f1 = sum(f1) / len(f1)
48 | # return mean_acc, mean_uar, mean_f1
49 | emo_metric = [float(line.split('\t')[0]) for line in content[1:]]
50 | int_metric = [float(line.split('\t')[1]) for line in content[1:]]
51 | joint_metric = [float(line.split('\t')[2]) for line in content[1:]]
52 | mean_emo_metric = sum(emo_metric) / len(emo_metric)
53 | mean_int_metric = sum(int_metric) / len(int_metric)
54 | mean_joint_metric = sum(joint_metric) / len(joint_metric)
55 | return mean_emo_metric, mean_int_metric, mean_joint_metric
56 |
57 | def write_result_to_tsv(self, results, cvNo):
58 |         # Use fcntl to lock the file so that multiple processes do not write to it at the same time
59 | f_in = open(self.path)
60 |         # fcntl.flock(f_in.fileno(), fcntl.LOCK_EX)  # acquire the lock
61 | content = f_in.readlines()
62 | if len(content) < self.total_cv+1:
63 | content += ['\n'] * (self.total_cv-len(content)+1)
64 | keys = [item for item in results.keys()]
65 | # content[cvNo] = '{:.4f}\t{:.4f}\t{:.4f}\n'.format(results[keys[0]], results[keys[1]], results[keys[2]])
66 | content[cvNo] = '{:.4f}\n'.format(results[keys[0]])
67 |
68 | if self.is_full(content):
69 | mean_emo_metric, mean_int_metric, mean_joint_metric = self.calc_mean(content)
70 | # content.append('{:.4f}\t{:.4f}\t{:.4f}\n'.format(mean_emo_metric, mean_int_metric, mean_joint_metric))
71 | content.append('{:.4f}\n'.format(mean_emo_metric))
72 |
73 | f_out = open(self.path, 'w')
74 | f_out.writelines(content)
75 | f_out.close()
76 |         f_in.close()  # release the lock
77 |
78 |
79 | class LossRecorder(object):
80 | def __init__(self, path, total_cv=10, total_epoch=40):
81 | self.path = path
82 | self.total_epoch = total_epoch
83 | self.total_cv = total_cv
84 | if not os.path.exists(self.path):
85 | f = open(self.path, 'w')
86 | f.close()
87 |
88 | def is_full(self, content):
89 | if len(content) < self.total_cv + 1:
90 | return False
91 |
92 | for line in content:
93 | if not len(line.split('\t')) == 3:
94 | return False
95 | return True
96 |
97 | def calc_mean(self, content):
98 | loss_list = [[] * self.total_cv] * self.total_epoch
99 | mean_list = [[] * self.total_cv] * self.total_epoch
100 | for i in range(0, self.total_epoch):
101 | loss_list[i] = [float(line.split('\t')[i]) for line in content[1:]]
102 | for i in range(0, self.total_epoch):
103 | mean_list[i] = sum(loss_list[i]) / len(loss_list[i])
104 | return mean_list
105 |
106 | def write_result_to_tsv(self, results, cvNo):
107 |         # Use fcntl to lock the file so that multiple processes do not write to it at the same time
108 | f_in = open(self.path)
109 |         # fcntl.flock(f_in.fileno(), fcntl.LOCK_EX)  # acquire the lock
110 | content = f_in.readlines()
111 | if len(content) < self.total_cv + 1:
112 | content += ['\n'] * (self.total_cv - len(content) + 1)
113 | string = ''
114 | for i in results:
115 | string += str(i.numpy())[:8]
116 | string += '\t'
117 | content[cvNo] = string + '\n'
118 |
119 | f_out = open(self.path, 'w')
120 | f_out.writelines(content)
121 | f_out.close()
122 |         f_in.close()  # release the lock
123 |
124 | def read_result_from_tsv(self,):
125 | f_out = open(self.path)
126 | # fcntl.flock(f_out.fileno(), fcntl.LOCK_EX)
127 | content = f_out.readlines()
128 | loss_list = [[] * self.total_cv] * self.total_epoch
129 | for i in range(0, self.total_epoch):
130 | loss_list[i] = [float(line.split('\t')[i]) for line in content[1:]]
131 | mean = self.calc_mean(content)
132 | return mean
133 |
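
A minimal usage sketch for get_logger; the target directory is illustrative and must already exist, since the function only attaches a FileHandler inside it:

import os
from utils.logger import get_logger

log_dir = "./logs/demo"            # illustrative; train.py and test.py build this from opt.log_dir and opt.name
os.makedirs(log_dir, exist_ok=True)

logger = get_logger(log_dir, "result")
logger.info("hello from MPDD")     # written to ./logs/demo/result_<timestamp>.log and echoed to the console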
--------------------------------------------------------------------------------