├── .DS_Store
├── .idea
│   ├── .gitignore
│   ├── OGM-GE_CVPR2022.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── LICENSE
├── OGM_Sup.pdf
├── README.md
├── ckpt
│   └── .DS_Store
├── data
│   ├── .DS_Store
│   ├── CREMAD
│   │   ├── .DS_Store
│   │   ├── data.csv
│   │   ├── readme.md
│   │   ├── stat.csv
│   │   ├── test.csv
│   │   ├── train.csv
│   │   └── video_preprocessing.py
│   ├── KineticSound
│   │   ├── my_test.txt
│   │   ├── my_train.txt
│   │   └── process_audio.py
│   └── VGGSound
│       ├── .DS_Store
│       ├── mp4_to_wav.py
│       ├── vggsound.csv
│       └── video_preprocessing.py
├── dataset
│   ├── CramedDataset.py
│   ├── VGGSoundDataset.py
│   └── dataset.py
├── demo
│   ├── algorithom.PNG
│   ├── demo_guitar.PNG
│   ├── demo_snow.PNG
│   ├── five lines.PNG
│   └── pipeline.PNG
├── main.py
├── models
│   ├── .DS_Store
│   ├── backbone.py
│   ├── basic_model.py
│   ├── fusion_modules.py
│   └── old_models
│       ├── .DS_Store
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-36.pyc
│       │   ├── __init__.cpython-37.pyc
│       │   ├── avmodel.cpython-36.pyc
│       │   ├── avmodel.cpython-37.pyc
│       │   ├── avmodel_3.cpython-37.pyc
│       │   ├── avmodel_am.cpython-37.pyc
│       │   ├── avmodel_att.cpython-37.pyc
│       │   ├── avmodel_cma.cpython-37.pyc
│       │   ├── avmodel_demo.cpython-37.pyc
│       │   ├── avmodel_gate.cpython-37.pyc
│       │   ├── avmodel_gradblending.cpython-37.pyc
│       │   ├── avmodel_md.cpython-37.pyc
│       │   ├── avmodel_psp.cpython-37.pyc
│       │   ├── avmodel_uni.cpython-37.pyc
│       │   ├── avmodel_x.cpython-37.pyc
│       │   ├── encodera.cpython-36.pyc
│       │   ├── encodera.cpython-37.pyc
│       │   ├── encoderv.cpython-36.pyc
│       │   └── encoderv.cpython-37.pyc
│       ├── avmodel.py
│       ├── avmodel_x.py
│       ├── encodera.py
│       └── encoderv.py
└── utils
    ├── evaluation.py
    └── utils.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/.DS_Store
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/OGM-GE_CVPR2022.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 GeWu-Lab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/OGM_Sup.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/OGM_Sup.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Official OGM-GE in PyTorch
2 |
3 |
4 | Here is the official PyTorch implementation of OGM-GE proposed in "*Balanced Multimodal Learning via On-the-fly Gradient Modulation*", which is a flexible plug-in module to enhance the optimization process of multimodal learning. Please refer to our [CVPR 2022 paper](https://arxiv.org/abs/2203.15332) for more details.
5 |
6 | **Paper Title: "Balanced Multimodal Learning via On-the-fly Gradient Modulation"**
7 |
8 | **Authors: Xiaokang Peng\*, [Yake Wei\*](https://echo0409.github.io/), [Andong Deng](https://dengandong.github.io/), [Dong Wang](https://redwang.github.io/) and [Di Hu](https://dtaoo.github.io/index.html)**
9 |
10 | **Accepted by: IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2022, Oral Presentation)**
11 |
12 | **[[arXiv](https://arxiv.org/abs/2203.15332)]** **[[Supplementary Material](https://github.com/GeWu-Lab/OGM-GE_CVPR2022/blob/main/OGM_Sup.pdf)]**
13 |
14 |
15 | ## News
16 | - We release a balanced audiovisual dataset for imbalanced multi-modal learning analysis! [Project Page](https://gewu-lab.github.io/Balanced-Audiovisual-Dataset/), [Paper](https://arxiv.org/abs/2302.10912)
17 | - We further develop an imbalance-mitigating method, MMCosine, for audio-visual fine-grained tasks! The [paper](https://arxiv.org/abs/2303.05338) has been accepted by ICASSP 2023. [Project Page](https://gewu-lab.github.io/MMCosine/)
18 | - The effectiveness of the OGM-GE method has been corroborated by the work of several other researchers.
19 |
20 | | Task | Dataset |Modalities | w/o OGM-GE | w/ OGM-GE or similar | Source |
21 | |--------------------|--------------|--------------|------|------|------|
22 | | Action Recognition | UCF101 |RGB, Optical-Flow | 82.3 | 84.0 | [1] |
23 | | Knowledge Graph Link Prediction | OpenBG-Complete-IMG+ |Image, OCR | 59.4 | 60.1 | [2] |
24 |
25 | [1] [On Uni-modal Feature Learning In Supervised Multi-modal Learning](https://openreview.net/pdf?id=mb7VM83DkyC)
26 |
27 | [2] [IMKGA-SM: Interpretable Multimodal Knowledge Graph Answer Prediction via Sequence Modeling](https://arxiv.org/pdf/2301.02445.pdf)
28 |
29 | - Recent works inspired by OGM-GE:
30 |
31 | [PMR: Prototypical Modal Rebalance for Multimodal Learning](https://openaccess.thecvf.com/content/CVPR2023/papers/Fan_PMR_Prototypical_Modal_Rebalance_for_Multimodal_Learning_CVPR_2023_paper.pdf) CVPR 2023.
32 |
33 | [Graph Interactive Network with Adaptive Gradient for Multi-Modal Rumor Detection](https://dl.acm.org/doi/abs/10.1145/3591106.3592250) ICMR 2023.
34 |
35 | [MMCosine: Multi-Modal Cosine Loss Towards Balanced Audio-Visual Fine-Grained Learning](https://arxiv.org/pdf/2303.05338.pdf) ICASSP 2023.
36 |
37 | [Make Acoustic and Visual Cues Matter: CH-SIMS v2.0 Dataset and AV-Mixup Consistent Module](https://dl.acm.org/doi/pdf/10.1145/3536221.3556630) ICMI 2022.
38 |
39 | ## What is the imbalance phenomenon in multimodal learning task?
40 | We observe that **the potential of multimodal information is not fully exploited even when the multimodal model outperforms its uni-modal counterpart.** We conduct linear probing experiments to explore the quality of the jointly trained encoders and find them under-optimized (the yellow line) compared with the uni-modal model (the red line). We propose the OGM-GE method to adaptively improve the optimization process and achieve consistent improvement (the blue line). As shown in the following figure, we improve both the multimodal performance and the uni-modal representations.
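To make the linear-probing protocol concrete, here is a minimal, hypothetical sketch (the function and argument names are illustrative and not part of this repository): freeze a jointly trained uni-modal encoder and fit only a linear classifier on its features, assuming the encoder maps an input batch to `feature_dim`-dimensional vectors.

```python
import torch
import torch.nn as nn

def linear_probe(encoder, feature_dim, n_classes, loader, epochs=10, device='cuda'):
    """Train a linear classifier on top of a frozen encoder (linear probing)."""
    encoder.eval()
    for p in encoder.parameters():
        p.requires_grad = False          # the encoder stays fixed

    probe = nn.Linear(feature_dim, n_classes).to(device)
    optimizer = torch.optim.SGD(probe.parameters(), lr=1e-3, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    for _ in range(epochs):
        for x, label in loader:
            x, label = x.to(device), label.to(device)
            with torch.no_grad():
                feat = encoder(x)        # assumed to return (batch, feature_dim) features
            loss = criterion(probe(feat), label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    return probe
```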
41 |
42 |
43 |
44 |
45 | ## Method Introduction
46 | Pipeline of our OGM-GE method, consisting of two submodules:
47 | 1. On-the-fly Gradient Modulation (OGM), which is designed to adaptively balance the training between modalities;
48 | 2. Adaptive Gaussian noise Enhancement (GE), which restores the gradient intensity and improves generalization (the modulation rule is summarized right below).
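For reference, the modulation coefficient that OGM applies to the currently dominant modality (Eq. (10) in the paper, also quoted in the comments of `main.py`) can be written as:

```math
k_t^u =
\begin{cases}
1 - \tanh\left(\alpha \cdot \rho_t^u\right), & \rho_t^u > 1 \\
1, & \text{otherwise}
\end{cases}
```

where the discrepancy ratio rho_t^u measures how much modality u currently dominates the other one at iteration t, and alpha is the hyper-parameter discussed in the Tips section below; GE then adds zero-mean Gaussian noise whose standard deviation matches that of the modulated gradient.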
49 |
50 |
51 |
52 |
53 |
54 | ## Main Dependencies
55 | + Ubuntu 16.04
56 | + CUDA Version: 11.1
57 | + PyTorch 1.8.1
58 | + torchvision 0.9.1
59 | + python 3.7.6
60 |
61 |
62 | ## Usage
63 | ### Data Preparation
64 | Download Original Dataset:
65 | [CREMA-D](https://github.com/CheyneyComputerScience/CREMA-D),
66 | [AVE](https://sites.google.com/view/audiovisualresearch),
67 | [VGGSound](https://www.robots.ox.ac.uk/~vgg/data/vggsound/),
68 | [Kinetics-Sounds](https://github.com/cvdfoundation/kinetics-dataset).
69 |
70 | [comment]: <> ([ESC50](https://github.com/karoldvl/ESC-50/archive/master.zip).)
71 |
72 |
73 | ### Pre-processing
74 |
75 | For the CREMA-D and VGGSound datasets, we provide code to pre-process the videos into RGB frames and audio WAV files in the ```data/``` directory.
76 |
77 | **[!! Attention]: For the audio modality, we convert the WAV files into spectrograms. [Here](https://github.com/GeWu-Lab/OGM-GE_CVPR2022/blob/main/data/KineticSound/process_audio.py) we provide one possible pre-processing method for the WAV files.**
78 |
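As a reference, here is a minimal sketch of turning a WAV file into a log-magnitude spectrogram, consistent with what the dataset classes under `dataset/` do. The helper name and the default parameters are placeholders; the exact sampling rate, STFT size and hop length differ per dataset (see `dataset/VGGSoundDataset.py` for one concrete choice).

```python
import librosa
import numpy as np

def wav_to_spectrogram(wav_path, sr=16000, n_fft=256, hop_length=128):
    """Load a WAV file and return a log-magnitude STFT spectrogram."""
    samples, _ = librosa.load(wav_path, sr=sr, mono=True)
    samples = np.clip(samples, -1.0, 1.0)          # clamp amplitudes as the dataset loaders do
    spec = librosa.stft(samples, n_fft=n_fft, hop_length=hop_length)
    return np.log(np.abs(spec) + 1e-7)
```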
79 | #### CREMA-D
80 |
81 | As the original CREMA-D dataset already provides the audio and video files, we simply extract the video frames by running:
82 |
83 | ```python data/CREMAD/video_preprocessing.py```
84 |
85 | Note that the relevant paths/directories should be changed according to your own environment.
86 |
87 | #### VGGSound
88 |
89 | As the original VGGSound dataset only provides the raw video files, we first extract the audio by running:
90 |
91 | ```python data/VGGSound/mp4_to_wav.py```
92 |
93 | Then, extract the video frames:
94 |
95 | ```python data/VGGSound/video_preprocessing.py```
96 |
97 | Note that the relevant paths/directories should be changed according to your own environment.
98 |
99 |
100 |
101 | ## Core code demo
102 |
103 | Our proposed OGM-GE works as a simple but useful plug-in for several widely used multimodal fusion frameworks. We display the abstract core code as follows:
104 | ```python
105 | ---in training step---
106 |
107 | # out_a and out_v estimate the uni-modal performance of the 'a' (audio) and 'v' (visual) modality.
108 | a, v, out = model(spec.unsqueeze(1).float(), image.float(), label, iteration)
109 | out_a = (torch.mm(a, torch.transpose(model.module.fc_.weight[:, :512], 0, 1)) + model.module.fc_.bias / 2)
110 | out_v = (torch.mm(v, torch.transpose(model.module.fc_.weight[:, 512:], 0, 1)) + model.module.fc_.bias / 2)
111 | loss = criterion(out, label)
112 |
113 | # Calculate the original gradients first.
114 | loss.backward()
115 |
116 | # Calculate the discrepancy ratio and the modulation coefficients k_a, k_v.
117 | k_a, k_v = calculate_coefficient(label, out_a, out_v)
118 |
119 | # Gradient Modulation begins before optimization, with GE applied.
120 | update_model_with_OGM_GE(model, k_a, k_v)
121 |
122 | # Optimize the modulated parameters.
123 | optimizer.step()
124 |
125 | ---continue with the next training step---
126 | ```
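`calculate_coefficient` and `update_model_with_OGM_GE` above are abstractions. A minimal sketch of what they could look like, following the modulation logic in `main.py` (which assumes the model is wrapped in `nn.DataParallel`, that the audio/visual encoder module names contain 'audio'/'visual', and that only their 4-D convolution gradients are modulated):

```python
import torch
import torch.nn as nn

softmax = nn.Softmax(dim=1)
tanh = nn.Tanh()
relu = nn.ReLU(inplace=True)

def calculate_coefficient(label, out_a, out_v, alpha=0.1):
    # Per-batch "score" of each modality: summed softmax probability of the ground-truth class.
    score_a = sum([softmax(out_a)[i][label[i]] for i in range(out_a.size(0))])
    score_v = sum([softmax(out_v)[i][label[i]] for i in range(out_v.size(0))])

    # Discrepancy ratios; the dominant modality gets a coefficient below 1 (Eq. (10)).
    ratio_v = score_v / score_a
    ratio_a = 1 / ratio_v
    if ratio_v > 1:
        return 1, 1 - tanh(alpha * relu(ratio_v))   # (k_a, k_v): suppress the visual gradient
    else:
        return 1 - tanh(alpha * relu(ratio_a)), 1   # (k_a, k_v): suppress the audio gradient

def update_model_with_OGM_GE(model, k_a, k_v):
    # Scale the encoder gradients and add zero-mean Gaussian noise whose std
    # matches that of the modulated gradient (the GE step).
    for name, parms in model.named_parameters():
        layer = str(name).split('.')[1]             # e.g. 'module.audio_net.conv1.weight' -> 'audio_net'
        if 'audio' in layer and len(parms.grad.size()) == 4:
            parms.grad = parms.grad * k_a + \
                torch.zeros_like(parms.grad).normal_(0, parms.grad.std().item() + 1e-8)
        if 'visual' in layer and len(parms.grad.size()) == 4:
            parms.grad = parms.grad * k_v + \
                torch.zeros_like(parms.grad).normal_(0, parms.grad.std().item() + 1e-8)
```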
127 |
128 | ### Default modulation setting
129 |
130 | ```--modulation OGM_GE --modulation_starts 0 --modulation_ends 50 --fusion_method concat --alpha 0.1```
131 |
132 | You can train your model simply by running:
133 |
134 | ```python main.py --dataset VGGSound --train```.
135 |
136 | You can also adapt to your own setting by adding additional arguments. For example, if you want to train our model on the CREMA-D dataset with the gated fusion method and OGM only (i.e., without GE), modulating the gradient from epoch 20 to epoch 80, you can run:
137 |
138 | ```python main.py --train --dataset CREMAD --fusion_method gated --modulation OGM --modulation_starts 20 --modulation_ends 80 --alpha 0.3```
139 |
140 |
141 |
142 | ## Test and Eval
143 |
144 | You can test the performance of a trained model by simply running:
145 |
146 | ```python main.py --ckpt_path /PATH-to-trained-ckpt ```
147 |
148 | ## Tips
149 |
150 | There is a hyper-parameter in OGM-GE, alpha, which depends on the modality discrepancy of each dataset.
151 | Here we recommend alpha=0.1 for VGGSound and alpha=0.8 for CREMA-D.
152 |
153 | ## Checkpoints
154 |
155 | [CREMA-D](https://zenodo.org/record/6778788)
156 |
157 |
158 | ## Demo explanation
159 |
160 |
161 |
162 |
163 |
164 |
165 | As shown in the pictures above, 'playing guitar' is a class where audio surpasses the visual modality for most samples ('shovelling snow' is just the opposite), and we can tell that audio achieves more adequate training and leads the optimization process. Our OGM-GE (as well as OGM) improves both uni-modal and multimodal performance, and the weaker visual modality benefits more. The evaluation metric used for 'audio' and 'visual' is the prediction accuracy computed from the classification scores of that single modality alone.
166 |
167 |
168 |
169 | ## Citation
170 | If you find this work useful, please consider citing it.
171 |
172 |
173 | @inproceedings{Peng2022Balanced,
174 | title = {Balanced Multimodal Learning via On-the-fly Gradient Modulation},
175 | author = {Peng, Xiaokang and Wei, Yake and Deng, Andong and Wang, Dong and Hu, Di},
176 | booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
177 | year = {2022}
178 | }
179 |
180 |
181 | ## Acknowledgement
182 |
183 | This research was supported by Public Computing Cloud, Renmin University of China.
184 |
185 | ## License
186 |
187 | This project is released under the [GNU General Public License v3.0](https://github.com/Mukosame/Zooming-Slow-Mo-CVPR-2020/blob/master/LICENSE).
188 |
189 |
190 | ## Contact us
191 |
192 | If you have any detailed questions or suggestions, you can email us:
193 | **yakewei@ruc.edu.cn** and **andongdeng69@gmail.com**
194 |
--------------------------------------------------------------------------------
/ckpt/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/ckpt/.DS_Store
--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/data/.DS_Store
--------------------------------------------------------------------------------
/data/CREMAD/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/data/CREMAD/.DS_Store
--------------------------------------------------------------------------------
/data/CREMAD/readme.md:
--------------------------------------------------------------------------------
1 | This directory stores the class list and the training/testing set division files used for loading data. Users can divide the training set into a training set and a validation set.
2 | Data files of CREMA-D are provided as an example.
3 |
--------------------------------------------------------------------------------
/data/CREMAD/stat.csv:
--------------------------------------------------------------------------------
1 | NEU
2 | HAP
3 | SAD
4 | FEA
5 | DIS
6 | ANG
7 |
--------------------------------------------------------------------------------
/data/CREMAD/test.csv:
--------------------------------------------------------------------------------
1 | 1041_TIE_NEU_XX,NEU
2 | 1071_MTI_SAD_XX,SAD
3 | 1028_WSI_DIS_XX,DIS
4 | 1016_IEO_HAP_LO,HAP
5 | 1087_IEO_FEA_LO,FEA
6 | 1082_ITH_HAP_XX,HAP
7 | 1071_WSI_NEU_XX,NEU
8 | 1089_IEO_DIS_MD,DIS
9 | 1056_ITH_NEU_XX,NEU
10 | 1091_TAI_SAD_XX,SAD
11 | 1068_ITS_DIS_XX,DIS
12 | 1011_IWL_ANG_XX,ANG
13 | 1052_ITS_NEU_XX,NEU
14 | 1050_IOM_ANG_XX,ANG
15 | 1030_IOM_ANG_XX,ANG
16 | 1027_IWW_SAD_XX,SAD
17 | 1014_TSI_HAP_XX,HAP
18 | 1032_DFA_NEU_XX,NEU
19 | 1026_IWL_SAD_XX,SAD
20 | 1035_TSI_ANG_XX,ANG
21 | 1010_WSI_SAD_XX,SAD
22 | 1039_IEO_SAD_HI,SAD
23 | 1063_DFA_DIS_XX,DIS
24 | 1064_IWW_HAP_XX,HAP
25 | 1072_IOM_SAD_XX,SAD
26 | 1023_TIE_DIS_XX,DIS
27 | 1040_IEO_SAD_HI,SAD
28 | 1088_DFA_FEA_XX,FEA
29 | 1027_IEO_FEA_HI,FEA
30 | 1054_WSI_ANG_XX,ANG
31 | 1048_MTI_NEU_XX,NEU
32 | 1026_TIE_FEA_XX,FEA
33 | 1035_IWW_ANG_XX,ANG
34 | 1089_ITS_NEU_XX,NEU
35 | 1038_IWL_ANG_XX,ANG
36 | 1035_IWL_ANG_XX,ANG
37 | 1011_IEO_SAD_HI,SAD
38 | 1017_IWL_HAP_XX,HAP
39 | 1087_DFA_DIS_XX,DIS
40 | 1040_TSI_DIS_XX,DIS
41 | 1019_IWL_SAD_XX,SAD
42 | 1090_IEO_HAP_LO,HAP
43 | 1018_TAI_SAD_XX,SAD
44 | 1027_IOM_NEU_XX,NEU
45 | 1076_DFA_NEU_XX,NEU
46 | 1037_TAI_NEU_XX,NEU
47 | 1091_TSI_NEU_XX,NEU
48 | 1047_TSI_NEU_XX,NEU
49 | 1076_IEO_SAD_MD,SAD
50 | 1024_IEO_ANG_HI,ANG
51 | 1001_WSI_NEU_XX,NEU
52 | 1091_IEO_HAP_MD,HAP
53 | 1075_IEO_HAP_HI,HAP
54 | 1079_TAI_NEU_XX,NEU
55 | 1077_TAI_DIS_XX,DIS
56 | 1041_ITS_NEU_XX,NEU
57 | 1021_MTI_ANG_XX,ANG
58 | 1002_MTI_HAP_XX,HAP
59 | 1005_IEO_SAD_LO,SAD
60 | 1034_MTI_HAP_XX,HAP
61 | 1026_ITH_HAP_XX,HAP
62 | 1077_DFA_NEU_XX,NEU
63 | 1060_DFA_NEU_XX,NEU
64 | 1081_TAI_DIS_XX,DIS
65 | 1084_MTI_HAP_XX,HAP
66 | 1013_WSI_ANG_XX,ANG
67 | 1071_IOM_DIS_XX,DIS
68 | 1030_TAI_NEU_XX,NEU
69 | 1053_IWL_SAD_XX,SAD
70 | 1048_MTI_FEA_XX,FEA
71 | 1027_IEO_DIS_MD,DIS
72 | 1037_TIE_HAP_XX,HAP
73 | 1066_IWL_NEU_XX,NEU
74 | 1022_TSI_DIS_XX,DIS
75 | 1016_MTI_FEA_XX,FEA
76 | 1049_ITH_FEA_XX,FEA
77 | 1070_IEO_SAD_HI,SAD
78 | 1002_IWL_NEU_XX,NEU
79 | 1039_IOM_HAP_XX,HAP
80 | 1061_TAI_NEU_XX,NEU
81 | 1012_ITH_NEU_XX,NEU
82 | 1040_ITS_DIS_XX,DIS
83 | 1066_ITH_SAD_XX,SAD
84 | 1016_TIE_SAD_XX,SAD
85 | 1044_IEO_SAD_HI,SAD
86 | 1089_TIE_ANG_XX,ANG
87 | 1064_IWL_FEA_XX,FEA
88 | 1032_IEO_DIS_LO,DIS
89 | 1078_DFA_SAD_XX,SAD
90 | 1003_IEO_FEA_MD,FEA
91 | 1082_IWL_FEA_XX,FEA
92 | 1028_TAI_FEA_XX,FEA
93 | 1052_IEO_ANG_MD,ANG
94 | 1006_TAI_FEA_XX,FEA
95 | 1070_IWL_FEA_XX,FEA
96 | 1030_IWW_DIS_XX,DIS
97 | 1063_IEO_SAD_HI,SAD
98 | 1089_WSI_DIS_XX,DIS
99 | 1026_MTI_SAD_XX,SAD
100 | 1009_TIE_ANG_XX,ANG
101 | 1075_IOM_HAP_XX,HAP
102 | 1041_IEO_SAD_LO,SAD
103 | 1037_DFA_FEA_XX,FEA
104 | 1026_TIE_DIS_XX,DIS
105 | 1019_TSI_SAD_XX,SAD
106 | 1036_DFA_HAP_XX,HAP
107 | 1061_IWW_FEA_XX,FEA
108 | 1027_TIE_HAP_XX,HAP
109 | 1066_IOM_HAP_XX,HAP
110 | 1012_IEO_NEU_XX,NEU
111 | 1003_IEO_FEA_HI,FEA
112 | 1016_IEO_ANG_MD,ANG
113 | 1070_IWL_ANG_XX,ANG
114 | 1020_IEO_ANG_LO,ANG
115 | 1010_IWW_FEA_XX,FEA
116 | 1043_IOM_ANG_XX,ANG
117 | 1069_TSI_HAP_XX,HAP
118 | 1073_IEO_ANG_LO,ANG
119 | 1045_WSI_FEA_XX,FEA
120 | 1061_TIE_HAP_XX,HAP
121 | 1004_MTI_ANG_XX,ANG
122 | 1058_TIE_NEU_XX,NEU
123 | 1044_ITS_ANG_XX,ANG
124 | 1081_IOM_NEU_XX,NEU
125 | 1044_IOM_SAD_XX,SAD
126 | 1004_IEO_ANG_HI,ANG
127 | 1060_ITS_HAP_XX,HAP
128 | 1090_ITS_DIS_XX,DIS
129 | 1005_ITS_ANG_XX,ANG
130 | 1053_IWW_ANG_XX,ANG
131 | 1089_IEO_FEA_HI,FEA
132 | 1009_WSI_NEU_XX,NEU
133 | 1035_ITH_DIS_XX,DIS
134 | 1075_TIE_DIS_XX,DIS
135 | 1023_IOM_FEA_XX,FEA
136 | 1062_IEO_FEA_LO,FEA
137 | 1085_IEO_ANG_MD,ANG
138 | 1091_TSI_ANG_XX,ANG
139 | 1020_ITS_SAD_XX,SAD
140 | 1035_IEO_SAD_MD,SAD
141 | 1007_ITH_ANG_XX,ANG
142 | 1064_TAI_ANG_XX,ANG
143 | 1058_ITS_NEU_XX,NEU
144 | 1034_ITS_ANG_XX,ANG
145 | 1068_TIE_FEA_XX,FEA
146 | 1013_MTI_HAP_XX,HAP
147 | 1037_IWW_FEA_XX,FEA
148 | 1055_TAI_FEA_XX,FEA
149 | 1011_IWL_DIS_XX,DIS
150 | 1053_TIE_DIS_XX,DIS
151 | 1033_IEO_ANG_HI,ANG
152 | 1032_IWW_HAP_XX,HAP
153 | 1010_IOM_HAP_XX,HAP
154 | 1060_ITS_NEU_XX,NEU
155 | 1051_WSI_FEA_XX,FEA
156 | 1088_IEO_DIS_LO,DIS
157 | 1042_IWW_ANG_XX,ANG
158 | 1046_IEO_ANG_MD,ANG
159 | 1073_IWW_FEA_XX,FEA
160 | 1006_IOM_ANG_XX,ANG
161 | 1019_ITS_SAD_XX,SAD
162 | 1056_ITS_FEA_XX,FEA
163 | 1025_DFA_DIS_XX,DIS
164 | 1055_IEO_SAD_HI,SAD
165 | 1086_ITH_SAD_XX,SAD
166 | 1035_IEO_HAP_LO,HAP
167 | 1085_ITS_HAP_XX,HAP
168 | 1078_MTI_NEU_XX,NEU
169 | 1067_IEO_ANG_HI,ANG
170 | 1042_IWL_DIS_XX,DIS
171 | 1020_TSI_FEA_XX,FEA
172 | 1037_ITH_ANG_XX,ANG
173 | 1022_IEO_DIS_LO,DIS
174 | 1029_IWW_SAD_XX,SAD
175 | 1071_MTI_HAP_XX,HAP
176 | 1020_TAI_HAP_XX,HAP
177 | 1011_IWL_FEA_XX,FEA
178 | 1067_IWL_FEA_XX,FEA
179 | 1008_IWL_NEU_XX,NEU
180 | 1058_TSI_NEU_XX,NEU
181 | 1074_TAI_DIS_XX,DIS
182 | 1053_IEO_ANG_HI,ANG
183 | 1059_IOM_DIS_XX,DIS
184 | 1091_DFA_HAP_XX,HAP
185 | 1035_TSI_HAP_XX,HAP
186 | 1054_WSI_DIS_XX,DIS
187 | 1082_TIE_HAP_XX,HAP
188 | 1053_TSI_FEA_XX,FEA
189 | 1041_TSI_SAD_XX,SAD
190 | 1033_TIE_DIS_XX,DIS
191 | 1080_TAI_SAD_XX,SAD
192 | 1007_MTI_ANG_XX,ANG
193 | 1006_IWL_ANG_XX,ANG
194 | 1080_IEO_DIS_HI,DIS
195 | 1047_TSI_HAP_XX,HAP
196 | 1037_TAI_SAD_XX,SAD
197 | 1046_DFA_NEU_XX,NEU
198 | 1055_IWL_DIS_XX,DIS
199 | 1086_IEO_FEA_LO,FEA
200 | 1073_DFA_SAD_XX,SAD
201 | 1026_IWW_NEU_XX,NEU
202 | 1077_DFA_HAP_XX,HAP
203 | 1006_TAI_ANG_XX,ANG
204 | 1077_WSI_DIS_XX,DIS
205 | 1018_TSI_NEU_XX,NEU
206 | 1065_IOM_ANG_XX,ANG
207 | 1037_MTI_ANG_XX,ANG
208 | 1039_IWL_HAP_XX,HAP
209 | 1070_IEO_DIS_LO,DIS
210 | 1030_TSI_FEA_XX,FEA
211 | 1046_IWL_SAD_XX,SAD
212 | 1033_MTI_DIS_XX,DIS
213 | 1059_ITH_NEU_XX,NEU
214 | 1037_TSI_HAP_XX,HAP
215 | 1077_TIE_FEA_XX,FEA
216 | 1011_IWL_SAD_XX,SAD
217 | 1022_DFA_NEU_XX,NEU
218 | 1044_WSI_HAP_XX,HAP
219 | 1047_IEO_DIS_HI,DIS
220 | 1091_ITS_SAD_XX,SAD
221 | 1056_IEO_DIS_LO,DIS
222 | 1031_IWW_FEA_XX,FEA
223 | 1045_ITS_ANG_XX,ANG
224 | 1011_IEO_NEU_XX,NEU
225 | 1084_IWL_HAP_XX,HAP
226 | 1011_IEO_DIS_MD,DIS
227 | 1004_IWL_FEA_XX,FEA
228 | 1019_DFA_NEU_XX,NEU
229 | 1081_MTI_NEU_XX,NEU
230 | 1057_IWW_DIS_XX,DIS
231 | 1039_IOM_SAD_XX,SAD
232 | 1013_TSI_FEA_XX,FEA
233 | 1008_IEO_SAD_LO,SAD
234 | 1041_IWL_DIS_XX,DIS
235 | 1017_IEO_DIS_MD,DIS
236 | 1009_TSI_NEU_XX,NEU
237 | 1023_IEO_DIS_MD,DIS
238 | 1040_ITS_SAD_XX,SAD
239 | 1041_ITH_HAP_XX,HAP
240 | 1077_TIE_DIS_XX,DIS
241 | 1072_IEO_ANG_MD,ANG
242 | 1087_IEO_FEA_MD,FEA
243 | 1018_TAI_NEU_XX,NEU
244 | 1040_TIE_HAP_XX,HAP
245 | 1008_MTI_HAP_XX,HAP
246 | 1089_TSI_ANG_XX,ANG
247 | 1025_IWL_SAD_XX,SAD
248 | 1030_IWW_HAP_XX,HAP
249 | 1022_IWW_FEA_XX,FEA
250 | 1065_ITS_FEA_XX,FEA
251 | 1022_ITH_FEA_XX,FEA
252 | 1001_ITS_SAD_XX,SAD
253 | 1004_IOM_NEU_XX,NEU
254 | 1086_DFA_DIS_XX,DIS
255 | 1085_IWW_ANG_XX,ANG
256 | 1046_IWL_DIS_XX,DIS
257 | 1035_TAI_SAD_XX,SAD
258 | 1054_IEO_HAP_MD,HAP
259 | 1014_DFA_NEU_XX,NEU
260 | 1021_IEO_HAP_LO,HAP
261 | 1089_IEO_HAP_HI,HAP
262 | 1065_WSI_ANG_XX,ANG
263 | 1029_TSI_HAP_XX,HAP
264 | 1088_IEO_ANG_MD,ANG
265 | 1044_IWW_NEU_XX,NEU
266 | 1064_MTI_HAP_XX,HAP
267 | 1072_IWW_NEU_XX,NEU
268 | 1054_ITS_FEA_XX,FEA
269 | 1087_DFA_SAD_XX,SAD
270 | 1010_MTI_FEA_XX,FEA
271 | 1074_WSI_ANG_XX,ANG
272 | 1049_TSI_SAD_XX,SAD
273 | 1082_TAI_HAP_XX,HAP
274 | 1066_IWW_FEA_XX,FEA
275 | 1072_TSI_SAD_XX,SAD
276 | 1073_IWW_HAP_XX,HAP
277 | 1091_MTI_NEU_XX,NEU
278 | 1022_TAI_HAP_XX,HAP
279 | 1086_WSI_SAD_XX,SAD
280 | 1063_ITH_SAD_XX,SAD
281 | 1046_IEO_DIS_LO,DIS
282 | 1082_IWW_NEU_XX,NEU
283 | 1002_WSI_FEA_XX,FEA
284 | 1044_TIE_FEA_XX,FEA
285 | 1089_IWL_HAP_XX,HAP
286 | 1028_IWW_ANG_XX,ANG
287 | 1057_IEO_ANG_MD,ANG
288 | 1003_TSI_HAP_XX,HAP
289 | 1002_ITH_DIS_XX,DIS
290 | 1081_DFA_NEU_XX,NEU
291 | 1010_ITS_NEU_XX,NEU
292 | 1004_TIE_ANG_XX,ANG
293 | 1091_IEO_HAP_LO,HAP
294 | 1006_IOM_SAD_XX,SAD
295 | 1047_TAI_SAD_XX,SAD
296 | 1046_IEO_FEA_HI,FEA
297 | 1083_TSI_FEA_XX,FEA
298 | 1085_TSI_DIS_XX,DIS
299 | 1060_TAI_NEU_XX,NEU
300 | 1023_ITH_SAD_XX,SAD
301 | 1054_MTI_FEA_XX,FEA
302 | 1028_ITH_NEU_XX,NEU
303 | 1044_ITS_DIS_XX,DIS
304 | 1032_ITS_DIS_XX,DIS
305 | 1044_IEO_DIS_HI,DIS
306 | 1067_IEO_ANG_MD,ANG
307 | 1032_TSI_NEU_XX,NEU
308 | 1070_IEO_ANG_HI,ANG
309 | 1022_TIE_SAD_XX,SAD
310 | 1033_IEO_SAD_LO,SAD
311 | 1083_IWW_HAP_XX,HAP
312 | 1071_ITH_ANG_XX,ANG
313 | 1085_IOM_HAP_XX,HAP
314 | 1054_TIE_HAP_XX,HAP
315 | 1015_TSI_DIS_XX,DIS
316 | 1062_MTI_DIS_XX,DIS
317 | 1007_TIE_ANG_XX,ANG
318 | 1079_DFA_SAD_XX,SAD
319 | 1073_DFA_DIS_XX,DIS
320 | 1008_ITH_FEA_XX,FEA
321 | 1086_IWL_HAP_XX,HAP
322 | 1058_IOM_NEU_XX,NEU
323 | 1034_IEO_DIS_HI,DIS
324 | 1054_IWL_ANG_XX,ANG
325 | 1057_DFA_ANG_XX,ANG
326 | 1072_IWL_DIS_XX,DIS
327 | 1026_IOM_ANG_XX,ANG
328 | 1034_IEO_NEU_XX,NEU
329 | 1047_IWL_DIS_XX,DIS
330 | 1005_TSI_HAP_XX,HAP
331 | 1009_IWL_SAD_XX,SAD
332 | 1043_IEO_DIS_MD,DIS
333 | 1070_IEO_DIS_MD,DIS
334 | 1079_MTI_SAD_XX,SAD
335 | 1005_IWL_NEU_XX,NEU
336 | 1071_ITS_SAD_XX,SAD
337 | 1053_TSI_NEU_XX,NEU
338 | 1036_TIE_FEA_XX,FEA
339 | 1061_ITS_NEU_XX,NEU
340 | 1015_ITH_FEA_XX,FEA
341 | 1088_TSI_FEA_XX,FEA
342 | 1032_MTI_NEU_XX,NEU
343 | 1019_TSI_FEA_XX,FEA
344 | 1026_WSI_DIS_XX,DIS
345 | 1011_WSI_DIS_XX,DIS
346 | 1010_IEO_FEA_LO,FEA
347 | 1027_MTI_FEA_XX,FEA
348 | 1051_MTI_NEU_XX,NEU
349 | 1047_WSI_SAD_XX,SAD
350 | 1031_TAI_FEA_XX,FEA
351 | 1086_IOM_DIS_XX,DIS
352 | 1030_IOM_NEU_XX,NEU
353 | 1072_MTI_FEA_XX,FEA
354 | 1059_IEO_FEA_MD,FEA
355 | 1078_IWW_SAD_XX,SAD
356 | 1043_TAI_DIS_XX,DIS
357 | 1053_DFA_ANG_XX,ANG
358 | 1012_DFA_NEU_XX,NEU
359 | 1049_IWW_NEU_XX,NEU
360 | 1062_IWL_NEU_XX,NEU
361 | 1030_WSI_NEU_XX,NEU
362 | 1012_WSI_FEA_XX,FEA
363 | 1035_DFA_HAP_XX,HAP
364 | 1082_TAI_DIS_XX,DIS
365 | 1070_TAI_NEU_XX,NEU
366 | 1066_WSI_FEA_XX,FEA
367 | 1011_ITS_DIS_XX,DIS
368 | 1032_IEO_FEA_LO,FEA
369 | 1028_IEO_DIS_HI,DIS
370 | 1062_ITS_DIS_XX,DIS
371 | 1018_TIE_ANG_XX,ANG
372 | 1030_WSI_DIS_XX,DIS
373 | 1004_WSI_SAD_XX,SAD
374 | 1081_IWL_ANG_XX,ANG
375 | 1012_IEO_FEA_HI,FEA
376 | 1063_MTI_SAD_XX,SAD
377 | 1080_IWW_NEU_XX,NEU
378 | 1087_TSI_DIS_XX,DIS
379 | 1069_ITH_SAD_XX,SAD
380 | 1019_IOM_HAP_XX,HAP
381 | 1021_TSI_ANG_XX,ANG
382 | 1069_IOM_FEA_XX,FEA
383 | 1039_MTI_NEU_XX,NEU
384 | 1011_MTI_DIS_XX,DIS
385 | 1043_TSI_FEA_XX,FEA
386 | 1055_IEO_SAD_LO,SAD
387 | 1084_IOM_HAP_XX,HAP
388 | 1041_IOM_DIS_XX,DIS
389 | 1043_IOM_SAD_XX,SAD
390 | 1048_DFA_ANG_XX,ANG
391 | 1031_ITS_HAP_XX,HAP
392 | 1032_IWW_NEU_XX,NEU
393 | 1031_DFA_NEU_XX,NEU
394 | 1013_DFA_ANG_XX,ANG
395 | 1056_IOM_NEU_XX,NEU
396 | 1043_WSI_HAP_XX,HAP
397 | 1018_ITS_DIS_XX,DIS
398 | 1053_IEO_DIS_MD,DIS
399 | 1019_TAI_ANG_XX,ANG
400 | 1079_IEO_SAD_MD,SAD
401 | 1016_WSI_FEA_XX,FEA
402 | 1050_IWW_NEU_XX,NEU
403 | 1046_MTI_FEA_XX,FEA
404 | 1064_TAI_DIS_XX,DIS
405 | 1062_IEO_SAD_LO,SAD
406 | 1065_WSI_HAP_XX,HAP
407 | 1028_IWW_DIS_XX,DIS
408 | 1066_IWW_SAD_XX,SAD
409 | 1014_TAI_DIS_XX,DIS
410 | 1018_WSI_SAD_XX,SAD
411 | 1040_TAI_DIS_XX,DIS
412 | 1015_TAI_SAD_XX,SAD
413 | 1046_TIE_ANG_XX,ANG
414 | 1084_ITH_NEU_XX,NEU
415 | 1005_IEO_FEA_HI,FEA
416 | 1021_WSI_FEA_XX,FEA
417 | 1037_MTI_SAD_XX,SAD
418 | 1076_IWW_ANG_XX,ANG
419 | 1085_IOM_DIS_XX,DIS
420 | 1058_ITH_SAD_XX,SAD
421 | 1009_IOM_NEU_XX,NEU
422 | 1058_TAI_SAD_XX,SAD
423 | 1021_TIE_NEU_XX,NEU
424 | 1009_IWW_ANG_XX,ANG
425 | 1003_IEO_ANG_HI,ANG
426 | 1038_TAI_FEA_XX,FEA
427 | 1027_IWL_NEU_XX,NEU
428 | 1053_WSI_ANG_XX,ANG
429 | 1024_TIE_SAD_XX,SAD
430 | 1047_ITH_FEA_XX,FEA
431 | 1036_IWW_SAD_XX,SAD
432 | 1036_DFA_NEU_XX,NEU
433 | 1088_MTI_FEA_XX,FEA
434 | 1025_IEO_FEA_LO,FEA
435 | 1029_IEO_SAD_HI,SAD
436 | 1019_WSI_SAD_XX,SAD
437 | 1050_WSI_SAD_XX,SAD
438 | 1063_IEO_HAP_LO,HAP
439 | 1020_WSI_FEA_XX,FEA
440 | 1066_DFA_HAP_XX,HAP
441 | 1049_WSI_SAD_XX,SAD
442 | 1023_ITH_ANG_XX,ANG
443 | 1019_IEO_ANG_LO,ANG
444 | 1075_DFA_NEU_XX,NEU
445 | 1044_IEO_SAD_LO,SAD
446 | 1051_IEO_HAP_MD,HAP
447 | 1075_MTI_NEU_XX,NEU
448 | 1079_WSI_DIS_XX,DIS
449 | 1013_ITH_SAD_XX,SAD
450 | 1029_IWL_FEA_XX,FEA
451 | 1062_IEO_DIS_LO,DIS
452 | 1041_DFA_SAD_XX,SAD
453 | 1083_WSI_SAD_XX,SAD
454 | 1030_TAI_DIS_XX,DIS
455 | 1052_DFA_FEA_XX,FEA
456 | 1076_IWW_NEU_XX,NEU
457 | 1026_IEO_NEU_XX,NEU
458 | 1025_IEO_ANG_HI,ANG
459 | 1019_DFA_ANG_XX,ANG
460 | 1033_ITH_DIS_XX,DIS
461 | 1027_IWL_ANG_XX,ANG
462 | 1024_IOM_HAP_XX,HAP
463 | 1077_DFA_ANG_XX,ANG
464 | 1057_IWW_FEA_XX,FEA
465 | 1020_IOM_SAD_XX,SAD
466 | 1055_ITS_NEU_XX,NEU
467 | 1070_DFA_NEU_XX,NEU
468 | 1071_ITS_DIS_XX,DIS
469 | 1027_ITH_HAP_XX,HAP
470 | 1057_IEO_ANG_HI,ANG
471 | 1002_IWL_FEA_XX,FEA
472 | 1052_TAI_FEA_XX,FEA
473 | 1019_WSI_HAP_XX,HAP
474 | 1030_DFA_HAP_XX,HAP
475 | 1064_IEO_SAD_MD,SAD
476 | 1061_DFA_ANG_XX,ANG
477 | 1083_TIE_SAD_XX,SAD
478 | 1045_IEO_ANG_LO,ANG
479 | 1028_IWW_SAD_XX,SAD
480 | 1012_TSI_DIS_XX,DIS
481 | 1070_MTI_HAP_XX,HAP
482 | 1024_TSI_DIS_XX,DIS
483 | 1074_MTI_HAP_XX,HAP
484 | 1030_ITS_FEA_XX,FEA
485 | 1054_IWL_NEU_XX,NEU
486 | 1054_IEO_DIS_MD,DIS
487 | 1084_TIE_NEU_XX,NEU
488 | 1073_TSI_SAD_XX,SAD
489 | 1053_DFA_NEU_XX,NEU
490 | 1025_DFA_HAP_XX,HAP
491 | 1042_MTI_HAP_XX,HAP
492 | 1089_IEO_SAD_LO,SAD
493 | 1009_ITH_FEA_XX,FEA
494 | 1017_MTI_NEU_XX,NEU
495 | 1054_TSI_HAP_XX,HAP
496 | 1071_WSI_DIS_XX,DIS
497 | 1080_TSI_DIS_XX,DIS
498 | 1052_ITS_FEA_XX,FEA
499 | 1078_TIE_NEU_XX,NEU
500 | 1034_IEO_ANG_LO,ANG
501 | 1018_IWL_SAD_XX,SAD
502 | 1034_TAI_ANG_XX,ANG
503 | 1012_TSI_FEA_XX,FEA
504 | 1025_WSI_DIS_XX,DIS
505 | 1036_IEO_SAD_MD,SAD
506 | 1063_IEO_SAD_MD,SAD
507 | 1040_IEO_NEU_XX,NEU
508 | 1007_IWL_FEA_XX,FEA
509 | 1023_IWW_NEU_XX,NEU
510 | 1010_IWW_HAP_XX,HAP
511 | 1067_TAI_DIS_XX,DIS
512 | 1074_ITS_HAP_XX,HAP
513 | 1045_ITS_HAP_XX,HAP
514 | 1072_IWW_FEA_XX,FEA
515 | 1088_ITH_SAD_XX,SAD
516 | 1068_DFA_SAD_XX,SAD
517 | 1041_ITS_FEA_XX,FEA
518 | 1048_IEO_FEA_LO,FEA
519 | 1067_ITS_DIS_XX,DIS
520 | 1008_TAI_SAD_XX,SAD
521 | 1075_TAI_SAD_XX,SAD
522 | 1023_ITH_HAP_XX,HAP
523 | 1063_WSI_FEA_XX,FEA
524 | 1009_IEO_SAD_LO,SAD
525 | 1068_IEO_FEA_HI,FEA
526 | 1071_IOM_NEU_XX,NEU
527 | 1075_IEO_SAD_LO,SAD
528 | 1036_WSI_HAP_XX,HAP
529 | 1022_IOM_DIS_XX,DIS
530 | 1017_TAI_HAP_XX,HAP
531 | 1005_DFA_SAD_XX,SAD
532 | 1055_TAI_SAD_XX,SAD
533 | 1058_ITH_HAP_XX,HAP
534 | 1014_ITS_HAP_XX,HAP
535 | 1050_IWL_SAD_XX,SAD
536 | 1002_ITS_DIS_XX,DIS
537 | 1029_IEO_DIS_LO,DIS
538 | 1091_IOM_DIS_XX,DIS
539 | 1001_DFA_HAP_XX,HAP
540 | 1073_IEO_DIS_HI,DIS
541 | 1071_ITH_NEU_XX,NEU
542 | 1005_TSI_ANG_XX,ANG
543 | 1083_ITH_DIS_XX,DIS
544 | 1045_IEO_SAD_HI,SAD
545 | 1054_TAI_NEU_XX,NEU
546 | 1046_ITS_FEA_XX,FEA
547 | 1085_ITH_HAP_XX,HAP
548 | 1018_TSI_HAP_XX,HAP
549 | 1008_IOM_HAP_XX,HAP
550 | 1081_TAI_SAD_XX,SAD
551 | 1039_ITH_DIS_XX,DIS
552 | 1076_IEO_NEU_XX,NEU
553 | 1062_MTI_NEU_XX,NEU
554 | 1028_IOM_HAP_XX,HAP
555 | 1004_TAI_DIS_XX,DIS
556 | 1041_IEO_FEA_HI,FEA
557 | 1079_ITS_SAD_XX,SAD
558 | 1065_ITS_DIS_XX,DIS
559 | 1083_MTI_NEU_XX,NEU
560 | 1003_MTI_NEU_XX,NEU
561 | 1029_TIE_DIS_XX,DIS
562 | 1066_ITS_ANG_XX,ANG
563 | 1034_DFA_SAD_XX,SAD
564 | 1034_TSI_HAP_XX,HAP
565 | 1043_TAI_SAD_XX,SAD
566 | 1042_TAI_HAP_XX,HAP
567 | 1088_TAI_FEA_XX,FEA
568 | 1022_IEO_DIS_HI,DIS
569 | 1062_IWL_HAP_XX,HAP
570 | 1003_IEO_HAP_MD,HAP
571 | 1048_IEO_SAD_MD,SAD
572 | 1015_DFA_DIS_XX,DIS
573 | 1056_DFA_FEA_XX,FEA
574 | 1035_DFA_FEA_XX,FEA
575 | 1050_TSI_SAD_XX,SAD
576 | 1039_ITS_NEU_XX,NEU
577 | 1005_IOM_SAD_XX,SAD
578 | 1046_WSI_ANG_XX,ANG
579 | 1044_TSI_NEU_XX,NEU
580 | 1067_WSI_SAD_XX,SAD
581 | 1012_IEO_SAD_HI,SAD
582 | 1067_MTI_SAD_XX,SAD
583 | 1004_IWL_HAP_XX,HAP
584 | 1009_DFA_NEU_XX,NEU
585 | 1041_IOM_SAD_XX,SAD
586 | 1001_DFA_SAD_XX,SAD
587 | 1048_MTI_DIS_XX,DIS
588 | 1071_IEO_HAP_HI,HAP
589 | 1010_DFA_FEA_XX,FEA
590 | 1032_IWL_FEA_XX,FEA
591 | 1057_IWW_ANG_XX,ANG
592 | 1052_ITS_HAP_XX,HAP
593 | 1005_ITH_HAP_XX,HAP
594 | 1015_WSI_HAP_XX,HAP
595 | 1061_IWW_DIS_XX,DIS
596 | 1058_DFA_DIS_XX,DIS
597 | 1021_TSI_FEA_XX,FEA
598 | 1082_IOM_NEU_XX,NEU
599 | 1008_IEO_HAP_LO,HAP
600 | 1085_DFA_FEA_XX,FEA
601 | 1020_IWW_NEU_XX,NEU
602 | 1068_IOM_SAD_XX,SAD
603 | 1070_TAI_ANG_XX,ANG
604 | 1048_WSI_HAP_XX,HAP
605 | 1056_WSI_DIS_XX,DIS
606 | 1025_ITH_ANG_XX,ANG
607 | 1029_ITS_DIS_XX,DIS
608 | 1013_IOM_SAD_XX,SAD
609 | 1010_TSI_FEA_XX,FEA
610 | 1005_IEO_NEU_XX,NEU
611 | 1039_TAI_SAD_XX,SAD
612 | 1027_IEO_SAD_HI,SAD
613 | 1083_IWL_ANG_XX,ANG
614 | 1052_DFA_ANG_XX,ANG
615 | 1052_IEO_FEA_HI,FEA
616 | 1055_IWL_FEA_XX,FEA
617 | 1075_WSI_HAP_XX,HAP
618 | 1061_IOM_HAP_XX,HAP
619 | 1091_IWL_FEA_XX,FEA
620 | 1028_WSI_HAP_XX,HAP
621 | 1009_IEO_HAP_LO,HAP
622 | 1044_TIE_NEU_XX,NEU
623 | 1090_IEO_SAD_MD,SAD
624 | 1053_WSI_FEA_XX,FEA
625 | 1019_MTI_FEA_XX,FEA
626 | 1013_TAI_FEA_XX,FEA
627 | 1029_IOM_SAD_XX,SAD
628 | 1064_IEO_ANG_LO,ANG
629 | 1020_TAI_ANG_XX,ANG
630 | 1001_TAI_HAP_XX,HAP
631 | 1004_ITS_HAP_XX,HAP
632 | 1039_WSI_FEA_XX,FEA
633 | 1012_MTI_SAD_XX,SAD
634 | 1088_MTI_DIS_XX,DIS
635 | 1058_IEO_HAP_LO,HAP
636 | 1089_IWL_SAD_XX,SAD
637 | 1069_IEO_DIS_MD,DIS
638 | 1035_DFA_SAD_XX,SAD
639 | 1032_IWW_SAD_XX,SAD
640 | 1052_TSI_SAD_XX,SAD
641 | 1013_MTI_SAD_XX,SAD
642 | 1065_TIE_SAD_XX,SAD
643 | 1056_IEO_HAP_HI,HAP
644 | 1020_ITH_HAP_XX,HAP
645 | 1024_DFA_NEU_XX,NEU
646 | 1053_IOM_FEA_XX,FEA
647 | 1036_WSI_SAD_XX,SAD
648 | 1052_WSI_FEA_XX,FEA
649 | 1033_IWL_FEA_XX,FEA
650 | 1064_IEO_SAD_LO,SAD
651 | 1036_IWL_HAP_XX,HAP
652 | 1058_MTI_NEU_XX,NEU
653 | 1090_MTI_SAD_XX,SAD
654 | 1007_ITH_FEA_XX,FEA
655 | 1052_WSI_SAD_XX,SAD
656 | 1084_IEO_FEA_MD,FEA
657 | 1030_MTI_NEU_XX,NEU
658 | 1005_WSI_ANG_XX,ANG
659 | 1084_IEO_DIS_LO,DIS
660 | 1090_IEO_FEA_HI,FEA
661 | 1041_WSI_ANG_XX,ANG
662 | 1079_MTI_HAP_XX,HAP
663 | 1047_TAI_FEA_XX,FEA
664 | 1020_MTI_DIS_XX,DIS
665 | 1010_TSI_SAD_XX,SAD
666 | 1055_ITS_HAP_XX,HAP
667 | 1052_IWL_FEA_XX,FEA
668 | 1033_ITH_FEA_XX,FEA
669 | 1028_TSI_DIS_XX,DIS
670 | 1034_IEO_HAP_MD,HAP
671 | 1089_DFA_HAP_XX,HAP
672 | 1012_IWW_ANG_XX,ANG
673 | 1072_DFA_DIS_XX,DIS
674 | 1068_ITH_FEA_XX,FEA
675 | 1007_IOM_DIS_XX,DIS
676 | 1075_IOM_SAD_XX,SAD
677 | 1050_IWL_FEA_XX,FEA
678 | 1058_MTI_DIS_XX,DIS
679 | 1036_ITS_DIS_XX,DIS
680 | 1060_IWW_HAP_XX,HAP
681 | 1077_TAI_NEU_XX,NEU
682 | 1047_ITH_HAP_XX,HAP
683 | 1088_TSI_NEU_XX,NEU
684 | 1048_TIE_FEA_XX,FEA
685 | 1021_MTI_DIS_XX,DIS
686 | 1060_ITS_ANG_XX,ANG
687 | 1076_IWL_HAP_XX,HAP
688 | 1002_ITS_SAD_XX,SAD
689 | 1026_IEO_FEA_LO,FEA
690 | 1066_IEO_DIS_HI,DIS
691 | 1010_ITS_ANG_XX,ANG
692 | 1037_TAI_DIS_XX,DIS
693 | 1091_IEO_FEA_LO,FEA
694 | 1003_TAI_NEU_XX,NEU
695 | 1012_ITS_HAP_XX,HAP
696 | 1079_IEO_FEA_MD,FEA
697 | 1057_TAI_SAD_XX,SAD
698 | 1060_TSI_NEU_XX,NEU
699 | 1001_WSI_DIS_XX,DIS
700 | 1057_WSI_NEU_XX,NEU
701 | 1041_WSI_DIS_XX,DIS
702 | 1029_MTI_FEA_XX,FEA
703 | 1086_IWW_FEA_XX,FEA
704 | 1028_TAI_SAD_XX,SAD
705 | 1086_IOM_FEA_XX,FEA
706 | 1053_DFA_FEA_XX,FEA
707 | 1025_IEO_FEA_MD,FEA
708 | 1055_IEO_ANG_MD,ANG
709 | 1008_IEO_FEA_LO,FEA
710 | 1013_IOM_FEA_XX,FEA
711 | 1062_TSI_NEU_XX,NEU
712 | 1015_TAI_FEA_XX,FEA
713 | 1030_IWL_DIS_XX,DIS
714 | 1036_IOM_ANG_XX,ANG
715 | 1067_TAI_NEU_XX,NEU
716 | 1072_IWW_SAD_XX,SAD
717 | 1034_IWW_SAD_XX,SAD
718 | 1013_WSI_SAD_XX,SAD
719 | 1032_TIE_FEA_XX,FEA
720 | 1028_WSI_ANG_XX,ANG
721 | 1032_IWL_HAP_XX,HAP
722 | 1078_IEO_FEA_MD,FEA
723 | 1071_ITS_HAP_XX,HAP
724 | 1069_ITS_DIS_XX,DIS
725 | 1078_ITS_ANG_XX,ANG
726 | 1049_DFA_NEU_XX,NEU
727 | 1049_MTI_HAP_XX,HAP
728 | 1004_DFA_ANG_XX,ANG
729 | 1073_IOM_HAP_XX,HAP
730 | 1057_TAI_FEA_XX,FEA
731 | 1012_IWL_HAP_XX,HAP
732 | 1006_ITS_DIS_XX,DIS
733 | 1036_TAI_ANG_XX,ANG
734 | 1055_ITH_DIS_XX,DIS
735 | 1028_IOM_FEA_XX,FEA
736 | 1029_ITH_SAD_XX,SAD
737 | 1056_ITS_ANG_XX,ANG
738 | 1006_IWL_DIS_XX,DIS
739 | 1060_IOM_ANG_XX,ANG
740 | 1089_DFA_ANG_XX,ANG
741 | 1068_IEO_DIS_HI,DIS
742 | 1066_ITS_DIS_XX,DIS
743 | 1072_IEO_SAD_MD,SAD
744 | 1014_TIE_HAP_XX,HAP
745 |
--------------------------------------------------------------------------------
/data/CREMAD/video_preprocessing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import cv2
3 | import os
4 | import pdb
5 |
6 | class videoReader(object):
7 | def __init__(self, video_path, frame_interval=1, frame_kept_per_second=1):
8 | self.video_path = video_path
9 | self.frame_interval = frame_interval
10 | self.frame_kept_per_second = frame_kept_per_second
11 |
12 | #pdb.set_trace()
13 | self.vid = cv2.VideoCapture(self.video_path)
14 | self.fps = int(self.vid.get(cv2.CAP_PROP_FPS))
15 | self.video_frames = self.vid.get(cv2.CAP_PROP_FRAME_COUNT)
16 | self.video_len = int(self.video_frames/self.fps)
17 |
18 |
19 | def video2frame(self, frame_save_path):
20 | self.frame_save_path = frame_save_path
21 | success, image = self.vid.read()
22 | count = 0
23 | while success:
24 | count +=1
25 | if count % self.frame_interval == 0:
26 | save_name = '{}/frame_{}_{}.jpg'.format(self.frame_save_path, int(count/self.fps), count) # filename_second_index
27 | cv2.imencode('.jpg', image)[1].tofile(save_name)
28 | success, image = self.vid.read()
29 |
30 |
31 | def video2frame_update(self, frame_save_path):
32 | self.frame_save_path = frame_save_path
33 |
34 | count = 0
35 | frame_interval = int(self.fps/self.frame_kept_per_second)
36 | while(count < self.video_frames):
37 | ret, image = self.vid.read()
38 | if not ret:
39 | break
40 | if count % self.fps == 0:
41 | frame_id = 0
42 | if frame_id < self.frame_kept_per_second:
43 | save_name = '{}/frame_{}_{}.jpg'.format(self.frame_save_path, int(count/self.fps), count)  # filename_second_index
44 | cv2.imencode('.jpg', image)[1].tofile(save_name)
45 | frame_id += 1
46 | count += 1
47 |
--------------------------------------------------------------------------------
/data/KineticSound/process_audio.py:
--------------------------------------------------------------------------------
44 | if p > 0:
45 | m = torch.nn.ZeroPad2d((0, 0, 0, p))
46 | fbank = m(fbank)
47 | elif p < 0:
48 | fbank = fbank[0:target_length, :]
49 | fbank = (fbank - norm_mean) / (norm_std * 2)
50 |
51 | print(fbank.shape)
52 | np.save(save_path + '/'+ name + '.npy',fbank)
53 |
--------------------------------------------------------------------------------
/data/VGGSound/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/data/VGGSound/.DS_Store
--------------------------------------------------------------------------------
/data/VGGSound/mp4_to_wav.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | train_videos = '/data/users/xiaokang_peng/VGGsound/train-videos/train_video_list.txt'
5 | test_videos = '/data/users/xiaokang_peng/VGGsound/test-videos/test_video_list.txt'
6 |
7 | train_audio_dir = '/data/users/xiaokang_peng/VGGsound/train-audios/train-set'
8 | test_audio_dir = '/data/users/xiaokang_peng/VGGsound/test-audios/test-set'
9 |
10 |
11 | # test set processing
12 | with open(test_videos, 'r') as f:
13 | files = f.readlines()
14 |
15 | for i, item in enumerate(files):
16 | if i % 500 == 0:
17 | print('*******************************************')
18 | print('{}/{}'.format(i, len(files)))
19 | print('*******************************************')
20 | mp4_filename = os.path.join('/data/users/xiaokang_peng/VGGsound/test-videos/test-set/', item[:-1])
21 | wav_filename = os.path.join(test_audio_dir, item[:-5]+'.wav')
22 | if os.path.exists(wav_filename):
23 | pass
24 | else:
25 | os.system('ffmpeg -i {} -acodec pcm_s16le -ar 16000 {}'.format(mp4_filename, wav_filename))
26 |
27 |
28 | # train set processing
29 | with open(train_videos, 'r') as f:
30 | files = f.readlines()
31 |
32 | for i, item in enumerate(files):
33 | if i % 500 == 0:
34 | print('*******************************************')
35 | print('{}/{}'.format(i, len(files)))
36 | print('*******************************************')
37 | mp4_filename = os.path.join('/data/users/xiaokang_peng/VGGsound/train-videos/train-set/', item[:-1])
38 | wav_filename = os.path.join(train_audio_dir, item[:-5]+'.wav')
39 | if os.path.exists(wav_filename):
40 | pass
41 | else:
42 | os.system('ffmpeg -i {} -acodec pcm_s16le -ar 16000 {}'.format(mp4_filename, wav_filename))
43 |
44 |
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/data/VGGSound/video_preprocessing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import cv2
3 | import os
4 | import pdb
5 |
6 | class videoReader(object):
7 | def __init__(self, video_path, frame_interval=1, frame_kept_per_second=1):
8 | self.video_path = video_path
9 | self.frame_interval = frame_interval
10 | self.frame_kept_per_second = frame_kept_per_second
11 |
12 | #pdb.set_trace()
13 | self.vid = cv2.VideoCapture(self.video_path)
14 | self.fps = int(self.vid.get(cv2.CAP_PROP_FPS))
15 | self.video_frames = self.vid.get(cv2.CAP_PROP_FRAME_COUNT)
16 | self.video_len = int(self.video_frames/self.fps)
17 |
18 |
19 | def video2frame(self, frame_save_path):
20 | self.frame_save_path = frame_save_path
21 | success, image = self.vid.read()
22 | count = 0
23 | while success:
24 | count +=1
25 | if count % self.frame_interval == 0:
26 | save_name = '{}/frame_{}_{}.jpg'.format(self.frame_save_path, int(count/self.fps), count) # filename_second_index
27 | cv2.imencode('.jpg', image)[1].tofile(save_name)
28 | success, image = self.vid.read()
29 |
30 |
31 | def video2frame_update(self, frame_save_path):
32 | self.frame_save_path = frame_save_path
33 |
34 | count = 0
35 | frame_interval = int(self.fps/self.frame_kept_per_second)
36 | while(count < self.video_frames):
37 | ret, image = self.vid.read()
38 | if not ret:
39 | break
40 | if count % self.fps == 0:
41 | frame_id = 0
42 | if frame_id < self.frame_kept_per_second:
43 | save_name = '{}/frame_{}_{}.jpg'.format(self.frame_save_path, int(count/self.fps), count)  # filename_second_index
44 | cv2.imencode('.jpg', image)[1].tofile(save_name)
45 | frame_id += 1
46 | count += 1
47 |
--------------------------------------------------------------------------------
/dataset/CramedDataset.py:
--------------------------------------------------------------------------------
59 | resamples[resamples > 1.] = 1.
60 | resamples[resamples < -1.] = -1.
61 |
62 | spectrogram = librosa.stft(resamples, n_fft=512, hop_length=353)
63 | spectrogram = np.log(np.abs(spectrogram) + 1e-7)
64 | #mean = np.mean(spectrogram)
65 | #std = np.std(spectrogram)
66 | #spectrogram = np.divide(spectrogram - mean, std + 1e-9)
67 |
68 | if self.mode == 'train':
69 | transform = transforms.Compose([
70 | transforms.RandomResizedCrop(224),
71 | transforms.RandomHorizontalFlip(),
72 | transforms.ToTensor(),
73 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
74 | ])
75 | else:
76 | transform = transforms.Compose([
77 | transforms.Resize(size=(224, 224)),
78 | transforms.ToTensor(),
79 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
80 | ])
81 |
82 | # Visual
83 | image_samples = os.listdir(self.image[idx])
84 | select_index = np.random.choice(len(image_samples), size=self.args.fps, replace=False)
85 | select_index.sort()
86 | images = torch.zeros((self.args.fps, 3, 224, 224))
87 | for i in range(self.args.fps):
88 | img = Image.open(os.path.join(self.image[idx], image_samples[i])).convert('RGB')
89 | img = transform(img)
90 | images[i] = img
91 |
92 | images = torch.permute(images, (1,0,2,3))
93 |
94 | # label
95 | label = self.label[idx]
96 |
97 | return spectrogram, images, label
--------------------------------------------------------------------------------
/dataset/VGGSoundDataset.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import csv
3 | import os
4 | import pickle
5 | import librosa
6 | import numpy as np
7 | from scipy import signal
8 | import torch
9 | from PIL import Image
10 | from torch.utils.data import Dataset
11 | from torchvision import transforms
12 | import pdb
13 | import random
14 |
15 | class VGGSound(Dataset):
16 |
17 | def __init__(self, args, mode='train'):
18 | self.args = args
19 | self.mode = mode
20 | train_video_data = []
21 | train_audio_data = []
22 | test_video_data = []
23 | test_audio_data = []
24 | train_label = []
25 | test_label = []
26 | train_class = []
27 | test_class = []
28 |
29 | with open('/home/hudi/OGM-GE_CVPR2022/data/VGGSound/vggsound.csv') as f:
30 | csv_reader = csv.reader(f)
31 |
32 | for item in csv_reader:
33 | if item[3] == 'train':
34 | video_dir = os.path.join('/data/users/xiaokang_peng/VGGsound/', 'train-videos/train-set-img', 'Image-{:02d}-FPS'.format(self.args.fps), item[0]+'_'+item[1]+'.mp4')
35 | audio_dir = os.path.join('/data/users/xiaokang_peng/VGGsound/', 'train-audios/train-set', item[0]+'_'+item[1]+'.wav')
36 | if os.path.exists(video_dir) and os.path.exists(audio_dir) and len(os.listdir(video_dir))>3 :
37 | train_video_data.append(video_dir)
38 | train_audio_data.append(audio_dir)
39 | if item[2] not in train_class: train_class.append(item[2])
40 | train_label.append(item[2])
41 |
42 | if item[3] == 'test':
43 | video_dir = os.path.join('/data/users/xiaokang_peng/VGGsound/', 'test-videos/test-set-img', 'Image-{:02d}-FPS'.format(self.args.fps), item[0]+'_'+item[1]+'.mp4')
44 | audio_dir = os.path.join('/data/users/xiaokang_peng/VGGsound/', 'test-audios/test-set', item[0]+'_'+item[1]+'.wav')
45 | if os.path.exists(video_dir) and os.path.exists(audio_dir) and len(os.listdir(video_dir))>3:
46 | test_video_data.append(video_dir)
47 | test_audio_data.append(audio_dir)
48 | if item[2] not in test_class: test_class.append(item[2])
49 | test_label.append(item[2])
50 |
51 | assert len(train_class) == len(test_class)
52 | self.classes = train_class
53 |
54 | class_dict = dict(zip(self.classes, range(len(self.classes))))
55 |
56 | if mode == 'train':
57 | self.video = train_video_data
58 | self.audio = train_audio_data
59 | self.label = [class_dict[train_label[idx]] for idx in range(len(train_label))]
60 | if mode == 'test':
61 | self.video = test_video_data
62 | self.audio = test_audio_data
63 | self.label = [class_dict[test_label[idx]] for idx in range(len(test_label))]
64 |
65 |
66 | def __len__(self):
67 | return len(self.video)
68 |
69 | def __getitem__(self, idx):
70 |
71 | # audio
72 | sample, rate = librosa.load(self.audio[idx], sr=16000, mono=True)
73 | while len(sample)/rate < 10.:
74 | sample = np.tile(sample, 2)
75 |
76 | start_point = random.randint(a=0, b=rate*5)
77 | new_sample = sample[start_point:start_point+rate*5]
78 | new_sample[new_sample > 1.] = 1.
79 | new_sample[new_sample < -1.] = -1.
80 |
81 | spectrogram = librosa.stft(new_sample, n_fft=256, hop_length=128)
82 | spectrogram = np.log(np.abs(spectrogram) + 1e-7)
83 |
84 | if self.mode == 'train':
85 | transform = transforms.Compose([
86 | transforms.RandomResizedCrop(224),
87 | transforms.RandomHorizontalFlip(),
88 | transforms.ToTensor(),
89 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
90 | ])
91 | else:
92 | transform = transforms.Compose([
93 | transforms.Resize(size=(224, 224)),
94 | transforms.ToTensor(),
95 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
96 | ])
97 |
98 |
99 | # Visual
100 | image_samples = os.listdir(self.video[idx])
101 | select_index = np.random.choice(len(image_samples), size=self.args.use_video_frames, replace=False)
102 | select_index.sort()
103 | images = torch.zeros((self.args.use_video_frames, 3, 224, 224))
104 | for i in range(self.args.use_video_frames):
105 | img = Image.open(os.path.join(self.video[idx], image_samples[i])).convert('RGB')
106 | img = transform(img)
107 | images[i] = img
108 |
109 | images = torch.permute(images, (1,0,2,3))
110 |
111 | # label
112 | label = self.label[idx]
113 |
114 | return spectrogram, images, label
--------------------------------------------------------------------------------
/dataset/dataset.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import csv
3 | import os
4 | import pickle
5 |
6 | import torch
7 | from PIL import Image
8 | from torch.utils.data import Dataset
9 | from torchvision import transforms
10 |
11 |
12 | class AVDataset(Dataset):
13 |
14 | def __init__(self, args, mode='train'):
15 | classes = []
16 | data = []
17 | data2class = {}
18 | self.mode = mode
19 |
20 | self.data_root = '../data/'
21 |
22 | self.visual_feature_path = os.path.join(self.data_root, args.dataset, 'visual/')
23 | self.audio_feature_path = os.path.join(self.data_root, args.dataset, 'audio_spec/')
24 | self.stat_path = os.path.join(self.data_root, args.dataset, 'stat.txt')
25 | self.train_txt = os.path.join(self.data_root, args.dataset, 'my_train.txt')
26 | self.test_txt = os.path.join(self.data_root, args.dataset, 'my_test.txt')
27 |
28 | with open(self.stat_path) as f1:
29 | csv_reader = csv.reader(f1)
30 | for row in csv_reader:
31 | classes.append(row[0])
32 |
33 | if mode == 'train':
34 | csv_file = self.train_txt
35 | else:
36 | csv_file = self.test_txt
37 |
38 | with open(csv_file) as f2:
39 | csv_reader = csv.reader(f2)
40 | for item in csv_reader:
41 | audio_path = os.path.join(self.audio_feature_path, item[1] + '.pkl')
42 | visual_path = os.path.join(self.visual_feature_path, item[1])
43 | if os.path.exists(audio_path) and os.path.exists(visual_path):
44 | if args.dataset == 'AVE':
45 | # AVE, delete repeated labels
46 | a = set(data)
47 | if item[1] in a:
48 | del data2class[item[1]]
49 | data.remove(item[1])
50 | data.append(item[1])
51 | data2class[item[1]] = item[0]
52 | else:
53 | continue
54 |
55 | self.classes = sorted(classes)
56 |
57 | print(self.classes)
58 | self.data2class = data2class
59 |
60 | self.av_files = []
61 | for item in data:
62 | self.av_files.append(item)
63 | print('# of files = %d ' % len(self.av_files))
64 | print('# of classes = %d' % len(self.classes))
65 |
66 | def __len__(self):
67 | return len(self.av_files)
68 |
69 | def __getitem__(self, idx):
70 | av_file = self.av_files[idx]
71 |
72 | # Audio
73 | audio_path = os.path.join(self.audio_feature_path, av_file + '.pkl')
74 | spectrogram = pickle.load(open(audio_path, 'rb'))
75 |
76 | # Visual
77 | visual_path = os.path.join(self.visual_feature_path, av_file)
78 | file_num = len(os.listdir(visual_path))
79 |
80 | if self.mode == 'train':
81 |
82 | transform = transforms.Compose([
83 | transforms.RandomResizedCrop(224),
84 | transforms.RandomHorizontalFlip(),
85 | transforms.ToTensor(),
86 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
87 | ])
88 | else:
89 | transform = transforms.Compose([
90 | transforms.Resize(size=(224, 224)),
91 | transforms.ToTensor(),
92 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
93 | ])
94 |
95 | pick_num = 3
96 | seg = int(file_num / pick_num)
97 | path1 = []
98 | image = []
99 | image_arr = []
100 | t = [0] * pick_num
101 |
102 | for i in range(pick_num):
103 | t[i] = seg * i + 1
104 | path1.append('frame_0000' + str(t[i]) + '.jpg')
105 | image.append(Image.open(visual_path + "/" + path1[i]).convert('RGB'))
106 | image_arr.append(transform(image[i]))
107 | image_arr[i] = image_arr[i].unsqueeze(1).float()
108 | if i == 0:
109 | image_n = copy.copy(image_arr[i])
110 | else:
111 | image_n = torch.cat((image_n, image_arr[i]), 1)
112 |
113 | return spectrogram, image_n, self.classes.index(self.data2class[av_file]), av_file
114 |
--------------------------------------------------------------------------------
/demo/algorithom.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/algorithom.PNG
--------------------------------------------------------------------------------
/demo/demo_guitar.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/demo_guitar.PNG
--------------------------------------------------------------------------------
/demo/demo_snow.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/demo_snow.PNG
--------------------------------------------------------------------------------
/demo/five lines.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/five lines.PNG
--------------------------------------------------------------------------------
/demo/pipeline.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/pipeline.PNG
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import numpy as np
5 | import torch
6 | import torch.nn as nn
7 | import torch.optim as optim
8 | from torch.utils.data import DataLoader
9 | from torch.utils.tensorboard import SummaryWriter
10 | import pdb
11 |
12 | from dataset.CramedDataset import CramedDataset
13 | from dataset.VGGSoundDataset import VGGSound
14 | from dataset.dataset import AVDataset
15 | from models.basic_model import AVClassifier
16 | from utils.utils import setup_seed, weight_init
17 |
18 |
19 | def get_arguments():
20 | parser = argparse.ArgumentParser()
21 | parser.add_argument('--dataset', default='CREMAD', type=str,
22 | help='VGGSound, KineticSound, CREMAD, AVE')
23 | parser.add_argument('--modulation', default='OGM_GE', type=str,
24 |
25 | choices=['Normal', 'OGM', 'OGM_GE'])
26 | parser.add_argument('--fusion_method', default='concat', type=str,
27 | choices=['sum', 'concat', 'gated', 'film'])
28 | parser.add_argument('--fps', default=1, type=int)
29 | parser.add_argument('--use_video_frames', default=3, type=int)
30 | parser.add_argument('--audio_path', default='/home/hudi/data/CREMA-D/AudioWAV', type=str)
31 | parser.add_argument('--visual_path', default='/home/hudi/data/CREMA-D/', type=str)
32 |
33 | parser.add_argument('--batch_size', default=64, type=int)
34 | parser.add_argument('--epochs', default=100, type=int)
35 |
36 | parser.add_argument('--optimizer', default='sgd', type=str, choices=['sgd', 'adam'])
37 | parser.add_argument('--learning_rate', default=0.001, type=float, help='initial learning rate')
38 | parser.add_argument('--lr_decay_step', default=70, type=int, help='where learning rate decays')
39 | parser.add_argument('--lr_decay_ratio', default=0.1, type=float, help='decay coefficient')
40 |
41 | parser.add_argument('--modulation_starts', default=0, type=int, help='where modulation begins')
42 | parser.add_argument('--modulation_ends', default=50, type=int, help='where modulation ends')
43 | parser.add_argument('--alpha', required=True, type=float, help='alpha in OGM-GE')
44 |
45 | parser.add_argument('--ckpt_path', required=True, type=str, help='path to save trained models')
46 | parser.add_argument('--train', action='store_true', help='turn on train mode')
47 |
48 | parser.add_argument('--use_tensorboard', default=False, type=bool, help='whether to visualize')
49 | parser.add_argument('--tensorboard_path', type=str, help='path to save tensorboard logs')
50 |
51 | parser.add_argument('--random_seed', default=0, type=int)
52 | parser.add_argument('--gpu_ids', default='0, 1', type=str, help='GPU ids')
53 |
54 | return parser.parse_args()
55 |
56 |
57 | def train_epoch(args, epoch, model, device, dataloader, optimizer, scheduler, writer=None):
58 | criterion = nn.CrossEntropyLoss()
59 | softmax = nn.Softmax(dim=1)
60 | relu = nn.ReLU(inplace=True)
61 | tanh = nn.Tanh()
62 |
63 | model.train()
64 | print("Start training ... ")
65 |
66 | _loss = 0
67 | _loss_a = 0
68 | _loss_v = 0
69 |
70 | for step, (spec, image, label) in enumerate(dataloader):
71 |
72 | #pdb.set_trace()
73 | spec = spec.to(device)
74 | image = image.to(device)
75 | label = label.to(device)
76 |
77 | optimizer.zero_grad()
78 |
79 | # TODO: make it simpler and easier to extend
80 | a, v, out = model(spec.unsqueeze(1).float(), image.float())
81 |
82 | if args.fusion_method == 'sum':
83 | out_v = (torch.mm(v, torch.transpose(model.module.fusion_module.fc_y.weight, 0, 1)) +
84 | model.module.fusion_module.fc_y.bias)
85 | out_a = (torch.mm(a, torch.transpose(model.module.fusion_module.fc_x.weight, 0, 1)) +
86 | model.module.fusion_module.fc_x.bias)
87 | else:
88 | weight_size = model.module.fusion_module.fc_out.weight.size(1)
89 | out_v = (torch.mm(v, torch.transpose(model.module.fusion_module.fc_out.weight[:, weight_size // 2:], 0, 1))
90 | + model.module.fusion_module.fc_out.bias / 2)
91 |
92 | out_a = (torch.mm(a, torch.transpose(model.module.fusion_module.fc_out.weight[:, :weight_size // 2], 0, 1))
93 | + model.module.fusion_module.fc_out.bias / 2)
94 |
95 | loss = criterion(out, label)
96 | loss_v = criterion(out_v, label)
97 | loss_a = criterion(out_a, label)
98 | loss.backward()
99 |
100 | if args.modulation == 'Normal':
101 | # no modulation, regular optimization
102 | pass
103 | else:
104 | # Modulation starts here !
105 | score_v = sum([softmax(out_v)[i][label[i]] for i in range(out_v.size(0))])
106 | score_a = sum([softmax(out_a)[i][label[i]] for i in range(out_a.size(0))])
107 |
108 | ratio_v = score_v / score_a
109 | ratio_a = 1 / ratio_v
110 |
111 | """
112 | Below is the Eq.(10) in our CVPR paper:
113 | 1 - tanh(alpha * rho_t_u), if rho_t_u > 1
114 | k_t_u =
115 | 1, else
116 | coeff_u is k_t_u, where t means iteration steps and u is modality indicator, either a or v.
117 | """
118 |
119 | if ratio_v > 1:
120 | coeff_v = 1 - tanh(args.alpha * relu(ratio_v))
121 | coeff_a = 1
122 | else:
123 | coeff_a = 1 - tanh(args.alpha * relu(ratio_a))
124 | coeff_v = 1
125 |
126 | if args.use_tensorboard:
127 | iteration = epoch * len(dataloader) + step
128 | writer.add_scalar('data/ratio v', ratio_v, iteration)
129 | writer.add_scalar('data/coefficient v', coeff_v, iteration)
130 | writer.add_scalar('data/coefficient a', coeff_a, iteration)
131 |
132 | if args.modulation_starts <= epoch <= args.modulation_ends: # bug fixed
133 | for name, parms in model.named_parameters():
134 | layer = str(name).split('.')[1]
135 |
136 | if 'audio' in layer and len(parms.grad.size()) == 4:
137 | if args.modulation == 'OGM_GE': # bug fixed
138 | parms.grad = parms.grad * coeff_a + \
139 | torch.zeros_like(parms.grad).normal_(0, parms.grad.std().item() + 1e-8)
140 | elif args.modulation == 'OGM':
141 | parms.grad *= coeff_a
142 |
143 | if 'visual' in layer and len(parms.grad.size()) == 4:
144 | if args.modulation == 'OGM_GE': # bug fixed
145 | parms.grad = parms.grad * coeff_v + \
146 | torch.zeros_like(parms.grad).normal_(0, parms.grad.std().item() + 1e-8)
147 | elif args.modulation == 'OGM':
148 | parms.grad *= coeff_v
149 | else:
150 | pass
151 |
152 |
153 | optimizer.step()
154 |
155 | _loss += loss.item()
156 | _loss_a += loss_a.item()
157 | _loss_v += loss_v.item()
158 |
159 | scheduler.step()
160 |
161 | return _loss / len(dataloader), _loss_a / len(dataloader), _loss_v / len(dataloader)
162 |
163 |
164 | def valid(args, model, device, dataloader):
165 | softmax = nn.Softmax(dim=1)
166 |
167 | if args.dataset == 'VGGSound':
168 | n_classes = 309
169 | elif args.dataset == 'KineticSound':
170 | n_classes = 31
171 | elif args.dataset == 'CREMAD':
172 | n_classes = 6
173 | elif args.dataset == 'AVE':
174 | n_classes = 28
175 | else:
176 | raise NotImplementedError('Incorrect dataset name {}'.format(args.dataset))
177 |
178 | with torch.no_grad():
179 | model.eval()
180 | # TODO: more flexible
181 | num = [0.0 for _ in range(n_classes)]
182 | acc = [0.0 for _ in range(n_classes)]
183 | acc_a = [0.0 for _ in range(n_classes)]
184 | acc_v = [0.0 for _ in range(n_classes)]
185 |
186 | for step, (spec, image, label) in enumerate(dataloader):
187 |
188 | spec = spec.to(device)
189 | image = image.to(device)
190 | label = label.to(device)
191 |
192 | a, v, out = model(spec.unsqueeze(1).float(), image.float())
193 |
194 | if args.fusion_method == 'sum':
195 |                 out_v = (torch.mm(v, torch.transpose(model.module.fusion_module.fc_y.weight, 0, 1)) +
196 |                          model.module.fusion_module.fc_y.bias)  # full fc_y bias, matching the decomposition in train_epoch()
197 |                 out_a = (torch.mm(a, torch.transpose(model.module.fusion_module.fc_x.weight, 0, 1)) +
198 |                          model.module.fusion_module.fc_x.bias)  # full fc_x bias, matching the decomposition in train_epoch()
199 | else:
200 | out_v = (torch.mm(v, torch.transpose(model.module.fusion_module.fc_out.weight[:, 512:], 0, 1)) +
201 | model.module.fusion_module.fc_out.bias / 2)
202 | out_a = (torch.mm(a, torch.transpose(model.module.fusion_module.fc_out.weight[:, :512], 0, 1)) +
203 | model.module.fusion_module.fc_out.bias / 2)
204 |
205 | prediction = softmax(out)
206 | pred_v = softmax(out_v)
207 | pred_a = softmax(out_a)
208 |
209 | for i in range(image.shape[0]):
210 |
211 | ma = np.argmax(prediction[i].cpu().data.numpy())
212 |                 v_i = np.argmax(pred_v[i].cpu().data.numpy())  # renamed so the batch tensors a / v are not shadowed
213 |                 a_i = np.argmax(pred_a[i].cpu().data.numpy())
214 |                 num[label[i]] += 1.0
215 |
216 |                 #pdb.set_trace()
217 |                 if np.asarray(label[i].cpu()) == ma:
218 |                     acc[label[i]] += 1.0
219 |                 if np.asarray(label[i].cpu()) == v_i:
220 |                     acc_v[label[i]] += 1.0
221 |                 if np.asarray(label[i].cpu()) == a_i:
222 |                     acc_a[label[i]] += 1.0
223 |
224 | return sum(acc) / sum(num), sum(acc_a) / sum(num), sum(acc_v) / sum(num)
225 |
226 |
227 | def main():
228 | args = get_arguments()
229 | print(args)
230 |
231 | setup_seed(args.random_seed)
232 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
233 | gpu_ids = list(range(torch.cuda.device_count()))
234 |
235 | device = torch.device('cuda:0')
236 |
237 | model = AVClassifier(args)
238 |
239 | model.apply(weight_init)
240 | model.to(device)
241 |
242 | model = torch.nn.DataParallel(model, device_ids=gpu_ids)
243 |
244 | model.cuda()
245 |
246 |     optimizer = (optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=1e-4) if args.optimizer == 'sgd' else optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=1e-4))  # respect the --optimizer choice instead of always using SGD
247 | scheduler = optim.lr_scheduler.StepLR(optimizer, args.lr_decay_step, args.lr_decay_ratio)
248 |
249 | if args.dataset == 'VGGSound':
250 | train_dataset = VGGSound(args, mode='train')
251 | test_dataset = VGGSound(args, mode='test')
252 | elif args.dataset == 'KineticSound':
253 | train_dataset = AVDataset(args, mode='train')
254 | test_dataset = AVDataset(args, mode='test')
255 | elif args.dataset == 'CREMAD':
256 | train_dataset = CramedDataset(args, mode='train')
257 | test_dataset = CramedDataset(args, mode='test')
258 | elif args.dataset == 'AVE':
259 | train_dataset = AVDataset(args, mode='train')
260 | test_dataset = AVDataset(args, mode='test')
261 | else:
262 |         raise NotImplementedError('Incorrect dataset name {}! '
263 |                                   'Only VGGSound, KineticSound, CREMA-D and AVE are supported for now!'.format(args.dataset))
264 |
265 | train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size,
266 | shuffle=True, num_workers=32, pin_memory=True)
267 |
268 | test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size,
269 | shuffle=False, num_workers=32, pin_memory=True)
270 |
271 | if args.train:
272 |
273 | best_acc = 0.0
274 |
275 | for epoch in range(args.epochs):
276 |
277 | print('Epoch: {}: '.format(epoch))
278 |
279 | if args.use_tensorboard:
280 |
281 |                 writer_path = os.path.join(args.tensorboard_path, args.dataset)
282 |                 if not os.path.exists(writer_path):
283 |                     os.makedirs(writer_path)  # also create missing parent directories
284 |                 log_name = '{}_{}'.format(args.fusion_method, args.modulation)
285 |                 writer = SummaryWriter(os.path.join(writer_path, log_name))
286 |
287 |                 batch_loss, batch_loss_a, batch_loss_v = train_epoch(args, epoch, model, device, train_dataloader,
288 |                                                                      optimizer, scheduler, writer)  # pass writer, otherwise the per-step logging in train_epoch crashes
289 | acc, acc_a, acc_v = valid(args, model, device, test_dataloader)
290 |
291 | writer.add_scalars('Loss', {'Total Loss': batch_loss,
292 | 'Audio Loss': batch_loss_a,
293 | 'Visual Loss': batch_loss_v}, epoch)
294 |
295 | writer.add_scalars('Evaluation', {'Total Accuracy': acc,
296 | 'Audio Accuracy': acc_a,
297 | 'Visual Accuracy': acc_v}, epoch)
298 |
299 | else:
300 | batch_loss, batch_loss_a, batch_loss_v = train_epoch(args, epoch, model, device,
301 | train_dataloader, optimizer, scheduler)
302 | acc, acc_a, acc_v = valid(args, model, device, test_dataloader)
303 |
304 | if acc > best_acc:
305 | best_acc = float(acc)
306 |
307 | if not os.path.exists(args.ckpt_path):
308 |                     os.makedirs(args.ckpt_path)
309 |
310 | model_name = 'best_model_of_dataset_{}_{}_alpha_{}_' \
311 | 'optimizer_{}_modulate_starts_{}_ends_{}_' \
312 | 'epoch_{}_acc_{}.pth'.format(args.dataset,
313 | args.modulation,
314 | args.alpha,
315 | args.optimizer,
316 | args.modulation_starts,
317 | args.modulation_ends,
318 | epoch, acc)
319 |
320 | saved_dict = {'saved_epoch': epoch,
321 | 'modulation': args.modulation,
322 | 'alpha': args.alpha,
323 | 'fusion': args.fusion_method,
324 | 'acc': acc,
325 | 'model': model.state_dict(),
326 | 'optimizer': optimizer.state_dict(),
327 | 'scheduler': scheduler.state_dict()}
328 |
329 | save_dir = os.path.join(args.ckpt_path, model_name)
330 |
331 | torch.save(saved_dict, save_dir)
332 | print('The best model has been saved at {}.'.format(save_dir))
333 | print("Loss: {:.3f}, Acc: {:.3f}".format(batch_loss, acc))
334 | print("Audio Acc: {:.3f}, Visual Acc: {:.3f} ".format(acc_a, acc_v))
335 | else:
336 | print("Loss: {:.3f}, Acc: {:.3f}, Best Acc: {:.3f}".format(batch_loss, acc, best_acc))
337 | print("Audio Acc: {:.3f}, Visual Acc: {:.3f} ".format(acc_a, acc_v))
338 |
339 | else:
340 | # first load trained model
341 | loaded_dict = torch.load(args.ckpt_path)
342 | # epoch = loaded_dict['saved_epoch']
343 | modulation = loaded_dict['modulation']
344 | # alpha = loaded_dict['alpha']
345 | fusion = loaded_dict['fusion']
346 | state_dict = loaded_dict['model']
347 | # optimizer_dict = loaded_dict['optimizer']
348 | # scheduler = loaded_dict['scheduler']
349 |
350 | assert modulation == args.modulation, 'inconsistency between modulation method of loaded model and args !'
351 | assert fusion == args.fusion_method, 'inconsistency between fusion method of loaded model and args !'
352 |
353 |         model.load_state_dict(state_dict)  # load in place; load_state_dict() does not return the model
354 | print('Trained model loaded!')
355 |
356 | acc, acc_a, acc_v = valid(args, model, device, test_dataloader)
357 | print('Accuracy: {}, accuracy_a: {}, accuracy_v: {}'.format(acc, acc_a, acc_v))
358 |
359 |
360 | if __name__ == "__main__":
361 | main()
362 |
--------------------------------------------------------------------------------
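
The modulation branch of `train_epoch` above is the core of OGM-GE: the softmax scores of the ground-truth class give a per-batch discrepancy ratio, Eq. (10) turns that ratio into a scaling coefficient for the currently dominant modality, and the coefficient (plus Gaussian noise in the GE variant) is applied to the encoder gradients before `optimizer.step()`. Below is a minimal, self-contained sketch of that step on a toy two-branch linear model; the names (`audio_enc`, `visual_enc`, `head`) are illustrative and not part of the repository.

```python
import math
import torch
import torch.nn as nn

torch.manual_seed(0)
alpha, n_classes, dim = 0.1, 6, 8

# Toy two-modality model: two encoders feeding a shared concat classifier head.
audio_enc, visual_enc = nn.Linear(16, dim), nn.Linear(16, dim)
head = nn.Linear(2 * dim, n_classes)

x_a, x_v = torch.randn(4, 16), torch.randn(4, 16)
label = torch.randint(0, n_classes, (4,))

a, v = audio_enc(x_a), visual_enc(x_v)
out = head(torch.cat((a, v), dim=1))

# Per-modality logits recovered from the two halves of the head, as in train_epoch().
out_a = a @ head.weight[:, :dim].t() + head.bias / 2
out_v = v @ head.weight[:, dim:].t() + head.bias / 2

nn.CrossEntropyLoss()(out, label).backward()

# Discrepancy ratio: summed softmax scores of the ground-truth class, per modality.
score_a = sum(torch.softmax(out_a, 1)[i][label[i]] for i in range(label.size(0)))
score_v = sum(torch.softmax(out_v, 1)[i][label[i]] for i in range(label.size(0)))
ratio_v = (score_v / score_a).item()

# Eq. (10): down-weight only the dominant modality.
# e.g. alpha = 0.1 and ratio_v = 2.5 give coeff_v = 1 - tanh(0.25) ~ 0.755 and coeff_a = 1.
coeff_v = 1 - math.tanh(alpha * ratio_v) if ratio_v > 1 else 1.0
coeff_a = 1 - math.tanh(alpha / ratio_v) if ratio_v <= 1 else 1.0

for enc, coeff in ((audio_enc, coeff_a), (visual_enc, coeff_v)):
    for p in enc.parameters():
        # OGM rescales the gradient; the GE term adds zero-mean noise whose std
        # matches the gradient's own std, restoring some stochasticity.
        p.grad = p.grad * coeff + torch.zeros_like(p.grad).normal_(0, p.grad.std().item() + 1e-8)

print('coeff_a = {:.3f}, coeff_v = {:.3f}'.format(coeff_a, coeff_v))
```

In the repository the scaling is applied only to the 4-D convolution weights of `audio_net` and `visual_net`; the sketch applies it to every encoder parameter for brevity.
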
/models/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/.DS_Store
--------------------------------------------------------------------------------
/models/backbone.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 |
3 |
4 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
5 | """3x3 convolution with padding"""
6 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
7 | padding=dilation, groups=groups, bias=False, dilation=dilation)
8 |
9 |
10 | def conv1x1(in_planes, out_planes, stride=1):
11 | """1x1 convolution"""
12 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
13 |
14 |
15 | class BasicBlock(nn.Module):
16 | expansion = 1
17 |
18 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
19 | base_width=64, dilation=1, norm_layer=None):
20 | super(BasicBlock, self).__init__()
21 | if norm_layer is None:
22 | norm_layer = nn.BatchNorm2d
23 | if groups != 1 or base_width != 64:
24 | raise ValueError('BasicBlock only supports groups=1 and base_width=64')
25 | if dilation > 1:
26 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
27 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1
28 | self.conv1 = conv3x3(inplanes, planes, stride)
29 | self.bn1 = norm_layer(planes)
30 | self.relu = nn.ReLU(inplace=True)
31 | self.conv2 = conv3x3(planes, planes)
32 | self.bn2 = norm_layer(planes)
33 | self.downsample = downsample
34 | self.stride = stride
35 |
36 | def forward(self, x):
37 | identity = x
38 |
39 | out = self.conv1(x)
40 | out = self.bn1(out)
41 | out = self.relu(out)
42 |
43 | out = self.conv2(out)
44 | out = self.bn2(out)
45 |
46 | if self.downsample is not None:
47 | identity = self.downsample(x)
48 |
49 | out += identity
50 | out = self.relu(out)
51 |
52 | return out
53 |
54 |
55 | class ResNet(nn.Module):
56 |
57 | def __init__(self, block, layers, modality, num_classes=1000, pool='avgpool', zero_init_residual=False,
58 | groups=1, width_per_group=64, replace_stride_with_dilation=None,
59 | norm_layer=None):
60 | super(ResNet, self).__init__()
61 | self.modality = modality
62 | self.pool = pool
63 | if norm_layer is None:
64 | norm_layer = nn.BatchNorm2d
65 | self._norm_layer = norm_layer
66 |
67 | self.inplanes = 64
68 | self.dilation = 1
69 | if replace_stride_with_dilation is None:
70 | # each element in the tuple indicates if we should replace
71 | # the 2x2 stride with a dilated convolution instead
72 | replace_stride_with_dilation = [False, False, False]
73 | if len(replace_stride_with_dilation) != 3:
74 | raise ValueError("replace_stride_with_dilation should be None "
75 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
76 | self.groups = groups
77 | self.base_width = width_per_group
78 | if modality == 'audio':
79 | self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3,
80 | bias=False)
81 | elif modality == 'visual':
82 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
83 | bias=False)
84 | else:
85 | raise NotImplementedError('Incorrect modality, should be audio or visual but got {}'.format(modality))
86 | self.bn1 = norm_layer(self.inplanes)
87 | self.relu = nn.ReLU(inplace=True)
88 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
89 | self.layer1 = self._make_layer(block, 64, layers[0])
90 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
91 | dilate=replace_stride_with_dilation[0])
92 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
93 | dilate=replace_stride_with_dilation[1])
94 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
95 | dilate=replace_stride_with_dilation[2])
96 | # if self.pool == 'avgpool':
97 | # self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
98 | #
99 | # self.fc = nn.Linear(512 * block.expansion, num_classes) # 8192
100 |
101 | for m in self.modules():
102 | if isinstance(m, nn.Conv2d):
103 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
104 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
105 | nn.init.normal_(m.weight, mean=1, std=0.02)
106 | nn.init.constant_(m.bias, 0)
107 |
108 | # Zero-initialize the last BN in each residual branch,
109 | # so that the residual branch starts with zeros, and each residual block behaves like an identity.
110 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
111 | if zero_init_residual:
112 | for m in self.modules():
113 | if isinstance(m, Bottleneck):
114 | nn.init.constant_(m.bn3.weight, 0)
115 | elif isinstance(m, BasicBlock):
116 | nn.init.constant_(m.bn2.weight, 0)
117 |
118 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
119 | norm_layer = self._norm_layer
120 | downsample = None
121 | previous_dilation = self.dilation
122 | if dilate:
123 | self.dilation *= stride
124 | stride = 1
125 | if stride != 1 or self.inplanes != planes * block.expansion:
126 | downsample = nn.Sequential(
127 | conv1x1(self.inplanes, planes * block.expansion, stride),
128 | norm_layer(planes * block.expansion),
129 | )
130 |
131 | layers = []
132 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
133 | self.base_width, previous_dilation, norm_layer))
134 | self.inplanes = planes * block.expansion
135 | for _ in range(1, blocks):
136 | layers.append(block(self.inplanes, planes, groups=self.groups,
137 | base_width=self.base_width, dilation=self.dilation,
138 | norm_layer=norm_layer))
139 |
140 | return nn.Sequential(*layers)
141 |
142 | def forward(self, x):
143 |
144 | if self.modality == 'visual':
145 | (B, C, T, H, W) = x.size()
146 | x = x.permute(0, 2, 1, 3, 4).contiguous()
147 | x = x.view(B * T, C, H, W)
148 |
149 | x = self.conv1(x)
150 | x = self.bn1(x)
151 | x = self.relu(x)
152 | x = self.maxpool(x)
153 |
154 | x = self.layer1(x)
155 | x = self.layer2(x)
156 | x = self.layer3(x)
157 | x = self.layer4(x)
158 | out = x
159 |
160 | return out
161 |
162 |
163 | class Bottleneck(nn.Module):
164 | expansion = 4
165 |
166 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
167 | base_width=64, dilation=1, norm_layer=None):
168 | super(Bottleneck, self).__init__()
169 | if norm_layer is None:
170 | norm_layer = nn.BatchNorm2d
171 | width = int(planes * (base_width / 64.)) * groups
172 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1
173 | self.conv1 = conv1x1(inplanes, width)
174 | self.bn1 = norm_layer(width)
175 | self.conv2 = conv3x3(width, width, stride, groups, dilation)
176 | self.bn2 = norm_layer(width)
177 | self.conv3 = conv1x1(width, planes * self.expansion)
178 | self.bn3 = norm_layer(planes * self.expansion)
179 | self.relu = nn.ReLU(inplace=True)
180 | self.downsample = downsample
181 | self.stride = stride
182 |
183 | def forward(self, x):
184 | identity = x
185 |
186 | out = self.conv1(x)
187 | out = self.bn1(out)
188 | out = self.relu(out)
189 |
190 | out = self.conv2(out)
191 | out = self.bn2(out)
192 | out = self.relu(out)
193 |
194 | out = self.conv3(out)
195 | out = self.bn3(out)
196 |
197 | if self.downsample is not None:
198 | identity = self.downsample(x)
199 |
200 | out += identity
201 | out = self.relu(out)
202 |
203 | return out
204 |
205 |
206 | def _resnet(arch, block, layers, modality, progress, **kwargs):
207 | model = ResNet(block, layers, modality, **kwargs)
208 | return model
209 |
210 |
211 | def resnet18(modality, progress=True, **kwargs):
212 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], modality, progress,
213 | **kwargs)
214 |
--------------------------------------------------------------------------------
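
For reference, a quick shape check for the two modality-specific encoders defined above, assuming it is run from the repository root (the spectrogram and frame sizes are illustrative; the real ones come from the dataset loaders). Both encoders return the raw `layer4` feature map, since pooling and classification happen later in `AVClassifier` and the fusion module.

```python
import torch
from models.backbone import resnet18

audio_net = resnet18(modality='audio')    # conv1 takes a 1-channel spectrogram
visual_net = resnet18(modality='visual')  # conv1 takes 3-channel RGB frames

spec = torch.randn(2, 1, 257, 188)        # (B, 1, F, T) -- illustrative size
frames = torch.randn(2, 3, 3, 224, 224)   # (B, C, T, H, W), T = 3 frames

print(audio_net(spec).shape)     # torch.Size([2, 512, 9, 6])
print(visual_net(frames).shape)  # torch.Size([6, 512, 7, 7]) -- frames are folded into the batch dim
```
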
/models/basic_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from .backbone import resnet18
5 | from .fusion_modules import SumFusion, ConcatFusion, FiLM, GatedFusion
6 |
7 |
8 | class AVClassifier(nn.Module):
9 | def __init__(self, args):
10 | super(AVClassifier, self).__init__()
11 |
12 | fusion = args.fusion_method
13 | if args.dataset == 'VGGSound':
14 | n_classes = 309
15 | elif args.dataset == 'KineticSound':
16 | n_classes = 31
17 | elif args.dataset == 'CREMAD':
18 | n_classes = 6
19 | elif args.dataset == 'AVE':
20 | n_classes = 28
21 | else:
22 | raise NotImplementedError('Incorrect dataset name {}'.format(args.dataset))
23 |
24 | if fusion == 'sum':
25 | self.fusion_module = SumFusion(output_dim=n_classes)
26 | elif fusion == 'concat':
27 | self.fusion_module = ConcatFusion(output_dim=n_classes)
28 | elif fusion == 'film':
29 | self.fusion_module = FiLM(output_dim=n_classes, x_film=True)
30 | elif fusion == 'gated':
31 | self.fusion_module = GatedFusion(output_dim=n_classes, x_gate=True)
32 | else:
33 | raise NotImplementedError('Incorrect fusion method: {}!'.format(fusion))
34 |
35 | self.audio_net = resnet18(modality='audio')
36 | self.visual_net = resnet18(modality='visual')
37 |
38 | def forward(self, audio, visual):
39 |
40 | a = self.audio_net(audio)
41 | v = self.visual_net(visual)
42 |
43 | (_, C, H, W) = v.size()
44 | B = a.size()[0]
45 | v = v.view(B, -1, C, H, W)
46 | v = v.permute(0, 2, 1, 3, 4)
47 |
48 | a = F.adaptive_avg_pool2d(a, 1)
49 | v = F.adaptive_avg_pool3d(v, 1)
50 |
51 | a = torch.flatten(a, 1)
52 | v = torch.flatten(v, 1)
53 |
54 | a, v, out = self.fusion_module(a, v)
55 |
56 | return a, v, out
57 |
--------------------------------------------------------------------------------
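
A forward-pass sketch for `AVClassifier`, assuming it is run from the repository root (the `SimpleNamespace` stands in for the parsed arguments; all sizes are illustrative): the encoders' feature maps are average-pooled to 512-d embeddings and fused into class logits.

```python
import torch
from types import SimpleNamespace
from models.basic_model import AVClassifier

args = SimpleNamespace(dataset='CREMAD', fusion_method='concat')
model = AVClassifier(args)

spec = torch.randn(2, 1, 257, 188)        # (B, 1, F, T) audio spectrogram
frames = torch.randn(2, 3, 3, 224, 224)   # (B, C, T, H, W) sampled video frames

a, v, out = model(spec, frames)
print(a.shape, v.shape, out.shape)        # (2, 512), (2, 512), (2, 6) for CREMA-D
```
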
/models/fusion_modules.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 |
5 | class SumFusion(nn.Module):
6 | def __init__(self, input_dim=512, output_dim=100):
7 | super(SumFusion, self).__init__()
8 | self.fc_x = nn.Linear(input_dim, output_dim)
9 | self.fc_y = nn.Linear(input_dim, output_dim)
10 |
11 | def forward(self, x, y):
12 | output = self.fc_x(x) + self.fc_y(y)
13 | return x, y, output
14 |
15 |
16 | class ConcatFusion(nn.Module):
17 | def __init__(self, input_dim=1024, output_dim=100):
18 | super(ConcatFusion, self).__init__()
19 | self.fc_out = nn.Linear(input_dim, output_dim)
20 |
21 | def forward(self, x, y):
22 | output = torch.cat((x, y), dim=1)
23 | output = self.fc_out(output)
24 | return x, y, output
25 |
26 |
27 | class FiLM(nn.Module):
28 | """
29 | FiLM: Visual Reasoning with a General Conditioning Layer,
30 | https://arxiv.org/pdf/1709.07871.pdf.
31 | """
32 |
33 | def __init__(self, input_dim=512, dim=512, output_dim=100, x_film=True):
34 | super(FiLM, self).__init__()
35 |
36 |         self.dim = dim  # split size for (gamma, beta); must match the two halves of the generator output
37 | self.fc = nn.Linear(input_dim, 2 * dim)
38 | self.fc_out = nn.Linear(dim, output_dim)
39 |
40 | self.x_film = x_film
41 |
42 | def forward(self, x, y):
43 |
44 | if self.x_film:
45 | film = x
46 | to_be_film = y
47 | else:
48 | film = y
49 | to_be_film = x
50 |
51 | gamma, beta = torch.split(self.fc(film), self.dim, 1)
52 |
53 | output = gamma * to_be_film + beta
54 | output = self.fc_out(output)
55 |
56 | return x, y, output
57 |
58 |
59 | class GatedFusion(nn.Module):
60 | """
61 | Efficient Large-Scale Multi-Modal Classification,
62 | https://arxiv.org/pdf/1802.02892.pdf.
63 | """
64 |
65 | def __init__(self, input_dim=512, dim=512, output_dim=100, x_gate=True):
66 | super(GatedFusion, self).__init__()
67 |
68 | self.fc_x = nn.Linear(input_dim, dim)
69 | self.fc_y = nn.Linear(input_dim, dim)
70 | self.fc_out = nn.Linear(dim, output_dim)
71 |
72 | self.x_gate = x_gate # whether to choose the x to obtain the gate
73 |
74 | self.sigmoid = nn.Sigmoid()
75 |
76 | def forward(self, x, y):
77 | out_x = self.fc_x(x)
78 | out_y = self.fc_y(y)
79 |
80 | if self.x_gate:
81 | gate = self.sigmoid(out_x)
82 | output = self.fc_out(torch.mul(gate, out_y))
83 | else:
84 | gate = self.sigmoid(out_y)
85 | output = self.fc_out(torch.mul(out_x, gate))
86 |
87 | return out_x, out_y, output
88 |
89 |
--------------------------------------------------------------------------------
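
The per-modality logits used in `main.py` (both for the OGM-GE ratio and for the unimodal accuracies) are an exact additive decomposition of the `ConcatFusion` output: splitting `fc_out.weight` column-wise and halving the bias gives `out = out_a + out_v`. A small sanity check, assuming it is run from the repository root:

```python
import torch
from models.fusion_modules import ConcatFusion

fusion = ConcatFusion(input_dim=1024, output_dim=6)
a, v = torch.randn(4, 512), torch.randn(4, 512)
_, _, out = fusion(a, v)

W, b = fusion.fc_out.weight, fusion.fc_out.bias
out_a = a @ W[:, :512].t() + b / 2   # audio occupies the first 512 columns
out_v = v @ W[:, 512:].t() + b / 2   # visual occupies the last 512 columns

print(torch.allclose(out, out_a + out_v, atol=1e-5))  # True
```

For `SumFusion` the analogous decomposition is simply `fc_x(a)` and `fc_y(v)`, each keeping its own full bias.
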
/models/old_models/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/.DS_Store
--------------------------------------------------------------------------------
/models/old_models/__init__.py:
--------------------------------------------------------------------------------
1 | from .encodera import *
2 | from .encoderv import *
3 | #from .avmodel_att import *
4 | #from .avmodel import *
5 |
--------------------------------------------------------------------------------
/models/old_models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel.cpython-36.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_3.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_3.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_am.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_am.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_att.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_att.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_cma.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_cma.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_demo.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_demo.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_gate.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_gate.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_gradblending.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_gradblending.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_md.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_md.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_psp.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_psp.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_uni.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_uni.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/avmodel_x.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_x.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/encodera.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/encodera.cpython-36.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/encodera.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/encodera.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/encoderv.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/encoderv.cpython-36.pyc
--------------------------------------------------------------------------------
/models/old_models/__pycache__/encoderv.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/encoderv.cpython-37.pyc
--------------------------------------------------------------------------------
/models/old_models/avmodel.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from PIL import Image
4 | import torch
5 | import torchvision
6 | from torchvision.transforms import *
7 | import torch.nn as nn
8 | from torch.autograd import Variable
9 | from torch.utils.data import Dataset, DataLoader
10 | import numpy as np
11 | import math
12 | from collections import OrderedDict
13 | import torch.nn.functional as F
14 | import torch.optim as optim
15 | import argparse
16 | import csv
17 | import random
18 | import warnings
19 | import pdb
20 | sys.path.append('/home/xiaokang_peng/ks/models')
21 | import encodera as ma
22 | import encoderv as mv
23 | warnings.filterwarnings('ignore')
24 |
25 |
26 |
27 | class AVmodel(nn.Module):
28 | def __init__(self,args):
29 | super(AVmodel,self).__init__()
30 | self.args = args
31 | self.parta = ma.Resnet(self.args)
32 | self.parta.fc = nn.Linear(512, args.n_classes)
33 |
34 | self.partv = mv.Resnet(self.args)
35 | self.partv.fc = nn.Linear(512, args.n_classes)
36 |
37 | self.fc_ = nn.Linear(1024, args.n_classes)
38 |
39 | self.dropx = nn.Dropout(0.0)
40 | self.dropy = nn.Dropout(0.5)
41 |
42 |
43 |
44 | def forward(self,audio,visual,label,iterations):
45 |
46 | y = self.parta(audio)
47 | x = self.partv(visual)
48 | (_, C, H, W) = x.size()
49 | B = y.size()[0]
50 | x = x.view(B, -1, C, H, W)
51 | x = x.permute(0, 2, 1, 3, 4)
52 |
53 | x = F.adaptive_avg_pool3d(x, 1)
54 | y = F.adaptive_avg_pool2d(y, 1)
55 | x = x.squeeze(2).squeeze(2).squeeze(2)
56 | y = y.squeeze(2).squeeze(2)
57 |
58 | #x = self.dropx(x)
59 | #y = self.dropy(y)
60 | #x *= self.dropx(torch.ones(1)).cuda()
61 | #y *= self.dropy(torch.ones(1)).cuda()
62 |
63 | out = torch.cat((x, y),1)
64 | out = self.fc_(out)
65 |
66 |
67 | return x, y, out
68 |
69 |
--------------------------------------------------------------------------------
/models/old_models/avmodel_x.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from PIL import Image
4 | import torch
5 | import torchvision
6 | from torchvision.transforms import *
7 | import torch.nn as nn
8 | from torch.autograd import Variable
9 | from torch.utils.data import Dataset, DataLoader
10 | import numpy as np
11 | import math
12 | from collections import OrderedDict
13 | import torch.nn.functional as F
14 | import torch.optim as optim
15 | import argparse
16 | import csv
17 | import random
18 | import warnings
19 | import pdb
20 | #sys.path.append('/home/xiaokang_peng/avetry/ave_av/models')
21 | import encodera as ma
22 | import encoderv as mv
23 | warnings.filterwarnings('ignore')
24 |
25 |
26 | from resemblyzer import VoiceEncoder, preprocess_wav
27 | from pathlib import Path
28 |
29 |
30 | class AVmodel_x(nn.Module):
31 | def __init__(self,args):
32 | super(AVmodel_x,self).__init__()
33 | self.args = args
34 | '''
35 | self.parta = ma.Resnet(self.args)
36 | self.parta.fc = nn.Linear(512, args.n_classes)
37 | '''
38 | self.partv = mv.Resnet(self.args)
39 | self.partv.fc = nn.Linear(512, args.n_classes)
40 |
41 | self.fc_ = nn.Linear(1024, args.n_classes)
42 | self.fc_a = nn.Linear(256, 512)
43 |
44 |
45 |
46 |
47 | def forward(self,audio,visual,label,iterations):
48 | iteration = iterations
49 | y = audio
50 | #print(audio.size())
51 | x = self.partv(visual)
52 | (_, C, H, W) = x.size()
53 | B = y.size()[0]
54 | x = x.view(B, -1, C, H, W)
55 | x = x.permute(0, 2, 1, 3, 4)
56 |
57 | x = F.adaptive_avg_pool3d(x, 1)
58 | #y = F.adaptive_avg_pool2d(y, 1)
59 | x = x.squeeze(2).squeeze(2).squeeze(2)
60 | #y = y.squeeze(2).squeeze(2)
61 | y = self.fc_a(y)
62 |
63 | #print(x.size(),y.size())
64 |
65 | out = torch.cat((x, y),1)
66 | out = self.fc_(out)
67 |
68 |
69 | return x, y, out
70 |
71 |
--------------------------------------------------------------------------------
/models/old_models/encodera.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class Aencoder(nn.Module):
7 |
8 | def __init__(self, args):
9 | super(Aencoder, self).__init__()
10 | self.audnet = Resnet(args)
11 |
12 | def forward(self, audio):
13 | aud = self.audnet(audio)
14 | return aud
15 |
16 |
17 | def Resnet(opt):
18 | if opt.model_depth == 18:
19 | model = resnet18(
20 | # num_classes=opt.n_classes,
21 | num_classes=1000,
22 | pool=opt.pool)
23 | return model
24 |
25 |
26 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
27 | """3x3 convolution with padding"""
28 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
29 | padding=dilation, groups=groups, bias=False, dilation=dilation)
30 |
31 |
32 | def conv1x1(in_planes, out_planes, stride=1):
33 | """1x1 convolution"""
34 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
35 |
36 |
37 | class BasicBlock(nn.Module):
38 | expansion = 1
39 |
40 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
41 | base_width=64, dilation=1, norm_layer=None):
42 | super(BasicBlock, self).__init__()
43 | if norm_layer is None:
44 | norm_layer = nn.BatchNorm2d
45 | if groups != 1 or base_width != 64:
46 | raise ValueError('BasicBlock only supports groups=1 and base_width=64')
47 | if dilation > 1:
48 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
49 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1
50 | self.conv1 = conv3x3(inplanes, planes, stride)
51 | self.bn1 = norm_layer(planes)
52 | self.relu = nn.ReLU(inplace=True)
53 | self.conv2 = conv3x3(planes, planes)
54 | self.bn2 = norm_layer(planes)
55 | self.downsample = downsample
56 | self.stride = stride
57 |
58 | def forward(self, x):
59 | identity = x
60 |
61 | out = self.conv1(x)
62 | out = self.bn1(out)
63 | out = self.relu(out)
64 |
65 | out = self.conv2(out)
66 | out = self.bn2(out)
67 |
68 | if self.downsample is not None:
69 | identity = self.downsample(x)
70 |
71 | out += identity
72 | out = self.relu(out)
73 |
74 | return out
75 |
76 |
77 | class ResNet(nn.Module):
78 |
79 | def __init__(self, block, layers, num_classes=1000, pool='avgpool', zero_init_residual=False,
80 | groups=1, width_per_group=64, replace_stride_with_dilation=None,
81 | norm_layer=None):
82 | super(ResNet, self).__init__()
83 | self.pool = pool
84 | if norm_layer is None:
85 | norm_layer = nn.BatchNorm2d
86 | self._norm_layer = norm_layer
87 |
88 | self.inplanes = 64
89 | self.dilation = 1
90 | if replace_stride_with_dilation is None:
91 | # each element in the tuple indicates if we should replace
92 | # the 2x2 stride with a dilated convolution instead
93 | replace_stride_with_dilation = [False, False, False]
94 | if len(replace_stride_with_dilation) != 3:
95 | raise ValueError("replace_stride_with_dilation should be None "
96 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
97 | self.groups = groups
98 | self.base_width = width_per_group
99 | self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3,
100 | bias=False)
101 | self.bn1 = norm_layer(self.inplanes)
102 | self.relu = nn.ReLU(inplace=True)
103 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
104 | self.layer1 = self._make_layer(block, 64, layers[0])
105 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
106 | dilate=replace_stride_with_dilation[0])
107 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
108 | dilate=replace_stride_with_dilation[1])
109 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
110 | dilate=replace_stride_with_dilation[2])
111 | if self.pool == 'avgpool':
112 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
113 |
114 | self.fc = nn.Linear(512 * block.expansion, num_classes) # 8192
115 | elif self.pool == 'vlad':
116 | self.avgpool = NetVLAD()
117 | self.fc_ = nn.Linear(8192 * block.expansion, num_classes)
118 |
119 | for m in self.modules():
120 | if isinstance(m, nn.Conv2d):
121 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
122 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
123 | nn.init.normal_(m.weight, mean=1, std=0.02)
124 | nn.init.constant_(m.bias, 0)
125 |
126 | # Zero-initialize the last BN in each residual branch,
127 | # so that the residual branch starts with zeros, and each residual block behaves like an identity.
128 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
129 | if zero_init_residual:
130 | for m in self.modules():
131 | if isinstance(m, Bottleneck):
132 | nn.init.constant_(m.bn3.weight, 0)
133 | elif isinstance(m, BasicBlock):
134 | nn.init.constant_(m.bn2.weight, 0)
135 |
136 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
137 | norm_layer = self._norm_layer
138 | downsample = None
139 | previous_dilation = self.dilation
140 | if dilate:
141 | self.dilation *= stride
142 | stride = 1
143 | if stride != 1 or self.inplanes != planes * block.expansion:
144 | downsample = nn.Sequential(
145 | conv1x1(self.inplanes, planes * block.expansion, stride),
146 | norm_layer(planes * block.expansion),
147 | )
148 |
149 | layers = []
150 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
151 | self.base_width, previous_dilation, norm_layer))
152 | self.inplanes = planes * block.expansion
153 | for _ in range(1, blocks):
154 | layers.append(block(self.inplanes, planes, groups=self.groups,
155 | base_width=self.base_width, dilation=self.dilation,
156 | norm_layer=norm_layer))
157 |
158 | return nn.Sequential(*layers)
159 |
160 | def forward(self, x):
161 | x = self.conv1(x)
162 | x = self.bn1(x)
163 | x = self.relu(x)
164 | x = self.maxpool(x)
165 |
166 | x = self.layer1(x)
167 | x = self.layer2(x)
168 | x = self.layer3(x)
169 | x = self.layer4(x)
170 | out = x
171 |
172 | x = self.avgpool(x)
173 | x = x.reshape(x.size(0), -1)
174 |
175 | if self.pool == 'avgpool':
176 | x = self.fc(x)
177 | elif self.pool == 'vlad':
178 | x = self.fc_(x)
179 |
180 | return out
181 |
182 |
183 | class NetVLAD(nn.Module):
184 | """NetVLAD layer implementation"""
185 |
186 | def __init__(self, num_clusters=16, dim=512, alpha=100.0,
187 | normalize_input=True):
188 | """
189 | Args:
190 | num_clusters : int
191 | The number of clusters
192 | dim : int
193 | Dimension of descriptors
194 | alpha : float
195 | Parameter of initialization. Larger value is harder assignment.
196 | normalize_input : bool
197 | If true, descriptor-wise L2 normalization is applied to input.
198 | """
199 | super(NetVLAD, self).__init__()
200 | self.num_clusters = num_clusters
201 | self.dim = dim
202 | self.alpha = alpha
203 | self.normalize_input = normalize_input
204 | self.conv = nn.Conv2d(dim, num_clusters, kernel_size=(1, 1), bias=True)
205 | self.centroids = nn.Parameter(torch.rand(num_clusters, dim))
206 | self._init_params()
207 |
208 | def _init_params(self):
209 | self.conv.weight = nn.Parameter(
210 | (2.0 * self.alpha * self.centroids).unsqueeze(-1).unsqueeze(-1)
211 | )
212 | self.conv.bias = nn.Parameter(
213 | - self.alpha * self.centroids.norm(dim=1)
214 | )
215 |
216 | def forward(self, x):
217 | N, C = x.shape[:2]
218 |
219 | if self.normalize_input:
220 | x = F.normalize(x, p=2, dim=1) # across descriptor dim
221 |
222 | # soft-assignment
223 | soft_assign = self.conv(x).view(N, self.num_clusters, -1)
224 | soft_assign = F.softmax(soft_assign, dim=1)
225 |
226 | x_flatten = x.view(N, C, -1)
227 |
228 | # calculate residuals to each clusters
229 | residual = x_flatten.expand(self.num_clusters, -1, -1, -1).permute(1, 0, 2, 3) - \
230 | self.centroids.expand(x_flatten.size(-1), -1, -1).permute(1, 2, 0).unsqueeze(0)
231 | residual *= soft_assign.unsqueeze(2)
232 | vlad = residual.sum(dim=-1)
233 |
234 | vlad = F.normalize(vlad, p=2, dim=2) # intra-normalization
235 | vlad = vlad.view(x.size(0), -1) # flatten
236 | vlad = F.normalize(vlad, p=2, dim=1) # L2 normalize
237 |
238 | return vlad
239 |
240 |
241 | class Bottleneck(nn.Module):
242 | expansion = 4
243 |
244 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
245 | base_width=64, dilation=1, norm_layer=None):
246 | super(Bottleneck, self).__init__()
247 | if norm_layer is None:
248 | norm_layer = nn.BatchNorm2d
249 | width = int(planes * (base_width / 64.)) * groups
250 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1
251 | self.conv1 = conv1x1(inplanes, width)
252 | self.bn1 = norm_layer(width)
253 | self.conv2 = conv3x3(width, width, stride, groups, dilation)
254 | self.bn2 = norm_layer(width)
255 | self.conv3 = conv1x1(width, planes * self.expansion)
256 | self.bn3 = norm_layer(planes * self.expansion)
257 | self.relu = nn.ReLU(inplace=True)
258 | self.downsample = downsample
259 | self.stride = stride
260 |
261 | def forward(self, x):
262 | identity = x
263 |
264 | out = self.conv1(x)
265 | out = self.bn1(out)
266 | out = self.relu(out)
267 |
268 | out = self.conv2(out)
269 | out = self.bn2(out)
270 | out = self.relu(out)
271 |
272 | out = self.conv3(out)
273 | out = self.bn3(out)
274 |
275 | if self.downsample is not None:
276 | identity = self.downsample(x)
277 |
278 | out += identity
279 | out = self.relu(out)
280 |
281 | return out
282 |
283 |
284 | def _resnet(arch, block, layers, pretrained, progress, **kwargs):
285 | model = ResNet(block, layers, **kwargs)
286 |     if pretrained:  # NOTE: requires torchvision's load_state_dict_from_url / model_urls, which are not imported here; only pretrained=False is used
287 | state_dict = load_state_dict_from_url(model_urls[arch],
288 | progress=progress)
289 | model.load_state_dict(state_dict)
290 | return model
291 |
292 |
293 | def resnet18(pretrained=False, progress=True, **kwargs):
294 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
295 | **kwargs)
296 |
--------------------------------------------------------------------------------
/models/old_models/encoderv.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class Vencoder(nn.Module):
7 |
8 | def __init__(self,args):
9 | super(Vencoder, self).__init__()
10 | self.audnet = Resnet(args)
11 |
12 | def forward(self, audio):
13 | aud = self.audnet(audio)
14 | return aud
15 |
16 |
17 | def Resnet(opt):
18 | if opt.model_depth == 18:
19 | model = resnet18(
20 | #num_classes=opt.n_classes,
21 | num_classes=1000,
22 | pool=opt.pool)
23 | return model
24 |
25 |
26 |
27 |
28 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
29 | """3x3 convolution with padding"""
30 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
31 | padding=dilation, groups=groups, bias=False, dilation=dilation)
32 |
33 |
34 | def conv1x1(in_planes, out_planes, stride=1):
35 | """1x1 convolution"""
36 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
37 |
38 |
39 | class BasicBlock(nn.Module):
40 | expansion = 1
41 |
42 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
43 | base_width=64, dilation=1, norm_layer=None):
44 | super(BasicBlock, self).__init__()
45 | if norm_layer is None:
46 | norm_layer = nn.BatchNorm2d
47 | if groups != 1 or base_width != 64:
48 | raise ValueError('BasicBlock only supports groups=1 and base_width=64')
49 | if dilation > 1:
50 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
51 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1
52 | self.conv1 = conv3x3(inplanes, planes, stride)
53 | self.bn1 = norm_layer(planes)
54 | self.relu = nn.ReLU(inplace=True)
55 | self.conv2 = conv3x3(planes, planes)
56 | self.bn2 = norm_layer(planes)
57 | self.downsample = downsample
58 | self.stride = stride
59 |
60 | def forward(self, x):
61 | identity = x
62 |
63 | out = self.conv1(x)
64 | out = self.bn1(out)
65 | out = self.relu(out)
66 |
67 | out = self.conv2(out)
68 | out = self.bn2(out)
69 |
70 | if self.downsample is not None:
71 | identity = self.downsample(x)
72 |
73 | out += identity
74 | out = self.relu(out)
75 |
76 | return out
77 |
78 |
79 | class ResNet(nn.Module):
80 |
81 | def __init__(self, block, layers, num_classes=1000, pool='avgpool', zero_init_residual=False,
82 | groups=1, width_per_group=64, replace_stride_with_dilation=None,
83 | norm_layer=None):
84 | super(ResNet, self).__init__()
85 | self.pool = pool
86 | if norm_layer is None:
87 | norm_layer = nn.BatchNorm2d
88 | self._norm_layer = norm_layer
89 |
90 | self.inplanes = 64
91 | self.dilation = 1
92 | if replace_stride_with_dilation is None:
93 | # each element in the tuple indicates if we should replace
94 | # the 2x2 stride with a dilated convolution instead
95 | replace_stride_with_dilation = [False, False, False]
96 | if len(replace_stride_with_dilation) != 3:
97 | raise ValueError("replace_stride_with_dilation should be None "
98 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
99 | self.groups = groups
100 | self.base_width = width_per_group
101 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
102 | bias=False)
103 | self.bn1 = norm_layer(self.inplanes)
104 | self.relu = nn.ReLU(inplace=True)
105 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
106 | self.layer1 = self._make_layer(block, 64, layers[0])
107 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
108 | dilate=replace_stride_with_dilation[0])
109 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
110 | dilate=replace_stride_with_dilation[1])
111 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
112 | dilate=replace_stride_with_dilation[2])
113 | if self.pool == 'avgpool':
114 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
115 |
116 | self.fc = nn.Linear(512 * block.expansion, num_classes) # 8192
117 | elif self.pool == 'vlad':
118 | self.avgpool = NetVLAD()
119 | self.fc_ = nn.Linear(8192 * block.expansion, num_classes)
120 |
121 | for m in self.modules():
122 | if isinstance(m, nn.Conv2d):
123 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
124 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
125 | nn.init.normal_(m.weight, mean=1, std=0.02)
126 | nn.init.constant_(m.bias, 0)
127 |
128 | # Zero-initialize the last BN in each residual branch,
129 | # so that the residual branch starts with zeros, and each residual block behaves like an identity.
130 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
131 | if zero_init_residual:
132 | for m in self.modules():
133 | if isinstance(m, Bottleneck):
134 | nn.init.constant_(m.bn3.weight, 0)
135 | elif isinstance(m, BasicBlock):
136 | nn.init.constant_(m.bn2.weight, 0)
137 |
138 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
139 | norm_layer = self._norm_layer
140 | downsample = None
141 | previous_dilation = self.dilation
142 | if dilate:
143 | self.dilation *= stride
144 | stride = 1
145 | if stride != 1 or self.inplanes != planes * block.expansion:
146 | downsample = nn.Sequential(
147 | conv1x1(self.inplanes, planes * block.expansion, stride),
148 | norm_layer(planes * block.expansion),
149 | )
150 |
151 | layers = []
152 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
153 | self.base_width, previous_dilation, norm_layer))
154 | self.inplanes = planes * block.expansion
155 | for _ in range(1, blocks):
156 | layers.append(block(self.inplanes, planes, groups=self.groups,
157 | base_width=self.base_width, dilation=self.dilation,
158 | norm_layer=norm_layer))
159 |
160 | return nn.Sequential(*layers)
161 |
162 | def forward(self, x):
163 |
164 | (B, C, T, H, W) = x.size()
165 | x = x.permute(0, 2, 1, 3, 4).contiguous()
166 | x = x.view(B * T, C, H, W)
167 |
168 | x = self.conv1(x)
169 | x = self.bn1(x)
170 | x = self.relu(x)
171 | x = self.maxpool(x)
172 |
173 | x = self.layer1(x)
174 | x = self.layer2(x)
175 | x = self.layer3(x)
176 | x = self.layer4(x)
177 |
178 | out = x
179 | '''
180 | x = self.avgpool(x)
181 | x = x.reshape(x.size(0), -1)
182 |
183 | if self.pool == 'avgpool':
184 | x = self.fc(x)
185 | elif self.pool == 'vlad':
186 | x = self.fc_(x)
187 | '''
188 | return out
189 |
190 |
191 | class NetVLAD(nn.Module):
192 | """NetVLAD layer implementation"""
193 |
194 | def __init__(self, num_clusters=16, dim=512, alpha=100.0,
195 | normalize_input=True):
196 | """
197 | Args:
198 | num_clusters : int
199 | The number of clusters
200 | dim : int
201 | Dimension of descriptors
202 | alpha : float
203 | Parameter of initialization. Larger value is harder assignment.
204 | normalize_input : bool
205 | If true, descriptor-wise L2 normalization is applied to input.
206 | """
207 | super(NetVLAD, self).__init__()
208 | self.num_clusters = num_clusters
209 | self.dim = dim
210 | self.alpha = alpha
211 | self.normalize_input = normalize_input
212 | self.conv = nn.Conv2d(dim, num_clusters, kernel_size=(1, 1), bias=True)
213 | self.centroids = nn.Parameter(torch.rand(num_clusters, dim))
214 | self._init_params()
215 |
216 | def _init_params(self):
217 | self.conv.weight = nn.Parameter(
218 | (2.0 * self.alpha * self.centroids).unsqueeze(-1).unsqueeze(-1)
219 | )
220 | self.conv.bias = nn.Parameter(
221 | - self.alpha * self.centroids.norm(dim=1)
222 | )
223 |
224 | def forward(self, x):
225 | N, C = x.shape[:2]
226 |
227 | if self.normalize_input:
228 | x = F.normalize(x, p=2, dim=1) # across descriptor dim
229 |
230 | # soft-assignment
231 | soft_assign = self.conv(x).view(N, self.num_clusters, -1)
232 | soft_assign = F.softmax(soft_assign, dim=1)
233 |
234 | x_flatten = x.view(N, C, -1)
235 |
236 | # calculate residuals to each clusters
237 | residual = x_flatten.expand(self.num_clusters, -1, -1, -1).permute(1, 0, 2, 3) - \
238 | self.centroids.expand(x_flatten.size(-1), -1, -1).permute(1, 2, 0).unsqueeze(0)
239 | residual *= soft_assign.unsqueeze(2)
240 | vlad = residual.sum(dim=-1)
241 |
242 | vlad = F.normalize(vlad, p=2, dim=2) # intra-normalization
243 | vlad = vlad.view(x.size(0), -1) # flatten
244 | vlad = F.normalize(vlad, p=2, dim=1) # L2 normalize
245 |
246 | return vlad
247 |
248 |
249 | class Bottleneck(nn.Module):
250 | expansion = 4
251 |
252 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
253 | base_width=64, dilation=1, norm_layer=None):
254 | super(Bottleneck, self).__init__()
255 | if norm_layer is None:
256 | norm_layer = nn.BatchNorm2d
257 | width = int(planes * (base_width / 64.)) * groups
258 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1
259 | self.conv1 = conv1x1(inplanes, width)
260 | self.bn1 = norm_layer(width)
261 | self.conv2 = conv3x3(width, width, stride, groups, dilation)
262 | self.bn2 = norm_layer(width)
263 | self.conv3 = conv1x1(width, planes * self.expansion)
264 | self.bn3 = norm_layer(planes * self.expansion)
265 | self.relu = nn.ReLU(inplace=True)
266 | self.downsample = downsample
267 | self.stride = stride
268 |
269 | def forward(self, x):
270 | identity = x
271 |
272 | out = self.conv1(x)
273 | out = self.bn1(out)
274 | out = self.relu(out)
275 |
276 | out = self.conv2(out)
277 | out = self.bn2(out)
278 | out = self.relu(out)
279 |
280 | out = self.conv3(out)
281 | out = self.bn3(out)
282 |
283 | if self.downsample is not None:
284 | identity = self.downsample(x)
285 |
286 | out += identity
287 | out = self.relu(out)
288 |
289 | return out
290 |
291 |
292 | def _resnet(arch, block, layers, pretrained, progress, **kwargs):
293 | model = ResNet(block, layers, **kwargs)
294 |     if pretrained:  # NOTE: requires torchvision's load_state_dict_from_url / model_urls, which are not imported here; only pretrained=False is used
295 | state_dict = load_state_dict_from_url(model_urls[arch],
296 | progress=progress)
297 | model.load_state_dict(state_dict)
298 | return model
299 |
300 |
301 | def resnet18(pretrained=False, progress=True, **kwargs):
302 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
303 | **kwargs)
304 |
305 |
--------------------------------------------------------------------------------
/utils/evaluation.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def obtain_top1_accuracy(output, target):
5 | with torch.no_grad():
6 | batch_size = output.size(0)
7 |
8 | _, pred = output.topk(1, 1, True, True)
9 | pred = pred.t()
10 | correct = pred.eq(target.view(1, -1).expand_as(pred))
11 |
12 |         correct_k = correct[:1].reshape(-1).float().sum(0, keepdim=True)  # torch uses keepdim, not numpy's keepdims
13 | top1 = correct_k.mul_(100.0 / batch_size)
14 |
15 | return correct_k, top1
16 |
17 |
--------------------------------------------------------------------------------
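
Usage sketch for `obtain_top1_accuracy`, assuming it is run from the repository root: `output` is a `(B, n_classes)` logit tensor and `target` a `(B,)` tensor of class indices; the helper returns the number of correct top-1 predictions and the batch accuracy in percent.

```python
import torch
from utils.evaluation import obtain_top1_accuracy

logits = torch.tensor([[2.0, 0.1, -1.0],
                       [0.2, 0.5, 0.1]])
target = torch.tensor([0, 2])

correct, top1 = obtain_top1_accuracy(logits, target)
print(correct.item(), top1.item())  # 1.0 50.0
```
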
/utils/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import random
5 |
6 |
7 | def setup_seed(seed):
8 | torch.manual_seed(seed)
9 | torch.cuda.manual_seed_all(seed)
10 | np.random.seed(seed)
11 | random.seed(seed)
12 | torch.backends.cudnn.deterministic = True
13 |
14 |
15 | def weight_init(m):
16 | if isinstance(m, nn.Linear):
17 | nn.init.xavier_normal_(m.weight)
18 | nn.init.constant_(m.bias, 0)
19 | elif isinstance(m, nn.Conv2d):
20 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
21 | elif isinstance(m, nn.BatchNorm2d):
22 | nn.init.constant_(m.weight, 1)
23 | nn.init.constant_(m.bias, 0)
24 |
--------------------------------------------------------------------------------