├── .DS_Store ├── .idea ├── .gitignore ├── OGM-GE_CVPR2022.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── LICENSE ├── OGM_Sup.pdf ├── README.md ├── ckpt └── .DS_Store ├── data ├── .DS_Store ├── CREMAD │ ├── .DS_Store │ ├── data.csv │ ├── readme.md │ ├── stat.csv │ ├── test.csv │ ├── train.csv │ └── video_preprocessing.py ├── KineticSound │ ├── my_test.txt │ ├── my_train.txt │ └── process_audio.py └── VGGSound │ ├── .DS_Store │ ├── mp4_to_wav.py │ ├── vggsound.csv │ └── video_preprocessing.py ├── dataset ├── CramedDataset.py ├── VGGSoundDataset.py └── dataset.py ├── demo ├── algorithom.PNG ├── demo_guitar.PNG ├── demo_snow.PNG ├── five lines.PNG └── pipeline.PNG ├── main.py ├── models ├── .DS_Store ├── backbone.py ├── basic_model.py ├── fusion_modules.py └── old_models │ ├── .DS_Store │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── avmodel.cpython-36.pyc │ ├── avmodel.cpython-37.pyc │ ├── avmodel_3.cpython-37.pyc │ ├── avmodel_am.cpython-37.pyc │ ├── avmodel_att.cpython-37.pyc │ ├── avmodel_cma.cpython-37.pyc │ ├── avmodel_demo.cpython-37.pyc │ ├── avmodel_gate.cpython-37.pyc │ ├── avmodel_gradblending.cpython-37.pyc │ ├── avmodel_md.cpython-37.pyc │ ├── avmodel_psp.cpython-37.pyc │ ├── avmodel_uni.cpython-37.pyc │ ├── avmodel_x.cpython-37.pyc │ ├── encodera.cpython-36.pyc │ ├── encodera.cpython-37.pyc │ ├── encoderv.cpython-36.pyc │ └── encoderv.cpython-37.pyc │ ├── avmodel.py │ ├── avmodel_x.py │ ├── encodera.py │ └── encoderv.py └── utils ├── evaluation.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/.DS_Store -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/OGM-GE_CVPR2022.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 35 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT 
License 2 | 3 | Copyright (c) 2022 GeWu-Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /OGM_Sup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/OGM_Sup.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Official OGM-GE in PyTorch 2 | 3 | 4 | Here is the official PyTorch implementation of OGM-GE proposed in ''*Balanced Multimodal Learning via On-the-fly Gradient Modulation*'', which is a flexible plug-in module to enhance the optimization process of multimodal learning. Please refer to our [CVPR 2022 paper](https://arxiv.org/abs/2203.15332) for more details. 5 | 6 | **Paper Title: "Balanced Multimodal Learning via On-the-fly Gradient Modulation"** 7 | 8 | **Authors: Xiaokang Peng\*, [Yake Wei\*](https://echo0409.github.io/), [Andong Deng](https://dengandong.github.io/), [Dong Wang](https://redwang.github.io/) and [Di Hu](https://dtaoo.github.io/index.html)** 9 | 10 | **Accepted by: IEEE Conference on Computer Vision and Pattern Recognition (CVPR 2022, Oral Presentation)** 11 | 12 | **[[arXiv](https://arxiv.org/abs/2203.15332)]** **[[Supplementary Material](https://github.com/GeWu-Lab/OGM-GE_CVPR2022/blob/main/OGM_Sup.pdf)]** 13 | 14 | 15 | ## News 16 | - We release a balanced audiovisual dataset for imbalanced multi-modal learning analysis! [Project Page](https://gewu-lab.github.io/Balanced-Audiovisual-Dataset/), [Paper](https://arxiv.org/abs/2302.10912) 17 | - We further develop an imbalance-mitigating method, MMCosine, for audio-visual fine-grained tasks! The [paper](https://arxiv.org/abs/2303.05338) has been accepted by ICASSP 2023. [Project Page](https://gewu-lab.github.io/MMCosine/) 18 | - The effectiveness of the OGM-GE method has been corroborated by the work of several other researchers.
19 | 20 | | Task | Dataset | Modalities | w/o OGM-GE | w/ OGM-GE or similar | Source | 21 | |--------------------|--------------|--------------|------|------|------| 22 | | Action Recognition | UCF101 | RGB, Optical-Flow | 82.3 | 84.0 | [1] | 23 | | Knowledge Graph Link Prediction | OpenBG-Complete-IMG+ | Image, OCR | 59.4 | 60.1 | [2] | 24 | 25 | [1] [On Uni-modal Feature Learning In Supervised Multi-modal Learning](https://openreview.net/pdf?id=mb7VM83DkyC) 26 | 27 | [2] [IMKGA-SM: Interpretable Multimodal Knowledge Graph Answer Prediction via Sequence Modeling](https://arxiv.org/pdf/2301.02445.pdf) 28 | 29 | - Recent works inspired by OGM-GE: 30 | 31 | [PMR: Prototypical Modal Rebalance for Multimodal Learning](https://openaccess.thecvf.com/content/CVPR2023/papers/Fan_PMR_Prototypical_Modal_Rebalance_for_Multimodal_Learning_CVPR_2023_paper.pdf) CVPR 2023. 32 | 33 | [Graph Interactive Network with Adaptive Gradient for Multi-Modal Rumor Detection](https://dl.acm.org/doi/abs/10.1145/3591106.3592250) ICMR 2023. 34 | 35 | [MMCosine: Multi-Modal Cosine Loss Towards Balanced Audio-Visual Fine-Grained Learning](https://arxiv.org/pdf/2303.05338.pdf) ICASSP 2023. 36 | 37 | [Make Acoustic and Visual Cues Matter: CH-SIMS v2.0 Dataset and AV-Mixup Consistent Module](https://dl.acm.org/doi/pdf/10.1145/3536221.3556630) ICMI 2022. 38 | 39 | ## What is the imbalance phenomenon in the multimodal learning task? 40 | We observe that **the potential of multimodal information is not fully exploited even when the multimodal model outperforms its uni-modal counterpart.** We conduct linear probing experiments to explore the quality of the jointly trained encoders, and find them under-optimized (the yellow line) compared with the uni-modal model (the red line). We proposed the OGM-GE method to improve the optimization process adaptively and achieved consistent improvement (the blue line). We improve both the multimodal performance and the uni-modal representation, as shown in the following figure. 41 |
42 | <img src="demo/five lines.PNG">
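To make the linear probing protocol mentioned above concrete, here is a minimal sketch, assuming a frozen uni-modal encoder that maps a batch of inputs to flat feature vectors. The names (`encoder`, `train_loader`, `feat_dim`) are illustrative assumptions; this is not the exact evaluation script used in the paper.

```python
import torch
import torch.nn as nn

def linear_probe(encoder, train_loader, num_classes, feat_dim,
                 epochs=10, lr=1e-3, device='cuda'):
    """Freeze a (jointly trained) uni-modal encoder and train only a linear
    classifier on its features, to measure how well the encoder itself was optimized."""
    encoder.eval()                      # freeze: no gradient flows into the encoder
    for p in encoder.parameters():
        p.requires_grad = False

    probe = nn.Linear(feat_dim, num_classes).to(device)
    optimizer = torch.optim.SGD(probe.parameters(), lr=lr, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    for _ in range(epochs):
        for x, label in train_loader:   # x: input of a single modality (e.g. spectrogram or frames)
            x, label = x.to(device), label.to(device)
            with torch.no_grad():
                feat = encoder(x)       # assumed to return a (batch, feat_dim) feature
            loss = criterion(probe(feat), label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    return probe
```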
44 | 45 | ## Method Introduction 46 | The pipeline of our OGM-GE method consists of two submodules (summarized in the formulas after the pipeline figure below): 47 | 1. On-the-fly Gradient Modulation (OGM), which is designed to adaptively balance the training between modalities; 48 | 2. Adaptive Gaussian noise Enhancement (GE), which restores the gradient intensity and brings generalization. 49 |
50 | <img src="demo/pipeline.PNG">
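In formulas (an informal summary that follows the implementation in `main.py`; please refer to the paper for the exact equations), for each training step t and modality u in {a, v}:

```latex
% Discrepancy ratio on a mini-batch of size N (softmax score of the ground-truth class)
\rho_t^{v} = \frac{\sum_{i=1}^{N}\mathrm{softmax}(out_i^{v})[y_i]}{\sum_{i=1}^{N}\mathrm{softmax}(out_i^{a})[y_i]},
\qquad \rho_t^{a} = 1/\rho_t^{v}

% OGM: modulation coefficient (Eq.(10) in the paper)
k_t^{u} =
\begin{cases}
1 - \tanh\left(\alpha \cdot \rho_t^{u}\right), & \rho_t^{u} > 1\\
1, & \text{otherwise}
\end{cases}

% GE: the modulated gradient is perturbed with zero-mean Gaussian noise
% whose standard deviation matches that of the gradient itself
\tilde{g}(\theta^{u}) = k_t^{u}\, g(\theta^{u}) + \epsilon,
\qquad \epsilon \sim \mathcal{N}\big(0,\ \mathrm{std}(g(\theta^{u}))^{2}\big)
```

Only the currently dominant modality (the one with ratio greater than 1) has its gradient scaled down, while the Gaussian term restores the gradient intensity.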
52 | 53 | 54 | ## Main Dependencies 55 | + Ubuntu 16.04 56 | + CUDA Version: 11.1 57 | + PyTorch 1.8.1 58 | + torchvision 0.9.1 59 | + python 3.7.6 60 | 61 | 62 | ## Usage 63 | ### Data Preparation 64 | Download Original Dataset: 65 | [CREMA-D](https://github.com/CheyneyComputerScience/CREMA-D), 66 | [AVE](https://sites.google.com/view/audiovisualresearch), 67 | [VGGSound](https://www.robots.ox.ac.uk/~vgg/data/vggsound/), 68 | [Kinetics-Sounds](https://github.com/cvdfoundation/kinetics-dataset). 69 | 70 | [comment]: <> ([ESC50](https://github.com/karoldvl/ESC-50/archive/master.zip).) 71 | 72 | 73 | ### Pre-processing 74 | 75 | For the CREMA-D and VGGSound datasets, we provide code to pre-process videos into RGB frames and audio WAV files in the directory ```data/```. 76 | 77 | **[!!Attention]: For the audio modality, we convert the WAV files into spectrograms. [Here](https://github.com/GeWu-Lab/OGM-GE_CVPR2022/blob/main/data/KineticSound/process_audio.py) we provide one of the pre-processing methods for WAV files.** 78 | 79 | #### CREMA-D 80 | 81 | As the original CREMA-D dataset already provides the audio and video files, we simply extract the video frames by running: 82 | 83 | ```python data/CREMAD/video_preprocessing.py``` 84 | 85 | Note that the relevant paths/dirs should be changed according to your own environment. 86 | 87 | #### VGGSound 88 | 89 | As the original VGGSound dataset only provides the raw video files, we have to extract the audio by running: 90 | 91 | ```python data/VGGSound/mp4_to_wav.py``` 92 | 93 | Then, extract the video frames: 94 | 95 | ```python data/VGGSound/video_preprocessing.py``` 96 | 97 | Note that the relevant paths/dirs should be changed according to your own environment. 98 | 99 | 100 | 101 | ## Core code demo 102 | 103 | Our proposed OGM-GE can work as a simple but useful plugin for some widely used multimodal fusion frameworks. We display the core abstract code as follows: 104 | ```python 105 | ---in training step--- 106 | 107 | # out_a and out_v are calculated to estimate the performance of the 'a' and 'v' modalities. 108 | x, y, out = model(spec.unsqueeze(1).float(), image.float(), label, iteration) 109 | out_v = (torch.mm(x,torch.transpose(model.module.fc_.weight[:,:512],0,1)) + model.module.fc_.bias/2) 110 | out_a = (torch.mm(y,torch.transpose(model.module.fc_.weight[:,512:],0,1)) + model.module.fc_.bias/2) 111 | loss = criterion(out, label) 112 | 113 | # Calculate the original gradient first. 114 | loss.backward() 115 | 116 | # Calculation of the discrepancy ratio and k. 117 | k_a,k_v = calculate_coefficient(label, out_a, out_v) 118 | 119 | # Gradient Modulation begins before optimization, with GE applied. 120 | update_model_with_OGM_GE(model, k_a, k_v) 121 | 122 | # Optimize the modulated parameters. 123 | optimizer.step() 124 | 125 | ---continue for next training step--- 126 | ``` 127 | 128 | ### Default modulation setting: 129 | 130 | ```--modulation OGM_GE --modulation_starts 0 --modulation_ends 50 --fusion_method concat --alpha 0.1``` 131 | 132 | You can train your model simply by running: 133 | 134 | ```python main.py --dataset VGGSound --train```.
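Before moving on to custom settings, here is a rough sketch of what the two abstract helpers used in the core code demo above, `calculate_coefficient` and `update_model_with_OGM_GE`, could look like. It is adapted from the released `main.py` (concat fusion, encoders whose parameter names contain 'audio'/'visual'; the function names and the `alpha` default are taken from the demo and the `--alpha` argument) and is meant as a reading aid rather than a drop-in replacement for the actual training script.

```python
import torch
import torch.nn.functional as F

def calculate_coefficient(label, out_a, out_v, alpha=0.1):
    """Estimate the per-modality discrepancy ratios and turn them into
    the OGM coefficients k_a, k_v (Eq.(10) of the paper)."""
    # Sum of the softmax scores assigned to the ground-truth class by each modality
    score_a = sum(F.softmax(out_a, dim=1)[i][label[i]] for i in range(out_a.size(0)))
    score_v = sum(F.softmax(out_v, dim=1)[i][label[i]] for i in range(out_v.size(0)))

    ratio_v = score_v / score_a      # > 1 means the visual branch is currently dominant
    ratio_a = 1 / ratio_v

    if ratio_v > 1:                  # suppress only the dominant modality
        k_v = 1 - torch.tanh(alpha * ratio_v)
        k_a = 1
    else:
        k_a = 1 - torch.tanh(alpha * ratio_a)
        k_v = 1
    return k_a, k_v


def update_model_with_OGM_GE(model, k_a, k_v):
    """Scale the already-computed gradients of the audio/visual encoders by k_a / k_v (OGM),
    then add zero-mean Gaussian noise with the gradient's own std (GE)."""
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        if 'audio' in name and len(param.grad.size()) == 4:    # conv layers of the audio encoder
            param.grad = param.grad * k_a + \
                torch.zeros_like(param.grad).normal_(0, param.grad.std().item() + 1e-8)
        elif 'visual' in name and len(param.grad.size()) == 4:  # conv layers of the visual encoder
            param.grad = param.grad * k_v + \
                torch.zeros_like(param.grad).normal_(0, param.grad.std().item() + 1e-8)
```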
135 | 136 | You can also adapt it to your own setting by adding additional arguments. For example, if you want to train our model on the CREMA-D dataset with the gated fusion method and only OGM (i.e., without GE), and to modulate the gradient from epoch 20 to epoch 80, you can run the following command: 137 | 138 | ```python main.py --train --dataset CREMAD --fusion_method gated --modulation OGM --modulation_starts 20 --modulation_ends 80 --alpha 0.3```. 139 | 140 | 141 | 142 | ## Test and Eval 143 | 144 | You can test the performance of a trained model by simply running: 145 | 146 | ```python main.py --ckpt_path /PATH-to-trained-ckpt ``` 147 | 148 | ## Tips 149 | 150 | There is a hyper-parameter in OGM-GE, alpha, which depends on the modality discrepancy of different datasets. 151 | Here we recommend alpha=0.1 for VGGSound and alpha=0.8 for CREMA-D. 152 | 153 | ## Checkpoints 154 | 155 | [CREMA-D](https://zenodo.org/record/6778788) 156 | 157 | 158 | ## Demo explanation 159 |
160 | <img src="demo/demo_guitar.PNG">
162 |
163 | <img src="demo/demo_snow.PNG">
165 | As shown in the above pictures, 'playing guitar' is a class where audio surpasses the visual modality for most samples ('shovelling snow' is just the opposite), and we can tell that audio achieves more adequate training and leads the optimization process. Our OGM-GE (as well as OGM) gains improvement in both modalities as well as in multimodal performance, and the weaker visual modality profits more. The evaluation metric used in 'audio' and 'visual' is the prediction accuracy obtained with classification scores from just one specific modality. 166 | 167 | 168 | 169 | ## Citation 170 | If you find this work useful, please consider citing it. 171 | 172 |

173 | @inproceedings{Peng2022Balanced,
174 |   title	= {Balanced Multimodal Learning via On-the-fly Gradient Modulation},
175 |   author = {Peng, Xiaokang and Wei, Yake and Deng, Andong and Wang, Dong and Hu, Di},
176 |   booktitle	= {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
177 |   year	= {2022}
178 | }
179 | 
180 | 181 | ## Acknowledgement 182 | 183 | This research was supported by Public Computing Cloud, Renmin University of China. 184 | 185 | ## License 186 | 187 | This project is released under the [MIT License](https://github.com/GeWu-Lab/OGM-GE_CVPR2022/blob/main/LICENSE). 188 | 189 | 190 | ## Contact us 191 | 192 | If you have any detailed questions or suggestions, you can email us: 193 | **yakewei@ruc.edu.cn** and **andongdeng69@gmail.com** 194 | -------------------------------------------------------------------------------- /ckpt/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/ckpt/.DS_Store -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/data/.DS_Store -------------------------------------------------------------------------------- /data/CREMAD/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/data/CREMAD/.DS_Store -------------------------------------------------------------------------------- /data/CREMAD/readme.md: -------------------------------------------------------------------------------- 1 | This place saves the 'classes list', 'training set' and 'testing set' division files for loading data. Users can divide the training set into a training set and a validation set. 2 | Data files of CREMA-D are provided as an example.
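For illustration, the division files below (e.g. `test.csv`, whose rows look like `1041_TIE_NEU_XX,NEU`, and `stat.csv`, which lists one class per row) can be parsed with a few lines of Python. This is only a minimal sketch of the file format, not the repo's actual loader (see `dataset/CramedDataset.py`); the function names and example paths are hypothetical.

```python
import csv

def load_split(csv_path):
    """Parse a CREMA-D division file whose rows look like '1041_TIE_NEU_XX,NEU'
    and return a list of (clip_name, label) pairs."""
    samples = []
    with open(csv_path) as f:
        for row in csv.reader(f):
            if len(row) >= 2:
                samples.append((row[0], row[1]))
    return samples

def load_classes(stat_path):
    """Parse stat.csv, which lists one emotion class (NEU, HAP, ...) per row."""
    with open(stat_path) as f:
        return [row[0] for row in csv.reader(f) if row]

# Hypothetical usage with the files shipped in data/CREMAD/:
# classes = load_classes('data/CREMAD/stat.csv')
# test_samples = load_split('data/CREMAD/test.csv')
```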
3 | -------------------------------------------------------------------------------- /data/CREMAD/stat.csv: -------------------------------------------------------------------------------- 1 | NEU 2 | HAP 3 | SAD 4 | FEA 5 | DIS 6 | ANG 7 | -------------------------------------------------------------------------------- /data/CREMAD/test.csv: -------------------------------------------------------------------------------- 1 | 1041_TIE_NEU_XX,NEU 2 | 1071_MTI_SAD_XX,SAD 3 | 1028_WSI_DIS_XX,DIS 4 | 1016_IEO_HAP_LO,HAP 5 | 1087_IEO_FEA_LO,FEA 6 | 1082_ITH_HAP_XX,HAP 7 | 1071_WSI_NEU_XX,NEU 8 | 1089_IEO_DIS_MD,DIS 9 | 1056_ITH_NEU_XX,NEU 10 | 1091_TAI_SAD_XX,SAD 11 | 1068_ITS_DIS_XX,DIS 12 | 1011_IWL_ANG_XX,ANG 13 | 1052_ITS_NEU_XX,NEU 14 | 1050_IOM_ANG_XX,ANG 15 | 1030_IOM_ANG_XX,ANG 16 | 1027_IWW_SAD_XX,SAD 17 | 1014_TSI_HAP_XX,HAP 18 | 1032_DFA_NEU_XX,NEU 19 | 1026_IWL_SAD_XX,SAD 20 | 1035_TSI_ANG_XX,ANG 21 | 1010_WSI_SAD_XX,SAD 22 | 1039_IEO_SAD_HI,SAD 23 | 1063_DFA_DIS_XX,DIS 24 | 1064_IWW_HAP_XX,HAP 25 | 1072_IOM_SAD_XX,SAD 26 | 1023_TIE_DIS_XX,DIS 27 | 1040_IEO_SAD_HI,SAD 28 | 1088_DFA_FEA_XX,FEA 29 | 1027_IEO_FEA_HI,FEA 30 | 1054_WSI_ANG_XX,ANG 31 | 1048_MTI_NEU_XX,NEU 32 | 1026_TIE_FEA_XX,FEA 33 | 1035_IWW_ANG_XX,ANG 34 | 1089_ITS_NEU_XX,NEU 35 | 1038_IWL_ANG_XX,ANG 36 | 1035_IWL_ANG_XX,ANG 37 | 1011_IEO_SAD_HI,SAD 38 | 1017_IWL_HAP_XX,HAP 39 | 1087_DFA_DIS_XX,DIS 40 | 1040_TSI_DIS_XX,DIS 41 | 1019_IWL_SAD_XX,SAD 42 | 1090_IEO_HAP_LO,HAP 43 | 1018_TAI_SAD_XX,SAD 44 | 1027_IOM_NEU_XX,NEU 45 | 1076_DFA_NEU_XX,NEU 46 | 1037_TAI_NEU_XX,NEU 47 | 1091_TSI_NEU_XX,NEU 48 | 1047_TSI_NEU_XX,NEU 49 | 1076_IEO_SAD_MD,SAD 50 | 1024_IEO_ANG_HI,ANG 51 | 1001_WSI_NEU_XX,NEU 52 | 1091_IEO_HAP_MD,HAP 53 | 1075_IEO_HAP_HI,HAP 54 | 1079_TAI_NEU_XX,NEU 55 | 1077_TAI_DIS_XX,DIS 56 | 1041_ITS_NEU_XX,NEU 57 | 1021_MTI_ANG_XX,ANG 58 | 1002_MTI_HAP_XX,HAP 59 | 1005_IEO_SAD_LO,SAD 60 | 1034_MTI_HAP_XX,HAP 61 | 1026_ITH_HAP_XX,HAP 62 | 1077_DFA_NEU_XX,NEU 63 | 1060_DFA_NEU_XX,NEU 64 | 1081_TAI_DIS_XX,DIS 65 | 1084_MTI_HAP_XX,HAP 66 | 1013_WSI_ANG_XX,ANG 67 | 1071_IOM_DIS_XX,DIS 68 | 1030_TAI_NEU_XX,NEU 69 | 1053_IWL_SAD_XX,SAD 70 | 1048_MTI_FEA_XX,FEA 71 | 1027_IEO_DIS_MD,DIS 72 | 1037_TIE_HAP_XX,HAP 73 | 1066_IWL_NEU_XX,NEU 74 | 1022_TSI_DIS_XX,DIS 75 | 1016_MTI_FEA_XX,FEA 76 | 1049_ITH_FEA_XX,FEA 77 | 1070_IEO_SAD_HI,SAD 78 | 1002_IWL_NEU_XX,NEU 79 | 1039_IOM_HAP_XX,HAP 80 | 1061_TAI_NEU_XX,NEU 81 | 1012_ITH_NEU_XX,NEU 82 | 1040_ITS_DIS_XX,DIS 83 | 1066_ITH_SAD_XX,SAD 84 | 1016_TIE_SAD_XX,SAD 85 | 1044_IEO_SAD_HI,SAD 86 | 1089_TIE_ANG_XX,ANG 87 | 1064_IWL_FEA_XX,FEA 88 | 1032_IEO_DIS_LO,DIS 89 | 1078_DFA_SAD_XX,SAD 90 | 1003_IEO_FEA_MD,FEA 91 | 1082_IWL_FEA_XX,FEA 92 | 1028_TAI_FEA_XX,FEA 93 | 1052_IEO_ANG_MD,ANG 94 | 1006_TAI_FEA_XX,FEA 95 | 1070_IWL_FEA_XX,FEA 96 | 1030_IWW_DIS_XX,DIS 97 | 1063_IEO_SAD_HI,SAD 98 | 1089_WSI_DIS_XX,DIS 99 | 1026_MTI_SAD_XX,SAD 100 | 1009_TIE_ANG_XX,ANG 101 | 1075_IOM_HAP_XX,HAP 102 | 1041_IEO_SAD_LO,SAD 103 | 1037_DFA_FEA_XX,FEA 104 | 1026_TIE_DIS_XX,DIS 105 | 1019_TSI_SAD_XX,SAD 106 | 1036_DFA_HAP_XX,HAP 107 | 1061_IWW_FEA_XX,FEA 108 | 1027_TIE_HAP_XX,HAP 109 | 1066_IOM_HAP_XX,HAP 110 | 1012_IEO_NEU_XX,NEU 111 | 1003_IEO_FEA_HI,FEA 112 | 1016_IEO_ANG_MD,ANG 113 | 1070_IWL_ANG_XX,ANG 114 | 1020_IEO_ANG_LO,ANG 115 | 1010_IWW_FEA_XX,FEA 116 | 1043_IOM_ANG_XX,ANG 117 | 1069_TSI_HAP_XX,HAP 118 | 1073_IEO_ANG_LO,ANG 119 | 1045_WSI_FEA_XX,FEA 120 | 1061_TIE_HAP_XX,HAP 121 | 1004_MTI_ANG_XX,ANG 122 | 1058_TIE_NEU_XX,NEU 123 | 1044_ITS_ANG_XX,ANG 124 | 1081_IOM_NEU_XX,NEU 125 | 
1044_IOM_SAD_XX,SAD 126 | 1004_IEO_ANG_HI,ANG 127 | 1060_ITS_HAP_XX,HAP 128 | 1090_ITS_DIS_XX,DIS 129 | 1005_ITS_ANG_XX,ANG 130 | 1053_IWW_ANG_XX,ANG 131 | 1089_IEO_FEA_HI,FEA 132 | 1009_WSI_NEU_XX,NEU 133 | 1035_ITH_DIS_XX,DIS 134 | 1075_TIE_DIS_XX,DIS 135 | 1023_IOM_FEA_XX,FEA 136 | 1062_IEO_FEA_LO,FEA 137 | 1085_IEO_ANG_MD,ANG 138 | 1091_TSI_ANG_XX,ANG 139 | 1020_ITS_SAD_XX,SAD 140 | 1035_IEO_SAD_MD,SAD 141 | 1007_ITH_ANG_XX,ANG 142 | 1064_TAI_ANG_XX,ANG 143 | 1058_ITS_NEU_XX,NEU 144 | 1034_ITS_ANG_XX,ANG 145 | 1068_TIE_FEA_XX,FEA 146 | 1013_MTI_HAP_XX,HAP 147 | 1037_IWW_FEA_XX,FEA 148 | 1055_TAI_FEA_XX,FEA 149 | 1011_IWL_DIS_XX,DIS 150 | 1053_TIE_DIS_XX,DIS 151 | 1033_IEO_ANG_HI,ANG 152 | 1032_IWW_HAP_XX,HAP 153 | 1010_IOM_HAP_XX,HAP 154 | 1060_ITS_NEU_XX,NEU 155 | 1051_WSI_FEA_XX,FEA 156 | 1088_IEO_DIS_LO,DIS 157 | 1042_IWW_ANG_XX,ANG 158 | 1046_IEO_ANG_MD,ANG 159 | 1073_IWW_FEA_XX,FEA 160 | 1006_IOM_ANG_XX,ANG 161 | 1019_ITS_SAD_XX,SAD 162 | 1056_ITS_FEA_XX,FEA 163 | 1025_DFA_DIS_XX,DIS 164 | 1055_IEO_SAD_HI,SAD 165 | 1086_ITH_SAD_XX,SAD 166 | 1035_IEO_HAP_LO,HAP 167 | 1085_ITS_HAP_XX,HAP 168 | 1078_MTI_NEU_XX,NEU 169 | 1067_IEO_ANG_HI,ANG 170 | 1042_IWL_DIS_XX,DIS 171 | 1020_TSI_FEA_XX,FEA 172 | 1037_ITH_ANG_XX,ANG 173 | 1022_IEO_DIS_LO,DIS 174 | 1029_IWW_SAD_XX,SAD 175 | 1071_MTI_HAP_XX,HAP 176 | 1020_TAI_HAP_XX,HAP 177 | 1011_IWL_FEA_XX,FEA 178 | 1067_IWL_FEA_XX,FEA 179 | 1008_IWL_NEU_XX,NEU 180 | 1058_TSI_NEU_XX,NEU 181 | 1074_TAI_DIS_XX,DIS 182 | 1053_IEO_ANG_HI,ANG 183 | 1059_IOM_DIS_XX,DIS 184 | 1091_DFA_HAP_XX,HAP 185 | 1035_TSI_HAP_XX,HAP 186 | 1054_WSI_DIS_XX,DIS 187 | 1082_TIE_HAP_XX,HAP 188 | 1053_TSI_FEA_XX,FEA 189 | 1041_TSI_SAD_XX,SAD 190 | 1033_TIE_DIS_XX,DIS 191 | 1080_TAI_SAD_XX,SAD 192 | 1007_MTI_ANG_XX,ANG 193 | 1006_IWL_ANG_XX,ANG 194 | 1080_IEO_DIS_HI,DIS 195 | 1047_TSI_HAP_XX,HAP 196 | 1037_TAI_SAD_XX,SAD 197 | 1046_DFA_NEU_XX,NEU 198 | 1055_IWL_DIS_XX,DIS 199 | 1086_IEO_FEA_LO,FEA 200 | 1073_DFA_SAD_XX,SAD 201 | 1026_IWW_NEU_XX,NEU 202 | 1077_DFA_HAP_XX,HAP 203 | 1006_TAI_ANG_XX,ANG 204 | 1077_WSI_DIS_XX,DIS 205 | 1018_TSI_NEU_XX,NEU 206 | 1065_IOM_ANG_XX,ANG 207 | 1037_MTI_ANG_XX,ANG 208 | 1039_IWL_HAP_XX,HAP 209 | 1070_IEO_DIS_LO,DIS 210 | 1030_TSI_FEA_XX,FEA 211 | 1046_IWL_SAD_XX,SAD 212 | 1033_MTI_DIS_XX,DIS 213 | 1059_ITH_NEU_XX,NEU 214 | 1037_TSI_HAP_XX,HAP 215 | 1077_TIE_FEA_XX,FEA 216 | 1011_IWL_SAD_XX,SAD 217 | 1022_DFA_NEU_XX,NEU 218 | 1044_WSI_HAP_XX,HAP 219 | 1047_IEO_DIS_HI,DIS 220 | 1091_ITS_SAD_XX,SAD 221 | 1056_IEO_DIS_LO,DIS 222 | 1031_IWW_FEA_XX,FEA 223 | 1045_ITS_ANG_XX,ANG 224 | 1011_IEO_NEU_XX,NEU 225 | 1084_IWL_HAP_XX,HAP 226 | 1011_IEO_DIS_MD,DIS 227 | 1004_IWL_FEA_XX,FEA 228 | 1019_DFA_NEU_XX,NEU 229 | 1081_MTI_NEU_XX,NEU 230 | 1057_IWW_DIS_XX,DIS 231 | 1039_IOM_SAD_XX,SAD 232 | 1013_TSI_FEA_XX,FEA 233 | 1008_IEO_SAD_LO,SAD 234 | 1041_IWL_DIS_XX,DIS 235 | 1017_IEO_DIS_MD,DIS 236 | 1009_TSI_NEU_XX,NEU 237 | 1023_IEO_DIS_MD,DIS 238 | 1040_ITS_SAD_XX,SAD 239 | 1041_ITH_HAP_XX,HAP 240 | 1077_TIE_DIS_XX,DIS 241 | 1072_IEO_ANG_MD,ANG 242 | 1087_IEO_FEA_MD,FEA 243 | 1018_TAI_NEU_XX,NEU 244 | 1040_TIE_HAP_XX,HAP 245 | 1008_MTI_HAP_XX,HAP 246 | 1089_TSI_ANG_XX,ANG 247 | 1025_IWL_SAD_XX,SAD 248 | 1030_IWW_HAP_XX,HAP 249 | 1022_IWW_FEA_XX,FEA 250 | 1065_ITS_FEA_XX,FEA 251 | 1022_ITH_FEA_XX,FEA 252 | 1001_ITS_SAD_XX,SAD 253 | 1004_IOM_NEU_XX,NEU 254 | 1086_DFA_DIS_XX,DIS 255 | 1085_IWW_ANG_XX,ANG 256 | 1046_IWL_DIS_XX,DIS 257 | 1035_TAI_SAD_XX,SAD 258 | 1054_IEO_HAP_MD,HAP 259 | 1014_DFA_NEU_XX,NEU 260 | 1021_IEO_HAP_LO,HAP 261 | 
1089_IEO_HAP_HI,HAP 262 | 1065_WSI_ANG_XX,ANG 263 | 1029_TSI_HAP_XX,HAP 264 | 1088_IEO_ANG_MD,ANG 265 | 1044_IWW_NEU_XX,NEU 266 | 1064_MTI_HAP_XX,HAP 267 | 1072_IWW_NEU_XX,NEU 268 | 1054_ITS_FEA_XX,FEA 269 | 1087_DFA_SAD_XX,SAD 270 | 1010_MTI_FEA_XX,FEA 271 | 1074_WSI_ANG_XX,ANG 272 | 1049_TSI_SAD_XX,SAD 273 | 1082_TAI_HAP_XX,HAP 274 | 1066_IWW_FEA_XX,FEA 275 | 1072_TSI_SAD_XX,SAD 276 | 1073_IWW_HAP_XX,HAP 277 | 1091_MTI_NEU_XX,NEU 278 | 1022_TAI_HAP_XX,HAP 279 | 1086_WSI_SAD_XX,SAD 280 | 1063_ITH_SAD_XX,SAD 281 | 1046_IEO_DIS_LO,DIS 282 | 1082_IWW_NEU_XX,NEU 283 | 1002_WSI_FEA_XX,FEA 284 | 1044_TIE_FEA_XX,FEA 285 | 1089_IWL_HAP_XX,HAP 286 | 1028_IWW_ANG_XX,ANG 287 | 1057_IEO_ANG_MD,ANG 288 | 1003_TSI_HAP_XX,HAP 289 | 1002_ITH_DIS_XX,DIS 290 | 1081_DFA_NEU_XX,NEU 291 | 1010_ITS_NEU_XX,NEU 292 | 1004_TIE_ANG_XX,ANG 293 | 1091_IEO_HAP_LO,HAP 294 | 1006_IOM_SAD_XX,SAD 295 | 1047_TAI_SAD_XX,SAD 296 | 1046_IEO_FEA_HI,FEA 297 | 1083_TSI_FEA_XX,FEA 298 | 1085_TSI_DIS_XX,DIS 299 | 1060_TAI_NEU_XX,NEU 300 | 1023_ITH_SAD_XX,SAD 301 | 1054_MTI_FEA_XX,FEA 302 | 1028_ITH_NEU_XX,NEU 303 | 1044_ITS_DIS_XX,DIS 304 | 1032_ITS_DIS_XX,DIS 305 | 1044_IEO_DIS_HI,DIS 306 | 1067_IEO_ANG_MD,ANG 307 | 1032_TSI_NEU_XX,NEU 308 | 1070_IEO_ANG_HI,ANG 309 | 1022_TIE_SAD_XX,SAD 310 | 1033_IEO_SAD_LO,SAD 311 | 1083_IWW_HAP_XX,HAP 312 | 1071_ITH_ANG_XX,ANG 313 | 1085_IOM_HAP_XX,HAP 314 | 1054_TIE_HAP_XX,HAP 315 | 1015_TSI_DIS_XX,DIS 316 | 1062_MTI_DIS_XX,DIS 317 | 1007_TIE_ANG_XX,ANG 318 | 1079_DFA_SAD_XX,SAD 319 | 1073_DFA_DIS_XX,DIS 320 | 1008_ITH_FEA_XX,FEA 321 | 1086_IWL_HAP_XX,HAP 322 | 1058_IOM_NEU_XX,NEU 323 | 1034_IEO_DIS_HI,DIS 324 | 1054_IWL_ANG_XX,ANG 325 | 1057_DFA_ANG_XX,ANG 326 | 1072_IWL_DIS_XX,DIS 327 | 1026_IOM_ANG_XX,ANG 328 | 1034_IEO_NEU_XX,NEU 329 | 1047_IWL_DIS_XX,DIS 330 | 1005_TSI_HAP_XX,HAP 331 | 1009_IWL_SAD_XX,SAD 332 | 1043_IEO_DIS_MD,DIS 333 | 1070_IEO_DIS_MD,DIS 334 | 1079_MTI_SAD_XX,SAD 335 | 1005_IWL_NEU_XX,NEU 336 | 1071_ITS_SAD_XX,SAD 337 | 1053_TSI_NEU_XX,NEU 338 | 1036_TIE_FEA_XX,FEA 339 | 1061_ITS_NEU_XX,NEU 340 | 1015_ITH_FEA_XX,FEA 341 | 1088_TSI_FEA_XX,FEA 342 | 1032_MTI_NEU_XX,NEU 343 | 1019_TSI_FEA_XX,FEA 344 | 1026_WSI_DIS_XX,DIS 345 | 1011_WSI_DIS_XX,DIS 346 | 1010_IEO_FEA_LO,FEA 347 | 1027_MTI_FEA_XX,FEA 348 | 1051_MTI_NEU_XX,NEU 349 | 1047_WSI_SAD_XX,SAD 350 | 1031_TAI_FEA_XX,FEA 351 | 1086_IOM_DIS_XX,DIS 352 | 1030_IOM_NEU_XX,NEU 353 | 1072_MTI_FEA_XX,FEA 354 | 1059_IEO_FEA_MD,FEA 355 | 1078_IWW_SAD_XX,SAD 356 | 1043_TAI_DIS_XX,DIS 357 | 1053_DFA_ANG_XX,ANG 358 | 1012_DFA_NEU_XX,NEU 359 | 1049_IWW_NEU_XX,NEU 360 | 1062_IWL_NEU_XX,NEU 361 | 1030_WSI_NEU_XX,NEU 362 | 1012_WSI_FEA_XX,FEA 363 | 1035_DFA_HAP_XX,HAP 364 | 1082_TAI_DIS_XX,DIS 365 | 1070_TAI_NEU_XX,NEU 366 | 1066_WSI_FEA_XX,FEA 367 | 1011_ITS_DIS_XX,DIS 368 | 1032_IEO_FEA_LO,FEA 369 | 1028_IEO_DIS_HI,DIS 370 | 1062_ITS_DIS_XX,DIS 371 | 1018_TIE_ANG_XX,ANG 372 | 1030_WSI_DIS_XX,DIS 373 | 1004_WSI_SAD_XX,SAD 374 | 1081_IWL_ANG_XX,ANG 375 | 1012_IEO_FEA_HI,FEA 376 | 1063_MTI_SAD_XX,SAD 377 | 1080_IWW_NEU_XX,NEU 378 | 1087_TSI_DIS_XX,DIS 379 | 1069_ITH_SAD_XX,SAD 380 | 1019_IOM_HAP_XX,HAP 381 | 1021_TSI_ANG_XX,ANG 382 | 1069_IOM_FEA_XX,FEA 383 | 1039_MTI_NEU_XX,NEU 384 | 1011_MTI_DIS_XX,DIS 385 | 1043_TSI_FEA_XX,FEA 386 | 1055_IEO_SAD_LO,SAD 387 | 1084_IOM_HAP_XX,HAP 388 | 1041_IOM_DIS_XX,DIS 389 | 1043_IOM_SAD_XX,SAD 390 | 1048_DFA_ANG_XX,ANG 391 | 1031_ITS_HAP_XX,HAP 392 | 1032_IWW_NEU_XX,NEU 393 | 1031_DFA_NEU_XX,NEU 394 | 1013_DFA_ANG_XX,ANG 395 | 1056_IOM_NEU_XX,NEU 396 | 1043_WSI_HAP_XX,HAP 397 | 
1018_ITS_DIS_XX,DIS 398 | 1053_IEO_DIS_MD,DIS 399 | 1019_TAI_ANG_XX,ANG 400 | 1079_IEO_SAD_MD,SAD 401 | 1016_WSI_FEA_XX,FEA 402 | 1050_IWW_NEU_XX,NEU 403 | 1046_MTI_FEA_XX,FEA 404 | 1064_TAI_DIS_XX,DIS 405 | 1062_IEO_SAD_LO,SAD 406 | 1065_WSI_HAP_XX,HAP 407 | 1028_IWW_DIS_XX,DIS 408 | 1066_IWW_SAD_XX,SAD 409 | 1014_TAI_DIS_XX,DIS 410 | 1018_WSI_SAD_XX,SAD 411 | 1040_TAI_DIS_XX,DIS 412 | 1015_TAI_SAD_XX,SAD 413 | 1046_TIE_ANG_XX,ANG 414 | 1084_ITH_NEU_XX,NEU 415 | 1005_IEO_FEA_HI,FEA 416 | 1021_WSI_FEA_XX,FEA 417 | 1037_MTI_SAD_XX,SAD 418 | 1076_IWW_ANG_XX,ANG 419 | 1085_IOM_DIS_XX,DIS 420 | 1058_ITH_SAD_XX,SAD 421 | 1009_IOM_NEU_XX,NEU 422 | 1058_TAI_SAD_XX,SAD 423 | 1021_TIE_NEU_XX,NEU 424 | 1009_IWW_ANG_XX,ANG 425 | 1003_IEO_ANG_HI,ANG 426 | 1038_TAI_FEA_XX,FEA 427 | 1027_IWL_NEU_XX,NEU 428 | 1053_WSI_ANG_XX,ANG 429 | 1024_TIE_SAD_XX,SAD 430 | 1047_ITH_FEA_XX,FEA 431 | 1036_IWW_SAD_XX,SAD 432 | 1036_DFA_NEU_XX,NEU 433 | 1088_MTI_FEA_XX,FEA 434 | 1025_IEO_FEA_LO,FEA 435 | 1029_IEO_SAD_HI,SAD 436 | 1019_WSI_SAD_XX,SAD 437 | 1050_WSI_SAD_XX,SAD 438 | 1063_IEO_HAP_LO,HAP 439 | 1020_WSI_FEA_XX,FEA 440 | 1066_DFA_HAP_XX,HAP 441 | 1049_WSI_SAD_XX,SAD 442 | 1023_ITH_ANG_XX,ANG 443 | 1019_IEO_ANG_LO,ANG 444 | 1075_DFA_NEU_XX,NEU 445 | 1044_IEO_SAD_LO,SAD 446 | 1051_IEO_HAP_MD,HAP 447 | 1075_MTI_NEU_XX,NEU 448 | 1079_WSI_DIS_XX,DIS 449 | 1013_ITH_SAD_XX,SAD 450 | 1029_IWL_FEA_XX,FEA 451 | 1062_IEO_DIS_LO,DIS 452 | 1041_DFA_SAD_XX,SAD 453 | 1083_WSI_SAD_XX,SAD 454 | 1030_TAI_DIS_XX,DIS 455 | 1052_DFA_FEA_XX,FEA 456 | 1076_IWW_NEU_XX,NEU 457 | 1026_IEO_NEU_XX,NEU 458 | 1025_IEO_ANG_HI,ANG 459 | 1019_DFA_ANG_XX,ANG 460 | 1033_ITH_DIS_XX,DIS 461 | 1027_IWL_ANG_XX,ANG 462 | 1024_IOM_HAP_XX,HAP 463 | 1077_DFA_ANG_XX,ANG 464 | 1057_IWW_FEA_XX,FEA 465 | 1020_IOM_SAD_XX,SAD 466 | 1055_ITS_NEU_XX,NEU 467 | 1070_DFA_NEU_XX,NEU 468 | 1071_ITS_DIS_XX,DIS 469 | 1027_ITH_HAP_XX,HAP 470 | 1057_IEO_ANG_HI,ANG 471 | 1002_IWL_FEA_XX,FEA 472 | 1052_TAI_FEA_XX,FEA 473 | 1019_WSI_HAP_XX,HAP 474 | 1030_DFA_HAP_XX,HAP 475 | 1064_IEO_SAD_MD,SAD 476 | 1061_DFA_ANG_XX,ANG 477 | 1083_TIE_SAD_XX,SAD 478 | 1045_IEO_ANG_LO,ANG 479 | 1028_IWW_SAD_XX,SAD 480 | 1012_TSI_DIS_XX,DIS 481 | 1070_MTI_HAP_XX,HAP 482 | 1024_TSI_DIS_XX,DIS 483 | 1074_MTI_HAP_XX,HAP 484 | 1030_ITS_FEA_XX,FEA 485 | 1054_IWL_NEU_XX,NEU 486 | 1054_IEO_DIS_MD,DIS 487 | 1084_TIE_NEU_XX,NEU 488 | 1073_TSI_SAD_XX,SAD 489 | 1053_DFA_NEU_XX,NEU 490 | 1025_DFA_HAP_XX,HAP 491 | 1042_MTI_HAP_XX,HAP 492 | 1089_IEO_SAD_LO,SAD 493 | 1009_ITH_FEA_XX,FEA 494 | 1017_MTI_NEU_XX,NEU 495 | 1054_TSI_HAP_XX,HAP 496 | 1071_WSI_DIS_XX,DIS 497 | 1080_TSI_DIS_XX,DIS 498 | 1052_ITS_FEA_XX,FEA 499 | 1078_TIE_NEU_XX,NEU 500 | 1034_IEO_ANG_LO,ANG 501 | 1018_IWL_SAD_XX,SAD 502 | 1034_TAI_ANG_XX,ANG 503 | 1012_TSI_FEA_XX,FEA 504 | 1025_WSI_DIS_XX,DIS 505 | 1036_IEO_SAD_MD,SAD 506 | 1063_IEO_SAD_MD,SAD 507 | 1040_IEO_NEU_XX,NEU 508 | 1007_IWL_FEA_XX,FEA 509 | 1023_IWW_NEU_XX,NEU 510 | 1010_IWW_HAP_XX,HAP 511 | 1067_TAI_DIS_XX,DIS 512 | 1074_ITS_HAP_XX,HAP 513 | 1045_ITS_HAP_XX,HAP 514 | 1072_IWW_FEA_XX,FEA 515 | 1088_ITH_SAD_XX,SAD 516 | 1068_DFA_SAD_XX,SAD 517 | 1041_ITS_FEA_XX,FEA 518 | 1048_IEO_FEA_LO,FEA 519 | 1067_ITS_DIS_XX,DIS 520 | 1008_TAI_SAD_XX,SAD 521 | 1075_TAI_SAD_XX,SAD 522 | 1023_ITH_HAP_XX,HAP 523 | 1063_WSI_FEA_XX,FEA 524 | 1009_IEO_SAD_LO,SAD 525 | 1068_IEO_FEA_HI,FEA 526 | 1071_IOM_NEU_XX,NEU 527 | 1075_IEO_SAD_LO,SAD 528 | 1036_WSI_HAP_XX,HAP 529 | 1022_IOM_DIS_XX,DIS 530 | 1017_TAI_HAP_XX,HAP 531 | 1005_DFA_SAD_XX,SAD 532 | 1055_TAI_SAD_XX,SAD 533 | 
1058_ITH_HAP_XX,HAP 534 | 1014_ITS_HAP_XX,HAP 535 | 1050_IWL_SAD_XX,SAD 536 | 1002_ITS_DIS_XX,DIS 537 | 1029_IEO_DIS_LO,DIS 538 | 1091_IOM_DIS_XX,DIS 539 | 1001_DFA_HAP_XX,HAP 540 | 1073_IEO_DIS_HI,DIS 541 | 1071_ITH_NEU_XX,NEU 542 | 1005_TSI_ANG_XX,ANG 543 | 1083_ITH_DIS_XX,DIS 544 | 1045_IEO_SAD_HI,SAD 545 | 1054_TAI_NEU_XX,NEU 546 | 1046_ITS_FEA_XX,FEA 547 | 1085_ITH_HAP_XX,HAP 548 | 1018_TSI_HAP_XX,HAP 549 | 1008_IOM_HAP_XX,HAP 550 | 1081_TAI_SAD_XX,SAD 551 | 1039_ITH_DIS_XX,DIS 552 | 1076_IEO_NEU_XX,NEU 553 | 1062_MTI_NEU_XX,NEU 554 | 1028_IOM_HAP_XX,HAP 555 | 1004_TAI_DIS_XX,DIS 556 | 1041_IEO_FEA_HI,FEA 557 | 1079_ITS_SAD_XX,SAD 558 | 1065_ITS_DIS_XX,DIS 559 | 1083_MTI_NEU_XX,NEU 560 | 1003_MTI_NEU_XX,NEU 561 | 1029_TIE_DIS_XX,DIS 562 | 1066_ITS_ANG_XX,ANG 563 | 1034_DFA_SAD_XX,SAD 564 | 1034_TSI_HAP_XX,HAP 565 | 1043_TAI_SAD_XX,SAD 566 | 1042_TAI_HAP_XX,HAP 567 | 1088_TAI_FEA_XX,FEA 568 | 1022_IEO_DIS_HI,DIS 569 | 1062_IWL_HAP_XX,HAP 570 | 1003_IEO_HAP_MD,HAP 571 | 1048_IEO_SAD_MD,SAD 572 | 1015_DFA_DIS_XX,DIS 573 | 1056_DFA_FEA_XX,FEA 574 | 1035_DFA_FEA_XX,FEA 575 | 1050_TSI_SAD_XX,SAD 576 | 1039_ITS_NEU_XX,NEU 577 | 1005_IOM_SAD_XX,SAD 578 | 1046_WSI_ANG_XX,ANG 579 | 1044_TSI_NEU_XX,NEU 580 | 1067_WSI_SAD_XX,SAD 581 | 1012_IEO_SAD_HI,SAD 582 | 1067_MTI_SAD_XX,SAD 583 | 1004_IWL_HAP_XX,HAP 584 | 1009_DFA_NEU_XX,NEU 585 | 1041_IOM_SAD_XX,SAD 586 | 1001_DFA_SAD_XX,SAD 587 | 1048_MTI_DIS_XX,DIS 588 | 1071_IEO_HAP_HI,HAP 589 | 1010_DFA_FEA_XX,FEA 590 | 1032_IWL_FEA_XX,FEA 591 | 1057_IWW_ANG_XX,ANG 592 | 1052_ITS_HAP_XX,HAP 593 | 1005_ITH_HAP_XX,HAP 594 | 1015_WSI_HAP_XX,HAP 595 | 1061_IWW_DIS_XX,DIS 596 | 1058_DFA_DIS_XX,DIS 597 | 1021_TSI_FEA_XX,FEA 598 | 1082_IOM_NEU_XX,NEU 599 | 1008_IEO_HAP_LO,HAP 600 | 1085_DFA_FEA_XX,FEA 601 | 1020_IWW_NEU_XX,NEU 602 | 1068_IOM_SAD_XX,SAD 603 | 1070_TAI_ANG_XX,ANG 604 | 1048_WSI_HAP_XX,HAP 605 | 1056_WSI_DIS_XX,DIS 606 | 1025_ITH_ANG_XX,ANG 607 | 1029_ITS_DIS_XX,DIS 608 | 1013_IOM_SAD_XX,SAD 609 | 1010_TSI_FEA_XX,FEA 610 | 1005_IEO_NEU_XX,NEU 611 | 1039_TAI_SAD_XX,SAD 612 | 1027_IEO_SAD_HI,SAD 613 | 1083_IWL_ANG_XX,ANG 614 | 1052_DFA_ANG_XX,ANG 615 | 1052_IEO_FEA_HI,FEA 616 | 1055_IWL_FEA_XX,FEA 617 | 1075_WSI_HAP_XX,HAP 618 | 1061_IOM_HAP_XX,HAP 619 | 1091_IWL_FEA_XX,FEA 620 | 1028_WSI_HAP_XX,HAP 621 | 1009_IEO_HAP_LO,HAP 622 | 1044_TIE_NEU_XX,NEU 623 | 1090_IEO_SAD_MD,SAD 624 | 1053_WSI_FEA_XX,FEA 625 | 1019_MTI_FEA_XX,FEA 626 | 1013_TAI_FEA_XX,FEA 627 | 1029_IOM_SAD_XX,SAD 628 | 1064_IEO_ANG_LO,ANG 629 | 1020_TAI_ANG_XX,ANG 630 | 1001_TAI_HAP_XX,HAP 631 | 1004_ITS_HAP_XX,HAP 632 | 1039_WSI_FEA_XX,FEA 633 | 1012_MTI_SAD_XX,SAD 634 | 1088_MTI_DIS_XX,DIS 635 | 1058_IEO_HAP_LO,HAP 636 | 1089_IWL_SAD_XX,SAD 637 | 1069_IEO_DIS_MD,DIS 638 | 1035_DFA_SAD_XX,SAD 639 | 1032_IWW_SAD_XX,SAD 640 | 1052_TSI_SAD_XX,SAD 641 | 1013_MTI_SAD_XX,SAD 642 | 1065_TIE_SAD_XX,SAD 643 | 1056_IEO_HAP_HI,HAP 644 | 1020_ITH_HAP_XX,HAP 645 | 1024_DFA_NEU_XX,NEU 646 | 1053_IOM_FEA_XX,FEA 647 | 1036_WSI_SAD_XX,SAD 648 | 1052_WSI_FEA_XX,FEA 649 | 1033_IWL_FEA_XX,FEA 650 | 1064_IEO_SAD_LO,SAD 651 | 1036_IWL_HAP_XX,HAP 652 | 1058_MTI_NEU_XX,NEU 653 | 1090_MTI_SAD_XX,SAD 654 | 1007_ITH_FEA_XX,FEA 655 | 1052_WSI_SAD_XX,SAD 656 | 1084_IEO_FEA_MD,FEA 657 | 1030_MTI_NEU_XX,NEU 658 | 1005_WSI_ANG_XX,ANG 659 | 1084_IEO_DIS_LO,DIS 660 | 1090_IEO_FEA_HI,FEA 661 | 1041_WSI_ANG_XX,ANG 662 | 1079_MTI_HAP_XX,HAP 663 | 1047_TAI_FEA_XX,FEA 664 | 1020_MTI_DIS_XX,DIS 665 | 1010_TSI_SAD_XX,SAD 666 | 1055_ITS_HAP_XX,HAP 667 | 1052_IWL_FEA_XX,FEA 668 | 1033_ITH_FEA_XX,FEA 669 | 
1028_TSI_DIS_XX,DIS 670 | 1034_IEO_HAP_MD,HAP 671 | 1089_DFA_HAP_XX,HAP 672 | 1012_IWW_ANG_XX,ANG 673 | 1072_DFA_DIS_XX,DIS 674 | 1068_ITH_FEA_XX,FEA 675 | 1007_IOM_DIS_XX,DIS 676 | 1075_IOM_SAD_XX,SAD 677 | 1050_IWL_FEA_XX,FEA 678 | 1058_MTI_DIS_XX,DIS 679 | 1036_ITS_DIS_XX,DIS 680 | 1060_IWW_HAP_XX,HAP 681 | 1077_TAI_NEU_XX,NEU 682 | 1047_ITH_HAP_XX,HAP 683 | 1088_TSI_NEU_XX,NEU 684 | 1048_TIE_FEA_XX,FEA 685 | 1021_MTI_DIS_XX,DIS 686 | 1060_ITS_ANG_XX,ANG 687 | 1076_IWL_HAP_XX,HAP 688 | 1002_ITS_SAD_XX,SAD 689 | 1026_IEO_FEA_LO,FEA 690 | 1066_IEO_DIS_HI,DIS 691 | 1010_ITS_ANG_XX,ANG 692 | 1037_TAI_DIS_XX,DIS 693 | 1091_IEO_FEA_LO,FEA 694 | 1003_TAI_NEU_XX,NEU 695 | 1012_ITS_HAP_XX,HAP 696 | 1079_IEO_FEA_MD,FEA 697 | 1057_TAI_SAD_XX,SAD 698 | 1060_TSI_NEU_XX,NEU 699 | 1001_WSI_DIS_XX,DIS 700 | 1057_WSI_NEU_XX,NEU 701 | 1041_WSI_DIS_XX,DIS 702 | 1029_MTI_FEA_XX,FEA 703 | 1086_IWW_FEA_XX,FEA 704 | 1028_TAI_SAD_XX,SAD 705 | 1086_IOM_FEA_XX,FEA 706 | 1053_DFA_FEA_XX,FEA 707 | 1025_IEO_FEA_MD,FEA 708 | 1055_IEO_ANG_MD,ANG 709 | 1008_IEO_FEA_LO,FEA 710 | 1013_IOM_FEA_XX,FEA 711 | 1062_TSI_NEU_XX,NEU 712 | 1015_TAI_FEA_XX,FEA 713 | 1030_IWL_DIS_XX,DIS 714 | 1036_IOM_ANG_XX,ANG 715 | 1067_TAI_NEU_XX,NEU 716 | 1072_IWW_SAD_XX,SAD 717 | 1034_IWW_SAD_XX,SAD 718 | 1013_WSI_SAD_XX,SAD 719 | 1032_TIE_FEA_XX,FEA 720 | 1028_WSI_ANG_XX,ANG 721 | 1032_IWL_HAP_XX,HAP 722 | 1078_IEO_FEA_MD,FEA 723 | 1071_ITS_HAP_XX,HAP 724 | 1069_ITS_DIS_XX,DIS 725 | 1078_ITS_ANG_XX,ANG 726 | 1049_DFA_NEU_XX,NEU 727 | 1049_MTI_HAP_XX,HAP 728 | 1004_DFA_ANG_XX,ANG 729 | 1073_IOM_HAP_XX,HAP 730 | 1057_TAI_FEA_XX,FEA 731 | 1012_IWL_HAP_XX,HAP 732 | 1006_ITS_DIS_XX,DIS 733 | 1036_TAI_ANG_XX,ANG 734 | 1055_ITH_DIS_XX,DIS 735 | 1028_IOM_FEA_XX,FEA 736 | 1029_ITH_SAD_XX,SAD 737 | 1056_ITS_ANG_XX,ANG 738 | 1006_IWL_DIS_XX,DIS 739 | 1060_IOM_ANG_XX,ANG 740 | 1089_DFA_ANG_XX,ANG 741 | 1068_IEO_DIS_HI,DIS 742 | 1066_ITS_DIS_XX,DIS 743 | 1072_IEO_SAD_MD,SAD 744 | 1014_TIE_HAP_XX,HAP 745 | -------------------------------------------------------------------------------- /data/CREMAD/video_preprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import cv2 3 | import os 4 | import pdb 5 | 6 | class videoReader(object): 7 | def __init__(self, video_path, frame_interval=1, frame_kept_per_second=1): 8 | self.video_path = video_path 9 | self.frame_interval = frame_interval 10 | self.frame_kept_per_second = frame_kept_per_second 11 | 12 | #pdb.set_trace() 13 | self.vid = cv2.VideoCapture(self.video_path) 14 | self.fps = int(self.vid.get(cv2.CAP_PROP_FPS)) 15 | self.video_frames = self.vid.get(cv2.CAP_PROP_FRAME_COUNT) 16 | self.video_len = int(self.video_frames/self.fps) 17 | 18 | 19 | def video2frame(self, frame_save_path): 20 | self.frame_save_path = frame_save_path 21 | success, image = self.vid.read() 22 | count = 0 23 | while success: 24 | count +=1 25 | if count % self.frame_interval == 0: 26 | save_name = '{}/frame_{}_{}.jpg'.format(self.frame_save_path, int(count/self.fps), count) # filename_second_index 27 | cv2.imencode('.jpg', image)[1].tofile(save_name) 28 | success, image = self.vid.read() 29 | 30 | 31 | def video2frame_update(self, frame_save_path): 32 | self.frame_save_path = frame_save_path 33 | 34 | count = 0 35 | frame_interval = int(self.fps/self.frame_kept_per_second) 36 | while(count < self.video_frames): 37 | ret, image = self.vid.read() 38 | if not ret: 39 | break 40 | if count % self.fps == 0: 41 | frame_id = 0 42 | if frame_id 0: 45 | m = 
torch.nn.ZeroPad2d((0, 0, 0, p)) 46 | fbank = m(fbank) 47 | elif p < 0: 48 | fbank = fbank[0:target_length, :] 49 | fbank = (fbank - norm_mean) / (norm_std * 2) 50 | 51 | print(fbank.shape) 52 | np.save(save_path + '/'+ name + '.npy',fbank) 53 | -------------------------------------------------------------------------------- /data/VGGSound/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/data/VGGSound/.DS_Store -------------------------------------------------------------------------------- /data/VGGSound/mp4_to_wav.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | train_videos = '/data/users/xiaokang_peng/VGGsound/train-videos/train_video_list.txt' 5 | test_videos = '/data/users/xiaokang_peng/VGGsound/test-videos/test_video_list.txt' 6 | 7 | train_audio_dir = '/data/users/xiaokang_peng/VGGsound/train-audios/train-set' 8 | test_audio_dir = '/data/users/xiaokang_peng/VGGsound/test-audios/test-set' 9 | 10 | 11 | # test set processing 12 | with open(test_videos, 'r') as f: 13 | files = f.readlines() 14 | 15 | for i, item in enumerate(files): 16 | if i % 500 == 0: 17 | print('*******************************************') 18 | print('{}/{}'.format(i, len(files))) 19 | print('*******************************************') 20 | mp4_filename = os.path.join('/data/users/xiaokang_peng/VGGsound/test-videos/test-set/', item[:-1]) 21 | wav_filename = os.path.join(test_audio_dir, item[:-5]+'.wav') 22 | if os.path.exists(wav_filename): 23 | pass 24 | else: 25 | os.system('ffmpeg -i {} -acodec pcm_s16le -ar 16000 {}'.format(mp4_filename, wav_filename)) 26 | 27 | 28 | # train set processing 29 | with open(train_videos, 'r') as f: 30 | files = f.readlines() 31 | 32 | for i, item in enumerate(files): 33 | if i % 500 == 0: 34 | print('*******************************************') 35 | print('{}/{}'.format(i, len(files))) 36 | print('*******************************************') 37 | mp4_filename = os.path.join('/data/users/xiaokang_peng/VGGsound/train-videos/train-set/', item[:-1]) 38 | wav_filename = os.path.join(train_audio_dir, item[:-5]+'.wav') 39 | if os.path.exists(wav_filename): 40 | pass 41 | else: 42 | os.system('ffmpeg -i {} -acodec pcm_s16le -ar 16000 {}'.format(mp4_filename, wav_filename)) 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /data/VGGSound/video_preprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import cv2 3 | import os 4 | import pdb 5 | 6 | class videoReader(object): 7 | def __init__(self, video_path, frame_interval=1, frame_kept_per_second=1): 8 | self.video_path = video_path 9 | self.frame_interval = frame_interval 10 | self.frame_kept_per_second = frame_kept_per_second 11 | 12 | #pdb.set_trace() 13 | self.vid = cv2.VideoCapture(self.video_path) 14 | self.fps = int(self.vid.get(cv2.CAP_PROP_FPS)) 15 | self.video_frames = self.vid.get(cv2.CAP_PROP_FRAME_COUNT) 16 | self.video_len = int(self.video_frames/self.fps) 17 | 18 | 19 | def video2frame(self, frame_save_path): 20 | self.frame_save_path = frame_save_path 21 | success, image = self.vid.read() 22 | count = 0 23 | while success: 24 | count +=1 25 | if count % self.frame_interval == 0: 26 | save_name = '{}/frame_{}_{}.jpg'.format(self.frame_save_path, int(count/self.fps), count) # 
filename_second_index 27 | cv2.imencode('.jpg', image)[1].tofile(save_name) 28 | success, image = self.vid.read() 29 | 30 | 31 | def video2frame_update(self, frame_save_path): 32 | self.frame_save_path = frame_save_path 33 | 34 | count = 0 35 | frame_interval = int(self.fps/self.frame_kept_per_second) 36 | while(count < self.video_frames): 37 | ret, image = self.vid.read() 38 | if not ret: 39 | break 40 | if count % self.fps == 0: 41 | frame_id = 0 42 | if frame_id 1.] = 1. 60 | resamples[resamples < -1.] = -1. 61 | 62 | spectrogram = librosa.stft(resamples, n_fft=512, hop_length=353) 63 | spectrogram = np.log(np.abs(spectrogram) + 1e-7) 64 | #mean = np.mean(spectrogram) 65 | #std = np.std(spectrogram) 66 | #spectrogram = np.divide(spectrogram - mean, std + 1e-9) 67 | 68 | if self.mode == 'train': 69 | transform = transforms.Compose([ 70 | transforms.RandomResizedCrop(224), 71 | transforms.RandomHorizontalFlip(), 72 | transforms.ToTensor(), 73 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 74 | ]) 75 | else: 76 | transform = transforms.Compose([ 77 | transforms.Resize(size=(224, 224)), 78 | transforms.ToTensor(), 79 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 80 | ]) 81 | 82 | # Visual 83 | image_samples = os.listdir(self.image[idx]) 84 | select_index = np.random.choice(len(image_samples), size=self.args.fps, replace=False) 85 | select_index.sort() 86 | images = torch.zeros((self.args.fps, 3, 224, 224)) 87 | for i in range(self.args.fps): 88 | img = Image.open(os.path.join(self.image[idx], image_samples[i])).convert('RGB') 89 | img = transform(img) 90 | images[i] = img 91 | 92 | images = torch.permute(images, (1,0,2,3)) 93 | 94 | # label 95 | label = self.label[idx] 96 | 97 | return spectrogram, images, label -------------------------------------------------------------------------------- /dataset/VGGSoundDataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import csv 3 | import os 4 | import pickle 5 | import librosa 6 | import numpy as np 7 | from scipy import signal 8 | import torch 9 | from PIL import Image 10 | from torch.utils.data import Dataset 11 | from torchvision import transforms 12 | import pdb 13 | import random 14 | 15 | class VGGSound(Dataset): 16 | 17 | def __init__(self, args, mode='train'): 18 | self.args = args 19 | self.mode = mode 20 | train_video_data = [] 21 | train_audio_data = [] 22 | test_video_data = [] 23 | test_audio_data = [] 24 | train_label = [] 25 | test_label = [] 26 | train_class = [] 27 | test_class = [] 28 | 29 | with open('/home/hudi/OGM-GE_CVPR2022/data/VGGSound/vggsound.csv') as f: 30 | csv_reader = csv.reader(f) 31 | 32 | for item in csv_reader: 33 | if item[3] == 'train': 34 | video_dir = os.path.join('/data/users/xiaokang_peng/VGGsound/', 'train-videos/train-set-img', 'Image-{:02d}-FPS'.format(self.args.fps), item[0]+'_'+item[1]+'.mp4') 35 | audio_dir = os.path.join('/data/users/xiaokang_peng/VGGsound/', 'train-audios/train-set', item[0]+'_'+item[1]+'.wav') 36 | if os.path.exists(video_dir) and os.path.exists(audio_dir) and len(os.listdir(video_dir))>3 : 37 | train_video_data.append(video_dir) 38 | train_audio_data.append(audio_dir) 39 | if item[2] not in train_class: train_class.append(item[2]) 40 | train_label.append(item[2]) 41 | 42 | if item[3] == 'test': 43 | video_dir = os.path.join('/data/users/xiaokang_peng/VGGsound/', 'test-videos/test-set-img', 'Image-{:02d}-FPS'.format(self.args.fps), item[0]+'_'+item[1]+'.mp4') 44 | audio_dir = 
os.path.join('/data/users/xiaokang_peng/VGGsound/', 'test-audios/test-set', item[0]+'_'+item[1]+'.wav') 45 | if os.path.exists(video_dir) and os.path.exists(audio_dir) and len(os.listdir(video_dir))>3: 46 | test_video_data.append(video_dir) 47 | test_audio_data.append(audio_dir) 48 | if item[2] not in test_class: test_class.append(item[2]) 49 | test_label.append(item[2]) 50 | 51 | assert len(train_class) == len(test_class) 52 | self.classes = train_class 53 | 54 | class_dict = dict(zip(self.classes, range(len(self.classes)))) 55 | 56 | if mode == 'train': 57 | self.video = train_video_data 58 | self.audio = train_audio_data 59 | self.label = [class_dict[train_label[idx]] for idx in range(len(train_label))] 60 | if mode == 'test': 61 | self.video = test_video_data 62 | self.audio = test_audio_data 63 | self.label = [class_dict[test_label[idx]] for idx in range(len(test_label))] 64 | 65 | 66 | def __len__(self): 67 | return len(self.video) 68 | 69 | def __getitem__(self, idx): 70 | 71 | # audio 72 | sample, rate = librosa.load(self.audio[idx], sr=16000, mono=True) 73 | while len(sample)/rate < 10.: 74 | sample = np.tile(sample, 2) 75 | 76 | start_point = random.randint(a=0, b=rate*5) 77 | new_sample = sample[start_point:start_point+rate*5] 78 | new_sample[new_sample > 1.] = 1. 79 | new_sample[new_sample < -1.] = -1. 80 | 81 | spectrogram = librosa.stft(new_sample, n_fft=256, hop_length=128) 82 | spectrogram = np.log(np.abs(spectrogram) + 1e-7) 83 | 84 | if self.mode == 'train': 85 | transform = transforms.Compose([ 86 | transforms.RandomResizedCrop(224), 87 | transforms.RandomHorizontalFlip(), 88 | transforms.ToTensor(), 89 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 90 | ]) 91 | else: 92 | transform = transforms.Compose([ 93 | transforms.Resize(size=(224, 224)), 94 | transforms.ToTensor(), 95 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 96 | ]) 97 | 98 | 99 | # Visual 100 | image_samples = os.listdir(self.video[idx]) 101 | select_index = np.random.choice(len(image_samples), size=self.args.use_video_frames, replace=False) 102 | select_index.sort() 103 | images = torch.zeros((self.args.use_video_frames, 3, 224, 224)) 104 | for i in range(self.args.use_video_frames): 105 | img = Image.open(os.path.join(self.video[idx], image_samples[i])).convert('RGB') 106 | img = transform(img) 107 | images[i] = img 108 | 109 | images = torch.permute(images, (1,0,2,3)) 110 | 111 | # label 112 | label = self.label[idx] 113 | 114 | return spectrogram, images, label -------------------------------------------------------------------------------- /dataset/dataset.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import csv 3 | import os 4 | import pickle 5 | 6 | import torch 7 | from PIL import Image 8 | from torch.utils.data import Dataset 9 | from torchvision import transforms 10 | 11 | 12 | class AVDataset(Dataset): 13 | 14 | def __init__(self, args, mode='train'): 15 | classes = [] 16 | data = [] 17 | data2class = {} 18 | self.mode = mode 19 | 20 | self.data_root = '../data/' 21 | 22 | self.visual_feature_path = os.path.join(self.data_root, args.dataset, 'visual/') 23 | self.audio_feature_path = os.path.join(self.data_root, args.dataset, 'audio_spec/') 24 | self.stat_path = os.path.join(self.data_root, args.dataset, 'stat.txt') 25 | self.train_txt = os.path.join(self.data_root, args.dataset, 'my_train.txt') 26 | self.test_txt = os.path.join(self.data_root, args.dataset, 'my_test.txt') 27 | 28 | with 
open(self.stat_path) as f1: 29 | csv_reader = csv.reader(f1) 30 | for row in csv_reader: 31 | classes.append(row[0]) 32 | 33 | if mode == 'train': 34 | csv_file = self.train_txt 35 | else: 36 | csv_file = self.test_txt 37 | 38 | with open(csv_file) as f2: 39 | csv_reader = csv.reader(f2) 40 | for item in csv_reader: 41 | audio_path = os.path.join(self.audio_feature_path, item[1] + '.pkl') 42 | visual_path = os.path.join(self.visual_feature_path, item[1]) 43 | if os.path.exists(audio_path) and os.path.exists(visual_path): 44 | if args.dataset == 'AVE': 45 | # AVE, delete repeated labels 46 | a = set(data) 47 | if item[1] in a: 48 | del data2class[item[1]] 49 | data.remove(item[1]) 50 | data.append(item[1]) 51 | data2class[item[1]] = item[0] 52 | else: 53 | continue 54 | 55 | self.classes = sorted(classes) 56 | 57 | print(self.classes) 58 | self.data2class = data2class 59 | 60 | self.av_files = [] 61 | for item in data: 62 | self.av_files.append(item) 63 | print('# of files = %d ' % len(self.av_files)) 64 | print('# of classes = %d' % len(self.classes)) 65 | 66 | def __len__(self): 67 | return len(self.av_files) 68 | 69 | def __getitem__(self, idx): 70 | av_file = self.av_files[idx] 71 | 72 | # Audio 73 | audio_path = os.path.join(self.audio_feature_path, av_file + '.pkl') 74 | spectrogram = pickle.load(open(audio_path, 'rb')) 75 | 76 | # Visual 77 | visual_path = os.path.join(self.visual_feature_path, av_file) 78 | file_num = len(os.listdir(visual_path)) 79 | 80 | if self.mode == 'train': 81 | 82 | transform = transforms.Compose([ 83 | transforms.RandomResizedCrop(224), 84 | transforms.RandomHorizontalFlip(), 85 | transforms.ToTensor(), 86 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 87 | ]) 88 | else: 89 | transform = transforms.Compose([ 90 | transforms.Resize(size=(224, 224)), 91 | transforms.ToTensor(), 92 | transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 93 | ]) 94 | 95 | pick_num = 3 96 | seg = int(file_num / pick_num) 97 | path1 = [] 98 | image = [] 99 | image_arr = [] 100 | t = [0] * pick_num 101 | 102 | for i in range(pick_num): 103 | t[i] = seg * i + 1 104 | path1.append('frame_0000' + str(t[i]) + '.jpg') 105 | image.append(Image.open(visual_path + "/" + path1[i]).convert('RGB')) 106 | image_arr.append(transform(image[i])) 107 | image_arr[i] = image_arr[i].unsqueeze(1).float() 108 | if i == 0: 109 | image_n = copy.copy(image_arr[i]) 110 | else: 111 | image_n = torch.cat((image_n, image_arr[i]), 1) 112 | 113 | return spectrogram, image_n, self.classes.index(self.data2class[av_file]), av_file 114 | -------------------------------------------------------------------------------- /demo/algorithom.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/algorithom.PNG -------------------------------------------------------------------------------- /demo/demo_guitar.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/demo_guitar.PNG -------------------------------------------------------------------------------- /demo/demo_snow.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/demo_snow.PNG 
-------------------------------------------------------------------------------- /demo/five lines.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/five lines.PNG -------------------------------------------------------------------------------- /demo/pipeline.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/demo/pipeline.PNG -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.utils.data import DataLoader 9 | from torch.utils.tensorboard import SummaryWriter 10 | import pdb 11 | 12 | from dataset.CramedDataset import CramedDataset 13 | from dataset.VGGSoundDataset import VGGSound 14 | from dataset.dataset import AVDataset 15 | from models.basic_model import AVClassifier 16 | from utils.utils import setup_seed, weight_init 17 | 18 | 19 | def get_arguments(): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--dataset', default='CREMAD', type=str, 22 | help='VGGSound, KineticSound, CREMAD, AVE') 23 | parser.add_argument('--modulation', default='OGM_GE', type=str, 24 | 25 | choices=['Normal', 'OGM', 'OGM_GE']) 26 | parser.add_argument('--fusion_method', default='concat', type=str, 27 | choices=['sum', 'concat', 'gated', 'film']) 28 | parser.add_argument('--fps', default=1, type=int) 29 | parser.add_argument('--use_video_frames', default=3, type=int) 30 | parser.add_argument('--audio_path', default='/home/hudi/data/CREMA-D/AudioWAV', type=str) 31 | parser.add_argument('--visual_path', default='/home/hudi/data/CREMA-D/', type=str) 32 | 33 | parser.add_argument('--batch_size', default=64, type=int) 34 | parser.add_argument('--epochs', default=100, type=int) 35 | 36 | parser.add_argument('--optimizer', default='sgd', type=str, choices=['sgd', 'adam']) 37 | parser.add_argument('--learning_rate', default=0.001, type=float, help='initial learning rate') 38 | parser.add_argument('--lr_decay_step', default=70, type=int, help='where learning rate decays') 39 | parser.add_argument('--lr_decay_ratio', default=0.1, type=float, help='decay coefficient') 40 | 41 | parser.add_argument('--modulation_starts', default=0, type=int, help='where modulation begins') 42 | parser.add_argument('--modulation_ends', default=50, type=int, help='where modulation ends') 43 | parser.add_argument('--alpha', required=True, type=float, help='alpha in OGM-GE') 44 | 45 | parser.add_argument('--ckpt_path', required=True, type=str, help='path to save trained models') 46 | parser.add_argument('--train', action='store_true', help='turn on train mode') 47 | 48 | parser.add_argument('--use_tensorboard', default=False, type=bool, help='whether to visualize') 49 | parser.add_argument('--tensorboard_path', type=str, help='path to save tensorboard logs') 50 | 51 | parser.add_argument('--random_seed', default=0, type=int) 52 | parser.add_argument('--gpu_ids', default='0, 1', type=str, help='GPU ids') 53 | 54 | return parser.parse_args() 55 | 56 | 57 | def train_epoch(args, epoch, model, device, dataloader, optimizer, scheduler, writer=None): 58 | criterion = nn.CrossEntropyLoss() 59 | 
softmax = nn.Softmax(dim=1) 60 | relu = nn.ReLU(inplace=True) 61 | tanh = nn.Tanh() 62 | 63 | model.train() 64 | print("Start training ... ") 65 | 66 | _loss = 0 67 | _loss_a = 0 68 | _loss_v = 0 69 | 70 | for step, (spec, image, label) in enumerate(dataloader): 71 | 72 | #pdb.set_trace() 73 | spec = spec.to(device) 74 | image = image.to(device) 75 | label = label.to(device) 76 | 77 | optimizer.zero_grad() 78 | 79 | # TODO: make it simpler and easier to extend 80 | a, v, out = model(spec.unsqueeze(1).float(), image.float()) 81 | 82 | if args.fusion_method == 'sum': 83 | out_v = (torch.mm(v, torch.transpose(model.module.fusion_module.fc_y.weight, 0, 1)) + 84 | model.module.fusion_module.fc_y.bias) 85 | out_a = (torch.mm(a, torch.transpose(model.module.fusion_module.fc_x.weight, 0, 1)) + 86 | model.module.fusion_module.fc_x.bias) 87 | else: 88 | weight_size = model.module.fusion_module.fc_out.weight.size(1) 89 | out_v = (torch.mm(v, torch.transpose(model.module.fusion_module.fc_out.weight[:, weight_size // 2:], 0, 1)) 90 | + model.module.fusion_module.fc_out.bias / 2) 91 | 92 | out_a = (torch.mm(a, torch.transpose(model.module.fusion_module.fc_out.weight[:, :weight_size // 2], 0, 1)) 93 | + model.module.fusion_module.fc_out.bias / 2) 94 | 95 | loss = criterion(out, label) 96 | loss_v = criterion(out_v, label) 97 | loss_a = criterion(out_a, label) 98 | loss.backward() 99 | 100 | if args.modulation == 'Normal': 101 | # no modulation, regular optimization 102 | pass 103 | else: 104 | # Modulation starts here ! 105 | score_v = sum([softmax(out_v)[i][label[i]] for i in range(out_v.size(0))]) 106 | score_a = sum([softmax(out_a)[i][label[i]] for i in range(out_a.size(0))]) 107 | 108 | ratio_v = score_v / score_a 109 | ratio_a = 1 / ratio_v 110 | 111 | """ 112 | Below is the Eq.(10) in our CVPR paper: 113 | 1 - tanh(alpha * rho_t_u), if rho_t_u > 1 114 | k_t_u = 115 | 1, else 116 | coeff_u is k_t_u, where t means iteration steps and u is modality indicator, either a or v. 
117 | """ 118 | 119 | if ratio_v > 1: 120 | coeff_v = 1 - tanh(args.alpha * relu(ratio_v)) 121 | coeff_a = 1 122 | else: 123 | coeff_a = 1 - tanh(args.alpha * relu(ratio_a)) 124 | coeff_v = 1 125 | 126 | if args.use_tensorboard: 127 | iteration = epoch * len(dataloader) + step 128 | writer.add_scalar('data/ratio v', ratio_v, iteration) 129 | writer.add_scalar('data/coefficient v', coeff_v, iteration) 130 | writer.add_scalar('data/coefficient a', coeff_a, iteration) 131 | 132 | if args.modulation_starts <= epoch <= args.modulation_ends: # bug fixed 133 | for name, parms in model.named_parameters(): 134 | layer = str(name).split('.')[1] 135 | 136 | if 'audio' in layer and len(parms.grad.size()) == 4: 137 | if args.modulation == 'OGM_GE': # bug fixed 138 | parms.grad = parms.grad * coeff_a + \ 139 | torch.zeros_like(parms.grad).normal_(0, parms.grad.std().item() + 1e-8) 140 | elif args.modulation == 'OGM': 141 | parms.grad *= coeff_a 142 | 143 | if 'visual' in layer and len(parms.grad.size()) == 4: 144 | if args.modulation == 'OGM_GE': # bug fixed 145 | parms.grad = parms.grad * coeff_v + \ 146 | torch.zeros_like(parms.grad).normal_(0, parms.grad.std().item() + 1e-8) 147 | elif args.modulation == 'OGM': 148 | parms.grad *= coeff_v 149 | else: 150 | pass 151 | 152 | 153 | optimizer.step() 154 | 155 | _loss += loss.item() 156 | _loss_a += loss_a.item() 157 | _loss_v += loss_v.item() 158 | 159 | scheduler.step() 160 | 161 | return _loss / len(dataloader), _loss_a / len(dataloader), _loss_v / len(dataloader) 162 | 163 | 164 | def valid(args, model, device, dataloader): 165 | softmax = nn.Softmax(dim=1) 166 | 167 | if args.dataset == 'VGGSound': 168 | n_classes = 309 169 | elif args.dataset == 'KineticSound': 170 | n_classes = 31 171 | elif args.dataset == 'CREMAD': 172 | n_classes = 6 173 | elif args.dataset == 'AVE': 174 | n_classes = 28 175 | else: 176 | raise NotImplementedError('Incorrect dataset name {}'.format(args.dataset)) 177 | 178 | with torch.no_grad(): 179 | model.eval() 180 | # TODO: more flexible 181 | num = [0.0 for _ in range(n_classes)] 182 | acc = [0.0 for _ in range(n_classes)] 183 | acc_a = [0.0 for _ in range(n_classes)] 184 | acc_v = [0.0 for _ in range(n_classes)] 185 | 186 | for step, (spec, image, label) in enumerate(dataloader): 187 | 188 | spec = spec.to(device) 189 | image = image.to(device) 190 | label = label.to(device) 191 | 192 | a, v, out = model(spec.unsqueeze(1).float(), image.float()) 193 | 194 | if args.fusion_method == 'sum': 195 | out_v = (torch.mm(v, torch.transpose(model.module.fusion_module.fc_y.weight, 0, 1)) + 196 | model.module.fusion_module.fc_y.bias / 2) 197 | out_a = (torch.mm(a, torch.transpose(model.module.fusion_module.fc_x.weight, 0, 1)) + 198 | model.module.fusion_module.fc_x.bias / 2) 199 | else: 200 | out_v = (torch.mm(v, torch.transpose(model.module.fusion_module.fc_out.weight[:, 512:], 0, 1)) + 201 | model.module.fusion_module.fc_out.bias / 2) 202 | out_a = (torch.mm(a, torch.transpose(model.module.fusion_module.fc_out.weight[:, :512], 0, 1)) + 203 | model.module.fusion_module.fc_out.bias / 2) 204 | 205 | prediction = softmax(out) 206 | pred_v = softmax(out_v) 207 | pred_a = softmax(out_a) 208 | 209 | for i in range(image.shape[0]): 210 | 211 | ma = np.argmax(prediction[i].cpu().data.numpy()) 212 | v = np.argmax(pred_v[i].cpu().data.numpy()) 213 | a = np.argmax(pred_a[i].cpu().data.numpy()) 214 | num[label[i]] += 1.0 215 | 216 | #pdb.set_trace() 217 | if np.asarray(label[i].cpu()) == ma: 218 | acc[label[i]] += 1.0 219 | if 
np.asarray(label[i].cpu()) == v: 220 | acc_v[label[i]] += 1.0 221 | if np.asarray(label[i].cpu()) == a: 222 | acc_a[label[i]] += 1.0 223 | 224 | return sum(acc) / sum(num), sum(acc_a) / sum(num), sum(acc_v) / sum(num) 225 | 226 | 227 | def main(): 228 | args = get_arguments() 229 | print(args) 230 | 231 | setup_seed(args.random_seed) 232 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids 233 | gpu_ids = list(range(torch.cuda.device_count())) 234 | 235 | device = torch.device('cuda:0') 236 | 237 | model = AVClassifier(args) 238 | 239 | model.apply(weight_init) 240 | model.to(device) 241 | 242 | model = torch.nn.DataParallel(model, device_ids=gpu_ids) 243 | 244 | model.cuda() 245 | 246 | optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=1e-4) 247 | scheduler = optim.lr_scheduler.StepLR(optimizer, args.lr_decay_step, args.lr_decay_ratio) 248 | 249 | if args.dataset == 'VGGSound': 250 | train_dataset = VGGSound(args, mode='train') 251 | test_dataset = VGGSound(args, mode='test') 252 | elif args.dataset == 'KineticSound': 253 | train_dataset = AVDataset(args, mode='train') 254 | test_dataset = AVDataset(args, mode='test') 255 | elif args.dataset == 'CREMAD': 256 | train_dataset = CramedDataset(args, mode='train') 257 | test_dataset = CramedDataset(args, mode='test') 258 | elif args.dataset == 'AVE': 259 | train_dataset = AVDataset(args, mode='train') 260 | test_dataset = AVDataset(args, mode='test') 261 | else: 262 | raise NotImplementedError('Incorrect dataset name {}! ' 263 | 'Only support VGGSound, KineticSound and CREMA-D for now!'.format(args.dataset)) 264 | 265 | train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size, 266 | shuffle=True, num_workers=32, pin_memory=True) 267 | 268 | test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size, 269 | shuffle=False, num_workers=32, pin_memory=True) 270 | 271 | if args.train: 272 | 273 | best_acc = 0.0 274 | 275 | for epoch in range(args.epochs): 276 | 277 | print('Epoch: {}: '.format(epoch)) 278 | 279 | if args.use_tensorboard: 280 | 281 | writer_path = os.path.join(args.tensorboard_path, args.dataset) 282 | if not os.path.exists(writer_path): 283 | os.mkdir(writer_path) 284 | log_name = '{}_{}'.format(args.fusion_method, args.modulation) 285 | writer = SummaryWriter(os.path.join(writer_path, log_name)) 286 | 287 | batch_loss, batch_loss_a, batch_loss_v = train_epoch(args, epoch, model, device, 288 | train_dataloader, optimizer, scheduler) 289 | acc, acc_a, acc_v = valid(args, model, device, test_dataloader) 290 | 291 | writer.add_scalars('Loss', {'Total Loss': batch_loss, 292 | 'Audio Loss': batch_loss_a, 293 | 'Visual Loss': batch_loss_v}, epoch) 294 | 295 | writer.add_scalars('Evaluation', {'Total Accuracy': acc, 296 | 'Audio Accuracy': acc_a, 297 | 'Visual Accuracy': acc_v}, epoch) 298 | 299 | else: 300 | batch_loss, batch_loss_a, batch_loss_v = train_epoch(args, epoch, model, device, 301 | train_dataloader, optimizer, scheduler) 302 | acc, acc_a, acc_v = valid(args, model, device, test_dataloader) 303 | 304 | if acc > best_acc: 305 | best_acc = float(acc) 306 | 307 | if not os.path.exists(args.ckpt_path): 308 | os.mkdir(args.ckpt_path) 309 | 310 | model_name = 'best_model_of_dataset_{}_{}_alpha_{}_' \ 311 | 'optimizer_{}_modulate_starts_{}_ends_{}_' \ 312 | 'epoch_{}_acc_{}.pth'.format(args.dataset, 313 | args.modulation, 314 | args.alpha, 315 | args.optimizer, 316 | args.modulation_starts, 317 | args.modulation_ends, 318 | epoch, acc) 319 | 320 | saved_dict = 
{'saved_epoch': epoch, 321 | 'modulation': args.modulation, 322 | 'alpha': args.alpha, 323 | 'fusion': args.fusion_method, 324 | 'acc': acc, 325 | 'model': model.state_dict(), 326 | 'optimizer': optimizer.state_dict(), 327 | 'scheduler': scheduler.state_dict()} 328 | 329 | save_dir = os.path.join(args.ckpt_path, model_name) 330 | 331 | torch.save(saved_dict, save_dir) 332 | print('The best model has been saved at {}.'.format(save_dir)) 333 | print("Loss: {:.3f}, Acc: {:.3f}".format(batch_loss, acc)) 334 | print("Audio Acc: {:.3f}, Visual Acc: {:.3f} ".format(acc_a, acc_v)) 335 | else: 336 | print("Loss: {:.3f}, Acc: {:.3f}, Best Acc: {:.3f}".format(batch_loss, acc, best_acc)) 337 | print("Audio Acc: {:.3f}, Visual Acc: {:.3f} ".format(acc_a, acc_v)) 338 | 339 | else: 340 | # first load trained model 341 | loaded_dict = torch.load(args.ckpt_path) 342 | # epoch = loaded_dict['saved_epoch'] 343 | modulation = loaded_dict['modulation'] 344 | # alpha = loaded_dict['alpha'] 345 | fusion = loaded_dict['fusion'] 346 | state_dict = loaded_dict['model'] 347 | # optimizer_dict = loaded_dict['optimizer'] 348 | # scheduler = loaded_dict['scheduler'] 349 | 350 | assert modulation == args.modulation, 'inconsistency between modulation method of loaded model and args !' 351 | assert fusion == args.fusion_method, 'inconsistency between fusion method of loaded model and args !' 352 | 353 | model.load_state_dict(state_dict)  # load weights in place; load_state_dict() returns a key report, not the model 354 | print('Trained model loaded!') 355 | 356 | acc, acc_a, acc_v = valid(args, model, device, test_dataloader) 357 | print('Accuracy: {}, accuracy_a: {}, accuracy_v: {}'.format(acc, acc_a, acc_v)) 358 | 359 | 360 | if __name__ == "__main__": 361 | main() 362 | -------------------------------------------------------------------------------- /models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/.DS_Store -------------------------------------------------------------------------------- /models/backbone.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 5 | """3x3 convolution with padding""" 6 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 7 | padding=dilation, groups=groups, bias=False, dilation=dilation) 8 | 9 | 10 | def conv1x1(in_planes, out_planes, stride=1): 11 | """1x1 convolution""" 12 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 13 | 14 | 15 | class BasicBlock(nn.Module): 16 | expansion = 1 17 | 18 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 19 | base_width=64, dilation=1, norm_layer=None): 20 | super(BasicBlock, self).__init__() 21 | if norm_layer is None: 22 | norm_layer = nn.BatchNorm2d 23 | if groups != 1 or base_width != 64: 24 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 25 | if dilation > 1: 26 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 27 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 28 | self.conv1 = conv3x3(inplanes, planes, stride) 29 | self.bn1 = norm_layer(planes) 30 | self.relu = nn.ReLU(inplace=True) 31 | self.conv2 = conv3x3(planes, planes) 32 | self.bn2 = norm_layer(planes) 33 | self.downsample = downsample 34 | self.stride = stride 35 | 36 | def forward(self, x): 37 | identity = x
38 | 39 | out = self.conv1(x) 40 | out = self.bn1(out) 41 | out = self.relu(out) 42 | 43 | out = self.conv2(out) 44 | out = self.bn2(out) 45 | 46 | if self.downsample is not None: 47 | identity = self.downsample(x) 48 | 49 | out += identity 50 | out = self.relu(out) 51 | 52 | return out 53 | 54 | 55 | class ResNet(nn.Module): 56 | 57 | def __init__(self, block, layers, modality, num_classes=1000, pool='avgpool', zero_init_residual=False, 58 | groups=1, width_per_group=64, replace_stride_with_dilation=None, 59 | norm_layer=None): 60 | super(ResNet, self).__init__() 61 | self.modality = modality 62 | self.pool = pool 63 | if norm_layer is None: 64 | norm_layer = nn.BatchNorm2d 65 | self._norm_layer = norm_layer 66 | 67 | self.inplanes = 64 68 | self.dilation = 1 69 | if replace_stride_with_dilation is None: 70 | # each element in the tuple indicates if we should replace 71 | # the 2x2 stride with a dilated convolution instead 72 | replace_stride_with_dilation = [False, False, False] 73 | if len(replace_stride_with_dilation) != 3: 74 | raise ValueError("replace_stride_with_dilation should be None " 75 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 76 | self.groups = groups 77 | self.base_width = width_per_group 78 | if modality == 'audio': 79 | self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3, 80 | bias=False) 81 | elif modality == 'visual': 82 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, 83 | bias=False) 84 | else: 85 | raise NotImplementedError('Incorrect modality, should be audio or visual but got {}'.format(modality)) 86 | self.bn1 = norm_layer(self.inplanes) 87 | self.relu = nn.ReLU(inplace=True) 88 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 89 | self.layer1 = self._make_layer(block, 64, layers[0]) 90 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, 91 | dilate=replace_stride_with_dilation[0]) 92 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, 93 | dilate=replace_stride_with_dilation[1]) 94 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, 95 | dilate=replace_stride_with_dilation[2]) 96 | # if self.pool == 'avgpool': 97 | # self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 98 | # 99 | # self.fc = nn.Linear(512 * block.expansion, num_classes) # 8192 100 | 101 | for m in self.modules(): 102 | if isinstance(m, nn.Conv2d): 103 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 104 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 105 | nn.init.normal_(m.weight, mean=1, std=0.02) 106 | nn.init.constant_(m.bias, 0) 107 | 108 | # Zero-initialize the last BN in each residual branch, 109 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
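Both backbones return their layer-4 feature maps (the pooling/classification head above is commented out), and AVClassifier pools them afterwards. A minimal shape sketch, assuming the repository root is on the Python path and using illustrative tensor sizes:

```python
import torch

from models.backbone import resnet18

audio_net = resnet18(modality='audio')    # 1-channel stem for spectrograms
visual_net = resnet18(modality='visual')  # 3-channel stem for RGB frames

spec = torch.randn(2, 1, 257, 300)        # (B, 1, freq, time), sizes are illustrative
clip = torch.randn(2, 3, 3, 224, 224)     # (B, C, T, H, W), here T = 3 video frames

print(audio_net(spec).shape)   # torch.Size([2, 512, 9, 10]): layer-4 features, no pooling
print(visual_net(clip).shape)  # torch.Size([6, 512, 7, 7]): time folded into the batch (B*T)
```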
110 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 111 | if zero_init_residual: 112 | for m in self.modules(): 113 | if isinstance(m, Bottleneck): 114 | nn.init.constant_(m.bn3.weight, 0) 115 | elif isinstance(m, BasicBlock): 116 | nn.init.constant_(m.bn2.weight, 0) 117 | 118 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False): 119 | norm_layer = self._norm_layer 120 | downsample = None 121 | previous_dilation = self.dilation 122 | if dilate: 123 | self.dilation *= stride 124 | stride = 1 125 | if stride != 1 or self.inplanes != planes * block.expansion: 126 | downsample = nn.Sequential( 127 | conv1x1(self.inplanes, planes * block.expansion, stride), 128 | norm_layer(planes * block.expansion), 129 | ) 130 | 131 | layers = [] 132 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups, 133 | self.base_width, previous_dilation, norm_layer)) 134 | self.inplanes = planes * block.expansion 135 | for _ in range(1, blocks): 136 | layers.append(block(self.inplanes, planes, groups=self.groups, 137 | base_width=self.base_width, dilation=self.dilation, 138 | norm_layer=norm_layer)) 139 | 140 | return nn.Sequential(*layers) 141 | 142 | def forward(self, x): 143 | 144 | if self.modality == 'visual': 145 | (B, C, T, H, W) = x.size() 146 | x = x.permute(0, 2, 1, 3, 4).contiguous() 147 | x = x.view(B * T, C, H, W) 148 | 149 | x = self.conv1(x) 150 | x = self.bn1(x) 151 | x = self.relu(x) 152 | x = self.maxpool(x) 153 | 154 | x = self.layer1(x) 155 | x = self.layer2(x) 156 | x = self.layer3(x) 157 | x = self.layer4(x) 158 | out = x 159 | 160 | return out 161 | 162 | 163 | class Bottleneck(nn.Module): 164 | expansion = 4 165 | 166 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 167 | base_width=64, dilation=1, norm_layer=None): 168 | super(Bottleneck, self).__init__() 169 | if norm_layer is None: 170 | norm_layer = nn.BatchNorm2d 171 | width = int(planes * (base_width / 64.)) * groups 172 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1 173 | self.conv1 = conv1x1(inplanes, width) 174 | self.bn1 = norm_layer(width) 175 | self.conv2 = conv3x3(width, width, stride, groups, dilation) 176 | self.bn2 = norm_layer(width) 177 | self.conv3 = conv1x1(width, planes * self.expansion) 178 | self.bn3 = norm_layer(planes * self.expansion) 179 | self.relu = nn.ReLU(inplace=True) 180 | self.downsample = downsample 181 | self.stride = stride 182 | 183 | def forward(self, x): 184 | identity = x 185 | 186 | out = self.conv1(x) 187 | out = self.bn1(out) 188 | out = self.relu(out) 189 | 190 | out = self.conv2(out) 191 | out = self.bn2(out) 192 | out = self.relu(out) 193 | 194 | out = self.conv3(out) 195 | out = self.bn3(out) 196 | 197 | if self.downsample is not None: 198 | identity = self.downsample(x) 199 | 200 | out += identity 201 | out = self.relu(out) 202 | 203 | return out 204 | 205 | 206 | def _resnet(arch, block, layers, modality, progress, **kwargs): 207 | model = ResNet(block, layers, modality, **kwargs) 208 | return model 209 | 210 | 211 | def resnet18(modality, progress=True, **kwargs): 212 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], modality, progress, 213 | **kwargs) 214 | -------------------------------------------------------------------------------- /models/basic_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from .backbone import 
resnet18 5 | from .fusion_modules import SumFusion, ConcatFusion, FiLM, GatedFusion 6 | 7 | 8 | class AVClassifier(nn.Module): 9 | def __init__(self, args): 10 | super(AVClassifier, self).__init__() 11 | 12 | fusion = args.fusion_method 13 | if args.dataset == 'VGGSound': 14 | n_classes = 309 15 | elif args.dataset == 'KineticSound': 16 | n_classes = 31 17 | elif args.dataset == 'CREMAD': 18 | n_classes = 6 19 | elif args.dataset == 'AVE': 20 | n_classes = 28 21 | else: 22 | raise NotImplementedError('Incorrect dataset name {}'.format(args.dataset)) 23 | 24 | if fusion == 'sum': 25 | self.fusion_module = SumFusion(output_dim=n_classes) 26 | elif fusion == 'concat': 27 | self.fusion_module = ConcatFusion(output_dim=n_classes) 28 | elif fusion == 'film': 29 | self.fusion_module = FiLM(output_dim=n_classes, x_film=True) 30 | elif fusion == 'gated': 31 | self.fusion_module = GatedFusion(output_dim=n_classes, x_gate=True) 32 | else: 33 | raise NotImplementedError('Incorrect fusion method: {}!'.format(fusion)) 34 | 35 | self.audio_net = resnet18(modality='audio') 36 | self.visual_net = resnet18(modality='visual') 37 | 38 | def forward(self, audio, visual): 39 | 40 | a = self.audio_net(audio) 41 | v = self.visual_net(visual) 42 | 43 | (_, C, H, W) = v.size() 44 | B = a.size()[0] 45 | v = v.view(B, -1, C, H, W) 46 | v = v.permute(0, 2, 1, 3, 4) 47 | 48 | a = F.adaptive_avg_pool2d(a, 1) 49 | v = F.adaptive_avg_pool3d(v, 1) 50 | 51 | a = torch.flatten(a, 1) 52 | v = torch.flatten(v, 1) 53 | 54 | a, v, out = self.fusion_module(a, v) 55 | 56 | return a, v, out 57 | -------------------------------------------------------------------------------- /models/fusion_modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SumFusion(nn.Module): 6 | def __init__(self, input_dim=512, output_dim=100): 7 | super(SumFusion, self).__init__() 8 | self.fc_x = nn.Linear(input_dim, output_dim) 9 | self.fc_y = nn.Linear(input_dim, output_dim) 10 | 11 | def forward(self, x, y): 12 | output = self.fc_x(x) + self.fc_y(y) 13 | return x, y, output 14 | 15 | 16 | class ConcatFusion(nn.Module): 17 | def __init__(self, input_dim=1024, output_dim=100): 18 | super(ConcatFusion, self).__init__() 19 | self.fc_out = nn.Linear(input_dim, output_dim) 20 | 21 | def forward(self, x, y): 22 | output = torch.cat((x, y), dim=1) 23 | output = self.fc_out(output) 24 | return x, y, output 25 | 26 | 27 | class FiLM(nn.Module): 28 | """ 29 | FiLM: Visual Reasoning with a General Conditioning Layer, 30 | https://arxiv.org/pdf/1709.07871.pdf. 31 | """ 32 | 33 | def __init__(self, input_dim=512, dim=512, output_dim=100, x_film=True): 34 | super(FiLM, self).__init__() 35 | 36 | self.dim = input_dim 37 | self.fc = nn.Linear(input_dim, 2 * dim) 38 | self.fc_out = nn.Linear(dim, output_dim) 39 | 40 | self.x_film = x_film 41 | 42 | def forward(self, x, y): 43 | 44 | if self.x_film: 45 | film = x 46 | to_be_film = y 47 | else: 48 | film = y 49 | to_be_film = x 50 | 51 | gamma, beta = torch.split(self.fc(film), self.dim, 1) 52 | 53 | output = gamma * to_be_film + beta 54 | output = self.fc_out(output) 55 | 56 | return x, y, output 57 | 58 | 59 | class GatedFusion(nn.Module): 60 | """ 61 | Efficient Large-Scale Multi-Modal Classification, 62 | https://arxiv.org/pdf/1802.02892.pdf. 
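With the default `concat` fusion, `train_epoch` and `valid` in main.py recover per-modality logits by splitting `fc_out` column-wise: the audio features fill the first half of the concatenated vector and the visual features the second half, and each branch is given half of the bias. A minimal sketch of that identity, assuming the repository root is on the Python path:

```python
import torch

from models.fusion_modules import ConcatFusion

fusion = ConcatFusion(input_dim=1024, output_dim=6)   # e.g. 6 classes for CREMA-D
a, v = torch.randn(4, 512), torch.randn(4, 512)       # pooled audio / visual features
_, _, out = fusion(a, v)

W, b = fusion.fc_out.weight, fusion.fc_out.bias       # W: (6, 1024), columns = [audio | visual]
out_a = a @ W[:, :512].t() + b / 2                    # audio-only logits
out_v = v @ W[:, 512:].t() + b / 2                    # visual-only logits
assert torch.allclose(out, out_a + out_v, atol=1e-5)  # the two halves sum to the fused logits
```

Splitting the bias in half is a symmetric convention rather than something learned, but it guarantees that the two uni-modal logits add up exactly to the fused prediction.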
63 | """ 64 | 65 | def __init__(self, input_dim=512, dim=512, output_dim=100, x_gate=True): 66 | super(GatedFusion, self).__init__() 67 | 68 | self.fc_x = nn.Linear(input_dim, dim) 69 | self.fc_y = nn.Linear(input_dim, dim) 70 | self.fc_out = nn.Linear(dim, output_dim) 71 | 72 | self.x_gate = x_gate # whether to choose the x to obtain the gate 73 | 74 | self.sigmoid = nn.Sigmoid() 75 | 76 | def forward(self, x, y): 77 | out_x = self.fc_x(x) 78 | out_y = self.fc_y(y) 79 | 80 | if self.x_gate: 81 | gate = self.sigmoid(out_x) 82 | output = self.fc_out(torch.mul(gate, out_y)) 83 | else: 84 | gate = self.sigmoid(out_y) 85 | output = self.fc_out(torch.mul(out_x, gate)) 86 | 87 | return out_x, out_y, output 88 | 89 | -------------------------------------------------------------------------------- /models/old_models/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/.DS_Store -------------------------------------------------------------------------------- /models/old_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .encodera import * 2 | from .encoderv import * 3 | #from .avmodel_att import * 4 | #from .avmodel import * 5 | -------------------------------------------------------------------------------- /models/old_models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel.cpython-36.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_3.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_3.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_am.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_am.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_att.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_att.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_cma.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_cma.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_demo.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_demo.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_gate.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_gate.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_gradblending.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_gradblending.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_md.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_md.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_psp.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_psp.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_uni.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_uni.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/avmodel_x.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/avmodel_x.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/encodera.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/encodera.cpython-36.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/encodera.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/encodera.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/encoderv.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/encoderv.cpython-36.pyc -------------------------------------------------------------------------------- /models/old_models/__pycache__/encoderv.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/OGM-GE_CVPR2022/343dc1d95c9bda88f105c0da8d24f185e0e98b54/models/old_models/__pycache__/encoderv.cpython-37.pyc -------------------------------------------------------------------------------- /models/old_models/avmodel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from PIL import Image 4 | import torch 5 | import torchvision 6 | from torchvision.transforms import * 7 | import torch.nn as nn 8 | from torch.autograd import Variable 9 | from torch.utils.data import Dataset, DataLoader 10 | import numpy as np 11 | import math 12 | from collections import OrderedDict 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | import argparse 16 | import csv 17 | import random 18 | import warnings 19 | import pdb 20 | sys.path.append('/home/xiaokang_peng/ks/models') 21 | import encodera as ma 22 | import encoderv as mv 23 | warnings.filterwarnings('ignore') 24 | 25 | 26 | 27 | class AVmodel(nn.Module): 28 | def __init__(self,args): 29 | super(AVmodel,self).__init__() 30 | self.args = args 31 | self.parta = ma.Resnet(self.args) 32 | self.parta.fc = nn.Linear(512, args.n_classes) 33 | 34 | self.partv = mv.Resnet(self.args) 35 | self.partv.fc = nn.Linear(512, args.n_classes) 36 | 37 | self.fc_ = nn.Linear(1024, args.n_classes) 38 | 39 | self.dropx = nn.Dropout(0.0) 40 | self.dropy = nn.Dropout(0.5) 41 | 42 | 43 | 44 | def forward(self,audio,visual,label,iterations): 45 | 46 | y = self.parta(audio) 47 | x = self.partv(visual) 48 | (_, C, H, W) = x.size() 49 | B = y.size()[0] 50 | x = x.view(B, -1, C, H, W) 51 | x = x.permute(0, 2, 1, 3, 4) 52 | 53 | x = F.adaptive_avg_pool3d(x, 1) 54 | y = F.adaptive_avg_pool2d(y, 1) 55 | x = x.squeeze(2).squeeze(2).squeeze(2) 56 | y = y.squeeze(2).squeeze(2) 57 | 58 | #x = self.dropx(x) 59 | #y = self.dropy(y) 60 | #x *= self.dropx(torch.ones(1)).cuda() 61 | #y *= self.dropy(torch.ones(1)).cuda() 62 | 63 | out = torch.cat((x, y),1) 64 | out = self.fc_(out) 65 | 66 | 67 | return 
x, y, out 68 | 69 | -------------------------------------------------------------------------------- /models/old_models/avmodel_x.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from PIL import Image 4 | import torch 5 | import torchvision 6 | from torchvision.transforms import * 7 | import torch.nn as nn 8 | from torch.autograd import Variable 9 | from torch.utils.data import Dataset, DataLoader 10 | import numpy as np 11 | import math 12 | from collections import OrderedDict 13 | import torch.nn.functional as F 14 | import torch.optim as optim 15 | import argparse 16 | import csv 17 | import random 18 | import warnings 19 | import pdb 20 | #sys.path.append('/home/xiaokang_peng/avetry/ave_av/models') 21 | import encodera as ma 22 | import encoderv as mv 23 | warnings.filterwarnings('ignore') 24 | 25 | 26 | from resemblyzer import VoiceEncoder, preprocess_wav 27 | from pathlib import Path 28 | 29 | 30 | class AVmodel_x(nn.Module): 31 | def __init__(self,args): 32 | super(AVmodel_x,self).__init__() 33 | self.args = args 34 | ''' 35 | self.parta = ma.Resnet(self.args) 36 | self.parta.fc = nn.Linear(512, args.n_classes) 37 | ''' 38 | self.partv = mv.Resnet(self.args) 39 | self.partv.fc = nn.Linear(512, args.n_classes) 40 | 41 | self.fc_ = nn.Linear(1024, args.n_classes) 42 | self.fc_a = nn.Linear(256, 512) 43 | 44 | 45 | 46 | 47 | def forward(self,audio,visual,label,iterations): 48 | iteration = iterations 49 | y = audio 50 | #print(audio.size()) 51 | x = self.partv(visual) 52 | (_, C, H, W) = x.size() 53 | B = y.size()[0] 54 | x = x.view(B, -1, C, H, W) 55 | x = x.permute(0, 2, 1, 3, 4) 56 | 57 | x = F.adaptive_avg_pool3d(x, 1) 58 | #y = F.adaptive_avg_pool2d(y, 1) 59 | x = x.squeeze(2).squeeze(2).squeeze(2) 60 | #y = y.squeeze(2).squeeze(2) 61 | y = self.fc_a(y) 62 | 63 | #print(x.size(),y.size()) 64 | 65 | out = torch.cat((x, y),1) 66 | out = self.fc_(out) 67 | 68 | 69 | return x, y, out 70 | 71 | -------------------------------------------------------------------------------- /models/old_models/encodera.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Aencoder(nn.Module): 7 | 8 | def __init__(self, args): 9 | super(Aencoder, self).__init__() 10 | self.audnet = Resnet(args) 11 | 12 | def forward(self, audio): 13 | aud = self.audnet(audio) 14 | return aud 15 | 16 | 17 | def Resnet(opt): 18 | if opt.model_depth == 18: 19 | model = resnet18( 20 | # num_classes=opt.n_classes, 21 | num_classes=1000, 22 | pool=opt.pool) 23 | return model 24 | 25 | 26 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 27 | """3x3 convolution with padding""" 28 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 29 | padding=dilation, groups=groups, bias=False, dilation=dilation) 30 | 31 | 32 | def conv1x1(in_planes, out_planes, stride=1): 33 | """1x1 convolution""" 34 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 35 | 36 | 37 | class BasicBlock(nn.Module): 38 | expansion = 1 39 | 40 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 41 | base_width=64, dilation=1, norm_layer=None): 42 | super(BasicBlock, self).__init__() 43 | if norm_layer is None: 44 | norm_layer = nn.BatchNorm2d 45 | if groups != 1 or base_width != 64: 46 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 47 | if dilation 
> 1: 48 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 49 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 50 | self.conv1 = conv3x3(inplanes, planes, stride) 51 | self.bn1 = norm_layer(planes) 52 | self.relu = nn.ReLU(inplace=True) 53 | self.conv2 = conv3x3(planes, planes) 54 | self.bn2 = norm_layer(planes) 55 | self.downsample = downsample 56 | self.stride = stride 57 | 58 | def forward(self, x): 59 | identity = x 60 | 61 | out = self.conv1(x) 62 | out = self.bn1(out) 63 | out = self.relu(out) 64 | 65 | out = self.conv2(out) 66 | out = self.bn2(out) 67 | 68 | if self.downsample is not None: 69 | identity = self.downsample(x) 70 | 71 | out += identity 72 | out = self.relu(out) 73 | 74 | return out 75 | 76 | 77 | class ResNet(nn.Module): 78 | 79 | def __init__(self, block, layers, num_classes=1000, pool='avgpool', zero_init_residual=False, 80 | groups=1, width_per_group=64, replace_stride_with_dilation=None, 81 | norm_layer=None): 82 | super(ResNet, self).__init__() 83 | self.pool = pool 84 | if norm_layer is None: 85 | norm_layer = nn.BatchNorm2d 86 | self._norm_layer = norm_layer 87 | 88 | self.inplanes = 64 89 | self.dilation = 1 90 | if replace_stride_with_dilation is None: 91 | # each element in the tuple indicates if we should replace 92 | # the 2x2 stride with a dilated convolution instead 93 | replace_stride_with_dilation = [False, False, False] 94 | if len(replace_stride_with_dilation) != 3: 95 | raise ValueError("replace_stride_with_dilation should be None " 96 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 97 | self.groups = groups 98 | self.base_width = width_per_group 99 | self.conv1 = nn.Conv2d(1, self.inplanes, kernel_size=7, stride=2, padding=3, 100 | bias=False) 101 | self.bn1 = norm_layer(self.inplanes) 102 | self.relu = nn.ReLU(inplace=True) 103 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 104 | self.layer1 = self._make_layer(block, 64, layers[0]) 105 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, 106 | dilate=replace_stride_with_dilation[0]) 107 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, 108 | dilate=replace_stride_with_dilation[1]) 109 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, 110 | dilate=replace_stride_with_dilation[2]) 111 | if self.pool == 'avgpool': 112 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 113 | 114 | self.fc = nn.Linear(512 * block.expansion, num_classes) # 8192 115 | elif self.pool == 'vlad': 116 | self.avgpool = NetVLAD() 117 | self.fc_ = nn.Linear(8192 * block.expansion, num_classes) 118 | 119 | for m in self.modules(): 120 | if isinstance(m, nn.Conv2d): 121 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 122 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 123 | nn.init.normal_(m.weight, mean=1, std=0.02) 124 | nn.init.constant_(m.bias, 0) 125 | 126 | # Zero-initialize the last BN in each residual branch, 127 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
128 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 129 | if zero_init_residual: 130 | for m in self.modules(): 131 | if isinstance(m, Bottleneck): 132 | nn.init.constant_(m.bn3.weight, 0) 133 | elif isinstance(m, BasicBlock): 134 | nn.init.constant_(m.bn2.weight, 0) 135 | 136 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False): 137 | norm_layer = self._norm_layer 138 | downsample = None 139 | previous_dilation = self.dilation 140 | if dilate: 141 | self.dilation *= stride 142 | stride = 1 143 | if stride != 1 or self.inplanes != planes * block.expansion: 144 | downsample = nn.Sequential( 145 | conv1x1(self.inplanes, planes * block.expansion, stride), 146 | norm_layer(planes * block.expansion), 147 | ) 148 | 149 | layers = [] 150 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups, 151 | self.base_width, previous_dilation, norm_layer)) 152 | self.inplanes = planes * block.expansion 153 | for _ in range(1, blocks): 154 | layers.append(block(self.inplanes, planes, groups=self.groups, 155 | base_width=self.base_width, dilation=self.dilation, 156 | norm_layer=norm_layer)) 157 | 158 | return nn.Sequential(*layers) 159 | 160 | def forward(self, x): 161 | x = self.conv1(x) 162 | x = self.bn1(x) 163 | x = self.relu(x) 164 | x = self.maxpool(x) 165 | 166 | x = self.layer1(x) 167 | x = self.layer2(x) 168 | x = self.layer3(x) 169 | x = self.layer4(x) 170 | out = x 171 | 172 | x = self.avgpool(x) 173 | x = x.reshape(x.size(0), -1) 174 | 175 | if self.pool == 'avgpool': 176 | x = self.fc(x) 177 | elif self.pool == 'vlad': 178 | x = self.fc_(x) 179 | 180 | return out 181 | 182 | 183 | class NetVLAD(nn.Module): 184 | """NetVLAD layer implementation""" 185 | 186 | def __init__(self, num_clusters=16, dim=512, alpha=100.0, 187 | normalize_input=True): 188 | """ 189 | Args: 190 | num_clusters : int 191 | The number of clusters 192 | dim : int 193 | Dimension of descriptors 194 | alpha : float 195 | Parameter of initialization. Larger value is harder assignment. 196 | normalize_input : bool 197 | If true, descriptor-wise L2 normalization is applied to input. 
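With the defaults above (num_clusters = 16, dim = 512) the layer outputs an L2-normalized descriptor of 16 * 512 = 8192 values per sample, which is why the 'vlad' branch of this file's ResNet uses nn.Linear(8192 * block.expansion, num_classes). Note also that ResNet.forward above returns the layer-4 feature map rather than the pooled-and-classified tensor, so in this legacy encoder the avgpool/vlad head is computed but its output is discarded.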
198 | """ 199 | super(NetVLAD, self).__init__() 200 | self.num_clusters = num_clusters 201 | self.dim = dim 202 | self.alpha = alpha 203 | self.normalize_input = normalize_input 204 | self.conv = nn.Conv2d(dim, num_clusters, kernel_size=(1, 1), bias=True) 205 | self.centroids = nn.Parameter(torch.rand(num_clusters, dim)) 206 | self._init_params() 207 | 208 | def _init_params(self): 209 | self.conv.weight = nn.Parameter( 210 | (2.0 * self.alpha * self.centroids).unsqueeze(-1).unsqueeze(-1) 211 | ) 212 | self.conv.bias = nn.Parameter( 213 | - self.alpha * self.centroids.norm(dim=1) 214 | ) 215 | 216 | def forward(self, x): 217 | N, C = x.shape[:2] 218 | 219 | if self.normalize_input: 220 | x = F.normalize(x, p=2, dim=1) # across descriptor dim 221 | 222 | # soft-assignment 223 | soft_assign = self.conv(x).view(N, self.num_clusters, -1) 224 | soft_assign = F.softmax(soft_assign, dim=1) 225 | 226 | x_flatten = x.view(N, C, -1) 227 | 228 | # calculate residuals to each clusters 229 | residual = x_flatten.expand(self.num_clusters, -1, -1, -1).permute(1, 0, 2, 3) - \ 230 | self.centroids.expand(x_flatten.size(-1), -1, -1).permute(1, 2, 0).unsqueeze(0) 231 | residual *= soft_assign.unsqueeze(2) 232 | vlad = residual.sum(dim=-1) 233 | 234 | vlad = F.normalize(vlad, p=2, dim=2) # intra-normalization 235 | vlad = vlad.view(x.size(0), -1) # flatten 236 | vlad = F.normalize(vlad, p=2, dim=1) # L2 normalize 237 | 238 | return vlad 239 | 240 | 241 | class Bottleneck(nn.Module): 242 | expansion = 4 243 | 244 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 245 | base_width=64, dilation=1, norm_layer=None): 246 | super(Bottleneck, self).__init__() 247 | if norm_layer is None: 248 | norm_layer = nn.BatchNorm2d 249 | width = int(planes * (base_width / 64.)) * groups 250 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1 251 | self.conv1 = conv1x1(inplanes, width) 252 | self.bn1 = norm_layer(width) 253 | self.conv2 = conv3x3(width, width, stride, groups, dilation) 254 | self.bn2 = norm_layer(width) 255 | self.conv3 = conv1x1(width, planes * self.expansion) 256 | self.bn3 = norm_layer(planes * self.expansion) 257 | self.relu = nn.ReLU(inplace=True) 258 | self.downsample = downsample 259 | self.stride = stride 260 | 261 | def forward(self, x): 262 | identity = x 263 | 264 | out = self.conv1(x) 265 | out = self.bn1(out) 266 | out = self.relu(out) 267 | 268 | out = self.conv2(out) 269 | out = self.bn2(out) 270 | out = self.relu(out) 271 | 272 | out = self.conv3(out) 273 | out = self.bn3(out) 274 | 275 | if self.downsample is not None: 276 | identity = self.downsample(x) 277 | 278 | out += identity 279 | out = self.relu(out) 280 | 281 | return out 282 | 283 | 284 | def _resnet(arch, block, layers, pretrained, progress, **kwargs): 285 | model = ResNet(block, layers, **kwargs) 286 | if pretrained: 287 | state_dict = load_state_dict_from_url(model_urls[arch], 288 | progress=progress) 289 | model.load_state_dict(state_dict) 290 | return model 291 | 292 | 293 | def resnet18(pretrained=False, progress=True, **kwargs): 294 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, 295 | **kwargs) 296 | -------------------------------------------------------------------------------- /models/old_models/encoderv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Vencoder(nn.Module): 7 | 8 | def __init__(self,args): 9 
| super(Vencoder, self).__init__() 10 | self.audnet = Resnet(args) 11 | 12 | def forward(self, audio): 13 | aud = self.audnet(audio) 14 | return aud 15 | 16 | 17 | def Resnet(opt): 18 | if opt.model_depth == 18: 19 | model = resnet18( 20 | #num_classes=opt.n_classes, 21 | num_classes=1000, 22 | pool=opt.pool) 23 | return model 24 | 25 | 26 | 27 | 28 | def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): 29 | """3x3 convolution with padding""" 30 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 31 | padding=dilation, groups=groups, bias=False, dilation=dilation) 32 | 33 | 34 | def conv1x1(in_planes, out_planes, stride=1): 35 | """1x1 convolution""" 36 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 37 | 38 | 39 | class BasicBlock(nn.Module): 40 | expansion = 1 41 | 42 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 43 | base_width=64, dilation=1, norm_layer=None): 44 | super(BasicBlock, self).__init__() 45 | if norm_layer is None: 46 | norm_layer = nn.BatchNorm2d 47 | if groups != 1 or base_width != 64: 48 | raise ValueError('BasicBlock only supports groups=1 and base_width=64') 49 | if dilation > 1: 50 | raise NotImplementedError("Dilation > 1 not supported in BasicBlock") 51 | # Both self.conv1 and self.downsample layers downsample the input when stride != 1 52 | self.conv1 = conv3x3(inplanes, planes, stride) 53 | self.bn1 = norm_layer(planes) 54 | self.relu = nn.ReLU(inplace=True) 55 | self.conv2 = conv3x3(planes, planes) 56 | self.bn2 = norm_layer(planes) 57 | self.downsample = downsample 58 | self.stride = stride 59 | 60 | def forward(self, x): 61 | identity = x 62 | 63 | out = self.conv1(x) 64 | out = self.bn1(out) 65 | out = self.relu(out) 66 | 67 | out = self.conv2(out) 68 | out = self.bn2(out) 69 | 70 | if self.downsample is not None: 71 | identity = self.downsample(x) 72 | 73 | out += identity 74 | out = self.relu(out) 75 | 76 | return out 77 | 78 | 79 | class ResNet(nn.Module): 80 | 81 | def __init__(self, block, layers, num_classes=1000, pool='avgpool', zero_init_residual=False, 82 | groups=1, width_per_group=64, replace_stride_with_dilation=None, 83 | norm_layer=None): 84 | super(ResNet, self).__init__() 85 | self.pool = pool 86 | if norm_layer is None: 87 | norm_layer = nn.BatchNorm2d 88 | self._norm_layer = norm_layer 89 | 90 | self.inplanes = 64 91 | self.dilation = 1 92 | if replace_stride_with_dilation is None: 93 | # each element in the tuple indicates if we should replace 94 | # the 2x2 stride with a dilated convolution instead 95 | replace_stride_with_dilation = [False, False, False] 96 | if len(replace_stride_with_dilation) != 3: 97 | raise ValueError("replace_stride_with_dilation should be None " 98 | "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 99 | self.groups = groups 100 | self.base_width = width_per_group 101 | self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, 102 | bias=False) 103 | self.bn1 = norm_layer(self.inplanes) 104 | self.relu = nn.ReLU(inplace=True) 105 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 106 | self.layer1 = self._make_layer(block, 64, layers[0]) 107 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, 108 | dilate=replace_stride_with_dilation[0]) 109 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, 110 | dilate=replace_stride_with_dilation[1]) 111 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, 112 | 
dilate=replace_stride_with_dilation[2]) 113 | if self.pool == 'avgpool': 114 | self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) 115 | 116 | self.fc = nn.Linear(512 * block.expansion, num_classes) # 8192 117 | elif self.pool == 'vlad': 118 | self.avgpool = NetVLAD() 119 | self.fc_ = nn.Linear(8192 * block.expansion, num_classes) 120 | 121 | for m in self.modules(): 122 | if isinstance(m, nn.Conv2d): 123 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 124 | elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): 125 | nn.init.normal_(m.weight, mean=1, std=0.02) 126 | nn.init.constant_(m.bias, 0) 127 | 128 | # Zero-initialize the last BN in each residual branch, 129 | # so that the residual branch starts with zeros, and each residual block behaves like an identity. 130 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 131 | if zero_init_residual: 132 | for m in self.modules(): 133 | if isinstance(m, Bottleneck): 134 | nn.init.constant_(m.bn3.weight, 0) 135 | elif isinstance(m, BasicBlock): 136 | nn.init.constant_(m.bn2.weight, 0) 137 | 138 | def _make_layer(self, block, planes, blocks, stride=1, dilate=False): 139 | norm_layer = self._norm_layer 140 | downsample = None 141 | previous_dilation = self.dilation 142 | if dilate: 143 | self.dilation *= stride 144 | stride = 1 145 | if stride != 1 or self.inplanes != planes * block.expansion: 146 | downsample = nn.Sequential( 147 | conv1x1(self.inplanes, planes * block.expansion, stride), 148 | norm_layer(planes * block.expansion), 149 | ) 150 | 151 | layers = [] 152 | layers.append(block(self.inplanes, planes, stride, downsample, self.groups, 153 | self.base_width, previous_dilation, norm_layer)) 154 | self.inplanes = planes * block.expansion 155 | for _ in range(1, blocks): 156 | layers.append(block(self.inplanes, planes, groups=self.groups, 157 | base_width=self.base_width, dilation=self.dilation, 158 | norm_layer=norm_layer)) 159 | 160 | return nn.Sequential(*layers) 161 | 162 | def forward(self, x): 163 | 164 | (B, C, T, H, W) = x.size() 165 | x = x.permute(0, 2, 1, 3, 4).contiguous() 166 | x = x.view(B * T, C, H, W) 167 | 168 | x = self.conv1(x) 169 | x = self.bn1(x) 170 | x = self.relu(x) 171 | x = self.maxpool(x) 172 | 173 | x = self.layer1(x) 174 | x = self.layer2(x) 175 | x = self.layer3(x) 176 | x = self.layer4(x) 177 | 178 | out = x 179 | ''' 180 | x = self.avgpool(x) 181 | x = x.reshape(x.size(0), -1) 182 | 183 | if self.pool == 'avgpool': 184 | x = self.fc(x) 185 | elif self.pool == 'vlad': 186 | x = self.fc_(x) 187 | ''' 188 | return out 189 | 190 | 191 | class NetVLAD(nn.Module): 192 | """NetVLAD layer implementation""" 193 | 194 | def __init__(self, num_clusters=16, dim=512, alpha=100.0, 195 | normalize_input=True): 196 | """ 197 | Args: 198 | num_clusters : int 199 | The number of clusters 200 | dim : int 201 | Dimension of descriptors 202 | alpha : float 203 | Parameter of initialization. Larger value is harder assignment. 204 | normalize_input : bool 205 | If true, descriptor-wise L2 normalization is applied to input. 
206 | """ 207 | super(NetVLAD, self).__init__() 208 | self.num_clusters = num_clusters 209 | self.dim = dim 210 | self.alpha = alpha 211 | self.normalize_input = normalize_input 212 | self.conv = nn.Conv2d(dim, num_clusters, kernel_size=(1, 1), bias=True) 213 | self.centroids = nn.Parameter(torch.rand(num_clusters, dim)) 214 | self._init_params() 215 | 216 | def _init_params(self): 217 | self.conv.weight = nn.Parameter( 218 | (2.0 * self.alpha * self.centroids).unsqueeze(-1).unsqueeze(-1) 219 | ) 220 | self.conv.bias = nn.Parameter( 221 | - self.alpha * self.centroids.norm(dim=1) 222 | ) 223 | 224 | def forward(self, x): 225 | N, C = x.shape[:2] 226 | 227 | if self.normalize_input: 228 | x = F.normalize(x, p=2, dim=1) # across descriptor dim 229 | 230 | # soft-assignment 231 | soft_assign = self.conv(x).view(N, self.num_clusters, -1) 232 | soft_assign = F.softmax(soft_assign, dim=1) 233 | 234 | x_flatten = x.view(N, C, -1) 235 | 236 | # calculate residuals to each clusters 237 | residual = x_flatten.expand(self.num_clusters, -1, -1, -1).permute(1, 0, 2, 3) - \ 238 | self.centroids.expand(x_flatten.size(-1), -1, -1).permute(1, 2, 0).unsqueeze(0) 239 | residual *= soft_assign.unsqueeze(2) 240 | vlad = residual.sum(dim=-1) 241 | 242 | vlad = F.normalize(vlad, p=2, dim=2) # intra-normalization 243 | vlad = vlad.view(x.size(0), -1) # flatten 244 | vlad = F.normalize(vlad, p=2, dim=1) # L2 normalize 245 | 246 | return vlad 247 | 248 | 249 | class Bottleneck(nn.Module): 250 | expansion = 4 251 | 252 | def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, 253 | base_width=64, dilation=1, norm_layer=None): 254 | super(Bottleneck, self).__init__() 255 | if norm_layer is None: 256 | norm_layer = nn.BatchNorm2d 257 | width = int(planes * (base_width / 64.)) * groups 258 | # Both self.conv2 and self.downsample layers downsample the input when stride != 1 259 | self.conv1 = conv1x1(inplanes, width) 260 | self.bn1 = norm_layer(width) 261 | self.conv2 = conv3x3(width, width, stride, groups, dilation) 262 | self.bn2 = norm_layer(width) 263 | self.conv3 = conv1x1(width, planes * self.expansion) 264 | self.bn3 = norm_layer(planes * self.expansion) 265 | self.relu = nn.ReLU(inplace=True) 266 | self.downsample = downsample 267 | self.stride = stride 268 | 269 | def forward(self, x): 270 | identity = x 271 | 272 | out = self.conv1(x) 273 | out = self.bn1(out) 274 | out = self.relu(out) 275 | 276 | out = self.conv2(out) 277 | out = self.bn2(out) 278 | out = self.relu(out) 279 | 280 | out = self.conv3(out) 281 | out = self.bn3(out) 282 | 283 | if self.downsample is not None: 284 | identity = self.downsample(x) 285 | 286 | out += identity 287 | out = self.relu(out) 288 | 289 | return out 290 | 291 | 292 | def _resnet(arch, block, layers, pretrained, progress, **kwargs): 293 | model = ResNet(block, layers, **kwargs) 294 | if pretrained: 295 | state_dict = load_state_dict_from_url(model_urls[arch], 296 | progress=progress) 297 | model.load_state_dict(state_dict) 298 | return model 299 | 300 | 301 | def resnet18(pretrained=False, progress=True, **kwargs): 302 | return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, 303 | **kwargs) 304 | 305 | -------------------------------------------------------------------------------- /utils/evaluation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def obtain_top1_accuracy(output, target): 5 | with torch.no_grad(): 6 | batch_size = output.size(0) 7 | 8 | _, pred = 
output.topk(1, 1, True, True) 9 | pred = pred.t() 10 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 11 | 12 | correct_k = correct[:1].reshape(-1).float().sum(0, keepdims=True) 13 | top1 = correct_k.mul_(100.0 / batch_size) 14 | 15 | return correct_k, top1 16 | 17 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import random 5 | 6 | 7 | def setup_seed(seed): 8 | torch.manual_seed(seed) 9 | torch.cuda.manual_seed_all(seed) 10 | np.random.seed(seed) 11 | random.seed(seed) 12 | torch.backends.cudnn.deterministic = True 13 | 14 | 15 | def weight_init(m): 16 | if isinstance(m, nn.Linear): 17 | nn.init.xavier_normal_(m.weight) 18 | nn.init.constant_(m.bias, 0) 19 | elif isinstance(m, nn.Conv2d): 20 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 21 | elif isinstance(m, nn.BatchNorm2d): 22 | nn.init.constant_(m.weight, 1) 23 | nn.init.constant_(m.bias, 0) 24 | --------------------------------------------------------------------------------
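As a closing reference, here is a minimal forward-pass sketch that ties the pieces above together. It is a sketch only: it assumes the repository root is on the Python path, a CREMA-D-style setup with 6 classes, and illustrative tensor shapes in place of the real dataloaders.

```python
import argparse

import torch

from models.basic_model import AVClassifier
from utils.utils import setup_seed, weight_init

# Minimal stand-in for get_arguments(); only the fields AVClassifier reads are set.
args = argparse.Namespace(dataset='CREMAD', fusion_method='concat')

setup_seed(0)
model = AVClassifier(args)
model.apply(weight_init)
model.eval()

spec = torch.randn(2, 257, 300)          # (B, freq, time) spectrogram, sizes are illustrative
image = torch.randn(2, 3, 3, 224, 224)   # (B, C, T, H, W) with use_video_frames = 3

with torch.no_grad():
    a, v, out = model(spec.unsqueeze(1).float(), image.float())

print(a.shape, v.shape, out.shape)       # (2, 512), (2, 512), (2, 6)
```

main.py additionally wraps the model in nn.DataParallel and reaches the fusion head through model.module, which this sketch skips.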