├── README.md ├── config └── multi_modal_bert_base_layer_conect.json ├── data ├── music21_train.csv └── music21_val.csv ├── requirements.txt ├── retrieval ├── eval_score.py ├── orig_retrieval_networks.py ├── retrieval_datasets.py ├── train_retrieval_networks.py └── tribert_retrieval_networks.py ├── train_trimodal.py ├── tribert ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── audio_net.cpython-36.pyc │ ├── criterion.cpython-36.pyc │ ├── tribert.cpython-36.pyc │ ├── utils.cpython-36.pyc │ └── vilbert.cpython-36.pyc ├── datasets │ ├── __pycache__ │ │ ├── base.cpython-36.pyc │ │ ├── music_multimodal.cpython-36.pyc │ │ └── video_transforms.cpython-36.pyc │ ├── base.py │ ├── music_multimodal.py │ └── video_transforms.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── audio_net.cpython-36.pyc │ │ ├── criterion.cpython-36.pyc │ │ ├── synthesizer_net.cpython-36.pyc │ │ ├── utils.cpython-36.pyc │ │ └── vision_net.cpython-36.pyc │ ├── audio_net.py │ ├── criterion.py │ ├── utils.py │ └── vision_net.py ├── optimization.py └── tribert.py ├── utils_music21.py ├── visualization ├── audio-0pXFhOg1o2c+audio-bGfyLBoZPM4 │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ ├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg ├── audio-2XY77lk_LCQ+audio-1K-0VC9hWIA │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ ├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg ├── audio-EVp6jkgYuUc+audio-8YELO9yxs_c │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ 
├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg ├── audio-E_ugm84TMvo+audio-PMDSfAZ4-eo │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ ├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg ├── audio-GV2bbRPFhvk+audio-NdzCe7COROw │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ ├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg ├── audio-P_dHPMofcwM+audio-EXMQITpeeaM │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ ├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg ├── audio-PlKCXvBDxaI+audio-0yR5s-CSw4E │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ ├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg ├── audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ ├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg ├── 
audio-Xafuav13p2E+audio-t6B3JugXgTI │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ ├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg ├── audio-gK3ooFKujO0+audio-087sDeYPdjY │ ├── av1.mp4 │ ├── av2.mp4 │ ├── gt1.wav │ ├── gt2.wav │ ├── gtamp1.jpg │ ├── gtamp2.jpg │ ├── gtmask1.jpg │ ├── gtmask2.jpg │ ├── mix.jpg │ ├── mix.wav │ ├── pred1.wav │ ├── pred2.wav │ ├── predamp1.jpg │ ├── predamp2.jpg │ ├── predmask1.jpg │ ├── predmask2.jpg │ ├── video1.mp4 │ ├── video2.mp4 │ └── weight.jpg └── index.html └── viz.py /README.md: -------------------------------------------------------------------------------- 1 | # TriBERT 2 | 3 | This repository contains the code for the NeurIPS 2021 paper titled ["TriBERT: Full-body Human-centric Audio-visual Representation Learning for Visual Sound Separation"](https://arxiv.org/pdf/2110.13412.pdf). 4 | 5 | # Data pre-processing: 6 | 7 | Please download [MUSIC21](https://github.com/roudimit/MUSIC_dataset). we found 314 videos are missing. Moreover, the train/val/test split was unavailable. Therefore, we used a random 80/20 train/test split which is given in [data](https://github.com/ubc-vision/TriBERT/tree/master/data). 8 | 9 | After downloading the dataset, please consider following steps as data pre-processing. 10 | 11 | 1. Following [Sound-of-Pixels](https://github.com/hangzhaomit/Sound-of-Pixels) we extracted video frames at 8fps and waveforms at 11025Hz from videos. We considered these frames and waveforms as our visual and audio input for TriBERT model. 12 | 2. Setup [AlphaPose toolbox](https://github.com/MVIG-SJTU/AlphaPose) to detect 26 keypoints for body joints and 21 keypoints for each hand. 13 | 3. 
Re-train [ST-GCN network](https://github.com/yysijie/st-gcn) with the keypoints detected using AlphaPose and extract body joint features of size 256 × 68. These features will be considered as pose embedding to pose stream of TriBERT model. 14 | 15 | # Pre-trained model 16 | 17 | Please download our pre-trained model from [Google Drive](https://drive.google.com/file/d/1cOIEUzcp7tKO1C6OyXwso2Rrm0wZHuu2/view?usp=sharing). To train from scratch please pre-process the data first and then run: 18 | 19 | ``` 20 | python train_trimodal.py 21 | 22 | ``` 23 | 24 | # Multi-modal Retrieval 25 | 26 | The code used for our multi-modal retrieval experiments are in the `retrieval` directory. We conduct retrieval on TriBERT embeddings as well as baseline (before passing through TriBERT) embeddings. The networks used for these tasks are located in `tribert_retrieval_networks.py` and `orig_retrieval_networks.py`, respectively. 27 | 28 | To train a retrieval network, use `train_retrieval_networks.py`. To evaluate the performance of a specific type of retrieval between TriBERT embeddings and baseline embeddings, use `train_retrieval_networks.py`. 29 | 30 | 31 | # Acknowledgment 32 | 33 | This repository is developed on top of [ViLBERT](https://github.com/jiasenlu/vilbert_beta) and [Sound-of-Pixels](https://github.com/hangzhaomit/Sound-of-Pixels). Please also refer to the original License of these projects. 
# BibTeX
"arch_synthesizer": "linear", 46 | "arch_frame": "resnet50fc", 47 | "img_pool": "maxpool", 48 | "num_channels": 32, 49 | "binary_mask": 1, 50 | "loss": "bce", 51 | "weighted_loss":1, 52 | "num_mix":2, 53 | "log_freq": 1, 54 | "num_frames":3, 55 | "stride_frames":24, 56 | "frameRate": 8, 57 | "audLen": 65535, 58 | "audRate": 11025, 59 | "num_gpus":4, 60 | "workers": 48, 61 | "batch_size_per_gpu":20, 62 | "lr_frame":1e-4, 63 | "lr_sound": 1e-3, 64 | "lr_synthesizer": 1e-3, 65 | "num_epoch": 100, 66 | "disp_iter": 20, 67 | "num_vis": 0, 68 | "num_val": 256, 69 | "imgSize": 224, 70 | "stft_frame": 1022, 71 | "stft_hop": 256, 72 | "seed": 1234, 73 | "dup_trainset": 100, 74 | "weights_sound": "./ckpt_tribert/sound_pretrain.pth", 75 | "img_pool": "maxpool", 76 | "weights_frame": "./ckpt_tribert/frame_pretrain.pth", 77 | "arch_synthesizer": "linear", 78 | "sound_activation": "no", 79 | "img_activation": "sigmoid", 80 | "output_activation": "sigmoid", 81 | "mask_thres" : 0.5, 82 | "ckpt" : "./ckpt_tribert", 83 | "weight_decay" : "1e-4", 84 | "beta1" : 0.9, 85 | "lr_steps" : "[40, 60]" 86 | } 87 | -------------------------------------------------------------------------------- /data/music21_train.csv: -------------------------------------------------------------------------------- 1 | bWPte6-VThw 2 | Uh_PzT65l2E 3 | STtGADifdUw 4 | 1eqS3rYvDjI 5 | HdmrBA6sBYE 6 | QxZO_ZxOZRU 7 | GouIcBwgP_U 8 | _idF1Hf3-CI 9 | Lje_xoSVBdQ 10 | Us9dxgfBfeo 11 | XZ1rUpbdu-Y 12 | FnAhTPl8ees 13 | VVq9NZPpaJc 14 | EBWWzSobojo 15 | 8mDNLCo1McQ 16 | NihlZckO34o 17 | UD2SYltnnlY 18 | wHxCLLVtkKo 19 | IcV9yqQOqkw 20 | PWX_mohRkQ4 21 | 1uG4jnhHgAc 22 | rYeUVUQ9lzI 23 | F6bIb1ucefE 24 | ljHG8rMHSHc 25 | ErtFfLCNtz8 26 | 11KsiMpJg2I 27 | xkq-VyAzFHc 28 | 5Eg6GNhszNE 29 | AJOIqmlI65Y 30 | _9aFrft67Zw 31 | FJiGcSP4Dak 32 | qLfSgZThsDM 33 | mRIeWFw9040 34 | GQGBzVaDTmM 35 | stnmYDeNpZM 36 | N1HwJvvYyoo 37 | -UoFXtE2eqk 38 | Cc75Ef4kBbk 39 | ALMOnRonTPM 40 | clVigOfCCNc 41 | X2QQ9FPQjQs 42 | GF-3c8hBi9I 43 
| kbTy_5o6AB4 44 | -z1JPhZ-BUY 45 | 5DYC92362Qg 46 | jf0ondpk0jE 47 | ag7eC_pVobo 48 | nTMJTm6tqoY 49 | TSktVYy2820 50 | iqyw0xR5Wf0 51 | PoJuodmk9Ms 52 | 8LU-kq19V3Y 53 | 6YyYNvPORcs 54 | 1nHGmlUj8GM 55 | REiXxfSu3Fw 56 | BBIENSA_GkE 57 | Aibr0BCz-Yk 58 | 41b8GUEx-OM 59 | WSRAO6abxog 60 | cMYr693e3Eo 61 | D5gWhSH7jRU 62 | wfGx5L88O4Q 63 | G46O0IzYtt0 64 | W9ZICrM6V-s 65 | FrNjZANKt-Q 66 | JbGvqve_JH0 67 | 70vWxJiWrFU 68 | pBJUB98yZTw 69 | HQveyGmUBSs 70 | JEDgAVcR_yE 71 | S2IWw6z-cfc 72 | T90-aKvgnkE 73 | 1Ju4S0qeDqw 74 | XuZIw5OkZwU 75 | Fis5cOQ5tGQ 76 | WKeppEkkL-Y 77 | XlxYlk8IShg 78 | UIG0kM1HzC8 79 | Ith2Ni8jg1Y 80 | OkP0XYvGBfs 81 | focoWBYg16w 82 | 9RSYp7jWRfY 83 | iVzOtFTfgrg 84 | V9rjOI5YEDE 85 | OgHPzxGb1sM 86 | Ujpp_SiEAlY 87 | E_TBQsEEw5A 88 | OVv7XW_4lN8 89 | An1z4o0l68c 90 | oWvLCWWZVYk 91 | WkY638E5wy8 92 | U2NMdF5yzdE 93 | 6Gji_i1TWPY 94 | BbPsU29zD1U 95 | MVZGmcXU2D8 96 | aFRh4pn6yzY 97 | 8iry5jpDxV4 98 | bLvdb6haNVk 99 | D6YQ6WwWZrk 100 | V9NILCtoTkc 101 | Giy_Btup7DQ 102 | JnAAzR4XZ-I 103 | HhWAn3EXqrA 104 | 4AsWvRTLPWI 105 | oQJVBH6ST7o 106 | j-B6-AoZpo8 107 | LzTaV_X7srE 108 | 7kbgi9oNEi0 109 | AIcISC49phE 110 | b9hVMnMfLoA 111 | _IcBMTI8PEQ 112 | pfIpwtClF4M 113 | UF7UE6P2wVo 114 | pDS4o2C-eUQ 115 | JeZyrStc2ws 116 | fhzqzm0dYuE 117 | X8rR0SV_BhU 118 | ufsSRKgUb6g 119 | eBoCeQtoFtM 120 | 3C4RYkpivCU 121 | 2C8PwkOElRg 122 | IHZSAJkNpps 123 | NL70xysQ-Ss 124 | b6_iAVepfo8 125 | Eaq914E77oY 126 | aXeErkTlDEg 127 | za7phW3V0xE 128 | l7LptHCRkQU 129 | ADxyFxpe1ls 130 | qHOw-XwugYM 131 | LOm0yStU_YM 132 | wuVq8ksXZ8c 133 | No-IJw4uWMw 134 | 2skFRfSMYnQ 135 | 8EU0xM-pQOI 136 | 5OrjP43Yigo 137 | IFEP8Z5Qos0 138 | 1tPJuR1AWmI 139 | hslVlYk0WEc 140 | KPTyodoU2ng 141 | D0gOvIiH8Hg 142 | Rr1UkDkhDlA 143 | 5wtXVHH99PU 144 | -ibvkmiORQI 145 | XXmpQ0KkZ_E 146 | ZgGEuzLZw2E 147 | KxuFM4z9EJ0 148 | GB9mc0eIa_8 149 | 4B-mDmpRA4I 150 | cTO_2R81fdY 151 | ZSJu_zyOaWc 152 | SVZNEDEIK48 153 | 3XMD9TRDx2s 154 | 6pkShdDe9LU 155 | RqPkcu-xidI 156 | 3P0h1Ib2kM0 157 | 
NcjXqWPG2Nw 158 | GAZvmnoaMAs 159 | VlIcqDWmPkw 160 | 35TPV7_D7do 161 | BO7_8o4QSr4 162 | nyOkMXal0yA 163 | 4QYLzfVmyvQ 164 | co1QtO9H8zQ 165 | 2CcXQsF06Xo 166 | AAUm9njcMRk 167 | -XAsXmbExkE 168 | 1MiyvDB_bzM 169 | 41iiV-wahIY 170 | _vmWIk6aRXU 171 | Sw04q5d7d2Q 172 | HO2irXZhr-k 173 | 4DEgI36KJUU 174 | kjKR2GBuQWw 175 | kdB0igDRaLk 176 | jW_AuF6dQYY 177 | NIEqzh0dvow 178 | 10RkjPIguAY 179 | 4Oy5VmaacaI 180 | FpFuu7cbVt0 181 | rZYf9Dl8ew0 182 | EwG7o3SS-r0 183 | UYp3dKioAao 184 | 0PqIvfpaVSc 185 | iffYeAAUQ1o 186 | 8YNdpTm2vsc 187 | YwDABKyGY_g 188 | AX9ygkC_3SY 189 | iN1dXiDWpw8 190 | 3N1KAaBJkwU 191 | NDVxjRCWjYU 192 | lDCeAgGUwyc 193 | CaeZoHNHul0 194 | 7RRqeIUqXPs 195 | r2w-OciR4hw 196 | 9_ybb2vE4Z8 197 | 0DBd-xYa4Ck 198 | 60tvG40iSxo 199 | 43N29koKyPo 200 | IS5LJPPQC68 201 | NLGte2YC6Oc 202 | G_rINXF-bzg 203 | qb-ttddKLYo 204 | ASUUqMvTdjg 205 | FryEgnBZN8U 206 | r29WR4uOf9k 207 | PA8DVnECfh0 208 | 97UEABGyyw8 209 | 894n_sDSEc0 210 | d1S-RVDZv64 211 | OuoWrVc8BkE 212 | 5GWxV8I21AQ 213 | zkA4sRrrHVg 214 | 2ARPm0djKIk 215 | EtrYq3j7src 216 | T7cs_RGdb9k 217 | wnsA_6nhIIE 218 | PFX2WTY9gWI 219 | MUalqwUFrPs 220 | Z4CRwrR_lBE 221 | m_sKRr_n1NE 222 | r-hUC0z2VDE 223 | 141eeOiScwo 224 | jXyE0gq1upc 225 | P_lSgczU2Sk 226 | 1fsMnm7bYLg 227 | pR9R6DxqAhE 228 | hs8tK5bp5D8 229 | yn1NaZWN_qQ 230 | c7Jyu_OiEB0 231 | OufpA4FV1Tc 232 | -ryG2g4d6Qc 233 | 1odwJu3RtlI 234 | EanNPdLUh9U 235 | h-gSV3p0m9E 236 | lTVGdSchoPs 237 | _49-JJzSxTw 238 | CO6CPC-NZhE 239 | C2si-AWsap0 240 | 0AKB1aSpyNg 241 | tacEJsR5akI 242 | X4UWEifacwI 243 | 5e3dhOOmz8g 244 | 2TLnVbJzH3E 245 | CtrTfd8foxA 246 | GTmzbN99PWo 247 | TBngVoy_GrE 248 | EYGbVrIlL6M 249 | Zqg4DwXmYBI 250 | 5ESDEU04WSQ 251 | YPiD4WdnUMQ 252 | b1ZNLsvmRNQ 253 | Cy1i3u-Lz1w 254 | S3G9BjzPKXo 255 | -_-TDDxoaAc 256 | 9g20WQ0WHn0 257 | Von-UD2RWnU 258 | 23DDVbzSig8 259 | aV-74W7SdJ4 260 | RUUtdFmg03M 261 | oqHi5-Io4Uo 262 | OoV1TBZamMI 263 | lTu6hayHl5k 264 | cp_0VICDDSc 265 | MCftKtqmH1A 266 | _AtTFfgZvEk 267 | GOYQXO-DJcg 268 | 
aJpxlrsy2As 269 | txpFPySWtiE 270 | Lp9rAMPaqLQ 271 | OBbmnUIJQDQ 272 | 8B9WaDcotLg 273 | 6FhYpuJyGT4 274 | Q5-AV79Rh5I 275 | 1i8XHo25_KU 276 | TyMrlwmg5M4 277 | eCQO6k5Qrmg 278 | ByaSZ2dD0yY 279 | U7X8kVuyZTE 280 | aZLkNOye_wA 281 | 9XoXQXCVUQY 282 | 3d1b4UH43-E 283 | 5nUYCnCQd6M 284 | b7p-tCYilKM 285 | 75bCUXEv1hs 286 | VtZXRWtp5G0 287 | Mhc9ckO6esk 288 | bLjS6E6c0IA 289 | 3RP8GYBEiiw 290 | wEzEcfz1QS0 291 | 6XeB5fgtU9Y 292 | msVlr2fgcgM 293 | cZa21THo--4 294 | _YvHViIph10 295 | yy2vL2RUiPI 296 | KrlDeCWv9ak 297 | PkJWA3cf-z4 298 | EXKxvMVayRk 299 | I0Cyq1YlSVM 300 | eIqVZuyPu2U 301 | 1E7U1w2Z8C0 302 | K1LRscO7TSw 303 | TWrh_c9YRp8 304 | XlvlHfWjKx4 305 | XFcB4jGhOBE 306 | WaiDZX7FH-8 307 | CTtHozrSfh0 308 | TSEWnnKagqQ 309 | YLTpkQqX_ek 310 | 7VEwnKM85xs 311 | 9VaXf5R7Xs8 312 | 2PKQczRfm1g 313 | nvjfuTKz2yA 314 | 1F3ay026U60 315 | HHaApvpQVdQ 316 | aerJoJj--HU 317 | Wt40rzyGSBs 318 | WNeHlya4VnQ 319 | 0o22Fy2Hr3s 320 | 6YBD40q1ads 321 | C3s60zFIPTs 322 | TOLoplXKr7U 323 | SPXyBDbGt4s 324 | D296jSWKxiM 325 | 0OrLdMPOyos 326 | qKnggsxx2II 327 | fFkthPInr8k 328 | sybsjsj1zPo 329 | HPyM9xzrih0 330 | XS_dSmZ0xbw 331 | 5HNlS9UFELE 332 | WjwOm5pJlI0 333 | 4QWePxpUu6c 334 | L6V8S8U9iLs 335 | Bsd2TJw0Wss 336 | XlS6Ao_V8nE 337 | HyD1g1KDoEE 338 | F4IPsMGMPgM 339 | kx6qqoGft2w 340 | aavW-tx8daw 341 | LUZf782HzNo 342 | VUXyjq1bw_Y 343 | -7WIMBTTvZs 344 | h2oiK1-8U5U 345 | XmjMGVv3TLk 346 | 7171YEDwyGc 347 | 2ngAIYZqDxE 348 | 6MEnSbm3xl0 349 | e5YNEk9Ye-o 350 | 9YS7ReG44zk 351 | kXB2EMfU7Ho 352 | KKW1Gr3Xttw 353 | 9OZHbvWdBbQ 354 | 2R12lQszz90 355 | oOCHF3GgKLs 356 | PDohhCaNf98 357 | U4RSnhRVUR0 358 | SMp2-xo1lZE 359 | 6EaTems2LNs 360 | LPqBkF5NVSQ 361 | jr-coZHlWl8 362 | V8ifoas59TM 363 | W9VsprvJQAM 364 | 9YNITyPKZFQ 365 | S3ZIb96M6Ag 366 | iLeEWSC2M2U 367 | Yz6izw5dhn8 368 | 9bEbuj4Qb3I 369 | Te3bHjd0kdc 370 | jTnHP1szyyw 371 | oEb2mlij38w 372 | XbOYpwFkn7o 373 | cyUUlWSEujk 374 | LqyX7bRrMTw 375 | lMQVRb2mz44 376 | 4TFz7QX5aNg 377 | gOsW2AVyjyc 378 | LUWw06fW4zU 379 | 
SDR39fhyZRI 380 | HI8t07KXWaw 381 | QeravklOaAg 382 | W8buacmF8g0 383 | MKSTzG114d0 384 | OIBq0bBziuw 385 | iB17xqmFw3A 386 | 8rb-thew50c 387 | uJnD0NCyJoc 388 | PYjwNew61XQ 389 | 5675YbIvcz0 390 | -4nIqLncdB0 391 | DS500VD8YDg 392 | LIocmQeT8_Q 393 | ZvLT_J9Uv4o 394 | Lq3671NhNZk 395 | D-2OOEcv_ZI 396 | 4tkY07v9YWg 397 | tHvLBLCBHyU 398 | Sd-tIwLzonE 399 | YFxW8-V050U 400 | q_ycwl0c448 401 | 6nxSuOPWrn4 402 | x9TgjVZPZbM 403 | -94WkAANyB0 404 | IlDWk-x6Cag 405 | 73ikMuXwKgc 406 | K_U5-GbPgi0 407 | C3QJz0gN9i8 408 | cPKV0QF9Scg 409 | AP-vB4w_7Ag 410 | w55jJXSspUo 411 | 9i79TXP7V2c 412 | ZRpsKMe5dSg 413 | 6G78knF3eFk 414 | bsorCgLZkkA 415 | B2rJa22OHkk 416 | bowjUDJsWq8 417 | fxG1YUtix_o 418 | pXE3Bl5-GoM 419 | P4KxMdjna0k 420 | wgpwadsv5YQ 421 | vd1dgdxlA94 422 | 4bAhK1C6BWw 423 | qsciXAjiwF0 424 | Mff3pp-Fj2w 425 | WgqK2DARbIQ 426 | GSol2w0QnJE 427 | JB8-oKS6K-8 428 | CjNKUiFHUC8 429 | 9xj3e5A8XwE 430 | ZANyOBEWiJA 431 | 3-95PC9eX40 432 | b-50KrgDpjU 433 | _VwGcE2IOMc 434 | 1mCi6eWXVbg 435 | h-58UapA8gY 436 | KG-tXicVpsg 437 | iEOdwzdXmfE 438 | 6syzZgAk2vI 439 | kao4iqPuQBI 440 | 37HdHAzJrOQ 441 | MnoQccPJbyM 442 | 3n-vlswrQYM 443 | 7Zudy1bw7bw 444 | BI7SMfb-sek 445 | x5Bzb6R73JQ 446 | WeeRb3LMb8E 447 | QqKlMPzQj7k 448 | MvkJvZSOmjY 449 | 4q7L3TX3Hl0 450 | Gufj4Jhk92I 451 | 8lol9fS2CHU 452 | nKc4srJBgYU 453 | Pzf9MQKkoNM 454 | unRA0xR-XWc 455 | e2aq2A6Bhuc 456 | 2V12S02E2Jc 457 | -zIIUIlfKlY 458 | 1oz3h9doX_g 459 | y0WB29MJJgQ 460 | NOYN6GRhnTs 461 | ZcKsPFZG4_0 462 | DshS7-Mn9dM 463 | -wxurZAFJWA 464 | JRPtROMynA8 465 | q4kscRK91y4 466 | DLziSt24jrg 467 | w-IEfjDTi9c 468 | 1fbHYpRstQs 469 | D2bFizJfvi8 470 | PeT8aBVvtFI 471 | 3zi2fHlfP_8 472 | HmIPqgiaW24 473 | -eNT-Gvikbs 474 | jLY5DKM6HoI 475 | QOYMhOXfGMw 476 | IGouH5bKhj8 477 | HwVslStJE6U 478 | NeayVC7CEvM 479 | 9dOqmTMqp1k 480 | _O1mTqCSo6E 481 | z73RZix8Q_o 482 | 48E4sS8K-bM 483 | UI7k6m8Gdz4 484 | awCkfdohkzE 485 | ywH4K8mQ-1s 486 | bq5utI8KK_8 487 | WDJYj3SvUlw 488 | _u-pz6i3iDY 489 | 341do4TeJT0 490 | 
HnUOaSfTA6c 491 | 8Oy-rsKES5I 492 | 7wmiNt5a79s 493 | Qy2y8X-Ec8Q 494 | eHbxLcoLWYY 495 | rc2taOIxV_M 496 | TREfn2OfujU 497 | 47Hl3Rb2164 498 | ySFS1KgXerA 499 | qljUWQ1rY3Q 500 | sqT7a8TBgOg 501 | 8P6PFNNCrgs 502 | 5GpXUu8cz1c 503 | 4fGE07dATvc 504 | CG37e-Q2NeU 505 | RgB3wzNVZyY 506 | XIcV5kBVIbw 507 | BMhnTdy-A0M 508 | CeOAuSm1NUo 509 | B7vaxJhgCQE 510 | 3AQsq1PG1Wc 511 | jPaCkyptN1s 512 | aYCanSq5Y7E 513 | vTNNzAqqeZ8 514 | EbqdFn9dNvM 515 | 1OsOEt5Qpfc 516 | TUrhztP4TGo 517 | C74cF3O6ZFo 518 | I1K7TJcgOsE 519 | _Nr58vTAIkw 520 | dpnBlIZGyJA 521 | ENnSwgLc2B4 522 | fXjfvEcAV6w 523 | AQBgyM8NrHQ 524 | Vq0eH1uUvXc 525 | Z_yNLFYjpsY 526 | Fif2-OjH-8A 527 | UMCbhbok-ec 528 | UfP31JXZbKU 529 | VLAFJ3PD830 530 | 6apsC8o3f6k 531 | 1n44lCfG59k 532 | -HLTNgdajqw 533 | LeH5Urwtlug 534 | QaOUijKCqZU 535 | o3a6F070Xt0 536 | nvzbPGgv7R4 537 | BxmkJkFr1nk 538 | xjhZhI2Zthg 539 | zOWt7kRh9e8 540 | 1t-zt8nAsQo 541 | Hb_UTQsnFqA 542 | -XMgVGqJm8U 543 | hYepU4PQFAo 544 | Owzy345m12U 545 | HJL0k3fQx3s 546 | kQZJh5Clq_s 547 | PEyIPSlJNQM 548 | 6AukD08i9GM 549 | HGqEtp1wpyc 550 | JoIl89Ybhs0 551 | 09C8NuLfsxc 552 | JmQ1C-cwiuY 553 | 9L3e1AuZNIM 554 | 0oKi3ARn640 555 | 8DHG_hVSw1o 556 | zPzDLRIk6Es 557 | EtP6JBWtd7s 558 | E_8W7f9ZZpY 559 | b6TT-5B4dXw 560 | COfSRe0FHWg 561 | 5P7iGxvFI4Q 562 | YmKMEpt9OjQ 563 | AszMf9QGxug 564 | PY3pK0Guf2Y 565 | zhevOlbQf8s 566 | 4h550TCCd9w 567 | BwDQ9VIW74Q 568 | tGG2P7mjuqQ 569 | j0U_hqIH_XM 570 | -_jlqeSs6ZU 571 | 2WZsNZrbt6w 572 | nQpXK7PS-oo 573 | 4lVuJEIaXgo 574 | 0N26WnKiCIg 575 | 5iKfNcUcPI0 576 | OZDN7VkC6AY 577 | ZPpbacaDz3k 578 | H6fPWLu6biA 579 | CFcPlv9RhEY 580 | 8zJgbPSLDLw 581 | yXKJiqOuvkA 582 | 1vZ-IKkcPL4 583 | X2hHGcwLPhs 584 | 9fOU26yd0Pw 585 | VPKf9yRkKf4 586 | JDV3yiF-6DU 587 | RB2WGan5ghM 588 | 2ZTKgdvVo7k 589 | bapfkDpaSGw 590 | 0px0WUwkOy8 591 | vBgwzJYJ5yk 592 | Q8efUjCGqvo 593 | 9ygrq7in_uI 594 | jB58yIBmcTo 595 | GMYP2T_ZTkw 596 | UFNc_74au6A 597 | 3TJPKAfxyEE 598 | g1sNdUDTfJQ 599 | _MkybbNPYFI 600 | n8-2q4dheyU 601 | 
KenJhs2kS3Y 602 | za84Zws-5gY 603 | 2ZxAVUsuE4Y 604 | lVQYZKrq1aY 605 | fwBwJFRrD4w 606 | OxA5Z_5-w0E 607 | W65r_CnPlKg 608 | AKNt1SF-5VA 609 | -tY40Ev8IzE 610 | cFdMRCeB8WQ 611 | 3-zT9mN8Lio 612 | rxr5tTH8_mM 613 | OCmsvdY_9RM 614 | qs68YTfKHkw 615 | RWP6BHh_c7Y 616 | pBU7dNbUxQs 617 | DF3RPgRYw7s 618 | ARNQJIxtj7E 619 | Bsgp-hgtF7E 620 | aTYc-hHt4us 621 | GPh_9jtWub8 622 | aY4Ra2KOyas 623 | 5yi3rSuvNyI 624 | GhwLgGnTMG8 625 | nZ4HNz9xvSw 626 | ZXz0cVq_RTI 627 | GtQixP8bAk8 628 | 8caeW1WShg0 629 | 3daxf9BhdyY 630 | 7G6QVPCHv6Q 631 | EvBzKBdfZxA 632 | 9PJgKV2VCcM 633 | 40KRN9rzkpk 634 | UZK_2bTuzrI 635 | geK-gOMgm20 636 | zhaLSfnahL0 637 | 9i1yD3mnqUg 638 | 2tG4y_uyEAY 639 | _oUTOskOwXs 640 | 7gP4eJzp8Ro 641 | KS-fMmS96XU 642 | H4K5buSMzdA 643 | 5QQb0kt_kAA 644 | 8GsWxA7kalQ 645 | WVyd3N6u5YE 646 | gfkfwnOFybM 647 | wzNBinIsa3c 648 | Cml4I28SUHc 649 | R700Q9DEUEk 650 | 9L4BvIDqhm8 651 | dijkShln-ig 652 | BPMsd_gDFb0 653 | 04lh6kxa5Kc 654 | QZEkGj9m1qs 655 | wNAFZMyLNUA 656 | I5LCi0sWTU4 657 | OgRf5gri-9k 658 | 7zfk-5Wq5Aw 659 | 4AI18Hhy5No 660 | e73x3LGeb_Q 661 | 8VfMykySryc 662 | HOUBn-wHcwQ 663 | dAOjddR-84E 664 | XI7Fx7rJt4I 665 | WVeheJFHQS4 666 | -L5Bo-mWKCI 667 | RK__-JoNte0 668 | Xx0Haa8Fk2o 669 | 26HLgXWF-Co 670 | w81p5ty4OZc 671 | CQnWcgZfqRM 672 | 2_valn6NOrQ 673 | 07BlCwnia_U 674 | xdgQ6LJFGTI 675 | vEOyhF8KLPE 676 | NLooa9XHDMc 677 | fH4pDr8CqI8 678 | c-Yr5qyCefA 679 | DVrCy36Uigo 680 | JkhKTTB0YB8 681 | E7E1kMExVcg 682 | P-B8ooirjHE 683 | 8sgzGhxw-tM 684 | d5IO46KfjR4 685 | GMGnSMNrvfI 686 | BxdQ2OKFP3Q 687 | LKVxvHeb8hs 688 | Mdryqvk0UH0 689 | yoCJxgwzJy0 690 | 9kwbEUqCNbU 691 | 2toke35E958 692 | X3GbAKww1KI 693 | SkXnQi_nNbk 694 | FfV5O2F2waI 695 | -ktVpJDWu_k 696 | Hy-gFmxHhxs 697 | MJmVYLWPnCI 698 | IKUVgbi1e2U 699 | mvE-NTWT77c 700 | I0LedcEaPL0 701 | -piYS0yc_dI 702 | H-j5lp2QjEE 703 | 6Fm4fJTDr68 704 | VQR_CcRrZA0 705 | F_AqoGpCWHg 706 | 3dbHMyBWR7M 707 | ZtVxXYObmOQ 708 | -e5DuAUwBgA 709 | y-Y6QAUqSKg 710 | cNLX70EhWtg 711 | cFskXS823Kk 712 | 
wkkzzAjSxDs 713 | tAuYNf2hqHU 714 | M7FQHjE1AGY 715 | 5UZF3bGoZpM 716 | t_3Zm1PbydA 717 | 6CTOaH3qQKY 718 | k3371a9MXsI 719 | Me36Lr7rpAM 720 | 82pGU0POQSM 721 | I2QXo4mGeRE 722 | nCEKU3Rpvok 723 | QUAQqGpml4M 724 | 4ftnQdEaLO8 725 | zacX_7sb4j4 726 | pQ9IiR2pFQY 727 | MwHZRjcXgRE 728 | Sw8346DYwME 729 | 5awjINljUoo 730 | WnUAJsk97k4 731 | DPSLdZBaQTM 732 | QoFHDWgxD9w 733 | 49EvJsKoEko 734 | A22AWJ6zFBs 735 | ZrLi-YLGVnk 736 | BB6a5BPpbFs 737 | SAN5JBTnlas 738 | Amk7Ssb5EHo 739 | 4-hZDt0Vr5I 740 | pHUAsuSJeoA 741 | hDYLqqa1FNQ 742 | o7YS-fzv5_k 743 | CKCY0Ib-PRY 744 | P5pxcBdxa4E 745 | RfTCdZEqWcg 746 | Lop_iMipljc 747 | pfGRBQiaYZo 748 | S86qStDAz3A 749 | ZFp0yw9Dxcw 750 | 1TBxNBIUx3o 751 | kuNep98Fd8o 752 | 9cko37myhM0 753 | -SzFwDekTGo 754 | 2LouW42GMpQ 755 | D1DjJqqK5Rw 756 | I5IYfx7ontI 757 | JBozc8J3LXc 758 | fhf2cx9t4vY 759 | FR_1gEaBeeE 760 | nMYw1bi81Dg 761 | lJfi82vd9eI 762 | Z36VEiqi02Y 763 | LHO49Q572F4 764 | loCRmBvtHZ4 765 | 5IoSJhlLmqI 766 | Jd5heRezP5s 767 | M3YZEmQyt6c 768 | 85KcVDCQLfc 769 | 6Ubt6kjJJRg 770 | _jPFkOkNjuo 771 | 8rcClHLysvU 772 | ensmwD87FjM 773 | KW2cyLN7bRM 774 | 4PUBRWXm9wQ 775 | kotc5pssmt8 776 | 1fhZ3iBeK2g 777 | NhWImcVAnNg 778 | qLLwSb9YX5g 779 | ZYMb7ZJkJgc 780 | BabDKZLYlyE 781 | NhkPpOvrtz0 782 | s1hBC8N4-Zo 783 | Ee9n3mxehr4 784 | xyi0Ft3sb0U 785 | 6I846oP0E7Q 786 | 3q104tR3k2Q 787 | Aiantckb2lk 788 | vhVtX8F0-BY 789 | 7C-udYVfjoY 790 | DK6cSN_7lwk 791 | VA1EolcDwZI 792 | JxgWTP5Nnww 793 | 2NaPqrUQ-3U 794 | bcSGjoMLe4A 795 | qBnsIqyXheM 796 | K-EqzJG2W0U 797 | ZFuLPm6-kVw 798 | U6iN9QcIp7I 799 | eAS-Lj0IRk4 800 | N3nwDmWvsF4 801 | S0M77oMpTws 802 | E6jlrR-M3tk 803 | 2ZmGv8hr0ag 804 | HeWD0YLbwOw 805 | GKwO7cy2JIA 806 | lVpWbUIW0gY 807 | 603xwk_CKug 808 | Jtq-HDS3Zdc 809 | GezwOJW568A 810 | Sllno_P7l8M 811 | -3V-vUMkHqk 812 | HXHJ1hPyHd8 813 | 6PgAIsZOc5o 814 | 2gyrqiqTuiM 815 | JByLsqhRL_0 816 | 3EkqUlQ7Mw4 817 | InzjAr8VdvY 818 | IZx6ZXWuOGY 819 | bBCBMV7K-YA 820 | -7l7M6vMm1k 821 | HaX5E66eW24 822 | FGSdBlb55Ao 823 | 
ZVcWGHRPcP8 824 | 5UcdHNTO1-U 825 | XEj3ejyxjNU 826 | ds0ayqYweAc 827 | pWGLb57zbz8 828 | Uu9lMixnPGU 829 | KgebjbW1mGk 830 | 7q1E1pueAU4 831 | CSKOEhh5nBk 832 | 2OMFkTM5Mrs 833 | 18Uttg-RRIU 834 | TM-dWqH4qFI 835 | 3TfXOozH1XQ 836 | DvbeXXoraro 837 | -ieHd2A9vOg 838 | naAz_RNicmU 839 | UNFwNbFdiTI 840 | dfLQaboaa84 841 | -------------------------------------------------------------------------------- /data/music21_val.csv: -------------------------------------------------------------------------------- 1 | PlKCXvBDxaI 2 | Ba4ajAm0VDI 3 | 2XY77lk_LCQ 4 | kGAREuYVNDY 5 | LfPea4VOU7Q 6 | 0pXFhOg1o2c 7 | DLTK2_UyrDI 8 | Xafuav13p2E 9 | PohzBumrY2Q 10 | wEF6fWnx_wY 11 | gK3ooFKujO0 12 | P_dHPMofcwM 13 | E_ugm84TMvo 14 | 1owUN7YXSzE 15 | 1K-0VC9hWIA 16 | KsvxsTNvqno 17 | Y-dpHbPGQWE 18 | pIPr-lp0C6E 19 | Vk9YR-NLkmQ 20 | TaII-y9-5Iw 21 | W1I1I6J_U5Q 22 | EVp6jkgYuUc 23 | GV2bbRPFhvk 24 | R6M2LLmUg1k 25 | 8OocjGxAtHw 26 | 7uQ-FX2YPYY 27 | VcvG3_CEScw 28 | nCvy2n4ET8Y 29 | hf1QMr3zCv8 30 | Z1_Pkm_SCsU 31 | P6mSuuTzYxk 32 | 9sYVHOsxdEA 33 | 6hQHOZNwfRs 34 | 6D0ciGPtcqo 35 | -0gYWIOfqdM 36 | NdzCe7COROw 37 | 8RjCoqH6jdM 38 | bSJ2gRMbDjg 39 | FYqqkgkeYrI 40 | AlYZ7nWkzcQ 41 | Hw7Z9LUqCSI 42 | KjkMQ12U0I0 43 | 8YELO9yxs_c 44 | FyJTmjsiDuM 45 | -mXzp53rYGs 46 | 6YkGjuk773U 47 | nUZ9jX1J78c 48 | QWttV5rZMrg 49 | os3aGWhaxhY 50 | cZoWhKTZTYY 51 | LCfjGiTfHfI 52 | 6fRV-ERTGlI 53 | lp4_79ooAQo 54 | HosvsEE5SqA 55 | N8v-xQ0p8yA 56 | rg41DcQAwWI 57 | PF9PrzmkR2Q 58 | mUUpqPK_w-o 59 | QVXgHLXaoFw 60 | -Gdh8N_KpLY 61 | VYkZb7F4zKo 62 | BQq0jYGyyuo 63 | 44FPny8k4DM 64 | 9EizWhixKCg 65 | P3RSXOzak0o 66 | NxcaesZGezI 67 | 8zotbsDK62Q 68 | -X4loU7hcpw 69 | s-UjwvJ0eeA 70 | pgedyY56-bU 71 | oqwostd-nHQ 72 | _3pLesuLT4w 73 | GL-1Ke5Keok 74 | 7jsI_0tw8MY 75 | 4CTkB84ukfY 76 | VcEhFKOTF7A 77 | 2kBkI6f1X_s 78 | 9okuB-rub8c 79 | 2lJh-6TYdX8 80 | lm51ySOO71o 81 | qMFYRw0uZ_E 82 | -_F1gRwExMw 83 | t6B3JugXgTI 84 | v5bCWMzpeX8 85 | 6A6wmDbAG7s 86 | 6gjpPVZnO64 87 | E5elpplcRY8 88 | Ba5_zNpzAgw 89 
| Rj5XJO5T8PY 90 | qZZ1t3ubduo 91 | ZoE25YHTGDQ 92 | -DlGdZNAsxA 93 | XgU1dc7eys0 94 | 2avwRKFpUbw 95 | frHmgK4u0Gg 96 | 9SZLDvRaHEA 97 | HpVhQbyM2dc 98 | DIt5UQR1tdo 99 | 0yR5s-CSw4E 100 | xtVkxXOu_Mo 101 | HihdcTYa-8k 102 | WS-i7gOmKSo 103 | aCi1P8Rymo4 104 | 7zrsesZWrhc 105 | wh4H5Cfg1RQ 106 | Oe0njWj0KDE 107 | C952gmBlkIU 108 | 5_NsbkyLO2Q 109 | MqB8Ux4DsIM 110 | i6t-dNA9Fck 111 | -H44pbRIg7s 112 | DGn7xcsxMPk 113 | 3yRMbH36HRE 114 | WnXzjXoSF_g 115 | nBHt2QYgzsY 116 | EXMQITpeeaM 117 | FcO9w1mYEEs 118 | -5CDUnGu3jA 119 | JeohKdrShNI 120 | D6RmuscqupA 121 | KC2RgtW4IZ0 122 | PMDSfAZ4-eo 123 | aZjtE82spMA 124 | PCicM6i59_I 125 | cIR1PDcHaew 126 | 6d0tmb28kV4 127 | -5MipMQ25cU 128 | QoBoHAujB6I 129 | 2P83WJXifEs 130 | DlqCn_xrNRU 131 | zJIt_KdE3Jc 132 | -QEVcuzr5bQ 133 | dXiiW1p_lTI 134 | BQH7E8ByxC8 135 | i53jcmLQESI 136 | h61TbdGR7Rk 137 | S1iKARDRvX4 138 | a9psZCnRJes 139 | Vs1NPirh2jI 140 | Xm5a8vCHlFw 141 | Mh2tJbJiWQQ 142 | j43Hj6k9Zy4 143 | Ct-SQmiA31s 144 | A8TgRagoY18 145 | SyToSKUxgKs 146 | j97FUC5-hpY 147 | 087sDeYPdjY 148 | F-sSpzBT_Rk 149 | dVxfGxQq-vc 150 | 19CdPaHrVPQ 151 | YiE-dyrpRgA 152 | mjL-XSxiu5k 153 | UKk9DJD1jOo 154 | nMLPDNPibO0 155 | 5lm9laLSORc 156 | lEDKJFgpakg 157 | 889Kxn_RYzo 158 | Gt8Sfng1YCA 159 | PobONF_Mj4g 160 | bGfyLBoZPM4 161 | tfcNUo8qjrA 162 | VdFnyZWJAgo 163 | LRb3gwiLZI8 164 | Vxe4R5qqEPI 165 | LJ_IFurpFvo 166 | IgtMKN8awWA 167 | 31GOxPXDNkk 168 | 8o5ngxZaYYw 169 | 2D2ALfuqDpk 170 | tWH7qKYl5uQ 171 | FntHqAlD1S0 172 | GIB4tnBSWtg 173 | 5VACMCjJduY 174 | GXcRI0BdioE 175 | 4Tn7MBNTFAA 176 | -qbAJMQBoX4 177 | 3wL4J4IDh7c 178 | 7baQOJtiFMc 179 | YtOvrQNMkY0 180 | 2gk-IN35s5I 181 | FoOVn7MACAE 182 | S9ES30JJTZc 183 | Ljf3tZGxaJM 184 | kw-OQpF9N4E 185 | Lv2vc_8CZLc 186 | 0VyVd_QUCl8 187 | 6E1IGb3137U 188 | 98Menrr741Q 189 | McMd-s4XpZk 190 | BWqae9kUD_M 191 | U6j1KyG1mTw 192 | dhWU1tU1HbY 193 | 08FQMisUWAY 194 | LKMBOuDkEqo 195 | fful4o8CIW8 196 | 3Y2vbEwMkeQ 197 | Nap3NTXTNE4 198 | sTo7AxrWIHE 199 | FYxv-VOKa40 200 | 
0lSLVJRTeEI 201 | Ln6ynUjGa80 202 | P5nnnybNQ_o 203 | XlPvAkLT3Yw 204 | F9JPakNkuok 205 | -oe-KptPGnY 206 | MTyf4utLlJk 207 | apGScvWeLaE 208 | DocbWENGxsE 209 | --IXKwloYLE 210 | MvPqp3oZwOk 211 | 59GKBsOJSNs 212 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-pretrained-bert==0.6.2 2 | numpy==1.16.4 3 | lmdb==0.94 4 | tensorboardX==1.2 5 | tensorflow==1.13.1 6 | tensorpack==0.9.4 7 | tqdm==4.31.1 8 | easydict==1.9 9 | PyYAML==5.1.2 10 | jsonlines==1.2.0 11 | json-lines==0.5.0 12 | matplotlib 13 | Cython 14 | msgpack 15 | msgpack-numpy 16 | -------------------------------------------------------------------------------- /retrieval/eval_score.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | import numpy as np 6 | import tribert_retrieval_networks 7 | import orig_retrieval_networks 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate(model, test_vision, test_audio, test_pose, rand_idx, args, device): 12 | 13 | num_eval = test_vision.shape[0] 14 | 15 | correct_1 = 0 16 | correct_5 = 0 17 | correct_10 = 0 18 | 19 | # Get query embeddings and target embedding 20 | for i in tqdm(range(num_eval)): 21 | if args.retrieval_variant == 'aud2vis': 22 | query = torch.from_numpy(test_audio[rand_idx[i], ...]).float().to(device) 23 | result = torch.from_numpy(test_vision[rand_idx[i], ...]).float().to(device) 24 | if args.retrieval_variant == 'vis2aud': 25 | query = torch.from_numpy(test_vision[rand_idx[i], ...]).float().to(device) 26 | result = torch.from_numpy(test_audio[rand_idx[i], ...]).float().to(device) 27 | if args.retrieval_variant == 'aud2pose': 28 | query = torch.from_numpy(test_audio[rand_idx[i], ...]).float().to(device) 29 | result = torch.from_numpy(test_pose[rand_idx[i], ...]).float().to(device) 30 | if 
args.retrieval_variant == 'pose2aud': 31 | query = torch.from_numpy(test_pose[rand_idx[i], ...]).float().to(device) 32 | result = torch.from_numpy(test_audio[rand_idx[i], ...]).float().to(device) 33 | if args.retrieval_variant == 'vis+aud2pose': 34 | vision = torch.from_numpy(test_vision[rand_idx[i], ...]).float().to(device) 35 | audio = torch.from_numpy(test_audio[rand_idx[i], ...]).float().to(device) 36 | query = model.fuse_forward(vision, audio) 37 | result = torch.from_numpy(test_pose[rand_idx[i], ...]).float().to(device) 38 | 39 | scores = torch.zeros(num_eval).to(device) 40 | 41 | for j in range(test_vision.shape[0]): 42 | score = model.forward_once(query, result) 43 | scores[j] = score 44 | 45 | scores = F.softmax(scores, dim=0) 46 | ordered_scores = torch.argsort(scores, descending=True) 47 | 48 | if rand_idx[i] in ordered_scores[:1]: 49 | correct_1 += 1 50 | 51 | if rand_idx[i] in ordered_scores[:5]: 52 | correct_5 += 1 53 | 54 | if rand_idx[i] in ordered_scores[:10]: 55 | correct_10 += 1 56 | 57 | acc_1 = correct_1 / num_eval 58 | acc_5 = correct_5 / num_eval 59 | acc_10 = correct_10 / num_eval 60 | 61 | return acc_1, acc_5, acc_10 62 | 63 | 64 | def test(tribert_model, orig_model, tribert_embeddings, orig_embeddings, device, args): 65 | 66 | # Unpack embeddings 67 | tribert_vis = tribert_embeddings['vision'] 68 | tribert_aud = tribert_embeddings['audio'] 69 | tribert_pose = tribert_embeddings['pose'] 70 | 71 | orig_vis = orig_embeddings['vision'] 72 | orig_aud = orig_embeddings['audio'] 73 | orig_pose = orig_embeddings['pose'] 74 | 75 | num_tribert = tribert_vis.shape[0] 76 | num_orig = orig_vis.shape[0] 77 | 78 | print(f'Evaluating on {num_tribert} tribert embeddings and {num_orig} original embeddings...') 79 | 80 | # Shuffle the embeddings 81 | rand_tribert = np.arange(num_tribert) 82 | rand_orig = np.arange(num_orig) 83 | np.random.shuffle(rand_tribert) 84 | np.random.shuffle(rand_orig) 85 | 86 | # Evaluate retrieval on tribert embeddings 87 | 
def test(tribert_model, orig_model, tribert_embeddings, orig_embeddings, device, args):
    """Evaluate retrieval for both embedding flavours.

    Returns two (top1, top5, top10) tuples: TriBERT first, baseline second.
    """
    # Unpack the three modalities for each embedding type.
    tribert_vis = tribert_embeddings['vision']
    tribert_aud = tribert_embeddings['audio']
    tribert_pose = tribert_embeddings['pose']

    orig_vis = orig_embeddings['vision']
    orig_aud = orig_embeddings['audio']
    orig_pose = orig_embeddings['pose']

    num_tribert = tribert_vis.shape[0]
    num_orig = orig_vis.shape[0]

    print(f'Evaluating on {num_tribert} tribert embeddings and {num_orig} original embeddings...')

    # Random evaluation order for each embedding set (permutation consumes
    # the RNG stream exactly like shuffling an arange).
    rand_tribert = np.random.permutation(num_tribert)
    rand_orig = np.random.permutation(num_orig)

    tribert_scores = evaluate(tribert_model, tribert_vis, tribert_aud, tribert_pose, rand_tribert, args, device)
    orig_scores = evaluate(orig_model, orig_vis, orig_aud, orig_pose, rand_orig, args, device)

    return tribert_scores, orig_scores


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--tribert_embedding_path', type=str, default='retrieval_embeddings/tribert_test.pt')
    parser.add_argument('--orig_embedding_path', type=str, default='retrieval_embeddings/orig_test.pt')
    parser.add_argument('--tribert_path', type=str, help='path of tribert embeddings retrieval network checkpoint')
    parser.add_argument('--orig_path', type=str, help='path of original embeddings retrieval network checkpoint')
    parser.add_argument('--retrieval_variant', type=str, choices=['aud2vis', 'vis2aud', 'aud2pose', 'pose2aud', 'vis+aud2pose'], help='select 1 of the 5 retrieval variants shown in the paper')
    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Each .pt file holds a dict with keys 'vision', 'audio', 'pose'; every
    # value is an (n, k) matrix: n embeddings of dimensionality k.
    tribert_embeddings = torch.load(args.tribert_embedding_path)
    orig_embeddings = torch.load(args.orig_embedding_path)

    np.random.seed(10)
    torch.manual_seed(1)

    tribert_ckpt = torch.load(args.tribert_path, map_location=device)
    orig_ckpt = torch.load(args.orig_path, map_location=device)

    # Checkpoint keys match the CLI variant; the class name just drops the '+'.
    variant = args.retrieval_variant
    class_name = 'visaud2pose' if variant == 'vis+aud2pose' else variant
    tribert = tribert_ckpt[variant]
    orig = orig_ckpt[variant]
    tribert_model = getattr(tribert_retrieval_networks, class_name)()
    orig_model = getattr(orig_retrieval_networks, class_name)()

    tribert_model, orig_model = tribert_model.to(device), orig_model.to(device)
    tribert_model.load_state_dict(tribert['model_state_dict'])
    orig_model.load_state_dict(orig['model_state_dict'])

    tribert_model.eval()
    orig_model.eval()

    # Evaluate retrieval for both embedding types.
    tribert_acc, orig_acc = test(tribert_model, orig_model, tribert_embeddings, orig_embeddings, device, args)

    print("*" * 80)
    print("TriBERT embedding retrieval results\n")
    print("*" * 80)
    print(f'Top-1 Acc: {tribert_acc[0] * 100:.2f}%, Top-5 Acc: {tribert_acc[1] * 100:.2f}%, Top-10 Acc: {tribert_acc[2] * 100:.2f}%')

    print("*" * 80)
    print("Original embedding retrieval results\n")
    print("*" * 80)
    print(f'Top-1 Acc: {orig_acc[0] * 100:.2f}%, Top-5 Acc: {orig_acc[1] * 100:.2f}%, Top-10 Acc: {orig_acc[2] * 100:.2f}%')
class aud2vis(nn.Module):
    """Baseline-embedding retrieval: audio query -> vision candidate."""

    def __init__(self):
        super(aud2vis, self).__init__()
        # Lift the 512-d audio embedding into the 8192-d pooled-vision space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 8192))
        self.pool = nn.AvgPool2d(kernel_size=7)
        # Scoring head over the elementwise product of the two embeddings.
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, audio, vision):
        """Alignment score for one (audio, raw vision feature map) pair."""
        audio = self.audio_transform(audio)
        # Collapse the 7x7 spatial grid of the raw vision features.
        vision = vision.view(-1, 4 * 2048, 7, 7)
        vision = self.pool(vision).squeeze()
        return self.align_net(audio * vision)

    def forward(self, sample, neg1, neg2):
        """Softmax over [positive, negative-1, negative-2] alignment scores."""
        anchor = sample['audio']
        pos = self.forward_once(anchor, sample['vision'])
        n1 = self.forward_once(anchor, neg1['vision'])
        n2 = self.forward_once(anchor, neg2['vision'])
        return F.softmax(torch.cat((pos, n1, n2), dim=1), dim=1)


class vis2aud(nn.Module):
    """Baseline-embedding retrieval: vision query -> audio candidate."""

    def __init__(self):
        super(vis2aud, self).__init__()
        # Lift the 512-d audio embedding into the 8192-d pooled-vision space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 8192))
        self.pool = nn.AvgPool2d(kernel_size=7)
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, vision, audio):
        """Alignment score for one (pooled vision, audio) pair."""
        audio = self.audio_transform(audio)
        return self.align_net(audio * vision)

    def forward(self, sample, neg1, neg2):
        # Pool the anchor's vision features once, then score all three pairs.
        anchor = sample['vision'].view(-1, 4 * 2048, 7, 7)
        anchor = self.pool(anchor).squeeze()
        pos = self.forward_once(anchor, sample['audio'])
        n1 = self.forward_once(anchor, neg1['audio'])
        n2 = self.forward_once(anchor, neg2['audio'])
        return F.softmax(torch.cat((pos, n1, n2), dim=1), dim=1)


class aud2pose(nn.Module):
    """Baseline-embedding retrieval: audio query -> pose candidate."""

    def __init__(self):
        super(aud2pose, self).__init__()
        # Map both modalities into a shared 4096-d space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 4096))
        self.pose_transform = nn.Linear(139264, 4096)
        self.align_net = nn.Sequential(
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 512), nn.Tanh(),
            nn.Linear(512, 1),
        )

    def forward_once(self, audio, pose):
        """Score one pair; expects `audio` already mapped to 4096-d."""
        pose = self.pose_transform(pose)
        return self.align_net(audio * pose)

    def forward(self, sample, neg1, neg2):
        # Transform the audio anchor once and reuse it for all three pairs.
        anchor = self.audio_transform(sample['audio'])
        pos = self.forward_once(anchor, sample['pose'])
        n1 = self.forward_once(anchor, neg1['pose'])
        n2 = self.forward_once(anchor, neg2['pose'])
        return F.softmax(torch.cat((pos, n1, n2), dim=1), dim=1)


class pose2aud(nn.Module):
    """Baseline-embedding retrieval: pose query -> audio candidate."""

    def __init__(self):
        super(pose2aud, self).__init__()
        # Map both modalities into a shared 4096-d space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 4096))
        self.pose_transform = nn.Linear(139264, 4096)
        self.align_net = nn.Sequential(
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 512), nn.Tanh(),
            nn.Linear(512, 1),
        )

    def forward_once(self, pose, audio):
        """Score one pair; expects `pose` already mapped to 4096-d."""
        audio = self.audio_transform(audio)
        return self.align_net(audio * pose)

    def forward(self, sample, neg1, neg2):
        # Transform the pose anchor once and reuse it for all three pairs.
        anchor = self.pose_transform(sample['pose'])
        pos = self.forward_once(anchor, sample['audio'])
        n1 = self.forward_once(anchor, neg1['audio'])
        n2 = self.forward_once(anchor, neg2['audio'])
        return F.softmax(torch.cat((pos, n1, n2), dim=1), dim=1)


class visaud2pose(nn.Module):
    """Baseline-embedding retrieval: fused (vision + audio) query -> pose candidate."""

    def __init__(self):
        super(visaud2pose, self).__init__()
        # Map every modality into a shared 2048-d space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 2048))
        self.pose_transform = nn.Linear(139264, 2048)
        self.pool = nn.AvgPool2d(kernel_size=7)
        self.vision_transform = nn.Linear(8192, 2048)

        # Residual MLP applied on top of the attention-style fusion.
        self.fuse = nn.Sequential(
            nn.Linear(2048, 2048), nn.Tanh(),
            nn.Linear(2048, 2048), nn.Tanh(),
            nn.Linear(2048, 2048),
        )

        self.align_net = nn.Sequential(
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 512), nn.Tanh(),
            nn.Linear(512, 1),
        )

    def fuse_forward(self, vision, audio):
        """Fuse pooled vision and transformed audio into one 2048-d query."""
        vision = vision.view(-1, 4 * 2048, 7, 7)
        vision = self.pool(vision).squeeze()
        vision = self.vision_transform(vision)

        audio = self.audio_transform(audio)  # now 2048-d

        # Soft gating of vision by audio, with residual connections.
        gated = (F.softmax(audio * vision, dim=-1) * vision) + audio
        return self.fuse(gated) + gated

    def forward_once(self, fuse, pose):
        """Alignment score for one (fused query, pose) pair."""
        pose = self.pose_transform(pose)
        return self.align_net(fuse * pose)

    def forward(self, sample, neg1, neg2, hard_neg):
        """Softmax over [positive, neg1, neg2, hard negative] scores."""
        anchor = self.fuse_forward(sample['vision'], sample['audio'])
        pos = self.forward_once(anchor, sample['pose'])
        n1 = self.forward_once(anchor, neg1['pose'])
        n2 = self.forward_once(anchor, neg2['pose'])
        hard = self.forward_once(anchor, hard_neg['pose'])
        return F.softmax(torch.cat((pos, n1, n2, hard), dim=1), dim=1)
class SingleModalRetrievalDataset(Dataset):
    """3-way multiple-choice dataset for single-modality retrieval training.

    Each item is (positive sample, hard negative 1, hard negative 2, target),
    where target is always 0 because the model emits the positive score first.
    Hard negatives are drawn from the query's 25 nearest neighbours in the
    vision-embedding space.
    """

    def __init__(self, vision_embed, audio_embed, pose_embed, vision_trunc=None):
        # Paths to .npy files of shape (n, k).  `vision_trunc` is kept for
        # backward compatibility but is currently unused.
        self.vision_embed = np.load(vision_embed)
        self.audio_embed = np.load(audio_embed)
        self.pose_embed = np.load(pose_embed)

        # Nearest-neighbour index over vision embeddings for hard negatives.
        self.neigh = NearestNeighbors(n_neighbors=25)
        self.neigh.fit(self.vision_embed)

    def __len__(self):
        return self.vision_embed.shape[0]

    def __getitem__(self, idx):
        vision = self.vision_embed[idx, ...]
        audio = self.audio_embed[idx, ...]
        pose = self.pose_embed[idx, ...]

        # Sample negatives uniformly from the query's nearest neighbours
        # (index 0 is the query itself, so draw from [1, 25)):
        #   neg pair 1: correct audio, wrong pose
        #   neg pair 2: wrong audio, correct pose
        neigh_ind = self.neigh.kneighbors([vision], return_distance=False).squeeze()
        neg_ind = neigh_ind[np.random.randint(low=1, high=25, size=3)]

        sample = {'vision': torch.from_numpy(vision).float(),
                  'audio': torch.from_numpy(audio).float(),
                  'pose': torch.from_numpy(pose).float()}

        neg1 = {'audio': torch.from_numpy(audio).float(),
                'pose': torch.from_numpy(self.pose_embed[neg_ind[0], ...]).float()}

        neg2 = {'audio': torch.from_numpy(self.audio_embed[neg_ind[1], ...]).float(),
                'pose': torch.from_numpy(pose).float()}

        # Bug fix: the original returned an undefined `neg3` here, raising
        # NameError on every iteration.  The training loop for single-modal
        # variants consumes exactly (sample, neg1, neg2, target).
        target = 0  # the positive pair's score is always emitted first
        return sample, neg1, neg2, target


'''
Used to train visaud2pose. 4-way multiple choice with one positive pair, two easy negatives, and a hard negative.
Easy negatives are sampled randomly from the dataset while hard negatives are sampled using nearest neighbours.
'''
class MultiModalRetrievalDataset(Dataset):
    """4-way multiple-choice dataset for fused vision+audio -> pose retrieval.

    Each item is (sample, easy neg 1, easy neg 2, hard neg, target); target is
    always 0.  Easy negatives are random dataset entries, the hard negative
    comes from the query's nearest neighbours in vision space.
    """

    def __init__(self, vision_embed, audio_embed, pose_embed):
        # Paths to .npy files of shape (n, k).
        self.vision_embed = np.load(vision_embed)
        self.audio_embed = np.load(audio_embed)
        self.pose_embed = np.load(pose_embed)

        # Nearest-neighbour index over vision embeddings for hard negatives.
        self.neigh = NearestNeighbors(n_neighbors=25)
        self.neigh.fit(self.vision_embed)

    def __len__(self):
        return self.vision_embed.shape[0]

    def __getitem__(self, idx):
        vision = self.vision_embed[idx, ...]  # (8192)
        audio = self.audio_embed[idx, ...]    # (4096)
        pose = self.pose_embed[idx, ...]      # (8192)

        # One hard negative from the 25 nearest neighbours (skipping the
        # closest few), plus two easy negatives sampled uniformly.
        neigh_ind = self.neigh.kneighbors([vision], return_distance=False).squeeze()
        hard_neg_ind = neigh_ind[np.random.randint(low=3, high=25, size=1)]
        neg_ind = np.random.randint(self.vision_embed.shape[0], size=2)

        sample = {'vision': torch.from_numpy(vision).float(),
                  'audio': torch.from_numpy(audio).float(),
                  'pose': torch.from_numpy(pose).float()}

        # Easy negatives
        neg1 = {'vision': torch.from_numpy(self.vision_embed[neg_ind[0], ...]).float(),
                'audio': torch.from_numpy(self.audio_embed[neg_ind[0], ...]).float(),
                'pose': torch.from_numpy(self.pose_embed[neg_ind[0], ...]).float()}

        neg2 = {'vision': torch.from_numpy(self.vision_embed[neg_ind[1], ...]).float(),
                'audio': torch.from_numpy(self.audio_embed[neg_ind[1], ...]).float(),
                'pose': torch.from_numpy(self.pose_embed[neg_ind[1], ...]).float()}

        # Hard negative
        neg3 = {'vision': torch.from_numpy(self.vision_embed[hard_neg_ind[0], ...]).float(),
                'audio': torch.from_numpy(self.audio_embed[hard_neg_ind[0], ...]).float(),
                'pose': torch.from_numpy(self.pose_embed[hard_neg_ind[0], ...]).float()}

        target = 0  # In the output of model(), the first score is for the positive pair
        return sample, neg1, neg2, neg3, target
def train(model, criterion, optimizer, dataloader, device, args):
    """Train a retrieval network, checkpointing when the epoch-average loss improves.

    Saves the best checkpoint and the per-batch loss history under
    ./checkpoints/orig/<exp_name>/ and returns the trained model.
    """
    ckpt_dir = f'./checkpoints/orig/{args.exp_name}'
    os.makedirs(ckpt_dir, exist_ok=True)

    loss_history = []
    best_loss = 1000000  # sentinel: any realistic first-epoch loss beats this

    for epoch in range(args.epochs):
        epoch_loss = 0.0
        num_batches = 0
        for batch_idx, batch in enumerate(dataloader):
            optimizer.zero_grad()

            # Move every tensor in the batch onto the target device
            # (last element is the target, the rest are modality dicts).
            batch[-1] = batch[-1].to(device)
            for part in batch[:-1]:
                for key in part:
                    part[key] = part[key].to(device)

            if args.retrieval_mode == 'visaud2pose':
                # 4-way choice: positive, two easy negatives, one hard negative.
                pos, neg1, neg2, hard_neg, target = batch[0], batch[1], batch[2], batch[3], batch[4]
                scores = model(pos, neg1, neg2, hard_neg)  # (batch_size, 4)
            else:
                # 3-way choice: positive and two negatives.
                pos, neg1, neg2, target = batch[0], batch[1], batch[2], batch[3]
                scores = model(pos, neg1, neg2)  # (batch_size, 3)

            loss = criterion(scores, target)

            loss_history.append(loss.item())
            epoch_loss += loss.item()
            num_batches += 1

            loss.backward()
            optimizer.step()

        avg_loss = epoch_loss / num_batches

        # Checkpoint only when this epoch beat the best average loss so far.
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': best_loss
            }, os.path.join(ckpt_dir, f'{args.exp_name}.pt'))

    losses = np.asarray(loss_history)
    smallest_loss = losses.min()
    np.save(os.path.join(ckpt_dir, f'{args.exp_name}_loss_{smallest_loss:.2f}.npy'), losses)

    return model


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Training settings
    parser.add_argument('--exp_name', type=str, help='name of experiment. Will be used to name saved checkpoints')
    parser.add_argument('--embeddings_path', type=str, help='path of folder containing the .npy embedding files')
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=1000)

    # Model settings
    parser.add_argument('--retrieval_mode', type=str, choices=['aud2vis', 'vis2aud', 'aud2pose', 'pose2aud', 'visaud2pose'])
    parser.add_argument('--embedding', type=str, choices=['orig', 'tribert'],
                        help='retrieval using baseline representations or tribert representations')
    args = parser.parse_args()

    # Vision/audio/pose embeddings live in (n, k) .npy files; make sure the
    # folder matches the chosen --embedding flavour (baseline vs tribert).
    VIS_EMBED_PATH = os.path.join(args.embeddings_path, 'train_vision.npy')
    AUD_EMBED_PATH = os.path.join(args.embeddings_path, 'train_audio.npy')
    POS_EMBED_PATH = os.path.join(args.embeddings_path, 'train_pose.npy')

    device = 'cuda'

    np.random.seed(1)
    torch.manual_seed(1)

    # Dataset: 4-way multiple choice for the fused variant, 3-way otherwise.
    if args.retrieval_mode == 'visaud2pose':
        train_dataset = MultiModalRetrievalDataset(vision_embed=VIS_EMBED_PATH,
                                                   audio_embed=AUD_EMBED_PATH,
                                                   pose_embed=POS_EMBED_PATH)
    else:
        train_dataset = SingleModalRetrievalDataset(vision_embed=VIS_EMBED_PATH,
                                                    audio_embed=AUD_EMBED_PATH,
                                                    pose_embed=POS_EMBED_PATH)

    dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False)

    # Model: pick the network class matching the mode from the chosen module.
    networks = tribert_retrieval_networks if args.embedding == 'tribert' else orig_retrieval_networks
    if args.retrieval_mode in ('aud2vis', 'vis2aud', 'aud2pose', 'pose2aud'):
        model = getattr(networks, args.retrieval_mode)()
    else:
        model = networks.visaud2pose()

    model = model.to(device)

    # Train
    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=args.lr)

    model = train(model, criterion, optimizer, dataloader, device, args)
class aud2vis(nn.Module):
    """TriBERT-embedding retrieval: audio query -> vision candidate."""

    def __init__(self):
        super(aud2vis, self).__init__()
        # Lift the 4096-d TriBERT audio embedding to the 8192-d vision space.
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))
        # Scoring head over the elementwise product of the two embeddings.
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, audio, vision):
        """Alignment score for one (audio, vision) pair."""
        audio = self.audio_transform(audio)
        return self.align_net(audio * vision)

    def forward(self, sample, neg1, neg2):
        """Softmax over [positive, negative-1, negative-2] scores."""
        anchor = sample['audio']
        pos = self.forward_once(anchor, sample['vision'])
        n1 = self.forward_once(anchor, neg1['vision'])
        n2 = self.forward_once(anchor, neg2['vision'])
        return F.softmax(torch.cat((pos, n1, n2), dim=1), dim=1)


class vis2aud(nn.Module):
    """TriBERT-embedding retrieval: vision query -> audio candidate."""

    def __init__(self):
        super(vis2aud, self).__init__()
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, vision, audio):
        """Alignment score for one (vision, audio) pair."""
        audio = self.audio_transform(audio)
        return self.align_net(audio * vision)

    def forward(self, sample, neg1, neg2):
        anchor = sample['vision']
        pos = self.forward_once(anchor, sample['audio'])
        n1 = self.forward_once(anchor, neg1['audio'])
        n2 = self.forward_once(anchor, neg2['audio'])
        return F.softmax(torch.cat((pos, n1, n2), dim=1), dim=1)


class aud2pose(nn.Module):
    """TriBERT-embedding retrieval: audio query -> pose candidate."""

    def __init__(self):
        super(aud2pose, self).__init__()
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, audio, pose):
        """Alignment score for one (audio, pose) pair."""
        audio = self.audio_transform(audio)
        return self.align_net(audio * pose)

    def forward(self, sample, neg1, neg2):
        anchor = sample['audio']
        pos = self.forward_once(anchor, sample['pose'])
        n1 = self.forward_once(anchor, neg1['pose'])
        n2 = self.forward_once(anchor, neg2['pose'])
        return F.softmax(torch.cat((pos, n1, n2), dim=1), dim=1)


class pose2aud(nn.Module):
    """TriBERT-embedding retrieval: pose query -> audio candidate."""

    def __init__(self):
        super(pose2aud, self).__init__()
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, pose, audio):
        """Alignment score for one (pose, audio) pair."""
        audio = self.audio_transform(audio)
        return self.align_net(audio * pose)

    def forward(self, sample, neg1, neg2):
        anchor = sample['pose']
        pos = self.forward_once(anchor, sample['audio'])
        n1 = self.forward_once(anchor, neg1['audio'])
        n2 = self.forward_once(anchor, neg2['audio'])
        return F.softmax(torch.cat((pos, n1, n2), dim=1), dim=1)


class visaud2pose(nn.Module):
    """TriBERT-embedding retrieval: fused (vision + audio) query -> pose candidate."""

    def __init__(self):
        super(visaud2pose, self).__init__()
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))

        # Residual MLP applied on top of the attention-style fusion.
        self.fuse = nn.Sequential(
            nn.Linear(8192, 8192), nn.Tanh(),
            nn.Linear(8192, 8192), nn.Tanh(),
            nn.Linear(8192, 8192),
        )

        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def fuse_forward(self, vision, audio):
        """Fuse vision and (transformed) audio into a single 8192-d query."""
        audio = self.audio_transform(audio)

        # Regroup both 8192-d embeddings as (batch, 4, 2, 1024) so the
        # softmax gate acts over each 1024-d chunk independently.
        audio = audio.view(-1, 4, 2, 1024)
        vision = vision.view(-1, 4, 2, 1024)

        gated = (F.softmax(audio * vision, dim=-1) * vision) + audio
        gated = gated.view(-1, 4 * 2 * 1024)
        return self.fuse(gated) + gated  # residual connection

    def forward_once(self, fuse, pose):
        """Alignment score for one (fused query, pose) pair."""
        return self.align_net(fuse * pose)

    def forward(self, sample, neg1, neg2, hard_neg):
        """Softmax over [positive, neg1, neg2, hard negative] scores."""
        anchor = self.fuse_forward(sample['vision'], sample['audio'])
        pos = self.forward_once(anchor, sample['pose'])
        n1 = self.forward_once(anchor, neg1['pose'])
        n2 = self.forward_once(anchor, neg2['pose'])
        hard = self.forward_once(anchor, hard_neg['pose'])
        return F.softmax(torch.cat((pos, n1, n2, hard), dim=1), dim=1)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__pycache__/tribert.cpython-36.pyc -------------------------------------------------------------------------------- /tribert/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /tribert/__pycache__/vilbert.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__pycache__/vilbert.cpython-36.pyc -------------------------------------------------------------------------------- /tribert/datasets/__pycache__/base.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/datasets/__pycache__/base.cpython-36.pyc -------------------------------------------------------------------------------- /tribert/datasets/__pycache__/music_multimodal.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/datasets/__pycache__/music_multimodal.cpython-36.pyc -------------------------------------------------------------------------------- /tribert/datasets/__pycache__/video_transforms.cpython-36.pyc: -------------------------------------------------------------------------------- 
import random
import csv
import numpy as np
import torch
import torch.utils.data as torchdata
from torchvision import transforms
import torchaudio
import librosa
from PIL import Image
import os

from . import video_transforms as vtransforms


class BaseDataset(torchdata.Dataset):
    """Shared loading/augmentation logic for the MUSIC datasets.

    Handles video-frame transforms, raw audio loading/resampling, and STFT
    computation. Subclasses implement ``__getitem__``.
    """

    def __init__(self, list_sample, opt, max_sample=-1, split='train'):
        """Build the sample list and transforms.

        Args:
            list_sample: either a python list of sample ids, or the path of a
                CSV file whose first column holds the sample ids.
            opt: options namespace carrying frame/audio/STFT hyperparameters.
            max_sample: if > 0, truncate the sample list to this many entries.
            split: 'train' enables random augmentation and list duplication.
        """
        # frame sampling params
        self.num_frames = opt.num_frames
        self.stride_frames = opt.stride_frames
        self.frameRate = opt.frameRate
        self.imgSize = opt.imgSize
        # audio params
        self.audRate = opt.audRate
        self.audLen = opt.audLen
        # clip duration in seconds implied by audLen samples at audRate Hz
        self.audSec = 1. * self.audLen / self.audRate
        self.binary_mask = opt.binary_mask

        # STFT params
        self.log_freq = opt.log_freq
        self.stft_frame = opt.stft_frame
        self.stft_hop = opt.stft_hop
        self.HS = opt.stft_frame // 2 + 1           # spectrogram height (freq bins)
        self.WS = (self.audLen + 1) // self.stft_hop  # spectrogram width (time frames)

        self.split = split
        self.seed = opt.seed
        random.seed(self.seed)

        # initialize video transform
        self._init_vtransform()

        # list_sample can be a python list or a csv file of list
        if isinstance(list_sample, str):
            self.list_sample = []
            # FIX: use a context manager so the CSV handle is closed
            # (original left the file object dangling).
            with open(list_sample, 'r') as f:
                for row in csv.reader(f, delimiter=','):
                    if len(row) < 1:
                        continue
                    self.list_sample.append(row[0])
        elif isinstance(list_sample, list):
            self.list_sample = list_sample
        else:
            # FIX: `raise('...')` raised a bare string, which in Python 3 is a
            # TypeError ("exceptions must derive from BaseException"); raise a
            # real exception with the same message instead.
            raise TypeError('Error list_sample!')

        if self.split == 'train':
            # duplicate the epoch to reduce dataloader restart overhead
            self.list_sample *= opt.dup_trainset
            random.shuffle(self.list_sample)

        if max_sample > 0:
            self.list_sample = self.list_sample[0:max_sample]

        num_sample = len(self.list_sample)
        self.num_dataset = num_sample
        assert num_sample > 0
        print('# samples: {}'.format(num_sample))

    def __len__(self):
        return len(self.list_sample)

    # video transform funcs
    def _init_vtransform(self):
        """Build the per-clip transform pipeline (random crop/flip for train,
        center crop for eval), ending with tensor conversion and stacking."""
        transform_list = []
        # ImageNet normalization statistics
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]

        if self.split == 'train':
            transform_list.append(vtransforms.Resize(int(self.imgSize * 1.1), Image.BICUBIC))
            transform_list.append(vtransforms.RandomCrop(self.imgSize))
            transform_list.append(vtransforms.RandomHorizontalFlip())
        else:
            transform_list.append(vtransforms.Resize(self.imgSize, Image.BICUBIC))
            transform_list.append(vtransforms.CenterCrop(self.imgSize))

        transform_list.append(vtransforms.ToTensor())
        transform_list.append(vtransforms.Normalize(mean, std))
        transform_list.append(vtransforms.Stack())
        self.vid_transform = transforms.Compose(transform_list)

    # image transform funcs, deprecated
    def _init_transform(self):
        """Single-image transform pipeline. Deprecated; kept for reference.
        NOTE(review): transforms.Scale is itself deprecated in torchvision."""
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]

        if self.split == 'train':
            self.img_transform = transforms.Compose([
                transforms.Scale(int(self.imgSize * 1.2)),
                transforms.RandomCrop(self.imgSize),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)])
        else:
            self.img_transform = transforms.Compose([
                transforms.Scale(self.imgSize),
                transforms.CenterCrop(self.imgSize),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)])

    def _load_frames(self, paths):
        """Load a list of frame paths, returning (transformed clip tensor,
        list of the original (width, height) sizes before transform)."""
        frames = []
        original_img_size = []
        for path in paths:
            frames.append(self._load_frame(path))
        for frame in frames:
            original_img_size.append(frame.size)
        frames = self.vid_transform(frames)
        return frames, original_img_size

    def _load_frame(self, path):
        """Open one image as RGB (drops alpha / converts grayscale)."""
        img = Image.open(path).convert('RGB')
        return img

    def _stft(self, audio):
        """Return (magnitude, phase) tensors of the STFT of a 1-D waveform."""
        spec = librosa.stft(
            audio, n_fft=self.stft_frame, hop_length=self.stft_hop)
        amp = np.abs(spec)
        phase = np.angle(spec)
        return torch.from_numpy(amp), torch.from_numpy(phase)

    def _load_audio_file(self, path):
        """Load a waveform as mono float32, returning (samples, sample_rate)."""
        if path.endswith('.mp3'):
            audio_raw, rate = torchaudio.load(path)
            audio_raw = audio_raw.numpy().astype(np.float32)

            # range to [-1, 1]
            # NOTE(review): the 2**-31 scale assumes int32-scaled output and the
            # (samples, channels) layout assumes an older torchaudio; newer
            # torchaudio returns (channels, samples) already in [-1, 1] — verify
            # against the pinned torchaudio version.
            audio_raw *= (2.0**-31)

            # convert to mono
            if audio_raw.shape[1] == 2:
                audio_raw = (audio_raw[:, 0] + audio_raw[:, 1]) / 2
            else:
                audio_raw = audio_raw[:, 0]
        else:
            audio_raw, rate = librosa.load(path, sr=None, mono=True)

        return audio_raw, rate

    def _load_audio(self, path, center_timestamp, nearest_resample=False):
        """Return a fixed-length (audLen) window of audio centered at
        `center_timestamp` seconds, resampled to audRate and zero-padded at
        the borders. Training additionally applies random volume jitter."""
        audio = np.zeros(self.audLen, dtype=np.float32)

        # silent placeholder path -> all-zero clip
        if path.endswith('silent'):
            return audio

        # load audio
        audio_raw, rate = self._load_audio_file(path)

        # repeat if audio is too short
        if audio_raw.shape[0] < rate * self.audSec:
            n = int(rate * self.audSec / audio_raw.shape[0]) + 1
            audio_raw = np.tile(audio_raw, n)

        # resample
        if rate > self.audRate:
            if nearest_resample:
                # cheap strided decimation instead of a proper filter
                audio_raw = audio_raw[::rate//self.audRate]
            else:
                # NOTE(review): positional sr arguments are removed in
                # librosa >= 0.10 (orig_sr=/target_sr= required) — verify
                # against the pinned librosa version.
                audio_raw = librosa.resample(audio_raw, rate, self.audRate)

        # crop N seconds around the center timestamp
        len_raw = audio_raw.shape[0]
        center = int(center_timestamp * self.audRate)
        start = max(0, center - self.audLen // 2)
        end = min(len_raw, center + self.audLen // 2)

        audio[self.audLen//2-(center-start): self.audLen//2+(end-center)] = \
            audio_raw[start:end]

        # randomize volume (train only), then clip to [-1, 1]
        if self.split == 'train':
            scale = random.random() + 0.5  # 0.5-1.5
            audio *= scale
        audio[audio > 1.] = 1.
        audio[audio < -1.] = -1.

        return audio

    def _mix_n_and_stft(self, audios):
        """Average-mix N waveforms (scaling each by 1/N, in place) and return
        (mix magnitude, per-source magnitudes, mix phase) as tensors. The
        entries of `audios` are also converted to tensors in place."""
        N = len(audios)
        mags = [None for n in range(N)]

        # mix
        for n in range(N):
            audios[n] /= N
        audio_mix = np.asarray(audios).sum(axis=0)

        # STFT of the mixture and of every (already scaled) source
        amp_mix, phase_mix = self._stft(audio_mix)
        for n in range(N):
            ampN, _ = self._stft(audios[n])
            mags[n] = ampN.unsqueeze(0)

        # to tensor
        for n in range(N):
            audios[n] = torch.from_numpy(audios[n])

        return amp_mix.unsqueeze(0), mags, phase_mix.unsqueeze(0)

    def dummy_mix_data(self, N):
        """Return all-zero tensors shaped like a real N-source sample; used as
        a fallback when frame/audio loading fails."""
        frames = [None for n in range(N)]
        audios = [None for n in range(N)]
        mags = [None for n in range(N)]

        amp_mix = torch.zeros(1, self.HS, self.WS)
        phase_mix = torch.zeros(1, self.HS, self.WS)

        for n in range(N):
            frames[n] = torch.zeros(
                3, self.num_frames, self.imgSize, self.imgSize)
            audios[n] = torch.zeros(self.audLen)
            mags[n] = torch.zeros(1, self.HS, self.WS)

        return amp_mix, mags, frames, audios, phase_mix

    def check_video_frames_exists(self, frame_root, index):
        """Return the first index >= `index` whose frame directory is non-empty.

        NOTE(review): recurses past the end of list_sample and will raise an
        IndexError if every remaining directory is empty — confirm whether
        wrapping around is the intended behavior.
        """
        frame_path = os.path.join(frame_root, self.list_sample[index])
        list_of_files = os.listdir(frame_path)
        if len(list_of_files) > 0:
            return index
        else:
            return self.check_video_frames_exists(frame_root, index+1)
image_label=None, 15 | image_mask=None, 16 | pose_feat=None, 17 | pose_loc=None, 18 | pose_label=None, 19 | pose_mask=None, 20 | audio_feat = None, 21 | audio_label = None, 22 | audio_target_label = None): 23 | self.image_feat = image_feat 24 | self.image_label = image_label 25 | self.image_mask = image_mask 26 | self.pose_feat = pose_feat 27 | self.pose_loc = pose_loc 28 | self.pose_label = pose_label 29 | self.pose_mask = pose_mask 30 | self.audio_feat = audio_feat 31 | self.audio_label = audio_label 32 | self.audio_target_label = audio_target_label 33 | 34 | class MUSICMixMultimodalDataset(BaseDataset): 35 | def __init__(self, list_sample, opt, **kwargs): 36 | super(MUSICMixMultimodalDataset, self).__init__( 37 | list_sample, opt, **kwargs) 38 | self.fps = opt.frameRate 39 | self.num_mix = opt.num_mix 40 | self.num_seq = 2 41 | self.root_path = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_bert_features/music_dataset" 42 | self.frame_root = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_all_frames" 43 | self.audio_root = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_bert_features/audio_feat" 44 | self.pose_root = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_bert_features/pose_feat_split_frame" 45 | self.pose_bbox_path = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_bert_features/alphapose_bbox_json" 46 | 47 | #audio classification 48 | gt_lable_dict = {'bagpipe': 1, 'clarinet': 2, 'flute': 3, 'drum': 4, 'acoustic_guitar':5,'ukulele': 6, 'accordion': 7, 'bassoon': 8, 'guzheng': 9, 'xylophone': 10, 'erhu': 11, 'tuba':12, 'congas': 13, 'saxophone': 14, 'cello': 15, 'violin': 16, 'electric_bass': 17, 'piano':18,'banjo': 19, 'trumpet': 20, 'pipa': 21} 49 | solo_file_path = os.path.join(self.root_path, "MUSIC21_solo_videos.json") 50 | duet_file_path = os.path.join(self.root_path,"MUSIC_duet_videos.json") 51 | 
dataDict_solo = None 52 | dataDict_duet = None 53 | 54 | with open(solo_file_path,'r') as load_f: 55 | dataDict_solo = json.load(load_f) 56 | with open(duet_file_path,'r') as load_f: 57 | dataDict_duet = json.load(load_f) 58 | 59 | self.all_vid_label = {} 60 | 61 | for key, value in dataDict_solo["videos"].items(): 62 | for item in value: 63 | self.all_vid_label[item] = [gt_lable_dict[key]] 64 | 65 | for key, value in dataDict_duet["videos"].items(): 66 | for item in value: 67 | self.all_vid_label[item] = [gt_lable_dict[key.split(' ')[0]], gt_lable_dict[key.split(' ')[1]]] 68 | 69 | 70 | def random_region(self,image_feat, num_boxes): 71 | output_label = np.zeros([self.num_frames, num_boxes]) 72 | for w in range(self.num_frames): 73 | for i in range(num_boxes): 74 | prob = random.random() 75 | # mask token with 15% probability 76 | if prob < 0.15 : 77 | prob /= 0.15 78 | # 80% randomly change token to mask token 79 | if prob < 0.9: 80 | image_feat[w,i] = 0 81 | # -> rest 10% randomly keep current token 82 | output_label[w,i] = 1 83 | else: 84 | output_label[w,i] = -1 85 | return image_feat, output_label 86 | 87 | def convert_example_to_features(self, image_feat, pose_feat, pose_loc, num_seq, audio_feat, label, target_label, num_frames): 88 | N = self.num_mix 89 | num_boxes = num_seq 90 | image_mask = [] 91 | pose_mask = [] 92 | image_label = [[] for n in range(N)] 93 | pose_label = [[] for n in range(N)] 94 | for n in range(N): 95 | image_feat[n], image_label[n] = self.random_region(image_feat[n], num_boxes) 96 | pose_feat[n], pose_label[n] = self.random_region(pose_feat[n], num_boxes) 97 | image_mask_tmp = [[1] * (num_boxes)] * self.num_frames 98 | pose_mask_tmp = [[1] * (num_boxes)] * self.num_frames 99 | 100 | #Zero-pad up to the visual sequence length. 
101 | while len(image_mask_tmp) < self.num_frames: 102 | image_mask_tmp.append(0) 103 | image_label[n].append(-1) 104 | pose_mask_tmp.append(0) 105 | pose_label[n].append(-1) 106 | 107 | assert len(image_mask_tmp) == self.num_frames 108 | assert len(pose_mask_tmp) == self.num_frames 109 | image_mask.append(image_mask_tmp) 110 | pose_mask.append(pose_mask_tmp) 111 | 112 | features = InputFeatures( 113 | image_feat=image_feat, 114 | image_label=np.array(image_label), 115 | image_mask = np.array(image_mask), 116 | pose_feat = pose_feat, 117 | pose_loc = pose_loc, 118 | pose_label = np.array(pose_label), 119 | pose_mask = np.array(pose_mask), 120 | audio_feat = audio_feat, 121 | audio_label = label, 122 | audio_target_label = target_label) 123 | 124 | return features 125 | 126 | def __getitem__(self, index): 127 | N = self.num_mix 128 | frames = [None for n in range(N)] 129 | pose_features = [None for n in range(N)] 130 | audios = [None for n in range(N)] 131 | infos = [[] for n in range(N)] 132 | path_frames = [[] for n in range(N)] 133 | path_audios = ['' for n in range(N)] 134 | center_frames = [0 for n in range(N)] 135 | pose_location = np.zeros((N, self.num_frames, self.num_seq, 5), dtype=np.float32) 136 | final_pose_location = [None for n in range(N)] 137 | original_img_size = [None for n in range(N)] 138 | target_label = [[] for n in range(N)] 139 | label = torch.zeros(N, 21) 140 | 141 | # the first video 142 | index = self.check_video_frames_exists(self.frame_root, index) 143 | infos[0] = self.list_sample[index] 144 | 145 | # sample other videos 146 | if not self.split == 'train': 147 | random.seed(index) 148 | for n in range(1, N): 149 | indexN = random.randint(0, len(self.list_sample)-1) 150 | indexN = self.check_video_frames_exists(self.frame_root, indexN) 151 | infos[n] = self.list_sample[indexN] 152 | 153 | # select frames 154 | idx_margin = max( 155 | int(self.fps * 8), (self.num_frames // 2) * self.stride_frames) 156 | 157 | for n, infoN in 
enumerate(infos): 158 | #audio label for classification 159 | target = self.all_vid_label[infoN] 160 | for j in range(len(target)): 161 | label[n][target[j]-1]=1 162 | #process for batch compatible 163 | if len(target) < self.num_seq: 164 | target.append(-1) 165 | target_label[n] = target 166 | 167 | #load pose feat (generated by GCN) 168 | pose_feat_path = os.path.join(self.pose_root, infoN) 169 | pose_json = os.path.join(self.pose_bbox_path, infoN+".json") 170 | with open(pose_json, "rb") as f: 171 | pose_data = json.load(f) 172 | 173 | path_frameN = os.path.join(self.frame_root,infoN) 174 | path_audioN = os.path.join(self.audio_root,infoN+".wav") 175 | count_framesN = len(os.listdir(pose_feat_path)) 176 | 177 | if self.split == 'train': 178 | # random, not to sample start and end n-frames 179 | if idx_margin+1 < int(count_framesN)-idx_margin: 180 | center_frameN = random.randint( 181 | idx_margin+1, int(count_framesN)-idx_margin) 182 | else: 183 | center_frameN = int(count_framesN) // 2 184 | else: 185 | center_frameN = int(count_framesN) // 2 186 | center_frames[n] = center_frameN 187 | 188 | vid_pose_feat = torch.zeros(self.num_frames, 2, 256, 68) 189 | # absolute frame/audio paths 190 | for i in range(self.num_frames): 191 | idx_offset = i 192 | path_frames[n].append( 193 | os.path.join( 194 | path_frameN, 195 | '{:06d}.jpg'.format(center_frameN + idx_offset + 1))) 196 | 197 | #load pose features 198 | try: 199 | vid_pose_feat[i] = torch.from_numpy(np.load(os.path.join(pose_feat_path, '{:06d}.npy'.format(center_frameN + idx_offset)), allow_pickle=True)) 200 | except Exception as e: 201 | print("error in "+infoN) 202 | 203 | 204 | #load pose location (pose bbox) 205 | if '{:06d}.jpg'.format(center_frameN + idx_offset) in pose_data: 206 | bbox_list = pose_data['{:06d}.jpg'.format(center_frameN + idx_offset)] 207 | bbox_list.sort(key = lambda x: x[4], reverse=True) 208 | for s, box in enumerate(bbox_list): 209 | if s < self.num_seq: 210 | pose_location[n, i, 
s,:4] = box[:4] 211 | pose_features[n] = vid_pose_feat 212 | path_audios[n] = path_audioN 213 | 214 | # load frames and audios, STFT 215 | try: 216 | for n, infoN in enumerate(infos): 217 | frames[n], original_img_size[n] = self._load_frames(path_frames[n]) 218 | # jitter audio 219 | # center_timeN = (center_frames[n] - random.random()) / self.fps 220 | center_timeN = (center_frames[n] - 0.5) / self.fps 221 | audios[n] = self._load_audio(path_audios[n], center_timeN) 222 | mag_mix, mags, phase_mix = self._mix_n_and_stft(audios) 223 | 224 | except Exception as e: 225 | print('Failed loading frame/audio: {}'.format(e)) 226 | # create dummy data 227 | mag_mix, mags, frames, audios, phase_mix = \ 228 | self.dummy_mix_data(N) 229 | 230 | self.image_size = 224 231 | for n, infoN in enumerate(infos): 232 | for i in range(self.num_frames): 233 | #rescale bbox which we get from alphapose 234 | if not original_img_size[n] is None: 235 | x_ = original_img_size[n][i][0] 236 | y_ = original_img_size[n][i][1] 237 | x_scale = self.image_size/x_ 238 | y_scale = self.image_size/y_ 239 | else: 240 | x_scale = 1 241 | y_scale = 1 242 | for s in range(self.num_seq): 243 | pose_location[n,i,s,0] = pose_location[n,i,s,0] * x_scale 244 | pose_location[n,i,s,1] = pose_location[n,i,s,1] * y_scale 245 | pose_location[n,i,s,2] = pose_location[n,i,s,2] * x_scale 246 | pose_location[n,i,s,3] = pose_location[n,i,s,3] * y_scale 247 | pose_location[n,i,s,2] = pose_location[n,i,s,2] + pose_location[n,i,s,0] 248 | pose_location[n,i,s,3] = pose_location[n,i,s,3] + pose_location[n,i,s,1] 249 | 250 | #process pose location as vilbert 251 | pose_location[n,i,:,4] = (pose_location[n,i,:,3] - pose_location[n,i,:,1]) * (pose_location[n,i,:,2] - pose_location[n,i,:,0]) / (float(self.image_size) * float(self.image_size)) 252 | pose_location[n,i,:,0] = pose_location[n,i,:,0] / float(self.image_size) 253 | pose_location[n,i,:,1] = pose_location[n,i,:,1] / float(self.image_size) 254 | pose_location[n,i,:,2] = 
pose_location[n,i,:,2] / float(self.image_size) 255 | pose_location[n,i,:,3] = pose_location[n,i,:,3] / float(self.image_size) 256 | 257 | final_pose_location[n] = pose_location[n] 258 | 259 | ###start of bert dataloader 260 | cur_features = self.convert_example_to_features(frames, pose_features, final_pose_location, self.num_seq, mag_mix, label, target_label, self.num_frames) 261 | cur_tensors = (cur_features.image_feat, 262 | cur_features.image_label, 263 | cur_features.image_mask, 264 | cur_features.pose_feat, 265 | cur_features.pose_loc, 266 | cur_features.pose_label, 267 | cur_features.pose_mask, 268 | cur_features.audio_feat, 269 | cur_features.audio_label, 270 | cur_features.audio_target_label) 271 | image_feat, image_label,image_mask, pose_feat, pose_loc,pose_label, pose_mask, audio_feat, audio_label, audio_target_label = cur_tensors 272 | 273 | image_feat_final = [None for n in range(N)] #torch.zeros((N,self.num_frames+1, 3, self.image_size,self.image_size)) 274 | image_mask_final = [None for n in range(N)] #torch.zeros((N,self.num_frames+1, self.num_seq)) 275 | pose_feat_final = [None for n in range(N)] #torch.zeros((N, self.num_frames+1, self.num_seq, 256, 68)) 276 | pose_mask_final = [None for n in range(N)] #torch.zeros((N,self.num_frames+1, self.num_seq)) 277 | pose_loc_final = [None for n in range(N)] #torch.zeros((N,self.num_frames+1, self.num_seq,5)) 278 | 279 | for n in range(N): 280 | #batch_size = image_feat[n].shape[0] 281 | image_mask_tmp = image_mask[n] 282 | image_feat_tmp = image_feat[n] 283 | if len(image_mask_tmp.shape) < 2: 284 | image_mask_tmp = image_mask_tmp.reshape(1,image_mask_tmp.shape[0]) 285 | g_image_feat = np.sum(image_feat_tmp.numpy(), axis=1) / np.sum(image_mask_tmp) #, axis=1, keepdims=True) 286 | image_feat_tmp = np.concatenate([np.expand_dims(g_image_feat, axis=1), image_feat_tmp.numpy()], axis=1) 287 | image_feat_tmp = np.array(image_feat_tmp, dtype=np.float32) 288 | g_image_mask = np.repeat(np.array([[1, 1]]), 1, axis=0) 
289 | image_mask_tmp = np.concatenate([g_image_mask, image_mask_tmp], axis=0) 290 | image_feat_final[n]= torch.from_numpy(image_feat_tmp) 291 | image_mask_final[n] = torch.from_numpy(image_mask_tmp) 292 | 293 | pose_mask_tmp = pose_mask[n] 294 | pose_feat_tmp = pose_feat[n] 295 | pose_loc_tmp = pose_loc[n] 296 | if len(pose_mask_tmp.shape) < 2: 297 | pose_mask_tmp = pose_mask_tmp.reshape(1,pose_mask_tmp.shape[0]) 298 | g_pose_feat = np.sum(pose_feat_tmp.numpy(), axis=0) / np.sum(pose_mask_tmp) #, axis=1, keepdims=True) 299 | pose_feat_tmp = np.concatenate([np.expand_dims(g_pose_feat, axis=0), pose_feat_tmp.numpy()], axis=0) 300 | pose_feat_tmp = np.array(pose_feat_tmp, dtype=np.float32) 301 | g_pose_loc = np.repeat(np.array([[0,0,1,1,1]], dtype=np.float32), 2, axis=0) 302 | pose_loc_tmp = np.concatenate([np.expand_dims(g_pose_loc, axis=0), pose_loc_tmp], axis=0) 303 | pose_loc_tmp = np.array(pose_loc_tmp, dtype=np.float32) 304 | g_pose_mask = np.repeat(np.array([[1,1]]), 1, axis=0) 305 | pose_mask_tmp = np.concatenate([g_pose_mask, pose_mask_tmp], axis=0) 306 | pose_feat_final[n] = torch.from_numpy(pose_feat_tmp) 307 | pose_mask_final[n] = torch.from_numpy(pose_mask_tmp) 308 | pose_loc_final[n] = torch.from_numpy(pose_loc_tmp) 309 | 310 | ###end of bert dataloader 311 | 312 | #ret_dict = {'mag_mix': mag_mix, 'frames': frames, 'mags': mags, 'pose_feat': pose_features, 'pose_loc': final_pose_location, 'label': label, 'target_label': target_label} 313 | 314 | ret_dict = {'mag_mix': audio_feat, 'frames': image_feat_final, 'image_label': image_label, 'image_mask': image_mask_final, 'mags': mags, 'pose_feat': pose_feat_final, 'pose_loc': pose_loc_final, 'pose_mask': pose_mask_final,'pose_label':pose_label, 'label': label, 'target_label': target_label} 315 | 316 | if self.split != 'train': 317 | ret_dict['audios'] = audios 318 | ret_dict['phase_mix'] = phase_mix 319 | ret_dict['infos'] = infos 320 | 321 | return ret_dict 322 | 
import random
import numbers
import torchvision.transforms.functional as F
from PIL import Image
import torch


class Resize(object):
    """Resize every frame of a clip to ``size``."""

    def __init__(self, size, interpolation=Image.BILINEAR):
        self.size = size
        self.interpolation = interpolation

    def __call__(self, frames):
        """Rescale a list of PIL Images, returning a new list."""
        return [F.resize(frame, self.size, self.interpolation) for frame in frames]


class CenterCrop(object):
    """Crop the center ``size`` region out of every frame of a clip."""

    def __init__(self, size):
        # a bare number means a square crop
        self.size = (int(size), int(size)) if isinstance(size, numbers.Number) else size

    def __call__(self, frames):
        """Center-crop a list of PIL Images, returning a new list."""
        return [F.center_crop(frame, self.size) for frame in frames]


class RandomCrop(object):
    """Crop the same random ``size`` region out of every frame of a clip."""

    def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode='constant'):
        self.size = (int(size), int(size)) if isinstance(size, numbers.Number) else size
        self.padding = padding
        self.pad_if_needed = pad_if_needed
        self.fill = fill
        self.padding_mode = padding_mode

    @staticmethod
    def get_params(frames, output_size):
        """Draw one (i, j, h, w) crop rectangle shared by the whole clip.

        The rectangle is sampled from the first frame's dimensions.
        """
        w, h = frames[0].size
        th, tw = output_size
        if w == tw and h == th:
            return 0, 0, h, w
        # top offset first, then left offset (keeps the RNG call order stable)
        top = random.randint(0, h - th)
        left = random.randint(0, w - tw)
        return top, left, th, tw

    def __call__(self, frames):
        """Pad (optionally) and crop a list of PIL Images with one shared
        random rectangle, returning a new list."""
        i, j, h, w = self.get_params(frames, self.size)

        cropped = []
        for frame in frames:
            if self.padding is not None:
                frame = F.pad(frame, self.padding, self.fill, self.padding_mode)

            # pad the width if needed
            if self.pad_if_needed and frame.size[0] < self.size[1]:
                frame = F.pad(frame, (int((1 + self.size[1] - frame.size[0]) / 2), 0), self.fill, self.padding_mode)
            # pad the height if needed
            if self.pad_if_needed and frame.size[1] < self.size[0]:
                frame = F.pad(frame, (0, int((1 + self.size[0] - frame.size[1]) / 2)), self.fill, self.padding_mode)

            cropped.append(F.crop(frame, i, j, h, w))
        return cropped

    def __repr__(self):
        return self.__class__.__name__ + '(size={0}, padding={1})'.format(self.size, self.padding)


class RandomHorizontalFlip(object):
    """Flip every frame of a clip horizontally with probability ``p``.

    Args:
        p (float): probability of the clip being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, frames):
        """Return the clip flipped as a whole, or unchanged."""
        if random.random() >= self.p:
            return frames
        return [F.hflip(frame) for frame in frames]

    def __repr__(self):
        return self.__class__.__name__ + '(p={})'.format(self.p)


class ToTensor(object):
    """Convert a list of ``PIL Image`` or ``numpy.ndarray`` frames (H x W x C,
    values in [0, 255]) into a list of float tensors (C x H x W, [0.0, 1.0])."""

    def __call__(self, frames):
        """Convert each frame, returning a new list of tensors."""
        return [F.to_tensor(frame) for frame in frames]


class Normalize(object):
    """Channel-wise normalize every frame tensor with ``mean`` and ``std``."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, frames):
        """Normalize each (C, H, W) tensor, returning a new list."""
        return [F.normalize(frame, self.mean, self.std) for frame in frames]


class Stack(object):
    """Stack a list of L frame tensors (C, H, W) into one clip tensor.

    With the default ``dim=1`` the result is shaped (C, L, H, W).
    """

    def __init__(self, dim=1):
        self.dim = dim

    def __call__(self, frames):
        """Return the stacked video tensor."""
        return torch.stack(frames, dim=self.dim)
def activate(x, activation):
    """Apply the activation named by ``activation`` to tensor ``x``.

    Supported names: 'sigmoid', 'softmax' (over dim 1), 'relu', 'tanh',
    'no' (identity). Raises Exception for any other name.
    """
    if activation == 'sigmoid':
        return torch.sigmoid(x)
    elif activation == 'softmax':
        return F.softmax(x, dim=1)
    elif activation == 'relu':
        return F.relu(x)
    elif activation == 'tanh':
        # FIX: F.tanh is deprecated; torch.tanh is the identical replacement.
        return torch.tanh(x)
    elif activation == 'no':
        return x
    else:
        # FIX: corrected "Unkown" typo in the error message.
        raise Exception('Unknown activation!')


class ModelBuilder():
    """Factory for the sound / frame / criterion sub-networks."""

    # custom weights initialization applied module-by-module via .apply()
    def weights_init(self, m):
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            m.weight.data.normal_(0.0, 0.001)
        elif classname.find('BatchNorm') != -1:
            m.weight.data.normal_(1.0, 0.02)
            m.bias.data.fill_(0)
        elif classname.find('Linear') != -1:
            m.weight.data.normal_(0.0, 0.0001)

    def build_sound(self, arch='unet5', fc_dim=64, weights=''):
        """Build the audio U-Net.

        Args:
            arch: one of 'unet5' / 'unet6' / 'unet7' (number of down blocks).
            fc_dim: feature dimension forwarded to the U-Net.
            weights: optional path of a state dict to load.
        """
        # 2D models
        if arch == 'unet5':
            net_sound = Unet(fc_dim=fc_dim, num_downs=5)
        elif arch == 'unet6':
            net_sound = Unet(fc_dim=fc_dim, num_downs=6)
        elif arch == 'unet7':
            net_sound = Unet(fc_dim=fc_dim, num_downs=7)
        else:
            raise Exception('Architecture undefined!')

        net_sound.apply(self.weights_init)
        if len(weights) > 0:
            print('Loading weights for net_sound')
            net_sound.load_state_dict(torch.load(weights))

        return net_sound

    # builder for vision
    def build_frame(self, arch='resnet18', fc_dim=64, pool_type='avgpool',
                    weights='', config=None):
        """Build the visual backbone (ImageNet-pretrained ResNet variants).

        Args:
            arch: 'resnet18fc' / 'resnet18dilated' / 'resnet50fc' / 'resnet101fc'.
            fc_dim: output feature dimension.
            pool_type: pooling used by the head.
            weights: optional path of a state dict to load.
            config: forwarded to the resnet50/101 heads only.
        """
        pretrained = True
        if arch == 'resnet18fc':
            original_resnet = torchvision.models.resnet18(pretrained)
            net = ResnetFC(
                original_resnet, fc_dim=fc_dim, pool_type=pool_type)
        elif arch == 'resnet18dilated':
            original_resnet = torchvision.models.resnet18(pretrained)
            net = ResnetDilated(
                original_resnet, fc_dim=fc_dim, pool_type=pool_type)
        elif arch == 'resnet50fc':
            original_resnet = torchvision.models.resnet50(pretrained)
            net = ResnetFC(
                original_resnet, fc_dim=fc_dim, pool_type=pool_type, config=config)
        elif arch == 'resnet101fc':
            original_resnet = torchvision.models.resnet101(pretrained)
            net = ResnetFC(
                original_resnet, fc_dim=fc_dim, pool_type=pool_type, config=config)
        else:
            raise Exception('Architecture undefined!')

        if len(weights) > 0:
            print('Loading weights for net_frame')
            net.load_state_dict(torch.load(weights))
        return net

    def build_criterion(self, arch):
        """Return the loss module named by ``arch`` ('bce' / 'l1' / 'l2')."""
        if arch == 'bce':
            net = BCELoss()
        elif arch == 'l1':
            net = L1Loss()
        elif arch == 'l2':
            net = L2Loss()
        else:
            raise Exception('Architecture undefined!')
        return net
class Attention_block(nn.Module):
    """Additive attention gate (Attention U-Net style).

    Projects the gating signal ``g`` and the skip features ``x`` into a common
    F_int-channel space, combines them, and produces a single-channel sigmoid
    attention map used to reweight ``x``.

    Args:
        F_g: channels of the gating signal ``g``.
        F_l: channels of the skip-connection features ``x``.
        F_int: channels of the intermediate projection.
    """

    def __init__(self, F_g, F_l, F_int):
        super(Attention_block, self).__init__()
        # 1x1 projections of the gate and skip features
        self.W_g = nn.Sequential(
            nn.Conv2d(F_g, F_int, kernel_size=1, stride=1, padding=0, bias=True),
            nn.BatchNorm2d(F_int)
        )

        self.W_x = nn.Sequential(
            nn.Conv2d(F_l, F_int, kernel_size=1, stride=1, padding=0, bias=True),
            nn.BatchNorm2d(F_int)
        )

        # collapse to a single-channel attention map in (0, 1)
        self.psi = nn.Sequential(
            nn.Conv2d(F_int, 1, kernel_size=1, stride=1, padding=0, bias=True),
            nn.BatchNorm2d(1),
            nn.Sigmoid()
        )

        self.relu = nn.ReLU(inplace=True)

    def forward(self, g, x):
        """Return ``x`` reweighted by the attention map computed from ``g`` and ``x``."""
        g1 = self.W_g(g)
        x1 = self.W_x(x)
        psi = self.relu(g1 + x1)
        psi = self.psi(psi)
        # FIX: F.upsample is deprecated; F.interpolate is the drop-in
        # replacement with identical semantics. The map is resampled to x's
        # spatial size before gating.
        psi = F.interpolate(psi, size=x.size()[2:], mode="bilinear", align_corners=True)

        return x * psi
self.decoder3 = Unet._block((features * 4) * 2, features * 4, name="dec3") 68 | self.upconv2 = nn.ConvTranspose2d( 69 | features * 4, features * 2, kernel_size=2, stride=2 70 | ) 71 | self.Att2 = Attention_block(F_g = 2 * features, F_l = 2 * features, F_int = features) 72 | self.decoder2 = Unet._block((features * 2) * 2, features * 2, name="dec2") 73 | self.upconv1 = nn.ConvTranspose2d( 74 | features * 2, features, kernel_size=2, stride=2 75 | ) 76 | self.Att1 = Attention_block(F_g = features, F_l = features, F_int = int(features/2)) 77 | 78 | self.decoder1 = Unet._block(features * 2, features, name="dec1") 79 | 80 | self.conv = nn.Conv2d( 81 | in_channels=features, out_channels=out_channels, kernel_size=1 82 | ) 83 | self.sigmoid = nn.Sigmoid() 84 | 85 | self.mlp = nn.Sequential( 86 | nn.Linear(1536, 1536), 87 | nn.ReLU(), 88 | nn.Linear(1536, 1536)) 89 | 90 | 91 | def forward(self, x, feats): 92 | enc1 = self.encoder1(x) 93 | enc2 = self.encoder2(self.pool1(enc1)) 94 | enc3 = self.encoder3(self.pool2(enc2)) 95 | enc4 = self.encoder4(self.pool3(enc3)) 96 | 97 | bottleneck = self.bottleneck(self.pool4(enc4)) 98 | 99 | feat_all = feats.unsqueeze(2).unsqueeze(2).repeat(1,1,bottleneck.shape[2],bottleneck.shape[3]) 100 | combine_feat = F.softmax(torch.matmul(bottleneck,feat_all), dim=1) * feat_all 101 | combine_feat = combine_feat + bottleneck 102 | bottleneck = self.mlp(combine_feat.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() + combine_feat 103 | #bottleneck = torch.cat((bottleneck, feat_all), dim=1) 104 | 105 | dec4 = self.upconv4(bottleneck) 106 | enc4 = self.Att4(g = dec4, x = enc4) 107 | dec4 = torch.cat((dec4, enc4), dim=1) 108 | dec4 = self.decoder4(dec4) 109 | 110 | dec3 = self.upconv3(dec4) 111 | enc3 = self.Att3(g = dec3, x = enc3) 112 | dec3 = torch.cat((dec3, enc3), dim=1) 113 | dec3 = self.decoder3(dec3) 114 | 115 | dec2 = self.upconv2(dec3) 116 | enc2 = self.Att2(g = dec2, x = enc2) 117 | dec2 = torch.cat((dec2, enc2), dim=1) 118 | dec2 = 
self.decoder2(dec2) 119 | 120 | dec1 = self.upconv1(dec2) 121 | enc1 = self.Att1(g = dec1, x = enc1) 122 | dec1 = torch.cat((dec1, enc1), dim=1) 123 | dec1 = self.decoder1(dec1) 124 | return self.conv(dec1) 125 | 126 | def weights_init(self, m): 127 | classname = m.__class__.__name__ 128 | if classname.find('Conv') != -1: 129 | m.weight.data.normal_(0.0, 0.001) 130 | elif classname.find('BatchNorm') != -1: 131 | m.weight.data.normal_(1.0, 0.02) 132 | m.bias.data.fill_(0) 133 | elif classname.find('Linear') != -1: 134 | m.weight.data.normal_(0.0, 0.0001) 135 | 136 | @staticmethod 137 | def _block(in_channels, features, name): 138 | return nn.Sequential( 139 | OrderedDict( 140 | [ 141 | ( 142 | name + "conv1", 143 | nn.Conv2d( 144 | in_channels=in_channels, 145 | out_channels=features, 146 | kernel_size=3, 147 | padding=1, 148 | bias=False, 149 | ), 150 | ), 151 | (name + "norm1", nn.BatchNorm2d(num_features=features)), 152 | (name + "relu1", nn.ReLU(inplace=True)), 153 | ( 154 | name + "conv2", 155 | nn.Conv2d( 156 | in_channels=features, 157 | out_channels=features, 158 | kernel_size=3, 159 | padding=1, 160 | bias=False, 161 | ), 162 | ), 163 | (name + "norm2", nn.BatchNorm2d(num_features=features)), 164 | (name + "relu2", nn.ReLU(inplace=True)), 165 | ] 166 | ) 167 | ) 168 | 169 | 170 | # Defines the submodule with skip connection. 
import torch
import torch.nn as nn
import torch.nn.functional as F


# Defines the submodule with skip connection.
# X -------------------identity---------------------- X
#   |-- downsampling -- |submodule| -- upsampling --|
class UnetBlock(nn.Module):
    """One recursive U-Net shell: downsample, run ``submodule``, upsample,
    and (unless outermost/noskip) concatenate the input as a skip connection."""

    def __init__(self, outer_nc, inner_input_nc, input_nc=None,
                 submodule=None, outermost=False, innermost=False,
                 use_dropout=False, inner_output_nc=None, noskip=False):
        super(UnetBlock, self).__init__()
        self.outermost = outermost
        self.noskip = noskip
        use_bias = False
        if input_nc is None:
            input_nc = outer_nc
        if innermost:
            # Innermost block has no submodule, so its up-path consumes
            # exactly what its down-path produced.
            inner_output_nc = inner_input_nc
        elif inner_output_nc is None:
            # Submodule concatenates a skip, doubling the channel count.
            inner_output_nc = 2 * inner_input_nc

        downrelu = nn.LeakyReLU(0.2, True)
        downnorm = nn.BatchNorm2d(inner_input_nc)
        uprelu = nn.ReLU(True)
        upnorm = nn.BatchNorm2d(outer_nc)
        upsample = nn.Upsample(
            scale_factor=2, mode='bilinear', align_corners=True)

        if outermost:
            downconv = nn.Conv2d(
                input_nc, inner_input_nc, kernel_size=4,
                stride=2, padding=1, bias=use_bias)
            upconv = nn.Conv2d(
                inner_output_nc, outer_nc, kernel_size=3, padding=1)
            down = [downconv]
            up = [uprelu, upsample, upconv]
            model = down + [submodule] + up
        elif innermost:
            downconv = nn.Conv2d(
                input_nc, inner_input_nc, kernel_size=4,
                stride=2, padding=1, bias=use_bias)
            upconv = nn.Conv2d(
                inner_output_nc, outer_nc, kernel_size=3,
                padding=1, bias=use_bias)
            down = [downrelu, downconv]
            up = [uprelu, upsample, upconv, upnorm]
            model = down + up
        else:
            downconv = nn.Conv2d(
                input_nc, inner_input_nc, kernel_size=4,
                stride=2, padding=1, bias=use_bias)
            upconv = nn.Conv2d(
                inner_output_nc, outer_nc, kernel_size=3,
                padding=1, bias=use_bias)
            down = [downrelu, downconv, downnorm]
            up = [uprelu, upsample, upconv, upnorm]
            if use_dropout:
                model = down + [submodule] + up + [nn.Dropout(0.5)]
            else:
                model = down + [submodule] + up

        self.model = nn.Sequential(*model)

    def forward(self, x):
        if self.outermost or self.noskip:
            return self.model(x)
        else:
            # Skip connection: concatenate input with the processed output.
            return torch.cat([x, self.model(x)], 1)


class BaseLoss(nn.Module):
    """Common driver for the element-wise losses below.

    Accepts either a single Tensor or a list of Tensors for ``preds``; for a
    list, the per-element losses are averaged. Subclasses implement
    ``_forward(pred, target, weight)``.
    """

    def __init__(self):
        super(BaseLoss, self).__init__()

    def forward(self, preds, targets, weight=None):
        if isinstance(preds, list):
            N = len(preds)
            if weight is None:
                weight = preds[0].new_ones(1)
            errs = [self._forward(preds[n], targets[n], weight)
                    for n in range(N)]
            err = torch.mean(torch.stack(errs))
        elif isinstance(preds, torch.Tensor):
            if weight is None:
                weight = preds.new_ones(1)
            err = self._forward(preds, targets, weight)
        else:
            # FIX: the original fell through here with `err` unbound,
            # raising an opaque UnboundLocalError. Fail explicitly instead.
            raise TypeError(
                "preds must be a torch.Tensor or a list of them, got {}".format(
                    type(preds).__name__))
        return err


class L1Loss(BaseLoss):
    """Weighted mean absolute error."""

    def __init__(self):
        super(L1Loss, self).__init__()

    def _forward(self, pred, target, weight):
        return torch.mean(weight * torch.abs(pred - target))


class L2Loss(BaseLoss):
    """Weighted mean squared error."""

    def __init__(self):
        super(L2Loss, self).__init__()

    def _forward(self, pred, target, weight):
        return torch.mean(weight * torch.pow(pred - target, 2))


class BCELoss(BaseLoss):
    """Binary cross-entropy over probabilities (pred must lie in [0, 1])."""

    def __init__(self):
        super(BCELoss, self).__init__()

    def _forward(self, pred, target, weight):
        return F.binary_cross_entropy(pred, target, weight=weight)
def warpgrid(bs, HO, WO, warp=True):
    """Build a (bs, HO, WO, 2) sampling grid for ``F.grid_sample``.

    The x axis is linear in [-1, 1]; the y axis applies a log-frequency
    warp (``warp=True``) or its inverse (``warp=False``).
    """
    # meshgrid
    x = np.linspace(-1, 1, WO)
    y = np.linspace(-1, 1, HO)
    xv, yv = np.meshgrid(x, y)
    grid = np.zeros((bs, HO, WO, 2))
    grid_x = xv
    if warp:
        grid_y = (np.power(21, (yv + 1) / 2) - 11) / 10
    else:
        grid_y = np.log(yv * 10 + 11) / np.log(21) * 2 - 1
    grid[:, :, :, 0] = grid_x
    grid[:, :, :, 1] = grid_y
    grid = grid.astype(np.float32)
    return grid


def makedirs(path, remove=False):
    """Create ``path``; if it exists, either keep it or wipe and recreate
    it when ``remove=True``."""
    if os.path.isdir(path):
        if remove:
            shutil.rmtree(path)
            print('removed existing directory...')
        else:
            return
    os.makedirs(path)


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.initialized = False
        self.val = None    # most recent value
        self.avg = None    # weighted running average
        self.sum = None    # weighted sum of values
        self.count = None  # sum of weights

    def initialize(self, val, weight):
        self.val = val
        self.avg = val
        self.sum = val * weight
        self.count = weight
        self.initialized = True

    def update(self, val, weight=1):
        val = np.asarray(val)
        if not self.initialized:
            self.initialize(val, weight)
        else:
            self.add(val, weight)

    def add(self, val, weight):
        self.val = val
        self.sum += val * weight
        self.count += weight
        self.avg = self.sum / self.count

    def value(self):
        # Returns 0. before the first update.
        if self.val is None:
            return 0.
        else:
            return self.val.tolist()

    def average(self):
        if self.avg is None:
            return 0.
        else:
            return self.avg.tolist()


def recover_rgb(img):
    """Undo ImageNet normalization on a CHW torch tensor (in place) and
    return an HWC uint8 numpy image."""
    for t, m, s in zip(img,
                       [0.485, 0.456, 0.406],
                       [0.229, 0.224, 0.225]):
        t.mul_(s).add_(m)
    img = (img.numpy().transpose((1, 2, 0)) * 255).astype(np.uint8)
    return img


def magnitude2heatmap(mag, log=True, scale=200.):
    """Map a magnitude array to an RGB jet heatmap (uint8)."""
    if log:
        mag = np.log10(mag + 1.)
    # FIX: use an out-of-place multiply so the caller's array is not
    # silently mutated when log=False (the old `mag *= scale` clobbered it).
    mag = mag * scale
    mag[mag > 255] = 255
    mag = mag.astype(np.uint8)
    mag_color = cv2.applyColorMap(mag, cv2.COLORMAP_JET)
    mag_color = mag_color[:, :, ::-1]  # BGR -> RGB
    return mag_color


def istft_reconstruction(mag, phase, hop_length=256):
    """Inverse STFT from magnitude + phase; waveform clipped to [-1, 1]."""
    # FIX: np.complex was removed in NumPy >= 1.24; np.complex128 is the
    # type it aliased.
    spec = mag.astype(np.complex128) * np.exp(1j * phase)
    wav = librosa.istft(spec, hop_length=hop_length)
    return np.clip(wav, -1., 1.)


class VideoWriter:
    """ Combine numpy frames into video using ffmpeg

    Arguments:
        filename: name of the output video
        fps: frame per second
        shape: shape of video frame

    Properties:
        add_frame(frame):
            add a frame to the video
        add_frames(frames):
            add multiple frames to the video
        release():
            release writing pipe

    """

    def __init__(self, filename, fps, shape):
        self.file = filename
        self.fps = fps
        self.shape = shape

        # video codec
        ext = filename.split('.')[-1]
        if ext == "mp4":
            self.vcodec = "h264"
        else:
            # FIX: corrected "supoorted" typo in the error message.
            raise RuntimeError("Video codec not supported.")

        # video writing pipe
        cmd = [
            "ffmpeg",
            "-y",                                       # overwrite existing file
            "-f", "rawvideo",                           # file format
            "-s", "{}x{}".format(shape[1], shape[0]),   # size of one frame
            "-pix_fmt", "rgb24",                        # 3 channels
            "-r", str(self.fps),                        # frames per second
            "-i", "-",                                  # input comes from a pipe
            "-an",                                      # not to expect any audio
            "-vcodec", self.vcodec,                     # video codec
            "-pix_fmt", "yuv420p",                      # output video in yuv420p
            self.file]

        self.pipe = sp.Popen(cmd, stdin=sp.PIPE, stderr=sp.PIPE, bufsize=10**9)

    def release(self):
        self.pipe.stdin.close()

    def add_frame(self, frame):
        assert len(frame.shape) == 3
        assert frame.shape[0] == self.shape[0]
        assert frame.shape[1] == self.shape[1]
        try:
            # FIX: tostring() is a deprecated alias of tobytes().
            self.pipe.stdin.write(frame.tobytes())
        except Exception:
            # Narrowed from a bare `except:`; surface ffmpeg's stderr.
            _, ffmpeg_error = self.pipe.communicate()
            print(ffmpeg_error)

    def add_frames(self, frames):
        for frame in frames:
            self.add_frame(frame)


def kill_proc(proc):
    """Timer callback: hard-kill a subprocess that ran over its budget."""
    proc.kill()
    print('Process running overtime! Killed.')


def run_proc_timeout(proc, timeout_sec):
    """Wait for ``proc``; kill it if it exceeds ``timeout_sec`` seconds."""
    timer = Timer(timeout_sec, kill_proc, [proc])
    try:
        timer.start()
        proc.communicate()
    finally:
        timer.cancel()


def combine_video_audio(src_video, src_audio, dst_video, verbose=False):
    """Mux a video stream and an audio file into ``dst_video`` via ffmpeg,
    best-effort (errors are printed, not raised)."""
    try:
        cmd = ["ffmpeg", "-y",
               "-loglevel", "quiet",
               "-i", src_video,
               "-i", src_audio,
               "-c:v", "copy",
               "-c:a", "aac",
               "-strict", "experimental",
               dst_video]
        proc = sp.Popen(cmd)
        run_proc_timeout(proc, 10.)

        if verbose:
            print('Processed:{}'.format(dst_video))
    except Exception as e:
        print('Error:[{}] {}'.format(dst_video, e))
# save video to the disk using ffmpeg
def save_video(path, tensor, fps=25):
    """Write a (L, H, W, C) uint8 numpy array to ``path`` frame by frame."""
    assert tensor.ndim == 4, 'video should be in 4D numpy array'
    L, H, W, C = tensor.shape
    writer = VideoWriter(
        path,
        fps=fps,
        shape=[H, W])
    for t in range(L):
        writer.add_frame(tensor[t])
    writer.release()


def save_audio(path, audio_numpy, sr):
    """Write a waveform to disk at sample rate ``sr``.

    NOTE(review): librosa.output.write_wav was removed in librosa >= 0.8;
    this call only works on older librosa (soundfile.write is the modern
    replacement) — confirm the pinned librosa version.
    """
    librosa.output.write_wav(path, audio_numpy, sr)


import copy
import json
import sys


class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`.

    Can be built either from an int vocabulary size (plus keyword
    overrides) or from the path of a JSON config file whose keys are
    copied verbatim onto the instance.
    """

    def __init__(
        self,
        vocab_size_or_config_json_file,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        v_feature_size=1024,
        v_target_size=21,
        v_hidden_size=768,
        v_num_hidden_layers=3,
        v_num_attention_heads=12,
        v_intermediate_size=3072,
        p_feature_size=1024,
        p_target_size=21,
        p_hidden_size=768,
        p_num_hidden_layers=3,
        p_num_attention_heads=12,
        p_intermediate_size=3072,
        bi_hidden_size=1024,
        bi_num_attention_heads=16,
        v_attention_probs_dropout_prob=0.1,
        v_hidden_act="gelu",
        v_hidden_dropout_prob=0.1,
        v_initializer_range=0.2,
        v_biattention_id=[0, 1],
        p_biattention_id=[10, 11],
        a_biattention_id=[16, 17],
        predict_feature=False,
        fast_mode=False,
        fixed_v_layer=0,
        fixed_p_layer=0,
        fixed_a_layer=0,
        in_batch_pairs=False,
        fusion_method="mul",
        intra_gate=False,
        with_coattention=True
    ):
        # Co-attention layer ids must pair up and fit inside their stacks.
        assert len(v_biattention_id) == len(p_biattention_id)
        assert max(v_biattention_id) < v_num_hidden_layers
        # NOTE(review): checks against num_hidden_layers rather than
        # p_num_hidden_layers — looks intentional but worth confirming.
        assert max(p_biattention_id) < num_hidden_layers

        # FIX: dropped the dead Python-2 `unicode` branch; on Python 3
        # the original condition reduced to exactly this isinstance check.
        if isinstance(vocab_size_or_config_json_file, str):
            # Load every key of the JSON file directly as an attribute.
            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
            self.v_feature_size = v_feature_size
            self.v_hidden_size = v_hidden_size
            self.v_num_hidden_layers = v_num_hidden_layers
            self.v_num_attention_heads = v_num_attention_heads
            self.v_intermediate_size = v_intermediate_size
            self.v_attention_probs_dropout_prob = v_attention_probs_dropout_prob
            self.v_hidden_act = v_hidden_act
            self.v_hidden_dropout_prob = v_hidden_dropout_prob
            self.v_initializer_range = v_initializer_range
            self.v_biattention_id = v_biattention_id
            self.p_biattention_id = p_biattention_id
            # Pose/audio act & dropout deliberately mirror the visual stream.
            self.p_hidden_act = v_hidden_act
            self.p_hidden_dropout_prob = v_hidden_dropout_prob
            self.a_hidden_act = v_hidden_act
            self.a_hidden_dropout_prob = v_hidden_dropout_prob
            self.v_target_size = v_target_size
            self.bi_hidden_size = bi_hidden_size
            self.bi_num_attention_heads = bi_num_attention_heads
            self.predict_feature = predict_feature
            self.fast_mode = fast_mode
            self.fixed_v_layer = fixed_v_layer
            self.fixed_p_layer = fixed_p_layer
            self.fixed_a_layer = fixed_a_layer

            self.in_batch_pairs = in_batch_pairs
            self.fusion_method = fusion_method
            self.intra_gate = intra_gate
            self.with_coattention = with_coattention
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)"
            )

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        config = BertConfig(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with open(json_file, "r", encoding="utf-8") as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
# utils_music21.py — duplicate of tribert/models/utils.py; the same fixes
# (np.complex128, tobytes, narrowed except, typo) are applied here so the
# two copies stay consistent. cv2 and librosa are third-party imports pulled
# in at this module's top alongside the stdlib imports below.
import os
import shutil

import numpy as np

import subprocess as sp
from threading import Timer


def warpgrid(bs, HO, WO, warp=True):
    """Build a (bs, HO, WO, 2) sampling grid for ``F.grid_sample``; the y
    axis gets a log-frequency warp (``warp=True``) or its inverse."""
    # meshgrid
    x = np.linspace(-1, 1, WO)
    y = np.linspace(-1, 1, HO)
    xv, yv = np.meshgrid(x, y)
    grid = np.zeros((bs, HO, WO, 2))
    grid_x = xv
    if warp:
        grid_y = (np.power(21, (yv + 1) / 2) - 11) / 10
    else:
        grid_y = np.log(yv * 10 + 11) / np.log(21) * 2 - 1
    grid[:, :, :, 0] = grid_x
    grid[:, :, :, 1] = grid_y
    grid = grid.astype(np.float32)
    return grid


def makedirs(path, remove=False):
    """Create ``path``; if it exists, keep it or wipe and recreate it when
    ``remove=True``."""
    if os.path.isdir(path):
        if remove:
            shutil.rmtree(path)
            print('removed existing directory...')
        else:
            return
    os.makedirs(path)


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.initialized = False
        self.val = None    # most recent value
        self.avg = None    # weighted running average
        self.sum = None    # weighted sum of values
        self.count = None  # sum of weights

    def initialize(self, val, weight):
        self.val = val
        self.avg = val
        self.sum = val * weight
        self.count = weight
        self.initialized = True

    def update(self, val, weight=1):
        val = np.asarray(val)
        if not self.initialized:
            self.initialize(val, weight)
        else:
            self.add(val, weight)

    def add(self, val, weight):
        self.val = val
        self.sum += val * weight
        self.count += weight
        self.avg = self.sum / self.count

    def value(self):
        # Returns 0. before the first update.
        if self.val is None:
            return 0.
        else:
            return self.val.tolist()

    def average(self):
        if self.avg is None:
            return 0.
        else:
            return self.avg.tolist()


def recover_rgb(img):
    """Undo ImageNet normalization on a CHW torch tensor (in place) and
    return an HWC uint8 numpy image."""
    for t, m, s in zip(img,
                       [0.485, 0.456, 0.406],
                       [0.229, 0.224, 0.225]):
        t.mul_(s).add_(m)
    img = (img.numpy().transpose((1, 2, 0)) * 255).astype(np.uint8)
    return img


def magnitude2heatmap(mag, log=True, scale=200.):
    """Map a magnitude array to an RGB jet heatmap (uint8)."""
    if log:
        mag = np.log10(mag + 1.)
    # FIX: out-of-place multiply so the caller's array is not mutated
    # when log=False (the old `mag *= scale` clobbered it).
    mag = mag * scale
    mag[mag > 255] = 255
    mag = mag.astype(np.uint8)
    mag_color = cv2.applyColorMap(mag, cv2.COLORMAP_JET)
    mag_color = mag_color[:, :, ::-1]  # BGR -> RGB
    return mag_color


def istft_reconstruction(mag, phase, hop_length=256):
    """Inverse STFT from magnitude + phase; waveform clipped to [-1, 1]."""
    # FIX: np.complex was removed in NumPy >= 1.24; np.complex128 is the
    # type it aliased.
    spec = mag.astype(np.complex128) * np.exp(1j * phase)
    wav = librosa.istft(spec, hop_length=hop_length)
    return np.clip(wav, -1., 1.)


class VideoWriter:
    """ Combine numpy frames into video using ffmpeg

    Arguments:
        filename: name of the output video
        fps: frame per second
        shape: shape of video frame

    Properties:
        add_frame(frame):
            add a frame to the video
        add_frames(frames):
            add multiple frames to the video
        release():
            release writing pipe

    """

    def __init__(self, filename, fps, shape):
        self.file = filename
        self.fps = fps
        self.shape = shape

        # video codec
        ext = filename.split('.')[-1]
        if ext == "mp4":
            self.vcodec = "h264"
        else:
            # FIX: corrected "supoorted" typo in the error message.
            raise RuntimeError("Video codec not supported.")

        # video writing pipe
        cmd = [
            "ffmpeg",
            "-y",                                       # overwrite existing file
            "-f", "rawvideo",                           # file format
            "-s", "{}x{}".format(shape[1], shape[0]),   # size of one frame
            "-pix_fmt", "rgb24",                        # 3 channels
            "-r", str(self.fps),                        # frames per second
            "-i", "-",                                  # input comes from a pipe
            "-an",                                      # not to expect any audio
            "-vcodec", self.vcodec,                     # video codec
            "-pix_fmt", "yuv420p",                      # output video in yuv420p
            self.file]

        self.pipe = sp.Popen(cmd, stdin=sp.PIPE, stderr=sp.PIPE, bufsize=10**9)

    def release(self):
        self.pipe.stdin.close()

    def add_frame(self, frame):
        assert len(frame.shape) == 3
        assert frame.shape[0] == self.shape[0]
        assert frame.shape[1] == self.shape[1]
        try:
            # FIX: tostring() is a deprecated alias of tobytes().
            self.pipe.stdin.write(frame.tobytes())
        except Exception:
            # Narrowed from a bare `except:`; surface ffmpeg's stderr.
            _, ffmpeg_error = self.pipe.communicate()
            print(ffmpeg_error)

    def add_frames(self, frames):
        for frame in frames:
            self.add_frame(frame)


def kill_proc(proc):
    """Timer callback: hard-kill a subprocess that ran over its budget."""
    proc.kill()
    print('Process running overtime! Killed.')


def run_proc_timeout(proc, timeout_sec):
    """Wait for ``proc``; kill it if it exceeds ``timeout_sec`` seconds."""
    timer = Timer(timeout_sec, kill_proc, [proc])
    try:
        timer.start()
        proc.communicate()
    finally:
        timer.cancel()


def combine_video_audio(src_video, src_audio, dst_video, verbose=False):
    """Mux a video stream and an audio file into ``dst_video`` via ffmpeg,
    best-effort (errors are printed, not raised)."""
    try:
        cmd = ["ffmpeg", "-y",
               "-loglevel", "quiet",
               "-i", src_video,
               "-i", src_audio,
               "-c:v", "copy",
               "-c:a", "aac",
               "-strict", "experimental",
               dst_video]
        proc = sp.Popen(cmd)
        run_proc_timeout(proc, 10.)

        if verbose:
            print('Processed:{}'.format(dst_video))
    except Exception as e:
        print('Error:[{}] {}'.format(dst_video, e))


# save video to the disk using ffmpeg
def save_video(path, tensor, fps=25):
    """Write a (L, H, W, C) uint8 numpy array to ``path`` frame by frame."""
    assert tensor.ndim == 4, 'video should be in 4D numpy array'
    L, H, W, C = tensor.shape
    writer = VideoWriter(
        path,
        fps=fps,
        shape=[H, W])
    for t in range(L):
        writer.add_frame(tensor[t])
    writer.release()


def save_audio(path, audio_numpy, sr):
    """Write a waveform to disk at sample rate ``sr``.

    NOTE(review): librosa.output.write_wav was removed in librosa >= 0.8 —
    confirm the pinned librosa version (soundfile.write is the replacement).
    """
    librosa.output.write_wav(path, audio_numpy, sr)
-------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtmask1.jpg -------------------------------------------------------------------------------- 
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/mix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/mix.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/mix.wav -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/pred1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/pred2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predamp1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/video2.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/weight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/weight.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/av1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/av1.mp4 -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/av2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/av2.mp4 -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gt2.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/mix.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/mix.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/mix.wav -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/pred1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/pred2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predamp2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/weight.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/weight.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/av1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/av1.mp4 -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/av2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/av2.mp4 -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtamp1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/mix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/mix.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/mix.wav -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/pred1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/pred2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predmask1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/weight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/weight.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/av1.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/av1.mp4 -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/av2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/av2.mp4 -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtamp2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/mix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/mix.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/mix.wav -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/pred1.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/pred2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predmask2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/weight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/weight.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/av1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/av1.mp4 -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/av2.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/av2.mp4 -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtmask1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/mix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/mix.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/mix.wav -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/pred1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/pred2.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/video1.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/weight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/weight.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/av1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/av1.mp4 -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/av2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/av2.mp4 -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gt1.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtmask2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/mix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/mix.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/mix.wav -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/pred1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/pred2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predamp1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/video2.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/weight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/weight.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/av1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/av1.mp4 -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/av2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/av2.mp4 -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gt2.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/mix.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/mix.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/mix.wav -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/pred1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/pred2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predamp2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/weight.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/weight.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/av1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/av1.mp4 -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/av2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/av2.mp4 -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtamp1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/mix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/mix.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/mix.wav -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/pred1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/pred2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predmask1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/weight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/weight.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/av1.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/av1.mp4 -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/av2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/av2.mp4 -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtamp2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/mix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/mix.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/mix.wav -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/pred1.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/pred2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predmask2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/video1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/weight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/weight.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/av1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/av1.mp4 -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/av2.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/av2.mp4 -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gt1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gt1.wav -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gt2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gt2.wav -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtmask1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/mix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/mix.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/mix.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/mix.wav -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/pred1.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/pred1.wav -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/pred2.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/pred2.wav -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predamp1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predamp1.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predamp2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predamp2.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predmask1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predmask1.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predmask2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predmask2.jpg -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/video1.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/video1.mp4 -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/video2.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/video2.mp4 -------------------------------------------------------------------------------- /visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/weight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/weight.jpg -------------------------------------------------------------------------------- /visualization/index.html: -------------------------------------------------------------------------------- 1 |
FilenameInput Mixed AudioVideo 1Predicted Audio 1GroundTruth Audio 1Predicted Mask 1GroundTruth Mask 1Video 2Predicted Audio 2GroundTruth Audio 2Predicted Mask 2GroundTruth Mask 2Loss weighting
audio-PlKCXvBDxaI+audio-0yR5s-CSw4E
audio-2XY77lk_LCQ+audio-1K-0VC9hWIA
audio-0pXFhOg1o2c+audio-bGfyLBoZPM4
audio-Xafuav13p2E+audio-t6B3JugXgTI
audio-gK3ooFKujO0+audio-087sDeYPdjY
audio-P_dHPMofcwM+audio-EXMQITpeeaM
audio-E_ugm84TMvo+audio-PMDSfAZ4-eo
audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c
audio-EVp6jkgYuUc+audio-8YELO9yxs_c
audio-GV2bbRPFhvk+audio-NdzCe7COROw
-------------------------------------------------------------------------------- /viz.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def plot_loss_metrics(path, history): 8 | fig = plt.figure() 9 | plt.plot(history['train']['epoch'], history['train']['err'], 10 | color='b', label='training') 11 | plt.plot(history['val']['epoch'], history['val']['err'], 12 | color='c', label='validation') 13 | plt.legend() 14 | fig.savefig(os.path.join(path, 'loss.png'), dpi=200) 15 | plt.close('all') 16 | 17 | fig = plt.figure() 18 | plt.plot(history['val']['epoch'], history['val']['sdr'], 19 | color='r', label='SDR') 20 | plt.plot(history['val']['epoch'], history['val']['sir'], 21 | color='g', label='SIR') 22 | plt.plot(history['val']['epoch'], history['val']['sar'], 23 | color='b', label='SAR') 24 | plt.legend() 25 | fig.savefig(os.path.join(path, 'metrics.png'), dpi=200) 26 | plt.close('all') 27 | 28 | 29 | class HTMLVisualizer(): 30 | def __init__(self, fn_html): 31 | self.fn_html = fn_html 32 | self.content = '' 33 | self.content += '' 34 | 35 | def add_header(self, elements): 36 | self.content += '' 37 | for element in elements: 38 | self.content += ''.format(element) 39 | self.content += '' 40 | 41 | def add_rows(self, rows): 42 | for row in rows: 43 | self.add_row(row) 44 | 45 | def add_row(self, elements): 46 | self.content += '' 47 | 48 | # a list of cells 49 | for element in elements: 50 | self.content += '' 63 | 64 | self.content += '' 65 | 66 | def write_html(self): 67 | self.content += '
{}
' 51 | 52 | # fill a cell 53 | for key, val in element.items(): 54 | if key == 'text': 55 | self.content += val 56 | elif key == 'image': 57 | self.content += ''.format(val) 58 | elif key == 'audio': 59 | self.content += ''.format(val) 60 | elif key == 'video': 61 | self.content += '
' 68 | with open(self.fn_html, 'w') as f: 69 | f.write(self.content) 70 | --------------------------------------------------------------------------------