├── README.md
├── config
└── multi_modal_bert_base_layer_conect.json
├── data
├── music21_train.csv
└── music21_val.csv
├── requirements.txt
├── retrieval
├── eval_score.py
├── orig_retrieval_networks.py
├── retrieval_datasets.py
├── train_retrieval_networks.py
└── tribert_retrieval_networks.py
├── train_trimodal.py
├── tribert
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-36.pyc
│ ├── audio_net.cpython-36.pyc
│ ├── criterion.cpython-36.pyc
│ ├── tribert.cpython-36.pyc
│ ├── utils.cpython-36.pyc
│ └── vilbert.cpython-36.pyc
├── datasets
│ ├── __pycache__
│ │ ├── base.cpython-36.pyc
│ │ ├── music_multimodal.cpython-36.pyc
│ │ └── video_transforms.cpython-36.pyc
│ ├── base.py
│ ├── music_multimodal.py
│ └── video_transforms.py
├── models
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── audio_net.cpython-36.pyc
│ │ ├── criterion.cpython-36.pyc
│ │ ├── synthesizer_net.cpython-36.pyc
│ │ ├── utils.cpython-36.pyc
│ │ └── vision_net.cpython-36.pyc
│ ├── audio_net.py
│ ├── criterion.py
│ ├── utils.py
│ └── vision_net.py
├── optimization.py
└── tribert.py
├── utils_music21.py
├── visualization
├── audio-0pXFhOg1o2c+audio-bGfyLBoZPM4
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
├── audio-2XY77lk_LCQ+audio-1K-0VC9hWIA
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
├── audio-EVp6jkgYuUc+audio-8YELO9yxs_c
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
├── audio-E_ugm84TMvo+audio-PMDSfAZ4-eo
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
├── audio-GV2bbRPFhvk+audio-NdzCe7COROw
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
├── audio-P_dHPMofcwM+audio-EXMQITpeeaM
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
├── audio-PlKCXvBDxaI+audio-0yR5s-CSw4E
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
├── audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
├── audio-Xafuav13p2E+audio-t6B3JugXgTI
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
├── audio-gK3ooFKujO0+audio-087sDeYPdjY
│ ├── av1.mp4
│ ├── av2.mp4
│ ├── gt1.wav
│ ├── gt2.wav
│ ├── gtamp1.jpg
│ ├── gtamp2.jpg
│ ├── gtmask1.jpg
│ ├── gtmask2.jpg
│ ├── mix.jpg
│ ├── mix.wav
│ ├── pred1.wav
│ ├── pred2.wav
│ ├── predamp1.jpg
│ ├── predamp2.jpg
│ ├── predmask1.jpg
│ ├── predmask2.jpg
│ ├── video1.mp4
│ ├── video2.mp4
│ └── weight.jpg
└── index.html
└── viz.py
/README.md:
--------------------------------------------------------------------------------
1 | # TriBERT
2 |
3 | This repository contains the code for the NeurIPS 2021 paper titled ["TriBERT: Full-body Human-centric Audio-visual Representation Learning for Visual Sound Separation"](https://arxiv.org/pdf/2110.13412.pdf).
4 |
5 | # Data pre-processing:
6 |
7 | Please download [MUSIC21](https://github.com/roudimit/MUSIC_dataset). we found 314 videos are missing. Moreover, the train/val/test split was unavailable. Therefore, we used a random 80/20 train/test split which is given in [data](https://github.com/ubc-vision/TriBERT/tree/master/data).
8 |
9 | After downloading the dataset, please consider following steps as data pre-processing.
10 |
11 | 1. Following [Sound-of-Pixels](https://github.com/hangzhaomit/Sound-of-Pixels) we extracted video frames at 8fps and waveforms at 11025Hz from videos. We considered these frames and waveforms as our visual and audio input for TriBERT model.
12 | 2. Setup [AlphaPose toolbox](https://github.com/MVIG-SJTU/AlphaPose) to detect 26 keypoints for body joints and 21 keypoints for each hand.
13 | 3. Re-train [ST-GCN network](https://github.com/yysijie/st-gcn) with the keypoints detected using AlphaPose and extract body joint features of size 256 × 68. These features will be considered as pose embedding to pose stream of TriBERT model.
14 |
15 | # Pre-trained model
16 |
17 | Please download our pre-trained model from [Google Drive](https://drive.google.com/file/d/1cOIEUzcp7tKO1C6OyXwso2Rrm0wZHuu2/view?usp=sharing). To train from scratch please pre-process the data first and then run:
18 |
19 | ```
20 | python train_trimodal.py
21 |
22 | ```
23 |
24 | # Multi-modal Retrieval
25 |
26 | The code used for our multi-modal retrieval experiments are in the `retrieval` directory. We conduct retrieval on TriBERT embeddings as well as baseline (before passing through TriBERT) embeddings. The networks used for these tasks are located in `tribert_retrieval_networks.py` and `orig_retrieval_networks.py`, respectively.
27 |
28 | To train a retrieval network, use `train_retrieval_networks.py`. To evaluate the performance of a specific type of retrieval between TriBERT embeddings and baseline embeddings, use `train_retrieval_networks.py`.
29 |
30 |
31 | # Acknowledgment
32 |
33 | This repository is developed on top of [ViLBERT](https://github.com/jiasenlu/vilbert_beta) and [Sound-of-Pixels](https://github.com/hangzhaomit/Sound-of-Pixels). Please also refer to the original License of these projects.
34 |
35 | # Bibtext
36 |
37 | If you find this code is useful for your research, please cite our paper
38 |
39 |
40 | ```
41 | @inproceedings{rahman2021tribert,
42 | title={TriBERT: Human-centric Audio-visual Representation Learning},
43 | author={Rahman, Tanzila and Yang, Mengyu and Sigal, Leonid},
44 | booktitle={Thirty-Fifth Conference on Neural Information Processing Systems},
45 | year={2021}
46 | }
47 | ```
48 |
--------------------------------------------------------------------------------
/config/multi_modal_bert_base_layer_conect.json:
--------------------------------------------------------------------------------
1 | {
2 | "attention_probs_dropout_prob": 0.1,
3 | "hidden_act": "gelu",
4 | "hidden_dropout_prob": 0.1,
5 | "hidden_size": 768,
6 | "initializer_range": 0.02,
7 | "intermediate_size": 3072,
8 | "max_position_embeddings": 512,
9 | "num_attention_heads": 12,
10 | "num_hidden_layers": 12,
11 | "type_vocab_size": 2,
12 | "vocab_size": 30522,
13 | "v_feature_size": 1024,
14 | "v_target_size": 21,
15 | "v_hidden_size": 1024,
16 | "v_num_hidden_layers":6,
17 | "v_num_attention_heads":8,
18 | "v_intermediate_size":1024,
19 | "p_number_of_keypoints":68,
20 | "p_keypoints_feature":256,
21 | "p_hidden_size":1024,
22 | "p_arget_size":21,
23 | "p_num_hidden_layers":6,
24 | "p_num_attention_heads":8,
25 | "p_intermediate_size":1024,
26 | "a_feature_size": 2048,
27 | "a_target_size": 21,
28 | "a_hidden_size": 512,
29 | "a_num_hidden_layers":6,
30 | "_num_attention_heads":8,
31 | "a_intermediate_size":1024,
32 | "bi_hidden_size":512,
33 | "bi_num_attention_heads":8,
34 | "bi_intermediate_size": 1024,
35 | "bi_attention_type":1,
36 | "v_attention_probs_dropout_prob":0.1,
37 | "v_hidden_act":"gelu",
38 | "v_hidden_dropout_prob":0.1,
39 | "v_initializer_range":0.02,
40 | "v_biattention_id":[0, 1, 2, 3, 4, 5],
41 | "p_biattention_id":[0, 1, 2, 3, 4, 5],
42 | "a_biattention_id":[0, 1, 2, 3, 4, 5],
43 | "pooling_method": "mul",
44 | "arch_sound": "unet7",
45 | "arch_synthesizer": "linear",
46 | "arch_frame": "resnet50fc",
47 | "img_pool": "maxpool",
48 | "num_channels": 32,
49 | "binary_mask": 1,
50 | "loss": "bce",
51 | "weighted_loss":1,
52 | "num_mix":2,
53 | "log_freq": 1,
54 | "num_frames":3,
55 | "stride_frames":24,
56 | "frameRate": 8,
57 | "audLen": 65535,
58 | "audRate": 11025,
59 | "num_gpus":4,
60 | "workers": 48,
61 | "batch_size_per_gpu":20,
62 | "lr_frame":1e-4,
63 | "lr_sound": 1e-3,
64 | "lr_synthesizer": 1e-3,
65 | "num_epoch": 100,
66 | "disp_iter": 20,
67 | "num_vis": 0,
68 | "num_val": 256,
69 | "imgSize": 224,
70 | "stft_frame": 1022,
71 | "stft_hop": 256,
72 | "seed": 1234,
73 | "dup_trainset": 100,
74 | "weights_sound": "./ckpt_tribert/sound_pretrain.pth",
75 | "img_pool": "maxpool",
76 | "weights_frame": "./ckpt_tribert/frame_pretrain.pth",
77 | "arch_synthesizer": "linear",
78 | "sound_activation": "no",
79 | "img_activation": "sigmoid",
80 | "output_activation": "sigmoid",
81 | "mask_thres" : 0.5,
82 | "ckpt" : "./ckpt_tribert",
83 | "weight_decay" : "1e-4",
84 | "beta1" : 0.9,
85 | "lr_steps" : "[40, 60]"
86 | }
87 |
--------------------------------------------------------------------------------
/data/music21_train.csv:
--------------------------------------------------------------------------------
1 | bWPte6-VThw
2 | Uh_PzT65l2E
3 | STtGADifdUw
4 | 1eqS3rYvDjI
5 | HdmrBA6sBYE
6 | QxZO_ZxOZRU
7 | GouIcBwgP_U
8 | _idF1Hf3-CI
9 | Lje_xoSVBdQ
10 | Us9dxgfBfeo
11 | XZ1rUpbdu-Y
12 | FnAhTPl8ees
13 | VVq9NZPpaJc
14 | EBWWzSobojo
15 | 8mDNLCo1McQ
16 | NihlZckO34o
17 | UD2SYltnnlY
18 | wHxCLLVtkKo
19 | IcV9yqQOqkw
20 | PWX_mohRkQ4
21 | 1uG4jnhHgAc
22 | rYeUVUQ9lzI
23 | F6bIb1ucefE
24 | ljHG8rMHSHc
25 | ErtFfLCNtz8
26 | 11KsiMpJg2I
27 | xkq-VyAzFHc
28 | 5Eg6GNhszNE
29 | AJOIqmlI65Y
30 | _9aFrft67Zw
31 | FJiGcSP4Dak
32 | qLfSgZThsDM
33 | mRIeWFw9040
34 | GQGBzVaDTmM
35 | stnmYDeNpZM
36 | N1HwJvvYyoo
37 | -UoFXtE2eqk
38 | Cc75Ef4kBbk
39 | ALMOnRonTPM
40 | clVigOfCCNc
41 | X2QQ9FPQjQs
42 | GF-3c8hBi9I
43 | kbTy_5o6AB4
44 | -z1JPhZ-BUY
45 | 5DYC92362Qg
46 | jf0ondpk0jE
47 | ag7eC_pVobo
48 | nTMJTm6tqoY
49 | TSktVYy2820
50 | iqyw0xR5Wf0
51 | PoJuodmk9Ms
52 | 8LU-kq19V3Y
53 | 6YyYNvPORcs
54 | 1nHGmlUj8GM
55 | REiXxfSu3Fw
56 | BBIENSA_GkE
57 | Aibr0BCz-Yk
58 | 41b8GUEx-OM
59 | WSRAO6abxog
60 | cMYr693e3Eo
61 | D5gWhSH7jRU
62 | wfGx5L88O4Q
63 | G46O0IzYtt0
64 | W9ZICrM6V-s
65 | FrNjZANKt-Q
66 | JbGvqve_JH0
67 | 70vWxJiWrFU
68 | pBJUB98yZTw
69 | HQveyGmUBSs
70 | JEDgAVcR_yE
71 | S2IWw6z-cfc
72 | T90-aKvgnkE
73 | 1Ju4S0qeDqw
74 | XuZIw5OkZwU
75 | Fis5cOQ5tGQ
76 | WKeppEkkL-Y
77 | XlxYlk8IShg
78 | UIG0kM1HzC8
79 | Ith2Ni8jg1Y
80 | OkP0XYvGBfs
81 | focoWBYg16w
82 | 9RSYp7jWRfY
83 | iVzOtFTfgrg
84 | V9rjOI5YEDE
85 | OgHPzxGb1sM
86 | Ujpp_SiEAlY
87 | E_TBQsEEw5A
88 | OVv7XW_4lN8
89 | An1z4o0l68c
90 | oWvLCWWZVYk
91 | WkY638E5wy8
92 | U2NMdF5yzdE
93 | 6Gji_i1TWPY
94 | BbPsU29zD1U
95 | MVZGmcXU2D8
96 | aFRh4pn6yzY
97 | 8iry5jpDxV4
98 | bLvdb6haNVk
99 | D6YQ6WwWZrk
100 | V9NILCtoTkc
101 | Giy_Btup7DQ
102 | JnAAzR4XZ-I
103 | HhWAn3EXqrA
104 | 4AsWvRTLPWI
105 | oQJVBH6ST7o
106 | j-B6-AoZpo8
107 | LzTaV_X7srE
108 | 7kbgi9oNEi0
109 | AIcISC49phE
110 | b9hVMnMfLoA
111 | _IcBMTI8PEQ
112 | pfIpwtClF4M
113 | UF7UE6P2wVo
114 | pDS4o2C-eUQ
115 | JeZyrStc2ws
116 | fhzqzm0dYuE
117 | X8rR0SV_BhU
118 | ufsSRKgUb6g
119 | eBoCeQtoFtM
120 | 3C4RYkpivCU
121 | 2C8PwkOElRg
122 | IHZSAJkNpps
123 | NL70xysQ-Ss
124 | b6_iAVepfo8
125 | Eaq914E77oY
126 | aXeErkTlDEg
127 | za7phW3V0xE
128 | l7LptHCRkQU
129 | ADxyFxpe1ls
130 | qHOw-XwugYM
131 | LOm0yStU_YM
132 | wuVq8ksXZ8c
133 | No-IJw4uWMw
134 | 2skFRfSMYnQ
135 | 8EU0xM-pQOI
136 | 5OrjP43Yigo
137 | IFEP8Z5Qos0
138 | 1tPJuR1AWmI
139 | hslVlYk0WEc
140 | KPTyodoU2ng
141 | D0gOvIiH8Hg
142 | Rr1UkDkhDlA
143 | 5wtXVHH99PU
144 | -ibvkmiORQI
145 | XXmpQ0KkZ_E
146 | ZgGEuzLZw2E
147 | KxuFM4z9EJ0
148 | GB9mc0eIa_8
149 | 4B-mDmpRA4I
150 | cTO_2R81fdY
151 | ZSJu_zyOaWc
152 | SVZNEDEIK48
153 | 3XMD9TRDx2s
154 | 6pkShdDe9LU
155 | RqPkcu-xidI
156 | 3P0h1Ib2kM0
157 | NcjXqWPG2Nw
158 | GAZvmnoaMAs
159 | VlIcqDWmPkw
160 | 35TPV7_D7do
161 | BO7_8o4QSr4
162 | nyOkMXal0yA
163 | 4QYLzfVmyvQ
164 | co1QtO9H8zQ
165 | 2CcXQsF06Xo
166 | AAUm9njcMRk
167 | -XAsXmbExkE
168 | 1MiyvDB_bzM
169 | 41iiV-wahIY
170 | _vmWIk6aRXU
171 | Sw04q5d7d2Q
172 | HO2irXZhr-k
173 | 4DEgI36KJUU
174 | kjKR2GBuQWw
175 | kdB0igDRaLk
176 | jW_AuF6dQYY
177 | NIEqzh0dvow
178 | 10RkjPIguAY
179 | 4Oy5VmaacaI
180 | FpFuu7cbVt0
181 | rZYf9Dl8ew0
182 | EwG7o3SS-r0
183 | UYp3dKioAao
184 | 0PqIvfpaVSc
185 | iffYeAAUQ1o
186 | 8YNdpTm2vsc
187 | YwDABKyGY_g
188 | AX9ygkC_3SY
189 | iN1dXiDWpw8
190 | 3N1KAaBJkwU
191 | NDVxjRCWjYU
192 | lDCeAgGUwyc
193 | CaeZoHNHul0
194 | 7RRqeIUqXPs
195 | r2w-OciR4hw
196 | 9_ybb2vE4Z8
197 | 0DBd-xYa4Ck
198 | 60tvG40iSxo
199 | 43N29koKyPo
200 | IS5LJPPQC68
201 | NLGte2YC6Oc
202 | G_rINXF-bzg
203 | qb-ttddKLYo
204 | ASUUqMvTdjg
205 | FryEgnBZN8U
206 | r29WR4uOf9k
207 | PA8DVnECfh0
208 | 97UEABGyyw8
209 | 894n_sDSEc0
210 | d1S-RVDZv64
211 | OuoWrVc8BkE
212 | 5GWxV8I21AQ
213 | zkA4sRrrHVg
214 | 2ARPm0djKIk
215 | EtrYq3j7src
216 | T7cs_RGdb9k
217 | wnsA_6nhIIE
218 | PFX2WTY9gWI
219 | MUalqwUFrPs
220 | Z4CRwrR_lBE
221 | m_sKRr_n1NE
222 | r-hUC0z2VDE
223 | 141eeOiScwo
224 | jXyE0gq1upc
225 | P_lSgczU2Sk
226 | 1fsMnm7bYLg
227 | pR9R6DxqAhE
228 | hs8tK5bp5D8
229 | yn1NaZWN_qQ
230 | c7Jyu_OiEB0
231 | OufpA4FV1Tc
232 | -ryG2g4d6Qc
233 | 1odwJu3RtlI
234 | EanNPdLUh9U
235 | h-gSV3p0m9E
236 | lTVGdSchoPs
237 | _49-JJzSxTw
238 | CO6CPC-NZhE
239 | C2si-AWsap0
240 | 0AKB1aSpyNg
241 | tacEJsR5akI
242 | X4UWEifacwI
243 | 5e3dhOOmz8g
244 | 2TLnVbJzH3E
245 | CtrTfd8foxA
246 | GTmzbN99PWo
247 | TBngVoy_GrE
248 | EYGbVrIlL6M
249 | Zqg4DwXmYBI
250 | 5ESDEU04WSQ
251 | YPiD4WdnUMQ
252 | b1ZNLsvmRNQ
253 | Cy1i3u-Lz1w
254 | S3G9BjzPKXo
255 | -_-TDDxoaAc
256 | 9g20WQ0WHn0
257 | Von-UD2RWnU
258 | 23DDVbzSig8
259 | aV-74W7SdJ4
260 | RUUtdFmg03M
261 | oqHi5-Io4Uo
262 | OoV1TBZamMI
263 | lTu6hayHl5k
264 | cp_0VICDDSc
265 | MCftKtqmH1A
266 | _AtTFfgZvEk
267 | GOYQXO-DJcg
268 | aJpxlrsy2As
269 | txpFPySWtiE
270 | Lp9rAMPaqLQ
271 | OBbmnUIJQDQ
272 | 8B9WaDcotLg
273 | 6FhYpuJyGT4
274 | Q5-AV79Rh5I
275 | 1i8XHo25_KU
276 | TyMrlwmg5M4
277 | eCQO6k5Qrmg
278 | ByaSZ2dD0yY
279 | U7X8kVuyZTE
280 | aZLkNOye_wA
281 | 9XoXQXCVUQY
282 | 3d1b4UH43-E
283 | 5nUYCnCQd6M
284 | b7p-tCYilKM
285 | 75bCUXEv1hs
286 | VtZXRWtp5G0
287 | Mhc9ckO6esk
288 | bLjS6E6c0IA
289 | 3RP8GYBEiiw
290 | wEzEcfz1QS0
291 | 6XeB5fgtU9Y
292 | msVlr2fgcgM
293 | cZa21THo--4
294 | _YvHViIph10
295 | yy2vL2RUiPI
296 | KrlDeCWv9ak
297 | PkJWA3cf-z4
298 | EXKxvMVayRk
299 | I0Cyq1YlSVM
300 | eIqVZuyPu2U
301 | 1E7U1w2Z8C0
302 | K1LRscO7TSw
303 | TWrh_c9YRp8
304 | XlvlHfWjKx4
305 | XFcB4jGhOBE
306 | WaiDZX7FH-8
307 | CTtHozrSfh0
308 | TSEWnnKagqQ
309 | YLTpkQqX_ek
310 | 7VEwnKM85xs
311 | 9VaXf5R7Xs8
312 | 2PKQczRfm1g
313 | nvjfuTKz2yA
314 | 1F3ay026U60
315 | HHaApvpQVdQ
316 | aerJoJj--HU
317 | Wt40rzyGSBs
318 | WNeHlya4VnQ
319 | 0o22Fy2Hr3s
320 | 6YBD40q1ads
321 | C3s60zFIPTs
322 | TOLoplXKr7U
323 | SPXyBDbGt4s
324 | D296jSWKxiM
325 | 0OrLdMPOyos
326 | qKnggsxx2II
327 | fFkthPInr8k
328 | sybsjsj1zPo
329 | HPyM9xzrih0
330 | XS_dSmZ0xbw
331 | 5HNlS9UFELE
332 | WjwOm5pJlI0
333 | 4QWePxpUu6c
334 | L6V8S8U9iLs
335 | Bsd2TJw0Wss
336 | XlS6Ao_V8nE
337 | HyD1g1KDoEE
338 | F4IPsMGMPgM
339 | kx6qqoGft2w
340 | aavW-tx8daw
341 | LUZf782HzNo
342 | VUXyjq1bw_Y
343 | -7WIMBTTvZs
344 | h2oiK1-8U5U
345 | XmjMGVv3TLk
346 | 7171YEDwyGc
347 | 2ngAIYZqDxE
348 | 6MEnSbm3xl0
349 | e5YNEk9Ye-o
350 | 9YS7ReG44zk
351 | kXB2EMfU7Ho
352 | KKW1Gr3Xttw
353 | 9OZHbvWdBbQ
354 | 2R12lQszz90
355 | oOCHF3GgKLs
356 | PDohhCaNf98
357 | U4RSnhRVUR0
358 | SMp2-xo1lZE
359 | 6EaTems2LNs
360 | LPqBkF5NVSQ
361 | jr-coZHlWl8
362 | V8ifoas59TM
363 | W9VsprvJQAM
364 | 9YNITyPKZFQ
365 | S3ZIb96M6Ag
366 | iLeEWSC2M2U
367 | Yz6izw5dhn8
368 | 9bEbuj4Qb3I
369 | Te3bHjd0kdc
370 | jTnHP1szyyw
371 | oEb2mlij38w
372 | XbOYpwFkn7o
373 | cyUUlWSEujk
374 | LqyX7bRrMTw
375 | lMQVRb2mz44
376 | 4TFz7QX5aNg
377 | gOsW2AVyjyc
378 | LUWw06fW4zU
379 | SDR39fhyZRI
380 | HI8t07KXWaw
381 | QeravklOaAg
382 | W8buacmF8g0
383 | MKSTzG114d0
384 | OIBq0bBziuw
385 | iB17xqmFw3A
386 | 8rb-thew50c
387 | uJnD0NCyJoc
388 | PYjwNew61XQ
389 | 5675YbIvcz0
390 | -4nIqLncdB0
391 | DS500VD8YDg
392 | LIocmQeT8_Q
393 | ZvLT_J9Uv4o
394 | Lq3671NhNZk
395 | D-2OOEcv_ZI
396 | 4tkY07v9YWg
397 | tHvLBLCBHyU
398 | Sd-tIwLzonE
399 | YFxW8-V050U
400 | q_ycwl0c448
401 | 6nxSuOPWrn4
402 | x9TgjVZPZbM
403 | -94WkAANyB0
404 | IlDWk-x6Cag
405 | 73ikMuXwKgc
406 | K_U5-GbPgi0
407 | C3QJz0gN9i8
408 | cPKV0QF9Scg
409 | AP-vB4w_7Ag
410 | w55jJXSspUo
411 | 9i79TXP7V2c
412 | ZRpsKMe5dSg
413 | 6G78knF3eFk
414 | bsorCgLZkkA
415 | B2rJa22OHkk
416 | bowjUDJsWq8
417 | fxG1YUtix_o
418 | pXE3Bl5-GoM
419 | P4KxMdjna0k
420 | wgpwadsv5YQ
421 | vd1dgdxlA94
422 | 4bAhK1C6BWw
423 | qsciXAjiwF0
424 | Mff3pp-Fj2w
425 | WgqK2DARbIQ
426 | GSol2w0QnJE
427 | JB8-oKS6K-8
428 | CjNKUiFHUC8
429 | 9xj3e5A8XwE
430 | ZANyOBEWiJA
431 | 3-95PC9eX40
432 | b-50KrgDpjU
433 | _VwGcE2IOMc
434 | 1mCi6eWXVbg
435 | h-58UapA8gY
436 | KG-tXicVpsg
437 | iEOdwzdXmfE
438 | 6syzZgAk2vI
439 | kao4iqPuQBI
440 | 37HdHAzJrOQ
441 | MnoQccPJbyM
442 | 3n-vlswrQYM
443 | 7Zudy1bw7bw
444 | BI7SMfb-sek
445 | x5Bzb6R73JQ
446 | WeeRb3LMb8E
447 | QqKlMPzQj7k
448 | MvkJvZSOmjY
449 | 4q7L3TX3Hl0
450 | Gufj4Jhk92I
451 | 8lol9fS2CHU
452 | nKc4srJBgYU
453 | Pzf9MQKkoNM
454 | unRA0xR-XWc
455 | e2aq2A6Bhuc
456 | 2V12S02E2Jc
457 | -zIIUIlfKlY
458 | 1oz3h9doX_g
459 | y0WB29MJJgQ
460 | NOYN6GRhnTs
461 | ZcKsPFZG4_0
462 | DshS7-Mn9dM
463 | -wxurZAFJWA
464 | JRPtROMynA8
465 | q4kscRK91y4
466 | DLziSt24jrg
467 | w-IEfjDTi9c
468 | 1fbHYpRstQs
469 | D2bFizJfvi8
470 | PeT8aBVvtFI
471 | 3zi2fHlfP_8
472 | HmIPqgiaW24
473 | -eNT-Gvikbs
474 | jLY5DKM6HoI
475 | QOYMhOXfGMw
476 | IGouH5bKhj8
477 | HwVslStJE6U
478 | NeayVC7CEvM
479 | 9dOqmTMqp1k
480 | _O1mTqCSo6E
481 | z73RZix8Q_o
482 | 48E4sS8K-bM
483 | UI7k6m8Gdz4
484 | awCkfdohkzE
485 | ywH4K8mQ-1s
486 | bq5utI8KK_8
487 | WDJYj3SvUlw
488 | _u-pz6i3iDY
489 | 341do4TeJT0
490 | HnUOaSfTA6c
491 | 8Oy-rsKES5I
492 | 7wmiNt5a79s
493 | Qy2y8X-Ec8Q
494 | eHbxLcoLWYY
495 | rc2taOIxV_M
496 | TREfn2OfujU
497 | 47Hl3Rb2164
498 | ySFS1KgXerA
499 | qljUWQ1rY3Q
500 | sqT7a8TBgOg
501 | 8P6PFNNCrgs
502 | 5GpXUu8cz1c
503 | 4fGE07dATvc
504 | CG37e-Q2NeU
505 | RgB3wzNVZyY
506 | XIcV5kBVIbw
507 | BMhnTdy-A0M
508 | CeOAuSm1NUo
509 | B7vaxJhgCQE
510 | 3AQsq1PG1Wc
511 | jPaCkyptN1s
512 | aYCanSq5Y7E
513 | vTNNzAqqeZ8
514 | EbqdFn9dNvM
515 | 1OsOEt5Qpfc
516 | TUrhztP4TGo
517 | C74cF3O6ZFo
518 | I1K7TJcgOsE
519 | _Nr58vTAIkw
520 | dpnBlIZGyJA
521 | ENnSwgLc2B4
522 | fXjfvEcAV6w
523 | AQBgyM8NrHQ
524 | Vq0eH1uUvXc
525 | Z_yNLFYjpsY
526 | Fif2-OjH-8A
527 | UMCbhbok-ec
528 | UfP31JXZbKU
529 | VLAFJ3PD830
530 | 6apsC8o3f6k
531 | 1n44lCfG59k
532 | -HLTNgdajqw
533 | LeH5Urwtlug
534 | QaOUijKCqZU
535 | o3a6F070Xt0
536 | nvzbPGgv7R4
537 | BxmkJkFr1nk
538 | xjhZhI2Zthg
539 | zOWt7kRh9e8
540 | 1t-zt8nAsQo
541 | Hb_UTQsnFqA
542 | -XMgVGqJm8U
543 | hYepU4PQFAo
544 | Owzy345m12U
545 | HJL0k3fQx3s
546 | kQZJh5Clq_s
547 | PEyIPSlJNQM
548 | 6AukD08i9GM
549 | HGqEtp1wpyc
550 | JoIl89Ybhs0
551 | 09C8NuLfsxc
552 | JmQ1C-cwiuY
553 | 9L3e1AuZNIM
554 | 0oKi3ARn640
555 | 8DHG_hVSw1o
556 | zPzDLRIk6Es
557 | EtP6JBWtd7s
558 | E_8W7f9ZZpY
559 | b6TT-5B4dXw
560 | COfSRe0FHWg
561 | 5P7iGxvFI4Q
562 | YmKMEpt9OjQ
563 | AszMf9QGxug
564 | PY3pK0Guf2Y
565 | zhevOlbQf8s
566 | 4h550TCCd9w
567 | BwDQ9VIW74Q
568 | tGG2P7mjuqQ
569 | j0U_hqIH_XM
570 | -_jlqeSs6ZU
571 | 2WZsNZrbt6w
572 | nQpXK7PS-oo
573 | 4lVuJEIaXgo
574 | 0N26WnKiCIg
575 | 5iKfNcUcPI0
576 | OZDN7VkC6AY
577 | ZPpbacaDz3k
578 | H6fPWLu6biA
579 | CFcPlv9RhEY
580 | 8zJgbPSLDLw
581 | yXKJiqOuvkA
582 | 1vZ-IKkcPL4
583 | X2hHGcwLPhs
584 | 9fOU26yd0Pw
585 | VPKf9yRkKf4
586 | JDV3yiF-6DU
587 | RB2WGan5ghM
588 | 2ZTKgdvVo7k
589 | bapfkDpaSGw
590 | 0px0WUwkOy8
591 | vBgwzJYJ5yk
592 | Q8efUjCGqvo
593 | 9ygrq7in_uI
594 | jB58yIBmcTo
595 | GMYP2T_ZTkw
596 | UFNc_74au6A
597 | 3TJPKAfxyEE
598 | g1sNdUDTfJQ
599 | _MkybbNPYFI
600 | n8-2q4dheyU
601 | KenJhs2kS3Y
602 | za84Zws-5gY
603 | 2ZxAVUsuE4Y
604 | lVQYZKrq1aY
605 | fwBwJFRrD4w
606 | OxA5Z_5-w0E
607 | W65r_CnPlKg
608 | AKNt1SF-5VA
609 | -tY40Ev8IzE
610 | cFdMRCeB8WQ
611 | 3-zT9mN8Lio
612 | rxr5tTH8_mM
613 | OCmsvdY_9RM
614 | qs68YTfKHkw
615 | RWP6BHh_c7Y
616 | pBU7dNbUxQs
617 | DF3RPgRYw7s
618 | ARNQJIxtj7E
619 | Bsgp-hgtF7E
620 | aTYc-hHt4us
621 | GPh_9jtWub8
622 | aY4Ra2KOyas
623 | 5yi3rSuvNyI
624 | GhwLgGnTMG8
625 | nZ4HNz9xvSw
626 | ZXz0cVq_RTI
627 | GtQixP8bAk8
628 | 8caeW1WShg0
629 | 3daxf9BhdyY
630 | 7G6QVPCHv6Q
631 | EvBzKBdfZxA
632 | 9PJgKV2VCcM
633 | 40KRN9rzkpk
634 | UZK_2bTuzrI
635 | geK-gOMgm20
636 | zhaLSfnahL0
637 | 9i1yD3mnqUg
638 | 2tG4y_uyEAY
639 | _oUTOskOwXs
640 | 7gP4eJzp8Ro
641 | KS-fMmS96XU
642 | H4K5buSMzdA
643 | 5QQb0kt_kAA
644 | 8GsWxA7kalQ
645 | WVyd3N6u5YE
646 | gfkfwnOFybM
647 | wzNBinIsa3c
648 | Cml4I28SUHc
649 | R700Q9DEUEk
650 | 9L4BvIDqhm8
651 | dijkShln-ig
652 | BPMsd_gDFb0
653 | 04lh6kxa5Kc
654 | QZEkGj9m1qs
655 | wNAFZMyLNUA
656 | I5LCi0sWTU4
657 | OgRf5gri-9k
658 | 7zfk-5Wq5Aw
659 | 4AI18Hhy5No
660 | e73x3LGeb_Q
661 | 8VfMykySryc
662 | HOUBn-wHcwQ
663 | dAOjddR-84E
664 | XI7Fx7rJt4I
665 | WVeheJFHQS4
666 | -L5Bo-mWKCI
667 | RK__-JoNte0
668 | Xx0Haa8Fk2o
669 | 26HLgXWF-Co
670 | w81p5ty4OZc
671 | CQnWcgZfqRM
672 | 2_valn6NOrQ
673 | 07BlCwnia_U
674 | xdgQ6LJFGTI
675 | vEOyhF8KLPE
676 | NLooa9XHDMc
677 | fH4pDr8CqI8
678 | c-Yr5qyCefA
679 | DVrCy36Uigo
680 | JkhKTTB0YB8
681 | E7E1kMExVcg
682 | P-B8ooirjHE
683 | 8sgzGhxw-tM
684 | d5IO46KfjR4
685 | GMGnSMNrvfI
686 | BxdQ2OKFP3Q
687 | LKVxvHeb8hs
688 | Mdryqvk0UH0
689 | yoCJxgwzJy0
690 | 9kwbEUqCNbU
691 | 2toke35E958
692 | X3GbAKww1KI
693 | SkXnQi_nNbk
694 | FfV5O2F2waI
695 | -ktVpJDWu_k
696 | Hy-gFmxHhxs
697 | MJmVYLWPnCI
698 | IKUVgbi1e2U
699 | mvE-NTWT77c
700 | I0LedcEaPL0
701 | -piYS0yc_dI
702 | H-j5lp2QjEE
703 | 6Fm4fJTDr68
704 | VQR_CcRrZA0
705 | F_AqoGpCWHg
706 | 3dbHMyBWR7M
707 | ZtVxXYObmOQ
708 | -e5DuAUwBgA
709 | y-Y6QAUqSKg
710 | cNLX70EhWtg
711 | cFskXS823Kk
712 | wkkzzAjSxDs
713 | tAuYNf2hqHU
714 | M7FQHjE1AGY
715 | 5UZF3bGoZpM
716 | t_3Zm1PbydA
717 | 6CTOaH3qQKY
718 | k3371a9MXsI
719 | Me36Lr7rpAM
720 | 82pGU0POQSM
721 | I2QXo4mGeRE
722 | nCEKU3Rpvok
723 | QUAQqGpml4M
724 | 4ftnQdEaLO8
725 | zacX_7sb4j4
726 | pQ9IiR2pFQY
727 | MwHZRjcXgRE
728 | Sw8346DYwME
729 | 5awjINljUoo
730 | WnUAJsk97k4
731 | DPSLdZBaQTM
732 | QoFHDWgxD9w
733 | 49EvJsKoEko
734 | A22AWJ6zFBs
735 | ZrLi-YLGVnk
736 | BB6a5BPpbFs
737 | SAN5JBTnlas
738 | Amk7Ssb5EHo
739 | 4-hZDt0Vr5I
740 | pHUAsuSJeoA
741 | hDYLqqa1FNQ
742 | o7YS-fzv5_k
743 | CKCY0Ib-PRY
744 | P5pxcBdxa4E
745 | RfTCdZEqWcg
746 | Lop_iMipljc
747 | pfGRBQiaYZo
748 | S86qStDAz3A
749 | ZFp0yw9Dxcw
750 | 1TBxNBIUx3o
751 | kuNep98Fd8o
752 | 9cko37myhM0
753 | -SzFwDekTGo
754 | 2LouW42GMpQ
755 | D1DjJqqK5Rw
756 | I5IYfx7ontI
757 | JBozc8J3LXc
758 | fhf2cx9t4vY
759 | FR_1gEaBeeE
760 | nMYw1bi81Dg
761 | lJfi82vd9eI
762 | Z36VEiqi02Y
763 | LHO49Q572F4
764 | loCRmBvtHZ4
765 | 5IoSJhlLmqI
766 | Jd5heRezP5s
767 | M3YZEmQyt6c
768 | 85KcVDCQLfc
769 | 6Ubt6kjJJRg
770 | _jPFkOkNjuo
771 | 8rcClHLysvU
772 | ensmwD87FjM
773 | KW2cyLN7bRM
774 | 4PUBRWXm9wQ
775 | kotc5pssmt8
776 | 1fhZ3iBeK2g
777 | NhWImcVAnNg
778 | qLLwSb9YX5g
779 | ZYMb7ZJkJgc
780 | BabDKZLYlyE
781 | NhkPpOvrtz0
782 | s1hBC8N4-Zo
783 | Ee9n3mxehr4
784 | xyi0Ft3sb0U
785 | 6I846oP0E7Q
786 | 3q104tR3k2Q
787 | Aiantckb2lk
788 | vhVtX8F0-BY
789 | 7C-udYVfjoY
790 | DK6cSN_7lwk
791 | VA1EolcDwZI
792 | JxgWTP5Nnww
793 | 2NaPqrUQ-3U
794 | bcSGjoMLe4A
795 | qBnsIqyXheM
796 | K-EqzJG2W0U
797 | ZFuLPm6-kVw
798 | U6iN9QcIp7I
799 | eAS-Lj0IRk4
800 | N3nwDmWvsF4
801 | S0M77oMpTws
802 | E6jlrR-M3tk
803 | 2ZmGv8hr0ag
804 | HeWD0YLbwOw
805 | GKwO7cy2JIA
806 | lVpWbUIW0gY
807 | 603xwk_CKug
808 | Jtq-HDS3Zdc
809 | GezwOJW568A
810 | Sllno_P7l8M
811 | -3V-vUMkHqk
812 | HXHJ1hPyHd8
813 | 6PgAIsZOc5o
814 | 2gyrqiqTuiM
815 | JByLsqhRL_0
816 | 3EkqUlQ7Mw4
817 | InzjAr8VdvY
818 | IZx6ZXWuOGY
819 | bBCBMV7K-YA
820 | -7l7M6vMm1k
821 | HaX5E66eW24
822 | FGSdBlb55Ao
823 | ZVcWGHRPcP8
824 | 5UcdHNTO1-U
825 | XEj3ejyxjNU
826 | ds0ayqYweAc
827 | pWGLb57zbz8
828 | Uu9lMixnPGU
829 | KgebjbW1mGk
830 | 7q1E1pueAU4
831 | CSKOEhh5nBk
832 | 2OMFkTM5Mrs
833 | 18Uttg-RRIU
834 | TM-dWqH4qFI
835 | 3TfXOozH1XQ
836 | DvbeXXoraro
837 | -ieHd2A9vOg
838 | naAz_RNicmU
839 | UNFwNbFdiTI
840 | dfLQaboaa84
841 |
--------------------------------------------------------------------------------
/data/music21_val.csv:
--------------------------------------------------------------------------------
1 | PlKCXvBDxaI
2 | Ba4ajAm0VDI
3 | 2XY77lk_LCQ
4 | kGAREuYVNDY
5 | LfPea4VOU7Q
6 | 0pXFhOg1o2c
7 | DLTK2_UyrDI
8 | Xafuav13p2E
9 | PohzBumrY2Q
10 | wEF6fWnx_wY
11 | gK3ooFKujO0
12 | P_dHPMofcwM
13 | E_ugm84TMvo
14 | 1owUN7YXSzE
15 | 1K-0VC9hWIA
16 | KsvxsTNvqno
17 | Y-dpHbPGQWE
18 | pIPr-lp0C6E
19 | Vk9YR-NLkmQ
20 | TaII-y9-5Iw
21 | W1I1I6J_U5Q
22 | EVp6jkgYuUc
23 | GV2bbRPFhvk
24 | R6M2LLmUg1k
25 | 8OocjGxAtHw
26 | 7uQ-FX2YPYY
27 | VcvG3_CEScw
28 | nCvy2n4ET8Y
29 | hf1QMr3zCv8
30 | Z1_Pkm_SCsU
31 | P6mSuuTzYxk
32 | 9sYVHOsxdEA
33 | 6hQHOZNwfRs
34 | 6D0ciGPtcqo
35 | -0gYWIOfqdM
36 | NdzCe7COROw
37 | 8RjCoqH6jdM
38 | bSJ2gRMbDjg
39 | FYqqkgkeYrI
40 | AlYZ7nWkzcQ
41 | Hw7Z9LUqCSI
42 | KjkMQ12U0I0
43 | 8YELO9yxs_c
44 | FyJTmjsiDuM
45 | -mXzp53rYGs
46 | 6YkGjuk773U
47 | nUZ9jX1J78c
48 | QWttV5rZMrg
49 | os3aGWhaxhY
50 | cZoWhKTZTYY
51 | LCfjGiTfHfI
52 | 6fRV-ERTGlI
53 | lp4_79ooAQo
54 | HosvsEE5SqA
55 | N8v-xQ0p8yA
56 | rg41DcQAwWI
57 | PF9PrzmkR2Q
58 | mUUpqPK_w-o
59 | QVXgHLXaoFw
60 | -Gdh8N_KpLY
61 | VYkZb7F4zKo
62 | BQq0jYGyyuo
63 | 44FPny8k4DM
64 | 9EizWhixKCg
65 | P3RSXOzak0o
66 | NxcaesZGezI
67 | 8zotbsDK62Q
68 | -X4loU7hcpw
69 | s-UjwvJ0eeA
70 | pgedyY56-bU
71 | oqwostd-nHQ
72 | _3pLesuLT4w
73 | GL-1Ke5Keok
74 | 7jsI_0tw8MY
75 | 4CTkB84ukfY
76 | VcEhFKOTF7A
77 | 2kBkI6f1X_s
78 | 9okuB-rub8c
79 | 2lJh-6TYdX8
80 | lm51ySOO71o
81 | qMFYRw0uZ_E
82 | -_F1gRwExMw
83 | t6B3JugXgTI
84 | v5bCWMzpeX8
85 | 6A6wmDbAG7s
86 | 6gjpPVZnO64
87 | E5elpplcRY8
88 | Ba5_zNpzAgw
89 | Rj5XJO5T8PY
90 | qZZ1t3ubduo
91 | ZoE25YHTGDQ
92 | -DlGdZNAsxA
93 | XgU1dc7eys0
94 | 2avwRKFpUbw
95 | frHmgK4u0Gg
96 | 9SZLDvRaHEA
97 | HpVhQbyM2dc
98 | DIt5UQR1tdo
99 | 0yR5s-CSw4E
100 | xtVkxXOu_Mo
101 | HihdcTYa-8k
102 | WS-i7gOmKSo
103 | aCi1P8Rymo4
104 | 7zrsesZWrhc
105 | wh4H5Cfg1RQ
106 | Oe0njWj0KDE
107 | C952gmBlkIU
108 | 5_NsbkyLO2Q
109 | MqB8Ux4DsIM
110 | i6t-dNA9Fck
111 | -H44pbRIg7s
112 | DGn7xcsxMPk
113 | 3yRMbH36HRE
114 | WnXzjXoSF_g
115 | nBHt2QYgzsY
116 | EXMQITpeeaM
117 | FcO9w1mYEEs
118 | -5CDUnGu3jA
119 | JeohKdrShNI
120 | D6RmuscqupA
121 | KC2RgtW4IZ0
122 | PMDSfAZ4-eo
123 | aZjtE82spMA
124 | PCicM6i59_I
125 | cIR1PDcHaew
126 | 6d0tmb28kV4
127 | -5MipMQ25cU
128 | QoBoHAujB6I
129 | 2P83WJXifEs
130 | DlqCn_xrNRU
131 | zJIt_KdE3Jc
132 | -QEVcuzr5bQ
133 | dXiiW1p_lTI
134 | BQH7E8ByxC8
135 | i53jcmLQESI
136 | h61TbdGR7Rk
137 | S1iKARDRvX4
138 | a9psZCnRJes
139 | Vs1NPirh2jI
140 | Xm5a8vCHlFw
141 | Mh2tJbJiWQQ
142 | j43Hj6k9Zy4
143 | Ct-SQmiA31s
144 | A8TgRagoY18
145 | SyToSKUxgKs
146 | j97FUC5-hpY
147 | 087sDeYPdjY
148 | F-sSpzBT_Rk
149 | dVxfGxQq-vc
150 | 19CdPaHrVPQ
151 | YiE-dyrpRgA
152 | mjL-XSxiu5k
153 | UKk9DJD1jOo
154 | nMLPDNPibO0
155 | 5lm9laLSORc
156 | lEDKJFgpakg
157 | 889Kxn_RYzo
158 | Gt8Sfng1YCA
159 | PobONF_Mj4g
160 | bGfyLBoZPM4
161 | tfcNUo8qjrA
162 | VdFnyZWJAgo
163 | LRb3gwiLZI8
164 | Vxe4R5qqEPI
165 | LJ_IFurpFvo
166 | IgtMKN8awWA
167 | 31GOxPXDNkk
168 | 8o5ngxZaYYw
169 | 2D2ALfuqDpk
170 | tWH7qKYl5uQ
171 | FntHqAlD1S0
172 | GIB4tnBSWtg
173 | 5VACMCjJduY
174 | GXcRI0BdioE
175 | 4Tn7MBNTFAA
176 | -qbAJMQBoX4
177 | 3wL4J4IDh7c
178 | 7baQOJtiFMc
179 | YtOvrQNMkY0
180 | 2gk-IN35s5I
181 | FoOVn7MACAE
182 | S9ES30JJTZc
183 | Ljf3tZGxaJM
184 | kw-OQpF9N4E
185 | Lv2vc_8CZLc
186 | 0VyVd_QUCl8
187 | 6E1IGb3137U
188 | 98Menrr741Q
189 | McMd-s4XpZk
190 | BWqae9kUD_M
191 | U6j1KyG1mTw
192 | dhWU1tU1HbY
193 | 08FQMisUWAY
194 | LKMBOuDkEqo
195 | fful4o8CIW8
196 | 3Y2vbEwMkeQ
197 | Nap3NTXTNE4
198 | sTo7AxrWIHE
199 | FYxv-VOKa40
200 | 0lSLVJRTeEI
201 | Ln6ynUjGa80
202 | P5nnnybNQ_o
203 | XlPvAkLT3Yw
204 | F9JPakNkuok
205 | -oe-KptPGnY
206 | MTyf4utLlJk
207 | apGScvWeLaE
208 | DocbWENGxsE
209 | --IXKwloYLE
210 | MvPqp3oZwOk
211 | 59GKBsOJSNs
212 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pytorch-pretrained-bert==0.6.2
2 | numpy==1.16.4
3 | lmdb==0.94
4 | tensorboardX==1.2
5 | tensorflow==1.13.1
6 | tensorpack==0.9.4
7 | tqdm==4.31.1
8 | easydict==1.9
9 | PyYAML==5.1.2
10 | jsonlines==1.2.0
11 | json-lines==0.5.0
12 | matplotlib
13 | Cython
14 | msgpack
15 | msgpack-numpy
16 |
--------------------------------------------------------------------------------
/retrieval/eval_score.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 | import torch.nn.functional as F
4 |
5 | import numpy as np
6 | import tribert_retrieval_networks
7 | import orig_retrieval_networks
8 | from tqdm import tqdm
9 |
10 |
def evaluate(model, test_vision, test_audio, test_pose, rand_idx, args, device):
    """Run cross-modal retrieval evaluation and return top-1/5/10 accuracy.

    For each query index, the query embedding of the source modality is
    scored against *every* candidate embedding of the target modality; a
    retrieval counts as correct@k when the ground-truth index lands in the
    top-k scores.

    Args:
        model: retrieval network exposing ``forward_once(query, candidate)``
            (scalar match score) and, for the 'vis+aud2pose' variant,
            ``fuse_forward(vision, audio)``.
        test_vision, test_audio, test_pose: numpy arrays of embeddings;
            axis 0 indexes samples (assumed aligned across modalities —
            TODO confirm against the embedding-dump code).
        rand_idx: permutation of sample indices defining evaluation order.
        args: namespace with ``retrieval_variant`` in
            {'aud2vis', 'vis2aud', 'aud2pose', 'pose2aud', 'vis+aud2pose'}.
        device: torch device for the score computation.

    Returns:
        (acc_1, acc_5, acc_10) floats in [0, 1].

    Raises:
        ValueError: for an unrecognized ``retrieval_variant``.
    """
    num_eval = test_vision.shape[0]

    correct_1 = 0
    correct_5 = 0
    correct_10 = 0

    # Map the retrieval variant to its (query, target) modality sets.
    # 'vis+aud2pose' builds the query via the model's fusion head instead.
    if args.retrieval_variant == 'aud2vis':
        query_set, target_set = test_audio, test_vision
    elif args.retrieval_variant == 'vis2aud':
        query_set, target_set = test_vision, test_audio
    elif args.retrieval_variant == 'aud2pose':
        query_set, target_set = test_audio, test_pose
    elif args.retrieval_variant == 'pose2aud':
        query_set, target_set = test_pose, test_audio
    elif args.retrieval_variant == 'vis+aud2pose':
        query_set, target_set = None, test_pose
    else:
        raise ValueError(f'unknown retrieval_variant: {args.retrieval_variant}')

    for i in tqdm(range(num_eval)):
        idx = rand_idx[i]

        if args.retrieval_variant == 'vis+aud2pose':
            vision = torch.from_numpy(test_vision[idx, ...]).float().to(device)
            audio = torch.from_numpy(test_audio[idx, ...]).float().to(device)
            query = model.fuse_forward(vision, audio)
        else:
            query = torch.from_numpy(query_set[idx, ...]).float().to(device)

        scores = torch.zeros(num_eval).to(device)

        # BUG FIX: the original scored (query, ground-truth) on every
        # iteration, giving all candidates the same score. Score the query
        # against candidate j of the target modality instead, so the
        # ranking is meaningful.
        for j in range(num_eval):
            candidate = torch.from_numpy(target_set[j, ...]).float().to(device)
            scores[j] = model.forward_once(query, candidate)

        scores = F.softmax(scores, dim=0)
        ordered_scores = torch.argsort(scores, descending=True)

        if idx in ordered_scores[:1]:
            correct_1 += 1

        if idx in ordered_scores[:5]:
            correct_5 += 1

        if idx in ordered_scores[:10]:
            correct_10 += 1

    acc_1 = correct_1 / num_eval
    acc_5 = correct_5 / num_eval
    acc_10 = correct_10 / num_eval

    return acc_1, acc_5, acc_10
62 |
63 |
def test(tribert_model, orig_model, tribert_embeddings, orig_embeddings, device, args):
    """Evaluate retrieval for both embedding types.

    Each embeddings argument is a dict with 'vision'/'audio'/'pose' arrays of
    shape (n, k).  Returns two (top1, top5, top10) accuracy tuples — one for
    the TriBERT embeddings, one for the baseline ("original") embeddings.
    """
    # Unpack the three modality banks for each embedding type.
    tri_vis = tribert_embeddings['vision']
    tri_aud = tribert_embeddings['audio']
    tri_pose = tribert_embeddings['pose']

    base_vis = orig_embeddings['vision']
    base_aud = orig_embeddings['audio']
    base_pose = orig_embeddings['pose']

    num_tribert = tri_vis.shape[0]
    num_orig = base_vis.shape[0]

    print(f'Evaluating on {num_tribert} tribert embeddings and {num_orig} original embeddings...')

    # Random visiting order for the queries of each bank.
    perm_tri = np.arange(num_tribert)
    perm_base = np.arange(num_orig)
    np.random.shuffle(perm_tri)
    np.random.shuffle(perm_base)

    # Retrieval accuracy on TriBERT embeddings, then on the baseline ones.
    tri_scores = evaluate(tribert_model, tri_vis, tri_aud, tri_pose, perm_tri, args, device)
    base_scores = evaluate(orig_model, base_vis, base_aud, base_pose, perm_base, args, device)

    return tri_scores, base_scores
93 |
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--tribert_embedding_path', type=str, default='retrieval_embeddings/tribert_test.pt')
    parser.add_argument('--orig_embedding_path', type=str, default='retrieval_embeddings/orig_test.pt')
    parser.add_argument('--tribert_path', type=str, help='path of tribert embeddings retrieval network checkpoint')
    parser.add_argument('--orig_path', type=str, help='path of original embeddings retrieval network checkpoint')
    # NOTE(review): if --retrieval_variant is omitted, none of the branches
    # below assigns `tribert`/`orig`, and the script crashes with a NameError
    # — consider adding required=True.
    parser.add_argument('--retrieval_variant', type=str, choices=['aud2vis', 'vis2aud', 'aud2pose', 'pose2aud', 'vis+aud2pose'], help='select 1 of the 5 retrieval variants shown in the paper')
    args = parser.parse_args()

    # Prefer GPU when available; evaluation also runs on CPU.
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    '''
    tribert_embeddings and orig_embeddings are dictionaries saved as a pytorch .pt file. Each .pt file has 3 keys:
    vision, audio, and pose. The value for each key is the corresponding modality embedding, which has shape (n, k),
    where n is the number of embeddings and k is the dimensionality of the embedding
    '''
    # NOTE(review): torch.load unpickles arbitrary objects — load trusted files only.
    tribert_embeddings = torch.load(args.tribert_embedding_path)
    orig_embeddings = torch.load(args.orig_embedding_path)

    # Fix the RNG state so the shuffled evaluation order is reproducible.
    np.random.seed(10)
    torch.manual_seed(1)

    # Each checkpoint file is a dict keyed by retrieval variant name.
    tribert_ckpt = torch.load(args.tribert_path, map_location=device)
    orig_ckpt = torch.load(args.orig_path, map_location=device)

    # Load retrieval model checkpoints: pick the checkpoint entry and the
    # matching network class (TriBERT and baseline) for the chosen variant.
    if args.retrieval_variant == 'aud2vis':
        tribert = tribert_ckpt['aud2vis']
        orig = orig_ckpt['aud2vis']
        tribert_model = tribert_retrieval_networks.aud2vis()
        orig_model = orig_retrieval_networks.aud2vis()
    if args.retrieval_variant == 'vis2aud':
        tribert = tribert_ckpt['vis2aud']
        orig = orig_ckpt['vis2aud']
        tribert_model = tribert_retrieval_networks.vis2aud()
        orig_model = orig_retrieval_networks.vis2aud()
    if args.retrieval_variant == 'aud2pose':
        tribert = tribert_ckpt['aud2pose']
        orig = orig_ckpt['aud2pose']
        tribert_model = tribert_retrieval_networks.aud2pose()
        orig_model = orig_retrieval_networks.aud2pose()
    if args.retrieval_variant == 'pose2aud':
        tribert = tribert_ckpt['pose2aud']
        orig = orig_ckpt['pose2aud']
        tribert_model = tribert_retrieval_networks.pose2aud()
        orig_model = orig_retrieval_networks.pose2aud()
    if args.retrieval_variant == 'vis+aud2pose':
        tribert = tribert_ckpt['vis+aud2pose']
        orig = orig_ckpt['vis+aud2pose']
        tribert_model = tribert_retrieval_networks.visaud2pose()
        orig_model = orig_retrieval_networks.visaud2pose()

    # Move models to the target device, then restore the trained weights.
    tribert_model, orig_model = tribert_model.to(device), orig_model.to(device)
    tribert_model.load_state_dict(tribert['model_state_dict'])
    orig_model.load_state_dict(orig['model_state_dict'])

    # Inference mode: disables dropout/batch-norm updates.
    tribert_model.eval()
    orig_model.eval()

    # Evaluate retrieval
    tribert_acc, orig_acc = test(tribert_model, orig_model, tribert_embeddings, orig_embeddings, device, args)

    # Report top-1/5/10 accuracy for both embedding types.
    print("*" * 80)
    print("TriBERT embedding retrieval results\n")
    print("*" * 80)
    print(f'Top-1 Acc: {tribert_acc[0] * 100:.2f}%, Top-5 Acc: {tribert_acc[1] * 100:.2f}%, Top-10 Acc: {tribert_acc[2] * 100:.2f}%')

    print("*" * 80)
    print("Original embedding retrieval results\n")
    print("*" * 80)
    print(f'Top-1 Acc: {orig_acc[0] * 100:.2f}%, Top-5 Acc: {orig_acc[1] * 100:.2f}%, Top-10 Acc: {orig_acc[2] * 100:.2f}%')
168 |
--------------------------------------------------------------------------------
/retrieval/orig_retrieval_networks.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
class aud2vis(nn.Module):
    """Audio -> vision retrieval scorer for the baseline embeddings.

    Lifts the 512-d audio embedding into the 8192-d pooled-vision space and
    scores the element-wise product of the two with a small MLP.
    """

    def __init__(self):
        super(aud2vis, self).__init__()

        # Project the 512-d audio embedding into the 8192-d vision space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 8192))

        # Collapse the (4*2048, 7, 7) vision feature map to one 8192-d vector.
        self.pool = nn.AvgPool2d(kernel_size=7)

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, audio, vision):
        """Alignment score for a single (audio, raw vision map) pair."""
        projected = self.audio_transform(audio)
        pooled = self.pool(vision.view(-1, 4 * 2048, 7, 7)).squeeze()
        return self.align_net(projected * pooled)

    def forward(self, sample, neg1, neg2):
        """Softmax over the scores of [positive, negative 1, negative 2]."""
        anchor = sample['audio']
        scores = [self.forward_once(anchor, option['vision'])
                  for option in (sample, neg1, neg2)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
49 |
50 |
class vis2aud(nn.Module):
    """Vision -> audio retrieval scorer for the baseline embeddings.

    The anchor vision map is pooled once in forward(); each candidate audio
    embedding is projected to 8192-d and scored against the pooled anchor.
    """

    def __init__(self):
        super(vis2aud, self).__init__()

        # Project the 512-d audio embedding into the 8192-d vision space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 8192))

        # Collapse the (4*2048, 7, 7) vision feature map to one 8192-d vector.
        self.pool = nn.AvgPool2d(kernel_size=7)

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, vision, audio):
        """Alignment score for one pair; `vision` must already be pooled."""
        projected = self.audio_transform(audio)
        return self.align_net(projected * vision)

    def forward(self, sample, neg1, neg2):
        """Softmax over the scores of [positive, negative 1, negative 2]."""
        anchor = self.pool(sample['vision'].view(-1, 4 * 2048, 7, 7)).squeeze()
        scores = [self.forward_once(anchor, option['audio'])
                  for option in (sample, neg1, neg2)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
94 |
95 |
class aud2pose(nn.Module):
    """Audio -> pose retrieval scorer for the baseline embeddings.

    Both modalities are mapped to a shared 4096-d space; the anchor audio is
    projected once in forward(), each candidate pose in forward_once().
    """

    def __init__(self):
        super(aud2pose, self).__init__()

        # Map audio (512-d) and pose (139264-d) to the shared 4096-d space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 4096))
        self.pose_transform = nn.Linear(139264, 4096)

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 512), nn.Tanh(),
            nn.Linear(512, 1),
        )

    def forward_once(self, audio, pose):
        """Score one pair; `audio` must already be projected to 4096-d."""
        projected_pose = self.pose_transform(pose)
        return self.align_net(audio * projected_pose)

    def forward(self, sample, neg1, neg2):
        """Softmax over the scores of [positive, negative 1, negative 2]."""
        anchor = self.audio_transform(sample['audio'])
        scores = [self.forward_once(anchor, option['pose'])
                  for option in (sample, neg1, neg2)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
136 |
137 |
class pose2aud(nn.Module):
    """Pose -> audio retrieval scorer for the baseline embeddings.

    Mirror of aud2pose: the anchor pose is projected once in forward(),
    each candidate audio embedding in forward_once().
    """

    def __init__(self):
        super(pose2aud, self).__init__()

        # Map audio (512-d) and pose (139264-d) to the shared 4096-d space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 4096))
        self.pose_transform = nn.Linear(139264, 4096)

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 512), nn.Tanh(),
            nn.Linear(512, 1),
        )

    def forward_once(self, pose, audio):
        """Score one pair; `pose` must already be projected to 4096-d."""
        projected_audio = self.audio_transform(audio)
        return self.align_net(projected_audio * pose)

    def forward(self, sample, neg1, neg2):
        """Softmax over the scores of [positive, negative 1, negative 2]."""
        anchor = self.pose_transform(sample['pose'])
        scores = [self.forward_once(anchor, option['audio'])
                  for option in (sample, neg1, neg2)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
178 |
179 |
class visaud2pose(nn.Module):
    """(Vision + audio) -> pose retrieval scorer for the baseline embeddings.

    Vision and audio are projected to a shared 2048-d space and fused with a
    residual gating MLP; the fused query is then scored against candidate
    pose embeddings.  Scored over 4 options (positive, 2 easy negatives,
    1 hard negative).
    """

    def __init__(self):
        super(visaud2pose, self).__init__()

        # Project each modality to a shared 2048-d space.
        self.audio_transform = nn.Sequential(nn.Linear(512, 2048))
        self.pose_transform = nn.Linear(139264, 2048)
        self.pool = nn.AvgPool2d(kernel_size=7)
        self.vision_transform = nn.Linear(8192, 2048)

        # Residual MLP that refines the fused vision+audio embedding.
        self.fuse = nn.Sequential(
            nn.Linear(2048, 2048), nn.Tanh(),
            nn.Linear(2048, 2048), nn.Tanh(),
            nn.Linear(2048, 2048),
        )

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 512), nn.Tanh(),
            nn.Linear(512, 1),
        )

    def fuse_forward(self, vision, audio):
        """Fuse a (vision, audio) pair into one 2048-d query embedding."""
        pooled = self.pool(vision.view(-1, 4 * 2048, 7, 7)).squeeze()
        vis = self.vision_transform(pooled)
        aud = self.audio_transform(audio)

        # Softmax-gate the vision features by the audio ones, keep an
        # additive audio path, then refine with a residual MLP.
        gated = (F.softmax(aud * vis, dim=-1) * vis) + aud
        return self.fuse(gated) + gated

    def forward_once(self, fuse, pose):
        """Alignment score between a fused query and one pose candidate."""
        projected_pose = self.pose_transform(pose)
        return self.align_net(fuse * projected_pose)

    def forward(self, sample, neg1, neg2, hard_neg):
        """Softmax over [positive, negative 1, negative 2, hard negative]."""
        anchor = self.fuse_forward(sample['vision'], sample['audio'])
        scores = [self.forward_once(anchor, option['pose'])
                  for option in (sample, neg1, neg2, hard_neg)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
245 |
--------------------------------------------------------------------------------
/retrieval/retrieval_datasets.py:
--------------------------------------------------------------------------------
1 | from sklearn.neighbors import NearestNeighbors
2 | import torch
3 | from torch.utils.data import Dataset
4 | import numpy as np
5 |
6 | '''
7 | Dataset used for training single modality -> single modality retrieval (eg. aud2vis, pose2aud, etc.). These retrieval
8 | networks are trained using 3-way multiple choice, with one positive pair and 2 hard negatives. Hard negatives are
9 | sampled using nearest neighbours.
10 | '''
class SingleModalRetrievalDataset(Dataset):
    """Training set for the single-modality retrieval networks
    (aud2vis / vis2aud / aud2pose / pose2aud).

    Each item is a 3-way multiple choice: one positive (matching) sample plus
    two hard negatives mined from the 25 nearest visual neighbours of the
    anchor.  Every returned dict carries all three modality embeddings so the
    same dataset serves every single-modality retrieval variant.
    """

    def __init__(self, vision_embed, audio_embed, pose_embed, vision_trunc=None):
        # Embedding matrices of shape (n, k_modality), stored as .npy files.
        self.vision_embed = np.load(vision_embed)
        self.audio_embed = np.load(audio_embed)
        self.pose_embed = np.load(pose_embed)
        # `vision_trunc` is kept for interface compatibility but unused.

        # kNN index over the vision embeddings, used to mine hard negatives.
        self.neigh = NearestNeighbors(n_neighbors=25)
        self.neigh.fit(self.vision_embed)

    def __len__(self):
        return self.vision_embed.shape[0]

    def __getitem__(self, idx):
        # Anchor (positive) embeddings.
        vision = self.vision_embed[idx, ...]
        audio = self.audio_embed[idx, ...]
        pose = self.pose_embed[idx, ...]

        # Two hard negatives drawn uniformly among the anchor's 25 nearest
        # visual neighbours (neighbour 0 is the anchor itself, so skip it).
        neigh_ind = self.neigh.kneighbors([vision], return_distance=False).squeeze()
        neg_ind = neigh_ind[np.random.randint(low=1, high=25, size=2)]

        sample = {'vision': torch.from_numpy(vision).float(),
                  'audio': torch.from_numpy(audio).float(),
                  'pose': torch.from_numpy(pose).float()}

        # BUG FIX: the original returned an undefined `neg3` (NameError) and
        # its negative dicts had no 'vision' key (breaking aud2vis/vis2aud)
        # while one negative kept the TRUE audio/pose.  Each negative now
        # carries all three modalities of a mismatched sample, and exactly
        # four items are returned — matching the unpacking performed in
        # train_retrieval_networks.train() for the single-modal modes.
        neg1 = {'vision': torch.from_numpy(self.vision_embed[neg_ind[0], ...]).float(),
                'audio': torch.from_numpy(self.audio_embed[neg_ind[0], ...]).float(),
                'pose': torch.from_numpy(self.pose_embed[neg_ind[0], ...]).float()}

        neg2 = {'vision': torch.from_numpy(self.vision_embed[neg_ind[1], ...]).float(),
                'audio': torch.from_numpy(self.audio_embed[neg_ind[1], ...]).float(),
                'pose': torch.from_numpy(self.pose_embed[neg_ind[1], ...]).float()}

        target = 0  # In the output of model(), the first score is for the positive pair

        return sample, neg1, neg2, target
54 |
55 |
56 | '''
57 | Used to train visaud2pose. 4-way multiple choice with one positive pair, two easy negatives, and a hard negative.
58 | Easy negatives are sampled randomly from the dataset while hard negatives are sampled using nearest neighbours.
59 | '''
class MultiModalRetrievalDataset(Dataset):
    """Training set for visaud2pose: 4-way multiple choice with one positive
    pair, two random easy negatives, and one kNN-mined hard negative.
    """

    def __init__(self, vision_embed, audio_embed, pose_embed):
        # Embedding matrices of shape (n, k_modality), stored as .npy files.
        self.vision_embed = np.load(vision_embed)
        self.audio_embed = np.load(audio_embed)
        self.pose_embed = np.load(pose_embed)

        # kNN index over the vision embeddings, used to mine hard negatives.
        self.neigh = NearestNeighbors(n_neighbors=25)
        self.neigh.fit(self.vision_embed)

    def __len__(self):
        return self.vision_embed.shape[0]

    def _entry(self, ind):
        """All three modality embeddings of row `ind` as float tensors."""
        return {'vision': torch.from_numpy(self.vision_embed[ind, ...]).float(),
                'audio': torch.from_numpy(self.audio_embed[ind, ...]).float(),
                'pose': torch.from_numpy(self.pose_embed[ind, ...]).float()}

    def __getitem__(self, idx):
        vision = self.vision_embed[idx, ...]

        # Hard negative: a random pick among the 4th..25th nearest visual
        # neighbours of the anchor; easy negatives: two uniform random rows.
        neigh_ind = self.neigh.kneighbors([vision], return_distance=False).squeeze()
        hard_neg_ind = neigh_ind[np.random.randint(low=3, high=25, size=1)]
        neg_ind = np.random.randint(self.vision_embed.shape[0], size=2)

        sample = self._entry(idx)

        # Easy negatives
        neg1 = self._entry(neg_ind[0])
        neg2 = self._entry(neg_ind[1])

        # Hard negative
        neg3 = self._entry(hard_neg_ind[0])

        target = 0  # In the output of model(), the first score is for the positive pair

        return sample, neg1, neg2, neg3, target
111 |
--------------------------------------------------------------------------------
/retrieval/train_retrieval_networks.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import torch
4 | from torch.utils.data import DataLoader
5 | from torch.nn import CrossEntropyLoss
6 | from torch.optim import Adam
7 | import numpy as np
8 | import tribert_retrieval_networks
9 | import orig_retrieval_networks
10 | from retrieval_datasets import *
11 | from tqdm import tqdm
12 |
13 |
def train(model, criterion, optimizer, dataloader, device, args):
    """Train a retrieval network and checkpoint the best epoch.

    Each batch is a list whose last element is the target tensor and whose
    preceding elements are dicts of modality tensors (positive sample and
    negatives).  The checkpoint of the epoch with the lowest average loss is
    written to ./checkpoints/orig/<exp_name>/<exp_name>.pt, and the full
    per-batch loss curve is saved as a .npy file at the end.

    Returns the trained model.
    """
    ckpt_dir = f'./checkpoints/orig/{args.exp_name}'
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)

    batch_losses = []
    best_loss = 1000000
    for epoch in range(args.epochs):
        running_loss = 0
        num_batches = 0
        for batch_idx, batch in enumerate(dataloader):
            optimizer.zero_grad()

            # Move the target and every tensor inside the sample/negative
            # dicts onto the training device.
            batch[-1] = batch[-1].to(device)
            for data_dict in batch[:-1]:
                for key in data_dict:
                    data_dict[key] = data_dict[key].to(device)

            if args.retrieval_mode == 'visaud2pose':
                # 4-way choice: positive, two easy negatives, one hard negative.
                pos, neg1, neg2, hard_neg, target = batch[0], batch[1], batch[2], batch[3], batch[4]
                scores = model(pos, neg1, neg2, hard_neg)
            else:
                # 3-way choice: positive and two negatives.
                pos, neg1, neg2, target = batch[0], batch[1], batch[2], batch[3]
                scores = model(pos, neg1, neg2)

            # NOTE(review): the networks return softmax probabilities while
            # CrossEntropyLoss expects raw logits (it applies log-softmax
            # itself) — behaviour kept as-is for checkpoint compatibility.
            loss = criterion(scores, target)

            batch_losses.append(loss.item())
            running_loss += loss.item()
            num_batches += 1

            loss.backward()
            optimizer.step()

        epoch_loss = running_loss / num_batches

        # Keep only the checkpoint of the best (lowest mean loss) epoch.
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': best_loss
            }, os.path.join(ckpt_dir, f'{args.exp_name}.pt'))

    # Persist the full loss curve, tagged with the smallest per-batch loss.
    loss_arr = np.asarray(batch_losses)
    smallest_loss = loss_arr.min()
    np.save(os.path.join(ckpt_dir, f'{args.exp_name}_loss_{smallest_loss:.2f}.npy'), loss_arr)

    return model
67 |
68 |
if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Training settings
    parser.add_argument('--exp_name', type=str, help='name of experiment. Will be used to name saved checkpoints')
    parser.add_argument('--embeddings_path', type=str, help='path of folder containing the .npy embedding files')
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=1000)

    # Model settings
    # NOTE(review): this script spells the fused variant 'visaud2pose' while
    # eval_score.py uses 'vis+aud2pose' — keep the two spellings in mind.
    parser.add_argument('--retrieval_mode', type=str, choices=['aud2vis', 'vis2aud', 'aud2pose', 'pose2aud', 'visaud2pose'])
    parser.add_argument('--embedding', type=str, choices=['orig', 'tribert'],
                        help='retrieval using baseline representations or tribert representations')
    args = parser.parse_args()

    '''
    Paths to vision, audio, and pose embeddings, saved as a numpy array in a .npy file. Each of these are of shape
    (n, k), where n is the number of embeddings and k the dimensionality of each embedding.

    NOTE: Be sure to use the correct type of embeddings (either baseline or tribert)
    '''
    VIS_EMBED_PATH = os.path.join(args.embeddings_path, 'train_vision.npy')
    AUD_EMBED_PATH = os.path.join(args.embeddings_path, 'train_audio.npy')
    POS_EMBED_PATH = os.path.join(args.embeddings_path, 'train_pose.npy')

    # NOTE(review): device is hard-coded to 'cuda'; training fails on CPU-only hosts.
    device = 'cuda'

    # Fix RNG state for reproducible negative sampling and weight init.
    np.random.seed(1)
    torch.manual_seed(1)

    # Load dataset and loader
    if args.retrieval_mode == 'visaud2pose':
        train_dataset = MultiModalRetrievalDataset(vision_embed=VIS_EMBED_PATH,
                                                   audio_embed=AUD_EMBED_PATH,
                                                   pose_embed=POS_EMBED_PATH)
    else:

        train_dataset = SingleModalRetrievalDataset(vision_embed=VIS_EMBED_PATH,
                                                    audio_embed=AUD_EMBED_PATH,
                                                    pose_embed=POS_EMBED_PATH)

    # NOTE(review): shuffle=False keeps the sample order identical each epoch.
    dataloader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=False)

    # Load model: pick the network class matching embedding type and variant.
    if args.embedding == 'tribert':
        if args.retrieval_mode == 'aud2vis':
            model = tribert_retrieval_networks.aud2vis()
        elif args.retrieval_mode == 'vis2aud':
            model = tribert_retrieval_networks.vis2aud()
        elif args.retrieval_mode == 'aud2pose':
            model = tribert_retrieval_networks.aud2pose()
        elif args.retrieval_mode == 'pose2aud':
            model = tribert_retrieval_networks.pose2aud()
        else:
            model = tribert_retrieval_networks.visaud2pose()

    else:
        if args.retrieval_mode == 'aud2vis':
            model = orig_retrieval_networks.aud2vis()
        elif args.retrieval_mode == 'vis2aud':
            model = orig_retrieval_networks.vis2aud()
        elif args.retrieval_mode == 'aud2pose':
            model = orig_retrieval_networks.aud2pose()
        elif args.retrieval_mode == 'pose2aud':
            model = orig_retrieval_networks.pose2aud()
        else:
            model = orig_retrieval_networks.visaud2pose()

    model = model.to(device)

    # Train
    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=args.lr)

    model = train(model, criterion, optimizer, dataloader, device, args)
145 |
--------------------------------------------------------------------------------
/retrieval/tribert_retrieval_networks.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
class aud2vis(nn.Module):
    """Audio -> vision retrieval scorer for TriBERT embeddings.

    Lifts the 4096-d audio embedding into the 8192-d vision space and scores
    the element-wise product of the two embeddings with a small MLP.
    """

    def __init__(self):
        super(aud2vis, self).__init__()

        # Project the 4096-d audio embedding into the 8192-d vision space.
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, audio, vision):
        """Alignment score for a single (audio, vision) pair."""
        projected = self.audio_transform(audio)
        return self.align_net(projected * vision)

    def forward(self, sample, neg1, neg2):
        """Softmax over the scores of [positive, negative 1, negative 2]."""
        anchor = sample['audio']
        scores = [self.forward_once(anchor, option['vision'])
                  for option in (sample, neg1, neg2)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
45 |
46 |
class vis2aud(nn.Module):
    """Vision -> audio retrieval scorer for TriBERT embeddings.

    Each candidate audio embedding (4096-d) is projected into the 8192-d
    vision space and scored against the anchor vision embedding.
    """

    def __init__(self):
        super(vis2aud, self).__init__()

        # Project the 4096-d audio embedding into the 8192-d vision space.
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, vision, audio):
        """Alignment score for a single (vision, audio) pair."""
        projected = self.audio_transform(audio)
        return self.align_net(projected * vision)

    def forward(self, sample, neg1, neg2):
        """Softmax over the scores of [positive, negative 1, negative 2]."""
        anchor = sample['vision']
        scores = [self.forward_once(anchor, option['audio'])
                  for option in (sample, neg1, neg2)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
86 |
87 |
class aud2pose(nn.Module):
    """Audio -> pose retrieval scorer for TriBERT embeddings.

    Lifts the 4096-d audio embedding to 8192-d and scores it against each
    candidate pose embedding.
    """

    def __init__(self):
        super(aud2pose, self).__init__()

        # Project the 4096-d audio embedding into the 8192-d pose space.
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, audio, pose):
        """Alignment score for a single (audio, pose) pair."""
        projected = self.audio_transform(audio)
        return self.align_net(projected * pose)

    def forward(self, sample, neg1, neg2):
        """Softmax over the scores of [positive, negative 1, negative 2]."""
        anchor = sample['audio']
        scores = [self.forward_once(anchor, option['pose'])
                  for option in (sample, neg1, neg2)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
127 |
128 |
class pose2aud(nn.Module):
    """Pose -> audio retrieval scorer for TriBERT embeddings.

    Each candidate audio embedding (4096-d) is projected to 8192-d and
    scored against the anchor pose embedding.
    """

    def __init__(self):
        super(pose2aud, self).__init__()

        # Project the 4096-d audio embedding into the 8192-d pose space.
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def forward_once(self, pose, audio):
        """Alignment score for a single (pose, audio) pair."""
        projected = self.audio_transform(audio)
        return self.align_net(projected * pose)

    def forward(self, sample, neg1, neg2):
        """Softmax over the scores of [positive, negative 1, negative 2]."""
        anchor = sample['pose']
        scores = [self.forward_once(anchor, option['audio'])
                  for option in (sample, neg1, neg2)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
168 |
169 |
class visaud2pose(nn.Module):
    """(Vision + audio) -> pose retrieval scorer for TriBERT embeddings.

    Audio is lifted to the 8192-d vision space, the two modalities are fused
    with a softmax-gated residual scheme, and the fused query is scored
    against candidate pose embeddings over 4 options (positive, two easy
    negatives, one hard negative).
    """

    def __init__(self):
        super(visaud2pose, self).__init__()

        # Project the 4096-d audio embedding into the 8192-d vision space.
        self.audio_transform = nn.Sequential(nn.Linear(4096, 8192))

        # Residual MLP that refines the fused vision+audio embedding.
        self.fuse = nn.Sequential(
            nn.Linear(8192, 8192), nn.Tanh(),
            nn.Linear(8192, 8192), nn.Tanh(),
            nn.Linear(8192, 8192),
        )

        # MLP producing a scalar alignment score from the fused embedding.
        self.align_net = nn.Sequential(
            nn.Linear(8192, 4096), nn.Tanh(),
            nn.Linear(4096, 2048), nn.Tanh(),
            nn.Linear(2048, 1024), nn.Tanh(),
            nn.Linear(1024, 1),
        )

    def fuse_forward(self, vision, audio):
        """Fuse a (vision, audio) pair into one 8192-d query embedding."""
        # Reshape both 8192-d vectors to (B, 4, 2, 1024) so the softmax gate
        # below operates over the trailing 1024-d slices.
        aud = self.audio_transform(audio).view(-1, 4, 2, 1024)
        vis = vision.view(-1, 4, 2, 1024)

        # Softmax-gate the vision features by the audio ones, keep an
        # additive audio path, then refine with a residual MLP.
        gated = (F.softmax(aud * vis, dim=-1) * vis) + aud
        gated = gated.view(-1, 4 * 2 * 1024)
        return self.fuse(gated) + gated

    def forward_once(self, fuse, pose):
        """Alignment score between a fused query and one pose candidate."""
        return self.align_net(fuse * pose)

    def forward(self, sample, neg1, neg2, hard_neg):
        """Softmax over [positive, negative 1, negative 2, hard negative]."""
        anchor = self.fuse_forward(sample['vision'], sample['audio'])
        scores = [self.forward_once(anchor, option['pose'])
                  for option in (sample, neg1, neg2, hard_neg)]
        return F.softmax(torch.cat(scores, dim=1), dim=1)
--------------------------------------------------------------------------------
/tribert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__init__.py
--------------------------------------------------------------------------------
/tribert/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/__pycache__/audio_net.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__pycache__/audio_net.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/__pycache__/criterion.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__pycache__/criterion.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/__pycache__/tribert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__pycache__/tribert.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/__pycache__/vilbert.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/__pycache__/vilbert.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/datasets/__pycache__/base.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/datasets/__pycache__/base.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/datasets/__pycache__/music_multimodal.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/datasets/__pycache__/music_multimodal.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/datasets/__pycache__/video_transforms.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/datasets/__pycache__/video_transforms.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/datasets/base.py:
--------------------------------------------------------------------------------
1 | import random
2 | import csv
3 | import numpy as np
4 | import torch
5 | import torch.utils.data as torchdata
6 | from torchvision import transforms
7 | import torchaudio
8 | import librosa
9 | from PIL import Image
10 | import os
11 |
12 | from . import video_transforms as vtransforms
13 |
14 |
class BaseDataset(torchdata.Dataset):
    """Base class for the audio-visual datasets.

    Loads video frames (with split-dependent transform pipelines) and audio
    clips, and converts audio into STFT magnitude/phase spectrograms.

    Args:
        list_sample: either a python list of sample ids, or a path to a csv
            file whose first column holds the ids.
        opt: options namespace carrying frame/audio/STFT hyper-parameters.
        max_sample: if > 0, truncate the sample list to this many entries.
        split: 'train' enables augmentation and dataset duplication.
    """

    def __init__(self, list_sample, opt, max_sample=-1, split='train'):
        # frame/audio params
        self.num_frames = opt.num_frames
        self.stride_frames = opt.stride_frames
        self.frameRate = opt.frameRate
        self.imgSize = opt.imgSize
        self.audRate = opt.audRate
        self.audLen = opt.audLen
        # audio clip duration in seconds
        self.audSec = 1. * self.audLen / self.audRate
        self.binary_mask = opt.binary_mask

        # STFT params
        self.log_freq = opt.log_freq
        self.stft_frame = opt.stft_frame
        self.stft_hop = opt.stft_hop
        self.HS = opt.stft_frame // 2 + 1           # spectrogram height (freq bins)
        self.WS = (self.audLen + 1) // self.stft_hop  # spectrogram width (time bins)

        self.split = split
        self.seed = opt.seed
        random.seed(self.seed)

        # initialize video transform
        self._init_vtransform()

        # list_sample can be a python list or a csv file of list
        if isinstance(list_sample, str):
            self.list_sample = []
            for row in csv.reader(open(list_sample, 'r'), delimiter=','):
                if len(row) < 1:
                    continue
                self.list_sample.append(row[0])
        elif isinstance(list_sample, list):
            self.list_sample = list_sample
        else:
            # BUGFIX: the original ``raise('Error list_sample!')`` raised a
            # plain string, which in Python 3 itself fails with an
            # uninformative "exceptions must derive from BaseException"
            # TypeError. Raise an explicit TypeError with the message.
            raise TypeError('Error list_sample!')

        if self.split == 'train':
            # duplicate the training list so one "epoch" sees more samples
            self.list_sample *= opt.dup_trainset
            random.shuffle(self.list_sample)

        if max_sample > 0:
            self.list_sample = self.list_sample[0:max_sample]

        num_sample = len(self.list_sample)
        self.num_dataset = num_sample
        assert num_sample > 0
        print('# samples: {}'.format(num_sample))

    def __len__(self):
        return len(self.list_sample)

    # video transform funcs
    def _init_vtransform(self):
        """Build the split-dependent video transform (list of PIL -> stacked tensor)."""
        transform_list = []
        # ImageNet normalization statistics
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]

        if self.split == 'train':
            # random crop + horizontal flip augmentation for training
            transform_list.append(vtransforms.Resize(int(self.imgSize * 1.1), Image.BICUBIC))
            transform_list.append(vtransforms.RandomCrop(self.imgSize))
            transform_list.append(vtransforms.RandomHorizontalFlip())
        else:
            transform_list.append(vtransforms.Resize(self.imgSize, Image.BICUBIC))
            transform_list.append(vtransforms.CenterCrop(self.imgSize))

        transform_list.append(vtransforms.ToTensor())
        transform_list.append(vtransforms.Normalize(mean, std))
        transform_list.append(vtransforms.Stack())
        self.vid_transform = transforms.Compose(transform_list)

    # image transform funcs, deprecated
    def _init_transform(self):
        """Single-image transform pipeline (deprecated, kept for compatibility)."""
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]

        if self.split == 'train':
            self.img_transform = transforms.Compose([
                transforms.Scale(int(self.imgSize * 1.2)),
                transforms.RandomCrop(self.imgSize),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)])
        else:
            self.img_transform = transforms.Compose([
                transforms.Scale(self.imgSize),
                transforms.CenterCrop(self.imgSize),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)])

    def _load_frames(self, paths):
        """Load frames at ``paths``; return (transformed tensor, original PIL sizes).

        The original (width, height) of each frame is recorded before the
        transform pipeline resizes it, so callers can rescale bounding boxes.
        """
        frames = [self._load_frame(path) for path in paths]
        original_img_size = [frame.size for frame in frames]
        frames = self.vid_transform(frames)
        return frames, original_img_size

    def _load_frame(self, path):
        """Open a single image file as an RGB PIL image."""
        return Image.open(path).convert('RGB')

    def _stft(self, audio):
        """Return (magnitude, phase) torch tensors of the STFT of ``audio``."""
        spec = librosa.stft(
            audio, n_fft=self.stft_frame, hop_length=self.stft_hop)
        amp = np.abs(spec)
        phase = np.angle(spec)
        return torch.from_numpy(amp), torch.from_numpy(phase)

    def _load_audio_file(self, path):
        """Read an audio file; return (float32 mono waveform, sample rate)."""
        if path.endswith('.mp3'):
            audio_raw, rate = torchaudio.load(path)
            audio_raw = audio_raw.numpy().astype(np.float32)

            # scale raw 32-bit integer range to [-1, 1]
            audio_raw *= (2.0**-31)

            # convert to mono
            # NOTE(review): channels are indexed on axis 1, i.e. this assumes
            # a (samples, channels) layout — verify against the installed
            # torchaudio version, which may return (channels, samples).
            if audio_raw.shape[1] == 2:
                audio_raw = (audio_raw[:, 0] + audio_raw[:, 1]) / 2
            else:
                audio_raw = audio_raw[:, 0]
        else:
            audio_raw, rate = librosa.load(path, sr=None, mono=True)

        return audio_raw, rate

    def _load_audio(self, path, center_timestamp, nearest_resample=False):
        """Load a fixed-length window of audio centered at ``center_timestamp``.

        Returns a float32 array of exactly ``self.audLen`` samples at
        ``self.audRate``; shorter clips are tiled, higher-rate clips are
        resampled, and (in training) the volume is randomly scaled.
        """
        audio = np.zeros(self.audLen, dtype=np.float32)

        # silent
        if path.endswith('silent'):
            return audio

        # load audio
        audio_raw, rate = self._load_audio_file(path)

        # repeat if audio is too short
        if audio_raw.shape[0] < rate * self.audSec:
            n = int(rate * self.audSec / audio_raw.shape[0]) + 1
            audio_raw = np.tile(audio_raw, n)

        # resample down to self.audRate
        if rate > self.audRate:
            if nearest_resample:
                # cheap decimation: keep every (rate // audRate)-th sample
                audio_raw = audio_raw[::rate//self.audRate]
            else:
                audio_raw = librosa.resample(audio_raw, rate, self.audRate)

        # crop N seconds around the center timestamp, zero-padded at edges
        len_raw = audio_raw.shape[0]
        center = int(center_timestamp * self.audRate)
        start = max(0, center - self.audLen // 2)
        end = min(len_raw, center + self.audLen // 2)

        audio[self.audLen//2-(center-start): self.audLen//2+(end-center)] = \
            audio_raw[start:end]

        # randomize volume
        if self.split == 'train':
            scale = random.random() + 0.5  # 0.5-1.5
            audio *= scale
            # clip to the valid waveform range after scaling
            audio[audio > 1.] = 1.
            audio[audio < -1.] = -1.

        return audio

    def _mix_n_and_stft(self, audios):
        """Mix N waveforms (in place, each divided by N) and compute STFTs.

        Returns (mixture magnitude, per-source magnitudes, mixture phase);
        also converts each entry of ``audios`` to a torch tensor in place.
        """
        N = len(audios)
        mags = [None for n in range(N)]

        # mix: average so the sum stays in range
        for n in range(N):
            audios[n] /= N
        audio_mix = np.asarray(audios).sum(axis=0)

        # STFT
        amp_mix, phase_mix = self._stft(audio_mix)
        for n in range(N):
            ampN, _ = self._stft(audios[n])
            mags[n] = ampN.unsqueeze(0)

        # to tensor
        for n in range(N):
            audios[n] = torch.from_numpy(audios[n])

        return amp_mix.unsqueeze(0), mags, phase_mix.unsqueeze(0)

    def dummy_mix_data(self, N):
        """Return all-zero placeholders shaped like one mixed sample of N sources.

        Used as a fallback when loading a real sample fails.
        """
        frames = [None for n in range(N)]
        audios = [None for n in range(N)]
        mags = [None for n in range(N)]

        amp_mix = torch.zeros(1, self.HS, self.WS)
        phase_mix = torch.zeros(1, self.HS, self.WS)

        for n in range(N):
            frames[n] = torch.zeros(
                3, self.num_frames, self.imgSize, self.imgSize)
            audios[n] = torch.zeros(self.audLen)
            mags[n] = torch.zeros(1, self.HS, self.WS)

        return amp_mix, mags, frames, audios, phase_mix

    def check_video_frames_exists(self, frame_root, index):
        """Return the index of the first sample at/after ``index`` whose frame
        directory is non-empty, wrapping around the sample list.

        BUGFIX: the original recursed with ``index + 1`` and no bounds check,
        so a trailing run of empty directories raised IndexError (and a fully
        empty dataset could exhaust the recursion limit). Scan iteratively
        with wrap-around instead, and fail loudly if every directory is empty.
        """
        num_sample = len(self.list_sample)
        for step in range(num_sample):
            cur = (index + step) % num_sample
            frame_path = os.path.join(frame_root, self.list_sample[cur])
            if len(os.listdir(frame_path)) > 0:
                return cur
        raise RuntimeError('No sample with extracted frames found under {}'.format(frame_root))
233 |
--------------------------------------------------------------------------------
/tribert/datasets/music_multimodal.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | from .base import BaseDataset
4 | import pickle
5 | import h5py
6 | import torch
7 | import numpy as np
8 | import json
9 |
class InputFeatures(object):
    """Plain container bundling one example's multimodal features.

    Holds image, pose, and audio features together with their masks,
    locations, and labels; every field defaults to None.
    """

    def __init__(self,
                 image_feat=None,
                 image_label=None,
                 image_mask=None,
                 pose_feat=None,
                 pose_loc=None,
                 pose_label=None,
                 pose_mask=None,
                 audio_feat=None,
                 audio_label=None,
                 audio_target_label=None):
        # Copy each keyword argument straight onto the instance under the
        # same name; no validation or transformation is performed.
        fields = dict(
            image_feat=image_feat,
            image_label=image_label,
            image_mask=image_mask,
            pose_feat=pose_feat,
            pose_loc=pose_loc,
            pose_label=pose_label,
            pose_mask=pose_mask,
            audio_feat=audio_feat,
            audio_label=audio_label,
            audio_target_label=audio_target_label,
        )
        for name, value in fields.items():
            setattr(self, name, value)
33 |
class MUSICMixMultimodalDataset(BaseDataset):
    """MUSIC21 dataset yielding mixed multimodal (frames, pose, audio) samples.

    Each item mixes ``opt.num_mix`` videos: it loads their frames, GCN pose
    features, AlphaPose bounding boxes, and audio, builds masked BERT-style
    features, and returns a dict of model-ready tensors.
    """

    def __init__(self, list_sample, opt, **kwargs):
        super(MUSICMixMultimodalDataset, self).__init__(
            list_sample, opt, **kwargs)
        self.fps = opt.frameRate
        self.num_mix = opt.num_mix
        # max number of pose "boxes" (people) kept per frame
        self.num_seq = 2
        # NOTE(review): hard-coded cluster paths — only usable on the
        # original UBC machines; consider making these configurable.
        self.root_path = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_bert_features/music_dataset"
        self.frame_root = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_all_frames"
        self.audio_root = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_bert_features/audio_feat"
        self.pose_root = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_bert_features/pose_feat_split_frame"
        self.pose_bbox_path = "/ubc/cs/research/shield/datasets/MUSIC21_dataset/preprocessed_data/music21_bert_features/alphapose_bbox_json"

        #audio classification
        # instrument name -> 1-based class id (21 classes total)
        gt_lable_dict = {'bagpipe': 1, 'clarinet': 2, 'flute': 3, 'drum': 4, 'acoustic_guitar':5,'ukulele': 6, 'accordion': 7, 'bassoon': 8, 'guzheng': 9, 'xylophone': 10, 'erhu': 11, 'tuba':12, 'congas': 13, 'saxophone': 14, 'cello': 15, 'violin': 16, 'electric_bass': 17, 'piano':18,'banjo': 19, 'trumpet': 20, 'pipa': 21}
        solo_file_path = os.path.join(self.root_path, "MUSIC21_solo_videos.json")
        duet_file_path = os.path.join(self.root_path,"MUSIC_duet_videos.json")
        dataDict_solo = None
        dataDict_duet = None

        with open(solo_file_path,'r') as load_f:
            dataDict_solo = json.load(load_f)
        with open(duet_file_path,'r') as load_f:
            dataDict_duet = json.load(load_f)

        # video id -> list of class ids (one for solos, two for duets)
        self.all_vid_label = {}

        for key, value in dataDict_solo["videos"].items():
            for item in value:
                self.all_vid_label[item] = [gt_lable_dict[key]]

        for key, value in dataDict_duet["videos"].items():
            for item in value:
                # duet keys are "instrumentA instrumentB"
                self.all_vid_label[item] = [gt_lable_dict[key.split(' ')[0]], gt_lable_dict[key.split(' ')[1]]]


    def random_region(self,image_feat, num_boxes):
        """Randomly mask regions of ``image_feat`` (BERT-style masked modeling).

        Mutates ``image_feat`` in place and returns it together with a
        (num_frames, num_boxes) label array: 1 = masked (predict this
        region), -1 = untouched (ignore in the loss).
        """
        output_label = np.zeros([self.num_frames, num_boxes])
        for w in range(self.num_frames):
            for i in range(num_boxes):
                prob = random.random()
                # mask token with 15% probability
                if prob < 0.15 :
                    prob /= 0.15
                    # 80% randomly change token to mask token
                    if prob < 0.9:
                        image_feat[w,i] = 0
                    # -> rest 10% randomly keep current token
                    output_label[w,i] = 1
                else:
                    output_label[w,i] = -1
        return image_feat, output_label

    def convert_example_to_features(self, image_feat, pose_feat, pose_loc, num_seq, audio_feat, label, target_label, num_frames):
        """Apply random region masking to image/pose features and bundle
        everything into an InputFeatures instance (with attention masks).
        """
        N = self.num_mix
        num_boxes = num_seq
        image_mask = []
        pose_mask = []
        image_label = [[] for n in range(N)]
        pose_label = [[] for n in range(N)]
        for n in range(N):
            image_feat[n], image_label[n] = self.random_region(image_feat[n], num_boxes)
            pose_feat[n], pose_label[n] = self.random_region(pose_feat[n], num_boxes)
            # all-ones attention masks, one row per frame
            image_mask_tmp = [[1] * (num_boxes)] * self.num_frames
            pose_mask_tmp = [[1] * (num_boxes)] * self.num_frames

            #Zero-pad up to the visual sequence length.
            # NOTE(review): this loop never executes — the masks above are
            # built with exactly self.num_frames rows already.
            while len(image_mask_tmp) < self.num_frames:
                image_mask_tmp.append(0)
                image_label[n].append(-1)
                pose_mask_tmp.append(0)
                pose_label[n].append(-1)

            assert len(image_mask_tmp) == self.num_frames
            assert len(pose_mask_tmp) == self.num_frames
            image_mask.append(image_mask_tmp)
            pose_mask.append(pose_mask_tmp)

        features = InputFeatures(
            image_feat=image_feat,
            image_label=np.array(image_label),
            image_mask = np.array(image_mask),
            pose_feat = pose_feat,
            pose_loc = pose_loc,
            pose_label = np.array(pose_label),
            pose_mask = np.array(pose_mask),
            audio_feat = audio_feat,
            audio_label = label,
            audio_target_label = target_label)

        return features

    def __getitem__(self, index):
        """Build one mixed sample of ``self.num_mix`` videos.

        Returns a dict with mixed audio spectrogram, per-video frames,
        pose features/locations/masks, masking labels, and class labels;
        validation/test items additionally carry raw audio, mixture phase,
        and video ids.
        """
        N = self.num_mix
        frames = [None for n in range(N)]
        pose_features = [None for n in range(N)]
        audios = [None for n in range(N)]
        infos = [[] for n in range(N)]
        path_frames = [[] for n in range(N)]
        path_audios = ['' for n in range(N)]
        center_frames = [0 for n in range(N)]
        # (video, frame, person, [x1, y1, x2, y2, area]) pose boxes
        pose_location = np.zeros((N, self.num_frames, self.num_seq, 5), dtype=np.float32)
        final_pose_location = [None for n in range(N)]
        original_img_size = [None for n in range(N)]
        target_label = [[] for n in range(N)]
        # multi-hot class labels over the 21 instruments
        label = torch.zeros(N, 21)

        # the first video
        index = self.check_video_frames_exists(self.frame_root, index)
        infos[0] = self.list_sample[index]

        # sample other videos
        # deterministic sampling outside training so val/test are reproducible
        if not self.split == 'train':
            random.seed(index)
        for n in range(1, N):
            indexN = random.randint(0, len(self.list_sample)-1)
            indexN = self.check_video_frames_exists(self.frame_root, indexN)
            infos[n] = self.list_sample[indexN]

        # select frames
        idx_margin = max(
            int(self.fps * 8), (self.num_frames // 2) * self.stride_frames)

        for n, infoN in enumerate(infos):
            #audio label for classification
            target = self.all_vid_label[infoN]
            for j in range(len(target)):
                label[n][target[j]-1]=1
            #process for batch compatible
            # pad solo labels with -1 so every video has num_seq entries
            if len(target) < self.num_seq:
                target.append(-1)
            target_label[n] = target

            #load pose feat (generated by GCN)
            pose_feat_path = os.path.join(self.pose_root, infoN)
            pose_json = os.path.join(self.pose_bbox_path, infoN+".json")
            with open(pose_json, "rb") as f:
                pose_data = json.load(f)

            path_frameN = os.path.join(self.frame_root,infoN)
            path_audioN = os.path.join(self.audio_root,infoN+".wav")
            count_framesN = len(os.listdir(pose_feat_path))

            if self.split == 'train':
                # random, not to sample start and end n-frames
                if idx_margin+1 < int(count_framesN)-idx_margin:
                    center_frameN = random.randint(
                        idx_margin+1, int(count_framesN)-idx_margin)
                else:
                    center_frameN = int(count_framesN) // 2
            else:
                center_frameN = int(count_framesN) // 2
            center_frames[n] = center_frameN

            # per-frame pose features: (frames, persons, 256, 68)
            vid_pose_feat = torch.zeros(self.num_frames, 2, 256, 68)
            # absolute frame/audio paths
            for i in range(self.num_frames):
                idx_offset = i
                # frame image files are 1-based, pose files 0-based
                path_frames[n].append(
                    os.path.join(
                        path_frameN,
                        '{:06d}.jpg'.format(center_frameN + idx_offset + 1)))

                #load pose features
                try:
                    vid_pose_feat[i] = torch.from_numpy(np.load(os.path.join(pose_feat_path, '{:06d}.npy'.format(center_frameN + idx_offset)), allow_pickle=True))
                except Exception as e:
                    # missing/corrupt pose file: leave this frame's slot zeroed
                    print("error in "+infoN)


                #load pose location (pose bbox)
                if '{:06d}.jpg'.format(center_frameN + idx_offset) in pose_data:
                    bbox_list = pose_data['{:06d}.jpg'.format(center_frameN + idx_offset)]
                    # keep the num_seq highest-confidence detections
                    bbox_list.sort(key = lambda x: x[4], reverse=True)
                    for s, box in enumerate(bbox_list):
                        if s < self.num_seq:
                            pose_location[n, i, s,:4] = box[:4]
            pose_features[n] = vid_pose_feat
            path_audios[n] = path_audioN

        # load frames and audios, STFT
        try:
            for n, infoN in enumerate(infos):
                frames[n], original_img_size[n] = self._load_frames(path_frames[n])
                # jitter audio
                # center_timeN = (center_frames[n] - random.random()) / self.fps
                center_timeN = (center_frames[n] - 0.5) / self.fps
                audios[n] = self._load_audio(path_audios[n], center_timeN)
            mag_mix, mags, phase_mix = self._mix_n_and_stft(audios)

        except Exception as e:
            print('Failed loading frame/audio: {}'.format(e))
            # create dummy data
            mag_mix, mags, frames, audios, phase_mix = \
                self.dummy_mix_data(N)

        self.image_size = 224
        for n, infoN in enumerate(infos):
            for i in range(self.num_frames):
                #rescale bbox which we get from alphapose
                # boxes were detected on the original resolution; map them
                # onto the 224x224 transformed frames
                if not original_img_size[n] is None:
                    x_ = original_img_size[n][i][0]
                    y_ = original_img_size[n][i][1]
                    x_scale = self.image_size/x_
                    y_scale = self.image_size/y_
                else:
                    x_scale = 1
                    y_scale = 1
                for s in range(self.num_seq):
                    pose_location[n,i,s,0] = pose_location[n,i,s,0] * x_scale
                    pose_location[n,i,s,1] = pose_location[n,i,s,1] * y_scale
                    pose_location[n,i,s,2] = pose_location[n,i,s,2] * x_scale
                    pose_location[n,i,s,3] = pose_location[n,i,s,3] * y_scale
                    # convert (x, y, w, h) to (x1, y1, x2, y2)
                    pose_location[n,i,s,2] = pose_location[n,i,s,2] + pose_location[n,i,s,0]
                    pose_location[n,i,s,3] = pose_location[n,i,s,3] + pose_location[n,i,s,1]

                #process pose location as vilbert
                # 5th channel = normalized box area; coords scaled to [0, 1]
                pose_location[n,i,:,4] = (pose_location[n,i,:,3] - pose_location[n,i,:,1]) * (pose_location[n,i,:,2] - pose_location[n,i,:,0]) / (float(self.image_size) * float(self.image_size))
                pose_location[n,i,:,0] = pose_location[n,i,:,0] / float(self.image_size)
                pose_location[n,i,:,1] = pose_location[n,i,:,1] / float(self.image_size)
                pose_location[n,i,:,2] = pose_location[n,i,:,2] / float(self.image_size)
                pose_location[n,i,:,3] = pose_location[n,i,:,3] / float(self.image_size)

            final_pose_location[n] = pose_location[n]

        ###start of bert dataloader
        cur_features = self.convert_example_to_features(frames, pose_features, final_pose_location, self.num_seq, mag_mix, label, target_label, self.num_frames)
        cur_tensors = (cur_features.image_feat,
                       cur_features.image_label,
                       cur_features.image_mask,
                       cur_features.pose_feat,
                       cur_features.pose_loc,
                       cur_features.pose_label,
                       cur_features.pose_mask,
                       cur_features.audio_feat,
                       cur_features.audio_label,
                       cur_features.audio_target_label)
        image_feat, image_label,image_mask, pose_feat, pose_loc,pose_label, pose_mask, audio_feat, audio_label, audio_target_label = cur_tensors

        image_feat_final = [None for n in range(N)] #torch.zeros((N,self.num_frames+1, 3, self.image_size,self.image_size))
        image_mask_final = [None for n in range(N)] #torch.zeros((N,self.num_frames+1, self.num_seq))
        pose_feat_final = [None for n in range(N)] #torch.zeros((N, self.num_frames+1, self.num_seq, 256, 68))
        pose_mask_final = [None for n in range(N)] #torch.zeros((N,self.num_frames+1, self.num_seq))
        pose_loc_final = [None for n in range(N)] #torch.zeros((N,self.num_frames+1, self.num_seq,5))

        # prepend a global (mean-pooled) feature token, ViLBERT-style
        for n in range(N):
            #batch_size = image_feat[n].shape[0]
            image_mask_tmp = image_mask[n]
            image_feat_tmp = image_feat[n]
            if len(image_mask_tmp.shape) < 2:
                image_mask_tmp = image_mask_tmp.reshape(1,image_mask_tmp.shape[0])
            g_image_feat = np.sum(image_feat_tmp.numpy(), axis=1) / np.sum(image_mask_tmp) #, axis=1, keepdims=True)
            image_feat_tmp = np.concatenate([np.expand_dims(g_image_feat, axis=1), image_feat_tmp.numpy()], axis=1)
            image_feat_tmp = np.array(image_feat_tmp, dtype=np.float32)
            g_image_mask = np.repeat(np.array([[1, 1]]), 1, axis=0)
            image_mask_tmp = np.concatenate([g_image_mask, image_mask_tmp], axis=0)
            image_feat_final[n]= torch.from_numpy(image_feat_tmp)
            image_mask_final[n] = torch.from_numpy(image_mask_tmp)

            pose_mask_tmp = pose_mask[n]
            pose_feat_tmp = pose_feat[n]
            pose_loc_tmp = pose_loc[n]
            if len(pose_mask_tmp.shape) < 2:
                pose_mask_tmp = pose_mask_tmp.reshape(1,pose_mask_tmp.shape[0])
            g_pose_feat = np.sum(pose_feat_tmp.numpy(), axis=0) / np.sum(pose_mask_tmp) #, axis=1, keepdims=True)
            pose_feat_tmp = np.concatenate([np.expand_dims(g_pose_feat, axis=0), pose_feat_tmp.numpy()], axis=0)
            pose_feat_tmp = np.array(pose_feat_tmp, dtype=np.float32)
            # global token location covers the whole image with area 1
            g_pose_loc = np.repeat(np.array([[0,0,1,1,1]], dtype=np.float32), 2, axis=0)
            pose_loc_tmp = np.concatenate([np.expand_dims(g_pose_loc, axis=0), pose_loc_tmp], axis=0)
            pose_loc_tmp = np.array(pose_loc_tmp, dtype=np.float32)
            g_pose_mask = np.repeat(np.array([[1,1]]), 1, axis=0)
            pose_mask_tmp = np.concatenate([g_pose_mask, pose_mask_tmp], axis=0)
            pose_feat_final[n] = torch.from_numpy(pose_feat_tmp)
            pose_mask_final[n] = torch.from_numpy(pose_mask_tmp)
            pose_loc_final[n] = torch.from_numpy(pose_loc_tmp)

        ###end of bert dataloader

        #ret_dict = {'mag_mix': mag_mix, 'frames': frames, 'mags': mags, 'pose_feat': pose_features, 'pose_loc': final_pose_location, 'label': label, 'target_label': target_label}

        ret_dict = {'mag_mix': audio_feat, 'frames': image_feat_final, 'image_label': image_label, 'image_mask': image_mask_final, 'mags': mags, 'pose_feat': pose_feat_final, 'pose_loc': pose_loc_final, 'pose_mask': pose_mask_final,'pose_label':pose_label, 'label': label, 'target_label': target_label}

        if self.split != 'train':
            # extra outputs needed for evaluation/visualization only
            ret_dict['audios'] = audios
            ret_dict['phase_mix'] = phase_mix
            ret_dict['infos'] = infos

        return ret_dict
322 |
--------------------------------------------------------------------------------
/tribert/datasets/video_transforms.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numbers
3 | import torchvision.transforms.functional as F
4 | from PIL import Image
5 | import torch
6 |
7 |
class Resize(object):
    """Rescale every frame of a clip to the same target size."""

    def __init__(self, size, interpolation=Image.BILINEAR):
        self.size = size
        self.interpolation = interpolation

    def __call__(self, frames):
        """
        Args:
            frames: a list of PIL Image
        Returns:
            a list of PIL Image: Rescaled images.
        """
        return [F.resize(frame, self.size, self.interpolation)
                for frame in frames]
24 |
25 |
class CenterCrop(object):
    """Crop the center region of every frame in a clip."""

    def __init__(self, size):
        # A bare number means a square crop.
        if isinstance(size, numbers.Number):
            size = (int(size), int(size))
        self.size = size

    def __call__(self, frames):
        """
        Args:
            frames: a list of PIL Image
        Returns:
            a list of PIL Image: Cropped images.
        """
        return [F.center_crop(frame, self.size) for frame in frames]
44 |
45 |
class RandomCrop(object):
    """Crop every frame of a clip at the same randomly chosen location."""

    def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode='constant'):
        # A bare number means a square crop.
        if isinstance(size, numbers.Number):
            size = (int(size), int(size))
        self.size = size
        self.padding = padding
        self.pad_if_needed = pad_if_needed
        self.fill = fill
        self.padding_mode = padding_mode

    @staticmethod
    def get_params(frames, output_size):
        """Pick one (i, j, h, w) crop window shared by all frames.

        Args:
            frames: a list of PIL Image
            output_size (tuple): Expected output size of the crop.
        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop``.
        """
        w, h = frames[0].size
        th, tw = output_size
        # Exact fit: no randomness needed.
        if w == tw and h == th:
            return 0, 0, h, w
        top = random.randint(0, h - th)
        left = random.randint(0, w - tw)
        return top, left, th, tw

    def __call__(self, frames):
        """
        Args:
            frames: a list of PIL Image
        Returns:
            a list of PIL Image: Cropped images.
        """
        i, j, h, w = self.get_params(frames, self.size)

        out_frames = []
        for frame in frames:
            if self.padding is not None:
                frame = F.pad(frame, self.padding, self.fill, self.padding_mode)

            # pad the width if needed
            if self.pad_if_needed and frame.size[0] < self.size[1]:
                frame = F.pad(frame, (int((1 + self.size[1] - frame.size[0]) / 2), 0), self.fill, self.padding_mode)
            # pad the height if needed
            if self.pad_if_needed and frame.size[1] < self.size[0]:
                frame = F.pad(frame, (0, int((1 + self.size[0] - frame.size[1]) / 2)), self.fill, self.padding_mode)

            out_frames.append(F.crop(frame, i, j, h, w))
        return out_frames

    def __repr__(self):
        return self.__class__.__name__ + '(size={0}, padding={1})'.format(self.size, self.padding)
102 |
103 |
class RandomHorizontalFlip(object):
    """Horizontally flip the given PIL Image randomly with a given probability.

    All frames of a clip are flipped together (one coin toss per clip).

    Args:
        p (float): probability of the image being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, frames):
        """
        Args:
            frames: a list of PIL Image
        Returns:
            a list of PIL Image: Flipped images.
        """
        # Single draw decides the whole clip; untouched clips are returned
        # as the original list object.
        if random.random() >= self.p:
            return frames
        return [F.hflip(frame) for frame in frames]

    def __repr__(self):
        return self.__class__.__name__ + '(p={})'.format(self.p)
131 |
132 |
class ToTensor(object):
    """Convert a list of ``PIL Image`` or ``numpy.ndarray`` to tensor.

    Converts each PIL Image or numpy.ndarray (H x W x C) in the range
    [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range
    [0.0, 1.0].
    """

    def __call__(self, frames):
        """
        Args:
            frames: a list of (PIL Image or numpy.ndarray).
        Returns:
            a list of Tensor: Converted images.
        """
        return [F.to_tensor(frame) for frame in frames]
151 |
152 |
class Normalize(object):
    """Channel-wise normalize every frame tensor with fixed mean/std."""

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, frames):
        """
        Args:
            frames: a list of Tensor image of size (C, H, W) to be normalized.
        Returns:
            a list of Tensor: a list of normalized Tensor images.
        """
        return [F.normalize(frame, self.mean, self.std) for frame in frames]
169 |
170 |
class Stack(object):
    """Stack a list of frame tensors into one video tensor along ``dim``."""

    def __init__(self, dim=1):
        self.dim = dim

    def __call__(self, frames):
        """
        Args:
            frames: a list of (L) Tensor image of size (C, H, W).
        Returns:
            Tensor: a video Tensor of size (C, L, H, W).
        """
        return torch.stack(frames, self.dim)
183 |
--------------------------------------------------------------------------------
/tribert/models/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchvision
3 | import torch.nn.functional as F
4 |
5 | from .audio_net import Unet
6 | from .vision_net import ResnetFC, ResnetDilated
7 | from .criterion import BCELoss, L1Loss, L2Loss
8 |
9 |
def activate(x, activation):
    """Apply the activation named by ``activation`` to tensor ``x``.

    Args:
        x: input tensor.
        activation: one of 'sigmoid', 'softmax' (over dim 1), 'relu',
            'tanh', or 'no' (identity).

    Returns:
        The activated tensor (``x`` itself for 'no').

    Raises:
        Exception: if ``activation`` is not one of the names above.
    """
    if activation == 'sigmoid':
        return torch.sigmoid(x)
    elif activation == 'softmax':
        return F.softmax(x, dim=1)
    elif activation == 'relu':
        return F.relu(x)
    elif activation == 'tanh':
        # BUGFIX: F.tanh is deprecated and removed in recent PyTorch;
        # torch.tanh is the supported equivalent.
        return torch.tanh(x)
    elif activation == 'no':
        return x
    else:
        raise Exception('Unknown activation!')
23 |
24 |
class ModelBuilder():
    """Factory for the sound / vision sub-networks and training criteria."""

    # custom weights initialization
    def weights_init(self, m):
        """Initialize conv, batchnorm, and linear layers in place.

        Conv weights ~ N(0, 0.001); BatchNorm weights ~ N(1, 0.02) with
        zero bias; Linear weights ~ N(0, 0.0001). Other modules untouched.
        """
        classname = m.__class__.__name__
        if 'Conv' in classname:
            m.weight.data.normal_(0.0, 0.001)
        elif 'BatchNorm' in classname:
            m.weight.data.normal_(1.0, 0.02)
            m.bias.data.fill_(0)
        elif 'Linear' in classname:
            m.weight.data.normal_(0.0, 0.0001)

    def build_sound(self, arch='unet5', fc_dim=64, weights=''):
        """Build the audio U-Net; optionally load weights from a checkpoint."""
        # 2D models: architecture name encodes the number of down blocks.
        depth_by_arch = {'unet5': 5, 'unet6': 6, 'unet7': 7}
        if arch not in depth_by_arch:
            raise Exception('Architecture undefined!')
        net_sound = Unet(fc_dim=fc_dim, num_downs=depth_by_arch[arch])

        net_sound.apply(self.weights_init)
        if weights:
            print('Loading weights for net_sound')
            net_sound.load_state_dict(torch.load(weights))

        return net_sound

    # builder for vision
    def build_frame(self, arch='resnet18', fc_dim=64, pool_type='avgpool',
                    weights='', config=None):
        """Build the frame (vision) network on an ImageNet-pretrained ResNet."""
        pretrained = True
        if arch == 'resnet18fc':
            net = ResnetFC(
                torchvision.models.resnet18(pretrained),
                fc_dim=fc_dim, pool_type=pool_type)
        elif arch == 'resnet18dilated':
            net = ResnetDilated(
                torchvision.models.resnet18(pretrained),
                fc_dim=fc_dim, pool_type=pool_type)
        elif arch == 'resnet50fc':
            net = ResnetFC(
                torchvision.models.resnet50(pretrained),
                fc_dim=fc_dim, pool_type=pool_type, config=config)
        elif arch == 'resnet101fc':
            net = ResnetFC(
                torchvision.models.resnet101(pretrained),
                fc_dim=fc_dim, pool_type=pool_type, config=config)
        else:
            raise Exception('Architecture undefined!')

        if weights:
            print('Loading weights for net_frame')
            net.load_state_dict(torch.load(weights))
        return net

    def build_criterion(self, arch):
        """Instantiate the training loss named by ``arch`` ('bce'/'l1'/'l2')."""
        criteria = {'bce': BCELoss, 'l1': L1Loss, 'l2': L2Loss}
        if arch not in criteria:
            raise Exception('Architecture undefined!')
        return criteria[arch]()
95 |
--------------------------------------------------------------------------------
/tribert/models/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/models/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/models/__pycache__/audio_net.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/models/__pycache__/audio_net.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/models/__pycache__/criterion.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/models/__pycache__/criterion.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/models/__pycache__/synthesizer_net.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/models/__pycache__/synthesizer_net.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/models/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/models/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/models/__pycache__/vision_net.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/tribert/models/__pycache__/vision_net.cpython-36.pyc
--------------------------------------------------------------------------------
/tribert/models/audio_net.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from collections import OrderedDict
5 |
class Attention_block(nn.Module):
    """Additive attention gate for U-Net skip connections.

    Projects the gating signal `g` (from the decoder) and the skip features
    `x` (from the encoder) into a shared F_int-channel space, combines them,
    and produces a per-pixel sigmoid mask that re-weights `x`.

    Args:
        F_g: channels of the gating signal.
        F_l: channels of the skip-connection features.
        F_int: channels of the intermediate projection space.
    """
    def __init__(self,F_g,F_l,F_int):
        super(Attention_block,self).__init__()
        # 1x1 projection of the gating signal
        self.W_g = nn.Sequential(
            nn.Conv2d(F_g, F_int, kernel_size=1,stride=1,padding=0,bias=True),
            nn.BatchNorm2d(F_int)
        )

        # 1x1 projection of the skip features
        self.W_x = nn.Sequential(
            nn.Conv2d(F_l, F_int, kernel_size=1,stride=1,padding=0,bias=True),
            nn.BatchNorm2d(F_int)
        )

        # collapse to a single-channel attention map in (0, 1)
        self.psi = nn.Sequential(
            nn.Conv2d(F_int, 1, kernel_size=1,stride=1,padding=0,bias=True),
            nn.BatchNorm2d(1),
            nn.Sigmoid()
        )

        self.relu = nn.ReLU(inplace=True)

    def forward(self,g,x):
        """Return `x` gated by the attention map computed from `g` and `x`."""
        g1 = self.W_g(g)
        x1 = self.W_x(x)
        psi = self.relu(g1+x1)
        psi = self.psi(psi)
        # resize the attention map to x's spatial size.
        # F.upsample is deprecated (and removed in newer PyTorch);
        # F.interpolate is the drop-in replacement with identical semantics.
        psi = F.interpolate(psi, size=x.size()[2:], mode="bilinear", align_corners=True)

        return x*psi
37 |
38 |
class Unet(nn.Module):
    """Attention U-Net mapping a 1-channel spectrogram to a 1-channel output,
    conditioned on an external embedding `feats` fused at the bottleneck.

    NOTE(review): `fc_dim`, `num_downs`, `ngf` and `use_dropout` are accepted
    but never used below — confirm whether any caller relies on them.
    """
    def __init__(self, fc_dim=64, num_downs=5, ngf=64, use_dropout=False):
        super(Unet, self).__init__()
        # construct unet structure
        in_channels = 1
        out_channels = 1 #1 #Himu
        init_features = 64
        features = init_features
        # Encoder: 64 -> 128 -> 256 -> 512 channels; each stage is followed
        # by a 2x max-pool, so spatial size halves four times.
        self.encoder1 = Unet._block(in_channels, features, name="enc1")
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.encoder2 = Unet._block(features, features * 2, name="enc2")
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.encoder3 = Unet._block(features * 2, features * 4, name="enc3")
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.encoder4 = Unet._block(features * 4, features * 8, name="enc4")
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Bottleneck widened to features*24 = 1536 channels (instead of the
        # classic features*16) to match the fused embedding size below.
        self.bottleneck = Unet._block(features * 8, features * 24, name="bottleneck") #Himu features * 16

        # Decoder: each stage upsamples 2x, gates the matching encoder output
        # with an attention block, concatenates, and convolves back down.
        self.upconv4 = nn.ConvTranspose2d(
            features * 24, features * 8, kernel_size=2, stride=2 #Himu features * 16
        )
        self.Att4 = Attention_block(F_g = 8 * features, F_l = 8 * features, F_int = 4 * features)
        self.decoder4 = Unet._block((features * 8) * 2, features * 8, name="dec4")
        self.upconv3 = nn.ConvTranspose2d(
            features * 8, features * 4, kernel_size=2, stride=2
        )
        self.Att3 = Attention_block(F_g = 4 * features, F_l = 4 * features, F_int = 2 * features)
        self.decoder3 = Unet._block((features * 4) * 2, features * 4, name="dec3")
        self.upconv2 = nn.ConvTranspose2d(
            features * 4, features * 2, kernel_size=2, stride=2
        )
        self.Att2 = Attention_block(F_g = 2 * features, F_l = 2 * features, F_int = features)
        self.decoder2 = Unet._block((features * 2) * 2, features * 2, name="dec2")
        self.upconv1 = nn.ConvTranspose2d(
            features * 2, features, kernel_size=2, stride=2
        )
        self.Att1 = Attention_block(F_g = features, F_l = features, F_int = int(features/2))

        self.decoder1 = Unet._block(features * 2, features, name="dec1")

        # final 1x1 conv down to the single output channel
        self.conv = nn.Conv2d(
            in_channels=features, out_channels=out_channels, kernel_size=1
        )
        # NOTE(review): self.sigmoid is defined but not applied in forward();
        # the raw conv output is returned — confirm callers apply activation.
        self.sigmoid = nn.Sigmoid()

        # channel-wise MLP over the 1536-dim (= features*24) fused bottleneck
        self.mlp = nn.Sequential(
            nn.Linear(1536, 1536),
            nn.ReLU(),
            nn.Linear(1536, 1536))


    def forward(self, x, feats):
        """Run the U-Net on spectrogram `x`, fusing embedding `feats`.

        `feats` is broadcast over the bottleneck's spatial grid; its channel
        count must equal the bottleneck's (features*24 = 1536).
        """
        enc1 = self.encoder1(x)
        enc2 = self.encoder2(self.pool1(enc1))
        enc3 = self.encoder3(self.pool2(enc2))
        enc4 = self.encoder4(self.pool3(enc3))

        bottleneck = self.bottleneck(self.pool4(enc4))

        # Tile feats to (B, C, Hb, Wb), compute a softmax-weighted blend with
        # the bottleneck, then refine with a residual channel-wise MLP.
        feat_all = feats.unsqueeze(2).unsqueeze(2).repeat(1,1,bottleneck.shape[2],bottleneck.shape[3])
        combine_feat = F.softmax(torch.matmul(bottleneck,feat_all), dim=1) * feat_all
        combine_feat = combine_feat + bottleneck
        bottleneck = self.mlp(combine_feat.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() + combine_feat
        #bottleneck = torch.cat((bottleneck, feat_all), dim=1)

        # attention-gated decoder path with skip concatenations
        dec4 = self.upconv4(bottleneck)
        enc4 = self.Att4(g = dec4, x = enc4)
        dec4 = torch.cat((dec4, enc4), dim=1)
        dec4 = self.decoder4(dec4)

        dec3 = self.upconv3(dec4)
        enc3 = self.Att3(g = dec3, x = enc3)
        dec3 = torch.cat((dec3, enc3), dim=1)
        dec3 = self.decoder3(dec3)

        dec2 = self.upconv2(dec3)
        enc2 = self.Att2(g = dec2, x = enc2)
        dec2 = torch.cat((dec2, enc2), dim=1)
        dec2 = self.decoder2(dec2)

        dec1 = self.upconv1(dec2)
        enc1 = self.Att1(g = dec1, x = enc1)
        dec1 = torch.cat((dec1, enc1), dim=1)
        dec1 = self.decoder1(dec1)
        return self.conv(dec1)

    # NOTE(review): duplicate of ModelBuilder.weights_init; kept for callers
    # that apply it via net.apply(net.weights_init).
    def weights_init(self, m):
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            m.weight.data.normal_(0.0, 0.001)
        elif classname.find('BatchNorm') != -1:
            m.weight.data.normal_(1.0, 0.02)
            m.bias.data.fill_(0)
        elif classname.find('Linear') != -1:
            m.weight.data.normal_(0.0, 0.0001)

    @staticmethod
    def _block(in_channels, features, name):
        """Two (conv3x3 -> BN -> ReLU) layers; names are prefixed by `name`."""
        return nn.Sequential(
            OrderedDict(
                [
                    (
                        name + "conv1",
                        nn.Conv2d(
                            in_channels=in_channels,
                            out_channels=features,
                            kernel_size=3,
                            padding=1,
                            bias=False,
                        ),
                    ),
                    (name + "norm1", nn.BatchNorm2d(num_features=features)),
                    (name + "relu1", nn.ReLU(inplace=True)),
                    (
                        name + "conv2",
                        nn.Conv2d(
                            in_channels=features,
                            out_channels=features,
                            kernel_size=3,
                            padding=1,
                            bias=False,
                        ),
                    ),
                    (name + "norm2", nn.BatchNorm2d(num_features=features)),
                    (name + "relu2", nn.ReLU(inplace=True)),
                ]
            )
        )
168 |
169 |
170 | # Defines the submodule with skip connection.
171 | # X -------------------identity---------------------- X
172 | # |-- downsampling -- |submodule| -- upsampling --|
class UnetBlock(nn.Module):
    """One recursive U-Net stage: 2x-downsample, run `submodule`, upsample,
    and (unless outermost or noskip) concatenate the input back on channel 1."""

    def __init__(self, outer_nc, inner_input_nc, input_nc=None,
                 submodule=None, outermost=False, innermost=False,
                 use_dropout=False, inner_output_nc=None, noskip=False):
        super(UnetBlock, self).__init__()
        self.outermost = outermost
        self.noskip = noskip
        use_bias = False
        if input_nc is None:
            input_nc = outer_nc
        # innermost stages have no submodule doubling their channels
        if innermost:
            inner_output_nc = inner_input_nc
        elif inner_output_nc is None:
            inner_output_nc = 2 * inner_input_nc

        down_act = nn.LeakyReLU(0.2, True)
        down_norm = nn.BatchNorm2d(inner_input_nc)
        up_act = nn.ReLU(True)
        up_norm = nn.BatchNorm2d(outer_nc)
        up_sample = nn.Upsample(
            scale_factor=2, mode='bilinear', align_corners=True)

        # the strided 4x4 downsampling conv is identical in every branch
        down_conv = nn.Conv2d(
            input_nc, inner_input_nc, kernel_size=4,
            stride=2, padding=1, bias=use_bias)

        if outermost:
            # outermost upconv keeps its default bias and skips normalization
            up_conv = nn.Conv2d(
                inner_output_nc, outer_nc, kernel_size=3, padding=1)
            layers = [down_conv, submodule, up_act, up_sample, up_conv]
        elif innermost:
            up_conv = nn.Conv2d(
                inner_output_nc, outer_nc, kernel_size=3,
                padding=1, bias=use_bias)
            layers = [down_act, down_conv, up_act, up_sample, up_conv, up_norm]
        else:
            up_conv = nn.Conv2d(
                inner_output_nc, outer_nc, kernel_size=3,
                padding=1, bias=use_bias)
            layers = [down_act, down_conv, down_norm, submodule,
                      up_act, up_sample, up_conv, up_norm]
            if use_dropout:
                layers.append(nn.Dropout(0.5))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        out = self.model(x)
        if self.outermost or self.noskip:
            return out
        # skip connection: stack the input in front of the block's output
        return torch.cat([x, out], 1)
238 |
--------------------------------------------------------------------------------
/tribert/models/criterion.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
class BaseLoss(nn.Module):
    """Base class for weighted losses.

    Subclasses implement `_forward(pred, target, weight)`; this wrapper
    accepts either a single tensor or a list/tuple of per-source tensors
    (averaged over sources), defaulting `weight` to 1.
    """

    def __init__(self):
        super(BaseLoss, self).__init__()

    def forward(self, preds, targets, weight=None):
        """Return the (scalar) loss for tensor or list/tuple inputs.

        Raises:
            TypeError: if `preds` is neither a tensor nor a list/tuple.
        """
        if isinstance(preds, (list, tuple)):
            N = len(preds)
            if weight is None:
                weight = preds[0].new_ones(1)

            errs = [self._forward(preds[n], targets[n], weight)
                    for n in range(N)]
            return torch.mean(torch.stack(errs))

        if isinstance(preds, torch.Tensor):
            if weight is None:
                weight = preds.new_ones(1)
            return self._forward(preds, targets, weight)

        # Previously fell through and raised UnboundLocalError on `err`;
        # fail with an explicit, actionable error instead.
        raise TypeError(
            'preds must be a torch.Tensor or a list/tuple of tensors, '
            'got {}'.format(type(preds)))
26 |
27 |
class L1Loss(BaseLoss):
    """Weighted mean absolute error."""

    def __init__(self):
        super(L1Loss, self).__init__()

    def _forward(self, pred, target, weight):
        # mean over every element of weight * |pred - target|
        abs_diff = (pred - target).abs()
        return (weight * abs_diff).mean()
34 |
35 |
class L2Loss(BaseLoss):
    """Weighted mean squared error."""

    def __init__(self):
        super(L2Loss, self).__init__()

    def _forward(self, pred, target, weight):
        # mean over every element of weight * (pred - target)^2
        sq_diff = (pred - target) ** 2
        return (weight * sq_diff).mean()
42 |
43 |
class BCELoss(BaseLoss):
    """Element-weighted binary cross entropy (expects `pred` in [0, 1])."""

    def __init__(self):
        super(BCELoss, self).__init__()

    def _forward(self, pred, target, weight):
        # weight is applied per element by F.binary_cross_entropy itself
        bce = F.binary_cross_entropy(pred, target, weight=weight)
        return bce
50 |
51 |
52 |
--------------------------------------------------------------------------------
/tribert/models/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 |
4 | import numpy as np
5 | import librosa
6 | import cv2
7 |
8 | import subprocess as sp
9 | from threading import Timer
10 |
11 |
def warpgrid(bs, HO, WO, warp=True):
    """Build a (bs, HO, WO, 2) float32 sampling grid for grid_sample.

    x coordinates are linear in [-1, 1]; y coordinates are exponentially
    warped when `warp=True` (and log-unwarped when `warp=False`, the
    inverse mapping), both spanning [-1, 1].
    """
    xs = np.linspace(-1, 1, WO)
    ys = np.linspace(-1, 1, HO)
    xv, yv = np.meshgrid(xs, ys)
    if warp:
        yv = (np.power(21, (yv + 1) / 2) - 11) / 10
    else:
        yv = np.log(yv * 10 + 11) / np.log(21) * 2 - 1
    grid = np.zeros((bs, HO, WO, 2), dtype=np.float32)
    grid[..., 0] = xv  # broadcast across the batch dimension
    grid[..., 1] = yv
    return grid
27 |
28 |
def makedirs(path, remove=False):
    """Create directory `path` (and parents).

    If it already exists: no-op when `remove` is False, otherwise wipe it
    and recreate an empty directory.
    """
    if os.path.isdir(path):
        if not remove:
            return
        shutil.rmtree(path)
        print('removed existing directory...')
    os.makedirs(path)
37 |
38 |
class AverageMeter(object):
    """Tracks the most recent value and a weighted running average."""

    def __init__(self):
        self.initialized = False
        self.val = None    # last value seen
        self.avg = None    # weighted running mean
        self.sum = None    # weighted running sum
        self.count = None  # total weight

    def initialize(self, val, weight):
        """Seed the meter with its first observation."""
        self.val = val
        self.avg = val
        self.sum = val * weight
        self.count = weight
        self.initialized = True

    def update(self, val, weight=1):
        """Record an observation (converted to a numpy array)."""
        val = np.asarray(val)
        if self.initialized:
            self.add(val, weight)
        else:
            self.initialize(val, weight)

    def add(self, val, weight):
        self.val = val
        self.sum += val * weight
        self.count += weight
        self.avg = self.sum / self.count

    def value(self):
        """Last value as a Python scalar/list (0. before any update)."""
        return 0. if self.val is None else self.val.tolist()

    def average(self):
        """Running average as a Python scalar/list (0. before any update)."""
        return 0. if self.avg is None else self.avg.tolist()
79 |
80 |
def recover_rgb(img):
    """Undo ImageNet normalization on a (3, H, W) tensor; return (H, W, 3) uint8.

    Note: `img` is denormalized in place (mul_/add_ per channel).
    """
    # invert (x - mean) / std channel-wise with the ImageNet statistics
    for t, m, s in zip(img,
                       [0.485, 0.456, 0.406],
                       [0.229, 0.224, 0.225]):
        t.mul_(s).add_(m)
    arr = img.numpy().transpose((1, 2, 0)) * 255
    # clip before the uint8 cast: out-of-range floats would otherwise wrap
    # around (e.g. 260 -> 4) instead of saturating at 0/255
    img = np.clip(arr, 0, 255).astype(np.uint8)
    return img
88 |
89 |
def magnitude2heatmap(mag, log=True, scale=200.):
    """Render a magnitude array as a JET-colormapped RGB uint8 heatmap.

    Args:
        mag: 2D magnitude array.
        log: apply log10(1 + mag) compression first.
        scale: multiplier before clamping to [0, 255].
    """
    if log:
        mag = np.log10(mag + 1.)
    # work on a copy: the original `mag *= scale` wrote through to the
    # caller's array whenever log=False
    mag = mag * scale
    mag[mag > 255] = 255
    mag = mag.astype(np.uint8)
    mag_color = cv2.applyColorMap(mag, cv2.COLORMAP_JET)
    # OpenCV returns BGR; flip the channel order to RGB
    mag_color = mag_color[:, :, ::-1]
    return mag_color
99 |
100 |
def istft_reconstruction(mag, phase, hop_length=256):
    """Rebuild a waveform from magnitude and phase via inverse STFT,
    clipped to [-1, 1]."""
    # np.complex (an alias of builtin complex) was removed in NumPy 1.24;
    # np.complex128 is the equivalent concrete dtype
    spec = mag.astype(np.complex128) * np.exp(1j*phase)
    wav = librosa.istft(spec, hop_length=hop_length)
    return np.clip(wav, -1., 1.)
105 |
106 |
class VideoWriter:
    """ Combine numpy frames into video using ffmpeg

    Arguments:
        filename: name of the output video (must end in .mp4)
        fps: frame per second
        shape: (H, W) of every video frame

    Properties:
        add_frame(frame):
            add a frame to the video
        add_frames(frames):
            add multiple frames to the video
        release():
            release writing pipe

    """

    def __init__(self, filename, fps, shape):
        self.file = filename
        self.fps = fps
        self.shape = shape

        # video codec: only mp4/h264 output is supported
        ext = filename.split('.')[-1]
        if ext == "mp4":
            self.vcodec = "h264"
        else:
            raise RuntimeError("Video codec not supported.")

        # video writing pipe: raw rgb24 frames go in on stdin
        cmd = [
            "ffmpeg",
            "-y",  # overwrite existing file
            "-f", "rawvideo",  # file format
            "-s", "{}x{}".format(shape[1], shape[0]),  # size of one frame
            "-pix_fmt", "rgb24",  # 3 channels
            "-r", str(self.fps),  # frames per second
            "-i", "-",  # input comes from a pipe
            "-an",  # not to expect any audio
            "-vcodec", self.vcodec,  # video codec
            "-pix_fmt", "yuv420p",  # output video in yuv420p
            self.file]

        self.pipe = sp.Popen(cmd, stdin=sp.PIPE, stderr=sp.PIPE, bufsize=10**9)

    def release(self):
        """Close ffmpeg's stdin so it finalizes the output file."""
        self.pipe.stdin.close()

    def add_frame(self, frame):
        """Write one (H, W, 3) uint8 RGB frame to the encoder."""
        assert len(frame.shape) == 3
        assert frame.shape[0] == self.shape[0]
        assert frame.shape[1] == self.shape[1]
        try:
            # ndarray.tostring() was deprecated and has been removed from
            # NumPy; tobytes() returns the identical raw buffer
            self.pipe.stdin.write(frame.tobytes())
        except Exception:
            # surface ffmpeg's stderr instead of an opaque pipe error
            # (narrowed from a bare `except:` that also swallowed
            # KeyboardInterrupt/SystemExit)
            _, ffmpeg_error = self.pipe.communicate()
            print(ffmpeg_error)

    def add_frames(self, frames):
        """Write a sequence of frames in order."""
        for frame in frames:
            self.add_frame(frame)
169 |
170 |
def kill_proc(proc):
    """Watchdog callback: hard-kill `proc` once it exceeds its time budget."""
    proc.kill()
    print('Process running overtime! Killed.')
174 |
175 |
def run_proc_timeout(proc, timeout_sec):
    """Wait for subprocess `proc`, killing it via a watchdog timer if it
    runs for more than `timeout_sec` seconds."""
    watchdog = Timer(timeout_sec, kill_proc, [proc])
    try:
        watchdog.start()
        proc.communicate()
    finally:
        # always disarm the watchdog, whether proc finished or was killed
        watchdog.cancel()
184 |
185 |
def combine_video_audio(src_video, src_audio, dst_video, verbose=False):
    """Mux `src_video` and `src_audio` into `dst_video` with ffmpeg.

    The video stream is copied as-is; audio is re-encoded to AAC. The
    ffmpeg call is given a 10-second watchdog; failures are printed, not
    raised (best-effort).
    """
    try:
        cmd = ["ffmpeg", "-y",
               "-loglevel", "quiet",
               "-i", src_video,
               "-i", src_audio,
               "-c:v", "copy",
               "-c:a", "aac",
               "-strict", "experimental",
               dst_video]
        run_proc_timeout(sp.Popen(cmd), 10.)

        if verbose:
            print('Processed:{}'.format(dst_video))
    except Exception as e:
        print('Error:[{}] {}'.format(dst_video, e))
203 |
204 |
# save video to the disk using ffmpeg
def save_video(path, tensor, fps=25):
    """Encode an (L, H, W, C) uint8 frame array to a video file at `path`."""
    assert tensor.ndim == 4, 'video should be in 4D numpy array'
    height, width = tensor.shape[1], tensor.shape[2]
    writer = VideoWriter(path, fps=fps, shape=[height, width])
    for frame in tensor:
        writer.add_frame(frame)
    writer.release()
216 |
217 |
def save_audio(path, audio_numpy, sr):
    """Write waveform `audio_numpy` to `path` as WAV at sample rate `sr`."""
    # NOTE(review): librosa.output was removed in librosa >= 0.8; this call
    # requires an older librosa (or a port to soundfile.write) — confirm the
    # pinned version in requirements.txt.
    librosa.output.write_wav(path, audio_numpy, sr)
220 |
--------------------------------------------------------------------------------
/tribert/tribert.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import json
3 | import logging
4 | import math
5 | import os
6 | import shutil
7 | import tarfile
8 | import tempfile
9 | import sys
10 | from io import open
11 |
12 | import torch
13 | from torch import nn
14 | from torch.nn import CrossEntropyLoss
15 | import torch.nn.functional as F
16 | from torch.nn.utils.weight_norm import weight_norm
17 |
18 | import ipdb
19 |
20 | import numpy as np
21 |
22 |
class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`.

    Holds hyper-parameters for the text stream (no prefix), the vision
    stream (`v_` prefix), the pose stream (`p_` prefix), the audio stream
    (`a_` prefix), and the cross-modal co-attention (`bi_` prefix).
    Constructed either from a JSON config path (all keys copied verbatim)
    or from an integer vocabulary size plus the keyword defaults below.
    """

    def __init__(
        self,
        vocab_size_or_config_json_file,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        v_feature_size=1024,
        v_target_size=21,
        v_hidden_size=768,
        v_num_hidden_layers=3,
        v_num_attention_heads=12,
        v_intermediate_size=3072,
        p_feature_size = 1024,
        p_target_size=21,
        p_hidden_size=768,
        p_num_hidden_layers=3,
        p_num_attention_heads=12,
        p_intermediate_size=3072,
        bi_hidden_size=1024,
        bi_num_attention_heads=16,
        v_attention_probs_dropout_prob=0.1,
        v_hidden_act="gelu",
        v_hidden_dropout_prob=0.1,
        v_initializer_range=0.2,
        v_biattention_id=[0, 1],
        p_biattention_id=[10, 11],
        a_biattention_id=[16, 17],
        predict_feature=False,
        fast_mode=False,
        fixed_v_layer=0,
        fixed_p_layer=0,
        fixed_a_layer = 0,
        in_batch_pairs=False,
        fusion_method="mul",
        intra_gate=False,
        with_coattention=True
    ):

        # co-attention layer ids must pair up one-to-one and point at
        # existing layers of their respective streams
        assert len(v_biattention_id) == len(p_biattention_id)
        assert max(v_biattention_id) < v_num_hidden_layers
        assert max(p_biattention_id) < num_hidden_layers

        # str path (or py2 `unicode`) -> load every key from the JSON file
        if isinstance(vocab_size_or_config_json_file, str) or (
            sys.version_info[0] == 2
            and isinstance(vocab_size_or_config_json_file, unicode)
        ):
            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
            self.v_feature_size = v_feature_size
            self.v_hidden_size = v_hidden_size
            self.v_num_hidden_layers = v_num_hidden_layers
            self.v_num_attention_heads = v_num_attention_heads
            self.v_intermediate_size = v_intermediate_size
            self.v_attention_probs_dropout_prob = v_attention_probs_dropout_prob
            self.v_hidden_act = v_hidden_act
            self.v_hidden_dropout_prob = v_hidden_dropout_prob
            self.v_initializer_range = v_initializer_range
            self.v_biattention_id = v_biattention_id
            self.p_biattention_id = p_biattention_id
            # pose/audio act & dropout deliberately mirror the vision values
            self.p_hidden_act = v_hidden_act
            self.p_hidden_dropout_prob = v_hidden_dropout_prob
            self.a_hidden_act = v_hidden_act
            self.a_hidden_dropout_prob = v_hidden_dropout_prob
            self.v_target_size = v_target_size
            self.bi_hidden_size = bi_hidden_size
            self.bi_num_attention_heads = bi_num_attention_heads
            self.predict_feature = predict_feature
            self.fast_mode = fast_mode
            self.fixed_v_layer = fixed_v_layer
            self.fixed_p_layer = fixed_p_layer
            self.fixed_a_layer = fixed_a_layer

            # NOTE(review): several accepted kwargs are never stored in this
            # branch (p_feature_size, p_target_size, p_hidden_size,
            # p_num_hidden_layers, p_num_attention_heads, p_intermediate_size,
            # a_biattention_id) — confirm they are only consumed via the JSON
            # path, where all keys are copied verbatim.
            self.in_batch_pairs = in_batch_pairs
            self.fusion_method = fusion_method
            self.intra_gate = intra_gate
            self.with_coattention=with_coattention
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int)"
                "or the path to a pretrained model config file (str)"
            )



    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        config = BertConfig(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with open(json_file, "r", encoding="utf-8") as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
158 |
159 |
160 |
--------------------------------------------------------------------------------
/utils_music21.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 |
4 | import numpy as np
5 | import librosa
6 | import cv2
7 |
8 | import subprocess as sp
9 | from threading import Timer
10 |
11 |
def warpgrid(bs, HO, WO, warp=True):
    """Build a (bs, HO, WO, 2) float32 sampling grid for grid_sample.

    x coordinates are linear in [-1, 1]; y coordinates are exponentially
    warped when `warp=True` (and log-unwarped when `warp=False`, the
    inverse mapping), both spanning [-1, 1].
    """
    xs = np.linspace(-1, 1, WO)
    ys = np.linspace(-1, 1, HO)
    xv, yv = np.meshgrid(xs, ys)
    if warp:
        yv = (np.power(21, (yv + 1) / 2) - 11) / 10
    else:
        yv = np.log(yv * 10 + 11) / np.log(21) * 2 - 1
    grid = np.zeros((bs, HO, WO, 2), dtype=np.float32)
    grid[..., 0] = xv  # broadcast across the batch dimension
    grid[..., 1] = yv
    return grid
27 |
28 |
def makedirs(path, remove=False):
    """Create directory `path` (and parents).

    If it already exists: no-op when `remove` is False, otherwise wipe it
    and recreate an empty directory.
    """
    if os.path.isdir(path):
        if not remove:
            return
        shutil.rmtree(path)
        print('removed existing directory...')
    os.makedirs(path)
37 |
38 |
class AverageMeter(object):
    """Tracks the most recent value and a weighted running average."""

    def __init__(self):
        self.initialized = False
        self.val = None    # last value seen
        self.avg = None    # weighted running mean
        self.sum = None    # weighted running sum
        self.count = None  # total weight

    def initialize(self, val, weight):
        """Seed the meter with its first observation."""
        self.val = val
        self.avg = val
        self.sum = val * weight
        self.count = weight
        self.initialized = True

    def update(self, val, weight=1):
        """Record an observation (converted to a numpy array)."""
        val = np.asarray(val)
        if self.initialized:
            self.add(val, weight)
        else:
            self.initialize(val, weight)

    def add(self, val, weight):
        self.val = val
        self.sum += val * weight
        self.count += weight
        self.avg = self.sum / self.count

    def value(self):
        """Last value as a Python scalar/list (0. before any update)."""
        return 0. if self.val is None else self.val.tolist()

    def average(self):
        """Running average as a Python scalar/list (0. before any update)."""
        return 0. if self.avg is None else self.avg.tolist()
79 |
80 |
def recover_rgb(img):
    """Undo ImageNet normalization on a (3, H, W) tensor; return (H, W, 3) uint8.

    Note: `img` is denormalized in place (mul_/add_ per channel).
    """
    # invert (x - mean) / std channel-wise with the ImageNet statistics
    for t, m, s in zip(img,
                       [0.485, 0.456, 0.406],
                       [0.229, 0.224, 0.225]):
        t.mul_(s).add_(m)
    arr = img.numpy().transpose((1, 2, 0)) * 255
    # clip before the uint8 cast: out-of-range floats would otherwise wrap
    # around (e.g. 260 -> 4) instead of saturating at 0/255
    img = np.clip(arr, 0, 255).astype(np.uint8)
    return img
88 |
89 |
def magnitude2heatmap(mag, log=True, scale=200.):
    """Render a magnitude array as a JET-colormapped RGB uint8 heatmap.

    Args:
        mag: 2D magnitude array.
        log: apply log10(1 + mag) compression first.
        scale: multiplier before clamping to [0, 255].
    """
    if log:
        mag = np.log10(mag + 1.)
    # work on a copy: the original `mag *= scale` wrote through to the
    # caller's array whenever log=False
    mag = mag * scale
    mag[mag > 255] = 255
    mag = mag.astype(np.uint8)
    mag_color = cv2.applyColorMap(mag, cv2.COLORMAP_JET)
    # OpenCV returns BGR; flip the channel order to RGB
    mag_color = mag_color[:, :, ::-1]
    return mag_color
99 |
100 |
def istft_reconstruction(mag, phase, hop_length=256):
    """Rebuild a waveform from magnitude and phase via inverse STFT,
    clipped to [-1, 1]."""
    # np.complex (an alias of builtin complex) was removed in NumPy 1.24;
    # np.complex128 is the equivalent concrete dtype
    spec = mag.astype(np.complex128) * np.exp(1j*phase)
    wav = librosa.istft(spec, hop_length=hop_length)
    return np.clip(wav, -1., 1.)
105 |
106 |
class VideoWriter:
    """ Combine numpy frames into video using ffmpeg

    Arguments:
        filename: name of the output video (must end in .mp4)
        fps: frame per second
        shape: (H, W) of every video frame

    Properties:
        add_frame(frame):
            add a frame to the video
        add_frames(frames):
            add multiple frames to the video
        release():
            release writing pipe

    """

    def __init__(self, filename, fps, shape):
        self.file = filename
        self.fps = fps
        self.shape = shape

        # video codec: only mp4/h264 output is supported
        ext = filename.split('.')[-1]
        if ext == "mp4":
            self.vcodec = "h264"
        else:
            raise RuntimeError("Video codec not supported.")

        # video writing pipe: raw rgb24 frames go in on stdin
        cmd = [
            "ffmpeg",
            "-y",  # overwrite existing file
            "-f", "rawvideo",  # file format
            "-s", "{}x{}".format(shape[1], shape[0]),  # size of one frame
            "-pix_fmt", "rgb24",  # 3 channels
            "-r", str(self.fps),  # frames per second
            "-i", "-",  # input comes from a pipe
            "-an",  # not to expect any audio
            "-vcodec", self.vcodec,  # video codec
            "-pix_fmt", "yuv420p",  # output video in yuv420p
            self.file]

        self.pipe = sp.Popen(cmd, stdin=sp.PIPE, stderr=sp.PIPE, bufsize=10**9)

    def release(self):
        """Close ffmpeg's stdin so it finalizes the output file."""
        self.pipe.stdin.close()

    def add_frame(self, frame):
        """Write one (H, W, 3) uint8 RGB frame to the encoder."""
        assert len(frame.shape) == 3
        assert frame.shape[0] == self.shape[0]
        assert frame.shape[1] == self.shape[1]
        try:
            # ndarray.tostring() was deprecated and has been removed from
            # NumPy; tobytes() returns the identical raw buffer
            self.pipe.stdin.write(frame.tobytes())
        except Exception:
            # surface ffmpeg's stderr instead of an opaque pipe error
            # (narrowed from a bare `except:` that also swallowed
            # KeyboardInterrupt/SystemExit)
            _, ffmpeg_error = self.pipe.communicate()
            print(ffmpeg_error)

    def add_frames(self, frames):
        """Write a sequence of frames in order."""
        for frame in frames:
            self.add_frame(frame)
169 |
170 |
def kill_proc(proc):
    """Watchdog callback: hard-kill `proc` once it exceeds its time budget."""
    proc.kill()
    print('Process running overtime! Killed.')
174 |
175 |
def run_proc_timeout(proc, timeout_sec):
    """Wait for subprocess `proc`, killing it via a watchdog timer if it
    runs for more than `timeout_sec` seconds."""
    watchdog = Timer(timeout_sec, kill_proc, [proc])
    try:
        watchdog.start()
        proc.communicate()
    finally:
        # always disarm the watchdog, whether proc finished or was killed
        watchdog.cancel()
184 |
185 |
def combine_video_audio(src_video, src_audio, dst_video, verbose=False):
    """Mux `src_video` and `src_audio` into `dst_video` with ffmpeg.

    The video stream is copied as-is; audio is re-encoded to AAC. The
    ffmpeg call is given a 10-second watchdog; failures are printed, not
    raised (best-effort).
    """
    try:
        cmd = ["ffmpeg", "-y",
               "-loglevel", "quiet",
               "-i", src_video,
               "-i", src_audio,
               "-c:v", "copy",
               "-c:a", "aac",
               "-strict", "experimental",
               dst_video]
        run_proc_timeout(sp.Popen(cmd), 10.)

        if verbose:
            print('Processed:{}'.format(dst_video))
    except Exception as e:
        print('Error:[{}] {}'.format(dst_video, e))
203 |
204 |
# save video to the disk using ffmpeg
def save_video(path, tensor, fps=25):
    """Encode an (L, H, W, C) uint8 frame array to a video file at `path`."""
    assert tensor.ndim == 4, 'video should be in 4D numpy array'
    height, width = tensor.shape[1], tensor.shape[2]
    writer = VideoWriter(path, fps=fps, shape=[height, width])
    for frame in tensor:
        writer.add_frame(frame)
    writer.release()
216 |
217 |
def save_audio(path, audio_numpy, sr):
    """Write waveform `audio_numpy` to `path` as WAV at sample rate `sr`."""
    # NOTE(review): librosa.output was removed in librosa >= 0.8; this call
    # requires an older librosa (or a port to soundfile.write) — confirm the
    # pinned version in requirements.txt.
    librosa.output.write_wav(path, audio_numpy, sr)
220 |
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-0pXFhOg1o2c+audio-bGfyLBoZPM4/weight.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-2XY77lk_LCQ+audio-1K-0VC9hWIA/weight.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-EVp6jkgYuUc+audio-8YELO9yxs_c/weight.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-E_ugm84TMvo+audio-PMDSfAZ4-eo/weight.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-GV2bbRPFhvk+audio-NdzCe7COROw/weight.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-P_dHPMofcwM+audio-EXMQITpeeaM/weight.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-PlKCXvBDxaI+audio-0yR5s-CSw4E/weight.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c/weight.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-Xafuav13p2E+audio-t6B3JugXgTI/weight.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/av1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/av1.mp4
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/av2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/av2.mp4
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gt1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gt1.wav
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gt2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gt2.wav
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/gtmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/mix.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/mix.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/mix.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/mix.wav
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/pred1.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/pred1.wav
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/pred2.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/pred2.wav
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predamp1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predamp1.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predamp2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predamp2.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predmask1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predmask1.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predmask2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/predmask2.jpg
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/video1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/video1.mp4
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/video2.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/video2.mp4
--------------------------------------------------------------------------------
/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/weight.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ubc-vision/TriBERT/17380ddbfe8e9d902d1e539d6af4a5f16786e222/visualization/audio-gK3ooFKujO0+audio-087sDeYPdjY/weight.jpg
--------------------------------------------------------------------------------
/visualization/index.html:
--------------------------------------------------------------------------------
1 |
Filename | Input Mixed Audio | Video 1 | Predicted Audio 1 | GroundTruth Audio 1 | Predicted Mask 1 | GroundTruth Mask 1 | Video 2 | Predicted Audio 2 | GroundTruth Audio 2 | Predicted Mask 2 | GroundTruth Mask 2 | Loss weighting |
---|
audio-PlKCXvBDxaI+audio-0yR5s-CSw4E |  | |  |  |  |  | |  |  |  |  |  |
audio-2XY77lk_LCQ+audio-1K-0VC9hWIA |  | |  |  |  |  | |  |  |  |  |  |
audio-0pXFhOg1o2c+audio-bGfyLBoZPM4 |  | |  |  |  |  | |  |  |  |  |  |
audio-Xafuav13p2E+audio-t6B3JugXgTI |  | |  |  |  |  | |  |  |  |  |  |
audio-gK3ooFKujO0+audio-087sDeYPdjY |  | |  |  |  |  | |  |  |  |  |  |
audio-P_dHPMofcwM+audio-EXMQITpeeaM |  | |  |  |  |  | |  |  |  |  |  |
audio-E_ugm84TMvo+audio-PMDSfAZ4-eo |  | |  |  |  |  | |  |  |  |  |  |
audio-Vk9YR-NLkmQ+audio-nUZ9jX1J78c |  | |  |  |  |  | |  |  |  |  |  |
audio-EVp6jkgYuUc+audio-8YELO9yxs_c |  | |  |  |  |  | |  |  |  |  |  |
audio-GV2bbRPFhvk+audio-NdzCe7COROw |  | |  |  |  |  | |  |  |  |  |  |
--------------------------------------------------------------------------------
/viz.py:
--------------------------------------------------------------------------------
1 | import os
2 | import matplotlib
3 | matplotlib.use('Agg')
4 | import matplotlib.pyplot as plt
5 |
6 |
def plot_loss_metrics(path, history):
    """Render and save the training curves as PNGs under *path*.

    Writes two files:
      * loss.png    -- train vs. validation error per epoch
      * metrics.png -- validation SDR / SIR / SAR per epoch

    Args:
        path: directory in which the figures are saved.
        history: dict with 'train' and 'val' sub-dicts. 'train' provides
            'epoch' and 'err' series; 'val' additionally provides 'sdr',
            'sir' and 'sar' series.
    """
    # Figure 1: loss curves for both splits.
    loss_fig = plt.figure()
    plt.plot(history['train']['epoch'], history['train']['err'],
             color='b', label='training')
    plt.plot(history['val']['epoch'], history['val']['err'],
             color='c', label='validation')
    plt.legend()
    loss_fig.savefig(os.path.join(path, 'loss.png'), dpi=200)
    plt.close('all')

    # Figure 2: separation metrics on the validation split.
    metric_fig = plt.figure()
    for metric, line_color in (('sdr', 'r'), ('sir', 'g'), ('sar', 'b')):
        plt.plot(history['val']['epoch'], history['val'][metric],
                 color=line_color, label=metric.upper())
    plt.legend()
    metric_fig.savefig(os.path.join(path, 'metrics.png'), dpi=200)
    plt.close('all')
27 |
28 |
class HTMLVisualizer():
    """Incrementally build an HTML results table and write it to disk.

    Each row mixes text, image, audio and video cells so separation
    results (predicted/ground-truth masks and waveforms) can be browsed
    side by side in a single index.html.

    NOTE(review): the HTML tag literals in this file were destroyed by a
    text extraction pass (the string constants were left empty or
    unterminated). The markup below is reconstructed to emit a valid,
    self-contained table; confirm cell sizing/styling against the
    rendered index.html.
    """

    def __init__(self, fn_html):
        # Output path of the generated HTML file.
        self.fn_html = fn_html
        # Open the table up front; write_html() closes it.
        self.content = '<table>'
        self.content += '<style> table, th, td {border: 1px solid black;} </style>'

    def add_header(self, elements):
        """Append one header row; *elements* are the column titles."""
        self.content += '<tr>'
        for element in elements:
            self.content += '<th>{}</th>'.format(element)
        self.content += '</tr>'

    def add_rows(self, rows):
        """Append several rows; each row is a list of cell dicts."""
        for row in rows:
            self.add_row(row)

    def add_row(self, elements):
        """Append one table row.

        Args:
            elements: list of dicts, one per cell. Recognized keys:
                'text' (inserted verbatim), 'image', 'audio', 'video'
                (each a file path / URL embedded with the matching tag).
        """
        self.content += '<tr>'

        # a list of cells
        for element in elements:
            self.content += '<td>'

            # fill a cell
            for key, val in element.items():
                if key == 'text':
                    self.content += val
                elif key == 'image':
                    self.content += ('<img src="{}" '
                                     'style="max-height:256px;max-width:256px;">'
                                     .format(val))
                elif key == 'audio':
                    self.content += ('<audio controls><source src="{}">'
                                     '</audio>'.format(val))
                elif key == 'video':
                    self.content += ('<video src="{}" controls="controls" '
                                     'style="max-height:256px;max-width:256px;">'
                                     '</video>'.format(val))
            self.content += '</td>'

        self.content += '</tr>'

    def write_html(self):
        """Close the table and write the accumulated HTML to fn_html."""
        self.content += '</table>'
        with open(self.fn_html, 'w') as f:
            f.write(self.content)
70 |
--------------------------------------------------------------------------------