├── LFAV_dataset
│   ├── test
│   │   ├── test_audio.csv
│   │   ├── test_audio_weakly.csv
│   │   ├── test_visual.csv
│   │   ├── test_visual_weakly.csv
│   │   └── test_weak_av.csv
│   ├── train
│   │   ├── train_audio_weakly.csv
│   │   ├── train_visual_weakly.csv
│   │   └── train_weakly.csv
│   └── val
│       ├── val_audio.csv
│       ├── val_audio_weakly.csv
│       ├── val_visual.csv
│       ├── val_visual_weakly.csv
│       └── val_weak_av.csv
├── LICENSE
├── README.md
└── src
    ├── __pycache__
    │   ├── dataloader.cpython-38.pyc
    │   └── dataloader.cpython-39.pyc
    ├── dataloader.py
    ├── main.py
    ├── models
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-38.pyc
    │   │   ├── base_models.cpython-36.pyc
    │   │   ├── base_models.cpython-38.pyc
    │   │   ├── canny1d.cpython-38.pyc
    │   │   ├── get_adj.cpython-38.pyc
    │   │   ├── get_multi_adj.cpython-38.pyc
    │   │   ├── graph_modules.cpython-36.pyc
    │   │   ├── graph_modules.cpython-38.pyc
    │   │   ├── graph_modules_single.cpython-38.pyc
    │   │   ├── mhsa_layer.cpython-38.pyc
    │   │   ├── model.cpython-38.pyc
    │   │   ├── modules.cpython-36.pyc
    │   │   ├── modules.cpython-38.pyc
    │   │   ├── modules_new_stage2.cpython-38.pyc
    │   │   ├── stage_one.cpython-38.pyc
    │   │   ├── stage_three.cpython-38.pyc
    │   │   ├── stage_two.cpython-38.pyc
    │   │   ├── stage_two_new.cpython-38.pyc
    │   │   └── transformer.cpython-38.pyc
    │   ├── base_models.py
    │   ├── get_adj.py
    │   ├── get_multi_adj.py
    │   ├── graph_modules.py
    │   ├── graph_modules_single.py
    │   ├── mhsa_layer.py
    │   ├── model.py
    │   ├── modules.py
    │   ├── modules_new_stage2.py
    │   ├── stage_one.py
    │   ├── stage_three.py
    │   ├── stage_two_new.py
    │   └── transformer.py
    ├── scripts
    │   ├── test_s3.sh
    │   └── train_s3.sh
    ├── tools
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-38.pyc
    │   │   ├── distance.cpython-38.pyc
    │   │   ├── evaluation.cpython-38.pyc
    │   │   ├── evaluation_stat.cpython-38.pyc
    │   │   ├── get_subgraph.cpython-38.pyc
    │   │   ├── plot_graph_mat.cpython-38.pyc
    │   │   ├── plot_hist.cpython-38.pyc
    │   │   ├── plot_prob.cpython-38.pyc
    │   │   ├── stat_neighbor.cpython-38.pyc
    │   │   ├── subgraph_analysis.cpython-38.pyc
    │   │   └── train.cpython-38.pyc
    │   ├── distance.py
    │   ├── evaluation.py
    │   └── train.py
    └── utils
        ├── .DS_Store
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-36.pyc
        │   ├── __init__.cpython-38.pyc
        │   ├── constant.cpython-36.pyc
        │   ├── constant.cpython-38.pyc
        │   ├── eval_metrics.cpython-38.pyc
        │   ├── set_seed.cpython-36.pyc
        │   └── set_seed.cpython-38.pyc
        ├── constant.py
        ├── eval_metrics.py
        └── set_seed.py

/LFAV_dataset/val/val_audio_weakly.csv:
--------------------------------------------------------------------------------
1 | filename event_labels
2 | -2UWP3n_05A clapping,cheering,singing,speech,laughter
3 | -8yJXNCnRpk laughter,speech,cello,guitar
4 | -L_xGAyM_Qw chicken_rooster
5 | 01iz9kmvbxg speech,drum,singing,cheering,piano,clapping,laughter
6 | 08hzunIk81Y drum,cat,car,speech,cheering,laughter,clapping
7 | 0Hz4R_m0hmI speech,car
8 | 0REH7oc48C0 rodents,laughter
9 | 0ZOyJM8b0a0 helicopter,speech,car
10 | 0_tZ4OTT1MI speech,laughter,singing,violin
11 | 0b-o15cMJPE cheering,guitar,violin,clapping,singing
12 | 0eXu0vIIMVc speech,chainsaw
13 | 0fjrDsLr_aE chainsaw
14 | 0pJ1oD-OXZo speech,guitar
15 | 0vzuDkjtDOY piano,speech,laughter,clapping
16 | 119gUU8x0m8 helicopter,speech,cheering
17 | 1DCDHmGcB5c helicopter,speech
18 | 1KSpPr05jgo drum,guitar,singing
19 | 1OJp9yVaZiE guitar,cello,banjo,violin
20 | 1Rt6qQz0w7c playing_badminton,speech,clapping,cheering
21 | 1cJ4FKeXAQA speech,guitar
22 | 1hNwQMJtGew rodents
23 | 1ozVekZgpZE helicopter
24 | 1ur8hokq-Cs guitar,speech,chainsaw
25 | 1zKq8IJkcnk speech,drum,guitar,singing
26 | 24nBh-zzpsQ piano,speech,cheering,cry
27 | 2I1ZU5g1QNo cheering,singing,violin
28 | 2MYfUQFr6S0 chainsaw
29 | 2V9PXkRdE3I speech,guitar
30 | 2WogzOS_D9Q chainsaw
31 | 2_WpcixEOP4
speech,drum,cello 32 | 2bmuqqUE8Z4 drum,speech,singing,laughter,clapping 33 | 2pVJmT0AG3Q car,bicycle 34 | 2t0HUQleGwU rodents,speech,laughter,cry 35 | 2xz2ti1RfwY piano,violin,clapping 36 | 305jW5Q5Ewk cheering,clapping,speech,singing,guitar 37 | 36fI5gwfHqU playing_tennis 38 | 38Z3ovrzVdo fixed-wing_aircraft,laughter,speech 39 | 3Bsnuew8f6s piano 40 | 3EifSw1bvmA singing,cheering,piano,drum 41 | 3Gbj7ukbvQE violin 42 | 3KjtkJiip-8 playing_badminton 43 | 3LDmXpcjrC4 speech,frisbee 44 | 3M2TFEF01t4 playing_badminton,speech 45 | 3NHJUkdHwtA dog,shofar 46 | 3R-1x4pPBBQ speech,playing_baseball 47 | 3Sby3LP-cbU speech,chainsaw 48 | 3UF7YxKUeks speech,laughter,cheering 49 | 3X2xAD-9eiM helicopter,car,chainsaw 50 | 3_J8f_tvqmo cheering 51 | 3e-52Lkrl0w laughter,speech 52 | 3lzA9PRMiUk car_alarm,speech 53 | 3o-Eaixw6ec helicopter,car 54 | 3sMa94JiLZI chainsaw,speech 55 | 3wZ_HHViGpM playing_badminton 56 | 4-X3c59e3Hk helicopter,car 57 | 41V_J6hdEhs playing_badminton 58 | 45RDZ3owtvY helicopter,chainsaw,laughter,speech 59 | 48XkS3Hkmws cry,laughter,speech 60 | 4DDfD4-cEVo fixed-wing_aircraft,speech 61 | 4I6z0N2xdh8 alarm 62 | 4MAdVOtBQD8 clapping,singing,speech 63 | 4Yt7hmubn58 playing_badminton 64 | 4exLWR92ycs singing,guitar,laughter,speech 65 | 4igvehYCc7U singing,guitar,speech,clapping,cheering 66 | 4leF1RwXFSE chainsaw 67 | 4ssWujrqMhs speech,alarm,guitar,cheering,clapping,laughter,piano 68 | 4tgjvpFd6mc rodents 69 | 4xyw9i0VvRY clapping,cheering 70 | 50btuV9sY60 fixed-wing_aircraft,speech 71 | 54PhWYrDTgw chainsaw,speech 72 | 56ddX-fmEtY speech,cry,dog 73 | 598s4riqw3M speech,laughter 74 | 5CkG2GTiK4M chainsaw,speech 75 | 5ECzbUNtnOk speech,drum 76 | 5J-ZeYlddSk clapping,guitar,violin 77 | 5K7ny6CVZRE speech,guitar,singing 78 | 5NqpmNqPDeQ chainsaw 79 | 5OUHZWamrec speech,piano,clapping 80 | 5RPDLz_RCbE helicopter 81 | 5V51JifwW48 cry,speech,dog 82 | 5ZELJ-0sGCw cello,laughter,cheering,clapping 83 | 5cKS_KuoU00 singing,fixed-wing_aircraft,speech 84 | 5kE3ngBWV_U speech,fixed-wing_aircraft,laughter 85 | 5mKfMbsjeAs chainsaw 86 | 5syUwJ9dEpE shofar 87 | 5uwzHqZn-Os alarm 88 | 5vlGKyxl22M speech,cheering,laughter 89 | 5zSPUfVm32A chainsaw,speech 90 | 6-atw21yanY playing_volleyball,cheering,clapping 91 | 65PjoTErZlw playing_badminton,clapping,speech 92 | 67PlMRhS5qc chainsaw 93 | 6AVMcJa77PM speech,cheering,laughter 94 | 6Dg6Q6tQMiE drum,guitar,singing 95 | 6GLXPPOYfWs playing_badminton,speech,cheering 96 | 6Jls8goVRuU playing_basketball,cheering,clapping,speech 97 | 6NOW0_HuUCQ cat,chainsaw 98 | 6TnKvlQ2h7s cheering,speech 99 | 6WlLqF5eXkM guitar,cello,speech,clapping,banjo 100 | 6ctzdi3lZ5M rodents 101 | 6iefuGEvlHI shofar 102 | 6mraw-F7qQg accordion,cello 103 | 6sHHdIWDmnk speech,guitar,cheering,laughter 104 | 6ynhCAZ8KIo guitar,speech,cheering,clapping 105 | 736A0wfsYDw playing_badminton,speech,cheering,clapping 106 | 76rJsot3DqM chainsaw 107 | 78D25pNtSSo car 108 | 7EJ1TlhoFKc chainsaw 109 | 7MK3P3StM6U speech,clapping,playing_badminton 110 | 7Sidspb9zhk horse,speech,singing 111 | 7Wb2zZQkpPo speech 112 | 7YwxtA1MNuk guitar,singing,cheering,speech,clapping 113 | 7_pRiUfp938 speech,singing,laughter 114 | 7bgTxPSZ2-o speech,chainsaw 115 | 7iIBkME4bSM car_alarm,helicopter 116 | 7myife0ZQSc cheering,clapping,singing 117 | 7qbbqP9mTIE shofar 118 | 8-E3tzyQdN4 helicopter,speech,laughter 119 | 88cL4GK6vyI singing,cheering,clapping,speech 120 | 8NY3TWbkg3I chainsaw 121 | 8_t0cc17u68 speech,piano,singing,drum 122 | 8gO_lxThc1M piano,guitar 123 | 8mc7uArd7zw guitar,speech 124 | 8q_vgp0FDhI 
cry,speech,helicopter,laughter 125 | 8t-JhhYhZyk singing,guitar,banjo,cello,violin 126 | 8yBGrAo62iQ chainsaw 127 | 9-iVuKsU8ZI rodents,laughter,speech 128 | 91SmSyaO3ys speech,cheering,clapping,laughter 129 | 95shZG1q4ks speech,laughter,playing_volleyball 130 | 9DEfXtd0yPg speech 131 | 9NP-1iiy8P4 chainsaw 132 | 9TMpVoKPUgM speech,cheering 133 | 9Xw3rUf873I playing_badminton,laughter,speech 134 | 9dpDFE3UL_E accordion,laughter 135 | 9gKnLxvCCJY playing_badminton,speech,clapping 136 | 9rdCnIUnxVs laughter,violin,singing,clapping 137 | 9w72SdFNnXU speech,chainsaw 138 | 9z6Mzyar4gU drum,speech,playing_badminton 139 | A4Ly7wwElF0 speech,playing_basketball 140 | AAxqVPd_wd0 accordion,drum,clapping 141 | AHSLBch9Y6A playing_badminton,speech 142 | AJrmVyCWbq8 piano,car 143 | ANAGPl7QZ0k helicopter,car 144 | AQvhFMkI51Y helicopter 145 | ATd-LDQ5MvM speech,clapping,piano,cheering 146 | Aatxa3zgXFk cheering,clapping 147 | AcErIT001pg piano,guitar 148 | AdTZKIG4t-A fixed-wing_aircraft 149 | AhKBRh8Oxfs speech,playing_badminton 150 | Al5Ks0PnV0w playing_soccer,speech 151 | ApWlaLTPpho horse 152 | ArVDnCbQ6mU helicopter,speech,fixed-wing_aircraft,laughter 153 | B-Ua2uzRwu4 speech,playing_badminton 154 | B8ISzf2pryI speech 155 | BCyCm2E_GC0 helicopter,car 156 | BFNekjEgvuk chicken_rooster 157 | BMVJbAObEQM violin,drum,singing 158 | BVT3acNFzqc guitar,singing,clapping,cheering 159 | BeS5uOUBNFc speech,playing_soccer 160 | BjtruQ_xQQ8 helicopter,speech 161 | Bql-XqRjM7U speech,singing,guitar,laughter 162 | BtP6_sCuSLM laughter,speech,clapping 163 | BxXl5j9qEuc speech,drum,alarm 164 | C2A9dIL2ECQ helicopter,alarm,speech 165 | C6nHvKcT6j8 speech,clapping 166 | C8cxbPH-Hmg playing_badminton,speech 167 | C9scOaHzSOA playing_badminton,speech 168 | CIvtiNpKEY0 speech,clapping 169 | CLm9FYDRxsA helicopter 170 | CQTa01iGUSs guitar 171 | CXRO93dNB_A playing_tennis,speech,car_alarm 172 | C_G0qaFcSCM singing,clapping 173 | CeiVB75kOOU accordion,drum 174 | CpG-YZWg3L4 cheering,speech,car,singing 175 | CskZG-HM5Ko accordion,speech 176 | Cw-US2uaOKs helicopter,speech,laughter 177 | D9roj8pPsvU speech,drum,guitar,clapping 178 | DAC2Z-54HDY piano,speech,laughter 179 | DFYOtfJsXgM drum,playing_baseball,speech,playing_badminton,cheering,clapping 180 | DQ3lbf7PCoY clapping,speech,banjo,drum,singing 181 | DUlxiX4ri1Q alarm 182 | DXGQnIv5vkA speech,dog,laughter 183 | DeQ_3bCcodk rodents 184 | DkXsN6qwDEw horse,cheering,frisbee,dog,singing,clapping,laughter 185 | Do__idgHWJM speech,playing_basketball 186 | DxqGhpqu2z4 speech,guitar 187 | DynoOMg9Mvw fixed-wing_aircraft,speech 188 | E4qiicR74hY clapping,violin,piano 189 | E6CcUj2mDbI dog 190 | E8kiWmlMN_A speech,playing_soccer 191 | EA58Fo2e0CI speech,laughter 192 | EHTTmj4bBDg singing,guitar,drum,piano,clapping,cheering,speech 193 | EPEmEMp8zp4 piano,cello,speech,violin 194 | ESRG74Q_r6k playing_basketball,speech 195 | EayN_Jj0740 violin,clapping 196 | EhAY6-dIO2I dog 197 | EleoJrKrHO0 guitar,violin,accordion,piano 198 | EtV6RzLCBlU speech,guitar,singing,cheering 199 | EvA2RNci3SE singing,speech,guitar,laughter,clapping 200 | F4z5PKQT9Cg piano,cello 201 | FA0avnkqRV0 clapping,cheering 202 | FKQCVhR702M car 203 | FQqwy9PPhX4 speech,playing_basketball 204 | FTyIy8TLoCk dog,speech 205 | FYv4-57P9r0 speech,alarm 206 | F_WZZd7efr0 clapping,piano,violin 207 | Fbs_wxRXOVU rodents,speech,laughter,cheering 208 | Fju_-bLUhtg speech,playing_badminton 209 | FnA_H-RBaXE speech,horse 210 | Fqc_5UqYffI helicopter 211 | Fygm03Gw19Y helicopter,alarm 212 | G486CNb4GTw 
drum,guitar,singing,clapping,cheering 213 | G7bNye0f9eQ clapping,speech,playing_baseball,cheering 214 | GCKIImqFeQ4 speech,cheering 215 | GEsAdwCJox0 speech,drum 216 | GI7p6fbHfx4 guitar 217 | GQJ3-6ZBsfg playing_basketball,speech 218 | GYvApvHOPXc rodents 219 | Gdr3ImP4Gv4 banjo,drum,singing,speech,clapping 220 | GlTsktEX8jw speech,clapping,playing_soccer 221 | GoUuozWmoFM piano,violin 222 | GroFCq4B5xo playing_ping-pong,speech 223 | GvrN5VT1Zf0 guitar,singing,speech,laughter,clapping,cat 224 | GywIlkOUPWw laughter,speech 225 | H29pf9Is1qE violin,guitar,piano,clapping 226 | H5wBI98NXEE speech,cheering 227 | HCPfyrSiYQg drum,singing,cheering 228 | HFNY9NKFA4g playing_tennis 229 | HLqhkZAW0K8 speech,fixed-wing_aircraft 230 | HQtPEVGdOMw car,speech,laughter 231 | HcDycLidmE4 laughter 232 | HhxJViNn3Ik car,speech,laughter,dog 233 | HnURtEzECx8 playing_basketball,speech 234 | HoPcLLZhVA8 dance 235 | I5ECnXmzfCo playing_basketball,car_alarm,speech 236 | I6dcgD7U7Ng speech,playing_badminton,clapping 237 | IHqP9xNt9Rk laughter,clapping,speech 238 | IOBbVjG2IC8 helicopter,speech 239 | IbJdgddf2rs dog,alarm,cheering 240 | Ioum3RLkdvU cheering 241 | Isqd8mYJn6M laughter,speech 242 | IuYrbeqc-mA playing_tennis,cheering,speech,fixed-wing_aircraft 243 | J-oqCqZ2BGs playing_badminton,speech 244 | J3QTd-dLUUw speech,laughter,dog 245 | JBR2fOhOOs8 speech,laughter,cheering,clapping,playing_volleyball 246 | JEGQJNJ59Vo piano,singing,cheering,clapping,speech,laughter 247 | JJPnKWnyBk0 violin,accordion,drum,clapping 248 | JVdSd4gjptA guitar,singing,accordion,speech,laughter 249 | Jan3-mrKhes speech,piano,laughter,clapping,cheering 250 | Jf0tIV0FNuc singing,guitar,speech 251 | Ji7BgSmw-to violin,clapping 252 | Jmsd-qI9UCk clapping,speech,cheering 253 | JzOm6le-0jM speech,clapping,laughter 254 | K0qjglnaeCE laughter,speech,cry 255 | KANSCrz4ITY speech,laughter 256 | KI8ciS9W4Ww piano 257 | KM7OARP3qnU cheering,playing_volleyball,clapping,speech 258 | KOgVHbEZBRI playing_volleyball,clapping 259 | KTv3S4KMrxs guitar,drum,piano,singing 260 | KYbpNBqDRVM speech,playing_soccer,clapping 261 | KhN9_80lJE0 alarm,speech 262 | L1ds1J4Mzwc speech 263 | LAyb0tD6cC8 guitar,singing 264 | LGt7J0s1aZw fixed-wing_aircraft 265 | LMIlVKyiT5I speech,dog,laughter,cheering 266 | LSo9hvy1O_U cheering,clapping,speech 267 | LYq7aiuUo9Q cheering,speech,cry,dog,laughter,clapping 268 | Ldrwlo6nSXQ clapping,cheering,speech 269 | LjN5zrMeRQs speech,playing_soccer,alarm,cheering 270 | LqLrcG29b0o guitar,cello,accordion 271 | M3O49xZUjnE speech,laughter,cheering,clapping 272 | MQdNi2PLaSg speech 273 | MSF2r6_0mmU violin,piano,clapping,speech 274 | Mfi0NMtfUzI drum 275 | MkMOpb3HVxM speech,horse,laughter 276 | MycEFlLDOkY clapping,speech,cheering,guitar,singing 277 | N3PgUuVAMpA violin,cello,guitar,clapping,speech,laughter 278 | N6_yGKH-4Oo speech,playing_soccer 279 | NBpAt3TVOK4 playing_soccer,speech,cheering 280 | NL3J180GKDs playing_basketball,speech,laughter,cheering 281 | NTbNCezCJNM violin,piano,cello,drum 282 | NZL4sUqjUv0 clapping,laughter,piano 283 | NndiiezGkNY cheering,clapping,laughter 284 | Nz_CuNe0cT0 singing,cheering 285 | O2zgJUmCLt8 cheering,speech,clapping 286 | OB_tVv18Zhw guitar,drum 287 | OI5wo5D8k8Q piano,speech,laughter,singing,clapping 288 | OVHtv2UehLU speech,clapping,cheering,piano 289 | OieTT6UUs2g speech 290 | OrMf-D4QOqM speech,guitar 291 | P3gIi9su2Rw dog 292 | PGmM0l9jfC4 speech,laughter,cry 293 | POBYjeKzvcY speech,laughter,cry 294 | PYxx7gxFIZM clapping,playing_badminton,cheering,speech 295 | Pfof8hGZIt8 guitar 
296 | PxKp7_lASKI cheering,laughter,clapping,speech,drum,singing 297 | Q-4edTX71fI banjo,violin,singing 298 | Q9oLbZlT3sQ speech,clapping 299 | QKj8n_um4Zg speech,playing_soccer,clapping,alarm,cheering,guitar 300 | QUR6mUOZJes car_alarm,helicopter 301 | Qkzg7wB-nxw accordion,piano,clapping,cheering,speech 302 | QwF7a9grl2w cheering,singing 303 | R9OrWMSMKXg shofar 304 | RToAwVlhA7A cheering,clapping,guitar,speech,drum,singing 305 | RgaVv4p1SkA guitar 306 | RwiF_Q8-pyU violin,car 307 | SO4O873MsXk singing,cheering,clapping,speech 308 | S_1q7Q5DX78 cello,guitar,accordion 309 | SevXOPt5cCo cheering,clapping,playing_badminton,laughter,speech 310 | T-shmnmIC2w piano,singing,laughter,clapping,cheering 311 | TEy5H1Pe7ck shofar 312 | TQ_JHaSF-D8 clapping,guitar 313 | TzyUERfGnRo guitar,accordion,cheering,clapping 314 | U7Yne1CRZyg dog,car,speech,car_alarm 315 | US_NJY_cGqw rodents,speech 316 | UtnGtAYOPgM rodents 317 | VMAk1_bcv90 speech,clapping 318 | VuzvEGJi8ew speech 319 | WCRlezUfsyo speech,laughter,cheering,guitar,clapping 320 | WaaANll8h18 speech,guitar,drum,piano 321 | WoEBDNm7n_c speech,car,laughter 322 | X7ymriMhoj0 speech,laughter 323 | XNcZDhjb9mU drum,speech,clapping,alarm 324 | XX6tSilHbPs piano,guitar,singing,drum,clapping,cheering 325 | XfIf9hK8HiA guitar 326 | XusQ3T98QAc speech,dog 327 | Y9VBgCC2D0A drum,guitar,piano,singing,clapping 328 | YKs8UbLjPwo drum,singing,speech,laughter,clapping 329 | YbQ44pTU-FM speech,banjo,violin,guitar,cello,singing,laughter 330 | ZF-pF6v3fb0 laughter,clapping,cheering,speech,accordion 331 | ZX3i8qp4Hsk speech,piano,singing 332 | ZerAVavECMs singing,laughter,speech 333 | _-sfoqUa0vs speech,cheering,clapping,laughter,violin,drum,singing,cello 334 | _5jP7maXfwQ speech 335 | _8xMgAApZIU singing,speech,playing_soccer,clapping,drum 336 | _DkR4H_QOyo guitar,singing 337 | _LQ-2JH5I4U cheering,clapping,playing_volleyball,speech 338 | _OXU50DYMfg violin 339 | _XKrJ5QwjIo accordion,guitar,drum,piano,singing 340 | _hxeoQvSuic cheering,clapping,speech,piano,guitar,drum 341 | _lZzYZBxbxA playing_tennis,cheering,clapping 342 | _qpfYglF5bI clapping,cheering,laughter,singing 343 | _tGdoqj0Qh8 dog,shofar 344 | a2c8sv0SLUo speech 345 | a84qOOQgfp0 singing,cheering,clapping,speech 346 | a9jkbha8bmc speech,car,laughter 347 | aHwzzSYhHrs violin,guitar,cello 348 | aPwcDG6F2aM clapping,cheering,speech,drum 349 | a_90uFCrWWY speech,dog 350 | ar_G9Goi-TI drum,guitar,singing,cheering 351 | avJNxcysMCk speech,clapping 352 | axXpdF1Z9II car 353 | b5x3xJITTsQ speech,playing_soccer,cheering,clapping 354 | bELmGWXOlQ0 speech,clapping,laughter,singing,drum 355 | bK05D61SXmM speech,playing_baseball 356 | bP8P5WBebl0 guitar,singing 357 | bV6YVSupGVg cheering,speech,clapping 358 | bX7S3HqdlAI speech,laughter,car,dog,cheering,clapping 359 | beqQ4gtKn84 speech,laughter 360 | blL29NvAMic bicycle 361 | btwy0yCQL14 speech,clapping,violin 362 | by4dS4tgkY0 speech 363 | c-cAHw-Q5lc car,alarm,speech 364 | c4PS6uywL7o speech,laughter 365 | cFYxRp9K9L4 violin,piano,clapping,cheering 366 | cXoGMM-PyHY chicken_rooster 367 | ckMCPCQkM_k speech,laughter 368 | cpg9rdc-ONc dog,cheering,speech 369 | cxxeRzfJ1_c car 370 | d3Ltp10L_q8 speech,laughter,singing,guitar 371 | d7Ny0aaL90E clapping,cheering,piano,violin,speech 372 | dD1tDnSUQ_I speech,singing,clapping 373 | dNBRFnEpCOw shofar,speech,cheering,clapping,guitar,banjo,drum,singing,laughter 374 | dQw4w9WgXcQ dance,singing 375 | dV6UMWryqGY speech,singing,clapping,chainsaw 376 | dZ-aFiQAPwI speech,dog 377 | df_Xv2M5WT4 
playing_tennis,speech,cheering,clapping 378 | dmYHbO-eu8E dog,speech,singing 379 | dr1QWbfsPGY singing,cheering,speech 380 | e0a1lp4ZWu8 speech,laughter,clapping,horse 381 | e9xV240-Jzw drum,cheering,clapping,speech 382 | eGijYoJ_6nY guitar 383 | eS1r2Qi0qUM playing_badminton,speech,laughter,cheering 384 | eYYMdLk4IcY piano,speech,cello,shofar,violin 385 | egvE90X3NAE rodents,speech,dog 386 | es2krnRZ2Ko speech,clapping 387 | eyXE8nKb0AA banjo,speech 388 | f4dndkH9Y_w banjo,violin,laughter,cheering,clapping 389 | fIdVyM5Nr5w speech,guitar,cheering,singing 390 | fQDaz5kZHIk drum,accordion 391 | faXQShej178 car 392 | fgL83KdXw_U cello,violin,clapping 393 | fxI8zr-CPRg piano,cello 394 | g3uQ-O_o_Rc laughter,speech 395 | gBh0Ig5kzCs guitar,violin,cello 396 | gO8N3L_aERg speech,alarm 397 | gV6hP9wpMW8 speech,clapping,cheering,laughter,car 398 | gpN2k5zz81o speech,cheering,clapping,violin,singing,laughter 399 | gt42q0Ck-C4 cheering,clapping 400 | h7oJYFZViuI accordion,drum,guitar,piano,singing 401 | hBDAh1ivs0o speech,laughter,singing,clapping,cheering 402 | hLwhdHEDb-U playing_baseball,speech 403 | hXSee4C6pyE speech 404 | hgT707uizYI clapping,speech,laughter 405 | hmopCSSKX2U speech,singing 406 | hthXd_Js5AE guitar,drum,singing,cheering,clapping 407 | i3JjmOTkPOw speech,clapping,cheering,laughter 408 | iAQYcYCalOA drum,clapping,singing 409 | iFr2H2p063M speech,car,laughter 410 | iWdE8ry309k dog,cat,speech,laughter 411 | ifHRDBx-ctw accordion 412 | iqoPD-DlGIE piano 413 | j-QeBhOXPf8 violin 414 | j3oEXXy6-p0 piano,laughter 415 | jAsKLkkO6rs singing,laughter,speech 416 | jOs3UfxE8XE cheering,singing,speech 417 | jkXJfDPB57w singing,drum 418 | jroTvXsKUiA alarm,speech 419 | k5ug9gzPS4M playing_basketball 420 | kBiGFEIqZHA playing_soccer,clapping 421 | kNZkGy3wAEE speech,singing,guitar,laughter 422 | kda6dpSX0jY speech,dog,helicopter 423 | kolGmya2OIs speech,guitar,singing 424 | kwNR-6sfog4 speech,piano,laughter,clapping 425 | l6p53lBcIGw horse 426 | lHcUr0POrKY guitar,singing,laughter 427 | lNQexzgzvw4 speech 428 | lWTnQ4qj24U accordion,clapping,cello,guitar,piano,cheering 429 | lfizshQHNQo piano,cello 430 | lnijMHh8qh4 speech,laughter,piano 431 | lvZBGJkbeTE cheering,singing,clapping 432 | m5ex66Vst80 drum,singing,guitar,speech 433 | mQl_6mcQBaQ piano,speech,clapping,cheering 434 | mcc8UNgZh_I speech,car_alarm 435 | mkN6QdXU2fQ cheering,guitar,drum,speech 436 | n47yP1yfJWQ violin,clapping 437 | nAExL0lNLM0 clapping,playing_volleyball,cheering,singing,car_alarm,speech 438 | nBNHSOn7N1s piano,laughter,cheering 439 | nIsZ4Q0y100 speech,cheering 440 | nNxaCXUC0l4 speech,guitar,singing 441 | nRxFEDNT-Z0 cheering,singing,drum,speech 442 | npyseVDYQbY violin,guitar,accordion 443 | o16h6voegFU speech,laughter,singing 444 | oEepqn5vJiE playing_baseball 445 | oNNlSdkjuqA speech 446 | od140iTEaVg car,helicopter,laughter 447 | ooDu3m-ixQs helicopter,chainsaw,speech,laughter 448 | p6BWQkhJ2kQ speech,violin 449 | p6ra0IBnL20 drum,speech,singing,guitar,shofar 450 | pH4q6OXhZgU speech,playing_soccer,clapping,cheering 451 | pOLqlica-bM car 452 | piaLzfKLs9c playing_volleyball,speech,laughter,dog 453 | psMNOvfJ1LE car,guitar,bicycle 454 | qRFNqIN_LSc accordion,clapping 455 | q_NJ800CdGU accordion,guitar,drum,piano,singing 456 | qfGggAGITwg cheering,guitar,drum,cello 457 | qqkhtf9zplA piano 458 | r4lWcFb8854 speech,violin,singing 459 | rDq5TQYn9Ak laughter,cheering,singing,clapping 460 | rRl66L-5TY4 violin,cheering 461 | r_5bYNqpEFU piano,cello 462 | rokxoxFsWPU speech,laughter 463 | rtMR524m0BM speech,singing 464 
| s3c0A6YmIIc cello,guitar,violin,singing,clapping,cheering 465 | sPvTATw6LQA violin,speech,clapping 466 | sfyd3WX2OjE shofar,speech 467 | t56JVDxycY4 speech,cheering,clapping,dance,singing,drum 468 | tTOihIRXPGs drum 469 | ty_LDBaE7gw piano,violin,drum 470 | uEwMTxbpbrA violin,cello,shofar,cheering,clapping 471 | uhFJdAJkgjo drum,speech 472 | v2lwdvydNsw speech,laughter 473 | vKBn0Rn93TQ piano 474 | vVkdhLjRchA guitar,shofar,drum 475 | vxWMRjF_0wM speech,playing_volleyball,laughter,clapping 476 | wDKE5BHyCBg speech,dog 477 | wNCJhki5_eM cello,guitar,piano,violin 478 | weKt0KjakLI speech,shofar 479 | x5isH2gqS4o horse,speech 480 | xPvEzH2UXnw speech,dog 481 | xsRNDo8Upys laughter,clapping,piano 482 | yJglmIQzriI speech,piano,guitar,clapping 483 | ySibvCKNOU8 drum,speech,playing_basketball,cheering 484 | ysCaqh38JVQ speech 485 | zO8IwPbALK4 cheering,clapping,guitar,singing 486 | zfeHACNRs20 cheering,speech,clapping,singing,piano 487 | zmCKZYKsiGM cheering,clapping,speech,laughter 488 | -------------------------------------------------------------------------------- /LFAV_dataset/val/val_visual_weakly.csv: -------------------------------------------------------------------------------- 1 | filename event_labels 2 | -2UWP3n_05A speech,singing 3 | -8yJXNCnRpk cello,guitar,cat 4 | -L_xGAyM_Qw chicken_rooster 5 | 01iz9kmvbxg speech,dance,clapping,cheering,laughter 6 | 08hzunIk81Y bicycle,car,dance,dog,speech,laughter,clapping 7 | 0Hz4R_m0hmI car 8 | 0REH7oc48C0 rodents 9 | 0ZOyJM8b0a0 speech,helicopter 10 | 0_tZ4OTT1MI guitar,dance,speech,singing 11 | 0b-o15cMJPE violin,speech,laughter,dance 12 | 0eXu0vIIMVc chainsaw,speech 13 | 0fjrDsLr_aE chainsaw 14 | 0pJ1oD-OXZo speech 15 | 0vzuDkjtDOY speech,laughter 16 | 119gUU8x0m8 helicopter,bicycle,car,speech 17 | 1DCDHmGcB5c car,helicopter 18 | 1KSpPr05jgo guitar,drum,singing 19 | 1OJp9yVaZiE speech,guitar,cello,banjo,violin,singing 20 | 1Rt6qQz0w7c playing_badminton,clapping,speech,cheering,laughter 21 | 1cJ4FKeXAQA guitar,speech,laughter 22 | 1hNwQMJtGew rodents 23 | 1ozVekZgpZE helicopter,car,bicycle 24 | 1ur8hokq-Cs chainsaw 25 | 1zKq8IJkcnk speech,drum,guitar,singing 26 | 24nBh-zzpsQ piano 27 | 2I1ZU5g1QNo piano,singing,guitar,violin 28 | 2MYfUQFr6S0 speech,chainsaw 29 | 2V9PXkRdE3I guitar 30 | 2WogzOS_D9Q chainsaw 31 | 2_WpcixEOP4 cello,dog,speech 32 | 2bmuqqUE8Z4 speech,drum,laughter,cheering,clapping 33 | 2pVJmT0AG3Q car_alarm 34 | 2t0HUQleGwU rodents 35 | 2xz2ti1RfwY piano,violin,clapping 36 | 305jW5Q5Ewk guitar,speech,singing,clapping 37 | 36fI5gwfHqU playing_tennis 38 | 38Z3ovrzVdo fixed-wing_aircraft 39 | 3Bsnuew8f6s piano,laughter 40 | 3EifSw1bvmA speech,singing,dance,piano,drum 41 | 3Gbj7ukbvQE speech,violin,laughter 42 | 3KjtkJiip-8 playing_badminton,laughter,speech 43 | 3LDmXpcjrC4 speech,frisbee 44 | 3M2TFEF01t4 playing_badminton,laughter,clapping 45 | 3NHJUkdHwtA shofar 46 | 3R-1x4pPBBQ speech,playing_baseball 47 | 3Sby3LP-cbU chainsaw 48 | 3UF7YxKUeks speech,laughter,cheering,cry 49 | 3X2xAD-9eiM helicopter 50 | 3_J8f_tvqmo playing_volleyball 51 | 3e-52Lkrl0w bicycle 52 | 3lzA9PRMiUk car,car_alarm,bicycle 53 | 3o-Eaixw6ec speech,helicopter 54 | 3sMa94JiLZI chainsaw,car,dog 55 | 3wZ_HHViGpM playing_badminton 56 | 4-X3c59e3Hk helicopter 57 | 41V_J6hdEhs playing_badminton,speech 58 | 45RDZ3owtvY helicopter,chainsaw,speech 59 | 48XkS3Hkmws cry,laughter,dog 60 | 4DDfD4-cEVo fixed-wing_aircraft 61 | 4I6z0N2xdh8 speech,alarm 62 | 4MAdVOtBQD8 laughter,dance 63 | 4Yt7hmubn58 playing_badminton,speech 64 | 4exLWR92ycs singing,laughter,speech 65 
| 4igvehYCc7U guitar,singing,speech,laughter 66 | 4leF1RwXFSE chainsaw 67 | 4ssWujrqMhs speech 68 | 4tgjvpFd6mc rodents 69 | 4xyw9i0VvRY playing_volleyball,cheering 70 | 50btuV9sY60 fixed-wing_aircraft 71 | 54PhWYrDTgw chainsaw,car,dog 72 | 56ddX-fmEtY speech,laughter,cry,clapping 73 | 598s4riqw3M horse,frisbee,dog,speech,laughter 74 | 5CkG2GTiK4M chainsaw 75 | 5ECzbUNtnOk cat 76 | 5J-ZeYlddSk clapping,laughter,speech,violin,dance 77 | 5K7ny6CVZRE laughter,speech,guitar,singing 78 | 5NqpmNqPDeQ chainsaw 79 | 5OUHZWamrec speech,piano,clapping,laughter 80 | 5RPDLz_RCbE helicopter,bicycle 81 | 5V51JifwW48 cry 82 | 5ZELJ-0sGCw laughter,cello,clapping 83 | 5cKS_KuoU00 fixed-wing_aircraft 84 | 5kE3ngBWV_U fixed-wing_aircraft,train,speech,laughter,car 85 | 5mKfMbsjeAs chainsaw 86 | 5syUwJ9dEpE shofar 87 | 5uwzHqZn-Os alarm 88 | 5vlGKyxl22M speech,cheering 89 | 5zSPUfVm32A chainsaw 90 | 6-atw21yanY playing_volleyball,clapping,playing_basketball 91 | 65PjoTErZlw playing_badminton 92 | 67PlMRhS5qc chainsaw 93 | 6AVMcJa77PM car,laughter,speech,cheering 94 | 6Dg6Q6tQMiE laughter,guitar,singing 95 | 6GLXPPOYfWs playing_badminton 96 | 6Jls8goVRuU playing_basketball,cheering,clapping,dance 97 | 6NOW0_HuUCQ chainsaw 98 | 6TnKvlQ2h7s playing_soccer 99 | 6WlLqF5eXkM guitar,cello,banjo 100 | 6ctzdi3lZ5M rodents 101 | 6iefuGEvlHI shofar 102 | 6mraw-F7qQg cello,accordion 103 | 6sHHdIWDmnk speech,laughter,guitar,singing 104 | 6ynhCAZ8KIo guitar,speech,laughter 105 | 736A0wfsYDw playing_badminton,clapping 106 | 76rJsot3DqM chainsaw 107 | 78D25pNtSSo speech,car_alarm 108 | 7EJ1TlhoFKc chainsaw 109 | 7MK3P3StM6U playing_badminton,speech 110 | 7Sidspb9zhk horse 111 | 7Wb2zZQkpPo car,helicopter 112 | 7YwxtA1MNuk guitar,laughter,speech,singing,cheering,clapping 113 | 7_pRiUfp938 speech,dance 114 | 7bgTxPSZ2-o chainsaw,speech,car 115 | 7iIBkME4bSM speech,helicopter,car_alarm 116 | 7myife0ZQSc dance,clapping 117 | 7qbbqP9mTIE shofar,car 118 | 8-E3tzyQdN4 helicopter,car 119 | 88cL4GK6vyI clapping,singing,laughter,cheering 120 | 8NY3TWbkg3I chainsaw,car 121 | 8_t0cc17u68 speech,laughter,dance,singing,clapping,cry 122 | 8gO_lxThc1M piano,guitar,singing 123 | 8mc7uArd7zw guitar,piano 124 | 8q_vgp0FDhI helicopter,car 125 | 8t-JhhYhZyk singing,guitar,violin,banjo,cello 126 | 8yBGrAo62iQ chainsaw 127 | 9-iVuKsU8ZI rodents 128 | 91SmSyaO3ys clapping,speech,laughter,cheering 129 | 95shZG1q4ks playing_volleyball,laughter,speech 130 | 9DEfXtd0yPg cat 131 | 9NP-1iiy8P4 speech,chainsaw 132 | 9TMpVoKPUgM playing_soccer,cheering 133 | 9Xw3rUf873I playing_badminton 134 | 9dpDFE3UL_E accordion,dance 135 | 9gKnLxvCCJY playing_badminton,speech 136 | 9rdCnIUnxVs speech,violin,dance,singing 137 | 9w72SdFNnXU chainsaw 138 | 9z6Mzyar4gU playing_badminton 139 | A4Ly7wwElF0 playing_basketball 140 | AAxqVPd_wd0 accordion,car,clapping 141 | AHSLBch9Y6A playing_badminton 142 | AJrmVyCWbq8 piano 143 | ANAGPl7QZ0k helicopter 144 | AQvhFMkI51Y helicopter 145 | ATd-LDQ5MvM speech,laughter,clapping,piano 146 | Aatxa3zgXFk dance,cheering 147 | AcErIT001pg guitar 148 | AdTZKIG4t-A fixed-wing_aircraft 149 | AhKBRh8Oxfs speech,playing_badminton 150 | Al5Ks0PnV0w playing_soccer 151 | ApWlaLTPpho horse,car 152 | ArVDnCbQ6mU helicopter,car,fixed-wing_aircraft 153 | B-Ua2uzRwu4 playing_badminton,speech 154 | B8ISzf2pryI cat,dog 155 | BCyCm2E_GC0 helicopter,car 156 | BFNekjEgvuk cat,chicken_rooster 157 | BMVJbAObEQM violin,drum,singing 158 | BVT3acNFzqc guitar,singing,clapping,cheering,speech 159 | BeS5uOUBNFc playing_soccer 160 | BjtruQ_xQQ8 
helicopter,car,speech,dog 161 | Bql-XqRjM7U speech,guitar,singing 162 | BtP6_sCuSLM speech,laughter 163 | BxXl5j9qEuc speech,alarm,laughter 164 | C2A9dIL2ECQ helicopter,dog 165 | C6nHvKcT6j8 speech,clapping,dance 166 | C8cxbPH-Hmg speech,playing_badminton 167 | C9scOaHzSOA playing_badminton 168 | CIvtiNpKEY0 speech 169 | CLm9FYDRxsA helicopter 170 | CQTa01iGUSs guitar 171 | CXRO93dNB_A playing_tennis 172 | C_G0qaFcSCM singing,clapping 173 | CeiVB75kOOU accordion 174 | CpG-YZWg3L4 cheering,car,speech,frisbee 175 | CskZG-HM5Ko accordion,speech,laughter 176 | Cw-US2uaOKs helicopter 177 | D9roj8pPsvU drum,guitar,singing,clapping,cheering 178 | DAC2Z-54HDY piano,laughter,speech 179 | DFYOtfJsXgM playing_baseball,playing_badminton,speech,clapping 180 | DQ3lbf7PCoY banjo,drum,car,speech,bicycle,singing,clapping 181 | DUlxiX4ri1Q alarm 182 | DXGQnIv5vkA dog 183 | DeQ_3bCcodk rodents 184 | DkXsN6qwDEw horse,cheering 185 | Do__idgHWJM speech,playing_basketball 186 | DxqGhpqu2z4 speech,guitar 187 | DynoOMg9Mvw fixed-wing_aircraft 188 | E4qiicR74hY clapping,violin,piano 189 | E6CcUj2mDbI speech 190 | E8kiWmlMN_A playing_soccer,speech,car 191 | EA58Fo2e0CI speech,laughter 192 | EHTTmj4bBDg singing,guitar,drum,piano,clapping 193 | EPEmEMp8zp4 piano,cello,speech,shofar,violin 194 | ESRG74Q_r6k playing_basketball 195 | EayN_Jj0740 violin 196 | EhAY6-dIO2I dog 197 | EleoJrKrHO0 guitar,violin,accordion,piano,clapping 198 | EtV6RzLCBlU guitar,speech,laughter,singing,cheering 199 | EvA2RNci3SE guitar,dance 200 | F4z5PKQT9Cg piano,cello 201 | FA0avnkqRV0 clapping,cheering 202 | FKQCVhR702M dog 203 | FQqwy9PPhX4 playing_basketball 204 | FTyIy8TLoCk dog,speech 205 | FYv4-57P9r0 speech,alarm 206 | F_WZZd7efr0 violin 207 | Fbs_wxRXOVU rodents 208 | Fju_-bLUhtg speech,playing_badminton 209 | FnA_H-RBaXE speech,horse 210 | Fqc_5UqYffI helicopter,car 211 | Fygm03Gw19Y helicopter 212 | G486CNb4GTw guitar,singing,drum,cheering,clapping 213 | G7bNye0f9eQ clapping,speech,laughter,playing_baseball,dance,cheering 214 | GCKIImqFeQ4 dog 215 | GEsAdwCJox0 speech,playing_baseball 216 | GI7p6fbHfx4 dog,guitar 217 | GQJ3-6ZBsfg playing_basketball 218 | GYvApvHOPXc rodents 219 | Gdr3ImP4Gv4 banjo,drum 220 | GlTsktEX8jw clapping,playing_soccer 221 | GoUuozWmoFM violin,piano 222 | GroFCq4B5xo playing_ping-pong,speech 223 | GvrN5VT1Zf0 guitar 224 | GywIlkOUPWw bicycle 225 | H29pf9Is1qE violin,piano,guitar 226 | H5wBI98NXEE speech,playing_basketball 227 | HCPfyrSiYQg drum,laughter 228 | HFNY9NKFA4g playing_tennis 229 | HLqhkZAW0K8 fixed-wing_aircraft,car 230 | HQtPEVGdOMw car,bicycle 231 | HcDycLidmE4 cat 232 | HhxJViNn3Ik dog,car 233 | HnURtEzECx8 playing_basketball,speech 234 | HoPcLLZhVA8 car_alarm 235 | I5ECnXmzfCo playing_basketball 236 | I6dcgD7U7Ng playing_badminton,clapping 237 | IHqP9xNt9Rk laughter,speech,clapping 238 | IOBbVjG2IC8 helicopter 239 | IbJdgddf2rs alarm,dog 240 | Ioum3RLkdvU dance 241 | Isqd8mYJn6M laughter,speech 242 | IuYrbeqc-mA playing_tennis,car 243 | J-oqCqZ2BGs playing_badminton 244 | J3QTd-dLUUw dog,speech 245 | JBR2fOhOOs8 playing_volleyball 246 | JEGQJNJ59Vo piano,singing,speech 247 | JJPnKWnyBk0 violin,accordion,drum 248 | JVdSd4gjptA car,guitar,singing,accordion,speech,laughter 249 | Jan3-mrKhes piano,bicycle,train,car,clapping 250 | Jf0tIV0FNuc singing,guitar,speech 251 | Ji7BgSmw-to violin 252 | Jmsd-qI9UCk speech,car,clapping,cheering 253 | JzOm6le-0jM speech,laughter,clapping 254 | K0qjglnaeCE car,dog,cry 255 | KANSCrz4ITY cat 256 | KI8ciS9W4Ww piano 257 | KM7OARP3qnU 
playing_volleyball,cheering,clapping,laughter,speech 258 | KOgVHbEZBRI playing_volleyball 259 | KTv3S4KMrxs guitar,drum,piano,singing 260 | KYbpNBqDRVM playing_soccer 261 | KhN9_80lJE0 horse 262 | L1ds1J4Mzwc speech,car 263 | LAyb0tD6cC8 guitar,piano,singing 264 | LGt7J0s1aZw fixed-wing_aircraft 265 | LMIlVKyiT5I dog,car 266 | LSo9hvy1O_U playing_volleyball 267 | LYq7aiuUo9Q dog,speech,clapping,laughter 268 | Ldrwlo6nSXQ bicycle,clapping 269 | LjN5zrMeRQs playing_soccer,speech,clapping,cheering 270 | LqLrcG29b0o guitar,cello,accordion 271 | M3O49xZUjnE speech,laughter,playing_soccer,clapping 272 | MQdNi2PLaSg speech,car,car_alarm 273 | MSF2r6_0mmU violin,cello,piano,speech 274 | Mfi0NMtfUzI cat 275 | MkMOpb3HVxM horse 276 | MycEFlLDOkY clapping,guitar,speech,cheering,singing 277 | N3PgUuVAMpA violin,cello,guitar,bicycle,car 278 | N6_yGKH-4Oo speech,playing_soccer 279 | NBpAt3TVOK4 cheering,playing_soccer 280 | NL3J180GKDs playing_basketball 281 | NTbNCezCJNM violin,piano,cello,drum,singing 282 | NZL4sUqjUv0 piano 283 | NndiiezGkNY clapping,laughter 284 | Nz_CuNe0cT0 car,singing 285 | O2zgJUmCLt8 playing_volleyball 286 | OB_tVv18Zhw guitar,drum 287 | OI5wo5D8k8Q piano,speech,laughter,clapping 288 | OVHtv2UehLU speech,piano 289 | OieTT6UUs2g cat,rodents 290 | OrMf-D4QOqM guitar,speech 291 | P3gIi9su2Rw speech 292 | PGmM0l9jfC4 speech,laughter 293 | POBYjeKzvcY cry 294 | PYxx7gxFIZM playing_badminton 295 | Pfof8hGZIt8 guitar 296 | PxKp7_lASKI speech,laughter,drum,singing,dance,cheering 297 | Q-4edTX71fI banjo,violin,singing 298 | Q9oLbZlT3sQ speech,horse,clapping 299 | QKj8n_um4Zg playing_soccer 300 | QUR6mUOZJes car_alarm,helicopter,speech,car 301 | Qkzg7wB-nxw accordion,clapping 302 | QwF7a9grl2w dance 303 | R9OrWMSMKXg speech,shofar 304 | RToAwVlhA7A clapping,cheering,guitar,singing,drum 305 | RgaVv4p1SkA guitar,singing,clapping,cheering 306 | RwiF_Q8-pyU violin,car,speech 307 | SO4O873MsXk drum,clapping,dance,guitar 308 | S_1q7Q5DX78 cello,guitar,accordion 309 | SevXOPt5cCo playing_badminton,clapping,playing_basketball,laughter 310 | T-shmnmIC2w piano,singing,laughter,clapping 311 | TEy5H1Pe7ck shofar 312 | TQ_JHaSF-D8 guitar,clapping,playing_volleyball 313 | TzyUERfGnRo accordion,guitar,drum,laughter,clapping 314 | U7Yne1CRZyg dog,car 315 | US_NJY_cGqw rodents 316 | UtnGtAYOPgM rodents 317 | VMAk1_bcv90 horse,car,laughter 318 | VuzvEGJi8ew speech,car 319 | WCRlezUfsyo speech,guitar 320 | WaaANll8h18 speech 321 | WoEBDNm7n_c speech,car,bicycle,cat 322 | X7ymriMhoj0 speech,laughter,clapping 323 | XNcZDhjb9mU clapping,speech,playing_badminton,laughter,dance 324 | XX6tSilHbPs singing,guitar 325 | XfIf9hK8HiA guitar 326 | XusQ3T98QAc dog,car 327 | Y9VBgCC2D0A dance,singing,drum 328 | YKs8UbLjPwo dance,laughter 329 | YbQ44pTU-FM cello,violin,guitar,banjo,singing 330 | ZF-pF6v3fb0 accordion,laughter,speech 331 | ZX3i8qp4Hsk speech,piano,singing 332 | ZerAVavECMs dance,speech 333 | _-sfoqUa0vs violin,speech,laughter,clapping,cry,cello,drum,horse 334 | _5jP7maXfwQ car,dog 335 | _8xMgAApZIU playing_soccer,speech,clapping 336 | _DkR4H_QOyo guitar,singing 337 | _LQ-2JH5I4U cheering,clapping,playing_volleyball 338 | _OXU50DYMfg violin,horse 339 | _XKrJ5QwjIo drum,guitar,accordion,piano,singing 340 | _hxeoQvSuic dance 341 | _lZzYZBxbxA playing_tennis 342 | _qpfYglF5bI laughter,clapping 343 | _tGdoqj0Qh8 speech,shofar,dog 344 | a2c8sv0SLUo speech 345 | a84qOOQgfp0 dance,clapping 346 | a9jkbha8bmc dog,car 347 | aHwzzSYhHrs violin,guitar,cello 348 | aPwcDG6F2aM 
speech,clapping,cheering,laughter,drum,guitar 349 | a_90uFCrWWY guitar,speech,dog 350 | ar_G9Goi-TI drum,singing 351 | avJNxcysMCk clapping,cheering 352 | axXpdF1Z9II car 353 | b5x3xJITTsQ speech,playing_soccer 354 | bELmGWXOlQ0 speech,clapping,laughter 355 | bK05D61SXmM playing_baseball 356 | bP8P5WBebl0 guitar,singing 357 | bV6YVSupGVg playing_soccer,clapping 358 | bX7S3HqdlAI speech,laughter,car,dog,clapping 359 | beqQ4gtKn84 horse 360 | blL29NvAMic bicycle 361 | btwy0yCQL14 speech,clapping,violin 362 | by4dS4tgkY0 car,bicycle,speech 363 | c-cAHw-Q5lc car,speech 364 | c4PS6uywL7o rodents,speech 365 | cFYxRp9K9L4 chicken_rooster,piano 366 | cXoGMM-PyHY chicken_rooster 367 | ckMCPCQkM_k speech,laughter,horse 368 | cpg9rdc-ONc dog 369 | cxxeRzfJ1_c car 370 | d3Ltp10L_q8 speech,laughter,singing,guitar 371 | d7Ny0aaL90E clapping,violin,piano 372 | dD1tDnSUQ_I speech,singing,clapping 373 | dNBRFnEpCOw speech,guitar,banjo,drum,accordion,singing,dance,laughter 374 | dQw4w9WgXcQ dance,singing 375 | dV6UMWryqGY dance 376 | dZ-aFiQAPwI cat,dog 377 | df_Xv2M5WT4 playing_tennis,car,clapping 378 | dmYHbO-eu8E dog 379 | dr1QWbfsPGY car,cheering 380 | e0a1lp4ZWu8 horse,car,speech,laughter,clapping 381 | e9xV240-Jzw drum,guitar,clapping,cheering,speech 382 | eGijYoJ_6nY guitar 383 | eS1r2Qi0qUM playing_badminton 384 | eYYMdLk4IcY speech,laughter,car,horse,shofar,accordion,violin 385 | egvE90X3NAE cat,rodents,speech,car,laughter 386 | es2krnRZ2Ko speech,laughter,clapping 387 | eyXE8nKb0AA banjo,speech 388 | f4dndkH9Y_w banjo,violin,clapping 389 | fIdVyM5Nr5w speech,guitar,singing,laughter 390 | fQDaz5kZHIk drum,accordion,car 391 | faXQShej178 bicycle,cat,car 392 | fgL83KdXw_U violin,cello,clapping 393 | fxI8zr-CPRg piano,cello,laughter 394 | g3uQ-O_o_Rc rodents 395 | gBh0Ig5kzCs guitar,violin,cello 396 | gO8N3L_aERg speech,cat,alarm 397 | gV6hP9wpMW8 speech,laughter,car 398 | gpN2k5zz81o speech,clapping,cheering,violin,singing 399 | gt42q0Ck-C4 bicycle,clapping 400 | h7oJYFZViuI dance,guitar,accordion,drum,piano,singing 401 | hBDAh1ivs0o speech,dance,laughter,cheering,singing 402 | hLwhdHEDb-U playing_baseball,clapping 403 | hXSee4C6pyE bicycle 404 | hgT707uizYI clapping,speech,laughter 405 | hmopCSSKX2U car,singing,laughter 406 | hthXd_Js5AE drum,guitar 407 | i3JjmOTkPOw laughter,clapping 408 | iAQYcYCalOA drum,laughter,clapping,dance,singing 409 | iFr2H2p063M dog,car 410 | iWdE8ry309k speech,dog,cat,laughter 411 | ifHRDBx-ctw accordion 412 | iqoPD-DlGIE piano 413 | j-QeBhOXPf8 violin 414 | j3oEXXy6-p0 laughter,piano 415 | jAsKLkkO6rs guitar,speech,laughter 416 | jOs3UfxE8XE singing,dance,laughter 417 | jkXJfDPB57w singing,drum 418 | jroTvXsKUiA dog,car 419 | k5ug9gzPS4M playing_basketball,speech 420 | kBiGFEIqZHA playing_soccer 421 | kNZkGy3wAEE laughter,speech,singing,guitar 422 | kda6dpSX0jY helicopter,speech,car 423 | kolGmya2OIs speech,guitar,singing 424 | kwNR-6sfog4 speech,violin,clapping 425 | l6p53lBcIGw horse 426 | lHcUr0POrKY guitar,singing,car,laughter,speech 427 | lNQexzgzvw4 speech,frisbee 428 | lWTnQ4qj24U accordion,piano,cello,drum,clapping 429 | lfizshQHNQo cello,piano 430 | lnijMHh8qh4 laughter,speech 431 | lvZBGJkbeTE dance,singing,clapping 432 | m5ex66Vst80 drum,singing,guitar,piano,speech 433 | mQl_6mcQBaQ piano,clapping,train,car,cheering 434 | mcc8UNgZh_I speech,car,car_alarm 435 | mkN6QdXU2fQ guitar,drum,car,speech 436 | n47yP1yfJWQ violin 437 | nAExL0lNLM0 playing_volleyball,clapping,cheering,speech,laughter 438 | nBNHSOn7N1s piano,clapping,laughter 439 | nIsZ4Q0y100 bicycle 440 
| nNxaCXUC0l4 guitar,laughter,speech,singing 441 | nRxFEDNT-Z0 drum,dance 442 | npyseVDYQbY violin,guitar,accordion 443 | o16h6voegFU speech,laughter,piano,singing 444 | oEepqn5vJiE playing_baseball,speech,drum 445 | oNNlSdkjuqA horse,speech 446 | od140iTEaVg car,helicopter,laughter 447 | ooDu3m-ixQs helicopter,chainsaw 448 | p6BWQkhJ2kQ speech,violin 449 | p6ra0IBnL20 shofar,drum,car 450 | pH4q6OXhZgU cheering,playing_soccer,clapping 451 | pOLqlica-bM car 452 | piaLzfKLs9c car,playing_volleyball,dog 453 | psMNOvfJ1LE car,singing,guitar 454 | qRFNqIN_LSc accordion 455 | q_NJ800CdGU accordion,guitar,drum,piano,singing 456 | qfGggAGITwg cheering,drum,cello,dance,guitar,laughter 457 | qqkhtf9zplA dance 458 | r4lWcFb8854 violin,speech,laughter 459 | rDq5TQYn9Ak laughter,dance,cheering,singing,clapping 460 | rRl66L-5TY4 violin 461 | r_5bYNqpEFU piano,cello 462 | rokxoxFsWPU horse,dog,laughter 463 | rtMR524m0BM dance 464 | s3c0A6YmIIc cello,guitar,violin,singing,dance 465 | sPvTATw6LQA violin 466 | sfyd3WX2OjE shofar,speech 467 | t56JVDxycY4 clapping,dance,cheering,speech 468 | tTOihIRXPGs dog 469 | ty_LDBaE7gw violin,piano,drum 470 | uEwMTxbpbrA violin,cello,shofar,laughter,clapping 471 | uhFJdAJkgjo accordion,speech 472 | v2lwdvydNsw speech,laughter,car 473 | vKBn0Rn93TQ piano 474 | vVkdhLjRchA shofar 475 | vxWMRjF_0wM playing_volleyball 476 | wDKE5BHyCBg dog,frisbee 477 | wNCJhki5_eM cello,piano,guitar,violin 478 | weKt0KjakLI shofar,speech 479 | x5isH2gqS4o horse 480 | xPvEzH2UXnw dog 481 | xsRNDo8Upys dance,piano 482 | yJglmIQzriI piano,guitar,speech 483 | ySibvCKNOU8 playing_basketball,cheering,clapping 484 | ysCaqh38JVQ laughter,speech 485 | zO8IwPbALK4 guitar,speech,singing 486 | zfeHACNRs20 speech,singing,clapping,cheering,piano 487 | zmCKZYKsiGM speech,laughter 488 | -------------------------------------------------------------------------------- /LFAV_dataset/val/val_weak_av.csv: -------------------------------------------------------------------------------- 1 | filename event_labels 2 | -2UWP3n_05A clapping,cheering,singing,speech,laughter 3 | -8yJXNCnRpk laughter,speech,cello,guitar,cat 4 | -L_xGAyM_Qw chicken_rooster 5 | 01iz9kmvbxg speech,drum,singing,cheering,piano,clapping,laughter,dance 6 | 08hzunIk81Y drum,cat,car,speech,cheering,laughter,clapping,bicycle,dance,dog 7 | 0Hz4R_m0hmI speech,car 8 | 0REH7oc48C0 rodents,laughter 9 | 0ZOyJM8b0a0 helicopter,speech,car 10 | 0_tZ4OTT1MI speech,laughter,singing,violin,guitar,dance 11 | 0b-o15cMJPE cheering,guitar,violin,clapping,singing,speech,laughter,dance 12 | 0eXu0vIIMVc speech,chainsaw 13 | 0fjrDsLr_aE chainsaw 14 | 0pJ1oD-OXZo speech,guitar 15 | 0vzuDkjtDOY piano,speech,laughter,clapping 16 | 119gUU8x0m8 helicopter,speech,cheering,bicycle,car 17 | 1DCDHmGcB5c helicopter,speech,car 18 | 1KSpPr05jgo drum,guitar,singing 19 | 1OJp9yVaZiE guitar,cello,banjo,violin,speech,singing 20 | 1Rt6qQz0w7c playing_badminton,speech,clapping,cheering,laughter 21 | 1cJ4FKeXAQA speech,guitar,laughter 22 | 1hNwQMJtGew rodents 23 | 1ozVekZgpZE helicopter,car,bicycle 24 | 1ur8hokq-Cs guitar,speech,chainsaw 25 | 1zKq8IJkcnk speech,drum,guitar,singing 26 | 24nBh-zzpsQ piano,speech,cheering,cry 27 | 2I1ZU5g1QNo cheering,singing,violin,piano,guitar 28 | 2MYfUQFr6S0 chainsaw,speech 29 | 2V9PXkRdE3I speech,guitar 30 | 2WogzOS_D9Q chainsaw 31 | 2_WpcixEOP4 speech,drum,cello,dog 32 | 2bmuqqUE8Z4 drum,speech,singing,laughter,clapping,cheering 33 | 2pVJmT0AG3Q car,bicycle,car_alarm 34 | 2t0HUQleGwU rodents,speech,laughter,cry 35 | 2xz2ti1RfwY 
piano,violin,clapping 36 | 305jW5Q5Ewk cheering,clapping,speech,singing,guitar 37 | 36fI5gwfHqU playing_tennis 38 | 38Z3ovrzVdo fixed-wing_aircraft,laughter,speech 39 | 3Bsnuew8f6s piano,laughter 40 | 3EifSw1bvmA singing,cheering,piano,drum,speech,dance 41 | 3Gbj7ukbvQE violin,speech,laughter 42 | 3KjtkJiip-8 playing_badminton,laughter,speech 43 | 3LDmXpcjrC4 speech,frisbee 44 | 3M2TFEF01t4 playing_badminton,speech,laughter,clapping 45 | 3NHJUkdHwtA dog,shofar 46 | 3R-1x4pPBBQ speech,playing_baseball 47 | 3Sby3LP-cbU speech,chainsaw 48 | 3UF7YxKUeks speech,laughter,cheering,cry 49 | 3X2xAD-9eiM helicopter,car,chainsaw 50 | 3_J8f_tvqmo cheering,playing_volleyball 51 | 3e-52Lkrl0w laughter,speech,bicycle 52 | 3lzA9PRMiUk car_alarm,speech,car,bicycle 53 | 3o-Eaixw6ec helicopter,car,speech 54 | 3sMa94JiLZI chainsaw,speech,car,dog 55 | 3wZ_HHViGpM playing_badminton 56 | 4-X3c59e3Hk helicopter,car 57 | 41V_J6hdEhs playing_badminton,speech 58 | 45RDZ3owtvY helicopter,chainsaw,laughter,speech 59 | 48XkS3Hkmws cry,laughter,speech,dog 60 | 4DDfD4-cEVo fixed-wing_aircraft,speech 61 | 4I6z0N2xdh8 alarm,speech 62 | 4MAdVOtBQD8 clapping,singing,speech,laughter,dance 63 | 4Yt7hmubn58 playing_badminton,speech 64 | 4exLWR92ycs singing,guitar,laughter,speech 65 | 4igvehYCc7U singing,guitar,speech,clapping,cheering,laughter 66 | 4leF1RwXFSE chainsaw 67 | 4ssWujrqMhs speech,alarm,guitar,cheering,clapping,laughter,piano 68 | 4tgjvpFd6mc rodents 69 | 4xyw9i0VvRY clapping,cheering,playing_volleyball 70 | 50btuV9sY60 fixed-wing_aircraft,speech 71 | 54PhWYrDTgw chainsaw,speech,car,dog 72 | 56ddX-fmEtY speech,cry,dog,laughter,clapping 73 | 598s4riqw3M speech,laughter,horse,frisbee,dog 74 | 5CkG2GTiK4M chainsaw,speech 75 | 5ECzbUNtnOk speech,drum,cat 76 | 5J-ZeYlddSk clapping,guitar,violin,laughter,speech,dance 77 | 5K7ny6CVZRE speech,guitar,singing,laughter 78 | 5NqpmNqPDeQ chainsaw 79 | 5OUHZWamrec speech,piano,clapping,laughter 80 | 5RPDLz_RCbE helicopter,bicycle 81 | 5V51JifwW48 cry,speech,dog 82 | 5ZELJ-0sGCw cello,laughter,cheering,clapping 83 | 5cKS_KuoU00 singing,fixed-wing_aircraft,speech 84 | 5kE3ngBWV_U speech,fixed-wing_aircraft,laughter,car 85 | 5mKfMbsjeAs chainsaw 86 | 5syUwJ9dEpE shofar 87 | 5uwzHqZn-Os alarm 88 | 5vlGKyxl22M speech,cheering,laughter 89 | 5zSPUfVm32A chainsaw,speech 90 | 6-atw21yanY playing_volleyball,cheering,clapping 91 | 65PjoTErZlw playing_badminton,clapping,speech 92 | 67PlMRhS5qc chainsaw 93 | 6AVMcJa77PM speech,cheering,laughter,car 94 | 6Dg6Q6tQMiE drum,guitar,singing,laughter 95 | 6GLXPPOYfWs playing_badminton,speech,cheering 96 | 6Jls8goVRuU playing_basketball,cheering,clapping,speech,dance 97 | 6NOW0_HuUCQ cat,chainsaw 98 | 6TnKvlQ2h7s cheering,speech,playing_soccer 99 | 6WlLqF5eXkM guitar,cello,speech,clapping,banjo 100 | 6ctzdi3lZ5M rodents 101 | 6iefuGEvlHI shofar 102 | 6mraw-F7qQg accordion,cello 103 | 6sHHdIWDmnk speech,guitar,cheering,laughter,singing 104 | 6ynhCAZ8KIo guitar,speech,cheering,clapping,laughter 105 | 736A0wfsYDw playing_badminton,speech,cheering,clapping 106 | 76rJsot3DqM chainsaw 107 | 78D25pNtSSo car,speech,car_alarm 108 | 7EJ1TlhoFKc chainsaw 109 | 7MK3P3StM6U speech,clapping,playing_badminton 110 | 7Sidspb9zhk horse,speech,singing 111 | 7Wb2zZQkpPo speech,car,helicopter 112 | 7YwxtA1MNuk guitar,singing,cheering,speech,clapping,laughter 113 | 7_pRiUfp938 speech,singing,laughter,dance 114 | 7bgTxPSZ2-o speech,chainsaw,car 115 | 7iIBkME4bSM car_alarm,helicopter,speech 116 | 7myife0ZQSc cheering,clapping,singing,dance 117 | 7qbbqP9mTIE shofar,car 118 | 
8-E3tzyQdN4 helicopter,speech,laughter,car 119 | 88cL4GK6vyI singing,cheering,clapping,speech,laughter 120 | 8NY3TWbkg3I chainsaw,car 121 | 8_t0cc17u68 speech,piano,singing,drum,laughter,dance,clapping,cry 122 | 8gO_lxThc1M piano,guitar,singing 123 | 8mc7uArd7zw guitar,speech,piano 124 | 8q_vgp0FDhI cry,speech,helicopter,laughter,car 125 | 8t-JhhYhZyk singing,guitar,banjo,cello,violin 126 | 8yBGrAo62iQ chainsaw 127 | 9-iVuKsU8ZI rodents,laughter,speech 128 | 91SmSyaO3ys speech,cheering,clapping,laughter 129 | 95shZG1q4ks speech,laughter,playing_volleyball 130 | 9DEfXtd0yPg speech,cat 131 | 9NP-1iiy8P4 chainsaw,speech 132 | 9TMpVoKPUgM speech,cheering,playing_soccer 133 | 9Xw3rUf873I playing_badminton,laughter,speech 134 | 9dpDFE3UL_E accordion,laughter,dance 135 | 9gKnLxvCCJY playing_badminton,speech,clapping 136 | 9rdCnIUnxVs laughter,violin,singing,clapping,speech,dance 137 | 9w72SdFNnXU speech,chainsaw 138 | 9z6Mzyar4gU drum,speech,playing_badminton 139 | A4Ly7wwElF0 speech,playing_basketball 140 | AAxqVPd_wd0 accordion,drum,clapping,car 141 | AHSLBch9Y6A playing_badminton,speech 142 | AJrmVyCWbq8 piano,car 143 | ANAGPl7QZ0k helicopter,car 144 | AQvhFMkI51Y helicopter 145 | ATd-LDQ5MvM speech,clapping,piano,cheering,laughter 146 | Aatxa3zgXFk cheering,clapping,dance 147 | AcErIT001pg piano,guitar 148 | AdTZKIG4t-A fixed-wing_aircraft 149 | AhKBRh8Oxfs speech,playing_badminton 150 | Al5Ks0PnV0w playing_soccer,speech 151 | ApWlaLTPpho horse,car 152 | ArVDnCbQ6mU helicopter,speech,fixed-wing_aircraft,laughter,car 153 | B-Ua2uzRwu4 speech,playing_badminton 154 | B8ISzf2pryI speech,cat,dog 155 | BCyCm2E_GC0 helicopter,car 156 | BFNekjEgvuk chicken_rooster,cat 157 | BMVJbAObEQM violin,drum,singing 158 | BVT3acNFzqc guitar,singing,clapping,cheering,speech 159 | BeS5uOUBNFc speech,playing_soccer 160 | BjtruQ_xQQ8 helicopter,speech,car,dog 161 | Bql-XqRjM7U speech,singing,guitar,laughter 162 | BtP6_sCuSLM laughter,speech,clapping 163 | BxXl5j9qEuc speech,drum,alarm,laughter 164 | C2A9dIL2ECQ helicopter,alarm,speech,dog 165 | C6nHvKcT6j8 speech,clapping,dance 166 | C8cxbPH-Hmg playing_badminton,speech 167 | C9scOaHzSOA playing_badminton,speech 168 | CIvtiNpKEY0 speech,clapping 169 | CLm9FYDRxsA helicopter 170 | CQTa01iGUSs guitar 171 | CXRO93dNB_A playing_tennis,speech,car_alarm 172 | C_G0qaFcSCM singing,clapping 173 | CeiVB75kOOU accordion,drum 174 | CpG-YZWg3L4 cheering,speech,car,singing,frisbee 175 | CskZG-HM5Ko accordion,speech,laughter 176 | Cw-US2uaOKs helicopter,speech,laughter 177 | D9roj8pPsvU speech,drum,guitar,clapping,singing,cheering 178 | DAC2Z-54HDY piano,speech,laughter 179 | DFYOtfJsXgM drum,playing_baseball,speech,playing_badminton,cheering,clapping 180 | DQ3lbf7PCoY clapping,speech,banjo,drum,singing,car,bicycle 181 | DUlxiX4ri1Q alarm 182 | DXGQnIv5vkA speech,dog,laughter 183 | DeQ_3bCcodk rodents 184 | DkXsN6qwDEw horse,cheering,frisbee,dog,singing,clapping,laughter 185 | Do__idgHWJM speech,playing_basketball 186 | DxqGhpqu2z4 speech,guitar 187 | DynoOMg9Mvw fixed-wing_aircraft,speech 188 | E4qiicR74hY clapping,violin,piano 189 | E6CcUj2mDbI dog,speech 190 | E8kiWmlMN_A speech,playing_soccer,car 191 | EA58Fo2e0CI speech,laughter 192 | EHTTmj4bBDg singing,guitar,drum,piano,clapping,cheering,speech 193 | EPEmEMp8zp4 piano,cello,speech,violin,shofar 194 | ESRG74Q_r6k playing_basketball,speech 195 | EayN_Jj0740 violin,clapping 196 | EhAY6-dIO2I dog 197 | EleoJrKrHO0 guitar,violin,accordion,piano,clapping 198 | EtV6RzLCBlU speech,guitar,singing,cheering,laughter 199 | EvA2RNci3SE 
singing,speech,guitar,laughter,clapping,dance 200 | F4z5PKQT9Cg piano,cello 201 | FA0avnkqRV0 clapping,cheering 202 | FKQCVhR702M car,dog 203 | FQqwy9PPhX4 speech,playing_basketball 204 | FTyIy8TLoCk dog,speech 205 | FYv4-57P9r0 speech,alarm 206 | F_WZZd7efr0 clapping,piano,violin 207 | Fbs_wxRXOVU rodents,speech,laughter,cheering 208 | Fju_-bLUhtg speech,playing_badminton 209 | FnA_H-RBaXE speech,horse 210 | Fqc_5UqYffI helicopter,car 211 | Fygm03Gw19Y helicopter,alarm 212 | G486CNb4GTw drum,guitar,singing,clapping,cheering 213 | G7bNye0f9eQ clapping,speech,playing_baseball,cheering,laughter,dance 214 | GCKIImqFeQ4 speech,cheering,dog 215 | GEsAdwCJox0 speech,drum,playing_baseball 216 | GI7p6fbHfx4 guitar,dog 217 | GQJ3-6ZBsfg playing_basketball,speech 218 | GYvApvHOPXc rodents 219 | Gdr3ImP4Gv4 banjo,drum,singing,speech,clapping 220 | GlTsktEX8jw speech,clapping,playing_soccer 221 | GoUuozWmoFM piano,violin 222 | GroFCq4B5xo playing_ping-pong,speech 223 | GvrN5VT1Zf0 guitar,singing,speech,laughter,clapping,cat 224 | GywIlkOUPWw laughter,speech,bicycle 225 | H29pf9Is1qE violin,guitar,piano,clapping 226 | H5wBI98NXEE speech,cheering,playing_basketball 227 | HCPfyrSiYQg drum,singing,cheering,laughter 228 | HFNY9NKFA4g playing_tennis 229 | HLqhkZAW0K8 speech,fixed-wing_aircraft,car 230 | HQtPEVGdOMw car,speech,laughter,bicycle 231 | HcDycLidmE4 laughter,cat 232 | HhxJViNn3Ik car,speech,laughter,dog 233 | HnURtEzECx8 playing_basketball,speech 234 | HoPcLLZhVA8 dance,car_alarm 235 | I5ECnXmzfCo playing_basketball,car_alarm,speech 236 | I6dcgD7U7Ng speech,playing_badminton,clapping 237 | IHqP9xNt9Rk laughter,clapping,speech 238 | IOBbVjG2IC8 helicopter,speech 239 | IbJdgddf2rs dog,alarm,cheering 240 | Ioum3RLkdvU cheering,dance 241 | Isqd8mYJn6M laughter,speech 242 | IuYrbeqc-mA playing_tennis,cheering,speech,fixed-wing_aircraft,car 243 | J-oqCqZ2BGs playing_badminton,speech 244 | J3QTd-dLUUw speech,laughter,dog 245 | JBR2fOhOOs8 speech,laughter,cheering,clapping,playing_volleyball 246 | JEGQJNJ59Vo piano,singing,cheering,clapping,speech,laughter 247 | JJPnKWnyBk0 violin,accordion,drum,clapping 248 | JVdSd4gjptA guitar,singing,accordion,speech,laughter,car 249 | Jan3-mrKhes speech,piano,laughter,clapping,cheering,bicycle,car 250 | Jf0tIV0FNuc singing,guitar,speech 251 | Ji7BgSmw-to violin,clapping 252 | Jmsd-qI9UCk clapping,speech,cheering,car 253 | JzOm6le-0jM speech,clapping,laughter 254 | K0qjglnaeCE laughter,speech,cry,car,dog 255 | KANSCrz4ITY speech,laughter,cat 256 | KI8ciS9W4Ww piano 257 | KM7OARP3qnU cheering,playing_volleyball,clapping,speech,laughter 258 | KOgVHbEZBRI playing_volleyball,clapping 259 | KTv3S4KMrxs guitar,drum,piano,singing 260 | KYbpNBqDRVM speech,playing_soccer,clapping 261 | KhN9_80lJE0 alarm,speech,horse 262 | L1ds1J4Mzwc speech,car 263 | LAyb0tD6cC8 guitar,singing,piano 264 | LGt7J0s1aZw fixed-wing_aircraft 265 | LMIlVKyiT5I speech,dog,laughter,cheering,car 266 | LSo9hvy1O_U cheering,clapping,speech,playing_volleyball 267 | LYq7aiuUo9Q cheering,speech,cry,dog,laughter,clapping 268 | Ldrwlo6nSXQ clapping,cheering,speech,bicycle 269 | LjN5zrMeRQs speech,playing_soccer,alarm,cheering,clapping 270 | LqLrcG29b0o guitar,cello,accordion 271 | M3O49xZUjnE speech,laughter,cheering,clapping,playing_soccer 272 | MQdNi2PLaSg speech,car,car_alarm 273 | MSF2r6_0mmU violin,piano,clapping,speech,cello 274 | Mfi0NMtfUzI drum,cat 275 | MkMOpb3HVxM speech,horse,laughter 276 | MycEFlLDOkY clapping,speech,cheering,guitar,singing 277 | N3PgUuVAMpA 
violin,cello,guitar,clapping,speech,laughter,bicycle,car 278 | N6_yGKH-4Oo speech,playing_soccer 279 | NBpAt3TVOK4 playing_soccer,speech,cheering 280 | NL3J180GKDs playing_basketball,speech,laughter,cheering 281 | NTbNCezCJNM violin,piano,cello,drum,singing 282 | NZL4sUqjUv0 clapping,laughter,piano 283 | NndiiezGkNY cheering,clapping,laughter 284 | Nz_CuNe0cT0 singing,cheering,car 285 | O2zgJUmCLt8 cheering,speech,clapping,playing_volleyball 286 | OB_tVv18Zhw guitar,drum 287 | OI5wo5D8k8Q piano,speech,laughter,singing,clapping 288 | OVHtv2UehLU speech,clapping,cheering,piano 289 | OieTT6UUs2g speech,cat,rodents 290 | OrMf-D4QOqM speech,guitar 291 | P3gIi9su2Rw dog,speech 292 | PGmM0l9jfC4 speech,laughter,cry 293 | POBYjeKzvcY speech,laughter,cry 294 | PYxx7gxFIZM clapping,playing_badminton,cheering,speech 295 | Pfof8hGZIt8 guitar 296 | PxKp7_lASKI cheering,laughter,clapping,speech,drum,singing,dance 297 | Q-4edTX71fI banjo,violin,singing 298 | Q9oLbZlT3sQ speech,clapping,horse 299 | QKj8n_um4Zg speech,playing_soccer,clapping,alarm,cheering,guitar 300 | QUR6mUOZJes car_alarm,helicopter,speech,car 301 | Qkzg7wB-nxw accordion,piano,clapping,cheering,speech 302 | QwF7a9grl2w cheering,singing,dance 303 | R9OrWMSMKXg shofar,speech 304 | RToAwVlhA7A cheering,clapping,guitar,speech,drum,singing 305 | RgaVv4p1SkA guitar,singing,clapping,cheering 306 | RwiF_Q8-pyU violin,car,speech 307 | SO4O873MsXk singing,cheering,clapping,speech,drum,dance,guitar 308 | S_1q7Q5DX78 cello,guitar,accordion 309 | SevXOPt5cCo cheering,clapping,playing_badminton,laughter,speech,playing_basketball 310 | T-shmnmIC2w piano,singing,laughter,clapping,cheering 311 | TEy5H1Pe7ck shofar 312 | TQ_JHaSF-D8 clapping,guitar,playing_volleyball 313 | TzyUERfGnRo guitar,accordion,cheering,clapping,drum,laughter 314 | U7Yne1CRZyg dog,car,speech,car_alarm 315 | US_NJY_cGqw rodents,speech 316 | UtnGtAYOPgM rodents 317 | VMAk1_bcv90 speech,clapping,horse,car,laughter 318 | VuzvEGJi8ew speech,car 319 | WCRlezUfsyo speech,laughter,cheering,guitar,clapping 320 | WaaANll8h18 speech,guitar,drum,piano 321 | WoEBDNm7n_c speech,car,laughter,bicycle,cat 322 | X7ymriMhoj0 speech,laughter,clapping 323 | XNcZDhjb9mU drum,speech,clapping,alarm,playing_badminton,laughter,dance 324 | XX6tSilHbPs piano,guitar,singing,drum,clapping,cheering 325 | XfIf9hK8HiA guitar 326 | XusQ3T98QAc speech,dog,car 327 | Y9VBgCC2D0A drum,guitar,piano,singing,clapping,dance 328 | YKs8UbLjPwo drum,singing,speech,laughter,clapping,dance 329 | YbQ44pTU-FM speech,banjo,violin,guitar,cello,singing,laughter 330 | ZF-pF6v3fb0 laughter,clapping,cheering,speech,accordion 331 | ZX3i8qp4Hsk speech,piano,singing 332 | ZerAVavECMs singing,laughter,speech,dance 333 | _-sfoqUa0vs speech,cheering,clapping,laughter,violin,drum,singing,cello,cry,horse 334 | _5jP7maXfwQ speech,car,dog 335 | _8xMgAApZIU singing,speech,playing_soccer,clapping,drum 336 | _DkR4H_QOyo guitar,singing 337 | _LQ-2JH5I4U cheering,clapping,playing_volleyball,speech 338 | _OXU50DYMfg violin,horse 339 | _XKrJ5QwjIo accordion,guitar,drum,piano,singing 340 | _hxeoQvSuic cheering,clapping,speech,piano,guitar,drum,dance 341 | _lZzYZBxbxA playing_tennis,cheering,clapping 342 | _qpfYglF5bI clapping,cheering,laughter,singing 343 | _tGdoqj0Qh8 dog,shofar,speech 344 | a2c8sv0SLUo speech 345 | a84qOOQgfp0 singing,cheering,clapping,speech,dance 346 | a9jkbha8bmc speech,car,laughter,dog 347 | aHwzzSYhHrs violin,guitar,cello 348 | aPwcDG6F2aM clapping,cheering,speech,drum,laughter,guitar 349 | a_90uFCrWWY speech,dog,guitar 350 | 
ar_G9Goi-TI drum,guitar,singing,cheering 351 | avJNxcysMCk speech,clapping,cheering 352 | axXpdF1Z9II car 353 | b5x3xJITTsQ speech,playing_soccer,cheering,clapping 354 | bELmGWXOlQ0 speech,clapping,laughter,singing,drum 355 | bK05D61SXmM speech,playing_baseball 356 | bP8P5WBebl0 guitar,singing 357 | bV6YVSupGVg cheering,speech,clapping,playing_soccer 358 | bX7S3HqdlAI speech,laughter,car,dog,cheering,clapping 359 | beqQ4gtKn84 speech,laughter,horse 360 | blL29NvAMic bicycle 361 | btwy0yCQL14 speech,clapping,violin 362 | by4dS4tgkY0 speech,car,bicycle 363 | c-cAHw-Q5lc car,alarm,speech 364 | c4PS6uywL7o speech,laughter,rodents 365 | cFYxRp9K9L4 violin,piano,clapping,cheering,chicken_rooster 366 | cXoGMM-PyHY chicken_rooster 367 | ckMCPCQkM_k speech,laughter,horse 368 | cpg9rdc-ONc dog,cheering,speech 369 | cxxeRzfJ1_c car 370 | d3Ltp10L_q8 speech,laughter,singing,guitar 371 | d7Ny0aaL90E clapping,cheering,piano,violin,speech 372 | dD1tDnSUQ_I speech,singing,clapping 373 | dNBRFnEpCOw shofar,speech,cheering,clapping,guitar,banjo,drum,singing,laughter,accordion,dance 374 | dQw4w9WgXcQ dance,singing 375 | dV6UMWryqGY speech,singing,clapping,chainsaw,dance 376 | dZ-aFiQAPwI speech,dog,cat 377 | df_Xv2M5WT4 playing_tennis,speech,cheering,clapping,car 378 | dmYHbO-eu8E dog,speech,singing 379 | dr1QWbfsPGY singing,cheering,speech,car 380 | e0a1lp4ZWu8 speech,laughter,clapping,horse,car 381 | e9xV240-Jzw drum,cheering,clapping,speech,guitar 382 | eGijYoJ_6nY guitar 383 | eS1r2Qi0qUM playing_badminton,speech,laughter,cheering 384 | eYYMdLk4IcY piano,speech,cello,shofar,violin,laughter,car,horse,accordion 385 | egvE90X3NAE rodents,speech,dog,cat,car,laughter 386 | es2krnRZ2Ko speech,clapping,laughter 387 | eyXE8nKb0AA banjo,speech 388 | f4dndkH9Y_w banjo,violin,laughter,cheering,clapping 389 | fIdVyM5Nr5w speech,guitar,cheering,singing,laughter 390 | fQDaz5kZHIk drum,accordion,car 391 | faXQShej178 car,bicycle,cat 392 | fgL83KdXw_U cello,violin,clapping 393 | fxI8zr-CPRg piano,cello,laughter 394 | g3uQ-O_o_Rc laughter,speech,rodents 395 | gBh0Ig5kzCs guitar,violin,cello 396 | gO8N3L_aERg speech,alarm,cat 397 | gV6hP9wpMW8 speech,clapping,cheering,laughter,car 398 | gpN2k5zz81o speech,cheering,clapping,violin,singing,laughter 399 | gt42q0Ck-C4 cheering,clapping,bicycle 400 | h7oJYFZViuI accordion,drum,guitar,piano,singing,dance 401 | hBDAh1ivs0o speech,laughter,singing,clapping,cheering,dance 402 | hLwhdHEDb-U playing_baseball,speech,clapping 403 | hXSee4C6pyE speech,bicycle 404 | hgT707uizYI clapping,speech,laughter 405 | hmopCSSKX2U speech,singing,car,laughter 406 | hthXd_Js5AE guitar,drum,singing,cheering,clapping 407 | i3JjmOTkPOw speech,clapping,cheering,laughter 408 | iAQYcYCalOA drum,clapping,singing,laughter,dance 409 | iFr2H2p063M speech,car,laughter,dog 410 | iWdE8ry309k dog,cat,speech,laughter 411 | ifHRDBx-ctw accordion 412 | iqoPD-DlGIE piano 413 | j-QeBhOXPf8 violin 414 | j3oEXXy6-p0 piano,laughter 415 | jAsKLkkO6rs singing,laughter,speech,guitar 416 | jOs3UfxE8XE cheering,singing,speech,dance,laughter 417 | jkXJfDPB57w singing,drum 418 | jroTvXsKUiA alarm,speech,dog,car 419 | k5ug9gzPS4M playing_basketball,speech 420 | kBiGFEIqZHA playing_soccer,clapping 421 | kNZkGy3wAEE speech,singing,guitar,laughter 422 | kda6dpSX0jY speech,dog,helicopter,car 423 | kolGmya2OIs speech,guitar,singing 424 | kwNR-6sfog4 speech,piano,laughter,clapping,violin 425 | l6p53lBcIGw horse 426 | lHcUr0POrKY guitar,singing,laughter,car,speech 427 | lNQexzgzvw4 speech,frisbee 428 | lWTnQ4qj24U 
accordion,clapping,cello,guitar,piano,cheering,drum 429 | lfizshQHNQo piano,cello 430 | lnijMHh8qh4 speech,laughter,piano 431 | lvZBGJkbeTE cheering,singing,clapping,dance 432 | m5ex66Vst80 drum,singing,guitar,speech,piano 433 | mQl_6mcQBaQ piano,speech,clapping,cheering,car 434 | mcc8UNgZh_I speech,car_alarm,car 435 | mkN6QdXU2fQ cheering,guitar,drum,speech,car 436 | n47yP1yfJWQ violin,clapping 437 | nAExL0lNLM0 clapping,playing_volleyball,cheering,singing,car_alarm,speech,laughter 438 | nBNHSOn7N1s piano,laughter,cheering,clapping 439 | nIsZ4Q0y100 speech,cheering,bicycle 440 | nNxaCXUC0l4 speech,guitar,singing,laughter 441 | nRxFEDNT-Z0 cheering,singing,drum,speech,dance 442 | npyseVDYQbY violin,guitar,accordion 443 | o16h6voegFU speech,laughter,singing,piano 444 | oEepqn5vJiE playing_baseball,speech,drum 445 | oNNlSdkjuqA speech,horse 446 | od140iTEaVg car,helicopter,laughter 447 | ooDu3m-ixQs helicopter,chainsaw,speech,laughter 448 | p6BWQkhJ2kQ speech,violin 449 | p6ra0IBnL20 drum,speech,singing,guitar,shofar,car 450 | pH4q6OXhZgU speech,playing_soccer,clapping,cheering 451 | pOLqlica-bM car 452 | piaLzfKLs9c playing_volleyball,speech,laughter,dog,car 453 | psMNOvfJ1LE car,guitar,bicycle,singing 454 | qRFNqIN_LSc accordion,clapping 455 | q_NJ800CdGU accordion,guitar,drum,piano,singing 456 | qfGggAGITwg cheering,guitar,drum,cello,dance,laughter 457 | qqkhtf9zplA piano,dance 458 | r4lWcFb8854 speech,violin,singing,laughter 459 | rDq5TQYn9Ak laughter,cheering,singing,clapping,dance 460 | rRl66L-5TY4 violin,cheering 461 | r_5bYNqpEFU piano,cello 462 | rokxoxFsWPU speech,laughter,horse,dog 463 | rtMR524m0BM speech,singing,dance 464 | s3c0A6YmIIc cello,guitar,violin,singing,clapping,cheering,dance 465 | sPvTATw6LQA violin,speech,clapping 466 | sfyd3WX2OjE shofar,speech 467 | t56JVDxycY4 speech,cheering,clapping,dance,singing,drum 468 | tTOihIRXPGs drum,dog 469 | ty_LDBaE7gw piano,violin,drum 470 | uEwMTxbpbrA violin,cello,shofar,cheering,clapping,laughter 471 | uhFJdAJkgjo drum,speech,accordion 472 | v2lwdvydNsw speech,laughter,car 473 | vKBn0Rn93TQ piano 474 | vVkdhLjRchA guitar,shofar,drum 475 | vxWMRjF_0wM speech,playing_volleyball,laughter,clapping 476 | wDKE5BHyCBg speech,dog,frisbee 477 | wNCJhki5_eM cello,guitar,piano,violin 478 | weKt0KjakLI speech,shofar 479 | x5isH2gqS4o horse,speech 480 | xPvEzH2UXnw speech,dog 481 | xsRNDo8Upys laughter,clapping,piano,dance 482 | yJglmIQzriI speech,piano,guitar,clapping 483 | ySibvCKNOU8 drum,speech,playing_basketball,cheering,clapping 484 | ysCaqh38JVQ speech,laughter 485 | zO8IwPbALK4 cheering,clapping,guitar,singing,speech 486 | zfeHACNRs20 cheering,speech,clapping,singing,piano 487 | zmCKZYKsiGM cheering,clapping,speech,laughter 488 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 GeWu-Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Towards Long Form Audio-visual Video Understanding (ACM TOMM 2024) 2 | 3 | [Project Page](https://gewu-lab.github.io/LFAV/) 4 | 5 | [Paper](https://dl.acm.org/doi/pdf/10.1145/3672079) 6 | 7 | ### Dataset & Features 8 | 9 | #### YouTube ID 10 | 11 | The dataset is collected from [YouTube](https://www.youtube.com/); you can find the ID of each video in the [annotation files](https://github.com/GeWu-Lab/LFAV/tree/main/LFAV_dataset). 12 | 13 | #### Features 14 | 15 | We use VGGish to extract audio features, and ResNet18 and R(2+1)D-18 to extract visual features. 16 | 17 | VGGish feature: [Google Drive](https://drive.google.com/file/d/1bvTBotLHnPGIeIAkkgMWK7wcWjZ5xbfo/view), [Baidu Drive](https://pan.baidu.com/share/init?surl=nSdhEilGxGFs-7FOgsDoFw) (pwd: lfav), (~212M). 18 | 19 | ResNet18 feature: [Google Drive](https://drive.google.com/file/d/14p4jgDo-tteeZPzRBbEq1982tT-uxviZ/view), [Baidu Drive](https://pan.baidu.com/s/1GAstblAMXbhlUj_8QD_ONg) (pwd: lfav), (~1.9G). 20 | 21 | R(2+1)D-18 feature: [Google Drive](https://drive.google.com/file/d/1FfLpS0PLPXNJ28SqqYLb_vBATlUWnDvK/view), [Baidu Drive](https://pan.baidu.com/share/init?surl=-jRD7MQ0RT0lAN5DP40syA) (pwd: lfav), (~1.9G). 22 | 23 | ### Annotations 24 | 25 | Label files are in the folder `LFAV_dataset`. 26 | 27 | #### training set 28 | 29 | ``` 30 | # LFAV training set annotations 31 | cd LFAV_dataset 32 | cd ./train 33 | train_audio_weakly.csv: video-level audio annotations of the training set 34 | train_visual_weakly.csv: video-level visual annotations of the training set 35 | train_weakly.csv: video-level annotations (union of video-level audio annotations and visual annotations) of the training set 36 | ``` 37 | 38 | #### validation set 39 | 40 | ``` 41 | # LFAV validation set annotations 42 | cd LFAV_dataset 43 | cd ./val 44 | val_audio_weakly.csv: video-level audio annotations of the validation set 45 | val_visual_weakly.csv: video-level visual annotations of the validation set 46 | val_weak_av.csv: video-level annotations (union of video-level audio annotations and visual annotations) of the validation set 47 | val_audio.csv: event-level audio annotations of the validation set 48 | val_visual.csv: event-level visual annotations of the validation set 49 | ``` 50 | 51 | #### testing set 52 | 53 | ``` 54 | # LFAV testing set annotations 55 | cd LFAV_dataset 56 | cd ./test 57 | test_audio_weakly.csv: video-level audio annotations of the testing set 58 | test_visual_weakly.csv: video-level visual annotations of the testing set 59 | test_weak_av.csv: video-level annotations (union of video-level audio annotations and visual annotations) of the testing set 60 | test_audio.csv: event-level audio annotations of the testing set 61 | test_visual.csv: event-level visual annotations of the testing set 62 | ``` 63 | 64 | ### Train and test 65 | 66 | Source code is in the folder `src`.
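The annotation files described above are what `src/dataloader.py` reads: tab-separated CSVs with a `filename` column (the YouTube ID) and an `event_labels` column holding comma-separated event names. A minimal sketch, assuming pandas, of parsing one of them:

```
import pandas as pd

# Weak annotation files are tab-separated: one row per video,
# with comma-separated event labels in the last column.
df = pd.read_csv("LFAV_dataset/val/val_audio_weakly.csv", sep="\t")
for _, row in df.iterrows():
    video_id = row["filename"]                # YouTube ID, e.g. "-2UWP3n_05A"
    events = row["event_labels"].split(",")   # e.g. ["clapping", "cheering", ...]
```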
67 | 68 | The script for training all three phases is: 69 | 70 | ``` 71 | src/scripts/train_s3.sh 72 | ``` 73 | 74 | If you want to train only one or two phases, set the arg "num_stages" to 1 or 2. 75 | 76 | The script for testing all three phases is: 77 | 78 | ``` 79 | src/scripts/test_s3.sh 80 | ``` 81 | 82 | We also provide the trained weights of the complete method (three phases): [Google Drive](https://drive.google.com/file/d/10v-1WnUhHf-0ehH8yXJ0pSkGDYtKVdPy/view?usp=sharing), [Baidu Drive](https://pan.baidu.com/s/1-wki3AfPAz3YnzNmGrC0wA?pwd=lfav) (pwd: lfav). 83 | 84 | ### Publication(s) 85 | 86 | If you find our work useful in your research, please cite our paper. 87 | 88 | ``` 89 | @article{hou2024toward, 90 | title={Toward Long Form Audio-Visual Video Understanding}, 91 | author={Hou, Wenxuan and Li, Guangyao and Tian, Yapeng and Hu, Di}, 92 | journal={ACM Transactions on Multimedia Computing, Communications and Applications}, 93 | volume={20}, 94 | number={9}, 95 | pages={1--26}, 96 | year={2024}, 97 | publisher={ACM New York, NY} 98 | } 99 | ``` 100 | 101 | ### Acknowledgement 102 | 103 | This research was supported by the National Natural Science Foundation of China (No. 62106272) and the Public Computing Cloud, Renmin University of China. 104 | 105 | The source code references [AVVP-ECCV20](https://github.com/YapengTian/AVVP-ECCV20). 106 | 107 | ### License 108 | 109 | This project is released under the [CC BY-NC 4.0 License](https://creativecommons.org/licenses/by-nc/4.0/). 110 | -------------------------------------------------------------------------------- /src/__pycache__/dataloader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/__pycache__/dataloader.cpython-38.pyc -------------------------------------------------------------------------------- /src/__pycache__/dataloader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/__pycache__/dataloader.cpython-39.pyc -------------------------------------------------------------------------------- /src/dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | import torch.nn.functional as F 7 | from torch.utils.data import Dataset 8 | from utils import constant 9 | 10 | 11 | def ids_to_multinomial(ids): 12 | """ label encoding 13 | 14 | Returns: 15 | 1d array, multinomial representation, e.g. [1,0,1,0,0,...] 16 | """ 17 | categories = constant.CATEGORIES 18 | 19 | id_to_idx = {id: index for index, id in enumerate(categories)} 20 | 21 | y = np.zeros(len(categories)) 22 | 23 | for id in ids: 24 | try: 25 | index = id_to_idx[id.strip()] 26 | y[index] = 1 27 | except KeyError: 28 | if id == 'silent': 29 | pass 30 | else: 31 | print('id: ', id) 32 | print('ids:', ids) 33 | return y 34 | 35 | 36 | def ids_to_multinomial_null(ids): 37 | """ label encoding 38 | 39 | Returns: 40 | 1d array, multinomial representation, e.g. [1,0,1,0,0,...]
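        Example (hypothetical category list): if constant.CATEGORIES_NULL were
        ['speech', 'guitar', 'dog'], then ids = ['speech', 'dog'] would give
        y = [1, 0, 1].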
41 | """ 42 | categories = constant.CATEGORIES_NULL 43 | 44 | id_to_idx = {id: index for index, id in enumerate(categories)} 45 | 46 | y = np.zeros(len(categories)) 47 | 48 | for id in ids: 49 | index = id_to_idx[id.strip()] 50 | y[index] = 1 51 | return y 52 | 53 | 54 | class MESSDataset(Dataset): 55 | 56 | def __init__(self, label, audio_dir, video_dir, st_dir, transform=None): 57 | self.df = pd.read_csv(label, header=0, sep='\t') 58 | self.filenames = self.df["filename"] 59 | self.audio_dir = audio_dir 60 | self.video_dir = video_dir 61 | self.st_dir = st_dir 62 | self.transform = transform 63 | 64 | def __len__(self): 65 | return len(self.filenames) 66 | 67 | def __getitem__(self, idx): 68 | row = self.df.loc[idx, :] 69 | name = row[0][:11] 70 | audio = np.load(os.path.join(self.audio_dir, name + '.npy')) 71 | video_s = np.load(os.path.join(self.video_dir, name + '.npy')) 72 | video_st = np.load(os.path.join(self.st_dir, name + '.npy')) 73 | ids = row[-1].split(',') 74 | label = ids_to_multinomial(ids) 75 | 76 | 77 | sample = {'name': name, 'audio': audio, 'video_s': video_s, 'video_st': video_st, 'label': label} 78 | 79 | if self.transform: 80 | sample = self.transform(sample) 81 | 82 | return sample 83 | 84 | class MESSDatasetNew(Dataset): 85 | 86 | def __init__(self, label, label_a, label_v, audio_dir, video_dir, st_dir, transform=None): 87 | self.df = pd.read_csv(label, header=0, sep='\t') 88 | self.df_a = pd.read_csv(label_a, header=0, sep='\t') 89 | self.df_v = pd.read_csv(label_v, header=0, sep='\t') 90 | self.filenames = self.df_a["filename"] 91 | self.audio_dir = audio_dir 92 | self.video_dir = video_dir 93 | self.st_dir = st_dir 94 | self.transform = transform 95 | 96 | def __len__(self): 97 | return len(self.filenames) 98 | 99 | def __getitem__(self, idx): 100 | row = self.df.loc[idx, :] 101 | row_a = self.df_a.loc[idx, :] 102 | row_v = self.df_v.loc[idx, :] 103 | name = row_a[0][:11] 104 | 105 | audio = np.load(os.path.join(self.audio_dir, name + '.npy')) 106 | video_s = np.load(os.path.join(self.video_dir, name + '.npy')) 107 | video_st = np.load(os.path.join(self.st_dir, name + '.npy')) 108 | 109 | 110 | ids = row[-1].split(',') 111 | label = ids_to_multinomial(ids) 112 | 113 | 114 | ids_a = row_a[-1].split(',') 115 | label_a = ids_to_multinomial(ids_a) 116 | 117 | 118 | ids_v = row_v[-1].split(',') 119 | label_v = ids_to_multinomial(ids_v) 120 | 121 | label = [label, label_a, label_v] 122 | 123 | 124 | sample = {'name': name, 'audio': audio, 'video_s': video_s, 'video_st': video_st, 'label': label} 125 | 126 | if self.transform: 127 | sample = self.transform(sample) 128 | 129 | 130 | return sample 131 | 132 | 133 | class ToTensor(object): 134 | 135 | def __call__(self, sample): 136 | if len(sample) == 2: 137 | audio = sample['audio'] 138 | label = sample['label'] 139 | return {'audio': torch.from_numpy(audio), 'label': torch.from_numpy(label)} 140 | else: 141 | name = sample['name'] 142 | audio = sample['audio'] 143 | video_s = sample['video_s'] 144 | video_st = sample['video_st'] 145 | label = sample['label'] 146 | 147 | return {'name': name, 'audio': torch.from_numpy(audio), 'video_s': torch.from_numpy(video_s), 148 | 'video_st': torch.from_numpy(video_st), 149 | 'label': label} 150 | 151 | 152 | class ToEqualLength(object): 153 | 154 | def __init__(self, length=1000): 155 | 156 | self.length = length 157 | 158 | def __call__(self, sample): 159 | 160 | if len(sample) == 2: 161 | audio = sample['audio'] 162 | label = sample['label'] 163 | 164 | audio = 
audio.unsqueeze(0).permute(0, 2, 1).contiguous() 165 | audio = F.interpolate(audio, size=self.length, mode='linear') 166 | audio = audio.permute(0, 2, 1).contiguous().squeeze() 167 | 168 | return {'audio': audio, 'label': label} 169 | else: 170 | name = sample['name'] 171 | audio = sample['audio'] 172 | video_s = sample['video_s'] 173 | video_st = sample['video_st'] 174 | label = sample['label'] 175 | 176 | audio = audio.unsqueeze(0).permute(0, 2, 1).contiguous() 177 | audio = F.interpolate(audio, size=self.length, mode='linear') 178 | audio = audio.permute(0, 2, 1).contiguous().squeeze() 179 | 180 | video_s = video_s.unsqueeze(0).permute(0, 2, 1).contiguous() 181 | video_s = F.interpolate(video_s, size=self.length, mode='linear') 182 | video_s = video_s.permute(0, 2, 1).contiguous().squeeze() 183 | 184 | video_st = video_st.unsqueeze(0).permute(0, 2, 1).contiguous() 185 | video_st = F.interpolate(video_st, size=self.length, mode='linear') 186 | video_st = video_st.permute(0, 2, 1).contiguous().squeeze() 187 | 188 | return {'name': name, 'audio': audio, 'video_s': video_s, 189 | 'video_st': video_st, 'label': label} 190 | 191 | 192 | class ToEqualLengthSample(object): 193 | 194 | def __init__(self, length=200): 195 | 196 | self.length = length 197 | 198 | def __call__(self, sample): 199 | 200 | if len(sample) == 2: 201 | audio = sample['audio'] 202 | label = sample['label'] 203 | 204 | audio = audio.unsqueeze(0).permute(0, 2, 1).contiguous() 205 | audio = F.interpolate(audio, size=self.length, mode='linear') 206 | audio = audio.permute(0, 2, 1).contiguous().squeeze() 207 | 208 | return {'audio': audio, 'label': label} 209 | else: 210 | name = sample['name'] 211 | audio = sample['audio'] 212 | video_s = sample['video_s'] 213 | video_st = sample['video_st'] 214 | label = sample['label'] 215 | 216 | seq_len = audio.size(0) 217 | if seq_len >= self.length: 218 | audio = self.downsample(audio, self.length) 219 | video_s = self.downsample(video_s, self.length) 220 | video_st = self.downsample(video_st, self.length) 221 | else: 222 | audio = self.upsample(audio, self.length) 223 | video_s = self.upsample(video_s, self.length) 224 | video_st = self.upsample(video_st, self.length) 225 | 226 | return {'name': name, 'audio': audio, 'video_s': video_s, 227 | 'video_st': video_st, 'label': label} 228 | 229 | @staticmethod 230 | def downsample(x, length): 231 | stride = x.size(0) // length 232 | sampled_x = x[::stride, :] 233 | sampled_x = sampled_x[:length, :] 234 | return sampled_x 235 | 236 | @staticmethod 237 | def upsample(x, length): 238 | x = x.unsqueeze(0).permute(0, 2, 1).contiguous() 239 | sampled_x = F.interpolate(x, length, mode='linear') 240 | sampled_x = sampled_x.permute(0, 2, 1).contiguous().squeeze() 241 | return sampled_x 242 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | import os 5 | import warnings 6 | import time 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | from torch.utils.data import DataLoader 13 | from torchvision.transforms import transforms 14 | 15 | from dataloader import MESSDataset, MESSDatasetNew, ToTensor, ToEqualLengthSample 16 | from tools import train, evaluation 17 | # from tools.evaluation_stat import evaluation_stat 18 | from models import MultiStageNet 19 | from utils import setup_seed 20 | from 
tensorboardX import SummaryWriter 21 | 22 | warnings.filterwarnings('ignore') 23 | np.set_printoptions(threshold=np.inf) 24 | 25 | print("\n------------------ MESS experiment -------------------------\n") 26 | 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser(description='PyTorch Implementation of ' 30 | 'Multi-Event Scene Understanding with Sight and Sound') 31 | 32 | # feature paths 33 | parser.add_argument("--audio_feature_path", type=str, required=True, 34 | help="audio feature path") 35 | parser.add_argument("--visual_feature_path", type=str, required=True, 36 | help="2D visual feature path dir") 37 | parser.add_argument("--spatio_temporal_visual_feature_path", type=str, required=True, 38 | help="spatio-temporal visual feature path") 39 | 40 | # label utility 41 | parser.add_argument("--label_format", type=str, default='video', choices=['video'], 42 | help="use audio-visual separate weakly annotated labels or video level ones") 43 | parser.add_argument("--real_av_labels", action='store_true', 44 | help="use real audio-visual separate weakly labels or label smoothing") 45 | # train_label paths 46 | parser.add_argument("--weak_label_train_video_level", type=str, default="label_path/train/train_weakly.txt") 47 | parser.add_argument("--weak_label_train_audio", type=str, default="label_path/train/train_audio_weakly.txt", 48 | help="audio weak train csv file") 49 | parser.add_argument("--weak_label_train_visual", type=str, default="label_path/train/train_visual_weakly.txt", 50 | help="visual weak train csv file") 51 | # val_label_paths 52 | parser.add_argument("--weak_label_val_audio", type=str, default="label_path/val/val_audio_weakly.csv", 53 | help="audio weak test csv file") 54 | parser.add_argument("--weak_label_val_visual", type=str, default="label_path/val/val_visual_weakly.csv", 55 | help="visual weak test csv file") 56 | parser.add_argument("--weak_label_val", type=str, default="label_path/val/val_weak_av.csv", 57 | help="weak test csv file") 58 | parser.add_argument("--label_val_audio", type=str, default="label_path/val/val_audio.csv", 59 | help="temporally fine-grained annotated validation csv file for audio") 60 | parser.add_argument("--label_val_visual", type=str, default="label_path/val/val_visual.csv", 61 | help="temporally fine-grained annotated validation csv file for visual") 62 | 63 | # test_label_paths 64 | parser.add_argument("--weak_label_test_audio", type=str, default="label_path/test/test_audio_weakly.csv", 65 | help="audio weak test csv file") 66 | parser.add_argument("--weak_label_test_visual", type=str, default="label_path/test/test_visual_weakly.csv", 67 | help="visual weak test csv file") 68 | parser.add_argument("--weak_label_test", type=str, default="label_path/test/test_weak_av.csv", 69 | help="weak test csv file") 70 | parser.add_argument("--label_test_audio", type=str, default="label_path/test/test_audio.csv", 71 | help="temporally fine-grained annotated validation csv file for audio") 72 | parser.add_argument("--label_test_visual", type=str, default="label_path/test/test_visual.csv", 73 | help="temporally fine-grained annotated validation csv file for visual") 74 | 75 | # training settings 76 | parser.add_argument('--batch_size', type=int, default=16, help='input batch size for training (default: 16)') 77 | parser.add_argument('--epochs', type=int, default=30, help='number of epochs to train (default: 30)') 78 | parser.add_argument('--stg1_lr', type=float, default=1e-4, help='learning rate (default: 1e-4)') 79 | parser.add_argument('--stg2_lr', 
type=float, default=1e-4, help='learning rate (default: 1e-4)') 80 | parser.add_argument('--stg3_lr', type=float, default=1e-4, help='learning rate (default: 1e-4)') 81 | parser.add_argument('--step_size', type=int, default=10, help='step size') 82 | parser.add_argument('--stg2_evnet_loss_weight', type=float, default=1.0, help='stage2 event loss weight (default: 1.0)') 83 | parser.add_argument('--stg3_event_loss_weight', type=float, default=1.0, help='stage3 event loss weight (default: 1.0)') 84 | parser.add_argument('--stg3_av_loss_weight', type=float, default=1.0, help='stg3 av loss weight') 85 | parser.add_argument("--just_event_loss", action='store_true', help="just use event loss in stage2") 86 | parser.add_argument("--el_warm_up_epoch", type=int, default=0, help="will be used in the future") 87 | parser.add_argument("--s3_el_warm_up_epoch", type=int, default=0, help="will be used in the future") 88 | parser.add_argument('--num_classes', type=int, default=35) 89 | 90 | 91 | # model settings 92 | parser.add_argument("--num_stages", type=int, default=1, help="number of stages in the model") 93 | # 1. Segment Level Network 94 | parser.add_argument("--transformer_dim", type=int, default=512, 95 | help="dimension in multiscale transformer") 96 | parser.add_argument("--transformer_num_heads", type=int, default=8, 97 | help="number of heads in multiscale transformer") 98 | parser.add_argument("--mask_generator_type", type=str, default='conv', choices=['conv', 'attn'], 99 | help="network type of the mask generator") 100 | parser.add_argument("--transformer_temperature", type=float, default=1., 101 | help="temperature in softmax of multiscale transformer") 102 | parser.add_argument("--num_transformer_layers", type=int, default=6, 103 | help="number of layers in multiscale transformer") 104 | parser.add_argument("--window_shift", action='store_true', 105 | help="whether to shift window within a layer") 106 | parser.add_argument("--basic_window_size", type=int, default=2, 107 | help="the size of the smallest window in multiscale transformer") 108 | parser.add_argument("--flow_between_layers", type=str, default='sequential', 109 | choices=['sequential', 'ada_weight', 'dense_connected'], 110 | help="control the feature flow in multiscale window transformer") 111 | parser.add_argument("--s1_attn", type=str, default='all', choices=['all', 'self', 'cm', 'none'], 112 | help="attention types used in stage 1 (self, cross-modal, both, or none)") 113 | 114 | # 2. 
GAT Network 115 | # hyper-parameters of the GAT are fixed, not exposed as args 116 | parser.add_argument("--extract_sb_th", type=float, default=0.7, 117 | help="threshold for extracting subgraphs according to stage 1 predictions") 118 | parser.add_argument("--add_sb_cos_th", type=float, default=0.9, 119 | help="cosine similarity threshold when adding nodes") 120 | parser.add_argument("--add_sb_pred_th", type=float, default=0.5, 121 | help="prediction probability threshold when adding nodes") 122 | parser.add_argument("--pool_method", type=str, default='avg', choices=['avg', 'att'], 123 | help="method of getting event features") 124 | parser.add_argument("--gat_residual", action='store_true', help="use skip connections in GAT") 125 | parser.add_argument("--cross_modal", action='store_true', help="will be used in the future") 126 | parser.add_argument("--adj_mode", default='local', choices=['local', 'global'], help="will be used in the future") 127 | 128 | # prior GAT args, no longer used; will be deleted in the future 129 | parser.add_argument("--gat_edge_threshold", type=float, default=0.5, 130 | help="mask edge threshold before GAT") 131 | parser.add_argument("--graph_op_type", type=str, default='conv', choices=['conv', 'attn'], 132 | help="network type of the graph operation") 133 | 134 | # 3. event interaction network 135 | parser.add_argument("--event_interaction_op", type=str, default='attn', 136 | choices=['attn','mhsa','none'], help="operation used for event interaction; more choices may be added in the future") 137 | parser.add_argument("--s3_within_modal", action='store_true', help="use within-modal interaction in the s3 event interaction network or not") 138 | parser.add_argument("--s3_cross_modal", action='store_true', help="use cross-modal interaction in the s3 event interaction network or not") 139 | parser.add_argument("--s3_residual", action='store_true', help="whether to use skip connections in the s3 event interaction net") 140 | parser.add_argument("--s3_share_fc", action='store_true', help="to get event prob in s3, reuse fc_prob from s2") 141 | parser.add_argument("--s3_gat_nheads", type=int, default=1, help='num heads of GAT in stage3') 142 | parser.add_argument("--s3_mhsa_nheads", type=int, default=1, help='num heads of MHSA in stage3') 143 | parser.add_argument("--s3_mhsa_pe", action='store_true', help='use position encoding in s3 or not') 144 | parser.add_argument("--s3_gat_depth", type=int, default=2, help='depth of GAT in stage3; if 1, there is only one output layer') 145 | parser.add_argument("--s3_dropout", type=float, default=0.0, help='dropout of s3') 146 | parser.add_argument("--s3_cm_method", type=str, default='concat', choices=['concat', 'add', 'sequential'], help='cross-modal attention method of MHSA in s3') 147 | parser.add_argument("--s3_pre_norm", action='store_true', help='LN before MHSA in s3') 148 | parser.add_argument("--s3_post_norm", action='store_true', help='LN after MHSA and residual in s3') 149 | parser.add_argument("--s3_no_share_weight", action='store_true', help="do not share weights in s3") 150 | parser.add_argument("--s3_share_cm", action='store_true', help="self-attention and cross-modal attention share weights") 151 | parser.add_argument("--s3_just_cm", action='store_true', help="use only cross-modal attention") 152 | parser.add_argument("--s3_feature_detach", action='store_true', help='detach gradients when reweighting snippet attention for event features') 153 | parser.add_argument("--s3_event_proj", action='store_true', help="use a fc layer for event projection") 154 | parser.add_argument("--s3_attn", type=str, default='all', choices=['all', 'self', 'cm', 'none'], 
155 | help="attention types used in stage 3 (self, cross-modal, both, or none)") 156 | 157 | # other setting 158 | parser.add_argument("--train", action='store_true', help="train or test") 159 | parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)') 160 | parser.add_argument('--log-interval', type=int, default=50, 161 | help='how many batches to wait before logging training status') 162 | parser.add_argument("--model_path", type=str, default='ckpts/', help="path to save trained models") 163 | parser.add_argument("--resume", action='store_true', help="whether to resume from a saved checkpoint") 164 | parser.add_argument("--eval_output_path", type=str, default='eval_output/', help="path to save evaluation results") 165 | parser.add_argument('--gpu', type=str, default='1', help='gpu device number') 166 | parser.add_argument("--experiment_date", type=str, required=True, help="e.g., Apr15") 167 | 168 | parser.add_argument("--tensorsummary_name", type=str, default='', help="name of the tensorboard summary file") 169 | parser.add_argument("--save_prob_dir", type=str, default='', help="directory to save predicted probabilities") 170 | 171 | args = parser.parse_args() 172 | print(args) 173 | print('Experiment date: ', args.experiment_date) 174 | 175 | model_name = 'Multimodal-Multi-Event-Video-Analyzer_' \ 176 | 'label_utility-{}_' \ 177 | 'number_of_stages-{}_' \ 178 | 'transformer_layers-{}_' \ 179 | 'basic_window_size-{}_' \ 180 | 'window_shift-{}_' \ 181 | 'experiment_date-{}'.format(args.label_format, 182 | args.num_stages, 183 | args.num_transformer_layers, 184 | args.basic_window_size, 185 | args.window_shift, 186 | args.experiment_date) 187 | 188 | print('Model Name: \n', model_name) 189 | 190 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 191 | setup_seed(args.seed) 192 | np.set_printoptions(threshold=np.inf) 193 | 194 | if not os.path.exists(args.model_path): 195 | os.mkdir(args.model_path) 196 | if not os.path.exists(args.eval_output_path): 197 | os.mkdir(args.eval_output_path) 198 | 199 | device = torch.device('cuda:0') 200 | 201 | model = MultiStageNet(args, 202 | num_stages=args.num_stages, 203 | label_utility=args.label_format, 204 | num_hierarchy=args.num_transformer_layers, 205 | window_shift=args.window_shift, 206 | basic_window_size=args.basic_window_size, 207 | snippet_graph_op=args.graph_op_type, 208 | gat_edge_threshold=args.gat_edge_threshold) 209 | 210 | model = model.to(device) 211 | 212 | if args.resume: 213 | if os.path.exists(args.model_path + model_name + ".pth"): 214 | print('resume from {}'.format(args.model_path + model_name + ".pth")) 215 | model.load_state_dict(torch.load(args.model_path + model_name + ".pth")) 216 | else: 217 | print('No resume checkpoint found! 
Train from scratch!') 218 | 219 | if args.train: 220 | 221 | if args.label_format == 'video': 222 | train_dataset = MESSDatasetNew(label=args.weak_label_train_video_level, 223 | label_a=args.weak_label_train_audio, 224 | label_v=args.weak_label_train_visual, 225 | audio_dir=args.audio_feature_path, 226 | video_dir=args.visual_feature_path, 227 | st_dir=args.spatio_temporal_visual_feature_path, 228 | transform=transforms.Compose([ToTensor(), ToEqualLengthSample()])) 229 | else: 230 | raise NotImplementedError 231 | 232 | val_dataset = MESSDataset(label=args.weak_label_val, 233 | audio_dir=args.audio_feature_path, 234 | video_dir=args.visual_feature_path, 235 | st_dir=args.spatio_temporal_visual_feature_path, 236 | transform=transforms.Compose([ToTensor()])) 237 | 238 | train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4, 239 | pin_memory=True, drop_last=True) 240 | val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=4, pin_memory=True, 241 | drop_last=False) 242 | optimizer_list = [] 243 | if args.num_stages >= 1: 244 | stg1_optimizer = optim.Adam(model.stage1.parameters(), lr=args.stg1_lr) 245 | stg1_scheduler = optim.lr_scheduler.StepLR(stg1_optimizer, step_size=args.step_size, gamma=0.1) 246 | optimizer_list.append(stg1_optimizer) 247 | if args.num_stages >= 2: 248 | stg2_optimizer = optim.Adam(model.stage2.parameters(), lr=args.stg2_lr) 249 | stg2_scheduler = optim.lr_scheduler.StepLR(stg2_optimizer, step_size=args.step_size, gamma=0.1) 250 | optimizer_list.append(stg2_optimizer) 251 | if args.num_stages >= 3: 252 | stg3_optimizer = optim.Adam(model.stage3.parameters(), lr=args.stg3_lr) 253 | stg3_scheduler = optim.lr_scheduler.StepLR(stg3_optimizer, step_size=args.step_size, gamma=0.1) 254 | optimizer_list.append(stg3_optimizer) 255 | criterion = nn.BCELoss() 256 | 257 | best_F = 0 258 | if len(args.tensorsummary_name)==0: 259 | current_time = time.strftime("%Y-%m-%dT%H:%M", time.localtime()) # if no name is given, use the current time as the name 260 | else: 261 | current_time = args.tensorsummary_name 262 | my_writer = SummaryWriter(log_dir='tensorboard_summary/'+current_time) 263 | # writer = SummaryWriter('./tensorboard_Result') 264 | 265 | for epoch in range(1, args.epochs + 1): 266 | 267 | train(args, model, train_loader, optimizer_list, criterion, epoch=epoch, writer=my_writer) 268 | 269 | if args.num_stages >= 1: 270 | stg1_scheduler.step(epoch) 271 | if args.num_stages >= 2: 272 | stg2_scheduler.step(epoch) 273 | if args.num_stages >= 3: 274 | stg3_scheduler.step(epoch) 275 | 276 | print('\n') 277 | print('---------------start val--------------------') 278 | F = evaluation(args, model, model_name, val_loader, # val 279 | args.weak_label_val_audio, 280 | args.weak_label_val_visual, 281 | args.label_val_audio, 282 | args.label_val_visual) 283 | if F > best_F: # select the best model on the val set 284 | best_F = F 285 | torch.save(model.state_dict(), args.model_path + model_name + ".pth") 286 | print('save model, epoch: ', epoch) 287 | print('---------------end val--------------------') 288 | 289 | else: # test 290 | model.load_state_dict(torch.load(args.model_path + model_name + ".pth"), strict=False) 291 | 292 | test_dataset = MESSDataset(label=args.weak_label_test, 293 | audio_dir=args.audio_feature_path, 294 | video_dir=args.visual_feature_path, 295 | st_dir=args.spatio_temporal_visual_feature_path, 296 | transform=transforms.Compose([ToTensor()])) 297 | 298 | test_loader = DataLoader(test_dataset, batch_size=1, 
shuffle=False, num_workers=4, pin_memory=True, 299 | drop_last=False) 300 | 301 | print('\n') 302 | print('---------------start test--------------------') 303 | evaluation(args, model, model_name, test_loader, # test 304 | args.weak_label_test_audio, 305 | args.weak_label_test_visual, 306 | args.label_test_audio, 307 | args.label_test_visual) 308 | print('---------------end test--------------------') 309 | print('\n') 310 | 311 | 312 | if __name__ == '__main__': 313 | main() 314 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import * 2 | -------------------------------------------------------------------------------- /src/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/base_models.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/base_models.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/base_models.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/base_models.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/canny1d.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/canny1d.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/get_adj.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/get_adj.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/get_multi_adj.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/get_multi_adj.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/graph_modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/graph_modules.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/graph_modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/graph_modules.cpython-38.pyc 
-------------------------------------------------------------------------------- /src/models/__pycache__/graph_modules_single.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/graph_modules_single.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/mhsa_layer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/mhsa_layer.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/modules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/modules.cpython-36.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/modules.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/modules.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/modules_new_stage2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/modules_new_stage2.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/stage_one.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/stage_one.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/stage_three.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/stage_three.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/stage_two.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/stage_two.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/__pycache__/stage_two_new.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/stage_two_new.cpython-38.pyc 
-------------------------------------------------------------------------------- /src/models/__pycache__/transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/models/__pycache__/transformer.cpython-38.pyc -------------------------------------------------------------------------------- /src/models/base_models.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class BasicWindowTransformer(nn.Module): 8 | def __init__(self, dim=512, window_size=2): 9 | super(BasicWindowTransformer, self).__init__() 10 | 11 | self.dim = dim 12 | self.window_size = window_size 13 | 14 | def forward(self, *input): 15 | raise NotImplementedError 16 | 17 | def window_partition(self, original_input): 18 | """ 19 | Args: 20 | original_input: (b, t, dim) 21 | Returns: (b*num_windows, window_size, dim) 22 | """ 23 | b, t, dim = original_input.size() 24 | window_input = original_input.view(b, t // self.window_size, self.window_size, dim) 25 | num_windows = t // self.window_size 26 | window_input = window_input.view(-1, self.window_size, dim).contiguous() 27 | return window_input, num_windows 28 | 29 | def window_reverse(self, window_output, num_windows): 30 | """ 31 | Args: 32 | window_output: (b*num_windows, window_size, dim) 33 | num_windows: int 34 | Returns: (b, t, dim) 35 | """ 36 | b_times_n_win, window_size, dim = window_output.size() 37 | assert window_size == self.window_size, 'Inconsistency in window size !!!' 38 | window_output = window_output.view(-1, num_windows, window_size, dim) 39 | reverse_input = window_output.view(-1, num_windows * window_size, dim).contiguous() 40 | return reverse_input 41 | 42 | 43 | class BasicPaddedWindowTransformer(nn.Module): 44 | def __init__(self, dim=512, window_size=2, window_shift=True, shift_stride=None): 45 | super(BasicPaddedWindowTransformer, self).__init__() 46 | 47 | self.dim = dim 48 | self.window_size = window_size 49 | self.window_shift = window_shift 50 | self.shift_stride = shift_stride or window_size // 2 51 | 52 | self.pad_flag = 0 53 | self.padded_len = 0 54 | 55 | def forward(self, *input): 56 | raise NotImplementedError 57 | 58 | def window_forward(self, *input): 59 | raise NotImplementedError 60 | 61 | def window_partition(self, original_input): 62 | """ 63 | Args: 64 | original_input: (b, t, dim) 65 | Returns: (b*num_windows, window_size, dim) 66 | """ 67 | self.pad_flag = 0 68 | b, t, dim = original_input.size() 69 | if t % self.window_size == 0: 70 | window_input = original_input.view(b, t // self.window_size, self.window_size, dim) 71 | num_windows = t // self.window_size 72 | window_input = window_input.view(-1, self.window_size, dim).contiguous() 73 | else: 74 | residual_seg_len = t % self.window_size 75 | padded_len = self.window_size - residual_seg_len 76 | self.padded_len = padded_len 77 | self.pad_flag = 1 78 | 79 | padding = original_input[:, -padded_len:] 80 | padded_input = torch.cat((original_input, padding), dim=1) 81 | 82 | num_windows = t // self.window_size + 1 83 | assert num_windows == (t + padded_len) // self.window_size, \ 84 | 'wrong padding !!!, got num_windows = {}, t = {}, ' \ 85 | 'padded len = {}, self.window_size = {}'.format( 86 | num_windows, t, padded_len, self.window_size) 87 | window_input = padded_input.view(b, (t + padded_len) // self.window_size, self.window_size, dim) 88 | 
window_input = window_input.view(-1, self.window_size, dim).contiguous() 89 | 90 | return window_input, num_windows 91 | 92 | def window_reverse(self, window_output, num_windows): 93 | """ 94 | Args: 95 | window_output: (b*num_windows, window_size, dim) 96 | num_windows: int 97 | Returns: (b, t, dim) 98 | """ 99 | b_times_n_win, window_size, dim = window_output.size() 100 | assert window_size == self.window_size, 'Inconsistency in window size !!!' 101 | window_output = window_output.view(-1, num_windows, window_size, dim) 102 | reverse_input = window_output.view(-1, num_windows * window_size, dim).contiguous() 103 | 104 | if self.pad_flag != 0: 105 | reverse_input = reverse_input[:, :-self.padded_len] 106 | 107 | return reverse_input 108 | 109 | def sequence_shift(self, x): 110 | """ 111 | Shift the input sequence to fit the window shift operation. 112 | E.g.,: 113 | 1111 2222 3333 4444 --> 1122 2233 3344 4411 114 | """ 115 | 116 | shifted_x = torch.roll(x, shifts=-self.shift_stride, dims=1) 117 | return shifted_x 118 | 119 | def sequence_inverse_shift(self, shifted_x): 120 | """ 121 | Recover the original sequence w.r.t. the temporal order. 122 | E.g.,: 123 | 1122 2233 3344 4411 --> 1111 2222 3333 4444 124 | """ 125 | 126 | x = torch.roll(shifted_x, shifts=self.shift_stride, dims=1) 127 | return x 128 | -------------------------------------------------------------------------------- /src/models/get_adj.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | def get_batch_adj(frame_prob, th, min_length,cross_modal=False): 5 | 6 | bs_a_sb_list=[] 7 | bs_v_sb_list=[] 8 | bs, t, _, _ = frame_prob.shape 9 | adj=np.zeros([bs,2*t,2*t]) 10 | 11 | for i in range(bs): 12 | adj_a, a_subgraph_list=get_adj(frame_prob[i,:,0,:], th=th, min_length=min_length) 13 | adj_v, v_subgraph_list=get_adj(frame_prob[i,:,1,:], th=th, min_length=min_length) 14 | adj[i,:t,:t]=adj_a 15 | adj[i,t:,t:]=adj_v 16 | 17 | if cross_modal==True: 18 | 19 | adj[i,:t,t:]=np.diag(np.ones(t)) 20 | adj[i,t:,:t]=np.diag(np.ones(t)) 21 | 22 | bs_a_sb_list.append(a_subgraph_list) 23 | bs_v_sb_list.append(v_subgraph_list) 24 | return adj, bs_a_sb_list, bs_v_sb_list 25 | 26 | def get_adj(frame_prob, th=0.5, min_length=1): 27 | 28 | t, num_classes=frame_prob.shape 29 | subgraph_list=[[] for _ in range(num_classes)] 30 | 31 | diag=np.ones(t) 32 | adj=np.diag(diag) 33 | adj[1:,:-1]=adj[1:,:-1]+np.diag(diag[:-1]) 34 | adj[:-1,1:]=adj[:-1,1:]+np.diag(diag[1:]) 35 | 36 | if th == 'avg': 37 | avg_prob=np.mean(frame_prob,axis=0) 38 | th=np.mean(avg_prob) 39 | elif th=='ada': 40 | avg_prob=np.mean(frame_prob,axis=0) 41 | sort_prob=np.sort(avg_prob) 42 | th=sort_prob[-4] 43 | else: 44 | th=th 45 | 46 | for i in range(num_classes): 47 | if np.max(frame_prob[:,i])>=th: 48 | flag=0 49 | for j in range(t): 50 | if flag==0 and frame_prob[j,i]>=th: 51 | start=j 52 | flag=1 53 | if flag==1: 54 | if j>=t-1 and frame_prob[j,i]>=th: 55 | end=t 56 | flag=0 57 | elif frame_prob[j,i]=min_length: 62 | subgraph_adj=local_adj(t, start, end) 63 | adj=np.where(adj>subgraph_adj, adj, subgraph_adj) 64 | subgraph_list[i]+=list(range(start,end)) 65 | return adj, subgraph_list 66 | 67 | 68 | def local_adj(t,start,end): 69 | 70 | row=np.zeros([t,t]) 71 | col=np.zeros([t,t]) 72 | row[start:end,:]=1 73 | col[:,start:end]=1 74 | local_adj=np.where(row>col,col,row) 75 | return local_adj 76 | 77 | 78 | 79 | 80 | if __name__=='__main__': 81 | a=np.array([[1,2,3], 82 | [4,5,6]]) 83 | 84 | print(np.max(a[0])) 85 | 
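    # Hedged usage sketch with dummy data (not part of the original file):
    # get_adj turns per-snippet event probabilities of shape (t, num_classes)
    # into a (t, t) snippet adjacency plus, per class, the list of snippet
    # indices whose probability clears the threshold `th`.
    probs = np.random.rand(10, 35)  # hypothetical: 10 snippets, 35 event classes
    adj, subgraphs = get_adj(probs, th=0.5, min_length=2)
    print(adj.shape)      # (10, 10)
    print(subgraphs[0])   # snippet indices in the class-0 subgraph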
86 | -------------------------------------------------------------------------------- /src/models/get_multi_adj.py: -------------------------------------------------------------------------------- 1 | 2 | from asyncio import events 3 | import numpy as np 4 | 5 | def get_batch_adj(frame_prob, th, min_length,event_split=False,adj_mode='local'): 6 | 7 | bs_a_sb_list=[] 8 | bs_v_sb_list=[] 9 | bs, t, _, num_classes = frame_prob.shape 10 | adj=np.zeros([bs,num_classes,2*t,2*t]) 11 | 12 | for i in range(bs): 13 | adj_a, a_subgraph_list=get_adj(frame_prob[i,:,0,:], th=th, min_length=min_length, event_split=event_split, adj_mode=adj_mode) 14 | adj_v, v_subgraph_list=get_adj(frame_prob[i,:,1,:], th=th, min_length=min_length, event_split=event_split, adj_mode=adj_mode) 15 | adj[i,:,:t,:t]=adj_a 16 | adj[i,:,t:,t:]=adj_v 17 | 18 | bs_a_sb_list.append(a_subgraph_list) 19 | bs_v_sb_list.append(v_subgraph_list) 20 | 21 | adj=adj.transpose(1,0,2,3) 22 | 23 | 24 | return adj, bs_a_sb_list, bs_v_sb_list 25 | 26 | def get_adj(frame_prob, th=0.5, min_length=1, event_split=False, adj_mode='local'): 27 | 28 | t, num_classes=frame_prob.shape 29 | subgraph_list=[[] for _ in range(num_classes)] 30 | 31 | diag=np.ones(t) 32 | adj=np.diag(diag) 33 | adj[1:,:-1]=adj[1:,:-1]+np.diag(diag[:-1]) 34 | adj[:-1,1:]=adj[:-1,1:]+np.diag(diag[1:]) 35 | 36 | adj=np.expand_dims(adj,0) 37 | adj = adj*np.ones((num_classes,1,1)) 38 | 39 | if th == 'avg': 40 | avg_prob=np.mean(frame_prob,axis=0) 41 | th=np.mean(avg_prob) 42 | 43 | else: 44 | th=th 45 | 46 | for i in range(num_classes): 47 | if np.max(frame_prob[:,i])>=th: 48 | if adj_mode=='local': 49 | flag=0 50 | for j in range(t): 51 | if flag==0 and frame_prob[j,i]>=th: 52 | start=j 53 | flag=1 54 | if flag==1: 55 | if j>=t-1 and frame_prob[j,i]>=th: 56 | end=t 57 | flag=0 58 | elif frame_prob[j,i]=min_length: 63 | subgraph_adj=local_adj(t, start, end) 64 | 65 | if event_split==True: 66 | adj[i]=np.where(adj[i]>subgraph_adj, adj[i], subgraph_adj) 67 | else: 68 | adj=np.where(adj>subgraph_adj, adj, subgraph_adj) 69 | 70 | subgraph_list[i]+=list(range(start,end)) 71 | 72 | elif adj_mode == 'global': 73 | 74 | garray = list(np.where(frame_prob[:,i]>th)[0]) 75 | 76 | subgraph_adj=global_adj(t, garray) 77 | 78 | if event_split==True: 79 | adj[i]=np.where(adj[i]>subgraph_adj, adj[i], subgraph_adj) 80 | else: 81 | adj=np.where(adj>subgraph_adj, adj, subgraph_adj) 82 | 83 | subgraph_list[i] = garray 84 | 85 | else: 86 | raise NotImplementedError('illegal choice') 87 | 88 | return adj, subgraph_list 89 | 90 | def global_adj(t, array): 91 | row=np.zeros([t,t]) 92 | col=np.zeros([t,t]) 93 | row[array,:]=1 94 | col[:,array]=1 95 | local_adj=np.where(row>col,col,row) 96 | return local_adj 97 | 98 | 99 | def local_adj(t,start,end): 100 | 101 | row=np.zeros([t,t]) 102 | col=np.zeros([t,t]) 103 | row[start:end,:]=1 104 | col[:,start:end]=1 105 | local_adj=np.where(row>col,col,row) 106 | return local_adj 107 | 108 | 109 | if __name__=='__main__': 110 | a=np.array([[1,2,3], 111 | [4,5,6]]) 112 | 113 | print(np.max(a[0])) 114 | 115 | -------------------------------------------------------------------------------- /src/models/graph_modules.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from copy import deepcopy 7 | 8 | 9 | class CrossGATLayer(nn.Module): 10 | 11 | 12 | def __init__(self, in_features, out_features, dropout=0, alpha=0.1, 
concat=True, hybrid_cond=0., t_ksize=1, s_ksize=4): 13 | super(CrossGATLayer, self).__init__() 14 | 15 | self.dropout = dropout 16 | self.in_features = in_features 17 | self.out_features = out_features 18 | self.alpha = alpha 19 | self.concat = concat 20 | self.hybrid_cond = hybrid_cond 21 | self.t_ksize=t_ksize 22 | self.s_ksize=s_ksize 23 | assert max(self.t_ksize,self.s_ksize)>=1 24 | 25 | self.W = nn.Parameter(torch.empty(size=(in_features, out_features), device='cuda')) 26 | nn.init.xavier_uniform_(self.W.data, gain=1.414) 27 | self.a = nn.Parameter(torch.empty(size=(2 * out_features, 1))) 28 | nn.init.xavier_uniform_(self.a.data, gain=1.414) 29 | 30 | self.leakyrelu = nn.LeakyReLU(self.alpha) 31 | 32 | def forward(self, x_a, x_v, adj=None): 33 | 34 | bs,t,_=x_a.shape 35 | h=torch.cat([x_a, x_v],dim=1) 36 | Wh = torch.matmul(h, self.W) 37 | 38 | e = self._prepare_attentional_mechanism_input(Wh) 39 | zero_vec = -9e15 * torch.ones_like(e) 40 | if adj is None: 41 | adj=torch.zeros(bs,2*t,2*t).to('cuda') 42 | if self.t_ksize>=1: 43 | tem_adj=self._get_temporal_adj(Wh, k=self.t_ksize) 44 | adj=adj+tem_adj 45 | if self.s_ksize>=1: 46 | sem_adj=self._get_semantic_adj(Wh, k=self.s_ksize) 47 | adj=adj+sem_adj 48 | self.adj=adj 49 | 50 | if self.hybrid_cond == 0: 51 | attention = torch.where(adj > 0, e, zero_vec) 52 | else: 53 | raise NotImplementedError('will update in the future') 54 | 55 | attention = F.softmax(attention, dim=-1) 56 | attention = F.dropout(attention, self.dropout, training=self.training) 57 | self.attention=attention 58 | 59 | h_prime = torch.matmul(attention, Wh) 60 | 61 | h_prime = F.elu(h_prime) 62 | x_a, x_v= torch.chunk(h_prime, chunks=2, dim=1) 63 | return x_a, x_v 64 | 65 | def _prepare_attentional_mechanism_input(self, Wh): 66 | 67 | 68 | 69 | Wh1 = torch.matmul(Wh, self.a[:self.out_features, :]) 70 | Wh2 = torch.matmul(Wh, self.a[self.out_features:, :]) 71 | 72 | e = Wh1 + Wh2.transpose(-1, -2) 73 | return self.leakyrelu(e) 74 | 75 | def _get_semantic_adj(self, x, y=None, k=10, dis='euc'): 76 | """ 77 | https://github.com/frostinassiky/gtad/blob/master/gtad_lib/models.py 78 | 79 | """ 80 | def euclidean_distance(x,y): 81 | inner = 2 * torch.matmul(x,y.transpose(-2, -1)) 82 | xx = torch.sum(x ** 2, dim=-1, keepdim=True) 83 | yy = torch.sum(y ** 2, dim=-1, keepdim=True) 84 | pairwise_distance = -(xx - inner + yy.transpose(-2, -1)) 85 | 86 | return pairwise_distance 87 | 88 | def cosine_distance(x,y): 89 | x_norm=torch.norm(x,dim=-1,keepdim=True) 90 | y_norm=torch.norm(y,dim=-1,keepdim=True) 91 | xy_norm=x_norm*y_norm.transpose(-2,-1) 92 | xy_dot=torch.matmul(x,y.transpose(-2, -1)) 93 | pairwise_distance=xy_dot/xy_norm 94 | return pairwise_distance 95 | 96 | assert len(x.shape)==3 97 | if y is None: 98 | y = x 99 | 100 | bs, t, _=x.shape 101 | assert t%2==0 102 | t=t//2 103 | sem_adj=torch.zeros(bs,2*t,2*t).to('cuda') 104 | 105 | assert dis in ['euc','cos'] 106 | if dis == 'euc': 107 | pairwise_distance=euclidean_distance(x,y) 108 | elif dis=='cos': 109 | pairwise_distance=cosine_distance(x,y) 110 | else: 111 | raise NotImplementedError('not implenment distance') 112 | 113 | _, idx_aa = pairwise_distance[:,:t,:t].topk(k=k, dim=-1) 114 | sem_adj[:,:t,:t].scatter_(-1,torch.cuda.LongTensor(idx_aa),1) 115 | 116 | _, idx_vv = pairwise_distance[:,t:,t:].topk(k=k, dim=-1) 117 | sem_adj[:,t:,t:].scatter_(-1,torch.cuda.LongTensor(idx_vv),1) 118 | 119 | return sem_adj 120 | 121 | def _get_temporal_adj(self, x, k=3): 122 | """ 123 | x: feature (b,t,dim) 124 | return tem_adj: (1, 
t, t) 125 | """ 126 | _,t,_=x.shape 127 | assert t%2==0 128 | t=t//2 129 | 130 | assert k%2==1 131 | k=k//2 132 | tem_adj=np.zeros([2*t,2*t]) 133 | for i in range(2*t): 134 | for j in range(2*t): 135 | if abs(i-j)-0.5<=k and (i-(t-0.5))*(j-(t-0.5))>0: 136 | tem_adj[i][j]=1 137 | 138 | 139 | tem_adj=torch.tensor(tem_adj).to('cuda') 140 | tem_adj=tem_adj.unsqueeze(0) 141 | 142 | return tem_adj 143 | 144 | def __repr__(self): 145 | return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' 146 | 147 | 148 | class DyGAT(nn.Module): 149 | def __init__(self, input_dim, model_dim=None, output_dim=None, dropout=0, alpha=0.1, num_heads=1, residual=False): 150 | """Dense version of GAT.""" 151 | super(DyGAT, self).__init__() 152 | 153 | model_dim = model_dim or input_dim 154 | output_dim = output_dim or input_dim 155 | 156 | assert num_heads==1 157 | 158 | self.dropout = dropout 159 | self.residual=residual 160 | self.attention = CrossGATLayer(input_dim, 161 | model_dim, 162 | dropout=dropout, 163 | alpha=alpha, 164 | concat=True) 165 | 166 | 167 | 168 | self.out_att = CrossGATLayer(model_dim * num_heads, 169 | output_dim, 170 | dropout=dropout, 171 | alpha=alpha, 172 | concat=False) 173 | 174 | self.att1=[] 175 | self.att2=[] 176 | 177 | def forward(self, x_a, x_v, adj): 178 | self.att1=[] 179 | self.att2=[] 180 | 181 | if self.residual: 182 | x_a_in=x_a 183 | x_v_in=x_v 184 | 185 | num_classes=adj.shape[0] 186 | x_a = F.dropout(x_a, self.dropout, training=self.training) 187 | x_v = F.dropout(x_v, self.dropout, training=self.training) 188 | x_a_layer1, x_v_layer1=self.attention(x_a, x_v, adj[0]) 189 | self.att1.append(self.attention.attention) 190 | 191 | for i in range(1,num_classes): 192 | x_a_new, x_v_new=self.attention(x_a, x_v, adj[i]) 193 | x_a_layer1=x_a_layer1+x_a_new 194 | x_v_layer1=x_v_layer1+x_v_new 195 | self.att1.append(self.attention.attention) 196 | 197 | x_a_layer1=x_a_layer1/num_classes 198 | x_v_layer1=x_v_layer1/num_classes 199 | 200 | x_a_layer1 = F.dropout(x_a_layer1, self.dropout, training=self.training) 201 | x_v_layer1 = F.dropout(x_v_layer1, self.dropout, training=self.training) 202 | 203 | x_a_out, x_v_out=self.out_att(x_a_layer1, x_v_layer1, adj[0]) 204 | self.att2.append(self.out_att.attention) 205 | 206 | for i in range(1,num_classes): 207 | x_a_new, x_v_new=self.out_att(x_a_layer1, x_v_layer1, adj[i]) 208 | x_a_out=x_a_out+x_a_new 209 | x_v_out=x_v_out+x_v_new 210 | self.att2.append(self.out_att.attention) 211 | 212 | x_a_out=x_a_out/num_classes 213 | x_v_out=x_v_out/num_classes 214 | 215 | if self.residual: 216 | x_a_out+=x_a_in 217 | x_v_out+=x_v_in 218 | 219 | return x_a_out, x_v_out 220 | 221 | 222 | -------------------------------------------------------------------------------- /src/models/graph_modules_single.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from copy import deepcopy 9 | 10 | 11 | class CrossGATLayer(nn.Module): 12 | 13 | def __init__(self, in_features, out_features, dropout=0, alpha=0.1, act=True, hybrid_cond=0.): 14 | super(CrossGATLayer, self).__init__() 15 | 16 | self.dropout = dropout 17 | self.in_features = in_features 18 | self.out_features = out_features 19 | self.alpha = alpha 20 | self.act = act 21 | self.hybrid_cond = hybrid_cond 22 | 23 | self.W = nn.Parameter(torch.empty(size=(in_features, out_features), device='cuda')) 24 | 
nn.init.xavier_uniform_(self.W.data, gain=1.414) 25 | self.a = nn.Parameter(torch.empty(size=(2 * out_features, 1))) 26 | nn.init.xavier_uniform_(self.a.data, gain=1.414) 27 | 28 | self.leakyrelu = nn.LeakyReLU(self.alpha) 29 | 30 | def forward(self, x_a, x_v, adj=None): 31 | 32 | bs,t,_=x_a.shape 33 | h=torch.cat([x_a, x_v],dim=1) 34 | Wh = torch.matmul(h, self.W) 35 | 36 | e = self._prepare_attentional_mechanism_input(Wh) 37 | zero_vec = -9e15 * torch.ones_like(e) 38 | if adj is None: 39 | # note: unlike the multi-graph variant in graph_modules.py, this layer defines neither t_ksize/s_ksize nor the adjacency builders, so fall back to a fully connected graph (SingleGAT always passes adj in practice) 40 | adj=torch.ones(bs,2*t,2*t).to('cuda') 41 | 42 | 43 | 44 | 45 | 46 | self.adj=adj 47 | 48 | 49 | 50 | if self.hybrid_cond == 0: 51 | attention = torch.where(adj > 0, e, zero_vec) 52 | else: 53 | raise NotImplementedError('will be updated in the future') 54 | 55 | 56 | attention = F.softmax(attention, dim=-1) 57 | 58 | 59 | attention = F.dropout(attention, self.dropout, training=self.training) 60 | 61 | h_prime = torch.matmul(attention, Wh) 62 | 63 | 64 | if self.act: 65 | h_prime = F.elu(h_prime) 66 | x_a, x_v= torch.chunk(h_prime, chunks=2, dim=1) 67 | return x_a, x_v 68 | 69 | def _prepare_attentional_mechanism_input(self, Wh): 70 | 71 | Wh1 = torch.matmul(Wh, self.a[:self.out_features, :]) 72 | Wh2 = torch.matmul(Wh, self.a[self.out_features:, :]) 73 | 74 | e = Wh1 + Wh2.transpose(-1, -2) 75 | return self.leakyrelu(e) 76 | 77 | def __repr__(self): 78 | return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' 79 | 80 | 81 | class SingleGAT(nn.Module): 82 | def __init__(self, input_dim, model_dim=None, output_dim=None, dropout=0, alpha=0.1, num_heads=1, depth = 2): 83 | """Dense version of GAT.""" 84 | super(SingleGAT, self).__init__() 85 | 86 | model_dim = model_dim or input_dim 87 | output_dim = output_dim or input_dim 88 | 89 | assert depth >= 1 90 | assert num_heads >= 1 91 | 92 | 93 | self.dropout = dropout 94 | 95 | 96 | self.depth = depth 97 | self.num_heads = num_heads 98 | self.attentions = [] 99 | for i in range(depth-1): 100 | if i ==0: 101 | layer_input = input_dim 102 | else: 103 | layer_input = model_dim * num_heads 104 | self.attentions.append([CrossGATLayer(layer_input, 105 | model_dim, 106 | dropout=dropout, 107 | alpha=alpha, 108 | act=True) for _ in range(num_heads)]) 109 | 110 | for i, attention_layer in enumerate(self.attentions): 111 | for j, attention in enumerate(attention_layer): 112 | self.add_module('attention_{}_{}'.format(i,j), attention) 113 | 114 | if depth == 1: 115 | outlayer_input = input_dim 116 | else: 117 | outlayer_input = model_dim * num_heads 118 | self.out_att = [CrossGATLayer(outlayer_input, 119 | output_dim, 120 | dropout=dropout, 121 | alpha=alpha, 122 | act=False) for _ in range(num_heads)] 123 | for i, attention in enumerate(self.out_att): 124 | self.add_module('out_att_{}'.format(i), attention) 125 | 126 | 127 | def forward(self, x_a, x_v, adj): 128 | 129 | for i in range(self.depth-1): 130 | x_a = F.dropout(x_a, self.dropout, training=self.training) 131 | x_v = F.dropout(x_v, self.dropout, training=self.training) 132 | 133 | a_list=[] 134 | v_list=[] 135 | for att in self.attentions[i]: 136 | x_a_new, x_v_new=att(x_a, x_v,adj) 137 | a_list.append(x_a_new) 138 | v_list.append(x_v_new) 139 | 140 | x_a=torch.cat(a_list,dim=-1) 141 | x_v=torch.cat(v_list,dim=-1) 142 | 143 | x_a = F.dropout(x_a, self.dropout, training=self.training) 144 | x_v 
= F.dropout(x_v, self.dropout, training=self.training) 145 | a_list=[] 146 | v_list=[] 147 | for att in self.out_att: 148 | x_a_new, x_v_new = att(x_a, x_v, adj) 149 | a_list.append(x_a_new) 150 | v_list.append(x_v_new) 151 | 152 | for i in range(1, self.num_heads): 153 | a_list[0] = a_list[0] + a_list[i] 154 | v_list[0] = v_list[0] + v_list[i] 155 | x_a = a_list[0] / self.num_heads 156 | x_v = v_list[0] / self.num_heads 157 | 158 | x_a = F.elu(x_a) 159 | x_v = F.elu(x_v) 160 | 161 | return x_a, x_v 162 | 163 | 164 | -------------------------------------------------------------------------------- /src/models/mhsa_layer.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | 9 | """ 10 | code from https://github.com/harvardnlp/annotated-transformer/blob/master/the_annotated_transformer.py 11 | """ 12 | 13 | def clones(module, N): 14 | "Produce N identical layers." 15 | return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) 16 | 17 | def attention(query, key, value, mask=None, dropout=None): 18 | "Compute 'Scaled Dot Product Attention'" 19 | d_k = query.size(-1) 20 | scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) 21 | if mask is not None: 22 | scores = scores.masked_fill(mask == 0, -1e9) 23 | p_attn = scores.softmax(dim=-1) 24 | if dropout is not None: 25 | p_attn = dropout(p_attn) 26 | return torch.matmul(p_attn, value), p_attn 27 | 28 | class MultiHeadedAttention(nn.Module): 29 | def __init__(self, h, d_model, dropout=0.1): 30 | "Take in model size and number of heads." 31 | super(MultiHeadedAttention, self).__init__() 32 | assert d_model % h == 0 33 | 34 | self.d_k = d_model // h 35 | self.h = h 36 | self.linears = clones(nn.Linear(d_model, d_model), 4) 37 | self.attn = None 38 | self.dropout = nn.Dropout(p=dropout) 39 | 40 | def forward(self, query, key, value, mask = None, use_att = True): 41 | "Implements Figure 2" 42 | if use_att: 43 | if mask is not None: 44 | 45 | mask = mask.unsqueeze(1) 46 | mask = mask.unsqueeze(-2) 47 | 48 | nbatches = query.size(0) 49 | 50 | 51 | query, key, value = [ 52 | lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) 53 | for lin, x in zip(self.linears, (query, key, value)) 54 | ] 55 | 56 | 57 | x, self.attn = attention( 58 | query, key, value, mask=mask, dropout=self.dropout 59 | ) 60 | 61 | 62 | x = ( 63 | x.transpose(1, 2) 64 | .contiguous() 65 | .view(nbatches, -1, self.h * self.d_k) 66 | ) 67 | del query 68 | del key 69 | del value 70 | return self.linears[-1](x) 71 | 72 | else: 73 | query = self.linears[0](query) 74 | query = self.linears[-1](query) 75 | return query 76 | 77 | 78 | def get_mask(self, event_list, bs, num_cls): 79 | mask = torch.zeros([bs, num_cls]) 80 | mask[[event_list[0], event_list[1]]]=1 81 | return mask 82 | 83 | 84 | class PositionalEncoding(nn.Module): 85 | """Implement the PE function. 
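Builds a batch of sinusoidal encodings from (optionally event-specific) positions: PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)). max_len defaults to 35, matching the number of event classes, since stage three encodes one position per event rather than per time step. 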
86 | 87 | """ 88 | 89 | def __init__(self, 90 | d_model, 91 | dropout, 92 | max_len=35 93 | ): 94 | super(PositionalEncoding, self).__init__() 95 | self.dropout = nn.Dropout(p=dropout) 96 | self.d_model = d_model 97 | self.max_len = max_len 98 | 99 | 100 | 101 | def forward(self, x, pe): 102 | x = x + pe[:, : x.size(1)].requires_grad_(False) 103 | return self.dropout(x) 104 | 105 | def get_position_encoding(self, bs, position): 106 | pe = torch.zeros(bs, self.max_len, self.d_model) 107 | if position is None: 108 | position = torch.arange(0, self.max_len).unsqueeze(1) 109 | div_term = torch.exp( 110 | torch.arange(0, self.d_model, 2) * -(math.log(10000.0) / self.d_model) 111 | ) 112 | 113 | pe[:, :, 0::2] = torch.sin(position * div_term) 114 | pe[:, :, 1::2] = torch.cos(position * div_term) 115 | pe=pe.to('cuda') 116 | 117 | return pe 118 | 119 | def get_position(self, event_list, bs, num_cls): 120 | position = torch.zeros(bs, num_cls) 121 | pos_tensor = np.array(event_list[2]).astype(np.float32) 122 | pos_tensor = torch.from_numpy(pos_tensor) 123 | position[[event_list[0], event_list[1]]] = pos_tensor 124 | position = position.unsqueeze(2) 125 | return position 126 | 127 | 128 | class MHSALayer(nn.Module): 129 | """ 130 | A residual connection followed by a layer norm. 131 | Note for code simplicity the norm is first as opposed to last. 132 | """ 133 | 134 | 135 | def __init__(self, size, dropout): 136 | super(MHSALayer, self).__init__() 137 | self.norm = LayerNorm(size) 138 | self.dropout = nn.Dropout(dropout) 139 | 140 | def forward(self, x, sublayer): 141 | "Apply residual connection to any sublayer with the same size." 142 | return x + self.dropout(sublayer(self.norm(x))) 143 | 144 | 145 | class LayerNorm(nn.Module): 146 | "Construct a layernorm module in the OpenAI style (epsilon inside the square root)." 
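 # i.e., y = g * (x - u) / sqrt(s + e) + b, where u and s are the mean and (biased) variance over the last dim; e is added inside the square root rather than to the denominator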
147 | 148 | def __init__(self, n_state, e=1e-5): 149 | super(LayerNorm, self).__init__() 150 | self.g = nn.Parameter(torch.ones(n_state)) 151 | self.b = nn.Parameter(torch.zeros(n_state)) 152 | self.e = e 153 | 154 | def forward(self, x): 155 | u = x.mean(-1, keepdim=True) 156 | s = (x - u).pow(2).mean(-1, keepdim=True) 157 | x = (x - u) / torch.sqrt(s + self.e) 158 | return self.g * x + self.b -------------------------------------------------------------------------------- /src/models/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .stage_one import StackedWindowTransformer 5 | from .stage_two_new import SnippetGAT 6 | from .stage_three import EventInteractionNet 7 | import time 8 | 9 | 10 | class MultiStageNet(nn.Module): 11 | def __init__(self, 12 | args, 13 | num_stages=3, 14 | label_utility='video', 15 | model_dim=512, 16 | num_heads_in_transformer=4, 17 | mask_generator_type='conv', 18 | temperature_in_transformer=1, 19 | num_hierarchy=6, 20 | flow_across_layers='sequential', 21 | window_shift=True, 22 | basic_window_size=2, 23 | num_classes=35, 24 | snippet_graph_op='conv', 25 | event_graph_op='attn', 26 | gat_edge_threshold=0.5, 27 | ): 28 | super(MultiStageNet, self).__init__() 29 | 30 | self.label_utility = label_utility 31 | 32 | self.num_stage = num_stages 33 | self.args = args 34 | 35 | self.stage1 = StackedWindowTransformer(args, 36 | label_utility=label_utility, 37 | model_dim=model_dim, 38 | num_heads_in_transformer=num_heads_in_transformer, 39 | mask_generator_type=mask_generator_type, 40 | temperature_in_transformer=temperature_in_transformer, 41 | num_hierarchy=num_hierarchy, 42 | flow_across_layers=flow_across_layers, 43 | window_shift=window_shift, 44 | basic_window_size=basic_window_size, 45 | num_classes=num_classes) 46 | 47 | if num_stages >= 2: 48 | self.stage2 = SnippetGAT(args, 49 | model_dim=model_dim, 50 | snippet_graph_op=snippet_graph_op, 51 | edge_threshold=gat_edge_threshold) 52 | 53 | if num_stages >= 3: 54 | self.stage3 = EventInteractionNet(args, 55 | model_dim=model_dim, 56 | event_interaction_op=args.event_interaction_op, 57 | num_classes=num_classes) 58 | 59 | def forward(self, audio, visual, visual_st, id=None): 60 | 61 | if self.label_utility == 'video': 62 | if self.num_stage >= 1: 63 | a_prob, v_prob, frame_prob, x_a, x_v = self.stage1(audio, visual, visual_st) 64 | if self.num_stage >= 2: 65 | g_a_prob, g_v_prob, g_frame_prob, x_a, x_v, a_event_prob_list,\ 66 | a_event_list, v_event_prob_list, v_event_list , a_event, v_event = self.stage2(x_a, x_v, frame_prob) 67 | if self.num_stage >= 3: 68 | if self.args.s3_share_fc: 69 | fc_prob = self.stage2.temporal_pooling.fc_prob 70 | else: 71 | fc_prob = None 72 | a_event_prob_s3, v_event_prob_s3, a_prob_s3, v_prob_s3 = self.stage3(a_event, v_event, a_event_list, v_event_list, g_a_prob, g_v_prob, g_frame_prob, x_a, x_v, fc_prob) 73 | return a_prob, v_prob, frame_prob, \ 74 | g_a_prob, g_v_prob, g_frame_prob, a_event_prob_list, a_event_list, v_event_prob_list, v_event_list, \ 75 | a_event_prob_s3, v_event_prob_s3, a_prob_s3, v_prob_s3 76 | 77 | else: 78 | return a_prob, v_prob, frame_prob, \ 79 | g_a_prob, g_v_prob, g_frame_prob, a_event_prob_list, a_event_list, v_event_prob_list, v_event_list 80 | else: 81 | return a_prob, v_prob, frame_prob 82 | else: 83 | raise NotImplementedError('Label format should only be video, ' 84 | 'but got {} !!! 
'.format(self.label_utility)) 85 | -------------------------------------------------------------------------------- /src/models/modules.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | 7 | class MLP(nn.Module): 8 | """ Multilayer perceptron.""" 9 | 10 | def __init__(self, in_features, hidden_features=None, out_features=None, drop=0.2): 11 | super().__init__() 12 | out_features = out_features or in_features 13 | hidden_features = hidden_features or in_features 14 | self.fc1 = nn.Linear(in_features, hidden_features) 15 | self.act = nn.LeakyReLU() 16 | self.fc2 = nn.Linear(hidden_features, out_features) 17 | self.drop = nn.Dropout(drop) 18 | 19 | def forward(self, x): 20 | x = self.fc1(x) 21 | x = self.act(x) 22 | x = self.drop(x) 23 | x = self.fc2(x) 24 | x = self.drop(x) 25 | return x 26 | 27 | 28 | 29 | class MultiInputSequential(nn.Sequential): 30 | def forward(self, *inputs): 31 | for module in self._modules.values(): 32 | if type(inputs) == tuple: 33 | inputs = module(*inputs) 34 | else: 35 | inputs = module(inputs) 36 | return inputs 37 | 38 | 39 | class MILPooling(nn.Module): 40 | def __init__(self, model_dim=512, num_cls=35): 41 | super(MILPooling, self).__init__() 42 | self.fc_prob = nn.Linear(model_dim, num_cls) 43 | self.fc_frame_att = nn.Linear(model_dim, num_cls) 44 | 45 | def forward(self, a, v): 46 | x = torch.cat([a.unsqueeze(-2), v.unsqueeze(-2)], dim=-2) 47 | frame_prob = torch.sigmoid(self.fc_prob(x)) 48 | 49 | frame_att = torch.softmax(self.fc_frame_att(x), dim=1) 50 | temporal_prob = (frame_att * frame_prob) 51 | a_prob = temporal_prob[:, :, 0, :].sum(dim=1) 52 | v_prob = temporal_prob[:, :, 1, :].sum(dim=1) 53 | 54 | return a_prob, v_prob, frame_prob 55 | 56 | -------------------------------------------------------------------------------- /src/models/modules_new_stage2.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | 4 | import torch 5 | import torch.nn as nn 6 | import numpy as np 7 | 8 | 9 | 10 | class MILPooling(nn.Module): 11 | def __init__(self, model_dim=512, num_cls=35): 12 | super(MILPooling, self).__init__() 13 | self.fc_prob = nn.Linear(model_dim, num_cls) 14 | self.fc_frame_att = nn.Linear(model_dim, num_cls) 15 | 16 | def forward(self, a, v): 17 | x = torch.cat([a.unsqueeze(-2), v.unsqueeze(-2)], dim=-2) # (b, t, 2, dim) 18 | frame_prob = torch.sigmoid(self.fc_prob(x)) 19 | 20 | frame_att = torch.softmax(self.fc_frame_att(x), dim=1) # temporal attention w 21 | temporal_prob = (frame_att * frame_prob) 22 | a_prob = temporal_prob[:, :, 0, :].sum(dim=1) 23 | v_prob = temporal_prob[:, :, 1, :].sum(dim=1) 24 | 25 | return a_prob, v_prob, frame_prob, frame_att 26 | 27 | def event_prob(self, 28 | event_feature, # new event feature 29 | cls_index # index of class 30 | ): 31 | event_prob = torch.sigmoid(self.fc_prob(event_feature))[cls_index] # shared weights between frame prob and event prob 32 | 33 | return event_prob 34 | 35 | class GraphFinetune(): 36 | # add and delete nodes of a subgraph 37 | 38 | def __init__(self, 39 | pred_th = 0.5, # threshold on predicted prob 40 | cos_th = 0.9 # threshold on cosine similarity 41 | ): 42 | self.pred_th = pred_th 43 | self.cos_th = cos_th 44 | 45 | def get_event_feature(self, 46 | feature, # features of one video, one modality 47 | pool_method, # pooling method used to get the event feature 48 | 
class_graph, 49 | frame_att=None, # attention of each frame, only used when pool_method=='att' 50 | keepdim=False 51 | ): 52 | if pool_method=='att': 53 | assert frame_att is not None # frame_att used when pool_method = 'att' 54 | if frame_att.requires_grad: 55 | frame_att = frame_att.detach() 56 | 57 | if pool_method == 'avg': 58 | event_feature = torch.sum(feature[class_graph, :], dim=0, keepdim=keepdim)/len(class_graph) 59 | elif pool_method=='att': 60 | event_feature = feature[class_graph, :] * frame_att[class_graph].unsqueeze(1) 61 | event_feature = torch.sum(event_feature, dim=0, keepdim=keepdim) / torch.sum(frame_att[class_graph]) 62 | else: 63 | raise NotImplementedError 64 | 65 | return event_feature 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /src/models/stage_one.py: -------------------------------------------------------------------------------- 1 | """ 2 | Network in stage one. It is based on stacked shifted window transformers to capture events with various durations. 3 | """ 4 | import os.path 5 | 6 | import torch 7 | 8 | from .modules import * 9 | from .transformer import * 10 | 11 | 12 | class StackedWindowTransformer(nn.Module): 13 | def __init__(self, 14 | args, 15 | label_utility='video', 16 | model_dim=512, 17 | num_heads_in_transformer=4, 18 | mask_generator_type='conv', 19 | temperature_in_transformer=1, 20 | num_hierarchy=6, 21 | flow_across_layers='sequential', 22 | window_shift=True, 23 | basic_window_size=2, 24 | num_classes=35): 25 | super(StackedWindowTransformer, self).__init__() 26 | 27 | self.label_utility = label_utility 28 | self.num_classes = num_classes 29 | self.args = args 30 | 31 | 32 | self.fc_a = nn.Linear(128, model_dim) 33 | self.fc_v = nn.Linear(512, model_dim) 34 | self.fc_st = nn.Linear(512, model_dim) 35 | self.fc_fusion = nn.Linear(model_dim * 2, model_dim) 36 | 37 | 38 | self.multiscale_hybrid_transformer = HybridWindowTransformer(args, 39 | model_dim=model_dim, 40 | num_heads=num_heads_in_transformer, 41 | num_hierarchy=num_hierarchy, 42 | temperature=temperature_in_transformer, 43 | basic_window_size=basic_window_size, 44 | window_shift=window_shift, 45 | feature_flow=flow_across_layers) 46 | 47 | if label_utility == 'video': 48 | self.temporal_pooling = MILPooling(model_dim, num_classes) 49 | else: 50 | raise NotImplementedError('Label format should only be video, ' 51 | 'but got {} !!! '.format(self.label_utility)) 52 | 53 | 54 | def forward(self, audio, visual, visual_st): 55 | 56 | x_a = self.fc_a(audio) 57 | vid_s = self.fc_v(visual) 58 | vid_st = self.fc_st(visual_st) 59 | x_v = torch.cat((vid_s, vid_st), dim=-1) 60 | x_v = self.fc_fusion(x_v) 61 | 62 | x_a, x_v = self.multiscale_hybrid_transformer(x_a, x_v) 63 | 64 | if self.label_utility == 'video': 65 | a_prob, v_prob, frame_prob = self.temporal_pooling(x_a, x_v) 66 | 67 | return a_prob, v_prob, frame_prob, x_a, x_v 68 | else: 69 | raise NotImplementedError('Label format should only be video, ' 70 | 'but got {} !!! '.format(self.label_utility)) 71 | -------------------------------------------------------------------------------- /src/models/stage_three.py: -------------------------------------------------------------------------------- 1 | """ 2 | Network in stage three. It models interactions among the events detected in stage two, within and across modalities. 
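Event-level features interact either through a graph attention network over an event graph ('attn') or through multi-head self-/cross-attention ('mhsa'); event and video-level probabilities are then re-estimated from the updated features. 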
3 | 4 | """ 5 | 6 | 7 | import numpy as np 8 | from copy import deepcopy 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from .graph_modules_single import SingleGAT 13 | from .mhsa_layer import PositionalEncoding, MultiHeadedAttention, MHSALayer, LayerNorm 14 | 15 | from tools.distance import cosine_distance 16 | 17 | 18 | class EventInteractionNet(nn.Module): 19 | 20 | def __init__(self, 21 | args, 22 | model_dim=512, 23 | event_interaction_op='attn', 24 | num_classes=35): 25 | super(EventInteractionNet, self).__init__() 26 | 27 | self.num_cls = num_classes 28 | self.args = args 29 | self.event_interaction_op = event_interaction_op 30 | if args.num_stages == 3: 31 | if args.event_interaction_op == 'attn': 32 | assert args.s3_within_modal | args.s3_cross_modal 33 | if args.event_interaction_op == 'mhsa': 34 | pass 35 | 36 | 37 | 38 | if event_interaction_op == 'attn': 39 | self.gat = SingleGAT(input_dim=model_dim, 40 | num_heads=args.s3_gat_nheads, 41 | depth = args.s3_gat_depth, 42 | dropout = args.s3_dropout 43 | ) 44 | 45 | elif event_interaction_op == 'mhsa': 46 | self.pe = PositionalEncoding(d_model=model_dim, 47 | dropout=args.s3_dropout, 48 | max_len=35) 49 | 50 | if args.s3_no_share_weight: 51 | 52 | self.mhsa_a = MultiHeadedAttention(h=args.s3_mhsa_nheads, 53 | d_model=model_dim, 54 | dropout=args.s3_dropout) 55 | self.mhsa_v = MultiHeadedAttention(h=args.s3_mhsa_nheads, 56 | d_model=model_dim, 57 | dropout=args.s3_dropout) 58 | 59 | self.mhsa_av = MultiHeadedAttention(h=args.s3_mhsa_nheads, 60 | d_model=model_dim, 61 | dropout=args.s3_dropout) 62 | self.mhsa_va = MultiHeadedAttention(h=args.s3_mhsa_nheads, 63 | d_model=model_dim, 64 | dropout=args.s3_dropout) 65 | 66 | self.norm_a = LayerNorm(n_state=model_dim) 67 | self.norm_v = LayerNorm(n_state=model_dim) 68 | 69 | else: 70 | self.mhsa = MultiHeadedAttention(h=args.s3_mhsa_nheads, 71 | d_model=model_dim, 72 | dropout=args.s3_dropout) 73 | 74 | self.cm_mhsa = MultiHeadedAttention(h=args.s3_mhsa_nheads, 75 | d_model=model_dim, 76 | dropout=args.s3_dropout) 77 | 78 | self.norm = LayerNorm(n_state=model_dim) 79 | 80 | 81 | elif event_interaction_op == 'none': 82 | pass 83 | 84 | else: 85 | raise NotImplementedError("more models for event interaction may be added in the future") 86 | 87 | 88 | self.dropout = args.s3_dropout 89 | self.fc_prob = nn.Linear(model_dim, self.num_cls) 90 | self.re_cal_prob = ReCalProb(model_dim=model_dim, 91 | feature_detach=args.s3_feature_detach, 92 | event_proj=args.s3_event_proj) 93 | 94 | def forward(self, a_event, v_event, a_event_list, v_event_list, a_prob, v_prob, frame_prob, x_a, x_v, fc_prob = None): 95 | 96 | 97 | bs, num_cls, num_dim = a_event.shape 98 | 99 | self.a_event_old = deepcopy(a_event.detach()) 100 | self.v_event_old = deepcopy(v_event.detach()) 101 | 102 | self.a_event_list = a_event_list[1] 103 | self.v_event_list = v_event_list[1] 104 | 105 | 106 | if self.event_interaction_op == 'attn': 107 | 108 | 109 | 110 | 111 | adj=np.zeros([bs, 2*self.num_cls, 2*self.num_cls]) 112 | 113 | if self.args.s3_within_modal: 114 | adj[:, :self.num_cls, :self.num_cls] = self.event_adj(bs, self.num_cls, a_event_list) 115 | adj[:, self.num_cls:, self.num_cls:] = self.event_adj(bs, self.num_cls, v_event_list) 116 | if self.args.s3_cross_modal: 117 | adj[:,:self.num_cls, self.num_cls:] = self.cross_modal_event_adj(bs, self.num_cls, a_event_list, v_event_list) 118 | adj[:,self.num_cls:, :self.num_cls] = 
self.cross_modal_event_adj(bs, self.num_cls, v_event_list, a_event_list) 119 | 120 | adj=torch.tensor(adj).to('cuda') 121 | 122 | 123 | 124 | a_event_new, v_event_new = self.gat(a_event, v_event, adj) 125 | 126 | if self.args.s3_residual: 127 | a_event += a_event_new 128 | v_event += v_event_new 129 | 130 | else: 131 | a_event = a_event_new 132 | v_event = v_event_new 133 | 134 | elif self.event_interaction_op =='mhsa': 135 | if self.args.s3_attn == 'all': 136 | s_att = True 137 | c_att = True 138 | elif self.args.s3_attn == 'cm': 139 | s_att = False 140 | c_att = True 141 | elif self.args.s3_attn == 'self': 142 | s_att = True 143 | c_att = False 144 | elif self.args.s3_attn == 'none': 145 | s_att = False 146 | c_att = False 147 | else: 148 | raise NotImplementedError 149 | 150 | if self.args.s3_mhsa_pe: 151 | a_position = self.pe.get_position(a_event_list, bs, num_cls) 152 | a_pe = self.pe.get_position_encoding(bs, a_position) 153 | a_event = self.pe(a_event, a_pe) 154 | 155 | v_position = self.pe.get_position(v_event_list, bs, num_cls) 156 | v_pe = self.pe.get_position_encoding(bs, v_position) 157 | v_event = self.pe(v_event, v_pe) 158 | 159 | if self.args.s3_cross_modal: 160 | if self.args.s3_cm_method == 'concat': 161 | a_mask = self.mhsa.get_mask(a_event_list, bs, num_cls) 162 | v_mask = self.mhsa.get_mask(v_event_list, bs, num_cls) 163 | mask = torch.cat([a_mask, v_mask], dim=1) 164 | mask = mask.to('cuda') 165 | 166 | event = torch.cat([a_event, v_event], dim=1) 167 | shortcut = event 168 | 169 | if self.args.s3_pre_norm: 170 | event = self.norm(event) 171 | 172 | event = self.mhsa(event, event, event, mask) 173 | event = F.dropout(event, self.dropout, training=self.training) 174 | event = event + shortcut 175 | 176 | if self.args.s3_post_norm: 177 | event = self.norm(event) 178 | 179 | a_event, v_event = torch.chunk(event, chunks=2, dim=1) 180 | 181 | elif self.args.s3_cm_method == 'add': 182 | if not self.args.s3_no_share_weight: 183 | a_mask = self.mhsa.get_mask(a_event_list, bs, num_cls) 184 | a_mask = a_mask.to('cuda') 185 | v_mask = self.mhsa.get_mask(v_event_list, bs, num_cls) 186 | v_mask = v_mask.to('cuda') 187 | 188 | a_shortcut = a_event 189 | v_shortcut = v_event 190 | 191 | if self.args.s3_pre_norm: 192 | a_event = self.norm(a_event) 193 | v_event = self.norm(v_event) 194 | 195 | if self.args.s3_share_cm: 196 | a_event_new = self.mhsa(a_event, a_event, a_event, a_mask) + self.mhsa(a_event, v_event, v_event, v_mask) 197 | v_event_new = self.mhsa(v_event, v_event, v_event, v_mask) + self.mhsa(v_event, a_event, a_event, a_mask) 198 | elif self.args.s3_just_cm: 199 | a_event_new = self.cm_mhsa(a_event, v_event, v_event, v_mask) 200 | v_event_new = self.cm_mhsa(v_event, a_event, a_event, a_mask) 201 | else: 202 | a_event_new = self.mhsa(a_event, a_event, a_event, a_mask, use_att = s_att) + self.cm_mhsa(a_event, v_event, v_event, v_mask, use_att = c_att) 203 | v_event_new = self.mhsa(v_event, v_event, v_event, v_mask, use_att = s_att) + self.cm_mhsa(v_event, a_event, a_event, a_mask, use_att = c_att) 204 | 205 | 206 | a_event_new = F.dropout(a_event_new, self.dropout, training=self.training) 207 | v_event_new = F.dropout(v_event_new, self.dropout, training=self.training) 208 | 209 | a_event = a_event_new + a_shortcut 210 | v_event = v_event_new + v_shortcut 211 | 212 | if self.args.s3_post_norm: 213 | a_event = self.norm(a_event) 214 | v_event = self.norm(v_event) 215 | 216 | else: 217 | 218 | a_mask = self.mhsa_a.get_mask(a_event_list, bs, num_cls) 219 | a_mask = 
a_mask.to('cuda') 220 | v_mask = self.mhsa_v.get_mask(v_event_list, bs, num_cls) 221 | v_mask = v_mask.to('cuda') 222 | a_shortcut = a_event 223 | v_shortcut = v_event 224 | 225 | if self.args.s3_pre_norm: 226 | a_event = self.norm_a(a_event) 227 | v_event = self.norm_v(v_event) 228 | 229 | a_event_new = self.mhsa_a(a_event, a_event, a_event, a_mask) + self.mhsa_av(a_event, v_event, v_event, v_mask) 230 | v_event_new = self.mhsa_v(v_event, v_event, v_event, v_mask) + self.mhsa_va(v_event, a_event, a_event, a_mask) 231 | 232 | a_event_new = F.dropout(a_event_new, self.dropout, training=self.training) 233 | v_event_new = F.dropout(v_event_new, self.dropout, training=self.training) 234 | 235 | a_event = a_event_new + a_shortcut 236 | v_event = v_event_new + v_shortcut 237 | 238 | if self.args.s3_post_norm: 239 | a_event = self.norm_a(a_event) 240 | v_event = self.norm_v(v_event) 241 | 242 | else: 243 | raise NotImplementedError('more methods will be added in the future') 244 | 245 | else: 246 | a_mask = self.mhsa.get_mask(a_event_list, bs, num_cls) 247 | a_mask = a_mask.to('cuda') 248 | v_mask = self.mhsa.get_mask(v_event_list, bs, num_cls) 249 | v_mask = v_mask.to('cuda') 250 | 251 | a_shortcut = a_event 252 | v_shortcut = v_event 253 | 254 | if self.args.s3_pre_norm: 255 | a_event = self.norm(a_event) 256 | v_event = self.norm(v_event) 257 | 258 | a_event = self.mhsa(a_event, a_event, a_event, a_mask) 259 | v_event = self.mhsa(v_event, v_event, v_event, v_mask) 260 | 261 | 262 | 263 | a_event = F.dropout(a_event, self.dropout, training=self.training) 264 | v_event = F.dropout(v_event, self.dropout, training=self.training) 265 | 266 | a_event = a_event + a_shortcut 267 | v_event = v_event + v_shortcut 268 | 269 | if self.args.s3_post_norm: 270 | a_event = self.norm(a_event) 271 | v_event = self.norm(v_event) 272 | 273 | elif self.event_interaction_op =='none': 274 | pass 275 | 276 | else: 277 | raise NotImplementedError('more models may be added in the future; currently only GAT and MHSA are supported') 278 | 279 | 280 | self.a_event_new = a_event.detach() 281 | self.v_event_new = v_event.detach() 282 | 283 | a_event_prob_list = [] 284 | v_event_prob_list = [] 285 | for i, j in zip(a_event_list[0], a_event_list[1]): 286 | a_event_prob = self.event_prob(fc_prob, a_event[i,j], j) 287 | a_event_prob_list.append(a_event_prob) 288 | 289 | for i, j in zip(v_event_list[0], v_event_list[1]): 290 | v_event_prob = self.event_prob(fc_prob, v_event[i,j], j) 291 | v_event_prob_list.append(v_event_prob) 292 | 293 | 294 | a_prob, v_prob = self.re_cal_prob(a_event, v_event, a_event_list, v_event_list, a_prob, v_prob, frame_prob, x_a, x_v) 295 | 296 | return a_event_prob_list, v_event_prob_list, a_prob, v_prob 297 | 298 | def event_prob(self, 299 | fc_prob, 300 | event_feature, 301 | cls_index 302 | ): 303 | if fc_prob is None: 304 | event_prob = torch.sigmoid(self.fc_prob(event_feature))[cls_index] 305 | else: 306 | event_prob = torch.sigmoid(fc_prob(event_feature))[cls_index] 307 | return event_prob 308 | 309 | def event_adj(self, bs, num_cls, event_list): 310 | adj=np.zeros([bs,num_cls,num_cls]) 311 | batch_list = [[] for _ in range(bs)] 312 | for i, j in zip(event_list[0], event_list[1]): 313 | batch_list[i].append(j) 314 | 315 | for i,video_list in enumerate(batch_list): 316 | adj[i]=np.diag(np.ones(num_cls)) 317 | row=np.zeros([num_cls,num_cls]) 318 | col=np.zeros([num_cls,num_cls]) 319 | row[video_list,:]=1 320 | col[:,video_list]=1 321 | local_adj=np.where(row>col,col,row) 322 | 
adj[i]=np.where(adj[i]>local_adj,adj[i],local_adj) 323 | 324 | return adj 325 | 326 | def cross_modal_event_adj(self, bs, num_cls, event_list1, event_list2): 327 | 328 | adj=np.zeros([bs,num_cls,num_cls]) 329 | batch_list1 = [[] for _ in range(bs)] 330 | batch_list2 = [[] for _ in range(bs)] 331 | 332 | for i, j in zip(event_list1[0], event_list1[1]): 333 | batch_list1[i].append(j) 334 | for i, j in zip(event_list2[0], event_list2[1]): 335 | batch_list2[i].append(j) 336 | 337 | for i in range(bs): 338 | row=np.zeros([num_cls,num_cls]) 339 | col=np.zeros([num_cls,num_cls]) 340 | row[batch_list1[i],:]=1 341 | 342 | col[:,batch_list2[i]]=1 343 | local_adj=np.where(row>col,col,row) 344 | adj[i]=local_adj 345 | 346 | return adj 347 | 348 | class ReCalProb(nn.Module): 349 | 350 | def __init__(self, model_dim=512, feature_detach=False, event_proj=False): 351 | super(ReCalProb, self).__init__() 352 | self.feature_detach = feature_detach 353 | self.event_proj = event_proj 354 | if event_proj: 355 | self.fc = nn.Linear(model_dim, model_dim) 356 | 357 | def forward(self, a_event, v_event, a_event_list, v_event_list, a_prob, v_prob, frame_prob, x_a, x_v): 358 | 359 | if self.event_proj: 360 | a_event = self.fc(a_event) 361 | v_event = self.fc(v_event) 362 | 363 | a_prob_s3 = torch.clone(a_prob) 364 | v_prob_s3 = torch.clone(v_prob) 365 | if self.feature_detach: 366 | a_event = a_event.detach() 367 | 368 | sim_a = cosine_distance(x_a, a_event) 369 | att_a = torch.softmax(sim_a, dim=1) 370 | 371 | 372 | temporal_prob_a = att_a * frame_prob[:, :, 0, :] 373 | a_prob_new = temporal_prob_a.sum(dim=1) 374 | a_prob_s3[[a_event_list[0], a_event_list[1]]] = a_prob_new[[a_event_list[0], a_event_list[1]]] 375 | 376 | if self.feature_detach: 377 | v_event = v_event.detach() 378 | sim_v = cosine_distance(x_v, v_event) 379 | att_v = torch.softmax(sim_v, dim=1) 380 | 381 | 382 | temporal_prob_v = att_v * frame_prob[:, :, 1, :] 383 | v_prob_new = temporal_prob_v.sum(dim=1) 384 | v_prob_s3[[v_event_list[0], v_event_list[1]]] = v_prob_new[[v_event_list[0], v_event_list[1]]] 385 | 386 | return a_prob_s3, v_prob_s3 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | -------------------------------------------------------------------------------- /src/models/stage_two_new.py: -------------------------------------------------------------------------------- 1 | """ 2 | Network in stage two. It is based on a graph attention network to achieve feature aggregation within events. 
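Per-class adjacency matrices are built from the stage-one frame probabilities (get_batch_adj); a dynamic GAT refines the snippet features over these graphs, and an event-level feature and probability are pooled from each per-class subgraph. 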
3 | """ 4 | from sys import modules 5 | from copy import deepcopy 6 | import torch 7 | 8 | from .graph_modules import * 9 | from .modules_new_stage2 import * 10 | import time 11 | from .get_multi_adj import get_adj, get_batch_adj 12 | import numpy as np 13 | from tools.distance import cosine_distance 14 | 15 | 16 | 17 | class SnippetGAT(nn.Module): 18 | def __init__(self, 19 | args, 20 | model_dim=512, 21 | snippet_graph_op='attn', 22 | edge_threshold=0.7, 23 | mask_update=False, 24 | num_classes=35, 25 | output_layer='fc', 26 | graph_finetune=True 27 | ): 28 | super(SnippetGAT, self).__init__() 29 | 30 | self.mask_update = mask_update 31 | self.num_classes = num_classes 32 | self.output_layer = output_layer 33 | self.graph_finetune = graph_finetune 34 | self.args=args 35 | self.model_dim = model_dim 36 | 37 | if snippet_graph_op == 'conv': 38 | raise NotImplementedError('do not use now') 39 | 40 | 41 | elif snippet_graph_op == 'attn': 42 | self.snippet_graph_op = DyGAT(input_dim=model_dim, residual=args.gat_residual) 43 | 44 | else: 45 | raise NotImplementedError("Incorrect graph operation {} ! " 46 | "Only between 'conv' and 'attn' !".format(snippet_graph_op)) 47 | 48 | self.temporal_pooling = MILPooling(model_dim, num_classes) 49 | self.gfine=GraphFinetune(pred_th = args.add_sb_pred_th, cos_th = args.add_sb_cos_th) 50 | 51 | def forward(self, x_a, x_v, s1_frame_prob): 52 | 53 | 54 | s1_frame_prob_np = s1_frame_prob.detach().cpu().numpy() 55 | self.s1_frame_prob = s1_frame_prob_np 56 | bs = x_a.shape[0] 57 | num_cls = s1_frame_prob_np.shape[-1] 58 | 59 | a_event_prob_list = [] 60 | a_event_list = [[], [], []] 61 | a_event=torch.zeros([bs, num_cls, self.model_dim], requires_grad=True).to('cuda') 62 | v_event_prob_list = [] 63 | v_event_list = [[], [], []] 64 | v_event=torch.zeros([bs, num_cls, self.model_dim], requires_grad=True).to('cuda') 65 | 66 | adj, bs_a_sb_list, bs_v_sb_list=get_batch_adj(s1_frame_prob_np, th=self.args.extract_sb_th, min_length=1, event_split=True, adj_mode=self.args.adj_mode) 67 | 68 | self.a_class_graph_old = deepcopy(bs_a_sb_list) 69 | self.v_class_graph_old = deepcopy(bs_v_sb_list) 70 | 71 | adj=torch.tensor(adj).to('cuda') 72 | 73 | x_a , x_v= self.snippet_graph_op(x_a, x_v, adj) 74 | 75 | self.x_a=x_a.detach() 76 | self.x_v=x_v.detach() 77 | 78 | a_prob, v_prob, frame_prob, frame_att = self.temporal_pooling(x_a, x_v) 79 | 80 | self.frame_att = frame_att.detach() 81 | 82 | if self.graph_finetune: 83 | for i in range(bs): 84 | a_class_graph = bs_a_sb_list[i] 85 | v_class_graph = bs_v_sb_list[i] 86 | for j in range(len(a_class_graph)): 87 | if len(a_class_graph[j]) > 0: 88 | # if self.args.add_node: 89 | # a_class_graph[j] = self.gfine.add_node(x_a[i], a_class_graph[j], s1_frame_prob_np[i, :, 0, j], frame_att[i, :, 0, j], self.args.pool_method) 90 | event_feature = self.gfine.get_event_feature(x_a[i], self.args.pool_method, a_class_graph[j], frame_att = frame_att[i, :, 0, j]) 91 | event_prob = self.temporal_pooling.event_prob(event_feature, j) 92 | 93 | a_event_prob_list.append(event_prob) 94 | a_event_list[0].append(i) 95 | a_event_list[1].append(j) 96 | a_event_list[2].append(sum(a_class_graph[j])/len(a_class_graph[j])) 97 | a_event[i,j]=event_feature 98 | 99 | if len(v_class_graph[j]) > 0: 100 | # if self.args.add_node: 101 | # v_class_graph[j] = self.gfine.add_node(x_v[i], v_class_graph[j], s1_frame_prob_np[i, :, 1, j], frame_att[i, :, 1, j], self.args.pool_method) 102 | event_feature = self.gfine.get_event_feature(x_v[i], self.args.pool_method, 
v_class_graph[j], frame_att = frame_att[i, :, 1, j]) 103 | event_prob = self.temporal_pooling.event_prob(event_feature, j) 104 | 105 | v_event_prob_list.append(event_prob) 106 | v_event_list[0].append(i) 107 | v_event_list[1].append(j) 108 | v_event_list[2].append(sum(v_class_graph[j])/len(v_class_graph[j])) 109 | v_event[i,j]=event_feature 110 | 111 | self.a_class_graph_new = bs_a_sb_list 112 | self.v_class_graph_new = bs_v_sb_list 113 | 114 | return a_prob, v_prob, frame_prob, x_a, x_v, a_event_prob_list, a_event_list, v_event_prob_list, v_event_list, a_event, v_event 115 | -------------------------------------------------------------------------------- /src/models/transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import math 5 | 6 | from .base_models import BasicWindowTransformer, BasicPaddedWindowTransformer 7 | from .modules import MultiInputSequential 8 | 9 | 10 | class TransformerWithinModal(nn.Module): 11 | def __init__(self, dim=512, num_heads=4, qkv_bias=True, qk_scale=None, 12 | attn_drop=0.2, proj_drop=0.2, temperature=1.): 13 | super(TransformerWithinModal, self).__init__() 14 | self.dim = dim 15 | self.num_heads = num_heads 16 | head_dim = dim // num_heads 17 | self.scale = qk_scale or head_dim ** -0.5 18 | self.temperature = temperature 19 | 20 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 21 | self.attn_drop = nn.Dropout(attn_drop) 22 | self.proj = nn.Linear(dim, dim) 23 | self.proj_drop = nn.Dropout(proj_drop) 24 | 25 | self.pre_norm = nn.LayerNorm(dim) 26 | self.post_norm = nn.LayerNorm(dim) 27 | 28 | self.softmax = nn.Softmax(dim=-1) 29 | 30 | def forward(self, x): 31 | shortcut = x 32 | x = self.pre_norm(x) 33 | 34 | b, t, dim = x.size() 35 | qkv = self.qkv(x).reshape(b, t, 3, self.num_heads, dim // self.num_heads) 36 | qkv = qkv.permute(2, 0, 3, 1, 4) 37 | q, k, v = qkv[0], qkv[1], qkv[2] 38 | 39 | q = q * self.scale 40 | attn = q @ k.transpose(-2, -1) 41 | 42 | attn = self.softmax(attn / self.temperature) 43 | attn = self.attn_drop(attn) 44 | 45 | x = (attn @ v).transpose(1, 2).reshape(b, t, dim) 46 | x = self.proj(x) 47 | x = self.proj_drop(x) 48 | 49 | x = x + shortcut 50 | x = self.post_norm(x) 51 | return x 52 | 53 | 54 | class TransformerCrossModal(nn.Module): 55 | def __init__(self, dim=512, num_heads=4, qkv_bias=True, qk_scale=None, 56 | attn_drop=0.2, proj_drop=0.2, temperature=1.): 57 | super(TransformerCrossModal, self).__init__() 58 | self.dim = dim 59 | self.num_heads = num_heads 60 | head_dim = dim // num_heads 61 | self.scale = qk_scale or head_dim ** -0.5 62 | self.temperature = temperature 63 | 64 | self.q = nn.Linear(dim, dim * 1, bias=qkv_bias) 65 | self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) 66 | self.attn_drop = nn.Dropout(attn_drop) 67 | 68 | self.proj = nn.Linear(dim, dim) 69 | self.proj_drop = nn.Dropout(proj_drop) 70 | 71 | self.pre_norm = nn.LayerNorm(dim) 72 | self.post_norm = nn.LayerNorm(dim) 73 | 74 | self.softmax = nn.Softmax(dim=-1) 75 | 76 | def forward(self, x, y): 77 | shortcut = x 78 | x = self.pre_norm(x) 79 | 80 | b, t, dim = x.size() 81 | q = self.q(x).reshape(b, t, self.num_heads, dim // self.num_heads) 82 | q = q.permute(0, 2, 1, 3) 83 | 84 | kv = self.kv(y).reshape(b, t, 2, self.num_heads, dim // self.num_heads) 85 | kv = kv.permute(2, 0, 3, 1, 4) 86 | k, v = kv[0], kv[1] 87 | 88 | q = q * self.scale 89 | attn = q @ k.transpose(-2, -1) 90 | 91 | attn = self.softmax(attn / self.temperature) 92 | attn = 
self.attn_drop(attn) 93 | 94 | x = (attn @ v).transpose(1, 2).reshape(b, t, dim) 95 | x = self.proj(x) 96 | x = self.proj_drop(x) 97 | 98 | x = x + shortcut 99 | x = self.post_norm(x) 100 | return x 101 | 102 | 103 | class WindowTransformerWithinModal(BasicPaddedWindowTransformer): 104 | def __init__(self, dim=512, window_size=2, 105 | num_heads=4, qkv_bias=True, qk_scale=None, 106 | attn_drop=0.2, proj_drop=0.2, window_shift=True, temperature=1.): 107 | super(WindowTransformerWithinModal, self).__init__() 108 | 109 | self.dim = dim 110 | self.window_size = window_size 111 | self.window_shift = window_shift 112 | 113 | self.num_heads = num_heads 114 | head_dim = dim // num_heads 115 | self.scale = qk_scale or head_dim ** -0.5 116 | self.temperature = temperature 117 | 118 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 119 | self.attn_drop = nn.Dropout(attn_drop) 120 | self.proj = nn.Linear(dim, dim) 121 | self.proj_drop = nn.Dropout(proj_drop) 122 | 123 | self.pre_norm = nn.LayerNorm(dim) 124 | self.post_norm = nn.LayerNorm(dim) 125 | 126 | self.softmax = nn.Softmax(dim=-1) 127 | 128 | def forward(self, x): 129 | 130 | x = self.window_forward(x) 131 | 132 | if self.window_shift: 133 | x = self.sequence_shift(x) 134 | x = self.window_forward(x) 135 | x = self.sequence_inverse_shift(x) 136 | 137 | return x 138 | 139 | def window_forward(self, x): 140 | 141 | shortcut = x 142 | x = self.pre_norm(x) 143 | 144 | if self.window_size > 0: 145 | x, n_win = self.window_partition(x) 146 | 147 | b_times_n_win, win_size, dim = x.size() 148 | qkv = self.qkv(x).reshape(b_times_n_win, win_size, 3, self.num_heads, dim // self.num_heads) 149 | qkv = qkv.permute(2, 0, 3, 1, 4) 150 | q, k, v = qkv[0], qkv[1], qkv[2] 151 | 152 | q = q * self.scale 153 | attn = q @ k.transpose(-2, -1) 154 | 155 | attn = self.softmax(attn / self.temperature) 156 | attn = self.attn_drop(attn) 157 | 158 | x = (attn @ v).transpose(1, 2).reshape(b_times_n_win, win_size, dim) 159 | x = self.proj(x) 160 | x = self.proj_drop(x) 161 | 162 | if self.window_size > 0: 163 | x = self.window_reverse(x, n_win) 164 | 165 | x = x + shortcut 166 | x = self.post_norm(x) 167 | 168 | return x 169 | 170 | 171 | class WindowTransformerCrossModal(BasicPaddedWindowTransformer): 172 | def __init__(self, dim=512, window_size=2, 173 | num_heads=4, qkv_bias=True, qk_scale=None, 174 | attn_drop=0.2, proj_drop=0.2, window_shift=True, temperature=1.): 175 | super(WindowTransformerCrossModal, self).__init__() 176 | 177 | self.dim = dim 178 | self.window_size = window_size 179 | self.window_shift = window_shift 180 | 181 | self.num_heads = num_heads 182 | head_dim = dim // num_heads 183 | self.scale = qk_scale or head_dim ** -0.5 184 | self.temperature = temperature 185 | 186 | self.q = nn.Linear(dim, dim * 1, bias=qkv_bias) 187 | self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) 188 | self.attn_drop = nn.Dropout(attn_drop) 189 | 190 | self.proj = nn.Linear(dim, dim) 191 | self.proj_drop = nn.Dropout(proj_drop) 192 | 193 | self.pre_norm = nn.LayerNorm(dim) 194 | self.post_norm = nn.LayerNorm(dim) 195 | 196 | self.softmax = nn.Softmax(dim=-1) 197 | 198 | def forward(self, x, y): 199 | x = self.window_forward(x, y) 200 | 201 | if self.window_shift: 202 | x = self.sequence_shift(x) 203 | y = self.sequence_shift(y) 204 | x = self.window_forward(x, y) 205 | x = self.sequence_inverse_shift(x) 206 | return x 207 | 208 | def window_forward(self, x, y): 209 | shortcut = x 210 | x = self.pre_norm(x) 211 | y = self.pre_norm(y) 212 | 213 | if self.window_size > 0: 214 
| x, n_win_x = self.window_partition(x) 215 | y, n_win_y = self.window_partition(y) 216 | assert n_win_x == n_win_y, 'Length inconsistency between modalities !!!' 217 | 218 | b_times_n_win, win_size, dim = x.size() 219 | q = self.q(x).reshape(b_times_n_win, win_size, self.num_heads, dim // self.num_heads) 220 | q = q.permute(0, 2, 1, 3) 221 | 222 | kv = self.kv(y).reshape(b_times_n_win, win_size, 2, self.num_heads, dim // self.num_heads) 223 | kv = kv.permute(2, 0, 3, 1, 4) 224 | k, v = kv[0], kv[1] 225 | 226 | q = q * self.scale 227 | attn = q @ k.transpose(-2, -1) 228 | 229 | attn = self.softmax(attn / self.temperature) 230 | attn = self.attn_drop(attn) 231 | 232 | x = (attn @ v).transpose(1, 2).reshape(b_times_n_win, win_size, dim) 233 | x = self.proj(x) 234 | x = self.proj_drop(x) 235 | 236 | if self.window_size > 0: 237 | x = self.window_reverse(x, n_win_x) 238 | 239 | x = x + shortcut 240 | x = self.post_norm(x) 241 | return x 242 | 243 | 244 | class HybridWindowTransformerLayer(nn.Module): 245 | def __init__(self, args, model_dim=512, window_size=2, window_shift=True, num_heads=8, temperature=1.): 246 | super(HybridWindowTransformerLayer, self).__init__() 247 | 248 | use_window = window_size > 0 249 | self.args = args 250 | if use_window: 251 | self.within_modal_transformer_a = WindowTransformerWithinModal(dim=model_dim, 252 | window_size=window_size, 253 | window_shift=window_shift, 254 | num_heads=num_heads, 255 | temperature=temperature) 256 | self.within_modal_transformer_v = WindowTransformerWithinModal(dim=model_dim, 257 | window_size=window_size, 258 | window_shift=window_shift, 259 | num_heads=num_heads, 260 | temperature=temperature) 261 | self.cross_modal_transformer = WindowTransformerCrossModal(dim=model_dim, 262 | window_size=window_size, 263 | window_shift=window_shift, 264 | num_heads=num_heads, 265 | temperature=temperature) 266 | 267 | 268 | if self.args.s1_attn == 'self': 269 | self.within_modal_transformer_av = WindowTransformerWithinModal(dim=model_dim, 270 | window_size=window_size, 271 | window_shift=window_shift, 272 | num_heads=num_heads, 273 | temperature=temperature) 274 | 275 | if self.args.s1_attn == 'cm': 276 | self.cross_modal_transformer_a = WindowTransformerCrossModal(dim=model_dim, 277 | window_size=window_size, 278 | window_shift=window_shift, 279 | num_heads=num_heads, 280 | temperature=temperature) 281 | 282 | self.cross_modal_transformer_v = WindowTransformerCrossModal(dim=model_dim, 283 | window_size=window_size, 284 | window_shift=window_shift, 285 | num_heads=num_heads, 286 | temperature=temperature) 287 | 288 | else: 289 | self.within_modal_transformer_a = TransformerWithinModal(dim=model_dim, 290 | num_heads=num_heads, 291 | temperature=temperature) 292 | self.within_modal_transformer_v = TransformerWithinModal(dim=model_dim, 293 | num_heads=num_heads, 294 | temperature=temperature) 295 | self.cross_modal_transformer = TransformerCrossModal(dim=model_dim, 296 | num_heads=num_heads, 297 | temperature=temperature) 298 | 299 | def forward(self, x, y): 300 | if self.args.s1_attn == 'all': 301 | x = self.within_modal_transformer_a(x) 302 | y = self.within_modal_transformer_v(y) 303 | 304 | x_cross = self.cross_modal_transformer(x, y) 305 | y_cross = self.cross_modal_transformer(y, x) 306 | 307 | x = x + x_cross 308 | y = y + y_cross 309 | 310 | elif self.args.s1_attn == 'self': 311 | x = self.within_modal_transformer_a(x) 312 | y = self.within_modal_transformer_v(y) 313 | 314 | x_self = self.within_modal_transformer_av(x) 315 | y_self = 
self.within_modal_transformer_av(y) 316 | 317 | x = x + x_self 318 | y = y + y_self 319 | 320 | elif self.args.s1_attn == 'cm': 321 | 322 | x_c = self.cross_modal_transformer_a(x, y) 323 | y_c = self.cross_modal_transformer_v(y, x) 324 | 325 | x_cross = self.cross_modal_transformer(x_c, y_c) 326 | y_cross = self.cross_modal_transformer(y_c, x_c) 327 | 328 | x = x_c + x_cross 329 | y = y_c + y_cross 330 | 331 | elif self.args.s1_attn == 'none': 332 | pass 333 | 334 | return x, y 335 | 336 | 337 | class HybridWindowTransformer(nn.Module): 338 | def __init__(self, 339 | args, 340 | model_dim=512, 341 | num_heads=8, 342 | temperature=1., 343 | num_hierarchy=3, 344 | basic_window_size=2, 345 | window_shift=True, 346 | feature_flow='sequential'): 347 | super(HybridWindowTransformer, self).__init__() 348 | 349 | self.feature_flow = feature_flow 350 | self.num_hierarchy = num_hierarchy 351 | 352 | index = list(range(num_hierarchy)) 353 | self.window_size_list = list(map(lambda x: basic_window_size ** (x + 1), index)) 354 | 355 | self.args = args 356 | 357 | if feature_flow == 'sequential': 358 | transformer_layers = [] 359 | for i in range(num_hierarchy): 360 | transformer_layers.append(HybridWindowTransformerLayer(args=args, 361 | model_dim=model_dim, 362 | window_size=self.window_size_list[i], 363 | window_shift=window_shift, 364 | num_heads=num_heads, 365 | temperature=temperature)) 366 | self.multiscale_hybrid_transformer = MultiInputSequential(*transformer_layers) 367 | 368 | else: 369 | self.multiscale_hybrid_transformer = nn.ModuleList() 370 | for i in range(num_hierarchy): 371 | self.multiscale_hybrid_transformer.append( 372 | HybridWindowTransformerLayer(args=args, 373 | model_dim=model_dim, 374 | window_size=self.window_size_list[i], 375 | window_shift=window_shift, 376 | num_heads=num_heads, 377 | temperature=temperature) 378 | ) 379 | 380 | if feature_flow == 'ada_weight': 381 | self.gate = nn.Sequential(nn.Linear(model_dim, 128), 382 | nn.ReLU(), 383 | nn.Linear(128, model_dim)) 384 | self.dim_reduction_x = nn.Linear(model_dim * num_hierarchy, model_dim) 385 | self.dim_reduction_y = nn.Linear(model_dim * num_hierarchy, model_dim) 386 | self.leaky_relu = nn.LeakyReLU() 387 | 388 | elif feature_flow == 'dense_connected': 389 | self.in_layer_dim_reduction_x = nn.Linear(model_dim * 2, model_dim) 390 | self.in_layer_dim_reduction_y = nn.Linear(model_dim * 2, model_dim) 391 | 392 | self.dim_reduction_x = nn.Linear(model_dim * (num_hierarchy + 1), model_dim) 393 | self.dim_reduction_y = nn.Linear(model_dim * (num_hierarchy + 1), model_dim) 394 | 395 | self.leaky_relu = nn.LeakyReLU() 396 | 397 | else: 398 | raise NotImplementedError('Incorrect feature flow !!! 
Got {} !!!'.format(feature_flow)) 399 | 400 | def forward(self, x, y): 401 | 402 | if self.feature_flow == 'sequential': 403 | x, y = self.multiscale_hybrid_transformer(x, y) 404 | 405 | elif self.feature_flow == 'ada_weight': 406 | feature_list_x = [] 407 | feature_list_y = [] 408 | for i in range(self.num_hierarchy): 409 | x, y = self.multiscale_hybrid_transformer[i](x, y) 410 | feature_list_x.append(x) 411 | feature_list_y.append(y) 412 | x = torch.cat(feature_list_x, dim=-1) 413 | y = torch.cat(feature_list_y, dim=-1) 414 | x = self.dim_reduction_x(x) 415 | x = self.leaky_relu(x) 416 | y = self.dim_reduction_y(y) 417 | y = self.leaky_relu(y) 418 | 419 | elif self.feature_flow == 'dense_connected': 420 | 421 | feature_list_x = [x] 422 | feature_list_y = [y] 423 | for i in range(self.num_hierarchy): 424 | new_x, new_y = self.multiscale_hybrid_transformer[i](x, y) 425 | 426 | x = torch.cat((x, new_x), dim=-1) 427 | y = torch.cat((y, new_y), dim=-1) 428 | 429 | 430 | x = self.in_layer_dim_reduction_x(x) 431 | x = self.leaky_relu(x) 432 | y = self.in_layer_dim_reduction_y(y) 433 | y = self.leaky_relu(y) 434 | 435 | feature_list_x.append(x) 436 | feature_list_y.append(y) 437 | 438 | x = torch.cat(feature_list_x, dim=-1) 439 | y = torch.cat(feature_list_y, dim=-1) 440 | 441 | x = self.dim_reduction_x(x) 442 | x = self.leaky_relu(x) 443 | y = self.dim_reduction_y(y) 444 | y = self.leaky_relu(y) 445 | 446 | else: 447 | raise NotImplementedError('Incorrect feature flow !!! Got {} !!!'.format(self.feature_flow)) 448 | 449 | return x, y 450 | -------------------------------------------------------------------------------- /src/scripts/test_s3.sh: -------------------------------------------------------------------------------- 1 | # new stage3, use 'add' to do cross modal attention 2 | 3 | python main.py \ 4 | --audio_feature_path "data_path/audio-feat-vggish-align/" \ 5 | --visual_feature_path "data_path/visual-feat-res18-align/" \ 6 | --spatio_temporal_visual_feature_path "data_path/visual-feat-r2plus1d-align/" \ 7 | --weak_label_train_video_level "label_path/train/train_weakly.csv" \ 8 | --weak_label_train_audio "label_path/train/train_audio_weakly.csv" \ 9 | --weak_label_train_visual "label_path/train/train_visual_weakly.csv" \ 10 | --weak_label_val_audio "label_path/val/val_audio_weakly.csv" \ 11 | --weak_label_val_visual "label_path/val/val_visual_weakly.csv" \ 12 | --weak_label_val "label_path/val/val_weak_av.csv" \ 13 | --label_val_audio "label_path/val/val_audio.csv" \ 14 | --label_val_visual "label_path/val/val_visual.csv" \ 15 | --weak_label_test_audio "label_path/test/test_audio_weakly.csv" \ 16 | --weak_label_test_visual "label_path/test/test_visual_weakly.csv" \ 17 | --weak_label_test "label_path/test/test_weak_av.csv" \ 18 | --label_test_audio "label_path/test/test_audio.csv" \ 19 | --label_test_visual "label_path/test/test_visual.csv" \ 20 | --model_path "model_ckpt_path/" \ 21 | --label_format 'video' \ 22 | --epochs 30 \ 23 | --batch_size 16 \ 24 | --stg1_lr 1e-4 \ 25 | --stg2_lr 1e-4 \ 26 | --stg3_lr 2e-4 \ 27 | --step_size 10 \ 28 | --gpu 2 \ 29 | --num_stages 3 \ 30 | --transformer_dim 512 \ 31 | --transformer_num_heads 8 \ 32 | --num_transformer_layers 6 \ 33 | --basic_window_size 2 \ 34 | --window_shift \ 35 | --graph_op_type 'attn' \ 36 | --gat_edge_threshold 0.5 \ 37 | --experiment_date 20221101_s3 \ 38 | --real_av_labels \ 39 | --stg2_evnet_loss_weight 0 \ 40 | --el_warm_up_epoch -3 \ 41 | --extract_sb_th 0.5 \ 42 | --pool_method 'att' \ 43 | --event_interaction_op 'mhsa' 
\ 44 | --s3_share_fc \ 45 | --stg3_event_loss_weight 0.3 \ 46 | --stg3_av_loss_weight 1.0 \ 47 | --s3_el_warm_up_epoch -6 \ 48 | --s3_cross_modal \ 49 | --s3_cm_method 'add' \ 50 | --s3_mhsa_nhead 4 \ 51 | --s3_pre_norm \ 52 | --tensorsummary_name '20221101_s3' -------------------------------------------------------------------------------- /src/scripts/train_s3.sh: -------------------------------------------------------------------------------- 1 | # new stage3, use 'add' to do cross modal attention 2 | 3 | python main.py \ 4 | --audio_feature_path "data_path/audio-feat-vggish-align/" \ 5 | --visual_feature_path "data_path/visual-feat-res18-align/" \ 6 | --spatio_temporal_visual_feature_path "data_path/visual-feat-r2plus1d-align/" \ 7 | --weak_label_train_video_level "label_path/train/train_weakly.csv" \ 8 | --weak_label_train_audio "label_path/train/train_audio_weakly.csv" \ 9 | --weak_label_train_visual "label_path/train/train_visual_weakly.csv" \ 10 | --weak_label_val_audio "label_path/val/val_audio_weakly.csv" \ 11 | --weak_label_val_visual "label_path/val/val_visual_weakly.csv" \ 12 | --weak_label_val "label_path/val/val_weak_av.csv" \ 13 | --label_val_audio "label_path/val/val_audio.csv" \ 14 | --label_val_visual "label_path/val/val_visual.csv" \ 15 | --weak_label_test_audio "label_path/test/test_audio_weakly.csv" \ 16 | --weak_label_test_visual "label_path/test/test_visual_weakly.csv" \ 17 | --weak_label_test "label_path/test/test_weak_av.csv" \ 18 | --label_test_audio "label_path/test/test_audio.csv" \ 19 | --label_test_visual "label_path/test/test_visual.csv" \ 20 | --train \ 21 | --label_format 'video' \ 22 | --epochs 30 \ 23 | --batch_size 16 \ 24 | --stg1_lr 1e-4 \ 25 | --stg2_lr 1e-4 \ 26 | --stg3_lr 2e-4 \ 27 | --step_size 10 \ 28 | --gpu 1 \ 29 | --num_stages 3 \ 30 | --transformer_dim 512 \ 31 | --transformer_num_heads 8 \ 32 | --num_transformer_layers 6 \ 33 | --basic_window_size 2 \ 34 | --window_shift \ 35 | --graph_op_type 'attn' \ 36 | --gat_edge_threshold 0.5 \ 37 | --experiment_date 20221101 \ 38 | --real_av_labels \ 39 | --stg2_evnet_loss_weight 0 \ 40 | --el_warm_up_epoch -3 \ 41 | --extract_sb_th 0.5 \ 42 | --pool_method 'att' \ 43 | --event_interaction_op 'mhsa' \ 44 | --s3_share_fc \ 45 | --stg3_event_loss_weight 0.3 \ 46 | --stg3_av_loss_weight 1.0 \ 47 | --s3_el_warm_up_epoch -6 \ 48 | --s3_cross_modal \ 49 | --s3_cm_method 'add' \ 50 | --s3_mhsa_nhead 4 \ 51 | --s3_pre_norm \ 52 | --tensorsummary_name '20221101_s3' -------------------------------------------------------------------------------- /src/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .train import * 2 | from .evaluation import * 3 | -------------------------------------------------------------------------------- /src/tools/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/__pycache__/distance.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/distance.cpython-38.pyc -------------------------------------------------------------------------------- 
/src/tools/__pycache__/evaluation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/evaluation.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/__pycache__/evaluation_stat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/evaluation_stat.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/__pycache__/get_subgraph.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/get_subgraph.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/__pycache__/plot_graph_mat.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/plot_graph_mat.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/__pycache__/plot_hist.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/plot_hist.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/__pycache__/plot_prob.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/plot_prob.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/__pycache__/stat_neighbor.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/stat_neighbor.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/__pycache__/subgraph_analysis.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/subgraph_analysis.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/__pycache__/train.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/tools/__pycache__/train.cpython-38.pyc -------------------------------------------------------------------------------- /src/tools/distance.py: -------------------------------------------------------------------------------- 1 | # calculate distance of features 2 | 3 | import torch 4 | import numpy as np 5 | 6 | def cosine_distance(x, y, eps=1e-9): 7 | # cosine distance, 8 | # x and y are 3d tensors, (b, t, dim) or (b, num_cls, dim) 9 | x_norm=torch.norm(x,dim=-1,keepdim=True) 10 | 
/src/tools/evaluation.py:
--------------------------------------------------------------------------------
from __future__ import print_function

import os

import numpy as np
import pandas as pd
import torch

from utils import CATEGORIES
from utils.eval_metrics import segment_level, event_level, segment_map


def evaluation(args, model, model_name, val_loader,
               eval_a_weak_file, eval_v_weak_file,
               eval_a_csv_file, eval_v_csv_file):
    """
    Evaluate a trained model.

    Args:
        args: arguments of evaluation
        model: model for evaluation
        model_name: model name (used as the output sub-directory)
        val_loader: validation dataloader
        eval_a_weak_file: audio weak-label (video-level) csv file
        eval_v_weak_file: visual weak-label (video-level) csv file
        eval_a_csv_file: audio snippet-level annotation csv file
        eval_v_csv_file: visual snippet-level annotation csv file

    Returns:
        The mean of the snippet-level average mAP and the event-level
        Type@Avg. F1, both on a 0-100 scale.
    """

    categories = CATEGORIES

    n_categories = len(categories)
    assert n_categories == args.num_classes
    # print_subgraph=False
    model.eval()

    # load annotations
    df_a_w = pd.read_csv(eval_a_weak_file, header=0, sep='\t')
    df_v_w = pd.read_csv(eval_v_weak_file, header=0, sep='\t')
    df_a = pd.read_csv(eval_a_csv_file, header=0, sep='\t')
    df_v = pd.read_csv(eval_v_csv_file, header=0, sep='\t')

    id_to_idx = {id: index for index, id in enumerate(categories)}
    F_seg_a = []
    F_seg_v = []
    F_seg = []
    F_seg_av = []
    F_event_a = []
    F_event_v = []
    F_event = []
    F_event_av = []

    pa_list = []
    pv_list = []
    pav_list = []
    gta_list = []
    gtv_list = []
    gtav_list = []

    with torch.no_grad():
        for batch_idx, sample in enumerate(val_loader):
            name, audio, video, video_st, target = sample['name'], sample['audio'].to('cuda'), sample['video_s'].to(
                'cuda'), sample['video_st'].to('cuda'), sample['label']

            if args.label_format == 'video':
                if args.num_stages == 1:
                    a_prob, v_prob, frame_prob = model(audio, video, video_st, name[0])
                elif args.num_stages == 2:
                    # if using just stage 2, a_event and v_event are not needed
                    a_prob, v_prob, frame_prob, ag_prob, vg_prob, g_frame_prob, _, _, _, _ = model(audio, video, video_st)
                elif args.num_stages == 3:
                    # output unpacking adjusted here so the number of returned variables matches
                    a_prob, v_prob, frame_prob, ag_prob, vg_prob, g_frame_prob, _, _, _, _, _, _, _, _ = model(audio, video, video_st)
                else:
                    pass
            if args.real_av_labels:
                assert args.label_format == 'video', 'real av labels only exist when label format is video!'
                if args.num_stages == 1:
                    o_a = (a_prob.cpu().detach().numpy() >= 0.5).astype(np.int_)
                    o_v = (v_prob.cpu().detach().numpy() >= 0.5).astype(np.int_)

                    Pa = frame_prob[0, :, 0, :].cpu().detach().numpy()
                    Pv = frame_prob[0, :, 1, :].cpu().detach().numpy()

                    pa_list.append(np.transpose(Pa))
                    pv_list.append(np.transpose(Pv))
                    pav_list.append(np.transpose(Pa) * np.transpose(Pv))

                elif args.num_stages >= 2:
                    '''
                    Stage 3 only adds an event loss. For evaluation we need the
                    prediction of each snippet and class, not the event prob,
                    so stage 2 and stage 3 share the same output here.
                    '''
                    o_a = (ag_prob.cpu().detach().numpy() >= 0.5).astype(np.int_)
                    o_v = (vg_prob.cpu().detach().numpy() >= 0.5).astype(np.int_)

                    Pa = g_frame_prob[0, :, 0, :].cpu().detach().numpy()
                    Pv = g_frame_prob[0, :, 1, :].cpu().detach().numpy()

                    pa_list.append(np.transpose(Pa))
                    pv_list.append(np.transpose(Pv))
                    pav_list.append(np.transpose(Pa) * np.transpose(Pv))

                    # --------- frame prob of stage 1 -----------
                    # model.stage2.s1_frame_prob: (bs, t, 2, num_cls)
                    pas1 = model.stage2.s1_frame_prob[0, :, 0, :]  # (t, num_cls)
                    pas1 = np.transpose(pas1)

                    pvs1 = model.stage2.s1_frame_prob[0, :, 1, :]
                    pvs1 = np.transpose(pvs1)

                    # --------- frame attention -----------
                    frame_att_a = model.stage2.frame_att[0, :, 0, :]
                    frame_att_v = model.stage2.frame_att[0, :, 1, :]
                    # print(frame_att_a.shape)
                    # --------- end -----------

                else:
                    raise NotImplementedError('number of stages must be > 0')

                repeat_len = audio.size()[1]

                Pa = (Pa >= 0.5).astype(np.int_) * np.repeat(o_a, repeats=repeat_len, axis=0)
                Pv = (Pv >= 0.5).astype(np.int_) * np.repeat(o_v, repeats=repeat_len, axis=0)
            else:
                raise NotImplementedError('no need')

            # save result
            save_path = os.path.join(args.eval_output_path, model_name)
            if not os.path.exists(save_path):
                os.mkdir(save_path)
            save_name = name[0] + ".txt"
            file_tmp = os.path.join(save_path, save_name)
            # eval_result = open(file_tmp, 'w')

            # ************************** extract audio GT labels **************************
            GT_a = np.zeros((n_categories, repeat_len))
            GT_v = np.zeros((n_categories, repeat_len))

            df_vid_a = df_a.loc[df_a['filename'] == df_a_w.loc[batch_idx, :][0]]
            filenames = df_vid_a["filename"]
            events = df_vid_a["event_labels"]
            onsets = df_vid_a["onset"]
            offsets = df_vid_a["offset"]
            num = len(filenames)
            if num > 0:
                for i in range(num):
                    x1 = int(onsets[df_vid_a.index[i]])
                    x2 = int(offsets[df_vid_a.index[i]])
                    event = events[df_vid_a.index[i]]
                    idx = id_to_idx[event]
                    GT_a[idx, x1:x2 + 1] = 1

            # ************************** extract visual GT labels **************************
            df_vid_v = df_v.loc[df_v['filename'] == df_v_w.loc[batch_idx, :][0]]
            filenames = df_vid_v["filename"]
            events = df_vid_v["event_labels"]
            onsets = df_vid_v["onset"]
            offsets = df_vid_v["offset"]
            num = len(filenames)
            if num > 0:
                for i in range(num):
                    x1 = int(onsets[df_vid_v.index[i]])
                    x2 = int(offsets[df_vid_v.index[i]])
                    event = events[df_vid_v.index[i]]
                    idx = id_to_idx[event]
                    GT_v[idx, x1:x2 + 1] = 1
            # ************************** obtain audiovisual GT labels **************************
            GT_av = GT_a * GT_v

            gta_list.append(GT_a)  # (num_cls, t)
            gtv_list.append(GT_v)
            gtav_list.append(GT_av)

            # obtain prediction matrices
            SO_a = np.transpose(Pa)
            SO_v = np.transpose(Pv)
            SO_av = SO_a * SO_v

            # segment-level F1 scores
            f_a, f_v, f, f_av = segment_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av)
            F_seg_a.append(f_a)
            F_seg_v.append(f_v)
            F_seg.append(f)
            F_seg_av.append(f_av)

            # event-level F1 scores
            f_a, f_v, f, f_av = event_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av, repeat_len, 35)
            F_event_a.append(f_a)
            F_event_v.append(f_v)
            F_event.append(f)
            F_event_av.append(f_av)

    print("\n------------------------- Snippet-level mAP -------------------------")
    map_a = segment_map(pa_list, gta_list)
    print('snippet-level audio map: {:.2f}'.format(map_a * 100))

    map_v = segment_map(pv_list, gtv_list)
    print('snippet-level visual map: {:.2f}'.format(map_v * 100))

    map_av = segment_map(pav_list, gtav_list)
    print('snippet-level audio-visual map: {:.2f}'.format(map_av * 100))

    map_avg = (map_a + map_v + map_av) / 3
    print('snippet-level avg map: {:.2f}'.format(map_avg * 100))

    print("\n------------------------- Event-level F1 -------------------------")
    print('Audio Event Detection Event-level F1: {:.2f}'.format(100 * np.mean(np.array(F_event_a))))
    print('Visual Event Detection Event-level F1: {:.2f}'.format(100 * np.mean(np.array(F_event_v))))
    print('Audio-Visual Event Detection Event-level F1: {:.2f}'.format(100 * np.mean(np.array(F_event_av))))

    avg_type_event = (100 * np.mean(np.array(F_event_av)) + 100 * np.mean(np.array(F_event_a)) + 100 * np.mean(
        np.array(F_event_v))) / 3.
    print('Event-level Type@Avg. F1: {:.2f}'.format(avg_type_event))

    return (map_avg * 100 + avg_type_event) / 2
--------------------------------------------------------------------------------
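The GT-extraction loops above turn (onset, offset, event) rows into a binary class-by-time matrix, where an event is active over the closed interval [onset, offset]. A self-contained toy version (column names follow the loops above; the data values are made up):

import numpy as np
import pandas as pd

categories = ['speech', 'guitar', 'chainsaw']
id_to_idx = {c: i for i, c in enumerate(categories)}

# Hypothetical snippet-level rows for one video.
df = pd.DataFrame({'filename': ['vid0', 'vid0'],
                   'event_labels': ['speech', 'chainsaw'],
                   'onset': [0, 3],
                   'offset': [2, 5]})

T = 8  # number of 1-second snippets in the video
GT = np.zeros((len(categories), T))
for _, row in df.iterrows():
    GT[id_to_idx[row['event_labels']], int(row['onset']):int(row['offset']) + 1] = 1
print(GT)  # row 0 marks speech in snippets 0-2, row 2 marks chainsaw in snippets 3-5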
/src/tools/train.py:
--------------------------------------------------------------------------------
import time

import torch


def train(args, model, train_loader, optimizer_list, criterion, epoch, writer):
    if args.num_stages == 1:
        stg1_optimizer = optimizer_list[0]
    elif args.num_stages == 2:
        stg1_optimizer, stg2_optimizer = optimizer_list
    elif args.num_stages == 3:
        stg1_optimizer, stg2_optimizer, stg3_optimizer = optimizer_list
    else:
        raise NotImplementedError

    model.train()

    print("\n------------- train -------------")

    with torch.autograd.set_detect_anomaly(True):

        for batch_idx, sample in enumerate(train_loader):
            start = time.time()
            s1_av_loss = 0  # stage-1 video loss scalar
            s2_av_loss = 0  # stage-2 video loss scalar
            s2_event_loss = 0  # stage-2 event loss scalar
            s3_event_loss = 0
            all_batch_idx = batch_idx + len(train_loader) * (epoch - 1)  # for tensorboard

            if args.label_format == 'video':
                audio, video, video_st, target = sample['audio'].to('cuda'), sample['video_s'].to('cuda'), sample[
                    'video_st'].to('cuda'), sample['label']
                target_video = target[0].type(torch.FloatTensor).to('cuda')
                if args.real_av_labels:
                    target_a = target[1].type(torch.FloatTensor).to('cuda')
                    target_v = target[2].type(torch.FloatTensor).to('cuda')
                else:
                    # label smoothing (a=1.0 leaves audio targets unchanged; v=0.9 smooths visual targets)
                    a = 1.0
                    v = 0.9
                    target_a = a * target_video + (1 - a) * 0.5
                    target_v = v * target_video + (1 - v) * 0.5

            if args.num_stages >= 1:
                stg1_optimizer.zero_grad()
            if args.num_stages >= 2:
                stg2_optimizer.zero_grad()
            if args.num_stages == 3:
                stg3_optimizer.zero_grad()

            if args.label_format == 'video':
                if args.num_stages == 1:
                    a_prob, v_prob, _ = model(audio, video, video_st)
                elif args.num_stages == 2:
                    # if using just stage 2, a_event and v_event are not needed
                    a_prob, v_prob, _, ag_prob, vg_prob, _, a_event_prob_list, a_evnet_list, \
                        v_event_prob_list, v_evnet_list = model(audio, video, video_st)
                elif args.num_stages == 3:
                    a_prob, v_prob, _, ag_prob, vg_prob, _, a_event_prob_list, a_evnet_list, \
                        v_event_prob_list, v_evnet_list, a_event_prob_s3, v_event_prob_s3, \
                        a_prob_s3, v_prob_s3 = model(audio, video, video_st)
                else:
                    raise NotImplementedError('only 1, 2, or 3 stages are supported')
                forward_end = time.time()
                a_prob.clamp_(min=1e-7, max=1 - 1e-7)
                v_prob.clamp_(min=1e-7, max=1 - 1e-7)
                if args.num_stages >= 2:
                    ag_prob.clamp_(min=1e-7, max=1 - 1e-7)
                    vg_prob.clamp_(min=1e-7, max=1 - 1e-7)
                if args.num_stages >= 3:
                    a_prob_s3.clamp_(min=1e-7, max=1 - 1e-7)
                    v_prob_s3.clamp_(min=1e-7, max=1 - 1e-7)

                if args.num_stages >= 1:
                    stage1_loss = criterion(a_prob, target_a) + criterion(v_prob, target_v)
                    s1_av_loss = stage1_loss.item()
                    writer.add_scalar('stage1_loss', s1_av_loss, all_batch_idx)

                if args.num_stages >= 2:
                    if args.just_event_loss:
                        stage2_loss = None
                        s2_av_loss = 0
                        writer.add_scalar('stage2_av_loss', s2_av_loss, all_batch_idx)
                    else:
                        stage2_loss = criterion(ag_prob, target_a) + criterion(vg_prob, target_v)
                        s2_av_loss = stage2_loss.item()
                        writer.add_scalar('stage2_av_loss', s2_av_loss, all_batch_idx)

                    if args.stg2_evnet_loss_weight > 0:
                        if args.el_warm_up_epoch > 0:  # weight of event loss, soft warm-up
                            stg2_event_loss_weight = min(args.stg2_evnet_loss_weight, args.stg2_evnet_loss_weight * (epoch - 1) / args.el_warm_up_epoch)
                        elif args.el_warm_up_epoch == 0:
                            stg2_event_loss_weight = args.stg2_evnet_loss_weight
                        else:  # hard warm-up
                            if epoch <= abs(args.el_warm_up_epoch):
                                stg2_event_loss_weight = 0
                            else:
                                stg2_event_loss_weight = args.stg2_evnet_loss_weight

                        if len(a_evnet_list[0]) > 0:
                            s2_event_loss_a = stg2_event_loss_weight * event_loss(criterion, a_event_prob_list, a_evnet_list, target_a)
                            s2_event_loss += s2_event_loss_a.item()
                            if stage2_loss is None:
                                stage2_loss = s2_event_loss_a
                            else:
                                stage2_loss += s2_event_loss_a
                        if len(v_evnet_list[0]) > 0:
                            s2_event_loss_v = stg2_event_loss_weight * event_loss(criterion, v_event_prob_list, v_evnet_list, target_v)
                            s2_event_loss += s2_event_loss_v.item()
                            if stage2_loss is None:
                                stage2_loss = s2_event_loss_v
                            else:
                                stage2_loss += s2_event_loss_v
                        writer.add_scalar('stage2_event_loss', s2_event_loss, all_batch_idx)

                    # -------------------- stage 3 loss ------------------------------
                    if args.num_stages >= 3:  # stage 3 always uses the event loss (it has only the event and av losses)
                        if args.s3_el_warm_up_epoch > 0:  # weights of stage-3 losses, soft warm-up
                            s3_event_loss_weight = min(args.stg3_event_loss_weight, args.stg3_event_loss_weight * (epoch - 1) / args.s3_el_warm_up_epoch)
                            s3_av_loss_weight = min(args.stg3_av_loss_weight, args.stg3_av_loss_weight * (epoch - 1) / args.s3_el_warm_up_epoch)
                        elif args.s3_el_warm_up_epoch == 0:
                            s3_event_loss_weight = args.stg3_event_loss_weight
                            s3_av_loss_weight = args.stg3_av_loss_weight
                        else:  # hard warm-up
                            if epoch <= abs(args.s3_el_warm_up_epoch):
                                s3_event_loss_weight = 0.0
                                s3_av_loss_weight = 0.0
                            else:
                                s3_event_loss_weight = args.stg3_event_loss_weight
                                s3_av_loss_weight = args.stg3_av_loss_weight

                        stage3_loss = s3_av_loss_weight * (criterion(a_prob_s3, target_a) + criterion(v_prob_s3, target_v))
                        s3_av_loss = stage3_loss.item()
                        writer.add_scalar('stage3_av_loss', s3_av_loss, all_batch_idx)
                        if len(a_evnet_list[0]) > 0:  # event loss
                            s3_event_loss_a = s3_event_loss_weight * event_loss(criterion, a_event_prob_s3, a_evnet_list, target_a)
                            s3_event_loss += s3_event_loss_a.item()  # for tensorboard
                            stage3_loss += s3_event_loss_a

                        if len(v_evnet_list[0]) > 0:  # event loss
                            s3_event_loss_v = s3_event_loss_weight * event_loss(criterion, v_event_prob_s3, v_evnet_list, target_v)
                            s3_event_loss += s3_event_loss_v.item()
                            stage3_loss += s3_event_loss_v

                        loss = stage1_loss + stage2_loss + stage3_loss

                        writer.add_scalar('stage3_event_loss', s3_event_loss, all_batch_idx)
                        # -------------------- end stage 3 loss ------------------------------

                    else:
                        if stage2_loss is None:
                            loss = stage1_loss
                            stage2_loss = torch.tensor(0)
                        else:
                            loss = stage1_loss + stage2_loss
                else:
                    loss = stage1_loss

            else:
                raise NotImplementedError

            loss.backward()
            if args.num_stages >= 1:
                stg1_optimizer.step()
            if args.num_stages >= 2:
                stg2_optimizer.step()
            if args.num_stages == 3:
                stg3_optimizer.step()

            end = time.time()
            if batch_idx % args.log_interval == 0:
                if args.num_stages == 1:
                    print('Train Epoch: {} '
                          '[{}/{} ({:.0f}%)]\t'
                          'Stage1 Loss: {:.6f}\t'
                          'ForwardTime: {:.2f}\t'
                          'BackwardTime: {:.2f}'.format(epoch,
                                                        batch_idx * len(audio), len(train_loader.dataset),
                                                        100. * batch_idx / len(train_loader),
                                                        stage1_loss.item(),
                                                        forward_end - start,
                                                        end - forward_end))
                elif args.num_stages == 2:
                    print('Train Epoch: {} '
                          '[{}/{} ({:.0f}%)]\t'
                          'Stage1 Loss: {:.6f}\t'
                          'Stage2 Loss: {:.6f}\t'
                          'ForwardTime: {:.2f}\t'
                          'BackwardTime: {:.2f}'.format(epoch,
                                                        batch_idx * len(audio), len(train_loader.dataset),
                                                        100. * batch_idx / len(train_loader),
                                                        stage1_loss.item(),
                                                        stage2_loss.item(),
                                                        forward_end - start,
                                                        end - forward_end))
                elif args.num_stages == 3:
                    print('Train Epoch: {} '
                          '[{}/{} ({:.0f}%)]\t'
                          'Stage1 Loss: {:.6f}\t'
                          'Stage2 Loss: {:.6f}\t'
                          'Stage3 Loss: {:.6f}\t'
                          'ForwardTime: {:.2f}\t'
                          'BackwardTime: {:.2f}'.format(epoch,
                                                        batch_idx * len(audio), len(train_loader.dataset),
                                                        100. * batch_idx / len(train_loader),
                                                        stage1_loss.item(),
                                                        stage2_loss.item(),
                                                        stage3_loss.item(),
                                                        forward_end - start,
                                                        end - forward_end))
                else:
                    raise NotImplementedError


def event_loss(criterion, event_prob_list, event_list, target_m):
    # event_prob_list, event_list, m_prob
    event_prob = torch.stack(event_prob_list)
    event_prob.clamp_(min=1e-7, max=1 - 1e-7)

    if len(event_list) > 2:  # keep at most the first two events
        event_list = event_list[:2]
    target_list = target_m[event_list]
    return criterion(event_prob, target_list)
--------------------------------------------------------------------------------
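The sign convention of el_warm_up_epoch / s3_el_warm_up_epoch above is easy to miss: a positive value ramps the weight up linearly (soft warm-up), zero applies the full weight from the first epoch, and a negative value keeps the weight at zero until |warm_up_epoch| epochs have passed (hard warm-up). A standalone sketch of the same schedule:

def warm_up_weight(base_weight, epoch, warm_up_epoch):
    """Replicates the event-loss weight schedule used in train() (epochs start at 1)."""
    if warm_up_epoch > 0:  # soft warm-up: linear ramp up to base_weight
        return min(base_weight, base_weight * (epoch - 1) / warm_up_epoch)
    elif warm_up_epoch == 0:  # no warm-up
        return base_weight
    else:  # hard warm-up: off, then full weight
        return 0.0 if epoch <= abs(warm_up_epoch) else base_weight

# With --s3_el_warm_up_epoch -6 (as in the scripts above), the stage-3 losses are
# disabled for the first 6 epochs and applied at full weight from epoch 7 onwards.
assert warm_up_weight(0.3, 6, -6) == 0.0
assert warm_up_weight(0.3, 7, -6) == 0.3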
/src/utils/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/utils/.DS_Store
--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
from .set_seed import setup_seed
from .constant import CATEGORIES
--------------------------------------------------------------------------------
/src/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/utils/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/src/utils/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/utils/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/src/utils/__pycache__/constant.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/utils/__pycache__/constant.cpython-36.pyc
--------------------------------------------------------------------------------
/src/utils/__pycache__/constant.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/utils/__pycache__/constant.cpython-38.pyc
--------------------------------------------------------------------------------
/src/utils/__pycache__/eval_metrics.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/utils/__pycache__/eval_metrics.cpython-38.pyc
--------------------------------------------------------------------------------
/src/utils/__pycache__/set_seed.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/utils/__pycache__/set_seed.cpython-36.pyc
--------------------------------------------------------------------------------
/src/utils/__pycache__/set_seed.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GeWu-Lab/LFAV/7c4dc6cfa8e82b15cc93f4fec2b7414219ede7e4/src/utils/__pycache__/set_seed.cpython-38.pyc
--------------------------------------------------------------------------------
/src/utils/constant.py:
--------------------------------------------------------------------------------
CATEGORIES = ['accordion', 'alarm', 'banjo', 'bicycle', 'car', 'car_alarm', 'cat',
              'cello', 'chainsaw', 'cheering', 'chicken_rooster', 'clapping', 'cry', 'dance',
              'dog', 'drum', 'fixed-wing_aircraft', 'frisbee', 'guitar', 'helicopter', 'horse',
              'laughter', 'piano', 'playing_basketball', 'playing_badminton', 'playing_baseball',
              'playing_ping-pong', 'playing_tennis', 'playing_soccer', 'playing_volleyball',
              'rodents', 'shofar', 'singing', 'speech', 'violin']

CATEGORIES_NULL = ['accordion', 'alarm', 'banjo', 'bicycle', 'car', 'car_alarm', 'cat', 'cello', 'chainsaw',
                   'cheering', 'chicken_rooster', 'clapping', 'cry', 'dance', 'dog', 'drum',
                   'fixed-wing_aircraft', 'frisbee', 'guitar', 'helicopter', 'horse', 'laughter',
                   'piano', 'playing_basketball', 'playing_badminton', 'playing_baseball', 'playing_ping-pong',
                   'playing_tennis', 'playing_soccer', 'playing_volleyball', 'rodents', 'shofar',
                   'singing', 'speech', 'violin', 'null']
--------------------------------------------------------------------------------
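As a quick sanity check (assuming the package is importable as utils), this category list lines up with the hard-coded class counts elsewhere in the code: event_level() defaults to n_cls=35, segment_map() asserts num_cls == 35, and evaluation() asserts len(CATEGORIES) == args.num_classes.

from utils.constant import CATEGORIES, CATEGORIES_NULL

assert len(CATEGORIES) == 35
assert len(CATEGORIES_NULL) == 36          # CATEGORIES plus an extra 'null' class
print(CATEGORIES.index('speech'))          # 33: the row index used in the GT matrices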
/src/utils/eval_metrics.py:
--------------------------------------------------------------------------------
import numpy as np


def Precision(X_pre, X_gt):
    N = len(X_pre)
    p = 0.0
    for i in range(N):
        x = X_pre[i, :]
        y = X_gt[i, :]
        p += np.sum(x * y) / np.sum(x)
    return p / N


def Recall(X_pre, X_gt):
    N = len(X_pre)
    p = 0.0
    for i in range(N):
        x = X_pre[i, :]
        y = X_gt[i, :]
        p += np.sum(x * y) / np.sum(y)
    return p / N


def F1(X_pre, X_gt):
    N = len(X_pre)
    p = 0
    for i in range(N):
        x = X_pre[i, :]
        y = X_gt[i, :]
        p += 2 * np.sum(x * y) / (np.sum(x) + np.sum(y))
    return p / N


def event_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av, T, n_cls=35):

    N = n_cls
    event_p_a = [None for _ in range(n_cls)]
    event_gt_a = [None for _ in range(n_cls)]
    event_p_v = [None for _ in range(n_cls)]
    event_gt_v = [None for _ in range(n_cls)]
    event_p_av = [None for _ in range(n_cls)]
    event_gt_av = [None for _ in range(n_cls)]

    TP_a = np.zeros(n_cls)
    TP_v = np.zeros(n_cls)
    TP_av = np.zeros(n_cls)

    FP_a = np.zeros(n_cls)
    FP_v = np.zeros(n_cls)
    FP_av = np.zeros(n_cls)

    FN_a = np.zeros(n_cls)
    FN_v = np.zeros(n_cls)
    FN_av = np.zeros(n_cls)

    for n in range(N):
        seq_pred = SO_a[n, :]
        if np.sum(seq_pred) != 0:
            x = extract_event(seq_pred, n, T)
            event_p_a[n] = x
        seq_gt = GT_a[n, :]
        if np.sum(seq_gt) != 0:
            x = extract_event(seq_gt, n, T)
            event_gt_a[n] = x

        seq_pred = SO_v[n, :]
        if np.sum(seq_pred) != 0:
            x = extract_event(seq_pred, n, T)
            event_p_v[n] = x
        seq_gt = GT_v[n, :]
        if np.sum(seq_gt) != 0:
            x = extract_event(seq_gt, n, T)
            event_gt_v[n] = x

        seq_pred = SO_av[n, :]
        if np.sum(seq_pred) != 0:
            x = extract_event(seq_pred, n, T)
            event_p_av[n] = x

        seq_gt = GT_av[n, :]
        if np.sum(seq_gt) != 0:
            x = extract_event(seq_gt, n, T)
            event_gt_av[n] = x

        tp, fp, fn = event_wise_metric(event_p_a[n], event_gt_a[n])
        TP_a[n] += tp
        FP_a[n] += fp
        FN_a[n] += fn

        tp, fp, fn = event_wise_metric(event_p_v[n], event_gt_v[n])
        TP_v[n] += tp
        FP_v[n] += fp
        FN_v[n] += fn

        tp, fp, fn = event_wise_metric(event_p_av[n], event_gt_av[n])
        TP_av[n] += tp
        FP_av[n] += fp
        FN_av[n] += fn

    TP = TP_a + TP_v
    FN = FN_a + FN_v
    FP = FP_a + FP_v

    n = len(FP_a)
    F_a = []
    for ii in range(n):
        if (TP_a + FP_a)[ii] != 0 or (TP_a + FN_a)[ii] != 0:
            F_a.append(2 * TP_a[ii] / (2 * TP_a[ii] + (FN_a + FP_a)[ii]))

    F_v = []
    for ii in range(n):
        if (TP_v + FP_v)[ii] != 0 or (TP_v + FN_v)[ii] != 0:
            F_v.append(2 * TP_v[ii] / (2 * TP_v[ii] + (FN_v + FP_v)[ii]))

    F = []
    for ii in range(n):
        if (TP + FP)[ii] != 0 or (TP + FN)[ii] != 0:
            F.append(2 * TP[ii] / (2 * TP[ii] + (FN + FP)[ii]))

    F_av = []
    for ii in range(n):
        if (TP_av + FP_av)[ii] != 0 or (TP_av + FN_av)[ii] != 0:
            F_av.append(2 * TP_av[ii] / (2 * TP_av[ii] + (FN_av + FP_av)[ii]))

    if len(F_a) == 0:
        f_a = 1.0
    else:
        f_a = (sum(F_a) / len(F_a))

    if len(F_v) == 0:
        f_v = 1.0
    else:
        f_v = (sum(F_v) / len(F_v))

    if len(F) == 0:
        f = 1.0
    else:
        f = (sum(F) / len(F))
    if len(F_av) == 0:
        f_av = 1.0
    else:
        f_av = (sum(F_av) / len(F_av))

    return f_a, f_v, f, f_av


def segment_level(SO_a, SO_v, SO_av, GT_a, GT_v, GT_av):

    TP_a = np.sum(SO_a * GT_a, axis=1)
    FN_a = np.sum((1 - SO_a) * GT_a, axis=1)
    FP_a = np.sum(SO_a * (1 - GT_a), axis=1)

    n = len(FP_a)
    F_a = []
    for ii in range(n):
        if (TP_a + FP_a)[ii] != 0 or (TP_a + FN_a)[ii] != 0:
            F_a.append(2 * TP_a[ii] / (2 * TP_a[ii] + (FN_a + FP_a)[ii]))

    TP_v = np.sum(SO_v * GT_v, axis=1)
    FN_v = np.sum((1 - SO_v) * GT_v, axis=1)
    FP_v = np.sum(SO_v * (1 - GT_v), axis=1)
    F_v = []
    for ii in range(n):
        if (TP_v + FP_v)[ii] != 0 or (TP_v + FN_v)[ii] != 0:
            F_v.append(2 * TP_v[ii] / (2 * TP_v[ii] + (FN_v + FP_v)[ii]))

    TP = TP_a + TP_v
    FN = FN_a + FN_v
    FP = FP_a + FP_v

    n = len(FP)

    F = []
    for ii in range(n):
        if (TP + FP)[ii] != 0 or (TP + FN)[ii] != 0:
            F.append(2 * TP[ii] / (2 * TP[ii] + (FN + FP)[ii]))

    TP_av = np.sum(SO_av * GT_av, axis=1)
    FN_av = np.sum((1 - SO_av) * GT_av, axis=1)
    FP_av = np.sum(SO_av * (1 - GT_av), axis=1)
    n = len(FP_av)
    F_av = []
    for ii in range(n):
        if (TP_av + FP_av)[ii] != 0 or (TP_av + FN_av)[ii] != 0:
            F_av.append(2 * TP_av[ii] / (2 * TP_av[ii] + (FN_av + FP_av)[ii]))

    if len(F_a) == 0:
        f_a = 1.0
    else:
        f_a = (sum(F_a) / len(F_a))

    if len(F_v) == 0:
        f_v = 1.0
    else:
        f_v = (sum(F_v) / len(F_v))

    if len(F) == 0:
        f = 1.0
    else:
        f = (sum(F) / len(F))
    if len(F_av) == 0:
        f_av = 1.0
    else:
        f_av = (sum(F_av) / len(F_av))

    return f_a, f_v, f, f_av


def to_vec(start, end, t):
    x = np.zeros(t)
    for i in range(start, end):
        x[i] = 1
    return x


def extract_event(seq, n, T):
    # extract each contiguous run of 1s in seq as an individual event vector
    x = []
    i = 0
    while i < T:
        if seq[i] == 1:
            start = i
            if i + 1 == T:
                i = i + 1
                end = i
                x.append(to_vec(start, end, T))
                break

            for j in range(i + 1, T):
                if seq[j] != 1:
                    i = j + 1
                    end = j
                    x.append(to_vec(start, end, T))
                    break
                else:
                    i = j + 1
                    if i == T:
                        end = i
                        x.append(to_vec(start, end, T))
                        break
        else:
            i += 1
    return x


def event_wise_metric(event_p, event_gt):
    # match predicted and ground-truth events; a pair matches when the
    # temporal IoU (intersection / union of the two runs) is at least 0.5
    TP = 0
    FP = 0
    FN = 0

    if event_p is not None:
        num_event = len(event_p)
        for i in range(num_event):
            x1 = event_p[i]
            if event_gt is not None:
                nn = len(event_gt)
                flag = True
                for j in range(nn):
                    x2 = event_gt[j]
                    if np.sum(x1 * x2) >= 0.5 * np.sum(x1 + x2 - x1 * x2):
                        TP += 1
                        flag = False
                        break
                if flag:
                    FP += 1
            else:
                FP += 1

    if event_gt is not None:
        num_event = len(event_gt)
        for i in range(num_event):
            x1 = event_gt[i]
            if event_p is not None:
                nn = len(event_p)
                flag = True
                for j in range(nn):
                    x2 = event_p[j]
                    if np.sum(x1 * x2) >= 0.5 * np.sum(x1 + x2 - x1 * x2):
                        flag = False
                        break
                if flag:
                    FN += 1
            else:
                FN += 1
    return TP, FP, FN


def segment_map(prob_list, gt_list):

    num_cls = prob_list[0].shape[0]
    assert num_cls == 35

    map = []
    for i in range(num_cls):
        prec = []
        rec = []
        TP = 0
        FP = 0
        prob = np.concatenate([prob_list[ii][i] for ii in range(len(prob_list))])
        gt = np.concatenate([gt_list[ii][i] for ii in range(len(gt_list))])
        gt_num = np.sum(gt)
        conf_index = np.argsort(prob)
        conf_index = conf_index[::-1]
        for index in conf_index:
            if gt[index] == 1:
                TP += 1
            else:
                FP += 1
            prec.append(TP / (TP + FP))
            rec.append(TP / gt_num)

        prec = np.array(prec)
        rec = np.array(rec)
        ap = interpolated_prec_rec(prec, rec)
        map.append(ap)
    return sum(map) / len(map)


def interpolated_prec_rec(prec, rec):
    """Interpolated AP - VOCdevkit from VOC 2011."""
    mprec = np.hstack([[0], prec, [0]])
    mrec = np.hstack([[0], rec, [1]])
    for i in range(len(mprec) - 1)[::-1]:
        mprec[i] = max(mprec[i], mprec[i + 1])
    idx = np.where(mrec[1::] != mrec[0:-1])[0] + 1
    ap = np.sum((mrec[idx] - mrec[idx - 1]) * mprec[idx])
    return ap
--------------------------------------------------------------------------------
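A small, self-contained check of the event-level matching above (assuming the module is importable as utils.eval_metrics): two predicted runs against one ground-truth run, where a predicted event counts as a true positive only if its temporal IoU with some ground-truth event is at least 0.5.

import numpy as np
from utils.eval_metrics import extract_event, event_wise_metric

T = 10
pred = np.array([1, 1, 1, 0, 0, 0, 0, 1, 0, 0])  # two runs: snippets 0-2 and snippet 7
gt = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0])    # one run: snippets 0-3

events_p = extract_event(pred, 0, T)  # two binary event vectors
events_g = extract_event(gt, 0, T)    # one binary event vector
tp, fp, fn = event_wise_metric(events_p, events_g)
# run 0-2 vs run 0-3: IoU = 3/4 >= 0.5 -> TP; run at snippet 7 matches nothing -> FP
print(tp, fp, fn)  # 1 1 0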
/src/utils/set_seed.py:
--------------------------------------------------------------------------------
import torch
import random


def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    random.seed(seed)
--------------------------------------------------------------------------------
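A typical call site is a sketch like the following; note that setup_seed covers torch, CUDA, and Python's random, but not numpy, so numpy-based randomness would need its own seed as well.

import numpy as np
from utils.set_seed import setup_seed

setup_seed(42)      # make torch/CUDA/random deterministic before building dataloaders
np.random.seed(42)  # not covered by setup_seed; add if numpy randomness is used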