├── nets ├── __init__.py ├── multi_attention_encoder_decoder.py ├── sliding_chunks.py ├── ours_MWAFM_Net.py ├── multi_attention.py └── diagonaled_mm_tvm.py ├── scripts ├── word_numbers.py ├── split_dataset.py └── data_clean.py ├── utils.py ├── readme.md ├── conifg.py ├── data_generator.py ├── metadata ├── output_classes_clean.json ├── output_classes.json ├── metadata_orig │ └── output_classes.json ├── wordst.txt ├── single_word_val_clean.csv └── single_word_test_clean.csv └── main_MWAFM.py /nets/__init__.py: -------------------------------------------------------------------------------- 1 | from nets.ours_MWAFM_Net import MWAFM_Net, MultiAttnLayer 2 | -------------------------------------------------------------------------------- /scripts/word_numbers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import pandas as pd 4 | # Read the metadata CSV (latin1) and return its 'file_name', 'QuestionText' and 'answer' columns as three lists. 5 | def read_metadata(csv_file_path): 6 | csv_data = pd.read_csv(csv_file_path, encoding='latin1') 7 | audio_fnames = list(csv_data['file_name']) 8 | questions = list(csv_data['QuestionText']) 9 | answers = list(csv_data['answer']) 10 | return audio_fnames, questions, answers 11 | # Print the distinct space-separated question tokens (after removing ',' and '?') followed by how many there are. 12 | def StatWorsNums(csv_file): 13 | 14 | words = [] 15 | 16 | wave_name, question, answer = read_metadata(csv_file) 17 | # print(question) 18 | 19 | for index in range(len(question)): 20 | qst = question[index] 21 | qst = qst.replace(",", "").replace("?", "") 22 | qst = qst.split(" ") 23 | 24 | for wd in qst: 25 | if wd not in words: 26 | words.append(wd) 27 | 28 | print(words) 29 | print(len(words)) 30 | 31 | 32 | 33 | 34 | if __name__ == "__main__": 35 | 36 | csv_file = "../metadata/single_word_train.csv" 37 | StatWorsNums(csv_file) -------------------------------------------------------------------------------- /scripts/split_dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | 5 | def 
split_data_into_binary_and_single_word_answers(csv_file, split): 6 | csv_data = pd.read_csv(csv_file, encoding='latin1', usecols=['file_name', 'QuestionText', 'answer' ]) 7 | 8 | csv_data['answer'] = csv_data['answer'].str.upper() 9 | 10 | select = ['YES', 'NO'] 11 | data_single_word = csv_data[~csv_data['answer'].isin(select)] 12 | data_binary = csv_data[csv_data['answer'].isin(select)] 13 | 14 | with open('metadata/single_word_{}.csv'.format(split), 'wb') as f: 15 | data_single_word.to_csv(f, index=False) 16 | 17 | with open('metadata/binary_{}.csv'.format(split), 'wb') as f: 18 | data_binary.to_csv(f, index=False) 19 | 20 | 21 | csv_file_train = 'metadata/clotho_aqa_train.csv' 22 | csv_file_val = 'metadata/clotho_aqa_val.csv' 23 | csv_file_test = 'metadata/clotho_aqa_test.csv' 24 | 25 | split_data_into_binary_and_single_word_answers(csv_file_train, 'train') 26 | split_data_into_binary_and_single_word_answers(csv_file_val, 'val') 27 | split_data_into_binary_and_single_word_answers(csv_file_test, 'test') 28 | 29 | # create word-index for single word answers 30 | csv_file = 'metadata/single_word_train.csv' 31 | csv_data = pd.read_csv(csv_file, encoding='latin1', usecols=['file_name', 'QuestionText', 'answer']) 32 | 33 | csv_data['answer'] = csv_data['answer'].str.upper() 34 | 35 | answers_set = set(list(csv_data['answer'])) 36 | answers_dict = dict(zip(answers_set, range(len(answers_set)))) 37 | 38 | with open("metadata/output_classes.json", "w") as outfile: 39 | json.dump(answers_dict, outfile) 40 | -------------------------------------------------------------------------------- /scripts/data_clean.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import pandas as pd 4 | import json 5 | 6 | 7 | 8 | def read_metadata(csv_file_path): 9 | csv_data = pd.read_csv(csv_file_path, encoding='latin1') 10 | audio_fnames = list(csv_data['file_name']) 11 | questions = list(csv_data['QuestionText']) 12 | 
answers = list(csv_data['answer']) 13 | return audio_fnames, questions, answers 14 | 15 | def QAClean(csv_file): 16 | 17 | wave_name, question, answer = read_metadata(csv_file) 18 | 19 | cnt = 0 20 | for index in range(0,len(question),3): 21 | 22 | answer_tmp = [] 23 | # for i in range(3) 24 | answer_tmp.append(answer[index]) 25 | 26 | if answer[index+1] in answer_tmp: 27 | answer_tmp.append(answer[index+1]) 28 | print(wave_name[index], ', "', question[index], '", ', answer[index+1]) 29 | cnt += 1 30 | else: 31 | answer_tmp.append(answer[index+1]) 32 | if answer[index+2] in answer_tmp: 33 | answer_tmp.append(answer[index+2]) 34 | print(wave_name[index], ',"', question[index], '",', answer[index+2]) 35 | cnt += 1 36 | print("cnt: ", cnt) 37 | 38 | 39 | def AnswerGen(csv_file): 40 | 41 | csv_data = pd.read_csv(csv_file, encoding='latin1', usecols=['file_name', 'QuestionText', 'answer']) 42 | 43 | csv_data['answer'] = csv_data['answer'].str.upper() 44 | 45 | answers_set = set(list(csv_data['answer'])) 46 | answers_dict = dict(zip(answers_set, range(len(answers_set)))) 47 | 48 | with open("../metadata/output_classes_clean.json", "w") as outfile: 49 | json.dump(answers_dict, outfile) 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | 55 | csv_file = "../metadata/binary_test.csv" 56 | QAClean(csv_file) 57 | # AnswerGen(csv_file) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import io 3 | import numpy as np 4 | import json 5 | 6 | 7 | def load_vectors(embedding_file): 8 | fin = io.open(embedding_file, 'r', encoding='utf-8', newline='\n', errors='ignore') 9 | data = {} 10 | for line in fin: 11 | tokens = line.rstrip().split(' ') 12 | data[tokens[0]] = list(map(float, tokens[1:])) 13 | return data 14 | 15 | 16 | def read_metadata(csv_file_path): 17 | csv_data = pd.read_csv(csv_file_path, encoding='latin1') 18 | 
audio_fnames = list(csv_data['file_name']) 19 | questions = list(csv_data['QuestionText']) 20 | answers = list(csv_data['answer']) 21 | return audio_fnames, questions, answers 22 | 23 | 24 | def binary_classification_accuracy(pred, ground_truth): 25 | n_samples = pred.shape[0] 26 | # x = pred - ground_truth 27 | x = pred - ground_truth.reshape(n_samples, 1) 28 | n_wrong_predictions = np.count_nonzero(x) 29 | accuracy = (n_samples - n_wrong_predictions) / n_samples 30 | return accuracy 31 | 32 | 33 | def multiclass_classification_accuracy(logits, ground_truth, k=1): # b x 830 bx1 34 | n_samples = logits.shape[0] 35 | if k == 1: 36 | prediction = np.argmax(logits, axis=1) 37 | x = prediction - ground_truth 38 | n_wrong_predictions = np.count_nonzero(x) 39 | accuracy = (n_samples - n_wrong_predictions) / n_samples 40 | return accuracy 41 | else: 42 | max_idx = np.argsort(-1*logits, 1)[:, :k] # np.argsort() 返回数组值从小到大的索引值 43 | # x = max_idx - ground_truth 44 | x = max_idx - ground_truth.reshape(n_samples, 1) 45 | n_correct_predictions = np.count_nonzero(x == 0) 46 | top_k_accuracy = n_correct_predictions/n_samples 47 | return top_k_accuracy 48 | 49 | 50 | 51 | def load_answers_dict(answers_dict_file): 52 | f = open(answers_dict_file) 53 | answers_dict = json.load(f) 54 | return answers_dict 55 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Audio Question Answering (AQA) 4 | 5 | PyTorch code accompanies our Interspeech 2023 paper: 6 | 7 | **Multi-Scale Attention for Audio Question Answering** \[[arXiv](https://arxiv.org/abs/2305.17993)\] 8 | 9 | [Guangyao Li](https://ayameyao.github.io/), Yixin Xu and [Di Hu](https://dtaoo.github.io/index.html) 10 | 11 | --- 12 | 13 | ## Requirements 14 | 15 | ```python 16 | python3.6 + 17 | pytorch1.6.0 18 | tensorboardX 19 | ffmpeg 20 | ``` 21 | 22 | ## Usage 23 | 24 | 1. 
**Clone this repo** 25 | 26 | ```python 27 | git clone https://github.com/GeWu-Lab/MWAFM.git 28 | ``` 29 | 30 | 2. **Download data** 31 | 32 | [Clotho-AQA](https://zenodo.org/record/6473207) and [AQA-MUSIC-AVQA](https://gewu-lab.github.io/MUSIC-AVQA/) 33 | 34 | 3. **Data pre-processing** 35 | 36 | We follow exactly the same data format as [MUSIC AVQA](https://gewu-lab.github.io/MUSIC-AVQA/). 37 | 38 | **Notice:** We examined the original annotation files of Clotho-AQA and found that the official open-source annotations were not cleansed, resulting in discrepancies where different annotators provided different answers for the same question. As a result, we performed a simple filtering process where we considered a question to have the correct answer if it had at least two identical answers. Based on this filtering process, we obtained a new and more accurate annotation file. The files in the 'metadata' folder are described as follows: 39 | 40 | - 'single_word\_[train/val/test].csv', Does not contain samples with answers *yes* and *no*. 41 | - 'single_word\_[train/val/test]\_clean.csv', Does not contain samples with answers *yes* and *no*. (Cleaned data) 42 | - 'clotho_aqa\_[train/val/test]\_clean.csv', Contains samples with answers *yes* and *no*. (Cleaned data) 43 | - 'binary\_[train/val/test]\_clean.csv', Includes only samples with answers *yes* and *no*. (Cleaned data) 44 | 45 | 46 | 47 | 4. **Train and evaluate** 48 | 49 | Training 50 | 51 | ```python 52 | python main_MWAFM.py --mode train 53 | ``` 54 | 55 | Testing 56 | 57 | ```python 58 | python main_MWAFM.py --mode test 59 | ``` 60 | 61 | 62 | ## Citation 63 | 64 | If you find this work useful, please consider citing it. 65 | 66 | 

67 | @ARTICLE{Li2023MultiScale,
68 |   title	= {Multi-Scale Attention for Audio Question Answering},
69 |   author	= {Guangyao Li and Yixin Xu and Di Hu},
70 |   journal	= {Proc. INTERSPEECH},
71 |   year	= {2023},
72 | }
73 | 
74 | 75 | 76 | 77 | ## Acknowledgement 78 | 79 | This research was supported by Public Computing Cloud, Renmin University of China. 80 | 81 | -------------------------------------------------------------------------------- /conifg.py: -------------------------------------------------------------------------------- 1 | data_config = { 2 | 3 | # 'train_metadata_path': 'metadata/clotho_aqa_train_clean.csv', # CSV containing audio URLs, Questions, Answers,filenames 4 | # 'val_metadata_path': 'metadata/clotho_aqa_val_clean.csv', 5 | # 'test_metadata_path': 'metadata/clotho_aqa_test_clean.csv', 6 | # 'output_classes_file': 'metadata/output_classes_clean.json', 7 | # Active split. The substring 'binary' in 'train_metadata_path' is what later switches the model to a single output unit. 8 | 'train_metadata_path': 'metadata/single_word_train.csv', # CSV containing audio URLs, Questions, Answers,filenames 9 | 'val_metadata_path': 'metadata/single_word_val.csv', 10 | 'test_metadata_path': 'metadata/single_word_test.csv', 11 | 'output_classes_file': 'metadata/output_classes.json', 12 | 13 | # 'data_dir': '/home/data/clotho-aqa/audio_16kHz', # path to store downloaded data 14 | 'feat_dir': '/home/data/clotho-aqa/vggish', 15 | # 'feat_ast_dir': '/home/data/clotho-aqa/feats/ast', 16 | # 'feat_dir': '/home/data/clotho-aqa/audio_spec', 17 | 'question_dir': './metadata/questions.csv', 18 | 'pre_trained_word_embeddings_file': './pretrained/wiki-news-300d-1M.vec', 19 | 'audio_embedding_size': 512, 20 | 21 | # fixed sequence lengths: data_generator.py interpolates audio features to 'audio_length' steps and pads/truncates questions to 'quest_length' words 22 | 'audio_length': 24, 23 | 'quest_length': 22, 24 | } 25 | 26 | model_config = { 27 | 28 | 'learning_rate': 1e-4, 29 | 'batch_size': 64, 30 | 'num_workers': 12, 31 | 'num_epochs': 50, 32 | 'log_interval': 10, 33 | 34 | 35 | # audio network 36 | 'audio_input_size': data_config['audio_embedding_size'], 37 | 'audio_lstm_n_layers': 2, 38 | 'audio_lstm_hidden_size': 128, 39 | 'audio_bidirectional': True, 40 | 'audio_lstm_dropout': 0.2, 41 | 42 | 43 | # NLP network 44 | 'text_input_size': 300, # pretrained embedding size from fasttext 45 | 46 | 
'text_lstm_hidden_size': 128, 47 | 'text_bidirectional': True, 48 | 'text_lstm_dropout': 0.2, 49 | 50 | # classification 51 | 'n_dense1_units': 256, 52 | 'n_dense2_units': 128, 53 | } 54 | 55 | 56 | # Derived settings: a single output unit for the yes/no task, otherwise a multi-class head with wider (512) LSTMs. 57 | if 'binary' in data_config['train_metadata_path']: 58 | model_config['n_classes'] = 1 59 | else: 60 | model_config['n_classes'] = 828 # Notice! output dim! NOTE(review): hard-coded; presumably must equal the number of entries in data_config['output_classes_file'] — verify when switching metadata files 61 | model_config['audio_lstm_hidden_size'] = 512 62 | model_config['text_lstm_hidden_size'] = 512 63 | # dense1_input = audio hidden size + text hidden size, each doubled when the corresponding LSTM is bidirectional. 64 | dense1_input = 0 65 | if model_config['audio_bidirectional']: 66 | dense1_input = dense1_input + 2 * model_config['audio_lstm_hidden_size'] 67 | else: 68 | dense1_input = dense1_input + model_config['audio_lstm_hidden_size'] 69 | 70 | if model_config['text_bidirectional']: 71 | dense1_input = dense1_input + 2 * model_config['text_lstm_hidden_size'] 72 | else: 73 | dense1_input = dense1_input + model_config['text_lstm_hidden_size'] 74 | 75 | model_config['dense1_input'] = dense1_input 76 | -------------------------------------------------------------------------------- /nets/multi_attention_encoder_decoder.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Dict 2 | from torch import nn, Tensor 3 | from nets.multi_attention import MultiScaleSelfAttention 4 | # from transformers.modeling_bart import BartConfig, BartForConditionalGeneration 5 | # from transformers import BartConfig, BartForConditionalGeneration 6 | # NOTE(review): both Bart imports above are commented out, yet BartForConditionalGeneration and BartConfig are used as base classes below, so importing this module raises NameError as written. 7 | 8 | class LongformerEncoderDecoderForConditionalGeneration(BartForConditionalGeneration): 9 | def __init__(self, config): 10 | super().__init__(config) 11 | if config.attention_mode == 'n2': 12 | pass # do nothing, use BertSelfAttention instead 13 | else: 14 | for i, layer in enumerate(self.model.encoder.layers): 15 | layer.self_attn = MultiScaleSelfAttentionForBart(config, layer_id=i) 16 | 17 | 18 | class LongformerEncoderDecoderConfig(BartConfig): 19 | def __init__(self, attention_window: List[int] = None, 
attention_dilation: List[int] = None, 20 | autoregressive: bool = False, attention_mode: str = 'sliding_chunks', 21 | gradient_checkpointing: bool = False, **kwargs): 22 | """ 23 | Args: 24 | attention_window: list of attention window sizes of length = number of layers. 25 | window size = number of attention locations on each side. 26 | For an effective window size of 512, use `attention_window=[256]*num_layers` 27 | which is 256 on each side. 28 | attention_dilation: list of attention dilation of length = number of layers. 29 | attention dilation of `1` means no dilation. 30 | autoregressive: do autoregressive attention or have attention of both sides 31 | attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for TVM implementation of Longformer 32 | selfattention, 'sliding_chunks' for another implementation of Longformer selfattention 33 | """ 34 | super().__init__(**kwargs) 35 | self.attention_window = attention_window 36 | self.attention_dilation = attention_dilation 37 | self.autoregressive = autoregressive 38 | self.attention_mode = attention_mode 39 | self.gradient_checkpointing = gradient_checkpointing 40 | assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2'] 41 | 42 | # Wraps MultiScaleSelfAttention plus a linear output projection behind the (query, key, key_padding_mask, ...) call signature used for the encoder layers' self_attn. 43 | class MultiScaleSelfAttentionForBart(nn.Module): 44 | def __init__(self, config, layer_id): 45 | super().__init__() 46 | self.embed_dim = config.d_model 47 | self.multi_scale_self_attn = MultiScaleSelfAttention(config, layer_id=layer_id) 48 | self.output = nn.Linear(self.embed_dim, self.embed_dim) 49 | 50 | def forward( 51 | self, 52 | query, 53 | key: Optional[Tensor], 54 | key_padding_mask: Optional[Tensor] = None, 55 | layer_state: Optional[Dict[str, Optional[Tensor]]] = None, 56 | attn_mask: Optional[Tensor] = None, 57 | need_weights=False, 58 | output_attentions=False, 59 | ) -> Tuple[Tensor, Optional[Tensor]]: 60 | # query is unpacked as (tgt_len, bsz, embed_dim); attn_mask must be None. 61 | tgt_len, bsz, embed_dim = query.size() 62 | assert embed_dim == self.embed_dim 63 | assert list(query.size()) == [tgt_len, bsz, embed_dim] 64 | assert 
attn_mask is None 65 | # NOTE(review): key_padding_mask defaults to None but is dereferenced unconditionally below, so callers must always pass a mask. 66 | outputs = self.multi_scale_self_attn( 67 | query.transpose(0, 1), # MultiScaleSelfAttention expects (bsz, seqlen, embd_dim) 68 | attention_mask=key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1, 69 | head_mask=None, 70 | encoder_hidden_states=None, 71 | encoder_attention_mask=None, 72 | output_attentions=output_attentions, 73 | ) 74 | # project the attention output after transposing it back to (seqlen, bsz, embed_dim) 75 | attn_output = self.output(outputs[0].transpose(0, 1)) 76 | 77 | return (attn_output,) + outputs[1:] if len(outputs) == 2 else (attn_output, None) 78 | -------------------------------------------------------------------------------- /data_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.data.dataset import Dataset 4 | from utils import load_vectors, read_metadata, load_answers_dict 5 | import numpy as np 6 | 7 | import torch.nn.functional as F 8 | from conifg import data_config, model_config 9 | 10 | 11 | class DataGenerator(Dataset): 12 | # Dataset over one metadata CSV; each item is (audio_feat, question_embedding, label). 13 | def __init__(self, meta_file): 14 | super(DataGenerator, self).__init__() 15 | 16 | self.meta_file = meta_file 17 | 18 | self.feat_dir = data_config['feat_dir'] 19 | # self.feat_ast_dir = data_config['feat_ast_dir'] 20 | 21 | self.audio_fnames, self.qs, self.ans = read_metadata(self.meta_file) 22 | 23 | self.batch_size = model_config['batch_size'] 24 | self.audio_length = data_config['audio_length'] 25 | self.qust_max_len = data_config['quest_length'] 26 | 27 | self.word_embedding_path = data_config['pre_trained_word_embeddings_file'] 28 | self.word_embeddings = load_vectors(self.word_embedding_path) # dict of all the {'word': [vector]} pairs 29 | self.answers_dict = load_answers_dict(data_config['output_classes_file']) 30 | 31 | def __getitem__(self, item): 32 | 33 | audio_feat = self.load_audio_features(item) 34 | # audio_name is only referenced by the commented-out AST feature load below. 35 | audio_name = self.audio_fnames[item][:-3] + 'npy' 36 | # audio_ast_feat = np.load(os.path.join(self.feat_ast_dir, audio_name)) 37 | 38 | 
question_text = self.qs[item] 39 | answer_text = self.ans[item] 40 | question_embedding = self.get_word_embeddings(question_text) 41 | 42 | if 'binary' in self.meta_file: 43 | if answer_text == 'YES': 44 | label = 0 45 | else: 46 | label = 1 47 | else: 48 | label = self.answers_dict[answer_text] 49 | 50 | # return audio_feat, audio_ast_feat, question_embedding, label 51 | return audio_feat, question_embedding, label 52 | 53 | def load_audio_features(self, idx): 54 | # audio_feat_file = self.audio_fnames[idx][:-3] + 'npz' 55 | audio_feat_file = self.audio_fnames[idx][:-3] + 'npy' 56 | data = np.load(os.path.join(self.feat_dir, audio_feat_file)) 57 | # return data['embedding'] 58 | 59 | ## ------------------------------------------------------------------------------- 60 | ## ensure audio length equal 61 | if self.batch_size != 1: 62 | data1 = torch.from_numpy(data) 63 | data2 = data1.unsqueeze(0).permute(0, 2, 1).contiguous() 64 | data3 = F.interpolate(data2, size=self.audio_length, mode='linear', align_corners=False) 65 | data4 = data3.permute(0, 2, 1).contiguous() 66 | data5 = data4.squeeze() 67 | data = data5.numpy() 68 | ## ------------------------------------------------------------------------------- 69 | 70 | return data 71 | 72 | def get_word_embeddings(self, input_text): 73 | 74 | words = input_text.split(' ') 75 | words[-1] = words[-1][:-1] # removing '?' 
from the question, repetitive in all the Qs, so adds no value 76 | # Pad with the literal token "0" up to qust_max_len words, or truncate to qust_max_len words. 77 | ## ------------------------------------------------------------------------------- 78 | if len(words) < self.qust_max_len: 79 | dn = self.qust_max_len - len(words) 80 | for index in range(dn): 81 | words.append("0") 82 | else: 83 | words = words[0:self.qust_max_len] 84 | ## ------------------------------------------------------------------------------- 85 | # Words missing from the embedding table are skipped, so fewer than qust_max_len vectors may remain. 86 | text_embedding = [] 87 | for word in words: 88 | # word = word.split(",")[0] 89 | try: 90 | embedding = self.word_embeddings[word] 91 | except KeyError: 92 | continue 93 | text_embedding.append(embedding) 94 | 95 | text_embedding = np.array(text_embedding) 96 | # NOTE(review): if no word was found, text_embedding is empty here and the interpolation below will presumably fail — confirm this cannot happen. 97 | ## ------------------------------------------------------------------------------- 98 | # if text_embedding.shape[0] < self.qust_max_len: 99 | # ddn = self.qust_max_len - text_embedding.shape[0] 100 | # pad_value = np.repeat(text_embedding[-1], ddn) 101 | # text_embedding = np.append(text_embedding, pad_value) 102 | # text_embedding = text_embedding.reshape(self.qust_max_len, -1) 103 | ## ------------------------------------------------------------------------------- 104 | # Linearly interpolate along the word axis so the result always has qust_max_len rows. 105 | ## ------------------------------------------------------------------------------- 106 | text_embedding1 = torch.from_numpy(text_embedding) 107 | text_embedding2 = text_embedding1.unsqueeze(0).permute(0, 2, 1).contiguous() 108 | text_embedding3 = F.interpolate(text_embedding2, size=self.qust_max_len, mode='linear', align_corners=False) 109 | text_embedding4 = text_embedding3.permute(0, 2, 1).contiguous() 110 | text_embedding5 = text_embedding4.squeeze() 111 | text_embedding = text_embedding5.numpy() 112 | ## ------------------------------------------------------------------------------- 113 | 114 | return text_embedding 115 | 116 | def __len__(self): 117 | return len(self.qs) 118 | -------------------------------------------------------------------------------- /metadata/output_classes_clean.json: 
-------------------------------------------------------------------------------- 1 | {"RAIN": 0, "CROWD": 1, "GRINDING": 2, "SHOES": 3, "CRINKLING": 4, "CLASSICAL": 5, "HOWLING": 6, "SQUEAKING": 7, "BASKETBALL": 8, "SCRAPING": 9, "BLOWING": 10, "PAPER": 11, "PLAYGROUND": 12, "BOWL": 13, "STYROFOAM": 14, "CHILD": 15, "CUTTER": 16, "RATCHET": 17, "TABLE": 18, "PARK": 19, "MOTOR": 20, "TRACKS": 21, "BOILING": 22, "DOWN": 23, "OUTSIDE": 24, "CLAP": 25, "ELECTRICITY": 26, "BALLOON": 27, "MICROWAVE": 28, "GRINDER": 29, "TRUMPET": 30, "SQUEAK": 31, "AUDIENCE": 32, "SLAMMING": 33, "BEACH": 34, "GOOSE": 35, "CHEER": 36, "PEOPLE": 37, "HONKING": 38, "LONG": 39, "KETTLE": 40, "POPCORN": 41, "OFFICE": 42, "PIPE": 43, "RINGING": 44, "TOY": 45, "CHICKEN": 46, "NIGHT": 47, "HARD": 48, "HAMMER": 49, "NEVER": 50, "CONSTRUCTION": 51, "DOG": 52, "CLOSE": 53, "MARKET": 54, "STOP": 55, "LOCK": 56, "HEN": 57, "STORMY": 58, "ELECTRONIC": 59, "SEVEN": 60, "SPLASH": 61, "OCEAN": 62, "THIRTEEN": 63, "BATHING": 64, "PEBBLES": 65, "DRYER": 66, "VERY": 67, "SEVENTEEN": 68, "WORKING": 69, "SEAGULLS": 70, "VEHICLE": 71, "AWAY": 72, "TWENTYTWO": 73, "POLICE": 74, "PULLED": 75, "BOOK": 76, "CAT": 77, "LIGHT": 78, "ICE": 79, "BEES": 80, "BAND": 81, "HORROR": 82, "BOTTLES": 83, "FAR": 84, "FISH": 85, "PIANO": 86, "CHANGING": 87, "SHAKING": 88, "WHITE": 89, "READING": 90, "LAST": 91, "BELLS": 92, "FOREST": 93, "COUGHING": 94, "BOARD": 95, "BAG": 96, "CHEERING": 97, "ACCELERATES": 98, "BANG": 99, "SIXTY": 100, "EGGS": 101, "MAT": 102, "HORN": 103, "MIDDLE": 104, "FLOORBOARD": 105, "TWENTYSIX": 106, "FROG": 107, "ONCE": 108, "STAPLER": 109, "CRASHING": 110, "TEN": 111, "KNIFE": 112, "POP": 113, "BATHROOM": 114, "FIREWORKS": 115, "TREE": 116, "FEET": 117, "AIRPORT": 118, "SMALL": 119, "TYPEWRITER": 120, "COIN": 121, "JOGGING": 122, "WINDER": 123, "GROWLING": 124, "SPINNER": 125, "MOWER": 126, "FIFTY": 127, "HORSE": 128, "KNOCKING": 129, "BIKE": 130, "TAPPING": 131, "RAINY": 132, "ONE": 133, "SPILLING": 
134, "NINETEEN": 135, "SPEAKING": 136, "TAP": 137, "HIKING": 138, "BELL": 139, "WATER": 140, "SINGING": 141, "KEYS": 142, "GONG": 143, "DRAIN": 144, "DRILL": 145, "THUNDERSTORM": 146, "LIGHTENING": 147, "DRUMS": 148, "FLOWING": 149, "RAILROAD": 150, "CLOCK": 151, "BANGING": 152, "SHOWER": 153, "PRINTING": 154, "SECOND": 155, "AIR": 156, "STATIC": 157, "MOUTH": 158, "PURRING": 159, "STEPS": 160, "EXHAUST": 161, "METAL": 162, "LEAVES": 163, "BABBLING": 164, "WHISTLING": 165, "RUNNING": 166, "CONSISTENT": 167, "STREAM": 168, "TWICE": 169, "AMBULANCE": 170, "RADAR": 171, "WINTER": 172, "INSECT": 173, "HUMANS": 174, "GRASS": 175, "LIQUID": 176, "SIREN": 177, "SIX": 178, "DRIPPING": 179, "KIDS": 180, "BOAT": 181, "ROOSTER": 182, "NONE": 183, "FLYING": 184, "FEMALE": 185, "SLOW": 186, "FADES": 187, "COINS": 188, "HUNDRED": 189, "KEYBOARD": 190, "SHEEP": 191, "FLY": 192, "DRINK": 193, "MARKER": 194, "CABINET": 195, "SODA": 196, "TRAIN": 197, "ALARM": 198, "NOTHING": 199, "ROCK": 200, "BEAR": 201, "CHIRPING": 202, "PULSING": 203, "HEAVY": 204, "UP": 205, "TRUCK": 206, "HUMAN": 207, "DISTORTION": 208, "BEE": 209, "SWEEPING": 210, "GLASS": 211, "MEDIUM": 212, "SINK": 213, "CARDS": 214, "DRESS": 215, "ADULT": 216, "RAINSTORM": 217, "TWENTYEIGHT": 218, "GARBAGE": 219, "TREES": 220, "SHORE": 221, "ROOF": 222, "OPEN": 223, "FIVE": 224, "CHIPS": 225, "SNAKE": 226, "OPENING": 227, "MACHINERY": 228, "WHEEL": 229, "ALOT": 230, "ZOO": 231, "FLUTE": 232, "PAVEMENT": 233, "MUD": 234, "FOURTEEN": 235, "CRICKET": 236, "DUCKS": 237, "SEA": 238, "PERSON": 239, "TWENTY": 240, "PINGPONG": 241, "PULLOVER": 242, "SWIMMING": 243, "WATCH": 244, "HIGHWAY": 245, "HAPPY": 246, "DOOR": 247, "TELEPHONE": 248, "SPEED": 249, "WHISTLE": 250, "TWENTYONE": 251, "MIXING": 252, "TENNIS": 253, "ANNOUNCEMENT": 254, "LAUGHING": 255, "TORNADO": 256, "SEESAW": 257, "START": 258, "SEVENTYTHREE": 259, "SHOWERING": 260, "UNKNOWN": 261, "COFFEEMAKER": 262, "SOFT": 263, "TOP": 264, "PIG": 265, "THROWING": 266, "HALL": 
267, "FEATHERS": 268, "THIRTY": 269, "MOVING": 270, "MALE": 271, "CRICKETS": 272, "TV": 273, "TALKING": 274, "KEY": 275, "TRAFFIC": 276, "LOUD": 277, "MEETING": 278, "SNIFFING": 279, "PILOT": 280, "EXCITEMENT": 281, "OIL": 282, "FALLING": 283, "SCRIBBLING": 284, "CROW": 285, "WOOD": 286, "HITTING": 287, "LOUDSPEAKER": 288, "STONE": 289, "SHOOTING": 290, "MOTORCYCLE": 291, "WRITING": 292, "HAY": 293, "BRUSH": 294, "TWELVE": 295, "FINGERS": 296, "THUNDER": 297, "TEARING": 298, "WASHER": 299, "JACKHAMMER": 300, "TILE": 301, "MAN": 302, "NESTS": 303, "FIREPLACE": 304, "SAND": 305, "FAST": 306, "CRYING": 307, "FACTORY": 308, "MOUSE": 309, "SCHOOL": 310, "WET": 311, "DEEP": 312, "PLANE": 313, "STOPS": 314, "FLOOR": 315, "GOOD": 316, "HAND": 317, "THIRTYONE": 318, "CUP": 319, "MANY": 320, "WHIRRING": 321, "FRYING": 322, "CYMBAL": 323, "PLAYING": 324, "GRAVEL": 325, "BUBBLING": 326, "NOTEBOOK": 327, "KITCHEN": 328, "ELEVEN": 329, "LAWNMOWER": 330, "FIRE": 331, "SUBWAY": 332, "UTENSIL": 333, "MACHINE": 334, "WINDOW": 335, "BOTTLE": 336, "DIESEL": 337, "END": 338, "DROPPING": 339, "TIMER": 340, "CHOPPING": 341, "FAUCET": 342, "CHATTING": 343, "FIRST": 344, "TEETH": 345, "CHAIR": 346, "CLANKING": 347, "BABY": 348, "LION": 349, "SLOWLY": 350, "STRAW": 351, "FOUR": 352, "BRAKES": 353, "WATERFALL": 354, "NINE": 355, "CARS": 356, "PARTY": 357, "WAVES": 358, "DRIVE": 359, "UMBRELLA": 360, "EVENING": 361, "SILENT": 362, "HIGH": 363, "WHISPERING": 364, "ENGINE": 365, "CONTINUOUSLY": 366, "SHAKER": 367, "DRUM": 368, "CLOUDS": 369, "CLAPPING": 370, "DUCK": 371, "EIGHTEEN": 372, "COUGH": 373, "SHARPENING": 374, "THREE": 375, "BREATHING": 376, "GROWL": 377, "PENCIL": 378, "BAD": 379, "DRAINING": 380, "GROUND": 381, "STEAM": 382, "COCK": 383, "BALL": 384, "BUZZING": 385, "LOW": 386, "SPEAKER": 387, "TIRES": 388, "RAKE": 389, "EIGHT": 390, "CAR": 391, "BIRDS": 392, "RAILWAYSTATION": 393, "COFFEE": 394, "WIND": 395, "WOMAN": 396, "HANDS": 397, "RACING": 398, "FABRIC": 399, "CART": 400, 
"SLEEPING": 401, "BEGINNING": 402, "GUITAR": 403, "DRILLING": 404, "FOURTY": 405, "POURING": 406, "SOUND": 407, "EMERGENCY": 408, "LISTENING": 409, "STOMPING": 410, "STORM": 411, "AIRPLANE": 412, "CHAINSAW": 413, "INCREASE": 414, "WOMEN": 415, "STEADY": 416, "GATE": 417, "PIPES": 418, "SIXTEEN": 419, "VIBRATING": 420, "FORTY": 421, "STATION": 422, "COMPUTER": 423, "XYLOPHONE": 424, "RUBBER": 425, "PLASTIC": 426, "SAW": 427, "SEWING": 428, "BIRD": 429, "CLUCKING": 430, "RADIO": 431, "TOILET": 432, "BIG": 433, "BOOTS": 434, "EATING": 435, "WINDY": 436, "MOWING": 437, "CHILDREN": 438, "SPLASHING": 439, "FARM": 440, "TWENTYFOUR": 441, "MICROPHONE": 442, "TWENTYFIVE": 443, "LAUGH": 444, "OWL": 445, "VIOLIN": 446, "RESTAURANT": 447, "FIFTEEN": 448, "HELICOPTER": 449, "ROAD": 450, "DISHES": 451, "DRIVING": 452, "RINSING": 453, "MORNING": 454, "STARTING": 455, "ACCELERATING": 456, "CHURCH": 457, "ZERO": 458, "COW": 459, "SCRATCHING": 460, "CHIRP": 461, "BEEPING": 462, "TAPE": 463, "TIN": 464, "TWO": 465, "REFRIGERATOR": 466, "TUNNEL": 467, "REVVING": 468, "TYPING": 469, "RAINING": 470, "CHIMES": 471, "BUS": 472, "BUBBLEWRAP": 473, "THIRTYFOUR": 474, "CONSTANT": 475, "PULLING": 476, "FAN": 477, "RAINFALL": 478, "ALWAYS": 479, "SAWING": 480, "BUCKET": 481, "BARKING": 482, "SYNTHESIZER": 483, "GLASSES": 484, "KNOCK": 485, "LAUNDRY": 486, "SANDER": 487, "TELEVISION": 488, "GAS": 489, "MUSIC": 490, "SWING": 491, "SPARROW": 492, "STAIRS": 493, "FLOOD": 494, "ORGAN": 495, "JAR": 496, "WALKING": 497, "SNOW": 498, "YES": 499, "NO": 500} -------------------------------------------------------------------------------- /nets/sliding_chunks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from nets.diagonaled_mm_tvm import mask_invalid_locations 4 | 5 | 6 | def _skew(x, direction, padding_value): 7 | '''Convert diagonals into columns (or columns into diagonals depending on `direction`''' 8 | 
x_padded = F.pad(x, direction, value=padding_value) 9 | x_padded = x_padded.view(*x_padded.size()[:-2], x_padded.size(-1), x_padded.size(-2)) 10 | return x_padded 11 | 12 | 13 | def _skew2(x, padding_value): 14 | '''shift every row 1 step to right converting columns into diagonals''' 15 | # X = B x C x M x L 16 | B, C, M, L = x.size() 17 | x = F.pad(x, (0, M + 1), value=padding_value) # B x C x M x (L+M+1) 18 | x = x.view(B, C, -1) # B x C x ML+MM+M 19 | x = x[:, :, :-M] # B x C x ML+MM 20 | x = x.view(B, C, M, M + L) # B x C x M x (L+M) 21 | x = x[:, :, :, :-1] # B x C x M x (L+M-1) 22 | return x 23 | 24 | 25 | def _chunk(x, w): 26 | '''convert into overlapping chunks. Chunk size = 2w, overlap size = w''' 27 | 28 | # non-overlapping chunks of size = 2w 29 | x = x.view(x.size(0), x.size(1) // (w * 2), w * 2, x.size(2)) 30 | 31 | # use `as_strided` to make the chunks overlap with an overlap size = w 32 | chunk_size = list(x.size()) 33 | chunk_size[1] = chunk_size[1] * 2 - 1 34 | 35 | chunk_stride = list(x.stride()) 36 | chunk_stride[1] = chunk_stride[1] // 2 37 | return x.as_strided(size=chunk_size, stride=chunk_stride) 38 | 39 | 40 | def sliding_chunks_matmul_qk(q: torch.Tensor, k: torch.Tensor, w: int, padding_value: float): 41 | '''Matrix multiplication of query x key tensors using a sliding window attention pattern. 42 | This implementation splits the input into overlapping chunks of size 2w (e.g. 
512 for pretrained Longformer) 43 | with an overlap of size w''' 44 | 45 | bsz, seqlen, num_heads, head_dim = q.size() 46 | # seqlen, bsz, num_heads, head_dim = q.size() 47 | 48 | # print("scmq: ", bsz, seqlen, num_heads, head_dim) 49 | # print("w: ", w) 50 | # the chunking below requires seqlen to be a multiple of 2w, and q and k to have identical shapes 51 | assert seqlen % (w * 2) == 0 52 | assert q.size() == k.size() 53 | 54 | chunks_count = seqlen // w - 1 55 | 56 | # group bsz and num_heads dimensions into one, then chunk seqlen into chunks of size w * 2 57 | q = q.transpose(1, 2).reshape(bsz * num_heads, seqlen, head_dim) 58 | k = k.transpose(1, 2).reshape(bsz * num_heads, seqlen, head_dim) 59 | 60 | chunk_q = _chunk(q, w) 61 | chunk_k = _chunk(k, w) 62 | 63 | # matrix multiplication 64 | # bcxd: bsz*num_heads x chunks x 2w x head_dim 65 | # bcyd: bsz*num_heads x chunks x 2w x head_dim 66 | # bcxy: bsz*num_heads x chunks x 2w x 2w 67 | chunk_attn = torch.einsum('bcxd,bcyd->bcxy', (chunk_q, chunk_k)) # multiply 68 | 69 | # convert diagonals into columns 70 | diagonal_chunk_attn = _skew(chunk_attn, direction=(0, 0, 0, 1), padding_value=padding_value) 71 | 72 | # allocate space for the overall attention matrix where the chunks are combined. The last dimension 73 | # has (w * 2 + 1) columns. The first (w) columns are the w lower triangles (attention from a word to 74 | # w previous words). The following column is attention score from each word to itself, then 75 | # followed by w columns for the upper triangle. 
76 | 77 | diagonal_attn = diagonal_chunk_attn.new_empty((bsz * num_heads, chunks_count + 1, w, w * 2 + 1)) 78 | 79 | # copy parts from diagonal_chunk_attn into the compined matrix of attentions 80 | # - copying the main diagonal and the upper triangle 81 | diagonal_attn[:, :-1, :, w:] = diagonal_chunk_attn[:, :, :w, :w + 1] 82 | diagonal_attn[:, -1, :, w:] = diagonal_chunk_attn[:, -1, w:, :w + 1] 83 | # - copying the lower triangle 84 | diagonal_attn[:, 1:, :, :w] = diagonal_chunk_attn[:, :, - (w + 1):-1, w + 1:] 85 | diagonal_attn[:, 0, 1:w, 1:w] = diagonal_chunk_attn[:, 0, :w - 1, 1 - w:] 86 | 87 | # separate bsz and num_heads dimensions again 88 | diagonal_attn = diagonal_attn.view(bsz, num_heads, seqlen, 2 * w + 1).transpose(2, 1) 89 | 90 | mask_invalid_locations(diagonal_attn, w, 1, False) 91 | return diagonal_attn 92 | 93 | 94 | def sliding_chunks_matmul_pv(prob: torch.Tensor, v: torch.Tensor, w: int): 95 | '''Same as sliding_chunks_matmul_qk but for prob and value tensors. It is expecting the same output 96 | format from sliding_chunks_matmul_qk''' 97 | bsz, seqlen, num_heads, head_dim = v.size() 98 | assert seqlen % (w * 2) == 0 99 | assert prob.size()[:3] == v.size()[:3] 100 | assert prob.size(3) == 2 * w + 1 101 | chunks_count = seqlen // w - 1 102 | # group bsz and num_heads dimensions into one, then chunk seqlen into chunks of size 2w 103 | chunk_prob = prob.transpose(1, 2).reshape(bsz * num_heads, seqlen // w, w, 2 * w + 1) 104 | 105 | # group bsz and num_heads dimensions into one 106 | v = v.transpose(1, 2).reshape(bsz * num_heads, seqlen, head_dim) 107 | 108 | # pad seqlen with w at the beginning of the sequence and another w at the end 109 | padded_v = F.pad(v, (0, 0, w, w), value=-1) 110 | 111 | # chunk padded_v into chunks of size 3w and an overlap of size w 112 | chunk_v_size = (bsz * num_heads, chunks_count + 1, 3 * w, head_dim) 113 | chunk_v_stride = padded_v.stride() 114 | chunk_v_stride = chunk_v_stride[0], w * chunk_v_stride[1], 
chunk_v_stride[1], chunk_v_stride[2] 115 | chunk_v = padded_v.as_strided(size=chunk_v_size, stride=chunk_v_stride) 116 | 117 | skewed_prob = _skew2(chunk_prob, padding_value=0) 118 | 119 | context = torch.einsum('bcwd,bcdh->bcwh', (skewed_prob, chunk_v)) 120 | return context.view(bsz, num_heads, seqlen, head_dim).transpose(1, 2) 121 | 122 | 123 | def pad_to_window_size(input_ids: torch.Tensor, attention_mask: torch.Tensor, 124 | one_sided_window_size: int, pad_token_id: int): 125 | '''A helper function to pad tokens and mask to work with the sliding_chunks implementation of Longformer selfattention. 126 | Input: 127 | input_ids = torch.Tensor(bsz x seqlen): ids of wordpieces 128 | attention_mask = torch.Tensor(bsz x seqlen): attention mask 129 | one_sided_window_size = int: window size on one side of each token 130 | pad_token_id = int: tokenizer.pad_token_id 131 | Returns 132 | (input_ids, attention_mask) padded to length divisible by 2 * one_sided_window_size 133 | ''' 134 | w = int(2 * one_sided_window_size) 135 | seqlen = input_ids.size(1) 136 | padding_len = (w - seqlen % w) % w 137 | input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) 138 | attention_mask = F.pad(attention_mask, (0, padding_len), value=False) # no attention on the padding tokens 139 | return input_ids, attention_mask 140 | 141 | 142 | # ========= "sliding_chunks_no_overlap": alternative implemenation of the sliding window attention ========= 143 | # This implementation uses non-overlapping chunks (or blocks) of size `w` with number of local attention = 3xw 144 | # To make this implemenation comparable to "sliding_chunks" set w such that 145 | # w_of_sliding_chunks_no_overlap = w_of_sliding_chunks * 2 / 3 146 | # For example, 147 | # w_of_sliding_chunks = 256 (this is one sided. 
#       Total attention size = 512)
#       w_of_sliding_chunks_no_overlap = 170 (Total attention size = 510)
# Performance:
# - Speed: 30% faster than "sliding_chunks"
# - Memory: 95% of the memory usage of "sliding_chunks"
# The windows are asymmetric where number of attention on each side of a token ranges between w to 2w
# while "sliding_chunks" has a symmetric window around each token.


def _stack_neighbor_chunks(chunks):
    '''Stack every chunk with its previous and next chunk along a new last dimension.

    chunks is a 5-D tensor whose dimension 1 enumerates the chunks. The result has one extra
    trailing dimension of size 3 holding (previous, current, next); the missing neighbours of
    the first and last chunk are filled with zeros.
    '''
    # the 8-value pad spec runs from the last dimension backwards, so only dimension 1 is padded
    previous_chunks = F.pad(chunks[:, :-1], (0, 0, 0, 0, 0, 0, 1, 0), value=0.0)
    next_chunks = F.pad(chunks[:, 1:], (0, 0, 0, 0, 0, 0, 0, 1), value=0.0)
    return torch.stack((previous_chunks, chunks, next_chunks), dim=-1)


def sliding_chunks_no_overlap_matmul_qk(q: torch.Tensor, k: torch.Tensor, w: int, padding_value: float):
    '''Query x key scores where every token attends to its own chunk of size w and the two neighbouring chunks.

    Input:
        q, k = torch.Tensor(bsz x seqlen x num_heads x head_dim), same size; seqlen must be divisible by w
        w = int: chunk size
        padding_value = float: accepted for signature parity with sliding_chunks_matmul_qk but not used;
            missing neighbour chunks are always zero-filled
    Returns
        torch.Tensor(bsz x seqlen x num_heads x 3w): scores against the previous, current and next chunk
    '''
    bsz, seqlen, num_heads, head_dim = q.size()
    assert seqlen % w == 0, 'seqlen must be divisible by w'
    assert q.size() == k.size(), 'q and k must have the same size'
    # chunk seqlen into non-overlapping chunks of size w
    chunk_q = q.view(bsz, seqlen // w, w, num_heads, head_dim)
    chunk_k = k.view(bsz, seqlen // w, w, num_heads, head_dim)
    chunk_k_expanded = _stack_neighbor_chunks(chunk_k)
    diagonal_attn = torch.einsum('bcxhd,bcyhde->bcxhey', chunk_q, chunk_k_expanded)  # multiply
    return diagonal_attn.reshape(bsz, seqlen, num_heads, 3 * w)


def sliding_chunks_no_overlap_matmul_pv(prob: torch.Tensor, v: torch.Tensor, w: int):
    '''Same as sliding_chunks_no_overlap_matmul_qk but for prob and value tensors.

    Input:
        prob = torch.Tensor viewable as bsz x seqlen x num_heads x 3w, laid out as returned by
            sliding_chunks_no_overlap_matmul_qk
        v = torch.Tensor(bsz x seqlen x num_heads x head_dim); seqlen must be divisible by w
        w = int: chunk size
    Returns
        torch.Tensor(bsz x seqlen x num_heads x head_dim): attention-weighted values
    '''
    bsz, seqlen, num_heads, head_dim = v.size()
    # same precondition as the qk variant; without it the views below fail with an opaque size error
    assert seqlen % w == 0, 'seqlen must be divisible by w'
    chunk_prob = prob.view(bsz, seqlen // w, w, num_heads, 3, w)
    chunk_v = v.view(bsz, seqlen // w, w, num_heads, head_dim)
    chunk_v_extended = _stack_neighbor_chunks(chunk_v)
    context = torch.einsum('bcwhpd,bcdhep->bcwhe', chunk_prob, chunk_v_extended)
    return context.reshape(bsz, seqlen, num_heads, head_dim)
import os
import time
import pickle
import utils
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
import torch.optim as optim
import argparse

from data_generator import *
from nets.ours_MWAFM_Net import MWAFM_Net
from conifg import data_config, model_config


def train(model, train_iterator, optimizer, criterion, epoch):
    '''Run one training epoch over `train_iterator`.

    Every batch is an (audio_feat, question, label) triple. The loss is printed every
    model_config['log_interval'] batches; `epoch` is only used in that log line.
    '''
    model.train()

    for batch_idx, (audio_feat, question, label) in enumerate(train_iterator):

        audio_feat = audio_feat.to(dtype=torch.float)
        question = question.to(dtype=torch.float)

        optimizer.zero_grad()
        logits_output = model(audio_feat, question)

        # put the labels wherever the model produced its output ('cuda' when launched from this script)
        label = label.to(logits_output.device, dtype=torch.long)

        loss = criterion(logits_output, label)
        loss.backward()
        optimizer.step()

        if batch_idx % model_config['log_interval'] == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(audio_feat), len(train_iterator.dataset),
                100. * batch_idx / len(train_iterator), loss.item()))


def _topk_accuracy(model, iterator):
    '''Return the (top-1, top-5, top-10) accuracy of `model` over `iterator`, in percent.

    The last two fields of every batch are taken as (question, label) and the field right
    before them as the audio feature, so both 3-field batches and the 4-field batches that
    `test` used to unpack are accepted. Returns zeros when the iterator yields no samples.
    '''
    model.eval()

    total = 0
    correct_top_01 = 0
    correct_top_05 = 0
    correct_top_10 = 0

    with torch.no_grad():
        for batch in iterator:
            *audio_feats, question, label = batch

            audio_feat = audio_feats[-1].to(dtype=torch.float)
            question = question.to(dtype=torch.float)

            logits_output = model(audio_feat, question)
            label = label.to(logits_output.device, dtype=torch.long)

            total += logits_output.size(0)

            # top-01 accuracy
            predicted_top_01 = logits_output.argmax(dim=1)
            correct_top_01 += (predicted_top_01 == label).sum().item()

            # top-05 and top-10 accuracy: only the 10 best classes are needed, not a full sort
            k = min(10, logits_output.size(1))
            _, predicted_top_n = torch.topk(logits_output, k, dim=1)
            hits = predicted_top_n.eq(label.unsqueeze(1))
            correct_top_05 += hits[:, :5].sum().item()
            correct_top_10 += hits.sum().item()

    if total == 0:
        # nothing was evaluated; avoid dividing by zero
        return 0.0, 0.0, 0.0

    return (100 * correct_top_01 / total,
            100 * correct_top_05 / total,
            100 * correct_top_10 / total)


# NOTE: the name shadows the builtin `eval`; it is kept because it is this module's public interface.
def eval(model, val_iterator, optimizer, criterion, epoch):
    '''Print the top-1/5/10 validation accuracy and return the top-1 accuracy in percent.

    `optimizer`, `criterion` and `epoch` are accepted for call compatibility but are not used.
    '''
    val_top_01, val_top_05, val_top_10 = _topk_accuracy(model, val_iterator)

    print("\nTop-01 Validation set accuracy = %.2f %%" % val_top_01)
    print("Top-05 Validation set accuracy = %.2f %%" % val_top_05)
    print("Top-10 Validation set accuracy = %.2f %%" % val_top_10)

    return val_top_01


def test(model, test_iterator):
    '''Print the top-1/5/10 accuracy of `model` on the test set.'''
    test_top_01, test_top_05, test_top_10 = _topk_accuracy(model, test_iterator)

    print("\nTop-01 Test set accuracy = %.2f %%" % test_top_01)
    print("Top-05 Test set accuracy = %.2f %%" % test_top_05)
    print("Top-10 Test set accuracy = %.2f %%" % test_top_10)
    print('\n***********************************************************\n')


def main():
    '''Parse the command line, then either train with per-epoch validation or test a saved checkpoint.'''
    parser = argparse.ArgumentParser(description='PyTorch Implementation of Audio Question Answering')
    parser.add_argument(
        "--mode", type=str, default='train', help="which mode to use ('train'; anything else runs testing)")
    parser.add_argument(
        "--model_save_dir", type=str, default='./checkpoints/', help="model save dir")
    parser.add_argument(
        "--checkpoint", type=str, default='MWAFM_Net', help="save model name")
    parser.add_argument(
        '--seed', type=int, default=8888, metavar='S', help='random seed (default: 8888)')
    parser.add_argument(
        '--gpu', type=str, default='0, 1', help='gpu device number')

    args = parser.parse_args()
    # must be set before the first CUDA call; spaces are stripped so that '0, 1' becomes '0,1'
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu.replace(' ', '')
    torch.manual_seed(args.seed)

    model = MWAFM_Net().to('cuda')
    model = nn.DataParallel(model)

    # plain concatenation is kept so existing --model_save_dir / --checkpoint values resolve to the same file
    checkpoint_path = args.model_save_dir + args.checkpoint + ".pt"

    if args.mode == "train":
        print("\n-------------------- Multi-scale Window-size Attention Model Training --------------------")
        # create data iterator
        train_dataset = DataGenerator(data_config['train_metadata_path'])
        train_iterator = DataLoader(dataset=train_dataset, batch_size=model_config['batch_size'],
                                    num_workers=model_config['num_workers'],
                                    pin_memory=True, shuffle=True)
        val_dataset = DataGenerator(data_config['val_metadata_path'])
        val_iterator = DataLoader(dataset=val_dataset, batch_size=model_config['batch_size'],
                                  num_workers=model_config['num_workers'],
                                  pin_memory=True, shuffle=True)

        optimizer = torch.optim.Adam(params=model.parameters(), lr=model_config['learning_rate'])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1)
        criterion = nn.CrossEntropyLoss()

        # torch.save does not create missing directories
        os.makedirs(os.path.dirname(checkpoint_path) or '.', exist_ok=True)

        start_epoch = 0
        best_acc = 0
        best_epoch = 0
        for epoch in range(start_epoch, model_config['num_epochs']):
            train(model, train_iterator, optimizer, criterion, epoch=epoch)
            # NOTE(review): passing `epoch` to step() is deprecated in PyTorch; it is kept because
            # switching to a bare step() would move the first LR decay one epoch earlier
            scheduler.step(epoch)
            val_acc = eval(model, val_iterator, optimizer, criterion, epoch)
            if val_acc >= best_acc:
                best_acc = val_acc
                best_epoch = epoch
                torch.save(model.state_dict(), checkpoint_path)
            print("\nTop-01 training-val best: epoch {}, acc: {:.2f}%".format(best_epoch, best_acc))
            print('\n***********************************************************\n')
    else:
        print("\n-------------------- Multi-scale Window-size Attention Model Testing --------------------")
        # NOTE(review): the training branch builds DataGenerator from a single metadata path, while this
        # call passes (data_config, model_config, mode='test') -- confirm against DataGenerator.__init__
        test_dataset = DataGenerator(data_config, model_config, mode='test')
        test_iterator = DataLoader(dataset=test_dataset, batch_size=model_config['batch_size'],
                                   num_workers=model_config['num_workers'],
                                   pin_memory=True, shuffle=True)
        model.load_state_dict(torch.load(checkpoint_path))
        test(model, test_iterator)


if __name__ == '__main__':
    main()
26, "DRIP": 27, "VIBRATING": 28, "LOCUST": 29, "LISTENING": 30, "NEVER": 31, "TRUCK": 32, "LOW": 33, "RACETRACK": 34, "PIG": 35, "MOWER": 36, "COOKER": 37, "FEMALE": 38, "CRINKLING": 39, "POP": 40, "FRYINGPAN": 41, "BLUE": 42, "AUDIENCE": 43, "GOOSE": 44, "PULLED": 45, "BUZZING": 46, "FAUCET": 47, "STREAM": 48, "DOLPHIN": 49, "CART": 50, "GLASS": 51, "XYLOPHONE": 52, "MATCH": 53, "TWELVE": 54, "FIREPLACE": 55, "CHAINSAW": 56, "DISHWASHER": 57, "LAUNDRY": 58, "FREIGHT": 59, "STATIC": 60, "ALARM": 61, "DROP": 62, "NIGHT": 63, "TWENTYFIVE": 64, "ELEVEN": 65, "SQUEAK": 66, "TWENTY": 67, "FUEL": 68, "HELICOPTER": 69, "FLUSHING": 70, "TRAFFIC": 71, "TWENTYFOUR": 72, "ROLLER": 73, "FLY": 74, "BRANCHES": 75, "COW": 76, "HORSE": 77, "HUMANS": 78, "DOWN": 79, "SONG": 80, "DRUMSTICK": 81, "DUCK": 82, "PAPER": 83, "HANDS": 84, "RICE": 85, "WOOD": 86, "ELECTRICITY": 87, "FLOW": 88, "BUBBLEWRAP": 89, "CLASSICAL": 90, "WOMEN": 91, "PERSON": 92, "RADIO": 93, "CHIRPS": 94, "MICROPHONE": 95, "SLAMMING": 96, "DRIVING": 97, "HIGH": 98, "CHANGING": 99, "BALL": 100, "CHILD": 101, "SEAWAVE": 102, "SINK": 103, "PRESS": 104, "AIRPLANE": 105, "STORMY": 106, "MUD": 107, "TELEPHONE": 108, "RUNNING": 109, "CRUSHING": 110, "FLOORBOARD": 111, "RAINY": 112, "RACE": 113, "MICROWAVE": 114, "DOVE": 115, "VEHICLE": 116, "RAPIDLY": 117, "BOAT": 118, "DRUM": 119, "BELLS": 120, "BUGS": 121, "TEN": 122, "QUIET": 123, "SEVENTEEN": 124, "BATHROOM": 125, "STEADY": 126, "WORK": 127, "VOICES": 128, "SUMMER": 129, "TRACTOR": 130, "PAVEMENT": 131, "TWEET": 132, "VACUUMING": 133, "SPRAYER": 134, "NOTEBOOK": 135, "HEN": 136, "SCRATCHING": 137, "STATION": 138, "FOURTEEN": 139, "SPARROW": 140, "GUTTERS": 141, "BUBBLING": 142, "KNOCKING": 143, "WAVE": 144, "STOPS": 145, "WHEELS": 146, "CONSTANT": 147, "INSECT": 148, "PARTY": 149, "SHREDDING": 150, "AIRSOUND": 151, "HAMMER": 152, "MEN": 153, "HAPPINESS": 154, "FIREWORKS": 155, "WATERS": 156, "PENCIL": 157, "FLOOD": 158, "RIVER": 159, "GOOD": 160, "TIN": 161, 
"KITCHEN": 162, "CHICKEN": 163, "KNIFE": 164, "FENCE": 165, "ELECTRONIC": 166, "SHIP": 167, "CAR": 168, "PACKAGE": 169, "SIXTH": 170, "CLOCK": 171, "GROWLING": 172, "DISTORTION": 173, "CREAKING": 174, "EGG": 175, "SCRAPING": 176, "SILVERWARE": 177, "SEVENTYTHREE": 178, "FOURTY": 179, "HARD": 180, "HIGHWAY": 181, "FLOOR": 182, "TUBA": 183, "CHURCH": 184, "BASKETBALL": 185, "PUMP": 186, "TELEVISION": 187, "SLEEPING": 188, "SHORT": 189, "DRIBBLING": 190, "WRITING": 191, "MANY": 192, "LAKE": 193, "MUG": 194, "WAKING": 195, "FLIGHT": 196, "SPOON": 197, "HEATER": 198, "ROCKS": 199, "SIZZLING": 200, "STARTING": 201, "HAIL": 202, "DRINK": 203, "WHIRRING": 204, "FIVE": 205, "PARROT": 206, "MOVEMENT": 207, "PEBBLES": 208, "GUITAR": 209, "MACHINERY": 210, "WINDY": 211, "CONSISTENT": 212, "SCRIBBLING": 213, "CHIRP": 214, "COFFEE": 215, "FISH": 216, "SHAKING": 217, "PLANE": 218, "DUCKS": 219, "FEET": 220, "STOMPING": 221, "SITTING": 222, "COINS": 223, "MONKEY": 224, "EVENING": 225, "GRINDER": 226, "SCOOTER": 227, "ROAD": 228, "MEETING": 229, "RAINING": 230, "SAND": 231, "COCK": 232, "RATCHET": 233, "MILD": 234, "CONTINUOUSLY": 235, "FOOT": 236, "WHITE": 237, "PILOT": 238, "MIXING": 239, "SEA": 240, "OBJECT": 241, "PIGEON": 242, "LATCH": 243, "UP": 244, "THIRTYONE": 245, "SHOP": 246, "WALK": 247, "RAT": 248, "LANDING": 249, "SHARPENER": 250, "WASHER": 251, "FOUR": 252, "PLATE": 253, "ROLLERCOASTER": 254, "VIOLIN": 255, "THIRTYFIVE": 256, "CRICKET": 257, "BOOTS": 258, "STRAW": 259, "VACUUM": 260, "MULTIPLE": 261, "ZOO": 262, "WHISPERING": 263, "SCREAMING": 264, "SHOUTING": 265, "CAN": 266, "BLENDER": 267, "TIMER": 268, "CRUNCH": 269, "MARBLE": 270, "CRYING": 271, "ROOSTER": 272, "ROCK": 273, "WINDCHIME": 274, "PRINTER": 275, "HOWLING": 276, "STICK": 277, "SILENT": 278, "NORMAL": 279, "FALLS": 280, "HONKING": 281, "UTENSILS": 282, "COFFEEMAKER": 283, "DOOR": 284, "SQUEAKING": 285, "CARDBOARD": 286, "TIRES": 287, "CLOSE": 288, "COPIER": 289, "STAPLER": 290, "COIN": 291, 
"AEROPLANE": 292, "LEAVES": 293, "PASSING": 294, "PIPE": 295, "SPINNER": 296, "RINGING": 297, "SHOES": 298, "FACTORY": 299, "STEPS": 300, "ORGAN": 301, "TREES": 302, "HAND": 303, "BEADS": 304, "UNKNOWN": 305, "CITY": 306, "LAWNMOWER": 307, "MOTORCYCLE": 308, "LEFT": 309, "DROPPING": 310, "PULSING": 311, "COMPLETION": 312, "JOGGING": 313, "DISHES": 314, "CAT": 315, "OUTDOORS": 316, "THIRTYFOUR": 317, "DEEP": 318, "HALL": 319, "POUR": 320, "SCREAM": 321, "SIX": 322, "DIRT": 323, "HISSING": 324, "AC": 325, "STEAL": 326, "SEWING": 327, "PIANO": 328, "BATMINTON": 329, "WIND": 330, "RAPID": 331, "SIXTY": 332, "TAPPING": 333, "RAIN": 334, "USEFUL": 335, "CUTTING": 336, "PHONE": 337, "HEAVY": 338, "SANDER": 339, "WHISTLING": 340, "INTERCOM": 341, "DIESEL": 342, "RAILROAD": 343, "FIRST": 344, "LAUGHTER": 345, "DEBRIS": 346, "WHEEL": 347, "SINGING": 348, "FIRE": 349, "FALLING": 350, "OIL": 351, "SHAKER": 352, "ENGINE": 353, "ACCELERATES": 354, "STOP": 355, "CANVAS": 356, "MOUTH": 357, "RABIT": 358, "BUS": 359, "DOG": 360, "SPEAKING": 361, "WD-40": 362, "AUDITORIUM": 363, "DRILLING": 364, "LORRY": 365, "DRIZZLING": 366, "BEAR": 367, "SCRAPER": 368, "SLOWLY": 369, "FEATHERS": 370, "GARBAGE": 371, "SPLASHING": 372, "BOOKS": 373, "PARK": 374, "OWL": 375, "GRINDING": 376, "POLICE": 377, "GONG": 378, "MARBLES": 379, "WALKING": 380, "FIFTY": 381, "WINCH": 382, "PEOPLE": 383, "HUMAN": 384, "CICADAS": 385, "METAL": 386, "URINATING": 387, "DRAIN": 388, "ONCE": 389, "SPILLING": 390, "STORM": 391, "CLANKING": 392, "TAMBOURINE": 393, "AQUARIUM": 394, "SAWING": 395, "NINETEEN": 396, "CHIME": 397, "SCREECHING": 398, "CROWD": 399, "SPLASH": 400, "ONE": 401, "TOP": 402, "ACCELERATING": 403, "MARKET": 404, "DRIPPING": 405, "EMERGENCY": 406, "SEMI": 407, "THUNDER": 408, "RAINFALL": 409, "HORROR": 410, "BIKE": 411, "FRYING": 412, "TAPE": 413, "PURRING": 414, "THROWING": 415, "TYPING": 416, "FINGERS": 417, "KNOCK": 418, "WATER": 419, "CHOPPING": 420, "ICE": 421, "RINSING": 422, "BICYCLE": 423, 
"GOLF": 424, "FIFTEEN": 425, "BUILDING": 426, "GHOST": 427, "NOISE": 428, "CREEK": 429, "WAVES": 430, "KEYS": 431, "THUD": 432, "TV": 433, "BOOK": 434, "HURRICANE": 435, "HAY": 436, "MANUFACTURING": 437, "MOVING": 438, "BOWL": 439, "TORNADO": 440, "SING": 441, "ARGUING": 442, "TRAIN": 443, "HONK": 444, "HARDWOOD": 445, "DRAWING": 446, "SPEAKER": 447, "OCEAN": 448, "CONFIGURATION": 449, "BIRD": 450, "WORKING": 451, "CRICKETS": 452, "BANG": 453, "ROARING": 454, "TRAVEL": 455, "MOTOR": 456, "TYPEWRITER": 457, "START": 458, "FAR": 459, "SEMITRUCK": 460, "WRENCH": 461, "RUSHING": 462, "SODA": 463, "MIKE": 464, "LIGHTNING": 465, "CRUNCHY": 466, "NESTS": 467, "TOOLS": 468, "TRAVELLING": 469, "WINDOW": 470, "STAIRS": 471, "WHISTLE": 472, "HUNDRED": 473, "SQUAWKING": 474, "OPEN": 475, "PIPES": 476, "SKY": 477, "INTERESTED": 478, "TRUMPET": 479, "BALLOON": 480, "STORE": 481, "SKATEBOARD": 482, "VEHICLES": 483, "TILE": 484, "GENERATOR": 485, "DRILLER": 486, "PINGPONG": 487, "CHIRPING": 488, "MAN": 489, "THIRTYSEVEN": 490, "SPEED": 491, "TUMBLING": 492, "HIKING": 493, "TANK": 494, "SLOW": 495, "COUGHING": 496, "FOOD": 497, "ALWAYS": 498, "NOISES": 499, "BABBLING": 500, "MAT": 501, "COUGH": 502, "ELEVATOR": 503, "FADES": 504, "KEY": 505, "PULLOVER": 506, "CLOUDS": 507, "MACHINE": 508, "TRIANGLE": 509, "CHIMING": 510, "DRAINING": 511, "GATE": 512, "LAUGH": 513, "MAGAZINE": 514, "BAT": 515, "ANNOUNCER": 516, "TRACKS": 517, "RACECAR": 518, "EXCITEMENT": 519, "VERY": 520, "BELL": 521, "THIRTEEN": 522, "CHIPS": 523, "THIRTY": 524, "ZERO": 525, "CHAIN": 526, "STEAM": 527, "FLYING": 528, "PLAYGROUND": 529, "FLOWING": 530, "LOCK": 531, "SNIFFING": 532, "GROUND": 533, "CONSTANTLY": 534, "CLOSING": 535, "SHEEP": 536, "END": 537, "DICE": 538, "TEAPOT": 539, "FROGS": 540, "BASS": 541, "BOARD": 542, "BRIDGE": 543, "TOOL": 544, "FARM": 545, "CYCLE": 546, "TAP": 547, "JACKHAMMER": 548, "GRAVEL": 549, "STYROFOAM": 550, "VOICE": 551, "BLOW": 552, "BLOWING": 553, "ANNOUNCEMENT": 554, "GAS": 555, 
"BUCKET": 556, "MIDDLE": 557, "AIRPORT": 558, "LAST": 559, "EAST": 560, "NONE": 561, "QUICKLY": 562, "BEES": 563, "STEELDRUM": 564, "POOL": 565, "TALKING": 566, "TWENTYEIGHT": 567, "DRYER": 568, "BEGINNING": 569, "SNOW": 570, "WATERFLOW": 571, "UTENSIL": 572, "BEACH": 573, "MALE": 574, "TWENTYNINE": 575, "BROOK": 576, "CROW": 577, "HAPPY": 578, "GAME": 579, "PLAYING": 580, "FIRETRUCK": 581, "CHATTING": 582, "LION": 583, "BREATHING": 584, "WOODPECKER": 585, "RADAR": 586, "WINDER": 587, "FLIES": 588, "JAR": 589, "BEEPING": 590, "HAMMERING": 591, "MOCKINGBIRD": 592, "WATCH": 593, "SOFT": 594, "WASHING": 595, "SEAGULLS": 596, "FOREST": 597, "CHEER": 598, "MOTORBIKE": 599, "TWO": 600, "DRILL": 601, "NOTHING": 602, "MOWING": 603, "GRASSHOPPERS": 604, "HOME": 605, "STORMING": 606, "FAST": 607, "BIRDS": 608, "HITTING": 609, "CLUCKING": 610, "BOX": 611, "IRON": 612, "FORTY": 613, "LOUDSPEAKER": 614, "INCREASE": 615, "BOTTLES": 616, "SEAGULL": 617, "BOILING": 618, "UMBRELLA": 619, "MOSQUITO": 620, "BRAKES": 621, "SHORE": 622, "TEETH": 623, "HE": 624, "POPCORN": 625, "SHOVEL": 626, "BAND": 627, "PEEING": 628, "MORE": 629, "EIGHTEEN": 630, "SUNNY": 631, "CRACKERS": 632, "SEVERAL": 633, "COOKING": 634, "8TIMES": 635, "CUT": 636, "DIRTBIKE": 637, "SWINGING": 638, "OFFICE": 639, "SIGNAL": 640, "INDUSTRIAL": 641, "THEREMIN": 642, "CELERY": 643, "SOUND": 644, "SCANNER": 645, "CLEANING": 646, "CICADA": 647, "TIRE": 648, "BANGING": 649, "SIXTEEN": 650, "MOUSE": 651, "MEDIUM": 652, "THREE": 653, "MORNING": 654, "WRAPPING": 655, "KIDS": 656, "SIRENS": 657, "EGGS": 658, "BOTTLE": 659, "SEVEN": 660, "WHEELBARROW": 661, "PLAIN": 662, "DRIVE": 663, "SYNTHESIZER": 664, "ENGLISH": 665, "VACCUM": 666, "CHEWING": 667, "GRASS": 668, "HOSE": 669, "LONG": 670, "CRUNCHING": 671, "WHISPER": 672, "SWEEPING": 673, "COMPUTER": 674, "POURING": 675, "SHOE": 676, "GARAGE": 677, "DRUMS": 678, "FOOTSTEPS": 679, "CLANGING": 680, "RUBBER": 681, "CUTTER": 682, "AMBULANCE": 683, "AIR": 684, "KEYBOARD": 685, 
"FALL": 686, "SWING": 687, "ROD": 688, "PULLING": 689, "STIR": 690, "BELT": 691, "METALS": 692, "TWENTYTWO": 693, "FAN": 694, "SEESAW": 695, "TWENTYTHREE": 696, "SIREN": 697, "STIRRING": 698, "CHAIR": 699, "RAILWAYSTATION": 700, "MODERATELY": 701, "SHOWERING": 702, "GRAINS": 703, "REFRIGERATOR": 704, "WINTER": 705, "GEESE": 706, "PAN": 707, "BAD": 708, "CONSTRUCTION": 709, "GROWL": 710, "BUZZSAW": 711, "THIRTYEIGHT": 712, "FOUNTAIN": 713, "PEN": 714, "CRASHING": 715, "INSECTS": 716, "JET": 717, "CLAPPING": 718, "BRUSH": 719, "RAILS": 720, "PRINTING": 721, "SWIMMING": 722, "CLOSET": 723, "CLUCK": 724, "BATHING": 725, "AUTOMOBILES": 726, "CONCRETE": 727, "LAUGHING": 728, "TWENTYSEVEN": 729, "FROG": 730, "HINGE": 731, "RESTAURANT": 732, "RAKE": 733, "CRACKER": 734, "HOG": 735, "BREAKS": 736, "LOUDER": 737, "STREET": 738, "CHEERING": 739, "HUMMING": 740, "CLAP": 741, "SORTER": 742, "SHOOTING": 743, "EIGHT": 744, "TENNIS": 745, "PRAYER": 746, "WELDING": 747, "PAPERS": 748, "BALLS": 749, "CUP": 750, "BROOM": 751, "BABY": 752, "NICE": 753, "INSTRUMENT": 754, "TEARING": 755, "HORN": 756, "CLICKING": 757, "SMALL": 758, "CHIMES": 759, "VIBRATION": 760, "GOAT": 761, "LOCUSTS": 762, "HOUSE": 763, "TWICE": 764, "THIRTYTHREE": 765, "ANIMAL": 766, "CARS": 767, "RACING": 768, "SIDEWALK": 769, "FABRIC": 770, "PARAKEET": 771, "DRESS": 772, "FLUTE": 773, "WOMAN": 774, "SCREECH": 775, "CRANK": 776, "MUSIC": 777, "CHILDREN": 778, "ELEPHANT": 779, "SPRING": 780, "SILENCE": 781, "TOILET": 782, "WHIR": 783, "BIG": 784, "SAW": 785, "BREEZY": 786, "TWENTYSIX": 787, "BUZZ": 788, "SOUNDS": 789, "SUBWAY": 790, "RATTLING": 791, "EATING": 792, "TWENTYONE": 793, "NINE": 794, "TUNNEL": 795, "OPENING": 796, "CYMBAL": 797, "OUTSIDE": 798, "BREAKING": 799, "FOLDING": 800, "WET": 801, "TUB": 802, "RAINSTORM": 803, "STEEL": 804, "BARKING": 805, "THUNDERSTORM": 806, "FREEWAY": 807, "ELECTRIC": 808, "BAG": 809, "CABINET": 810, "ARCADE": 811, "POT": 812, "LIGHT": 813, "LOUD": 814, "CRISPY": 815, "NOISY": 
816, "TARP": 817, "CUCKOO": 818, "TABLE": 819, "YELLING": 820, "ALOT": 821, "BEE": 822, "CROWS": 823, "TOY": 824, "ROOF": 825, "GASOLINE": 826, "SMOOTH": 827} -------------------------------------------------------------------------------- /metadata/metadata_orig/output_classes.json: -------------------------------------------------------------------------------- 1 | {"CRUNCHING": 0, "SHEEP": 1, "HAMMERING": 2, "THIRD": 3, "SHOP": 4, "STORMY": 5, "BATHROOM": 6, "BOTTLES": 7, "FOURTY": 8, "CICADAS": 9, "OIL": 10, "FLOORBOARD": 11, "HITTING": 12, "CONSTANT": 13, "SHOOTING": 14, "AMBULANCE": 15, "SPRING": 16, "TREE": 17, "CHOPPING": 18, "STEELDRUM": 19, "URINATING": 20, "WATER": 21, "GARAGE": 22, "DOVE": 23, "RAILROAD": 24, "LOUD": 25, "LAUNDRY": 26, "PINGPONG": 27, "DRESS": 28, "SHARPENER": 29, "CLOCK": 30, "BUBBLING": 31, "DIGGING": 32, "FLUSHING": 33, "HEATER": 34, "TRAVEL": 35, "RATCHET": 36, "WOMAN": 37, "MOSQUITO": 38, "DRUMSTICK": 39, "HORN": 40, "CRYING": 41, "STAPLER": 42, "CHEWING": 43, "SCOOTER": 44, "SMALL": 45, "CONSTRUCTION": 46, "MORNING": 47, "START": 48, "STOPS": 49, "AEROPLANE": 50, "SOUNDS": 51, "HOWLING": 52, "WHEEL": 53, "SHOES": 54, "BIRD": 55, "SLOWLY": 56, "SPEED": 57, "PARK": 58, "GUITAR": 59, "BOTTLE": 60, "CLUCK": 61, "BABY": 62, "SLAMMING": 63, "TUMBLING": 64, "SHARPENING": 65, "ELECTRICITY": 66, "RACETRACK": 67, "DROPPING": 68, "COMPLETION": 69, "DOLPHIN": 70, "ELEPHANT": 71, "WHEELBARROW": 72, "CHEER": 73, "TWICE": 74, "MICROWAVE": 75, "BELLS": 76, "GENERATOR": 77, "WHISTLING": 78, "LIGHT": 79, "GONG": 80, "TENNIS": 81, "FROG": 82, "HARDWOOD": 83, "DRYER": 84, "WATERS": 85, "TWENTYEIGHT": 86, "FOURTEEN": 87, "CLANGING": 88, "TWENTYFIVE": 89, "CELERY": 90, "PEBBLES": 91, "SHIP": 92, "TOOL": 93, "OPENING": 94, "WRENCH": 95, "CROWS": 96, "MONKEY": 97, "LAWNMOWER": 98, "MATCH": 99, "THIRTEEN": 100, "ANIMAL": 101, "ONE": 102, "MIXER": 103, "CHEERING": 104, "AIRSOUND": 105, "HE": 106, "FAST": 107, "RAPID": 108, "CHILD": 109, "DOG": 110, 
"THEREMIN": 111, "GOLF": 112, "WOMEN": 113, "SILENT": 114, "KEYS": 115, "SITTING": 116, "WAVES": 117, "BEAR": 118, "ALOT": 119, "RAILWAYSTATION": 120, "FORTY": 121, "MACHINE": 122, "DRIZZLING": 123, "CICADA": 124, "DEEP": 125, "BOX": 126, "MAT": 127, "DUCK": 128, "DRUMS": 129, "MOWER": 130, "SINGING": 131, "ROAD": 132, "EVENING": 133, "METALS": 134, "CHIPS": 135, "PEN": 136, "STEEL": 137, "CHAIN": 138, "SUMMER": 139, "BUZZING": 140, "FILLING": 141, "ARGUING": 142, "SEMI": 143, "RAIN": 144, "TV": 145, "MAGAZINE": 146, "SPLASHING": 147, "DROP": 148, "BANGING": 149, "KNIFE": 150, "TOY": 151, "COOKING": 152, "YELLING": 153, "RAT": 154, "SEWING": 155, "STOP": 156, "HORSE": 157, "JOGGING": 158, "VIBRATION": 159, "CRACKER": 160, "FAUCET": 161, "VACCUM": 162, "HUMMING": 163, "BEADS": 164, "HAMMER": 165, "JET": 166, "DRIVING": 167, "CROW": 168, "PRINTING": 169, "SKY": 170, "TOP": 171, "GAME": 172, "SLEEPING": 173, "PASSING": 174, "MARKET": 175, "BABBLING": 176, "MUD": 177, "BARKING": 178, "CHIRPS": 179, "BOARD": 180, "FACTORY": 181, "UNKNOWN": 182, "AWAY": 183, "WORKING": 184, "DIRT": 185, "GRASS": 186, "ENGLISH": 187, "INSECTS": 188, "TEARING": 189, "HAIL": 190, "STREAM": 191, "DISTORTION": 192, "LISTENING": 193, "FABRIC": 194, "BREATHING": 195, "DRILLER": 196, "TWENTYFOUR": 197, "RADIO": 198, "DRIP": 199, "CYCLE": 200, "STYROFOAM": 201, "LOCUSTS": 202, "ALARM": 203, "WALKING": 204, "TWENTYSIX": 205, "BLENDER": 206, "SCREECH": 207, "RAINFALL": 208, "PAVEMENT": 209, "CLOSE": 210, "EIGHTEEN": 211, "SEAGULLS": 212, "CHANGING": 213, "LONG": 214, "SWIMMING": 215, "WINDY": 216, "MEETING": 217, "ELEVATOR": 218, "WET": 219, "GOOSE": 220, "CUT": 221, "DIRTBIKE": 222, "GRASSHOPPERS": 223, "END": 224, "TWENTY": 225, "8TIMES": 226, "FENCE": 227, "CONCRETE": 228, "SILVERWARE": 229, "THREE": 230, "THUD": 231, "COFFEEMAKER": 232, "SECOND": 233, "AIR": 234, "BICYCLE": 235, "BUGS": 236, "FLY": 237, "TAPE": 238, "CYMBAL": 239, "COIN": 240, "TWENTYONE": 241, "DOWN": 242, "HAND": 243, 
"FLUTE": 244, "CUP": 245, "RABIT": 246, "CANVAS": 247, "FLOW": 248, "GRINDER": 249, "GRINDING": 250, "GHOST": 251, "GROWL": 252, "BEE": 253, "CREAKING": 254, "MAN": 255, "PLATE": 256, "SQUAWKING": 257, "PAN": 258, "MANUFACTURING": 259, "RAKE": 260, "PARAKEET": 261, "STIR": 262, "CART": 263, "EMERGENCY": 264, "DISHES": 265, "NOISY": 266, "SQUEAK": 267, "FAN": 268, "OWL": 269, "PULLOVER": 270, "GAS": 271, "SCRATCHING": 272, "CARS": 273, "WRAPPING": 274, "FLOOD": 275, "SQUEAKING": 276, "PULSING": 277, "WATERFALL": 278, "TIN": 279, "ELEVEN": 280, "STARTING": 281, "PAPERS": 282, "WD-40": 283, "WINDER": 284, "PRAYER": 285, "WRITING": 286, "PLAYGROUND": 287, "RUBBER": 288, "EAST": 289, "CROWD": 290, "ROOF": 291, "FLYING": 292, "FOUR": 293, "TAMBOURINE": 294, "HARD": 295, "COCK": 296, "RAINY": 297, "MOCKINGBIRD": 298, "ROD": 299, "BELT": 300, "TUBA": 301, "TRACTOR": 302, "ROLLERCOASTER": 303, "WHISPER": 304, "ROOSTER": 305, "BASKETBALL": 306, "SPEAKING": 307, "LAKE": 308, "FAR": 309, "HOSE": 310, "STREET": 311, "SHREDDING": 312, "TRACKS": 313, "RUNNING": 314, "SHOWERING": 315, "SNAKE": 316, "CLAP": 317, "HORROR": 318, "WATERFLOW": 319, "FOOD": 320, "RADAR": 321, "AUDIENCE": 322, "EGG": 323, "CRICKETS": 324, "RINGING": 325, "SPINNER": 326, "SWEEPING": 327, "STATIC": 328, "LIQUID": 329, "MOUTH": 330, "GLASSES": 331, "HURRICANE": 332, "CRINKLING": 333, "LOW": 334, "TELEPHONE": 335, "SEMITRUCK": 336, "JACKHAMMER": 337, "BEACH": 338, "FRYING": 339, "WHEELS": 340, "FREIGHT": 341, "LAST": 342, "DRIVE": 343, "FALLS": 344, "FARM": 345, "INDUSTRIAL": 346, "ROARING": 347, "POPCORN": 348, "SNIFFING": 349, "MODERATELY": 350, "RAILS": 351, "PHONE": 352, "BREAKING": 353, "TANK": 354, "RACING": 355, "CHATTING": 356, "SMOOTH": 357, "DISHWASHER": 358, "THIRTYFOUR": 359, "COUGH": 360, "LIGHTENING": 361, "POURING": 362, "ENGINE": 363, "CHIME": 364, "RAINSTORM": 365, "MARBLE": 366, "RESTAURANT": 367, "WASHING": 368, "FISH": 369, "FIVE": 370, "ANNOUNCEMENT": 371, "CRUNCHY": 372, "WHISTLE": 373, 
"MOTORCYCLE": 374, "SCREAMING": 375, "BOOK": 376, "POUR": 377, "BLOW": 378, "CRUNCH": 379, "WAVE": 380, "PARROT": 381, "PIANO": 382, "SIXTY": 383, "FALLING": 384, "ROLLER": 385, "STATION": 386, "BLUE": 387, "BAND": 388, "AIRPORT": 389, "SWING": 390, "MOUSE": 391, "CONSTANTLY": 392, "THIRTYFIVE": 393, "VOICES": 394, "BALL": 395, "GRAVEL": 396, "FLIES": 397, "JAR": 398, "NORMAL": 399, "POOL": 400, "KITCHEN": 401, "PRINTER": 402, "NICE": 403, "TAP": 404, "FIFTY": 405, "SONG": 406, "ACCELERATING": 407, "THIRTY": 408, "LEFT": 409, "BROOK": 410, "WASHER": 411, "TALKING": 412, "INTERESTED": 413, "FEATHERS": 414, "WELDING": 415, "SIDEWALK": 416, "NINE": 417, "ROCK": 418, "FIREPLACE": 419, "AQUARIUM": 420, "SEVENTEEN": 421, "OCEAN": 422, "SCRIBBLING": 423, "EXHAUST": 424, "ROCKS": 425, "NIGHT": 426, "EIGHT": 427, "AIRPLANE": 428, "HEAVY": 429, "FUEL": 430, "CLAPPING": 431, "RICE": 432, "NONE": 433, "SEVENTYTHREE": 434, "BUZZ": 435, "RAINDROPS": 436, "MIDDLE": 437, "BOAT": 438, "ANNOUNCER": 439, "HOME": 440, "SPLASH": 441, "HAPPY": 442, "BUBBLEWRAP": 443, "MOTOR": 444, "HIKING": 445, "WAKING": 446, "STEAL": 447, "PULLED": 448, "LAUGH": 449, "BELL": 450, "OBJECT": 451, "FINGERS": 452, "TUB": 453, "TRUMPET": 454, "SHORE": 455, "QUIET": 456, "MOTORBIKE": 457, "HOUSE": 458, "SPRAYER": 459, "FOOTSTEPS": 460, "CHIMES": 461, "HUMANS": 462, "PLAIN": 463, "TWELVE": 464, "WINTER": 465, "VIBRATING": 466, "BEES": 467, "CONSISTENT": 468, "FRYINGPAN": 469, "GROWLING": 470, "BLOWING": 471, "CUCKOO": 472, "SCRAPING": 473, "KIDS": 474, "NOISE": 475, "CLUCKING": 476, "STORMING": 477, "POP": 478, "SIXTH": 479, "CRASHING": 480, "PARTY": 481, "PEOPLE": 482, "DRILLING": 483, "STIRRING": 484, "COOKER": 485, "PEEING": 486, "LIGHTNING": 487, "ZERO": 488, "EXCITEMENT": 489, "STEPS": 490, "SCHOOL": 491, "READING": 492, "CLASSICAL": 493, "INCREASE": 494, "HISSING": 495, "SEESAW": 496, "FOUNTAIN": 497, "HEN": 498, "KEYBOARD": 499, "CHIRPING": 500, "SODA": 501, "USEFUL": 502, "STICK": 503, 
"CONFIGURATION": 504, "TWENTYSEVEN": 505, "CRUSHING": 506, "PUMP": 507, "LOCK": 508, "SING": 509, "INSTRUMENT": 510, "DRAWING": 511, "GUTTERS": 512, "CLICKING": 513, "ADULT": 514, "EGGS": 515, "CLOSING": 516, "HIGHWAY": 517, "STOMPING": 518, "DRIBBLING": 519, "GROUND": 520, "DUCKS": 521, "PLANE": 522, "PILOT": 523, "WHIRRING": 524, "MOVING": 525, "MIKE": 526, "ORGAN": 527, "CUTTING": 528, "TIRES": 529, "BUILDING": 530, "RAINING": 531, "KEY": 532, "MUG": 533, "LOUDER": 534, "CHILDREN": 535, "HAPPINESS": 536, "FIRST": 537, "WALK": 538, "NOTEBOOK": 539, "SAND": 540, "ELECTRONIC": 541, "AUTOMOBILES": 542, "STONE": 543, "THIRTYTHREE": 544, "VACUUMING": 545, "GOOD": 546, "TEN": 547, "BRAKES": 548, "TWENTYTHREE": 549, "SEAGULL": 550, "CRISPY": 551, "TAPPING": 552, "SPOON": 553, "WIND": 554, "AUDITORIUM": 555, "SIRENS": 556, "SAW": 557, "OUTDOORS": 558, "SCREECHING": 559, "EATING": 560, "RACE": 561, "BIG": 562, "OUTSIDE": 563, "FOREST": 564, "HELICOPTER": 565, "SNOW": 566, "CLEANING": 567, "PIG": 568, "BAT": 569, "CONTINUOUSLY": 570, "THUNDERSTORM": 571, "LION": 572, "WOODPECKER": 573, "DOOR": 574, "DRINK": 575, "BATMINTON": 576, "OFFICE": 577, "XYLOPHONE": 578, "SCREAM": 579, "NEVER": 580, "FLOOR": 581, "MUSIC": 582, "WOOD": 583, "WHITE": 584, "COUGHING": 585, "FIGHTING": 586, "NESTS": 587, "TWO": 588, "SHOWER": 589, "ELECTRIC": 590, "SAWING": 591, "CRICKET": 592, "MALE": 593, "MOVEMENT": 594, "SIREN": 595, "SHOUTING": 596, "COFFEE": 597, "GATE": 598, "MEN": 599, "WATCH": 600, "BROOM": 601, "CITY": 602, "WORK": 603, "BIRDS": 604, "SHORT": 605, "CRACKERS": 606, "THIRTYEIGHT": 607, "GASOLINE": 608, "FOOT": 609, "VEHICLE": 610, "DEBRIS": 611, "SPILLING": 612, "RATTLING": 613, "WINDCHIME": 614, "SWINGING": 615, "RINSING": 616, "FIRE": 617, "WINDOW": 618, "FIFTEEN": 619, "SOFT": 620, "SHAKING": 621, "LAUGHING": 622, "TRAIN": 623, "ZOO": 624, "BRUSH": 625, "SYNTHESIZER": 626, "PENCIL": 627, "FLOWING": 628, "SIZZLING": 629, "METAL": 630, "LORRY": 631, "MOWING": 632, "PACKAGE": 
633, "TRAVELLING": 634, "TARP": 635, "BREEZY": 636, "ICE": 637, "RAPIDLY": 638, "PLASTIC": 639, "VEHICLES": 640, "NOTHING": 641, "ALWAYS": 642, "BANG": 643, "HUMAN": 644, "CHICKEN": 645, "TREES": 646, "FREEWAY": 647, "AC": 648, "TRAFFIC": 649, "BOOTS": 650, "NOISES": 651, "CLANKING": 652, "LEAVES": 653, "BOILING": 654, "STORE": 655, "GEESE": 656, "DRAIN": 657, "VERY": 658, "SPARROW": 659, "HINGE": 660, "KNOCKING": 661, "BASS": 662, "SLOW": 663, "GRAINS": 664, "CARDS": 665, "LAUGHTER": 666, "MULTIPLE": 667, "UP": 668, "FALL": 669, "HALL": 670, "HONK": 671, "FEMALE": 672, "CHIRP": 673, "DICE": 674, "CRANK": 675, "WHISPERING": 676, "HIGH": 677, "TWEET": 678, "CREEK": 679, "CARDBOARD": 680, "SPEAKER": 681, "PIPES": 682, "PERSON": 683, "STEADY": 684, "VACUUM": 685, "SUNNY": 686, "SHAKER": 687, "SKATEBOARD": 688, "HUNDRED": 689, "MEDIUM": 690, "TEETH": 691, "TOILET": 692, "TRUCK": 693, "KETTLE": 694, "VOICE": 695, "BALLS": 696, "FEET": 697, "QUICKLY": 698, "CAR": 699, "SEAWAVE": 700, "ARCADE": 701, "BAG": 702, "BUZZSAW": 703, "LATCH": 704, "STEAM": 705, "SEVEN": 706, "DRAINING": 707, "SCRAPER": 708, "PIPE": 709, "UMBRELLA": 710, "THROWING": 711, "INSECT": 712, "REFRIGERATOR": 713, "THIRTYONE": 714, "POT": 715, "CUTTER": 716, "MANY": 717, "BRANCHES": 718, "SEVERAL": 719, "THIRTYSEVEN": 720, "WHIR": 721, "RUSHING": 722, "STRAW": 723, "CLOUDS": 724, "FLIGHT": 725, "PRESS": 726, "TYPING": 727, "HAY": 728, "SCANNER": 729, "PLAYING": 730, "MACHINERY": 731, "BUCKET": 732, "SIGNAL": 733, "MILD": 734, "LOUDSPEAKER": 735, "TILE": 736, "SANDER": 737, "SHOE": 738, "ALUMINUM": 739, "GOAT": 740, "BRIDGE": 741, "NINETEEN": 742, "COW": 743, "MICROPHONE": 744, "COPIER": 745, "SIX": 746, "BEEPING": 747, "RIVER": 748, "PULLING": 749, "BUS": 750, "TELEVISION": 751, "DRUM": 752, "TOOLS": 753, "CLOSET": 754, "SOUND": 755, "CAT": 756, "CHAIR": 757, "PIGEON": 758, "HONKING": 759, "RACECAR": 760, "BATHING": 761, "CABINET": 762, "KNOCK": 763, "UTENSIL": 764, "INTERCOM": 765, "TUNNEL": 766, "HOG": 
767, "COMPUTER": 768, "HANDS": 769, "SEA": 770, "COINS": 771, "SIXTEEN": 772, "DIESEL": 773, "TEAPOT": 774, "THUNDER": 775, "TWENTYTWO": 776, "VIOLIN": 777, "ACCELERATES": 778, "MORE": 779, "POLICE": 780, "GARBAGE": 781, "SUBWAY": 782, "BAD": 783, "MARKER": 784, "PURRING": 785, "LOCUST": 786, "LANDING": 787, "ONCE": 788, "DRILL": 789, "TYPEWRITER": 790, "TWENTYNINE": 791, "IRON": 792, "SORTER": 793, "SINK": 794, "BOWL": 795, "SILENCE": 796, "TIRE": 797, "TORNADO": 798, "CHURCH": 799, "STORM": 800, "TIMER": 801, "MIXING": 802, "FIREWORKS": 803, "TRIANGLE": 804, "SHOVEL": 805, "FROGS": 806, "FIRETRUCK": 807, "PAPER": 808, "DRIPPING": 809, "BREAKS": 810, "CAN": 811, "GLASS": 812, "OPEN": 813, "BALLOON": 814, "CHIMING": 815, "FADES": 816, "STAIRS": 817, "WINCH": 818, "BOOKS": 819, "CHAINSAW": 820, "UTENSILS": 821, "BIKE": 822, "BEGINNING": 823, "MARBLES": 824, "REVVING": 825, "TABLE": 826, "FOLDING": 827} -------------------------------------------------------------------------------- /nets/ours_MWAFM_Net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy 6 | import copy 7 | import math 8 | from nets.multi_attention import MultiScaleSelfAttention 9 | 10 | 11 | class QstEncoder(nn.Module): 12 | 13 | def __init__(self, qst_vocab_size, word_embed_size, embed_size, num_layers, hidden_size): 14 | 15 | super(QstEncoder, self).__init__() 16 | self.word2vec = nn.Embedding(qst_vocab_size, word_embed_size) 17 | self.tanh = nn.Tanh() 18 | self.lstm = nn.LSTM(word_embed_size, hidden_size, num_layers) 19 | self.fc = nn.Linear(2*num_layers*hidden_size, embed_size) # 2 for hidden and cell states 20 | 21 | def forward(self, question): 22 | 23 | qst_vec = self.word2vec(question) # [batch_size, max_qst_length=30, word_embed_size=300] 24 | qst_vec = self.tanh(qst_vec) 25 | qst_vec = qst_vec.transpose(0, 1) # 
[max_qst_length=30, batch_size, word_embed_size=300] 26 | self.lstm.flatten_parameters() 27 | _, (hidden, cell) = self.lstm(qst_vec) # [num_layers=2, batch_size, hidden_size=512] 28 | qst_feature = torch.cat((hidden, cell), 2) # [num_layers=2, batch_size, 2*hidden_size=1024] 29 | qst_feature = qst_feature.transpose(0, 1) # [batch_size, num_layers=2, 2*hidden_size=1024] 30 | qst_feature = qst_feature.reshape(qst_feature.size()[0], -1) # [batch_size, 2*num_layers*hidden_size=2048] 31 | qst_feature = self.tanh(qst_feature) 32 | qst_feature = self.fc(qst_feature) # [batch_size, embed_size] 33 | 34 | return qst_feature 35 | 36 | 37 | def _get_clones(module, N): 38 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 39 | 40 | class Encoder(nn.Module): 41 | 42 | def __init__(self, encoder_layer, num_layers, norm=None): 43 | super(Encoder, self).__init__() 44 | self.layers = _get_clones(encoder_layer, num_layers) 45 | self.num_layers = num_layers 46 | self.norm1 = nn.LayerNorm(512) 47 | self.norm2 = nn.LayerNorm(512) 48 | self.norm = norm 49 | 50 | def forward(self, src_a, mask=None, src_key_padding_mask=None): 51 | output_a = src_a 52 | 53 | for i in range(self.num_layers): 54 | output_a = self.layers[i](src_a, src_a, src_mask=mask,src_key_padding_mask=src_key_padding_mask) 55 | 56 | if self.norm: 57 | output_a = self.norm1(output_a) 58 | 59 | return output_a 60 | 61 | 62 | 63 | class MultiAttnLayer(nn.Module): 64 | 65 | # d_model=512, nhead=1, dim_feedforward=512), num_layers=1 66 | def __init__(self, d_model, nhead, window_size, dim_feedforward=512, dropout=0.1): 67 | super(MultiAttnLayer, self).__init__() 68 | 69 | 70 | self.self_attn = MultiScaleSelfAttention(num_attention_heads = nhead, 71 | hidden_size = d_model, 72 | attention_probs_dropout_prob = 0.0, 73 | attention_window = [window_size], 74 | attention_dilation = [1], 75 | attention_mode = 'sliding_chunks', 76 | autoregressive = False, 77 | layer_id=0) 78 | 79 | self.cm_attn = 
class Encoder_QA(nn.Module):
    """Stack of ``num_layers`` clones of ``encoder_layer`` applied in sequence.

    Each layer is called as ``layer(x, src_mask=..., src_key_padding_mask=...)``
    where ``x`` is the output of the previous layer (the input for the first).

    Args:
        encoder_layer: prototype layer; it is deep-copied ``num_layers`` times.
        num_layers: depth of the stack.
        norm: acts purely as a flag. When truthy, the built-in
            ``self.norm1`` (``LayerNorm(512)``) is applied to the final output;
            the object passed here is never called itself.
    """

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(Encoder_QA, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        # norm2 is unused in forward; it is kept so parameter names in
        # existing state dicts continue to match.
        self.norm1 = nn.LayerNorm(512)
        self.norm2 = nn.LayerNorm(512)
        self.norm = norm

    def forward(self, src_a, mask=None, src_key_padding_mask=None):
        """Run ``src_a`` through every layer of the stack and return the result."""
        output_a = src_a

        # Chain the layers: every layer consumes the previous layer's output.
        # Feeding ``src_a`` to each layer would make all but the last layer
        # wasted computation.
        # NOTE(review): checkpoints trained with the unchained loop will give
        # different outputs with this stack and may need re-training.
        for layer in self.layers:
            output_a = layer(output_a, src_mask=mask,
                             src_key_padding_mask=src_key_padding_mask)

        if self.norm:
            output_a = self.norm1(output_a)

        return output_a


class QAHanLayer(nn.Module):
    """Self-attention + feed-forward layer (post-norm residual form).

    ``forward`` takes batch-first input, permutes it to the sequence-first
    layout that ``nn.MultiheadAttention`` expects, and permutes the result
    back. ``cm_attn``, ``dropout11`` and ``activation`` are constructed but
    not used in ``forward``; they are kept so saved parameter names remain
    valid.

    Args:
        d_model: feature width.
        nhead: number of attention heads.
        dim_feedforward: hidden width of the feed-forward sub-layer.
        dropout: dropout probability.
    """

    def __init__(self, d_model, nhead, dim_feedforward=512, dropout=0.1):
        super(QAHanLayer, self).__init__()

        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.cm_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout11 = nn.Dropout(dropout)
        self.dropout12 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = nn.ReLU()

    def forward(self, src_a, src_mask=None, src_key_padding_mask=None):
        """Return the transformed features in the same ``[B, T, C]`` layout as ``src_a``."""
        src_a = src_a.permute(1, 0, 2)   # [B, T, C] -> [T, B, C]

        # Self-attention sub-layer with residual connection.
        src2 = self.self_attn(src_a, src_a, src_a, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src_a = src_a + self.dropout12(src2)
        src_a = self.norm1(src_a)

        # Position-wise feed-forward sub-layer with residual connection.
        src2 = self.linear2(self.dropout(F.relu(self.linear1(src_a))))
        src_a = src_a + self.dropout2(src2)
        src_a = self.norm2(src_a)

        return src_a.permute(1, 0, 2)    # back to [B, T, C]
class MWAFM_Net(nn.Module):
    """Audio question answering network with multi-scale window attention.

    ``forward`` projects audio and question features to a common width,
    applies a hybrid attention stage (question self-attention, audio
    self/cross attention), fuses four windowed-attention views of the audio
    (window sizes 2, 4, 6, 12), refines the fused audio with a stacked
    attention encoder and finally classifies the element-wise product of the
    pooled audio and question features.

    Args:
        d_model: width of the self/cross attention block. The remaining
            layers are built with a fixed width of 512, so values other than
            512 are not expected to work.
        nhead: heads of the self/cross attention block.
        dropout: dropout probability of the self/cross attention block.
        dim_feedforward: hidden width of the self/cross feed-forward layer.
        num_classes: number of answer classes produced by ``pred_fc``.
    """

    def __init__(self, d_model=512, nhead=1, dropout=0.1, dim_feedforward=512, num_classes=828):
        super(MWAFM_Net, self).__init__()

        # Input projections.
        self.audio_fc = nn.Linear(128, 512)

        self.question_fc = nn.Linear(300, 512)
        self.question_fc2 = nn.Linear(512, 512)

        self.question_encoder = QstEncoder(2000, 512, 512, 1, 512)
        self.word2vec = nn.Embedding(2000, 512)

        # One windowed-attention encoder per temporal scale.
        self.multi_scale_encoder_2 = Encoder(MultiAttnLayer(d_model=512, nhead=4, window_size=2, dim_feedforward=512), num_layers=1)
        self.multi_scale_encoder_4 = Encoder(MultiAttnLayer(d_model=512, nhead=4, window_size=4, dim_feedforward=512), num_layers=1)
        self.multi_scale_encoder_6 = Encoder(MultiAttnLayer(d_model=512, nhead=4, window_size=6, dim_feedforward=512), num_layers=1)
        self.multi_scale_encoder_12 = Encoder(MultiAttnLayer(d_model=512, nhead=4, window_size=12, dim_feedforward=512), num_layers=1)

        self.multi_scale_linear = nn.Linear(512, 512)
        self.multi_scale_dropout = nn.Dropout(0.1)
        self.multi_scale_norm = nn.LayerNorm(512)

        # Attention + feed-forward shared by SelfAttn / QuestionQuereidAttn.
        self.attn_qst_query = nn.MultiheadAttention(512, 4, dropout=0.1)
        self.qst_query_linear1 = nn.Linear(512, 512)
        self.qst_query_relu = nn.ReLU()
        self.qst_query_dropout1 = nn.Dropout(0.1)
        self.qst_query_linear2 = nn.Linear(512, 512)
        self.qst_query_dropout2 = nn.Dropout(0.1)
        self.qst_query_norm = nn.LayerNorm(512)

        # Self/cross attention block used by SelfCrossAttn.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.cm_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout11 = nn.Dropout(dropout)
        self.dropout12 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = nn.ReLU()

        self.tanh = nn.Tanh()

        # Classification head (combine_fc1 is defined but unused in forward).
        self.combine_fc1 = nn.Linear(1024, 512)
        self.combine_fc2 = nn.Linear(512, 256)
        self.pred_fc = nn.Linear(256, num_classes)

        self.multi_layers = Encoder_QA(QAHanLayer(d_model=512,
                                                  nhead=1,
                                                  dim_feedforward=512),
                                       num_layers=4)

    def _qst_query_ffn(self, attended):
        """Feed-forward + residual + LayerNorm applied after ``attn_qst_query``."""
        src = self.qst_query_linear1(attended)
        src = self.qst_query_relu(src)
        src = self.qst_query_dropout1(src)
        src = self.qst_query_linear2(src)
        src = self.qst_query_dropout2(src)
        return self.qst_query_norm(attended + src)

    def SelfAttn(self, quests_feat_input, key_value_feat):
        """Self-attend ``key_value_feat`` ([B, T, C]) and return [B, T, C].

        ``quests_feat_input`` is accepted for interface compatibility but is
        not used: query, key and value are all ``key_value_feat``.
        """
        # nn.MultiheadAttention expects Q, K, V as [T, B, C].
        key_value_feat_grd = key_value_feat.permute(1, 0, 2)
        key_value_feat_att = self.attn_qst_query(key_value_feat_grd, key_value_feat_grd, key_value_feat_grd,
                                                 attn_mask=None, key_padding_mask=None)[0]
        key_value_feat_att = self._qst_query_ffn(key_value_feat_att)
        return key_value_feat_att.permute(1, 0, 2)

    def SelfCrossAttn(self, src_q, src_v, src_mask=None, src_key_padding_mask=None):
        """Update ``src_q`` with cross-attention over ``src_v`` plus self-attention.

        Both inputs are [B, T, C]; the result has the layout of ``src_q``.
        """
        src_q = src_q.permute(1, 0, 2)
        src_v = src_v.permute(1, 0, 2)
        src1 = self.cm_attn(src_q, src_v, src_v, attn_mask=src_mask,
                            key_padding_mask=src_key_padding_mask)[0]
        src2 = self.self_attn(src_q, src_q, src_q, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src_q = src_q + self.dropout11(src1) + self.dropout12(src2)
        src_q = self.norm1(src_q)

        src2 = self.linear2(self.dropout(F.relu(self.linear1(src_q))))
        src_q = src_q + self.dropout2(src2)
        src_q = self.norm2(src_q)
        return src_q.permute(1, 0, 2)

    def QuestionQuereidAttn(self, quests_feat_input, key_value_feat):
        """Attend from ``key_value_feat`` (as query) to the question features.

        The question features act as key and value. Both inputs are
        [B, T, C]; the result has the layout of ``key_value_feat``.
        """
        qst_feat_query = quests_feat_input.permute(1, 0, 2)

        # nn.MultiheadAttention expects Q, K, V as [T, B, C].
        key_value_feat_grd = key_value_feat.permute(1, 0, 2)
        key_value_feat_att = self.attn_qst_query(key_value_feat_grd, qst_feat_query, qst_feat_query,
                                                 attn_mask=None, key_padding_mask=None)[0]
        key_value_feat_att = self._qst_query_ffn(key_value_feat_att)
        return key_value_feat_att.permute(1, 0, 2)

    def forward(self, audio, question):
        """Return answer logits of shape ``[B, num_classes]``.

        ``audio`` and ``question`` are batch-first feature sequences whose
        last dimension matches ``audio_fc`` / ``question_fc`` respectively.
        """
        # Feature input.
        audio_feat = self.audio_fc(audio)            # [B, T, C]
        qst_feat_grd = self.question_fc(question)

        # --------------- Hybrid attention ---------------
        qst_feat = self.SelfAttn(qst_feat_grd, qst_feat_grd)
        audio_feat = self.SelfCrossAttn(audio_feat, qst_feat_grd)

        # --------------- Multi-scale window attention ---------------
        # Each encoder maps [B, T, C] -> [B, T, C]. No mask is passed: the
        # encoder's second positional parameter is a mask, not a second
        # feature stream.
        scale_encoders = (self.multi_scale_encoder_2,
                          self.multi_scale_encoder_4,
                          self.multi_scale_encoder_6,
                          self.multi_scale_encoder_12)
        audio_feat_ws_sum = None
        for encoder in scale_encoders:
            scale_feat = encoder(audio_feat).permute(1, 0, 2)
            scale_feat = self.multi_scale_dropout(F.relu(self.multi_scale_linear(scale_feat)))
            audio_feat_ws_sum = scale_feat if audio_feat_ws_sum is None else audio_feat_ws_sum + scale_feat

        audio_feat_kv = audio_feat + audio_feat_ws_sum.permute(1, 0, 2)

        # Refine the *fused* features. Passing ``audio_feat`` here instead
        # would throw away the whole multi-scale stage computed above.
        # NOTE(review): checkpoints trained while the fused features were
        # discarded will produce different outputs with this path.
        audio_feat_kv = self.multi_layers(audio_feat_kv)
        audio_feat_kv = audio_feat_kv.mean(dim=1)

        # Fuse pooled audio and question features and classify.
        qst_feat = qst_feat.mean(dim=1)
        combine_feat = torch.mul(audio_feat_kv, qst_feat)

        combine_feat = F.relu(self.combine_fc2(combine_feat))
        feat_output = self.pred_fc(combine_feat)

        return feat_output
self.head_dim = int(hidden_size / num_attention_heads) 27 | self.embed_dim = hidden_size 28 | 29 | self.query = nn.Linear(hidden_size, self.embed_dim) 30 | self.key = nn.Linear(hidden_size, self.embed_dim) 31 | self.value = nn.Linear(hidden_size, self.embed_dim) 32 | 33 | self.query_global = nn.Linear(hidden_size, self.embed_dim) 34 | self.key_global = nn.Linear(hidden_size, self.embed_dim) 35 | self.value_global = nn.Linear(hidden_size, self.embed_dim) 36 | 37 | self.dropout = attention_probs_dropout_prob 38 | 39 | self.layer_id = layer_id 40 | self.attention_window = attention_window[self.layer_id] 41 | self.attention_dilation = attention_dilation[self.layer_id] 42 | self.attention_mode = attention_mode 43 | self.autoregressive = autoregressive 44 | assert self.attention_window > 0 45 | assert self.attention_dilation > 0 46 | assert self.attention_mode in ['tvm', 'sliding_chunks', 'sliding_chunks_no_overlap'] 47 | if self.attention_mode in ['sliding_chunks', 'sliding_chunks_no_overlap']: 48 | assert not self.autoregressive # not supported 49 | assert self.attention_dilation == 1 # dilation is not supported 50 | # assert self.attention_dilation == 2 # dilation is not supported 51 | 52 | def forward( 53 | self, 54 | hidden_states_q, # [B, T, C] 55 | hidden_states_k, 56 | hidden_states_v, 57 | attention_mask=None, 58 | head_mask=None, 59 | encoder_hidden_states=None, 60 | encoder_attention_mask=None, 61 | output_attentions=False, 62 | ): 63 | 64 | # print("\n--->> Forward Info: ") 65 | # hidden states input: [B, T, C] 66 | 67 | ''' 68 | The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to 69 | -ve: no attention 70 | 0: local attention 71 | +ve: global attention 72 | ''' 73 | # print("Attention mask: ", attention_mask) 74 | 75 | assert encoder_hidden_states is None, "`encoder_hidden_states` is not supported and should be None" 76 | assert encoder_attention_mask is None, "`encoder_attention_mask` is not supported and shiould be None" 77 | 78 | if 
attention_mask is not None: 79 | # print("attention_mask is not None") 80 | attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1) 81 | key_padding_mask = attention_mask < 0 82 | extra_attention_mask = attention_mask > 0 83 | remove_from_windowed_attention_mask = attention_mask != 0 84 | 85 | num_extra_indices_per_batch = extra_attention_mask.long().sum(dim=1) 86 | max_num_extra_indices_per_batch = num_extra_indices_per_batch.max() 87 | 88 | if max_num_extra_indices_per_batch <= 0: 89 | extra_attention_mask = None 90 | else: 91 | # To support the case of variable number of global attention in the rows of a batch, 92 | # we use the following three selection masks to select global attention embeddings 93 | # in a 3d tensor and pad it to `max_num_extra_indices_per_batch` 94 | # 1) selecting embeddings that correspond to global attention 95 | extra_attention_mask_nonzeros = extra_attention_mask.nonzero(as_tuple=True) 96 | zero_to_max_range = torch.arange(0, max_num_extra_indices_per_batch, 97 | device=num_extra_indices_per_batch.device) 98 | # mask indicating which values are actually going to be padding 99 | selection_padding_mask = zero_to_max_range < num_extra_indices_per_batch.unsqueeze(dim=-1) 100 | # 2) location of the non-padding values in the selected global attention 101 | selection_padding_mask_nonzeros = selection_padding_mask.nonzero(as_tuple=True) 102 | # 3) location of the padding values in the selected global attention 103 | selection_padding_mask_zeros = (selection_padding_mask == 0).nonzero(as_tuple=True) 104 | else: 105 | remove_from_windowed_attention_mask = None 106 | extra_attention_mask = None 107 | key_padding_mask = None 108 | 109 | 110 | hidden_states_q = hidden_states_q.transpose(0, 1) 111 | hidden_states_k = hidden_states_k.transpose(0, 1) 112 | hidden_states_v = hidden_states_v.transpose(0, 1) 113 | 114 | seq_len, bsz, embed_dim = hidden_states_q.size() 115 | 116 | assert embed_dim == self.embed_dim 117 | q = 
self.query(hidden_states_q) 118 | k = self.key(hidden_states_k) 119 | v = self.value(hidden_states_v) 120 | q /= math.sqrt(self.head_dim) 121 | 122 | # print("num head: ", self.num_heads) 123 | # print("head dim: ", self.head_dim) # int(hidden_size / num_attention_heads) 124 | 125 | q = q.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) 126 | k = k.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) 127 | 128 | # print("long q: ", q.shape) 129 | # print("long k: ", k.shape) 130 | # print("long v: ", v.shape) 131 | 132 | # attn_weights = (bsz, seq_len, self.num_heads, window*2+1) 133 | if self.attention_mode == 'tvm': 134 | q = q.float().contiguous() 135 | k = k.float().contiguous() 136 | attn_weights = diagonaled_mm_tvm(q, k, self.attention_window, self.attention_dilation, False, 0, False) 137 | elif self.attention_mode == "sliding_chunks": 138 | attn_weights = sliding_chunks_matmul_qk(q, k, self.attention_window, padding_value=0) 139 | elif self.attention_mode == "sliding_chunks_no_overlap": 140 | attn_weights = sliding_chunks_no_overlap_matmul_qk(q, k, self.attention_window, padding_value=0) 141 | else: 142 | raise False 143 | 144 | # attn_weights = (bsz, seq_len, self.num_heads, window*2+1) 145 | mask_invalid_locations(attn_weights, self.attention_window, self.attention_dilation, False) 146 | 147 | if remove_from_windowed_attention_mask is not None: 148 | # This implementation is fast and takes very little memory because num_heads x hidden_size = 1 149 | # from (bsz x seq_len) to (bsz x seq_len x num_heads x hidden_size) 150 | remove_from_windowed_attention_mask = remove_from_windowed_attention_mask.unsqueeze(dim=-1).unsqueeze(dim=-1) 151 | # remove_from_windowed_attention_mask = remove_from_windowed_attention_mask.unsqueeze(dim=-1) 152 | # print("remove_from_windowed_attention_mask: ", remove_from_windowed_attention_mask.shape) 153 | # cast to float/half then replace 1's with -inf 154 | float_mask = 
remove_from_windowed_attention_mask.type_as(q).masked_fill(remove_from_windowed_attention_mask, -10000.0) 155 | repeat_size = 1 if isinstance(self.attention_dilation, int) else len(self.attention_dilation) 156 | float_mask = float_mask.repeat(1, 1, repeat_size, 1) 157 | ones = float_mask.new_ones(size=float_mask.size()) # tensor of ones 158 | # diagonal mask with zeros everywhere and -inf inplace of padding 159 | if self.attention_mode == 'tvm': 160 | d_mask = diagonaled_mm_tvm(ones, float_mask, self.attention_window, self.attention_dilation, False, 0, False) 161 | elif self.attention_mode == "sliding_chunks": 162 | d_mask = sliding_chunks_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) 163 | elif self.attention_mode == "sliding_chunks_no_overlap": 164 | d_mask = sliding_chunks_no_overlap_matmul_qk(ones, float_mask, self.attention_window, padding_value=0) 165 | attn_weights += d_mask 166 | 167 | 168 | assert list(attn_weights.size())[:3] == [bsz, seq_len, self.num_heads] 169 | assert attn_weights.size(dim=3) in [self.attention_window * 2 + 1, self.attention_window * 3] 170 | 171 | # the extra attention 172 | if extra_attention_mask is not None: 173 | selected_k = k.new_zeros(bsz, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) 174 | selected_k[selection_padding_mask_nonzeros] = k[extra_attention_mask_nonzeros] 175 | # (bsz, seq_len, num_heads, max_num_extra_indices_per_batch) 176 | selected_attn_weights = torch.einsum('blhd,bshd->blhs', (q, selected_k)) 177 | selected_attn_weights[selection_padding_mask_zeros[0], :, :, selection_padding_mask_zeros[1]] = -10000 178 | # concat to attn_weights 179 | # (bsz, seq_len, num_heads, extra attention count + 2*window+1) 180 | attn_weights = torch.cat((selected_attn_weights, attn_weights), dim=-1) 181 | attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability 182 | 183 | if key_padding_mask is not None: 184 | # softmax sometimes inserts NaN 
if all positions are masked, replace them with 0 185 | attn_weights_float = torch.masked_fill(attn_weights_float, key_padding_mask.unsqueeze(-1).unsqueeze(-1), 0.0) 186 | 187 | attn_weights = attn_weights_float.type_as(attn_weights) 188 | attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) 189 | v = v.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1) 190 | 191 | attn = 0 192 | if extra_attention_mask is not None: 193 | selected_attn_probs = attn_probs.narrow(-1, 0, max_num_extra_indices_per_batch) 194 | selected_v = v.new_zeros(bsz, max_num_extra_indices_per_batch, self.num_heads, self.head_dim) 195 | selected_v[selection_padding_mask_nonzeros] = v[extra_attention_mask_nonzeros] 196 | # use `matmul` because `einsum` crashes sometimes with fp16 197 | # attn = torch.einsum('blhs,bshd->blhd', (selected_attn_probs, selected_v)) 198 | attn = torch.matmul(selected_attn_probs.transpose(1, 2), selected_v.transpose(1, 2).type_as(selected_attn_probs)).transpose(1, 2) 199 | attn_probs = attn_probs.narrow(-1, max_num_extra_indices_per_batch, attn_probs.size(-1) - max_num_extra_indices_per_batch).contiguous() 200 | 201 | if self.attention_mode == 'tvm': 202 | v = v.float().contiguous() 203 | attn += diagonaled_mm_tvm(attn_probs, v, self.attention_window, self.attention_dilation, True, 0, False) 204 | elif self.attention_mode == "sliding_chunks": 205 | attn += sliding_chunks_matmul_pv(attn_probs, v, self.attention_window) 206 | elif self.attention_mode == "sliding_chunks_no_overlap": 207 | attn += sliding_chunks_no_overlap_matmul_pv(attn_probs, v, self.attention_window) 208 | else: 209 | raise False 210 | 211 | attn = attn.type_as(hidden_states_q) # 将attn类型转为hidden_states类型 212 | assert list(attn.size()) == [bsz, seq_len, self.num_heads, self.head_dim] 213 | attn = attn.transpose(0, 1).reshape(seq_len, bsz, embed_dim).contiguous() 214 | 215 | # For this case, we'll just recompute the attention for these indices 
216 | # and overwrite the attn tensor. TODO: remove the redundant computation 217 | if extra_attention_mask is not None: 218 | selected_hidden_states = hidden_states.new_zeros(max_num_extra_indices_per_batch, bsz, embed_dim) 219 | selected_hidden_states[selection_padding_mask_nonzeros[::-1]] = hidden_states_q[extra_attention_mask_nonzeros[::-1]] 220 | 221 | q = self.query_global(selected_hidden_states) 222 | k = self.key_global(hidden_states_k) 223 | v = self.value_global(hidden_states_v) 224 | q /= math.sqrt(self.head_dim) 225 | 226 | q = q.contiguous().view(max_num_extra_indices_per_batch, bsz * self.num_heads, self.head_dim).transpose(0, 1) # (bsz*self.num_heads, max_num_extra_indices_per_batch, head_dim) 227 | k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) # bsz * self.num_heads, seq_len, head_dim) 228 | v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1) # bsz * self.num_heads, seq_len, head_dim) 229 | attn_weights = torch.bmm(q, k.transpose(1, 2)) 230 | assert list(attn_weights.size()) == [bsz * self.num_heads, max_num_extra_indices_per_batch, seq_len] 231 | 232 | attn_weights = attn_weights.view(bsz, self.num_heads, max_num_extra_indices_per_batch, seq_len) 233 | attn_weights[selection_padding_mask_zeros[0], :, selection_padding_mask_zeros[1], :] = -10000.0 234 | if key_padding_mask is not None: 235 | attn_weights = attn_weights.masked_fill( 236 | key_padding_mask.unsqueeze(1).unsqueeze(2), 237 | -10000.0, 238 | ) 239 | attn_weights = attn_weights.view(bsz * self.num_heads, max_num_extra_indices_per_batch, seq_len) 240 | attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32) # use fp32 for numerical stability 241 | attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training) 242 | selected_attn = torch.bmm(attn_probs, v) 243 | assert list(selected_attn.size()) == [bsz * self.num_heads, max_num_extra_indices_per_batch, self.head_dim] 
from typing import Union
from functools import lru_cache

import torch
import os.path


class DiagonaledMM(torch.autograd.Function):
    '''Autograd function around a TVM-compiled "diagonaled" matrix multiplication.

    The class compiles the kernel with TVM (or loads a previously exported
    library from disk), wraps it as a PyTorch callable and exposes it through
    `forward`/`backward`.
    '''

    function_dict = {}  # wrapped kernels already loaded, keyed by (dtype, device)

    @staticmethod
    def _compile_function(dtype: str, device: str, b0: int = 4, b1: int = 4, b2: int = 16):
        '''Compiles a tvm function that computes diagonal_mm
        args:
        dtype: str in ['float64', 'float32', 'float16']
        device: str in ['cpu' or 'cuda']
        b0, b1, b2: size of tensor tiles. Very important for good performance

        returns: the module produced by `tvm.build`
        '''
        import tvm  # import the full tvm library here for compilation. Don't import at the top of the file in case we don't need to compile
        from tvm.contrib import nvcc
        @tvm.register_func
        def tvm_callback_cuda_compile(code):
            """Use nvcc compiler for better perf."""
            ptx = nvcc.compile_cuda(code, target="ptx", arch='sm_52')  # use old arch for this to work on old GPUs
            return ptx

        assert dtype in ['float16', 'float32', 'float64']
        assert device in ['cpu', 'cuda']
        device = None if device == 'cpu' else device
        tgt_host="llvm"

        b = tvm.var('b')  # batch size
        n = tvm.var('n')  # sequence length
        h = tvm.var('h')  # number of heads
        m = tvm.var('m')  # hidden dimension
        w = tvm.var('w')  # window size
        w_upper = tvm.var('w_upper')  # window size to the right of the word. Should be `0` or `w`
        padding = tvm.var('padding')  # padding
        transpose_t1 = tvm.var('transpose_t1')  # t1 should be transposed
        t1d3 = tvm.var('t1d3')  # last dimension of t1
        t3d3 = tvm.var('t3d3')  # last dimension of t3 (the result tensor)
        X = tvm.placeholder((b, n, h, t1d3), name='X', dtype=dtype)  # first tensor
        Y = tvm.placeholder((b, n, h, m), name='Y', dtype=dtype)  # second tensor
        k = tvm.reduce_axis((0, t1d3), name='k')  # dimension to sum over
        D = tvm.placeholder((h), name='D', dtype='int')  # dilation per head
        output_shape = (b, n, h, t3d3)  # shape of the result tensor
        algorithm = lambda l, i, q, j: tvm.sum(
            tvm.if_then_else(
                t3d3 == m,  # if output dimension == m, then t1 is diagonaled (FIXME: This breaks if t3d3 == m == t1d3)
                tvm.if_then_else(
                    transpose_t1 == 0,
                    tvm.if_then_else(
                        tvm.all(
                            i + D[q] * (k - w) >= 0,
                            i + D[q] * (k - w) < n,
                        ),
                        X[l, i, q, k] * Y[l, i + D[q] * (k - w), q, j],  # t1 is diagonaled
                        padding
                    ),
                    tvm.if_then_else(
                        tvm.all(
                            i + D[q] * (k - w_upper) >= 0,  # `w_upper` to handle the case `autoregressive=True`
                            i + D[q] * (k - w_upper) < n,
                        ),
                        X[l, i + D[q] * (k - w_upper), q, (w_upper + w) - k] * Y[l, i + D[q] * (k - w_upper), q, j],  # t1 is diagonaled and should be transposed
                        padding
                    ),
                ),
                tvm.if_then_else(
                    tvm.all(
                        i + D[q] * (j - w) >= 0,
                        i + D[q] * (j - w) < n,
                    ),
                    X[l, i, q, k] * Y[l, i + D[q] * (j - w), q, k],  # t1 is not diagonaled, but the output tensor is going to be
                    padding
                )
            ), axis=k)

        Z = tvm.compute(output_shape, algorithm, name='Z')  # automatically generate cuda code
        s = tvm.create_schedule(Z.op)

        print('Lowering: \n ===================== \n{}'.format(tvm.lower(s, [X, Y, D], simple_mode=True)))

        # split long axis into smaller chunks and assign each one to a separate GPU thread/block
        ko, ki = s[Z].split(Z.op.reduce_axis[0], factor=b0)
        ZF = s.rfactor(Z, ki)

        j_outer, j_inner = s[Z].split(s[Z].op.axis[-1], factor=b1)
        i_outer, i_inner = s[Z].split(s[Z].op.axis[1], factor=b2)

        s[Z].bind(j_outer, tvm.thread_axis("blockIdx.x"))
        s[Z].bind(j_inner, tvm.thread_axis("threadIdx.y"))

        s[Z].bind(i_outer, tvm.thread_axis("blockIdx.y"))
        s[Z].bind(i_inner, tvm.thread_axis("threadIdx.z"))

        tx = tvm.thread_axis("threadIdx.x")
        s[Z].bind(s[Z].op.reduce_axis[0], tx)
        s[ZF].compute_at(s[Z], s[Z].op.reduce_axis[0])
        s[Z].set_store_predicate(tx.var.equal(0))

        print('Lowering with GPU splits: \n ===================== \n{}'.format(tvm.lower(s, [X, Y, D], simple_mode=True)))

        # compiling the automatically generated cuda code
        diagonaled_mm = tvm.build(s, [X, Y, Z, D, w, w_upper, padding, transpose_t1, t3d3], target=device, target_host=tgt_host, name='diagonaled_mm')
        return diagonaled_mm

    @staticmethod
    def _get_lib_filename(dtype: str, device: str):
        '''Returns the relative path of the exported library for `dtype` and `device`.'''
        base_filename = 'multi_scale/lib/lib_diagonaled_mm'
        return '{}_{}_{}.so'.format(base_filename, dtype, device)

    @staticmethod
    def _save_compiled_function(f, dtype: str, device: str):
        '''Exports the compiled function `f` to the path given by `_get_lib_filename`.'''
        # `exist_ok=True` replaces the separate existence check, which could fail
        # if another process created the directory between the check and the call
        os.makedirs('multi_scale/lib/', exist_ok=True)
        f.export_library(DiagonaledMM._get_lib_filename(dtype, device))

    @staticmethod
    def _load_compiled_function(dtype: str, device: str):
        '''Loads a previously exported library, or returns None if none is found.

        The relative library path is looked up under a few directories: above
        and at the current working directory, next to this file and one level
        above this file. The first existing file wins.
        '''
        from tvm.module import load  # this can be the small runtime python library, and doesn't need to be the whole thing
        filename = DiagonaledMM._get_lib_filename(dtype, device)
        current_dir = os.path.dirname(os.path.abspath(__file__))
        potential_dirs = ['../../', '../', './', f'{current_dir}/', f'{current_dir}/../']
        for potential_dir in potential_dirs:
            filepath = '{}{}'.format(potential_dir, filename)
            if os.path.isfile(filepath):
                print('Loading tvm binary from: {}'.format(filepath))
                return load(filepath)
        return None

    @staticmethod
    def _get_function(dtype: str, device: str):
        '''Loads the function from the disk or compile it'''
        # A list of arguments that define the function
        args = (dtype, device)
        if args not in DiagonaledMM.function_dict:
            diagonaled_mm = DiagonaledMM._load_compiled_function(dtype, device)  # try to load from disk
            if not diagonaled_mm:
                print('Tvm binary not found. Compiling ...')
                diagonaled_mm = DiagonaledMM._compile_function(dtype, device)  # compile
                DiagonaledMM._save_compiled_function(diagonaled_mm, dtype, device)  # save to disk
            # convert the tvm function into a pytorch function
            from tvm.contrib import dlpack
            diagonaled_mm_pytorch = dlpack.to_pytorch_func(diagonaled_mm)  # wrap it as a pytorch function
            # save the function into a dictionary to be reused
            DiagonaledMM.function_dict[args] = diagonaled_mm_pytorch  # save it in a dictionary for next time
        return DiagonaledMM.function_dict[args]

    @staticmethod
    def _diagonaled_mm(t1: torch.Tensor, t2: torch.Tensor, w: int, d: Union[torch.Tensor,int],
                       is_t1_diagonaled: bool = False, transpose_t1: bool = False, padding: int = 0,
                       autoregressive: bool = False):
        '''Calls the compiled function after checking the input format. This function is called in three different modes.
        t1 x t2 = r ==> t1 and t2 are not diagonaled, but r is. Useful for query x key = attention_scores
        t1 x t2 = r ==> t1 is diagonaled, but t2 and r are not. Useful to compute attention_scores x value = context
        t1 x t2 = r ==> t1 is diagonaled and it should be transposed, but t2 and r are not diagonaled. Useful in some of
                        the calculations in the backward pass.

        raises: AssertionError if the inputs are malformed, or if the hidden dimension of `t2`
                equals the number of diagonals (the kernel cannot tell the modes apart then).
        returns: a new tensor allocated with `t1.new_empty` and filled by the kernel
        '''
        dtype = str(t1.dtype).split('.')[1]
        device = t1.device.type
        assert len(t1.shape) == 4
        assert len(t1.shape) == len(t2.shape)
        assert t1.shape[:3] == t2.shape[:3]
        if isinstance(d, int):  # if d is an integer, replace it with a tensor of the same length
                                # as number of heads, and it is filled with the same dilation value
            d = t1.new_full(size=(t1.shape[2],), fill_value=d, dtype=torch.int, requires_grad=False)

        assert len(d.shape) == 1
        assert d.shape[0] == t1.shape[2]  # number of dilation scores should match number of heads
        b = t1.shape[0]  # batch size
        n = t1.shape[1]  # sequence length
        h = t1.shape[2]  # number of heads
        m = t2.shape[3]  # hidden dimension
        w_upper = 0 if autoregressive else w
        c = w_upper + w + 1  # number of diagonals

        # The kernel uses the size of the last dimension of the result as a proxy for
        # "is t1 diagonaled", which is ambiguous when m == c (see the FIXME in
        # `_compile_function`). Reject that case before loading or compiling anything,
        # and with an explicit raise so the check survives `python -O`.
        if m == c:
            raise AssertionError(
                f"Error: the hidden dimension {m} shouldn't match number of diagonals {c}"
            )

        if is_t1_diagonaled:
            assert t1.shape[3] == c
            r = t1.new_empty(b, n, h, m)  # allocate space for the result tensor
        else:
            assert not transpose_t1
            assert t1.shape[3] == m
            r = t1.new_empty(b, n, h, c)  # allocate space for the result tensor

        # gets function from memory, from disk or compiles it from scratch
        _diagonaled_mm_function = DiagonaledMM._get_function(dtype=dtype, device=device)

        # The last argument to this function is a little hacky. It is the size of the last dimension of the result tensor
        # We use it as a proxy to tell if t1_is_diagonaled or not (if t1 is diagonaled, result is not, and vice versa).
        # The second reason is that the lambda expression in `_compile_function` is easier to express when the shape
        # of the output is known
        # This function computes diagonal_mm then saves the result in `r`
        _diagonaled_mm_function(t1, t2, r, d, w, w_upper, padding, transpose_t1, m if is_t1_diagonaled else c)
        return r

    @staticmethod
    def _prepare_tensors(t):
        '''Fix `stride()` information of input tensor. This addresses some inconsistency in stride information in PyTorch.
        For a tensor t, if t.size(0) == 1, then the value of t.stride()[0] doesn't matter.
        TVM expects this value to be the `product(t.size()[1:])` but PyTorch sometimes sets it to `t.stride()[1]`.
        Here's an example to reproduce this issue:
            import torch
            print(torch.randn(1, 10).stride())
            > (10, 1)
            print(torch.randn(10, 1).t().contiguous().stride())
            > (1, 1)  # expected it to be (10, 1) as above
            print(torch.randn(10, 2).t().contiguous().stride())
            > (10, 1) # but gets the expected stride if the first dimension is > 1
        '''
        assert t.is_contiguous()
        t_stride = list(t.stride())
        t_size = list(t.size())
        # Fix wrong stride information for the first dimension. This occurs when batch_size=1
        if t_size[0] == 1 and t_stride[0] == t_stride[1]:
            # In this case, the stride of the first dimension should be the product
            # of the sizes of all other dimensions
            t_stride[0] = t_size[1] * t_size[2] * t_size[3]
            t = t.as_strided(size=t_size, stride=t_stride)
        return t

    min_seq_len = 16  # unexpected output if seq_len < 16

    @staticmethod
    def forward(ctx, t1: torch.Tensor, t2: torch.Tensor, w: int, d: Union[torch.Tensor,int], is_t1_diagonaled: bool = False, padding: int = 0, autoregressive: bool = False) -> torch.Tensor:
        '''Computes diagonal_mm of t1 and t2.
        args:
        t1: torch.Tensor = (batch_size, seq_len, num_attention_heads, hidden_size|number_of_diagonals).
            t1 can be a regular tensor (e.g. `query_layer`) or a diagonaled one (e.g. `attention_scores`)
        t2: torch.Tensor = (batch_size, seq_len, num_attention_heads, hidden_size). This is always a non-diagonaled
            tensor, e.g. `key_layer` or `value_layer`
        w: int = window size; number of attentions on each side of the word
        d: torch.Tensor or int = dilation of attentions per attention head. If int, the same dilation value will be used for all
            heads. If torch.Tensor, it should be 1D of length=number of attention heads
        is_t1_diagonaled: is t1 a diagonaled or a regular tensor
        padding: the padding value to use when accessing invalid locations. This is mainly useful when the padding
            needs to be a very large negative value (to compute softmax of attentions). For other usecases,
            please use zero padding.
        autoregressive: if true, return only the lower triangle
        returns: torch.Tensor = (batch_size, seq_len, num_attention_heads, hidden_size|number_of_diagonals)
            if t1 is diagonaled, result is non-diagonaled, and vice versa
        '''
        # only the sequence length is needed; the unpacking also requires t1 to have exactly 4 dimensions
        _, seq_len, _, _ = t1.size()
        assert seq_len >= DiagonaledMM.min_seq_len, 'avoid splitting errors by using seq_len >= {}'.format(DiagonaledMM.min_seq_len)  # FIXME
        ctx.save_for_backward(t1, t2)
        ctx.w = w
        ctx.d = d
        ctx.is_t1_diagonaled = is_t1_diagonaled
        ctx.autoregressive = autoregressive
        t1 = DiagonaledMM._prepare_tensors(t1)
        t2 = DiagonaledMM._prepare_tensors(t2)
        # output = t1.mm(t2)  # what would have been called if this was a regular matmul
        output = DiagonaledMM._diagonaled_mm(t1, t2, w, d, is_t1_diagonaled=is_t1_diagonaled, padding=padding, autoregressive=autoregressive)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        '''Computes the gradients of `forward` with respect to `t1` and `t2`.

        Returns one value per `forward` input after `ctx`: gradients for `t1` and `t2`,
        and None for the five non-tensor arguments.
        '''
        t1, t2 = ctx.saved_tensors
        w = ctx.w
        d = ctx.d
        is_t1_diagonaled = ctx.is_t1_diagonaled
        autoregressive = ctx.autoregressive
        if not grad_output.is_contiguous():
            grad_output = grad_output.contiguous()  # tvm requires all input tensors to be contiguous
        grad_output = DiagonaledMM._prepare_tensors(grad_output)
        t1 = DiagonaledMM._prepare_tensors(t1)
        t2 = DiagonaledMM._prepare_tensors(t2)
        # http://cs231n.github.io/optimization-2/
        # https://pytorch.org/docs/master/notes/extending.html
        # grad_t1 = grad_output.mm(t2)  # what would have been called if this was a regular matmul
        grad_t1 = DiagonaledMM._diagonaled_mm(grad_output, t2, w, d, is_t1_diagonaled=not is_t1_diagonaled, autoregressive=autoregressive)
        # grad_t2 = grad_output.t().mm(t1)  # or `grad_t2 = t1.t().mm(grad_output).t()` because `(AB)^T = B^TA^T`
        if is_t1_diagonaled:
            grad_t2 = DiagonaledMM._diagonaled_mm(t1, grad_output, w, d, is_t1_diagonaled=True, transpose_t1=True, autoregressive=autoregressive)
        else:
            grad_t2 = DiagonaledMM._diagonaled_mm(grad_output, t1, w, d, is_t1_diagonaled=True, transpose_t1=True, autoregressive=autoregressive)
        return grad_t1, grad_t2, None, None, None, None, None


def _get_invalid_locations_mask_fixed_dilation(seq_len: int, w: int, d: int):
    '''Builds the invalid-location mask for a single dilation value.

    One column is produced for each offset j in `range(-d * w, d, d)`
    (that is -d*w, ..., -d, 0); in that column the first `-j` positions are set
    to 1 and the rest stay 0.

    returns: uint8 CPU tensor of shape (seq_len, w + 1)
    '''
    diagonals_list = []
    for j in range(-d * w, d, d):
        diagonal_mask = torch.zeros(seq_len, device='cpu', dtype=torch.uint8)
        diagonal_mask[:-j] = 1  # for j == 0 this is the empty slice `[:0]`, so nothing is marked
        diagonals_list.append(diagonal_mask)
    return torch.stack(diagonals_list, dim=-1)

@lru_cache()
def _get_invalid_locations_mask(w: int, d: Union[torch.Tensor,int], autoregressive: bool, device: Union[str, torch.device]):
    '''Builds (and caches) the masks used by `mask_invalid_locations`.

    args:
    w: window size
    d: one dilation for all heads (int) or a 1D tensor with one dilation per head
    autoregressive: if true, no ending mask is built
    device: device the boolean masks are moved to

    returns: (affected_seq_len, beginning_mask, ending_mask) where `affected_seq_len` is an int,
        `beginning_mask` is a bool tensor of shape (1, affected_seq_len, 1 or num_heads, w + 1)
        and `ending_mask` is `beginning_mask` flipped along dims 1 and 3, or None if `autoregressive`
    '''
    if isinstance(d, int):
        affected_seq_len = w * d
        mask = _get_invalid_locations_mask_fixed_dilation(affected_seq_len, w, d)
        mask = mask[None, :, None, :]
    else:
        # convert to a Python int so the returned length has the same type in both branches
        affected_seq_len = w * int(d.max())
        # a separate loop name keeps the parameter `d` from being overwritten
        head_masks = [
            _get_invalid_locations_mask_fixed_dilation(affected_seq_len, w, head_dilation)
            for head_dilation in d.cpu().tolist()
        ]
        mask = torch.stack(head_masks, dim=-2)
        mask = mask[None, :, :, :]

    ending_mask = None if autoregressive else mask.flip(dims=(1, 3)).bool().to(device)
    return affected_seq_len, mask.bool().to(device), ending_mask

def mask_invalid_locations(input_tensor: torch.Tensor, w: int, d: Union[torch.Tensor, int], autoregressive: bool) -> None:
    '''Fills the invalid locations of a diagonaled tensor with -inf, in place.

    The first `affected_seq_len` positions of dim 1 are masked on the first `w + 1`
    entries of the last dim; unless `autoregressive`, the last `affected_seq_len`
    positions are masked on the last `w + 1` entries with the flipped mask.

    `input_tensor` is modified through views and nothing is returned.
    '''
    affected_seq_len, beginning_mask, ending_mask = _get_invalid_locations_mask(w, d, autoregressive, input_tensor.device)
    seq_len = input_tensor.size(1)
    beginning_input = input_tensor[:, :affected_seq_len, :, :w+1]
    beginning_mask = beginning_mask[:, :seq_len].expand(beginning_input.size())
    beginning_input.masked_fill_(beginning_mask, -float('inf'))
    if not autoregressive:
        ending_input = input_tensor[:, -affected_seq_len:, :, -(w+1):]
        ending_mask = ending_mask[:, -seq_len:].expand(ending_input.size())
        ending_input.masked_fill_(ending_mask, -float('inf'))


diagonaled_mm = DiagonaledMM.apply

# The non-tvm implementation is the default, we don't need to load the kernel at loading time.
# DiagonaledMM._get_function('float32', 'cuda')
'device', 'moved', 'typically', 'by', 'vocalization', 'something', 'usually', 'desert', 'blowing', 'natural', 'element', 'besides', 'trees', 'all', 'same', 'source', 'bouncing', 'separate', 'creaking', 'noises', 'were', 'knocking', 'door', 'opening', 'closing', 'window', 'lots', 'little', 'pieces', 'mess', 'sweeping', 'yelling', 'name', 'used', 'complete', 'action', 'glass', 'Which', 'part', 'room', 'affected', 'activity', 'taking', 'place', 'floor', 'liquid', 'dripping', 'thunder', 'falling', 'most', 'pig', 'background', 'crowd', 'moving', 'quickly', 'nature', 'constant', 'aeroplane', 'match', 'car', 'airplane', 'here', 'live', 'distortion', 'color', 'unknown', 'white', 'kind', 'chicken', 'duck', 'rooster', 'authorities', 'voices', 'vehicle', 'outer', 'space', 'The', 'lengthy', 'high-pitched', 'known', 'as', 'siren', 'six', 'barking', 'hen', 'geese', 'fighting', 'each', 'other', '', 'going', 'would', 'expect', 'hear', 'at', 'shopping', 'mall', 'first', 'half', 'fast', 'freight', 'times', 'struck', 'nineteen', 'eleven', 'twenty', 'hitting', 'metal', 'multiple', 'musical', 'instrument', 'drum', 'pause', 'repetitive', 'four', 'speaking', 'themselves', 'created', 'handling', 'once', 'stapler', 'conversing', 'anyone', 'how', 'Who', 'men', 'get', 'louder', 'change', 'pitch', 'ever', 'stop', 'middle', 'objects', 'mechanical', 'synthesizer', "people's", 'growing', 'over', 'time', 'stray', 'cats', 'crying', 'motorcycle', 'start', 'up', 'chatting', 'clanging', 'driving', 'dead', 'chirping', 'faintly', 'thoroughly', 'traffic', 'doing', 'produced', 'blow', "it's", 'manually', 'rhythm', 'class', 'instance', 'softest', 'guitar', 'stick', 'bell', 'area', 'dry', 'cat', 'lower', 'higher', 'some', 'points', 'again', 'none', 'string', 'plucked', 'squealing', 'machine', 'belt', 'motor', 'Do', 'machines', 'have', 'wheels', 'take', '"If', 'vehicles', 'traveling', 'high', 'speed', 'likely', 'on"', 'freeway', 'highway', 'racetrack', 'meowing', 'machinery', 'engine', 'plane', 'tractor', 
'recorded', 'messages', 'man', 'make', 'announcement', 'with', 'chime', 'for', 'passengers', 'gender', 'voice', 'female', 'announcer', 'do', 'flowing', 'pouring', 'heavier', 'weather', 'rainy', 'thundering', 'hollow', 'bucket', 'roof', 'communicating', '"Is', 'flow', 'pretty', 'much', 'unaltered', 'not', 'increased', 'or', 'decreased"', 'throughout', 'into', 'puddle', 'producing', 'substance', 'faucet', 'large', 'through', 'grass', 'beginning', "What's", 'tapping', 'TV', 'clanking', 'tin', 'pencil', 'shatter', 'rollers', 'nine', "someone's", 'speech', 'amplified', 'vibrating', 'humming', 'bus', 'heater', 'industrial', 'printing', 'birds).wav"', 'shoe', 'squeak', 'helicopter', 'passing', 'crow', 'ducks', 'clinking', 'toys', 'dining', 'participating', 'marathon', 'group', 'eating', 'sharp', 'utensils', 'dishes', 'whispering', 'hooting', 'repeatedly', 'dove', 'pigeon', 'owl', 'come', 'whistling', 'chiming', 'chimes', 'ringing', 'Where', 'church', 'house', 'fly', 'digging', 'gravel', 'outdoors', 'sort', 'located', 'zoo', 'elevator', 'steps', 'did', 'step', 'twentyseven', 'twentysix', 'walking', 'stairs', 'down', 'slide', 'inside', 'ticking', 'getting', 'trash', 'somebody', 'laying', 'plastic', 'crinkling', 'wrapping', 'fishing', 'Has', 'tap', 'been', 'left', 'stream', 'rapid', 'waterfall', 'like', 'scraped', 'silent', 'ball', 'sport', 'played', 'pingpong', 'children', 'playing', 'Did', 'pass', 'slowly', 'iron', 'steel', 'human', 'kinds', 'flapping', 'its', 'wings', 'insect', 'cricket', 'laughing', 'continuous', 'hissing', 'locust', 'woman', 'chirps', 'remaining', 'quiet', 'scratching', 'arguing', 'elephant', 'fish', 'alarm', 'go', 'off', 'lever', 'pushed', 'frozen', 'solid', 'tank', 'toilet', 'operating', 'revving', 'stopped', 'counting', 'race', 'thing', 'speaker', 'truck', 'loud', 'shore', 'operated', 'mopping', 'metallic', 'crumpled', 'paper', 'widely', 'kitchen', 'rustling', 'dryer', 'aluminum', 'foil', 'crushing', 'he', 'rattle', 'chipping', '8times', 'air', 
'fourteen', 'thrown', 'repeated', 'back', 'forth', 'rake', 'books', 'rotating', 'slow', 'clothes', 'living', 'apart', 'main', 'microwaved', 'food', 'clustered', 'remind', 'popcorn', 'lion', 'growling', 'tarp', 'summer', 'vary', 'crunchy', 'cleaned', 'washing', 'hose', 'washed', 'rubbed', 'out', 'interested', 'rubber', 'speak', 'breaking', 'continue', 'until', 'seven', 'piece', 'shattering', 'brushing', 'teeth', 'bubbling', 'ordinary', 'emergency', 'somewhere', 'shrill', 'ambulance', 'direction', 'insects', 'tree', 'chopped', 'buzzing', 'bee', 'loudest', "truck's", 'ship', 'items', 'sliced', 'receptacles', 'drinks', 'poured', 'bacon', 'cooking', 'serving', 'jar', 'cup', 'papers', 'torn', 'folding', 'touching', 'bubblewrap', 'chickens', 'roosting', 'hay', 'straw', 'hit', 'surfaces', 'violent', 'storm', 'precipitation', 'produces', 'composed', 'electrical', 'giving', 'television', "Who's", 'audience', 'soft', 'way', 'where', 'behave', 'goat', 'makes', 'sheep', 'non', 'creating', 'was', 'called"', 'ice', 'baby', 'spraying', 'held', 'cooing', 'say', 'word', 'drums', 'farm', 'crowing', 'using', 'sometimes', 'scrubbing', 'point', 'unconscious', 'clean', 'brush', 'broom', 'mammal', 'low', 'flying', 'jets', 'ocean', 'electricity', 'steady', 'forest', 'tv', 'static', 'whisper', 'frogs', 'croaking', 'flushed', 'rushing', 'rolled', 'staticy', 'electric', 'crackling', 'welding', 'dogs', "person's", 'words', 'wildly', 'quieter', 'interrupting', 'another', 'completion', 'purpose', 'meeting', 'useful', 'overpower', 'bangs', 'done', 'shooting', 'engines', 'accelerating', 'bike', 'die', 'problem', 'electronic', 'chattering', 'keep', 'changing', 'scraping', 'deep', 'base', 'stay', 'dissipate', 'peoples', 'rainfall', 'bass', 'called', 'rumbling', 'squished', 'splashing', 'pacing', 'manuevered', 'cleaning', 'wheelbarrow', 'washer', 'street', 'celebrating', 'bonfire', 'fire', 'burning', 'fuel', 'gasoline', 'jet', 'transportation', 'eight', 'paved', 'road', 'double', 'creatures', 
'sneezing', 'swimming', 'lay', 'eggs', 'egg', 'chewing', 'implement', 'artificial', 'turning', 'tool', 'starting', 'crank', 'wrench', 'wound', 'mower', '"Apart', 'heard"', 'whir', 'revved', 'uncovered', 'dull', 'thud', 'mariachi', 'practicing', 'lines', 'play', 'grated', 'grinding', 'across', 'light', 'cage', 'twentytwo', 'ten', 'chopping', 'flooring', 'tile', 'cutting', 'knife', 'bounce', 'cheering', 'rolling', 'throwing', 'breath', 'during', 'whole', 'breathe', 'entire', 'breathing', 'hard', 'quietly', 'behind', 'long', 'echo', 'short', 'tornado', 'thunderstorm', 'rapidly', 'caused', 'tightly', 'enclosed', 'steadily', 'broken', 'pipe', 'calling', 'rat', 'pour', 'tape', 'calm', 'draining', 'boiling', 'creature', 'chirp', 'furry', 'thirty', 'twentynine', 'before', 'concert', 'just', 'tested', 'loudspeaker', 'signal', 'Could', 'indicate', 'volume', 'fades', 'When', 'lowest', 'small', 'impacts', 'state', 'matter', 'periods', 'new', 'stirring', 'conclusion', 'feature', 'footsteps', 'fifty', 'fourty', 'jogging', 'initially', 'stepping', 'leaves', 'work', 'mat', 'putty', 'fart', 'boy', 'about', 'pants', 'crinkled', 'scrunched', 'grilling', 'his', 'foot', 'gas', 'production', 'turned', 'generator', 'within', 'wheel', 'child', 'crowded', 'drumming', 'repeat', 'could', 'knocked', 'together', 'breakable', 'anything', 'fall', 'things', 'In', 'found', 'seem', 'dings', 'typing', 'very', 'writing', 'pen', 'typewriter', 'keyboard', 'Aalborg', 'DK', '1900hrs.wav"', 'near', 'bells', '5', 'seconds', 'male', 'happening', 'hotel', 'bathroom', 'seventeen', 'cuckoo', 'machine.wav"', 'often', 'twice', 'level', 'sewing', 'sander', 'shaking', 'sheet', 'shaken', 'canvas', 'good', 'clucking', 'wolf', 'howling', 'rocks', 'fabrics', 'rubbing', 'kicked', 'marbles', 'clashing', 'pebbles', 'walks', 'shoes', 'boats', 'till', 'squeaking', 'always', 'constantly', 'swinging', 'rusty', 'swing', 'gate', 'surface', 'thumping', 'wearing', 'feet', 'boots', 'Voices', 'Traffic).wav"', 'talk', 'shots', 
'exhilarating', 'military', 'operation', 'clearly', 'women', 'remain', 'mike', 'quietest', 'flap', 'sixteen', 'moment', 'surfers', 'continually', 'decrease', 'intensity', 'covering', 'ground', 'directly', 'front', 'tub', 'monkey', 'subway', 'goes', 'flashing', 'station', 'wolves', 'night', 'day', 'morning', 'airport', 'various', 'bumps', 'twelve', 'rodent', 'turn', 'jackhammer', 'power', 'drill', 'beep', 'fluctuate', 'weather-related', 'grizzly', 'bear', 'repetetive', "'howling'", 'parakeet', 'impatient', 'gradually', 'becomes', 'intense', 'thirtyfive', 'Whats', 'banged', 'fan', 'speeding', 'number', 'lake', 'pop', 'dropping', 'drip', 'opened', 'park', 'fight', 'fence', 'violently', 'horse', 'escape', 'delicate', 'latch', 'box', 'dishwasher', 'performs', 'actions', 'lawn', 'lot', 'tearing', 'package', 'crickets', 'panting', 'badminton', 'past', 'babies', 'repetitions', 'cyclical', 'fifteen', 'sprinkler', 'man-made', 'comes', 'fabric', 'Would', 'commonplace', 'golf', 'course', 'idle', 'drive', 'grinder', 'idling', 'dirt', 'scooter', 'dirtbike', 'brass', 'plays', 'trumpet', 'crack', 'patient', 'lidocaine', 'spinning', 'distorted', 'On', 'coin', 'piano', 'ambience.wav"', 'clinging', 'silverware', 'tools', 'lightning', 'ignite', 'safe', 'emitted', 'cow', 'messing', 'mouth', 'whistle', 'tweeting', 'warning', 'blaring', 'occuring', 'naturally', 'occurring', 'phenomenon', 'clouds', 'occurs', 'without', 'intervention', 'consistently', 'wave', 'twentyone', 'twentyfive', 'involved', 'jungle', 'pitched', 'giggling', 'drizzling', 'cough', 'nothing', 'fireplace', 'uneven', 'polishing', 'sand', 'shut', 'associated', 'movement', 'polished', 'hinge', 'only.wav"', 'need', 'maintenance', 'should', 'applied', 'eliminate', 'oil', 'shouting', 'taps', 'open', 'teaspoon', 'continuously', 'slam', 'powerful', 'chips', 'bowl', 'scraper', 'hammer', 'wind-up', 'toy', 'fail', 'tension', 'released', 'movements', 'parts', 'pulling', 'while', 'winding-up', 'tapped', 'hands', 'indoor', 'location', 
'2', 'pounds', 'booming', 'weapon', 'faster', 'screeching', 'speeds', 'doorbell', 'concrete', 'raindrops', 'signs', 'pleasant', 'sunny', 'when', 'taken', 'stormy', 'grumbles', 'growls', 'auditorium', 'colliding', 'sounding', 'instances', 'collide', 'metals', 'spoon', 'city', 'crows', 'waiting', 'From', 'face', 'after', 'sniffing', 'racing', 'switch', 'hurry', 'sirens', 'slowing', 'silence', 'tracks', 'cicadas', 'softly', 'overhead', 'who', 'Didgeridoo', 'instrumental', 'rock', 'theremin', 'chiansaw', 'might', 'humans', 'frog', 'clicking', 'clicks', 'twentyfour', 'twentythree', 'clock', 'fruit', 'blender', 'watch', 'started', 'winning', 'bathing', 'walked', 'chair', 'every', 'unnatural', 'flipped', 'key', 'contact', 'involvement', 'swearing', 'body', 'character', 'obstructed', 'audible', 'pauses', 'between', "'breathing'", 'centered', 'fading', 'out"', 'itself', 'cutter', 'medium', 'heavy', 'fountains', 'fountain', 'office', 'environment', 'speaks', 'closest', 'mic', 'strike', 'chainsaw', 'locomotive', 'flashes', 'creates', 'lightening', 'quacking', 'nearer', 'everyone', 'fighter', 'sky', 'banging', 'fired', 'compacted', 'birdsong', 'crunching', 'yell', 'pace', 'increase', 'upon', 'snow', 'trotting', 'else', "child's", 'punk', 'sobbing', 'spinner', 'driller', 'saw', 'break', 'variations', 'Were', 'mean', 'firetruck', 'mooing', '"What', 'voices"', 'drain', 'gurgle', 'duckie', 'filling', 'sink', 'basin', 'facet', 'pounding', 'wall', 'monkeys', 'those', 'due', 'deliberate', 'This', 'debris', 'hockey', 'path', 'cause', 'airline', 'setting', 'phone', 'clicked', 'mouse', 'shower', 'bedroom', 'pan', 'jostled', 'whale', 'best', 'describes', 'buzz', 'cut', 'eventually', 'sew', 'construction', 'cold', 'forefront', 'horns', 'llamas', 'chomping', 'heavily', 'sleeping', 'tuba', 'table', 'whining', 'cane', '"After', 'runs', 'off"', 'then', 'seat', 'gush', 'urinating', 'Kind', 'drilling', 'vibrations', 'regular', 'follow', 'tones', 'noisy', 'filming', 'flood', 'sing', 'closed', 
'fluid', 'wd-40', 'squeaky', 'recycled', 'styrofoam', 'vessel', 'commonly', 'bottle', 'tropical', 'also', 'crash', 'seawave', 'sea', 'applauding', 'storming', 'calls', 'cleaver', 'steak', 'hum', 'trying', 'communicate', 'parrot', 'happiness', 'slurping', 'consistant', 'dribbling', 'wlking', 'next', 'habour.wav"', 'resistance.wav"', 'thuds', 'celtic', 'genre', 'creak', 'slamming', 'gates', 'push', 'motored', 'windy', 'airplane"', 'last', 'riding', 'rickshaw', 'beeps', 'stuck', 'household', 'tunnel', 'task', 'percussion', 'spitting', 'care', 'oral', 'hygiene', 'drinking', 'cooked', 'slight', 'clang', 'fryingpan', 'normal', 'grill', 'sizzling', 'become', 'shakes', 'vibrates', 'thin', 'impacting', 'intermittently', 'rod', 'composition', 'rattles', 'inelegant', 'hawk', 'jump', 'dunked', 'receptacle', 'bugs', 'longer', 'couple', 'flute', 'organ', 'emitting', 'songs', 'band', 'party', 'windchime', 'xylophone', 'outdoor', 'characterized', 'lively', 'zooming', 'torque', 'per', 'rotation', 'jam', 'manual', 'labor', 'hammering', 'rotated', 'crunch', 'ratchet', 'beings', 'amusement', 'ride', 'transport', 'bison', 'moaning', 'semi', 'hand', 'strokes', 'crackled', 'rise', 'if', 'container', 'filled', 'term', 'wheeled', 'pilot', 'loudness', 'stomping', 'video', 'chopsticks', 'thirtyseven', 'forty', 'frying', 'scene', 'canopy.wav"', 'wet', 'sun', 'shining', 'carry', 'them', 'umbrella', 'inconsistent', 'jumping', 'woodpecker', 'second', 'suddenly', 'which', 'component', 'attacking', 'nests', 'traverse', 'ticks', 'seventythree', 'rhythmic', 'chatter', 'nice', 'legs', 'hundred', 'dat', 'trein', 'op', 'alle', 'tussengelegen', 'stations', 'zal', 'stoppen_100311.wav"', 'preceded', 'tone', 'railwaystation', 'Why', 'tiger', 'vocalizations', 'god', 'bark', 'side', 'attempts', 'unlock', 'turing', 'lock', 'rainforest', 'driver', 'pullover', 'trunk', 'slammed', 'sut', 'old', 'limp', 'as"', 'briskly', 'pavement', 'drops', 'abundant', 'splash', 'loudly', 'instruments', 'accompany', 'singer', 
'neighing', "man's", 'hoarse', 'shaker', 'snake', 'speakers', 'steeldrum', 'building', 'arcade', 'types', 'karate', 'purring', "cat's", 'strumming', 'top', 'fingers', 'copy', 'copies', 'print', 'status', 'shuffling', 'crisps', 'ring', 'mode', 'perform', 'reacting', 'interacted', 'up"', 'uninterrupted', 'ripped', 'steam', 'cap', 'let', 'interrupts', 'bang', 'airplanes', 'honking', '"When', 'doing"', 'slamping', 'striking', 'dried', 'drying', 'meant', 'right', 'now', 'mixer', 'landing', 'finish', 'dress', 'alive', 'huge', 'appliance', 'refrigerator', 'coffeemaker', 'both', 'utter', 'scream', 'help', 'gasps', "woman's", 'gasp', 'police', '"Does', 'though', 'meaning', 'starts', 'stops"', '"How', 'hearing', 'end"', 'covers', 'feathers', 'At', 'sandy', 'workshop', 'boat', 'growl', 'showers', 'faucets', "aren't", 'branches', 'aircraft', 'aircrafts', 'whooshing', 'thumps', 'sawmill', 'stepped', 'grains', 'spring', 'pot', 'wash', 'sprayer', 'plopping', 'happen', 'sample', 'pool', 'asking', 'questions', 'waters', 'program', 'occasionally', 'bad', 'sifted', 'book', 'snapped', 'selling', 'cleaner', 'handled', 'girl', 'fell', 'her', 'jingle', 'breaks', 'least', 'beat', 'began', 'configuration', 'rattling', 'evening', 'causes', 'rustle', 'ghost', 'tranquil', 'creek', 'milling', 'ac', 'breed', 'big', 'fade', 'flush', 'USA.wav"', 'swishes', 'sigh', 'ongoing', 'run', 'cabinet', 'drawer', 'repetitively', 'midway', 'powered', 'Wat', 'hail', 'cracking', 'Dies', 'even', 'slack', 'altogether', 'fairly', 'basketball', 'tires', 'laundry', 'sandwich', 'dominate', 'taping', 'packing', 'pattern', 'detected', 'South', 'Germany.wav"', 'english', 'store', 'cooker', 'A', "I'd", 'eighteen', 'gong', 'jelly', 'triangle', 'age', 'groups', 'kids', 'pound', 'ambient', 'never', 'among', 'stage', 'life', 'adult', 'sources', 'funeral', 'restaurant', 'prepared', 'busy', 'sensation', 'conduct', 'devices', '"In', 'garden', 'hold', 'whilst', 'watering', 'plants"', 'choir', 'sprayed', 'touches', 'create', 
'softer', 'whirring', 'squeaks', 'lying', 'cease', 'beaten', 'exclaim', 'chant', 'unison', 'sporting', 'mostly', 'will', 'crew', 'cabin', 'night.wav"', 'rather', 'exhaling', 'shortest', 'sixth', 'pitches', 'art', 'form', 'gets', 'happens', 'initial', 'carnival', 'driven', 'skateboarding', 'bicycle', 'staticky', 'standing', 'still', 'library', 'squawks', 'squawk', 'snoring', 'deeper', 'vehical', 'mechicanal', 'charge', 'blinds', 'drawn', 'completing', 'single', 'Towards', 'introduced', 'beeping', 'stopping', 'bathwater', 'east', 'meow', 'moderately', 'crackers', 'flock', 'everything', 'bumping', 'snorting', 'roaring', 'closer', 'further', 'gentle', 'somewhat', 'startling', 'albeit', 'so', 'often"', 'oiled', 'recently', 'has', 'variety', 'equipment', 'such', 'playground', 'garage', 'bug', 'public', 'intercom', 'show', 'joy', 'laugh', 'vocalizing', 'airsound', 'blue', 'asleep', 'bids', 'RPM', 'talking.wav"', 'cows', 'cawing', 'school', 'range', 'market', 'shouted', 'forcefully', 'arid', 'recreational', 'drink', 'alternate', 'irrigation', 'third', 'green', 'impact', 'Hi', 'Mom.', 'I', 'Crashed', 'Car.wav"', 'hesitating', 'sniffling', 'daytime', 'cicada', 'distinctive', 'womanly', 'screams', 'ordering', 'ham', 'melody', 'panthers', 'creaks', 'hiking', 'bumble', 'bees.wav"', 'non-buzzing', 'adults', 'louder"', 'split', 'blown', 'tuning', 'specific', 'broadcast', "that's", 'entertainment', 'purposes', 'snippets', 'channels', 'changed', 'result', 'knock', 'keys', 'bumped', 'safety', 'handsaw', 'vacuuming', 'operate', 'click', 'pet', 'oinking', 'saxophone', 'letter', 'common', 'black', 'prying', 'sitting', 'celaring', 'glasscontainer.wav"', 'locked', 'barefooted', 'holding', 'smooth', 'in-between', 'press', 'mumbling', 'Wood', 'Barefoot', 'Jumps', '&', 'Scuffs.wav"', 'cream', 'trucks', 'honks', 'requires', 'immediate', 'attention', '"The', 'en', 'route', 'general"', 'interacting', 'hall', 'roll', 'rolls', 'bag', 'slows', 'completely', 'bowling', 'alley', 'bees', 'terrain', 
'plain', 'dogs.wav"', 'rainning', 'intensify', 'foreground', 'walk', 'slicing', 'carrots', 'siren.wav"', 'interrupted', 'few', 'interference', 'laser', 'blasts', 'thirteen', 'spilling', 'game', 'basement', 'Soft.', 'Crickets.wav"', 'frequency', 'dinner', 'tweet', 'fixing', 'pipes', 'drainage', 'mechanism', 'stairway', 'stomp', 'may', 'reply', 'bath', "windchime's", 'sqeaking', 'breezy', 'annoying', 'shredded', 'dragging', 'coffee', 'farther', 'accelerates', 'return', 'piercing', 'sound"', '4', 'elements', 'tire', 'tick', 'diner', 'cock', 'goose', 'because', 'showering', 'condition', 'caught', 'ambience', '.wav"', 'reversing', 'spit', 'crunched', 'claps', 'gutters', 'screech', 'lawnmower', 'diesel', 'move', 'quickness', 'sixty', 'simply', 'animalistic', 'considered', 'hog', 'propellers', 'belong', 'distinct', 'NOT', 'rain"', 'tan', 'sliding', 'sneeze', '"who', 'talking"', 'try', 'comfort', 'barks', 'row', 'movie', 'horror', 'describe', 'rotate', 'conditioner', 'worn', 'mild', 'brakes', 'accelerate', 'buzzsaw', 'rural', 'wrapper', 'pages', 'twentyeight', 'struggling', 'magazine', 'wooden', 'plank', 'bubble', 'gum', 'peeing', 'logs', 'barefoot', 'staircase', 'coughing', 'rim', 'enables', 'leave', 'bare', "It's", 'Will', 'harder', 'heated', 'Sans', 'Feet.wav"', 'surfaced', 'knocks', 'thirtyeight', 'thirtyone', 'shaped', 'circles', 'bills', 'coins', 'skiing', 'tons', 'motors', 'neigh', 'shaving', 'splashed', 'dinging', 'winch', 'consistency', 'creaky', 'rhythmically', 'happened', 'spoons', 'plate', 'stops', 'overall', 'wind.wav"', 'halt', 'noticeably', 'frightened', 'repeating', 'brook', 'exhaust', 'decision', 'horses', 'flipping', 'rigid', 'sorter', 'violin', 'racket', 'spectator', 'watching', 'tennis', 'jingling', 'apparently', 'pleasure', 'listener', 'clap', 'stretched', 'capacity', 'stretching', 'limit', 'chain', 'feel', 'cloudy', 'football', 'initiated', 'cry', 'sidewalk', 'mug', 'performing', 'job', 'males', 'females', 'free', 'requests', 'probably', 'microphone', 
'related', 'travelling', 'mugs', 'turns', 'grumbling', 'automobiles', 'floorboard', 'travel', 'flight', 'closet', 'passed', 'track', '18', 'respond', 'onto', 'chirpping', 'circuit', 'less', 'microwave', 'punched', 'streaming', 'random', 'locusts', 'discernible', 'distant', 'underlying', 'crossing', 'slower', 'ding', 'objected', 'picnic', 'rushes', 'excessively', 'referred', 'what"', 'literally', 'hat', 'today', 'ages', 'mosquitoes', 'mosquito', 'kettle', 'boiled', 'slithering', 'gurgling', 'pick', 'wild', 'ziplining', 'stapled', 'stapling', 'rollercoaster', 'intervals', 'set', 'contain', 'individual', 'bathtub', 'splashes', 'bigger', 'roosters', 'scarping', 'stationary', 'lighting', 'shinning', 'fireworks', 'structure', 'enjoy', "children's", 'seesaw', 'computer', 'printer', 'alot', 'whales', 'cliff', 'razor', 'therapeutic', 'year', 'winter', 'rocket', 'enter', 'measuring', 'amount', 'waterflow', 'mountaintop', 'interrupt', 'roadway', 'zipping', 'zipper', 'foghorn', 'sounded', 'able', 'streets', 'highways', 'nose', 'vibration', 'pressing', 'drawing', 'squishing', 'pizza', 'woods', 'along', '"Besides', 'sharpened', 'too', 'sharpening', 'emit', 'intermittent', 'powering', 'background.wav"', 'jigsaw', 'above', 'clear', 'takes', 'activated', 'stone', 'flushing', 'normally', 'again"', 'audibly', 'railroad', 'cardboard', 'Any', 'tambourine', 'sung', 'rev', 'keychain', 'jars', 'crane', 'site', 'semitruck', 'reverberate', 'stacked', 'lifted', 'venue', 'cheer', 'honked', 'drilled', 'babbling', 'hte', 'sharpener', 'vaccum', 'Approximately', 'whacking', 'gardening', 'reaches', 'sunset', 'gears', 'stall', 'ripping', 'smash', 'send', 'Summer', 'Country.wav"', 'lions', 'wildlife', 'below', 'sparrow', 'quick', 'irregularly', 'tinny', 'performed', 'require', 'radial', 'arm', 'vacuum', 'tell', 'completed', 'anybody', 'breeze', 'sealed', 'booth', 'resemble', 'motorboat', 'marker', 'gongs', 'hits', 'batminton', 'liquids', 'tugboat', 'rabit', 'empty', 'congregating', 'rinsing', 
'chimpanzees', 'sure', 'plucking', 'strings', 'provides', 'backing', 'hiss', "vehicle's", 'echoing', 'maintain', 'thrum', "what's", 'trilling', 'sets', 'sand)_01.wav"', 'waking', 'squirrel', 'opens', 'fastened', 'zippers', 'zipped', 'coaxing', 'licking', 'pouting', 'baker', 'slapping', 'dough', 'patting', 'hardness', 'rails', 'RPMs', 'squeek', 'physical', 'mountains', 'extended', 'rings', 'sprinkling', 'accompanies', 'transmission', 'drizzle', 'gently', 'hurricane', 'documents', 'notebook', 'stacking', 'fill', 'flushes', 'announce', "he's", 'use', 'penguin', 'adjusting', 'hears', 'things"', 'drives', 'order', 'slid"', 'quicker', 'generated', 'pushing', 'slid', 'songbird', 'beautiful', 'sheeps', 'bleat', 'baa', 'plants', 'coolant', 'release', 'percolating', 'handle', 'cards', 'vibrate', 'GREENFIELD', 'PLACE', '(Snippet).wav"', 'means', 'effect', 'typical', 'warm', 'popping', 'actively', 'record', 'gushing', 'Praia', 'Grande.wav"', 'manipulating', 'ukulele', 'bellowing', 'scurrying', 'During', 'non-living', 'Cuba', '2008.wav"', 'field', 'severe', 'healthy', 'sized', 'eat', 'flies', 'beverage', 'dispensing', 'baking', 'falls', 'haul', 'garbage', 'planes', 'ajo', 'vedessa', 'maastossa', '_', 'jeep', 'atv', 'difficult', 'brief', 'stop.wav"', 'crush', 'crushed', 'shredding', 'roller', '"At', 'seems', 'analog', "isn't", 'tuned', 'in"', 'coalesce', 'singular', 'toward', 'technological', 'clichМ©d', "'old-fashioned'", 'copier', 'rainstorm', 'fizzing', 'soda', 'popped', 'accident', 'dark', 'monsters/', 'eachother', 'monster/animal', 'monster/animals', 'rush', 'hour', 'intensely', 'shop', 'total', 'spoke', 'full', 'sentence', 'fowl', 'surrounded', 'involving', 'ring.wav"', 'degree', 'pressure', 'stilettos', 'construction.wav"', 'abrasive', 'powers', 'rewind', 'opendoor.wav"', 'staying', 'increasing', 'herd', 'propeller', 'majority', 'travels', 'slowest', 'hardwood', 'faint', 'traffic)', '-', 'DR-100', '(omni)', 'audacity.wav"', 'rats', 'touch', 'hurt', 'heat', 'cycle', 
'lorry', 'compressed', 'motorbike', 'racecar', 'galloping', 'rail', 'tempo', 'quaking', 'unscrewed', 'longest', 'thump', 'conductor', 'beads', 'stir', 'cart', 'write', 'calmly', 'utensil', 'scribbling', 'cracks', 'private', 'sportscar', 'zoom', 'alert', 'boxes', 'angry', 'grow', 'sawing', 'build', 'smashed', 'clinks', 'clink', 'fluctuating', 'quality', 'actual', 'reminiscent', 'harmony', 'settings', 'rinse', 'spoken', 'sloshed', 'mowing', 'airbus', 'mowed', 'ignited', 'packet', 'professional', 'industry', 'works', 'depart.wav"', 'bumpy', 'undone', 'nail', 'balloons', 'dirty', 'spewed', 'knob', 'adjusted', 'page', 'repititious', 'sex', 'expectorates', 'whimpering', 'J', 'ends', 'R', 'blower', 'vegetables', 'appear', 'puppy', 'howl', 'IS', 'domesticated', 'shirtless', 'underground', 'establishment', 'pulls', 'object(s)', 'explosions', 'firework', 'explode', 'exploding', 'aquarium', 'absent', 'cadence', 'patterns', "UFO's", '50.wav"', 'glasses', 'non-glass', 'reading', 'keeping', 'timer', 'yapping', 'Giggleswick', 'England.wav"', 'tower', 'saying', 'hurts', 'swam', 'beating', 'orchestra', 'rope', 'wheeling', 'suitcase', 'shaved', 'puddles', 'manufactured', 'cookie', 'urgent', 'mud', 'carboard', 'vocal', 'clip"', 'laughter', 'rung', 'final', 'previous', 'ones', 'tent', "bird's", 'toothbrush', 'wtaer', 'bunch', 'radar', 'scanner', 'genders', 'deserted', 'easy', 'domestic', 'include', 'squishy', 'cracker', 'texture', 'crispy', 'bounced', 'celery', 'quack', 'harsh', 'general', 'process', 'carried', "engine's", 'sputtering', 'Put', 'pills', 'dice', 'sick', 'ingest', 'thunderclaps', 'thunderclap', 'frequent', 'discussion', 'really', 'exterior', 'reverberating', 'drop', 'burbling', 'sucking', 'finishes', 'cussing', 'blended', 'scratched', 'shape', 'size', 'tossing', 'needed', 'owls', 'ceramic', 'bashed', 'Fountain', 'fountains.wav"', '"Of', 'closer"', 'phenomena', 'behaving', 'balloon', 'inflated', 'beaks', 'floating', 'striking"', 'raid', 'hapening', 'scrapping', 
'thirtyfour', 'thirtythree', 'swooshing', 'possible', 'gun', 'series', 'skateboard', 'friction', 'resting', 'marble', 'put', 'pulse', 'vampire', 'maliciously', 'scifi', 'noticeable', 'pulsing', 'looking', 'slams', 'touched', 'injury', 'alight', 'rice', 'notes', 'shout', 'communication', 'control', 'utilized', 'mainland', 'rifle', 'circus', 'raise', 'young"', 'ping', 'snowing', 'pulled', 'drug', 'smoothly', 'relaxing', 'blare', 'utility', 'reverse', 'organically', 'grasshoppers', 'bottles', 'individuals', 'Frequently', 'improve', 'aspect', 'exercise', 'strart', 'forrest', 'head', 'protection', 'disturbed', 'real', 'identifiable', 'unchanged', 'tumbling', 'steal', 'swim', 'drained', 'pump', 'wetness', 'conveyed', 'context', "one's", 'interruption', 'cluck', 'roar', 'seemingly', 'young', 'asked', 'agitated', 'funny', 'People', 'rains', 'scarf', 'tongue', "animal's", 'grunts', 'mill', 'mixing', '"Are', 'fingernail', 'canvas"', '2.wav"', 'criminal', 'damage', 'meddling', 'attached', 'stones', 'spark', 'variation', 'bash', 'runner', 'motionless', 'accurate', 'granules', 'dust', 'collect', 'shovel', 'creepy', 'film', 'featured', 'cymbal', 'telephone', 'boil', 'whistle_1.wav"', 'teapot', 'stove', 'precedingly', 'distress', 'distinctly', 'Greece).wav"', 'hoot', 'camden', 'arkansas.wav"', 'These', 'indicative', 'pigeons', 'sifting', 'clinked', 'bodily', 'passes', 'knuckles', 'pots', 'pans', 'dolphin', 'leaf', 'blowers', 'switched', 'cascade', 'others', 'pogo', 'dancing', 'invention', 'organic', 'shovelling', 'ominous', 'flyover.wav"', 'squish', 'sticky', 'cement', 'stand', 'project', 'chanting', 'meditating', 'squeeze', 'category', 'flowers', 'seal', 'spray', 'latest', 'popular', 'classical', 'furniture', 'bubbles', 'repetitious', 'expelling', 'balls', 'larger', 'smaller', 'cur', 're', 'applaud', 'booing', 'operates', 'manufacturing', 'picked', 'ponged', 'blasting', 'screamo', 'jangled', 'visual', 'appearance', 'bat', 'whether', 'droning', 'slimly', 'bridge', 'season', 
'eyebrows', 'twisty', 'warping', 'distorts', 'destructible', 'unchanging', '17', 'paddling', 'canoe', 'prevalent', 'hundreds', '"Instead', 'caged', 'mockingbird', 'assembling', 'disassembling', 'crumpling', 'non-stop', 'endure', "can't", 'drumstick', 'unloading', 'sports', 'squek', 'putting', 'lotion', 'outset', 'winder', 'applause'] -------------------------------------------------------------------------------- /metadata/single_word_val_clean.csv: -------------------------------------------------------------------------------- 1 | file_name,QuestionText,answer 2 | "WavesOnTheShore.wav","What can be heard being moved?",WATER 3 | "WavesOnTheShore.wav","What is the person moving in?",WATER 4 | "Footsteps on Rocky Terrain.wav","What does it sound like?",WALKING 5 | "nightinggale2.wav","What could cause this type of alarm?",FIRE 6 | "nightinggale2.wav","What kind of siren is sounding?",ALARM 7 | "Lluvia agosto 2011.wav","How many cars pass by?",ONE 8 | "Lluvia agosto 2011.wav","What is moving?",CAR 9 | "watertab.wav","What is the water pouring into?",SINK 10 | "watertab.wav","What type of surface is the water being poured on?",METAL 11 | "miniature goats and sheep.wav","What animal is making the noise?",SHEEP 12 | "0211_170236 walk downstairs.wav","What kind of surface was the person treading on?",WOOD 13 | "rotatingdome.wav","What kind of vehicle is passing through the tunnel?",TRAIN 14 | "rotatingdome.wav","What object makes the loud noise at the end?",DOOR 15 | "screen-door-slam.wav","How many times can the sound be heard?",EIGHT 16 | "Doorbell harsh.wav","How many times is the buzz heard?",THREE 17 | "bird in the Hague at dawn 5.wav","What animal is there?",BIRD 18 | "bird in the Hague at dawn 5.wav","What type of location can the bird sounds be heard?",PARK 19 | "Caltrain Pushing Caltrain.wav","What is the vehicle that is making noise?",TRAIN 20 | "Breaking Glass .wav","What is being smashed multiple times?",GLASS 21 | "Elizabeth Evans Park - Mount Dora - 
June.wav","How many types of animals can be heard in this clip?",TWO 22 | "Elizabeth Evans Park - Mount Dora - June.wav","What is making the animal noise heard in this clip?",BIRD 23 | "Evening Atmosphere #2.wav","How many times does the dog bark?",NINE 24 | "Evening Atmosphere #2.wav","What is the dog barking at?",BIRDS 25 | "stclaude.wav","How many times does the thumping noise repeat itself?",TWO 26 | "Night Frogs.wav","What time of day is associated with these sounds?",NIGHT 27 | "a boy and 2 pigs.wav","How many people are talking to one another?",FOUR 28 | "a boy and 2 pigs.wav","What type of animal is making the animal noise?",PIG 29 | "International Harvester Scout II.wav","How many times does the engine stall?",THREE 30 | "Wind moaning through gap in door and house noises.wav","How is the weather?",RAINY 31 | "Cafeteria Ambience.wav","What is clanking in the scene?",DISHES 32 | "20130418_stream.09.wav","What is making the sound?",WATER 33 | "20130418_stream.09.wav","what type of animal can be found under the sound producing thing?",FISH 34 | "wind-sound-from-inside-car.wav","How many voices can be heard yelling in the inclement weather?",ZERO 35 | "wind-sound-from-inside-car.wav","What is blowing outside?",WIND 36 | "2012check_run.wav","How many vehicles are there?",TWO 37 | "2012check_run.wav","What are the formula one cars doing?",RACING 38 | "stone_well.wav","how many times does the machine try to start?",THREE 39 | "Uppsala Streetbusker accordion 1.wav","How many instruments are being played?",ONE 40 | "Uppsala Streetbusker accordion 1.wav","Which instrument is being played?",KEYBOARD 41 | "water slushing moderate speed.wav","What liquid is being played with?",WATER 42 | "water slushing moderate speed.wav","Where is this liquid located?",BUCKET 43 | "greece_melanes_cofee_1.wav","What is singing?",BIRD 44 | "LightRaininPinesMarch302013.wav","How is the weather here?",RAINY 45 | "storm is coming 15-11-2012.wav","What is the animal heard?",DOG 46 | "storm 
is coming 15-11-2012.wav","What type of weather is it?",RAIN 47 | "Walking_on_tarmac.wav","How many people are moving?",ONE 48 | "Walking_on_tarmac.wav","What's the person doing?",WALKING 49 | "20070824.supper.wav","How many people are heard speaking?",TWO 50 | "20070824.supper.wav","What is a person doing?",POURING 51 | "STE-027FIRE.wav","What is crackling in the background?",FIRE 52 | "glass (2).wav","How many silent lapses are during the buzzing sound?",ZERO 53 | "glass (2).wav","What sound is it?",ALARM 54 | "160917-eichelherr000.wav","What is volume of the bird sounds?",LOUD 55 | "160917-eichelherr000.wav","What kind of animal is this?",BIRD 56 | "Birds of Klein Profijt.wav","What kind of animals are there?",BIRDS 57 | "Drumming on a wine glass.wav","What is the object made of that is making the sound?",GLASS 58 | "greece_naxos_cicadas_4.wav","what animal is making the sound?",CRICKET 59 | "Los Angeles Bus Ride.wav","What is approaching to the people?",BUS 60 | "Los Angeles Bus Ride.wav","What is making the sound?",BUS 61 | "upanddownstairs.wav","What is the person walking on?",STAIRS 62 | "train.wav","What is the source of this noise?",TRAIN 63 | "train.wav","What type of vehicle makes this sound?",TRAIN 64 | "Avion.wav","Where does the object depart from and arrive to?",AIRPORT 65 | "vague_sable.wav","What is it water called when it builds up and crashes on the sand?",WAVES 66 | "vague_sable.wav","What is the the object making the noise?",WATER 67 | "bridge demolition pounding.wav","How many impacts can we hear ?",NINETEEN 68 | "Babble of Frogs 001.wav","Unlike domesticated pets, these animals are what?",DUCKS 69 | "Night Ambient.wav","How many crickets are there?",TWO 70 | "Night Ambient.wav","What is the insect doing?",CHIRPING 71 | "Spring Birds Raw (New Jersey).wav","What domestic animal likes to catch these creatures?",BIRDS 72 | "Spring Birds Raw (New Jersey).wav","What type of animal is making this sound?",BIRD 73 | "bathtub drain.wav","what is the 
liquid doing?",DRIPPING 74 | "bathtub drain.wav","what noise it the liquid making upon hitting the surface?",SPLASHING 75 | "Night in nature.wav","What are the animals doing?",CHIRPING 76 | "CoffeeShopChatter.wav","What are the people doing?",TALKING 77 | "espresso-maschine.wav","How many times does a person tap on wood at the beginning ?",TWO 78 | "glas-bubbels-def01.wav","what is the water doing?",BUBBLING 79 | "glas-bubbels-def01.wav","What plastic item is the person drinking from?",STRAW 80 | "WATER DRIP ECHO LOW PITCH COMPRESSED.wav","What is falling?",WATER 81 | "small dog leaves.wav","What is being crumpled?",PAPER 82 | "village bar.wav","Besides talking, what are the people doing?",EATING 83 | "village bar.wav","What gets closed at the end of the recording?",DOOR 84 | "fireworks1.wav","What is making the popping noise?",FIREWORKS 85 | "Thunder Outside.wav","How many strikes of thunder are there?",TWO 86 | "car-radio-am-noise2.wav","What is the pitch of the loudest sound?",HIGH 87 | "bellaromani.wav","How many times is the bell struck?",SIX 88 | "bellaromani.wav","What is being struck to make the sound?",GONG 89 | "laundry.machine.wav","What is the object making the rattling noise?",DRYER 90 | "FM Radio Tuning Sweep.wav","What object is having its channels being changed?",RADIO 91 | "machine1.wav","What is driving by?",CAR 92 | "filling-ice-cup.wav","What can the person do with the liquid next?",DRINK 93 | "filling-ice-cup.wav","What is the person doing with a liquid?",POURING 94 | "Rain drops on marquee.wav","what is the rain hitting?",ROOF 95 | "Rain drops on marquee.wav","Where is it raining?",ROOF 96 | "channel 2 now concludes its broadcast day.wav","Where might one see color bars accompanying this noise?",TELEVISION 97 | "metal-bell-percussion.wav","how many pauses are between each set of ringing?",TWO 98 | "metal-bell-percussion.wav","How many times does the bell ring?",THREE 99 | "sizzle 4.wav","how many times does the bottle pop?",ONE 100 | "0221 
Bar_terrace.wav","What are the people doing?",TALKING 101 | "Opening and Closing Bolt Door.wav","What activity involving the feet and legs can be heard?",WALKING 102 | "Deplacez-vous.wav","To what emergency vehicle does the siren belong to?",AMBULANCE 103 | "Sukhapha anchor chain.wav","What material is making the rattling?",METAL 104 | "Nord_Odal_Nyhus_04_juni_2011_quiet_forest_birds_insects_leaf_rustle_03.wav","What animal can be heard?",BIRD 105 | "Nord_Odal_Nyhus_04_juni_2011_quiet_forest_birds_insects_leaf_rustle_03.wav","What do background birds do?",CHIRP 106 | "Water_Drops_Falling.wav","What is making the crackling noise?",FIRE 107 | "Water_Drops_Falling.wav","What liquid is splashing on the ground ?",WATER 108 | "clanking lid.wav","what is the metal object doing?",MOVING 109 | "April dawn chorus Sydenham Hill.wav","What type of animal is heard?",BIRD 110 | "crumpleTissuePaper.wav","What creature is likely making this sound?",HUMAN 111 | "Distorted AM Radio noise.wav","What is that sound?",STATIC 112 | "pc_mouse.wav","How many different types of noises can be heard?",FIVE 113 | "pc_mouse.wav","How many times does the mouse click?",THREE 114 | "Running.wav","What does the person do at the end?",COUGH 115 | "Running.wav","what sound comes from the person's mouth as they run?",COUGHING 116 | "Squeaky Wood (Compilation).wav","How many times does the door creak?",TWELVE 117 | "Walking across carpeted floor with slippers.wav","What is the person walking on?",GRAVEL 118 | "quedlinburg castle.wav","What kind of animal is in the area?",BIRD 119 | "Walking alongside the road.wav","How is the person moving?",WALKING 120 | "Walking alongside the road.wav","How many cars passed by?",TWO 121 | "Building Site.wav","How many times is the metallic beating sound in the distance made?",FOUR 122 | "Building Site.wav","What gender are the humans making noise?",MALE 123 | "eraser.wav","What tool is being used?",HAMMER 124 | "Charleston Campus Summer.wav","What are the people 
doing?",TALKING 125 | "Big_Roundabout_Traffic.wav","How many vehicles can be heard?",FIVE 126 | "Big_Roundabout_Traffic.wav","What is the car driving on?",ROAD 127 | "AC Unit.wav","What happens to the sound at the end?",STOPS 128 | "Bathroom.wav","what is dripping?",WATER 129 | "Airplane Overhead.wav","What is flying overhead?",PLANE 130 | "Clothing_ShirtsandPants_Rustling.wav","What is the item being put into?",BAG 131 | "05769 carpenter's workshop ambience.wav","What is humming?",SAW 132 | "05769 carpenter's workshop ambience.wav","What type of work is being done?",SAWING 133 | "CicadasAPedreira.wav","What insect is making a sound?",CRICKETS 134 | "CicadasAPedreira.wav","What is making faint ripple sounds ever so slightly?",DOG 135 | "Walking in Grass in Evening with Loud Bird.wav","What is the person doing?",WALKING 136 | "Small growling dog.wav","How many dogs are growling ?",THREE 137 | "Small growling dog.wav","What is the animal doing?",GROWLING 138 | "Teluk Nipah 01.wav","What is making the noise in the water?",BOAT 139 | "Teluk Nipah 01.wav","What liquid can be heard?",WATER 140 | "20090407.cricket.real.close.wav","How can the pitch of this sound be described?",LOUD 141 | "Metal clatter drop.wav","How many times is the object dropped?",SIX 142 | "interference from wireless mouse on am radio.wav","How many 'bash' sounds are heard in the last two seconds?",SEVEN 143 | "interference from wireless mouse on am radio.wav","What is buzzing?",RADIO 144 | "outdoors ambient village bird distant neighbours children car.wav","How many times can you hear the inhale or exhale of a person's breath?",ONCE 145 | "17-Year Cicada Mating Call.wav","How many times does the alarm go off ?",TEN 146 | "FOLEY_Ext_Garbage_Hauling_001.wav","What was heard at the start of the audio recording?",DOOR 147 | "LOUD THUNDER - WITH RAIN HITTING UMBRELLA.wav","What booming noise can be heard coming from the sky?",THUNDER 148 | "CR FunnyMachine.wav","What kind of animals are around?",BIRDS 
149 | "Bell_Hotel_desk.wav","On how many separate occasions was there a pause between bell ringing multiple times?",THREE 150 | "slam.wav","How many times is the ball audibly smacked?",SIX 151 | "metal rain.wav","What instrument is being played?",SYNTHESIZER 152 | "metal rain.wav","What kind of music is this?",GUITAR 153 | "13. Crushing tin can.wav","How many milk cartons does this person open ?",TWO 154 | "3trump.wav","What element, when forced through this device, makes the sound?",AIR 155 | "3trump.wav","What part of the face is used to play this instrument?",MOUTH 156 | "Fence Hit_City ambience night.wav","How many times is there a banging noise?",TWELVE 157 | "Car starting (open hood).wav","What is being started?",CAR 158 | "Car starting (open hood).wav","What is the girl talking about?",CAR 159 | "Thunder Storm Daytona 3.wav","What was falling from the sky?",THUNDER 160 | "AGFA_1.wav","How many times does the same sound pattern repeat itself?",SIX 161 | "amazon 04.wav","What appendages do these creatures have, instead of arms?",FEATHERS 162 | "amazon 04.wav","What is the predominant animal heard?",BIRD 163 | "Davis.wav","If one were caught outside, how would they end up being?",WET 164 | "Davis.wav","What is happening?",WIND 165 | "20100515.park.ambiance.02.wav","What are the birds doing?",CHIRPING 166 | "SinkWater.wav","What closes and stops the flow of water?",FAUCET 167 | "SinkWater.wav","What's going down the drain?",WATER 168 | "Room tone for quiet bathroom.wav","What is rushing?",WATER 169 | "Room tone for quiet bathroom.wav","What is the sound quality?",GOOD 170 | "crowd booing.wav","What are the people doing?",CHEERING 171 | "crowd booing.wav","what noise do the people make?",CHEERING 172 | "Walla chatter, adults and children in auditorium.wav","What are the people doing?",TALKING 173 | "Walla chatter, adults and children in auditorium.wav","What type of animal is making the biological sounds?",HUMANS 174 | "020220_00.wav","how many people are 
speaking?",ONE 175 | "020220_00.wav","How many times does the man sneeze?",ZERO 176 | "Waterfall close.wav","If a radio was making this sound, what would that sound be called?",STATIC 177 | "Waterfall close.wav","What kind of noise is this?",RAIN 178 | "Organic sound.wav","The sound indicates that the object is doing what?",OPENING 179 | "water_stream_001.wav","What is running?",WATER 180 | "street steps child car.wav","How many people are chatting?",THREE 181 | "20091224.bells.01.wav","how many times does the bell ring?",THIRTY 182 | "20091224.bells.01.wav","The items making sounds are made from what material?",BELL 183 | "cat_purr_1.wav","What animal is heard here?",CAT 184 | "Remix of 104372__rutgermuller__Metal_Tube_Rolling_www.wav","What is making the noise?",BELLS 185 | "Birdsong2.wav","How many birds are there?",FOUR 186 | "Birdsong2.wav","What is making the chirping noise?",BIRD 187 | "creaky.wav","What sound does the radio make?",STATIC 188 | "Calle.wav","what are the cars doing?",MOVING 189 | "gasBubblesNoise.wav","How many varieties of sound can be heard?",THREE 190 | "fountain in store 001.wav","What luxury item is filled with this in a rich person's backyard?",WATER 191 | "fountain in store 001.wav","What substance is being heard?",WATER 192 | "Beer Pong Sounds ball table and cups.wav","What falls before the tapping begins?",BALL 193 | "Beer Pong Sounds ball table and cups.wav","What object is being hit?",BALL 194 | "Walking On Dry Leaves Normalised.wav","How many people can be heard walking?",ONE 195 | "Walking On Dry Leaves Normalised.wav","What is happening here?",WALKING 196 | "Ambience - St Kilda Beach - waves lapping rocks, people nearby, seagulls.wav","What might the person be doing?",DRIVING 197 | "NYC Subway Train Approach Doors Announce Depart.wav","What does the carriage train do towards the end of the clip?",STOP 198 | "NYC Subway Train Approach Doors Announce Depart.wav","what opens at the end?",SUBWAY 199 | 
"UnionStation06OutBack_BusyOutside.wav","What is the person doing?",DRIVING 200 | "Wind Noise Backyard.wav","What is dropping the debris?",PERSON 201 | "waves_1.wav","What body of water makes this sound?",OCEAN 202 | "windscreen wipers heavy rain.wav","What is the weather like?",RAINY 203 | "windscreen wipers heavy rain.wav","What is the windshield wipers wiping away off the car?",RAIN 204 | "110422_village_dusk.wav","The staccato sound heard at the beginning and the end is made by what kind of animal?",DOG 205 | "110422_village_dusk.wav","Whose voices can be faintly heard?",KIDS 206 | "street_ambience_day.wav","What are the people doing?",TALKING 207 | "CrunchingHinge.wav","How many screws is this person driving in to wood ?",THREE 208 | "CrunchingHinge.wav","What device attached to a frame is making this sound?",DOOR 209 | "Conversacion Punjabi.wav","What are the people doing?",TALKING 210 | "Koeien, R4 en riet Lichterveldestraat.wav","How many tires do these vehicles, when passenger sized, typically have?",FOUR 211 | "Koeien, R4 en riet Lichterveldestraat.wav","The objects in this location are usually powered by what?",GAS 212 | "New Lift.wav","How many times did a bell ring?",ONCE 213 | "New Lift.wav","What sound alerts that a door was opened?",BELL 214 | "Atlantic Ocean Waves.wav","How many cycles of the same noise can be heard?",FOUR 215 | "Atlantic Ocean Waves.wav","What can be heard crashing into the shore?",WAVES 216 | "CAGE ELAVATOR MUMBAI.wav","How many voices can be heard?",THREE 217 | "CAGE ELAVATOR MUMBAI.wav","What is making the loud clanging noises?",DOOR 218 | "corneille_city01.wav","How many birds are there?",TWO 219 | "corneille_city01.wav","What kind of animal is making noise?",BIRD 220 | "HammerDrill.wav","How many times does the wooden figure croak?",SIX 221 | "13_waiting_chitwan.wav","how many steps are taken?",THREE 222 | "13_waiting_chitwan.wav","What animal is chirping?",BIRD 223 | "outdoors ambient windy wind leaves rustle hum.wav","What 
is in the sky during this sound?",CLOUDS 224 | "Crunchy walk on pebbles.wav","What is the person doing?",WALKING 225 | "La Barca i La Tempesta.wav","What animal is making noise?",MOUSE 226 | "La Barca i La Tempesta.wav","What is making the tapping sound?",BIRD 227 | "Forest Birds .wav","How many times does someone laugh?",ZERO 228 | "Forest Birds .wav","What animal is singing?",BIRD 229 | "DoorSqueak.wav","Which door creak is the loudest?",LAST 230 | "pencil on paper.wav","People commonly do this action upon a sheet of what?",PAPER 231 | "pencil on paper.wav","What is someone writing on?",PAPER 232 | "indoors dorm dormitory ambient room tone distant traffic in street.wav","What is honking?",CAR 233 | "RBH_Household_shower 03.wav","What is person about to do?",SHOWER 234 | "RBH_Household_shower 03.wav","What room is this?",BATHROOM 235 | "Crunching sticks with feet.wav","Who is crinkling this object?",PERSON 236 | "driving_buying_beer_on_sunday.wav","What are the kids doing in the background?",PLAYING 237 | "driving_buying_beer_on_sunday.wav","where does the man get into after opening the door?",CAR 238 | "080902_00_machine_generators.wav","How many vehicles are there?",ONE 239 | "Fitness studio _ Gym ambience. Weights and equipment.wav","what is making the scrubbing noise?",MACHINE 240 | "Fitness studio _ Gym ambience. 
Weights and equipment.wav","What is the metal object?",HAMMER 241 | "Door Slam.wav","How many human voices are there?",ONE 242 | "Door Slam.wav","What is happening here?",SLAMMING 243 | "creeeeek-GAIN_01.wav","How many large creaks are there?",FOUR 244 | "070821_flsp_trail03.wav","What are the birds doing?",CHIRPING 245 | "Kitchen Noise From Distance.wav","how many footsteps are there?",ZERO 246 | "Kitchen Noise From Distance.wav","These noises from the person's shoes indicate that the person is doing what?",WALKING 247 | "Page turns and book close_open.wav","Of what man-made material are the pages made of?",PAPER 248 | "Page turns and book close_open.wav","What does the person close shut at the end?",BOOK 249 | "Dinosaur Footsteps-01.wav","How many thumps take place?",EIGHT 250 | "Morning Ride 2.wav","How many times does the vehicle change gear?",FOUR 251 | "Morning Ride 2.wav","What machine is making the sound?",MOTORCYCLE 252 | "Bathtub_Water-drain.wav","what is making the gurgling sound?",DRAIN 253 | "windy rain.wav","What is making the constant sound?",RAIN 254 | "windy rain.wav","What is the weather like?",RAINING 255 | "SheryT_mixdown.wav","What gender is mostly speaking?",MALE 256 | "SheryT_mixdown.wav","Where have all the people congregated?",RESTAURANT 257 | "Motor boat.wav","What is the large machine or vehicle doing?",RUNNING 258 | "paper_bag.wav","What is being ripped?",PAPER 259 | "Fuente Cotino 2.wav","What is splashing?",WATER 260 | "bandung-taxiradio-1.wav","How many people are talking?",THREE 261 | "bandung-taxiradio-1.wav","What are the people riding in?",CAR 262 | "20110422_shower.wav","What direction is the water moving?",DOWN 263 | "20110422_shower.wav","What kind of animals can be heard?",BIRDS 264 | "Atmos beach regular waves hit shore, birds mono.wav","How loud is the noise?",MEDIUM 265 | "110724_inriversidemus1.wav","What gender is the person talking?",FEMALE 266 | "110724_inriversidemus1.wav","What kind of noise are the people 
making?",TALKING 267 | "Crunchy Footsteps.wav","What is the condition of ground?",WET 268 | "Crunchy Footsteps.wav","What part of the body is striking the surface?",FEET 269 | "Walking on pebble beach.wav","What is the person walking on?",GRAVEL 270 | "Flipping Coin Can.wav","What is the size of the container used?",SMALL 271 | "Flipping Coin Can.wav","What kind of object is it?",TOY 272 | "birds_late_morning.wav","What closure is opened at the end?",CABINET 273 | "birds_late_morning.wav","What is the name of the animal that is audible?",BIRD 274 | "Rain with thunder in a city.wav","What sound are the birds making?",CHIRPING 275 | "Atmosfera Miasto Spokojna dzielnica rano.wav","what does the car do loudly?",ACCELERATING 276 | "Atmosfera Miasto Spokojna dzielnica rano.wav","What is speeding up and passing by?",CARS 277 | "Opening and closing curtain.wav","How many times is the metal dragged across the ground?",SEVEN 278 | "Opening and closing curtain.wav","what is being moved?",FABRIC 279 | "walking_on_snow_and_light_wind.wav","What is blowing?",WIND 280 | "walking_on_snow_and_light_wind.wav","What is someone doing?",WALKING 281 | "auto-rickshaw-trip.wav","What part of the body is used to initiate the beeping sound?",HAND 282 | "auto-rickshaw-trip.wav","What part of the car beeps?",HORN 283 | "Bath 01.wav","What is the liquid doing?",SPILLING 284 | "engine_vibrations_of_ferry_1.wav","For how much of the clip does the machine run?",MIDDLE 285 | "Strong wind in trees.wav","How many vehicles drive past?",TWO 286 | "Strong wind in trees.wav","What machine is heard?",ENGINE 287 | "Chopping Vegetables.wav","What item is used to make the cuts?",KNIFE 288 | "Chopping Vegetables.wav","What scrapes against the surface as a cut is made?",KNIFE 289 | "tram_prague_2stops_veryfewpeople_AMB_INT.wav","What gender is the person speaking?",FEMALE 290 | "2013-03-28 rain in the rainforest.wav","What is the rate of the rainfall?",HEAVY 291 | "2013-03-28 rain in the rainforest.wav","What 
weather condition is heard?",RAINFALL 292 | "elk car.wav","What is used to steer the machine?",WHEEL 293 | "Vomit, puking spilling water onto grass splat.wav","What body part is the person using to pour the liquid out?",HAND 294 | "Vomit, puking spilling water onto grass splat.wav","What liquid is repeatedly spilled?",WATER 295 | "Beach Wave Ambince .wav","Where is the water coming from?",OCEAN 296 | "20150720_boat.engine.wav","How many different noises can be heard?",THREE 297 | "20150720_boat.engine.wav","How many people can be heard talking ?",TWO 298 | "night in the countryside.wav","How many dogs are barking?",ONE 299 | "night in the countryside.wav","What animal is present?",DOG 300 | "living room tone ambient distant noises neighbours.wav","how many times is there a tap?",ZERO 301 | "110709_05 goma exhibit.wav","What is making the beating sound?",DRUM 302 | "Rusty old boat.wav","What high pitched noise is being made at the start?",SQUEAK 303 | "sparrows.wav","What do these creatures do that humans cannot?",FLY 304 | "sparrows.wav","What is the bird doing?",CHIRPING 305 | "glass d.wav","What object is making the high pitched noise?",HORN 306 | "Heavy Rain 1.wav","What is making the dripping noise?",RAIN 307 | "Kocking door and open door.wav","What material does it sound like the object being knocked on is made of?",WOOD 308 | "Curtain.wav","How many times is the object moved?",SIX 309 | "Lambs.wav","How many different animals are heard?",THREE 310 | "Lambs.wav","How many times does the main, loudest animal call?",THREE 311 | "20100801.wharf.silence.night.wav","What is the animal doing?",BARKING 312 | "INT Factory budimka, pozega.wav","How many times is there an air pressure release?",TWO 313 | "electric-screwdriver.wav","What kind of material is being dropped again and again?",METAL 314 | "More Amphitheatre Birds. Wav.wav","what could be used to reduce the noise?",STYROFOAM 315 | "More Amphitheatre Birds. 
Wav.wav","What type of sounds are there?",BIRDS 316 | "Writing with Pen.wav","What's the person doing?",SCRATCHING 317 | "WasherSpinCycle.wav","What is making that sound?",MACHINE 318 | "Saas-Fee Village Atmosphere and Church 100611.wav","How many instances of a bird chirping are there?",EIGHT 319 | "Saas-Fee Village Atmosphere and Church 100611.wav","How many times does the bell ring?",EIGHT 320 | "Fast stream _ small river.wav","What liquid is making the rushing noise?",WATER 321 | "Hanoi street walking.wav","how many high pitched brake squeaks are there?",ONE 322 | "Garage Doors Opening_Closing.wav","How many pauses in the sound are there?",ONE 323 | "Garage Doors Opening_Closing.wav","What animal sound can be heard when the grinding stops?",BIRD 324 | "Elysian Park - Picnic Area 2.wav","What animal can be heard?",DOG 325 | "Elysian Park - Picnic Area 2.wav","What noise does the dog make?",BARKING 326 | "rain_medium_thunders.wav","How many times can thunder be heard rumbling?",THREE 327 | "rain_medium_thunders.wav","What is booming in the background?",THUNDER 328 | "Library with Light Chatter.wav","What type of building are they located in?",OFFICE 329 | "heating_far away.wav","What is the water doing?",FLOWING 330 | "Hitting baseball w. 
wooden bat.wav","How many whacks can be heard?",FOUR 331 | "background of the side streets of Rhodes, scooter, tourists French and American, grinder.wav","What tool is being used to trim the trees?",CHAINSAW 332 | "20141026 Bangkok House Traffic Thunder Bird 01.wav","At the beginning what is the gender of the voice heard?",MALE 333 | "20141026 Bangkok House Traffic Thunder Bird 01.wav","What sort of animal can be heard calling?",BIRD 334 | "traffic stereo.wav","What are people doing?",DRIVING 335 | "traffic stereo.wav","Where are the cars driving?",HIGHWAY 336 | "20090407.toy.train.01.wav","What is making the animal sounds in the background?",BIRDS 337 | "second_floor_lav.wav","What pitch tone is the object making?",LOW 338 | "Machete vs frying pan 2.wav","How many times are things banged together?",NINE 339 | "Machete vs frying pan 2.wav","What is someone using?",HAMMER 340 | "Senseo_boil_norm.wav","how many rotations does object making the sound complete?",ZERO 341 | "Truck starts and stops_edit.wav","How many times does an engine start?",ONE 342 | "Truck starts and stops_edit.wav","What kind of vehicle is starting up?",CAR 343 | "LondonTraffic.wav","What is this activity called?",DRIVING 344 | "Ronda - Fountain near the Town Hall (general) - Fuente cerca del Ayuntamiento (general).wav","What object is sometimes carried by a person to stay dry when this is happening?",UMBRELLA 345 | "Ronda - Fountain near the Town Hall (general) - Fuente cerca del Ayuntamiento (general).wav","What weather event is taking place outside?",RAIN 346 | "nxSample012.wav","What object is creating the loud noise?",RADIO 347 | "peanutFarmDawnShort.wav","What animal does the singing?",BIRD 348 | "peanutFarmDawnShort.wav","What sound is the bird making?",CHIRP 349 | "NY subway.wav","How many people are talking?",ONE 350 | "NY subway.wav","What are the people doing?",TALKING 351 | "People talking while waiting the bus.wav","How many different people are speaking?",THREE 352 | "People talking 
while waiting the bus.wav","How many different woman are speaking?",THREE 353 | "Walking on dry grass.wav","How many steps does the person take?",FOURTY 354 | "Walking on dry grass.wav","What type of footwear are they wearing?",BOOTS 355 | "Elbe near Ovelgoenne.wav","How many times does the sea waves hit the shore ?",FIVE 356 | "Elbe near Ovelgoenne.wav","What covers the immediate area?",WATER 357 | "AMB_COLE.wav","What gathering occasion caused all these people in the clip to appear together in one place?",PARTY 358 | "AMB_COLE.wav","What rhythmic sound is heard at the beginning of the clip?",RUNNING 359 | "trafficrain.wav","In which instance of a passing vehicle is the engine louder?",SECOND 360 | "trafficrain.wav","What animal is making the most noise?",DOG 361 | "Tibetan Bells 192kHz Original.wav","What is the instrument being struck?",BELL 362 | "Hyeres street voices ambience f.wav","What is likely to be the gender of the person making loud footsteps?",FEMALE 363 | "Steam 20.wav","How many times does the MRI thud?",FOURTEEN 364 | "Steam 20.wav","What instrument is being played?",DRUM 365 | "Birds-Crow & Song Birds.wav","What type of bird is making this noise at the beginning ?",CROW 366 | "Subway_departure_from_station.wav","What energy source does this mode of transport use?",ELECTRICITY 367 | "metal_bowls_altered.wav","What kind of pitch does the sound have?",MEDIUM 368 | "DR0000_0020.wav","how is the weather?",RAINY 369 | "birdy.wav","how many birds are there?",TWO 370 | "birdy.wav","What are the birds doing?",CHIRPING 371 | "morning in the countryside.wav","What is the female called, of the bird first heard?",HEN 372 | "morning in the countryside.wav","Which animal is making the loudest sound?",ROOSTER 373 | "Hanoi streets.wav","What is heard beeping halfway through?",CAR 374 | "Hanoi streets.wav","What type of music genre is played in the middle ?",POP 375 | "CityPark Evening Moerputten NL 130510_01.wav","What animal is this?",BIRD 376 | 
"rain_on_a_roof_01.wav","What liquid is falling from the sky?",RAIN 377 | "rain_on_a_roof_01.wav","What weather phenomenon is taking place?",RAIN 378 | "Autumnal Ambient 24 Bits 48 Khz.wav","What loud sound is made toward the end?",BOAT 379 | "babbling brook 2.wav","What is running?",WATER 380 | "babbling brook 2.wav","what is the water doing?",FLOWING 381 | "12 noon church-bell 140310_0121.wav","what kind of building is nearby?",CHURCH 382 | "12 noon church-bell 140310_0121.wav","What sound can be heard other than bird song?",BELL 383 | "Whalesong.wav","What is this aniimal?",BEAR 384 | "Whalesong.wav","What makes the sound get louder?",CLOSE 385 | "HeavyRain.wav","How is the weather now?",RAINING 386 | "HeavyRain.wav","What is the rain falling on?",GROUND 387 | "Serving Water Quickly.wav","How many times can water be heard being poured?",TWO 388 | "Serving Water Quickly.wav","What is being poured?",WATER 389 | "Centurion Suburb Evening.wav","What can be heard falling?",WATER 390 | "sizzling oil.wav","How many taps are there?",ONE 391 | "sizzling oil.wav","What is the water doing?",FLOWING 392 | "20081130_walking_in_snow.wav","What are they walking in?",SAND 393 | "open and close pen.wav","How many times is the pen clicked?",TWENTY 394 | "open and close pen.wav","What useful activity can be done with this object, besides clicking it?",WRITING 395 | "100121.wav","How many separate explosion sounds were there?",ONE 396 | "100121.wav","What is causing that loud sound?",THUNDER 397 | "Rain WashingtonSt 1.wav","What can be heard falling?",RAIN 398 | "Rain WashingtonSt 1.wav","What sort of weather is heard?",RAIN 399 | "20090501.horse.neigh.wav","What animal that makes a sound is useful for home security?",DOG 400 | "20090501.horse.neigh.wav","What is the larger animal at the beginning and then at the end?",HORSE 401 | "Edit Radio .wav","What is the object that is being used?",RADIO 402 | "Menziken Sawmill.wav","how many wheel turns are audible?",FIFTY 403 | "Menziken 
Sawmill.wav","What type of vehicle?",TRAIN 404 | "Rain falling on a metal roof - 96 kHz _ 24 Bit.wav","What kind of object is the rain hitting?",ROOF 405 | "Rain falling on a metal roof - 96 kHz _ 24 Bit.wav","Where is this occuring?",OUTSIDE 406 | "invexdpo.wav","What instrument is being played?",ORGAN 407 | "invexdpo.wav","What kind of movies' can they use this instrumental?",HORROR 408 | "carpet_on_carpet.wav","The hammering sound is interspersed with what other noise?",SWEEPING 409 | "carpet_on_carpet.wav","What is one of the tools being used?",HAMMER 410 | "Song Birds-Lighthouse Park-March.wav","How does these types of creatures typically travel?",FLY 411 | "Song Birds-Lighthouse Park-March.wav","What the birds are doing?",CHIRPING 412 | "md1trk11.wav","How many human voices can be heard?",ZERO 413 | "Dog escapes from the room.wav","What is squeeling?",DOG 414 | "Dog escapes from the room.wav","What slams closed?",DOOR 415 | "CalmWaves SandBeach 03 EQ 130430_03.wav","What is crashing against the surface?",WATER 416 | "CalmWaves SandBeach 03 EQ 130430_03.wav","What is making the noise?",WAVES 417 | "medical car horn EGYPT Alexandria.wav","What is making the wa sound throughout the recording?",SIREN 418 | "medical car horn EGYPT Alexandria.wav","What type of siren is being used?",AMBULANCE 419 | "Heavy Wind on Microphone.wav","What speed is the wind blowing?",FAST 420 | "Basement Water Pump.wav","what sound is being made by the motor?",SCRAPING 421 | "Hallway Room Tone with shower in background.wav","What is the person doing in this clip?",SHOWERING 422 | "Ambience - Merri path, trees and birds, gentle wind take 2.wav","How do the creatures that are heard move about in the air, they do what?",FLY 423 | "Metal_On_Wood_Hits_Axe.wav","how many times does the striking sound occur?",THIRTEEN 424 | "Metal_On_Wood_Hits_Axe.wav","What activity is taking place?",TENNIS 425 | "waterspalsh_in_glass_pitcher.wav","What is the water doing?",DRIPPING 426 | "Washing Machine 
Spins.wav","Where is this machine located?",KITCHEN 427 | "Traffic Light.wav","how many cars pass by?",TWO 428 | "Traffic Light.wav","What are the people doing in the background?",TALKING 429 | "SPilling Water.wav","How many fires have been lit?",ONE 430 | "Vending Machines - Room Tone.wav","What are the people doing?",LAUGHING 431 | "1122thrum.wav","How many times is there static?",TWO 432 | "steam train from 1912 locomotive.wav","How many times does the horn sound?",ONE 433 | "steam train from 1912 locomotive.wav","What kind of automobile is this?",TRAIN 434 | "Coffeehouse Ambience Burlington VT 0112xx.wav","What are people doing?",TALKING 435 | "Coffeehouse Ambience Burlington VT 0112xx.wav","What opens?",DOOR 436 | "underpass.wav","How many people are singing ?",ONE 437 | "underpass.wav","What is the gender of the person singing?",FEMALE 438 | "Kortedala, Gothenburg - By night - Police sirens and surroundings.wav","What sound is loudest?",CAR 439 | "clock_raw.wav","how many beeps are there?",TWENTYFIVE 440 | "clock_raw.wav","How many times does the beating sound pause?",TWENTYFIVE 441 | "Tools .wav","what are the items doing with each other?",CLANKING 442 | "Metal objects in bowl.wav","Of what substance are the objects likely made?",METAL 443 | "Metal objects in bowl.wav","Where are the coins being put in?",JAR 444 | "audience final applause 01.wav","what are people using to make sounds?",HANDS 445 | "audience final applause 01.wav","when does the clapping die down?",END 446 | "box.wav","How many times did the sound repeat?",EIGHTEEN 447 | "box.wav","What type of surface is being scraped ?",HARD 448 | "Plane crash - black box.wav","How many different voices are there?",TWO -------------------------------------------------------------------------------- /metadata/single_word_test_clean.csv: -------------------------------------------------------------------------------- 1 | file_name,QuestionText,answer 2 | "river_mouth3.wav","How many times does the water 
splash?",ELEVEN 3 | "river_mouth3.wav","What is flowing?",WATER 4 | "Creaking pier.wav","What type of animal is making the light sound in the background?",BIRD 5 | "Mug in sink.wav","What is being poured?",WATER 6 | "20061124ParadeCHS.wav","Where is the music coming from?",BAND 7 | "Sizzling Bacon.wav","What is open?",FAUCET 8 | "Sizzling Bacon.wav","what type of precipitation causes this sound?",RAIN 9 | "Rain and Storm.wav","How many strikes of lightning can be heard?",ONE 10 | "Rain and Storm.wav","How many times does it thunder?",ONE 11 | "20160718_fountain.03.wav","What is the water doing?",RUNNING 12 | "Inner City Bees.wav","What is buzzing around the microphone?",BEE 13 | "Inner City Bees.wav","What is buzzing?",BEES 14 | "forest_ambience_chepachet_spring_1.wav","How many different types of animals are here?",ONE 15 | "Arch Leaf.wav","What are they walking on?",GRASS 16 | "20130326_caged.birds.01.wav","what are the birds doing?",SINGING 17 | "20130326_caged.birds.01.wav","What size are the birds?",SMALL 18 | "crowdfree.wav","What loud expression does the crowd make?",CHEER 19 | "Creaky wooden steps, down and up.wav","How many people are there?",TWO 20 | "20080416.bunting.wav","How many times does the cricket insect make noise ?",FIVE 21 | "folding and crumpling paper.wav","what gets crumbled up?",PAPER 22 | "bar crowd.wav","how many questions are heard?",THREE 23 | "Thunder Rain Cars Driving By.wav","How is the weather?",RAINY 24 | "Thunder Rain Cars Driving By.wav","What is booming behind the rain?",THUNDER 25 | "WM_switch_fill_to_wash_cycle_24_96_mono.wav","What item is making the repeated noise?",WASHER 26 | "Mechanical paper cutter.wav","What is the source of the sound not coming from a machine?",PEOPLE 27 | "Tall Wine glass hits.wav","How many times does the tapping repeat?",NINE 28 | "je_campuswalk.wav","What is the rough irregular sound heard throughout?",WIND 29 | "Ambience with Train.wav","What vehicle is passing by?",TRAIN 30 | "Tub 
Draining.wav","What part does it start draining violently?",MIDDLE 31 | "winter wren wind leaves.wav","What sound does the bird make?",CHIRP 32 | "restaurant wood floor.wav","what are the people continuously doing?",TALKING 33 | "restaurant wood floor.wav","What type of business is this?",RESTAURANT 34 | "20101228.teens.wav","What do the people do as a group?",SINGING 35 | "20101228.teens.wav","What noise does the group make with their hands?",CLAPPING 36 | "rhythm of the falling drops.wav","What gender is the first person that talks?",FEMALE 37 | "broken comms2.wav","What happens to the communication signal?",DISTORTION 38 | "20160922_passing.lorry.marshes.wav","How many times does the bird caw out loud?",THREE 39 | "20160922_passing.lorry.marshes.wav","what are the birds doing?",CHIRPING 40 | "Water in a canal.wav","What is making the noise?",FAN 41 | "Wipers .wav","How many times do the car wipers go across the windscreen?",TWELVE 42 | "Wipers .wav","What part of the vehicle is making the sound?",WINDOW 43 | "RemoteControl.Antique.Zenith.wav","What general material is the object that is being struck made out of?",METAL 44 | "Brush 01.wav","How many times does the person sweep?",TWENTYTWO 45 | "Brush 01.wav","What is this person cleaning?",FLOOR 46 | "Beach_SaintJeanDeLuz_France_Waves_Kids_People_xystereo.wav","what place are they playing on?",PARK 47 | "Beach_SaintJeanDeLuz_France_Waves_Kids_People_xystereo.wav","Who is playing outside?",CHILDREN 48 | "Train sound.wav","What direction is the sound moving?",CLOSE 49 | "Train sound.wav","what transportation vehicle is there?",TRAIN 50 | "Steps Indoor medium soft Shoe Sole accompanying wooden Floor hollow Big Room 5mx10mx6m.wav","What kind of shoes is the person wearing?",DRESS 51 | "Steps Indoor medium soft Shoe Sole accompanying wooden Floor hollow Big Room 5mx10mx6m.wav","What's the person doing?",WALKING 52 | "Cicadas .wav","What instrument is heard?",SHAKER 53 | "Cicadas .wav","What is the liquid heard in the 
background?",WATER 54 | "trail_footsteps_1_0725_102951.wav","How many footsteps are there?",FORTY 55 | "MISC_Int_Cat_Purring_002.wav","What is being used to groom the cat?",BRUSH 56 | "MISC_Int_Cat_Purring_002.wav","What sound is the cat making?",PURRING 57 | "dripping.wav","What plumbing feature is being used?",FAUCET 58 | "Birds in Pujipor.wav","What is cawing?",CROW 59 | "turning pages book slow quickly.wav","How many times is a page turned in the book?",THIRTYONE 60 | "turning pages book slow quickly.wav","What item is having its pages turned?",BOOK 61 | "autospasandociudadnoche1.wav","How many different cars can be heard?",THREE 62 | "Footsteps On Squeaky Wood Floor.wav","What does the person walking wear on their feet?",BOOTS 63 | "wheaten field.wav","What is the long continuous sound?",RAIN 64 | "weather_wind_strong_trees.wav","What is blowing all around?",WIND 65 | "weather_wind_strong_trees.wav","What is crashing against the sand?",WAVES 66 | "HarleyDavidson.wav","what is making the rumbling sound?",MOTORCYCLE 67 | "20140210FallingIce.wav","What sounds like it's popping?",BUBBLEWRAP 68 | "Plaza_de_la_Revolucion_risa.wav","How many times does a dog bark?",TWICE 69 | "Plaza_de_la_Revolucion_risa.wav","What animal is barking?",DOG 70 | "quick walk.wav","what activity is the person doing?",WALKING 71 | "quick walk.wav","What form of transport is the person using?",WALKING 72 | "20110805_forest.crows.07.wav","What animal is making that sound?",DOG 73 | "fdv_orage_26082011.wav","What is the weather like?",RAIN 74 | "fdv_orage_26082011.wav","What item should be used above there head to keep dry in rain?",UMBRELLA 75 | "Rave1.wav","What instrument is producing this sound?",KEYBOARD 76 | "ieai.wav","how many waves crash?",SEVEN 77 | "ieai.wav","What is the water hitting?",SHORE 78 | "room-tone theater with silent woman 130525_07.wav","what is the machine doing?",BLOWING 79 | "Buddhist Bells.wav","How many times is the object hit?",THREE 80 | "Buddhist 
Bells.wav","What pitch would the sound be considered?",HIGH 81 | "h907 boules pologna clap f.wav","How many people speak on a microphone?",TWO 82 | "Wind-up Toy Motorbike SFX.wav","If this sound was coming from a toy, what would that toy be called?",SPINNER 83 | "Wind-up Toy Motorbike SFX.wav","What is making the sound?",TOY 84 | "breakfast ambience.wav","How are the people communicating with each other?",TALKING 85 | "breakfast ambience.wav","how many people are speaking?",TWO 86 | "Machetes sliding 2.wav","How many times does the person make the object sound off?",TEN 87 | "Afternoon Suburb Calm.wav","What kind of animal is making noise?",BIRD 88 | "Mulholland Memorial Fountain Los Angeles.wav","What object is making this static noise?",TELEVISION 89 | "Mulholland Memorial Fountain Los Angeles.wav","Where is the water going?",DOWN 90 | "Morsecode - SOS MAYDAY - 988 Hz Tone.wav","What kind of code can be heard?",BEEPING 91 | "Mariehamn_frogs.wav","how many quacks are there?",TWENTYFIVE 92 | "bands_and_motorbike.wav","How many times are the cymbals played?",SIX 93 | "metal plate striking wall.wav","how many times does the metal fall?",FIVE 94 | "FOREST-SOUNDS.wav","How many times is there a loud 'caw' sound?",SEVEN 95 | "FOREST-SOUNDS.wav","What kind of animal can be heard throughout the sound?",BIRD 96 | "wind and birds in the delta of the River Po 2.wav","What are the birds doing?",CHIRPING 97 | "wind and birds in the delta of the River Po 2.wav","What is blowing?",WIND 98 | "Rain on Car Roof 2.wav","What are the object falling on that makes the banging noise?",ROOF 99 | "Grackles.wav","What kind of animals can be heard?",BIRDS 100 | "Grackles.wav","what type of building where people pay to see animals could this be located in?",ZOO 101 | "country-ambiance-01.wav","what is making the chirping noise?",CRICKETS 102 | "Morning Birds 001.wav","These creatures live in homes they build themselves which are known as what?",NESTS 103 | "Morning Birds 001.wav","What 
animals are making noise?",BIRDS 104 | "WaterBottle.wav","How many knocks can be heard?",ONE 105 | "rainy stream 22 sec.wav","What is falling?",RAIN 106 | "Krankenwagen _ German Ambulances Passing by...wav","What makes a siren sound?",AMBULANCE 107 | "air bubbles on the surface of the water.wav","What is boiling?",WATER 108 | "air bubbles on the surface of the water.wav","What type of water device is in operation?",TAP 109 | "Ambience - Cattle Barn - Busy - 96kHzhg.wav","How many times does the cow moo?",FOUR 110 | "Ambience - Cattle Barn - Busy - 96kHzhg.wav","How many types of animals can be heard making noise ?",ONE 111 | "Clatter.wav","Who's making the sound?",PERSON 112 | "Paris to Germany Train Announcement.wav","What is the man's voice being transmitted through?",SPEAKER 113 | "Paris to Germany Train Announcement.wav","what kind of transportation is the speaker on?",TRAIN 114 | "night ambient crickets bugs white noise.wav","What insect is heard in the background?",CRICKETS 115 | "Rain - 1.wav","how is the water flowing?",FAST 116 | "spring morning birds oiseaux reveil printemps #1.wav","How many times does the rooster crow?",FOUR 117 | "divide lake.wav","What is crashing against the sand?",WAVES 118 | "divide lake.wav","What is falling?",WATER 119 | "Old moped.wav","What is the machine being used?",SEWING 120 | "rain_on_tin_roof.wav","what is the rain hitting?",ROOF 121 | "rain_on_tin_roof.wav","What is the weather like?",RAINY 122 | "Footsteps on Wet Pavement_1-2.wav","What is the person stepping on?",LEAVES 123 | "hamlet Haanwijk autumn NL 03 151003_0804 ST.wav","What are the birds doing?",CHIRPING 124 | "hamlet Haanwijk autumn NL 03 151003_0804 ST.wav","What winged creatures are heard?",BIRDS 125 | "Hotel automatic skylight open and close, faint sirens nearby.wav","How many times does the machine noise pause?",TWO 126 | "Birds in the city 1.wav","How do the animals that are audible usually get around?",FLY 127 | "Birds in the city 1.wav","What is 
whistling?",BIRD 128 | "faucet3.wav","How many animals can be heard?",NONE 129 | "Garden Birds 3.wav","How many times does a bird make a whistling sound?",SEVEN 130 | "Garden Birds 3.wav","What are these creatures?",BIRDS 131 | "bagpipe_on_street_BA.wav","How many dogs bark?",ONE 132 | "bagpipe_on_street_BA.wav","What type of instrument produces this sound?",PIPES 133 | "024_House_InsideCarEngineStart.wav","What does the engine do before the end?",ACCELERATES 134 | "27 hn_birdspecking.wav","What weather condition can be heard in the recording?",RAIN 135 | "Garbage Truck.wav","How many machines are being operated?",FOUR 136 | "Garbage Truck.wav","How many times is there a sound of an object being compressed?",ONE 137 | "Curtains.wav","What tool is being used to clean up the leaves?",RAKE 138 | "THE_RATT23_1.wav","What is the person doing?",WALKING 139 | "THE_RATT23_1.wav","What is the person walking on?",GRAVEL 140 | "WalkingInSnowCrunchingIce.wav","How many people are walking?",ONE 141 | "LightRainOctober31st2015.wav","What is on fire?",FIREPLACE 142 | "PauseConference_youngerPeople.wav","What are all the people in this area doing?",TALKING 143 | "Siren Milan.wav","How many sirens are going off?",TWO 144 | "Siren Milan.wav","In what type of situation would this noise occur?",EMERGENCY 145 | "anykeystudio_apocalypse.wav","What object is producing the high pitched noise?",SIREN 146 | "Fowl - Chatter 1 - 96kHz.wav","What animal is making the most noise?",CHICKEN 147 | "IKEA_Cafeteria.wav","In what location are the people?",RESTAURANT 148 | "dragged-glass-object.wav","What is the person doing?",WRITING 149 | "Erny vs Deadman4.wav","What are the people doing?",TALKING 150 | "paper_cut.wav","What is making the cutting sound?",CUTTER 151 | "paper_cut.wav","What kind of substance is being cut into?",PAPER 152 | "20070303.duck.wav","How many times does the bird quack?",TWENTYEIGHT 153 | "Cooking rice.wav","What is making this noise?",WIND 154 | "hissy fizz.wav","How many 
times does the static pause?",FIVE 155 | "Kitchen faucet running fast and slow and filling glass of water.wav","What plumbing device is emitting water?",FAUCET 156 | "20080918.boots.door.wav","What is the person doing inside of the building?",WALKING 157 | "Boom_Folie_NoiseOnGlass.wav","How many people are speaking?",ZERO 158 | "upstairs.wav","How many people are going down the stairs ?",TWO 159 | "upstairs.wav","When does the door open?",END 160 | "drunk_teenagers_1.wav","who is talking besides men?",WOMEN 161 | "CAR_WASH.wav","What is an example of something carried by this vehicle?",PEOPLE 162 | "CAR_WASH.wav","What kind of vehicle is moving on the track?",TRAIN 163 | "Train Pass Koln.wav","what type of vehicle can be heard?",TRAIN 164 | "RadioFan.wav","What is being played in this clip?",TV 165 | "WeddingClap.wav","How is the crowd feeling?",HAPPY 166 | "WeddingClap.wav","What is the crowd doing?",CLAPPING 167 | "light suburban ambiance.wav","What animal can be heard?",BIRD 168 | "WaterOnMetal.wav","What is making the noise?",MACHINE 169 | "20081130_walking_in_snow_with_snowshoes.wav","how many footsteps are there?",TWENTYFOUR 170 | "20081130_walking_in_snow_with_snowshoes.wav","What is the person walking through?",SNOW 171 | "Airplane indoor ambience .wav","What is this sound?",AIRPLANE 172 | "Theater Chatter.wav","What are the people doing?",TALKING 173 | "Theater Chatter.wav","What part of their bodies are people using to make noise?",MOUTH 174 | "public.wav","What are they doing ?",CHEERING 175 | "public.wav","What type of mammal is screaming?",HUMAN 176 | "Oppedette cafe #1.wav","What is the dog doing?",HOWLING 177 | "Oppedette cafe #1.wav","What song is the man singing?",MUSIC 178 | "adw018raw.wav","How many times does the sound repeat itself?",FIFTEEN 179 | "adw018raw.wav","What is ringing?",BELL 180 | "Icy rain.wav","How many dogs can be heard barking?",ONE 181 | "Icy rain.wav","What animal is making noise in the background?",DOG 182 | "Index Card Flips 
(handle business paper mvmt) 02.wav","How many distinct snaps can be heard?",THIRTEEN 183 | "Index Card Flips (handle business paper mvmt) 02.wav","What is being shuffled?",CARDS 184 | "Waterfalls_00216.wav","How many distortions can be heard in the spraying sound?",ONE 185 | "WindInPylons.wav","What item can be heard blowing in the wind?",LEAVES 186 | "Art Gallery Tone.wav","What type of gallery did he say it was?",MUSIC 187 | "rain in tent.wav","What is falling?",RAIN 188 | "Lots of Geese.wav","Where is this noise coming from?",FARM 189 | "birds chirping 03 short.wav","How many birds are nearby?",TWO 190 | "birds chirping 03 short.wav","What sound does a bird make?",CHIRP 191 | "THE_RATT12_1.wav","Who is speaking?",MAN 192 | "food_prep_1_cw.wav","What activity is causing the sizzling?",FRYING 193 | "STE-008.wav","What sound is the train making?",HONKING 194 | "graffiti artist spraying NL 130611_02.wav","What is being sprayed on?",LIQUID 195 | "Sound_FX_Ambient_Street_cars+passing_by.wav","How many different vehicles can be heard?",TWO 196 | "bierfest_atmosphere.wav","What are the people doing?",CHATTING 197 | "bierfest_atmosphere.wav","What does the child do at the end?",LAUGH 198 | "MVI_4002-B.wav","What is driving over the tracks?",TRAIN 199 | "MVI_4002-B.wav","What type of train is this ?",SUBWAY 200 | "UrbanHerringGulls.wav","How many wings do each of these animals have?",TWO 201 | "Bicycle Chain Accel Crash.wav","What object just broke?",GLASS 202 | "Tunnel Creek.wav","Where is the man talking?",BATHROOM 203 | "Metallic Gate.wav","What does the door keep doing?",SQUEAKING 204 | "Metallic Gate.wav","What is creaking?",DOOR 205 | "cafecarusel_fan_hizz_EQ2.wav","What happens to the screeching sound in the middle?",INCREASE 206 | "walking indoors footsteps tap tap tapping foley.wav","What material are they stepping on?",WOOD 207 | "walking indoors footsteps tap tap tapping foley.wav","Where is this person walking?",HALL 208 | "Chainsaw Crosscutting 3.wav","What 
is the person doing with the lawn equipment?",MOWING 209 | "Chainsaw Crosscutting 3.wav","What tool is being used?",CHAINSAW 210 | "Room-tone rain-drips 1m 161015_1013.wav","How many types of sounds can be heard?",TWO 211 | "Room-tone rain-drips 1m 161015_1013.wav","What is the sound coming from?",PIPE 212 | "Water Driping 7.wav","what is the water doing?",TAP 213 | "street works_pressure_low rumble.wav","who speaks?",MAN 214 | "Street Noise - Cars - Ball Bouncing indistinct voices.wav","What is the vehicle moving along?",BIKE 215 | "Atmosphere on road in London.wav","What can be heard in the distance in the first half of the clip?",CAR 216 | "walking in gravel 2.wav","How many steps does the person take?",TWENTY 217 | "walking in gravel 2.wav","What is the person doing?",WALKING 218 | "AMB_swamp_summer_night_fish_insects_00.wav","How many times is it possible to hear an object moving in the water?",THREE 219 | "AMB_swamp_summer_night_fish_insects_00.wav","What type of creature is calling out?",CRICKET 220 | "LA Rain.wav","How many footsteps can be heard in the clip?",ZERO 221 | "Swings in Mauerpark, Berlin.wav","What device is moving?",SWING 222 | "Rain recording.wav","What element is coming from the sky?",WATER 223 | "Rain recording.wav","What kind of storm is it?",RAINSTORM 224 | "aftertherain.wav","What animal can be heard in the background?",BIRD 225 | "Stovetop Range w. City Ambience_1-2.wav","how often does the sound happen?",CONTINUOUSLY 226 | "Stovetop Range w. 
City Ambience_1-2.wav","What is leaking from a tube?",AIR 227 | "outdoors ambient distant village 3.wav","How many animals can be heard?",TWO 228 | "outdoors ambient distant village 3.wav","What animal can clearly be heard in the background at one point in the audio?",DOG 229 | "Office Lift 2.wav","What gender is the human?",FEMALE 230 | "Office Lift 2.wav","What is opened?",DOOR 231 | "20110220_churchbell.wav","How many times is the gong struck?",FOUR 232 | "20110220_churchbell.wav","What is being struck?",BELL 233 | "Nightingale.wav","How many chirps are there?",FIFTY 234 | "worktoilet.wav","What is being flushed?",WATER 235 | "worktoilet.wav","What is going down the drain?",WATER 236 | "cupboard door squeaks.wav","What is being opened and closed?",DOOR 237 | "river_mouth1.wav","what does the water make when it collides with itself?",SPLASH 238 | "fountains-Udaipur-Saheliyon-Ki-Bari-4.wav","What is falling to the ground?",RAIN 239 | "fountains-Udaipur-Saheliyon-Ki-Bari-4.wav","What is the ground?",PAVEMENT 240 | "Padlock.wav","What does the gadget seem to be made from?",METAL 241 | "Footsteps, Muddy, E.wav","How many steps does the person take?",TWENTY 242 | "Footsteps, Muddy, E.wav","What is being walked on?",MUD 243 | "squeaky metal swing.wav","What item is being moved back and forth to create the noise?",CAT 244 | "squeaky metal swing.wav","What playground feature makes this sound when in use?",SEESAW 245 | "Fairground 2 Ghost ride.wav","How many screams are there?",FOUR 246 | "SCC CLAPTER 20101210.wav","What are the people doing?",CLAPPING 247 | "SCC CLAPTER 20101210.wav","What emotion are the people expressing in the clip?",EXCITEMENT 248 | "FOOTSTEPS_005.wav","How is the person traveling?",WALKING 249 | "Sink_Running.wav","What is flowing?",WATER 250 | "Sink_Running.wav","what is the water doing?",DRAINING 251 | "F907 Church prayer f.wav","How many women speak?",ONE 252 | "F907 Church prayer f.wav","Who is responding to a single person?",CROWD 253 | "water 
dripping 2.wav","How many faucets are dripping water?",ONE 254 | "water dripping 2.wav","What is dripping?",WATER 255 | "20121014_boat_tour_01.wav","How many kinds of creatures are making noise?",TWO 256 | "20121014_boat_tour_01.wav","What are the people doing?",TALKING 257 | "Alps village field-recording distance.wav","What is making the mechanical noise?",TRUCK 258 | "moving glass pieces.wav","How many times is something struck?",TWELVE 259 | "moving glass pieces.wav","what is moving around?",GLASS 260 | "AMB_EXT_PARK_SUMMER_DAY_LOOP.wav","What is chirping?",CRICKET 261 | "AMB_EXT_PARK_SUMMER_DAY_LOOP.wav","What other type of insect can make this sound?",CRICKET 262 | "Rain_inside_of_a_Car.wav","What is be boiling?",POPCORN 263 | "morning breeze and birds.wav","What kind of animals are making noise?",BIRDS 264 | "morning breeze and birds.wav","What sound are the birds making?",CHIRPING 265 | "Corn Husking Sequence x2.wav","what happens to the tape?",PULLED 266 | "radio_static.wav","What machine is making the noise?",RADIO 267 | "md3trk2.wav","How many times does the noise repeat?",SIX 268 | "md3trk2.wav","What is spinning?",MOTOR 269 | "md4trk10.wav","What manual action creates this sound?",SHAKING 270 | "Bus Pulls Away.wav","What part of a car can be heard?",ENGINE 271 | "Cat Meowing.wav","What does the puppy keep doing?",GROWLING 272 | "Cat Meowing.wav","What is growling?",CAT 273 | "Household - Atmos - Wind Through Window.wav","How is the weather here?",WINDY 274 | "tentrain.wav","What is falling?",RAIN 275 | "Small Falling Water Onto Stones.wav","what is the water doing?",FLOWING 276 | "WOOD CHOPPING_ Chopping hard wood with metal Axe (SFX).wav","How many times is the object smacked?",SIXTEEN 277 | "WOOD CHOPPING_ Chopping hard wood with metal Axe (SFX).wav","Which time is the object smacked the quietest?",NONE 278 | "Wet_Soggy_Squishy_Footsteps.wav","When is the object being handled the gentlest?",BEGINNING 279 | "Air raid siren_rising.wav","What animal is 
chirping?",BIRD 280 | "Old metal window open and close.wav","How many times can the scraping sound be heard?",SIX 281 | "Old metal window open and close.wav","What does the general weight of this object seem to be?",LIGHT 282 | "bombolles.wav","How many times does the sound stop and start again?",THREE 283 | "bombolles.wav","What is making the bubbling noise?",LIQUID 284 | "vending machine action.wav","What does the machine being operated do?",LAUNDRY 285 | "vending machine action.wav","What is the person putting into the machine?",COINS 286 | "Marker Writing on Paper.wav","What is the person writing with?",PENCIL 287 | "crickets and owls.wav","how many times is there a loud screeching sound?",THREE 288 | "BobKessler-Spinning Tin Top.wav","How many times was the thing rolled down?",FOUR 289 | "Gravel_Sand Walking 1.wav","How many people are walking?",ONE 290 | "foley footsteps - raw.wav","Where is he?",OFFICE 291 | "OrchestraTuning2.wav","What are the people doing with their instruments?",PLAYING 292 | "OrchestraTuning2.wav","What is the tone of the music?",CLASSICAL 293 | "SonicSnap_GPSUK_Cockerel.wav","What bird is making a sound near the end?",ROOSTER 294 | "SonicSnap_GPSUK_Cockerel.wav","What is the sound that is coming from a vehicle?",SIREN 295 | "Muddy_steps_bush_birds_singing.wav","What animal is making noise?",BIRD 296 | "Muddy_steps_bush_birds_singing.wav","What is the person doing?",WALKING 297 | "Liverpool St Station main hall.wav","What device, on a vehicle, makes the high pitched squeal sound?",BRAKES 298 | "Cooking on Gas.wav","when does the sound cease?",NEVER 299 | "Blackbird tweet with waterfall background.wav","What is making the chirping noise?",BIRD 300 | "Greek Habitues - (Evosmos - Salonika) 16.18 28.09.wav","What is moving in the sky?",HELICOPTER 301 | "Greek Habitues - (Evosmos - Salonika) 16.18 28.09.wav","what is the helicopter doing?",FLYING 302 | "Chicharra1.wav","How many people can be heard moving around?",ONE 303 | "Watering 
Can.wav","What is turned on to release the water?",FAUCET 304 | "Car vs. Freight Train.wav","What causes the crashing noise?",TRAIN 305 | "Single cricket chirping during a summer evening in the city (with traffic noise).wav","what is making the high pitch sound?",INSECT 306 | "RG Large Old Dog Snoring.wav","How many times does the person breathe?",SIX 307 | "RG Large Old Dog Snoring.wav","What is the person doing?",SLEEPING 308 | "20160506_sharpening.02.wav","How many times does the sound repeat?",SIXTEEN 309 | "20160506_sharpening.02.wav","What object is being used to make the noise?",KNIFE 310 | "Thunder_01.wav","how many times does the thunder crack?",TWICE 311 | "ortam.wav","What object is making this deep noise?",GONG 312 | "ortam.wav","When is the pitch of the rumbling sound the highest?",CONSTANT 313 | "Brushing teeth.wav","What does the person spit out?",LIQUID 314 | "Brushing teeth.wav","What was the person cleaning?",TEETH 315 | "Fantasy Ambience.wav","What instrument is being played?",ORGAN 316 | "Glass bottles in and out of a basket.wav","How many taps are there?",THREE 317 | "Sound_FX_Kitchen_wash dish.wav","In what room is the person using a sink?",KITCHEN 318 | "Sound_FX_Kitchen_wash dish.wav","What fixture is the water coming out of?",TAP 319 | "20091211.barking.stairs.wav","What animal is making loud noise?",DOG 320 | "20091211.barking.stairs.wav","What is the dog doing continuously?",BARKING 321 | "Kiddie Train.wav","What is being expelled at the beginning?",AIR 322 | "Kiddie Train.wav","What type of vehicle can be heard?",TRAIN 323 | "Fryers Forest - Powerful Owl (Ninox Stenua).wav","how many times does the animal hoot?",EIGHT 324 | "Fryers Forest - Powerful Owl (Ninox Stenua).wav","What animal can be heard?",OWL 325 | "Boulevard SummerRiver calm 01 NL 160905_0961 0962.wav","What animal is chirping in the background?",BIRD 326 | "Boulevard SummerRiver calm 01 NL 160905_0961 0962.wav","What are the people doing?",TALKING 327 | 
"mab-kite-spool-20080727.wav","What gas is escaping from the appliance?",STEAM 328 | "20130723_Rain2.wav","what is driving in the rain?",VEHICLE 329 | "Cardiff Bay fireworks.wav","What is being celebrated?",PARTY 330 | "Cardiff Bay fireworks.wav","What is making the load bangs?",FIREWORKS 331 | "draining board metal drip on metal.wav","What is the water falling into?",BUCKET 332 | "drip rhythm1.wav","What liquid element do the drops fall into?",WATER 333 | "a gentle breeze, wind 4.wav","How soft is this sound?",SOFT 334 | "Electric Train Interior Atmos.wav","How many engines are running?:",TWO 335 | "atmo_kenting_national_park.wav","What covers the outside of the animal heard here?",FEATHERS 336 | "Tree Bark Cracks.wav","What can be heard breaking?",WOOD 337 | "tua-mirandela_train_arrival_march2007.wav","What kind of animal is in the background?",DOG 338 | "Outside wind.wav","What type of rainfall is this?",HEAVY 339 | "Outside wind.wav","What type of weather is this?",RAINY 340 | "Metal pipe hitting the ground.wav","How many times is something dropped?",SEVEN 341 | "Metal pipe hitting the ground.wav","What material is the object made from?",METAL 342 | "tornado day 1.wav","What is the weather like?",RAINY 343 | "1400 am static.wav","Where do the wheels of the vehicle rotate?",LOW 344 | "Busy Playground.wav","What animal is singing in the background?",BIRD 345 | "Busy Playground.wav","Where would these kids be playing at?",PLAYGROUND 346 | "Bizzare Atmosphere.wav","How many times is the first noise repeated?",FIFTEEN 347 | "Fast food soda with ice, sip slurp straw.wav","What is the person drinking?",SODA 348 | "Footsteps Dress Shoes Wood Floor.wav","How many steps does the person take?",FORTY 349 | "Footsteps Dress Shoes Wood Floor.wav","What is the activity that's taking place?",WALKING 350 | "fresound sample 2.wav","What type of instrument is making the main sound?",PIANO 351 | "slupia river.wav","How is the weather here?",RAINING 352 | "slupia river.wav","How 
many air bubbles can be heard?",ALOT 353 | "at the edge of the forest.wav","How many different noises can be heard?",THREE 354 | "20130405_wooden.stairs.floor.01.wav","What material is the floor?",WOOD 355 | "53 blackhead_minifjord_closeup.wav","What is making the engine noise?",AIRPLANE 356 | "53 blackhead_minifjord_closeup.wav","What is the occupation of the person operating the thing making the engine noise?",PILOT 357 | "plastic-straw-whistles.wav","What general art form is this?",MUSIC 358 | "plastic-straw-whistles.wav","What instrument is being played?",FLUTE 359 | "river + waterfall 2 .wav","what is the water doing?",RUNNING 360 | "By ther blacksmith-002.wav","How many times has the object been hit?",SEVENTYTHREE 361 | "By ther blacksmith-002.wav","What material is being hit?",METAL 362 | "Street market.wav","What are the people doing?",TALKING 363 | "Street market.wav","what gender of people are speaking?",FEMALE 364 | "Construction Zone.wav","From which part of the car do the noise come from?",ENGINE 365 | "d0_drips_04.wav","What is dripping into glass?",WATER 366 | "d0_drips_04.wav","What is the water being poured in?",BOWL 367 | "Ambience_hum_tuning fork.wav","How many times does the tone sound?",ONCE 368 | "Ambience_hum_tuning fork.wav","What repetitive miniscule movement is the item making that is struck, in order to make the sound?",VIBRATING 369 | "cricket chirp.wav","How many times does the insect chirp?",THIRTYFOUR 370 | "cricket chirp.wav","What insect is making the sound?",CRICKET 371 | "Omsk_Victory_park_1.wav","Which of the distant banging sounds is the loudest?",MIDDLE 372 | "windroar_constant_1m12s.wav","What other weather condition is occurring in the recording?",RAIN 373 | "Growing Hum.wav","How many times is the clicking heard at the beginning?",SIX 374 | "Growing Hum.wav","what type of material are the scissor blades made of?",METAL 375 | "Watervogels en riet Lichterveldestraat (HiPass).wav","What is the object that is travelling in the 
clip?",CAR 376 | "urinating on a wall.wav","How many water taps are open ?",ONE 377 | "urinating on a wall.wav","what is coming out of the hose?",WATER 378 | "Large Splashes.wav","How many splashes are there?",FIVE 379 | "Large Splashes.wav","What are things being dropped into?",WATER 380 | "20060523.grassland.wav","What animal is heard?",BIRDS 381 | "20060523.grassland.wav","What is blowing though the trees?",WIND 382 | "car alarm 130603.wav","What kind of alert is going off?",CAR 383 | "20080505_1306playground01.wav","What type of animal can be heard occasionally?",BIRD 384 | "porto_morning_tropical_birds_market_20.wav","What is making the most noise?",BIRDS 385 | "remix of 130879__frederic-font__05-hang-song-1.wav","How many instruments are there?",TWO 386 | "remix of 130879__frederic-font__05-hang-song-1.wav","What kind of instrument are they using?",PIANO 387 | "Thunder 03.wav","What is making that sound?",WIND 388 | "Canada Geese Squawk on a Pond with a Fountain.wav","What animal is making sounds?",BIRDS 389 | "indoors house ambient room tone distant neighbours 1.wav","What age range do the voices belong to at the beginning?",CHILDREN 390 | "indoors house ambient room tone distant neighbours 1.wav","What animal is making a sound at the end?",BIRD 391 | "Cityscape 04 090617.wav","What is rushing past?",TRAIN 392 | "Lluvia 1.wav","If there is too much of this, what disaster can it cause?",FLOOD 393 | "Lluvia 1.wav","What hits the ground?",RAIN 394 | "walkingondirtpath.wav","How many times do they kick forward?",TWO 395 | "walkingondirtpath.wav","What is the man stepping on?",GRAVEL 396 | "Shower Running 02.wav","What is constantly streaming?",WATER 397 | "Shower Running 02.wav","What touches the water as it drips?",HANDS 398 | "fireTruckFar NL 140109_00.wav","How many ambulances are heard passing by in the background ?",ONE 399 | "fireTruckFar NL 140109_00.wav","What vehicle is getting closer?",AMBULANCE 400 | "abandoned-ballroom-big-metal.wav","What is the 
pace of the person's movement?",SLOW 401 | "basement-stairs.wav","What is happening here?",WALKING 402 | "FP_Refrigerator_Door_Squeak.wav","How many times does the door open after being shut?",SIX 403 | "FP_Refrigerator_Door_Squeak.wav","What is creaking?",DOOR 404 | "10_lightning_kohchang.wav","How many times does thunder clap?",ONE 405 | "Birds Singing in a Small Town During Morning.wav","what hits the mic at the end?",WIND 406 | "Birds Singing in a Small Town During Morning.wav","What is the bird doing?",CHIRPING 407 | "coffee can.wav","What tool is being used?",SAW 408 | "BulletJuneEdited192012.wav","How many times does the machine rev up at the start of the clip?",ONE 409 | "Neighbourhood evening ambience.wav","What is the person doing?",WALKING 410 | "Neighbourhood evening ambience.wav","What kind of creature is chirping?",CRICKETS 411 | "ambience car.wav","What are the people who operate these modes of transportation known as?",DRIVE 412 | "ambience car.wav","What is making the noise?",CAR 413 | "20090412.bell.strikes.12.wav","How many people can be heard talking?",MANY 414 | "20090412.bell.strikes.12.wav","How many times does the bell ring?",EIGHT 415 | "mediterranean_sea_porticcio.wav","what is the water doing?",CRASHING 416 | "Bicin_Diputacion_Day_22-03-2009.wav","How many clicks are heard?",THIRTY 417 | "WasherEndofRestCycleStartFill-WashCycle.wav","where does this noise occur?",FACTORY 418 | "police_car_siren-esp.wav","What law enforcement would use this sound?",POLICE 419 | "Kung Fu Clothes Hits and Clothing Sounds.wav","how many taps are heard?",EIGHT 420 | "Kung Fu Clothes Hits and Clothing Sounds.wav","What is hit?",MAT 421 | "Library Ambience_large space.wav","What is the last sound called?",KNOCK 422 | "Library Ambience_large space.wav","What sort of noise coming from a human can briefly be heard in the background?",TALKING 423 | "Slushing in mouth.wav","What fluid does the man shake around?",WATER 424 | "Cualquiera.wav","What makes the squeaking 
sound?",DOOR 425 | "box_open_hit.wav","What is the sound being made repeatedly?",HITTING 426 | "Rainforest Morning Chorus.wav","What are communicating with each other?",BIRDS 427 | "Rainforest Morning Chorus.wav","What is chirping in the background?",BIRD 428 | "Tiergarten birds early morning.wav","what are the birds doing?",SINGING 429 | "Tiergarten birds early morning.wav","Where are the birds making this noise?",OUTSIDE 430 | "WalkingGravelTrailAugust2015.wav","How many people are present?",ONE 431 | "WalkingGravelTrailAugust2015.wav","What type of surface are they walking on?",GRAVEL 432 | "Ship Fender.wav","What are they using to start up the engine?",START 433 | "Ship Fender.wav","What tool is making the noise?",MOWER 434 | "JM_HOME&OFFICE_Shower 01 - Taking a shower.wav","How many people can be heard?",NONE 435 | "Typing 5 lines.wav","What is the person using to type?",TYPEWRITER 436 | "20090407.airplane.wav","How many planes can be heard?",ONE 437 | "20091217.18.chains.wav","How many times can the clinking sound be heard?",EIGHT 438 | "20091217.18.chains.wav","What material is the object being manipulated made out of?",METAL 439 | "Light to heavy Rain.wav","What noise can be heard?",WATER 440 | "Cityscape construction site 2 100304.wav","How many cutting strokes does this person make ?",TWO 441 | "earth_movement.wav","This sound is reminiscent of what domesticated animal when it's happy?",CAT 442 | "Horse_Hooves_Hard_Floor_Interior.wav","how many sets of clomping noises are there?",THREE 443 | "Horse_Hooves_Hard_Floor_Interior.wav","Which part of their body is used to make the sound?",FINGERS 444 | "Kaffemaschine_1.wav","What is been used?",SAW 445 | "Kaffemaschine_1.wav","What is the sound the machine makes?",DRILLING 446 | "BoyRacer.wav","what is the engine doing?",REVVING 447 | "campanas.wav","what are the bells doing?",RINGING 448 | "rain_near_smooth.wav","How is the weather?",RAINY 449 | "rain_near_smooth.wav","how many times is thunder heard?",ONE 450 
| "treefrogs.wav","What type of machine does this sound come from?",ENGINE 451 | "Quill pen writing on hard paper various speed.wav","What is someone doing?",WRITING 452 | "Waiting at a Montreal Subway Station.wav","Where is this sound made?",TUNNEL 453 | "Waiting at a Montreal Subway Station.wav","Which form of transportation is heard?",TRAIN 454 | "circuitbend03.wav","How many times does the buzzing stop?",TEN 455 | "natureatmosphere.wav","What are these people doing?",LISTENING 456 | "natureatmosphere.wav","what sense does the man refer to?",LISTENING 457 | "Pigeon Temple.wav","What type of gender voice preceded the laugh?",MALE 458 | "Pigeon Temple.wav","When does the person laugh?",END 459 | "London Overground train (interior) approaches Victoria Station.wav","What are the people on the train doing?",TALKING 460 | "London Overground train (interior) approaches Victoria Station.wav","What is this mode of transportation traveling on?",TRAIN 461 | "Storm sirens with dog bark at end 050627 24 bit.wav","How is the weather?",RAINY 462 | "Storm sirens with dog bark at end 050627 24 bit.wav","What type of weather can be heard?",RAIN 463 | "spring rain in the woods.wav","How many birds are singing?",ONE 464 | "spring rain in the woods.wav","Where are the birds?",OUTSIDE 465 | "water_boil_pour_stir-96.wav","How many times is there a different sound other than the sound that occurs the most?",ONCE 466 | "wooden barndoor.wav","How many clacks are heard?",FOUR 467 | "wooden barndoor.wav","What is the person opening?",DOOR 468 | "gargnano-sounds.wav","How many cars pass by?",ONE 469 | "gargnano-sounds.wav","Who is making the noise?",HUMAN 470 | "paper01.wav","What is the person doing?",READING 471 | "Rushing_water+wind-Rec_Samsung_HMX-F80_Camcorder.wav","What is making the noise?",WATER 472 | "RunningWater_BathTub_01.wav","What is the water going down?",DRAIN 473 | "Jet Engine 1.wav","What product is being processed?",METAL 474 | "Trompetistas.wav","How many trumpets are 
being played?",TWO 475 | "Trompetistas.wav","What is the person playing?",TRUMPET 476 | "TIKTOK_1.wav","How many times does the clock tick?",SIXTY 477 | "TIKTOK_1.wav","What is making the ticking noise?",CLOCK 478 | "OiseauNuit1.wav","What is making the chirping?",BIRDS 479 | "OiseauNuit1.wav","What is singing?",BIRDS 480 | "meadow brook bees.wav","What sweet product do the insects that make these sounds produce?",LIQUID 481 | "thunder-distant-20120709-s3.wav","What is making this sound?",THUNDER 482 | "ambience winter fountain birds .wav","What kind of vessel would be used in this substance?",BOAT 483 | "abandoned-ballroom-radiators.wav","What instrument is being played?",XYLOPHONE 484 | "abandoned-ballroom-radiators.wav","What is making the music?",XYLOPHONE 485 | "Wisper1.wav","What is the person doing?",WHISPERING 486 | "BlueJay.wav","How many single squawks does the loudest bird make?",SEVEN 487 | "California morning birds singing.wav","What animals are making the most sounds?",BIRDS 488 | "California morning birds singing.wav","What breed of bird is chirping?",SPARROW 489 | "steps_snow.wav","What cold substance is someone walking through?",SNOW 490 | "steps_snow.wav","What does the group keep doing?",WALKING 491 | "Lincoln Nebraska Tornado 5 9 2016.wav","What is creaking?",RADAR 492 | "Gentle rain outside balcony street noise.wav","How many engines can be heard?",TWO 493 | "Water Faucet HQ Stereo.wav","What moving substance is causing this sound?",WATER 494 | "mercury topaz starting.wav","What is the person doing before starting the car?",OPENING 495 | "mercury topaz starting.wav","What is the person doing in the vehicle?",PULLING 496 | "Iceland2013_Stokkur.wav","What are the people doing?",LAUGHING 497 | "Iceland2013_Stokkur.wav","Whos is laughing?",WOMAN 498 | "Paper_Parchment_Rustling.wav","What is the person doing with the bag?",OPENING 499 | "Paper_Parchment_Rustling.wav","What material is the rustled bag made of?",PAPER 500 | "Balloon Game at Arlington 
Heights Carnival.wav","How many children speak?",TWO 501 | "Balloon Game at Arlington Heights Carnival.wav","What kind of toy is the child asking for?",DOG 502 | "branch and wind in wood 1.wav","What can be heard blowing in the background?",WIND 503 | "spring, road.wav","how many times does the wood hit another surface?",FIVE 504 | "spring, road.wav","What type of material is making that impact noise ?",WOOD 505 | "Rishikesh Aarati.wav","How many times does this person cough ?",ONE 506 | "Rishikesh Aarati.wav","How many times is a horn honked?",FOUR 507 | "Stream # 2.wav","Name the fluid that is churning away?",WATER 508 | "Taking the car out of the garage.wav","What is making the noise?",MACHINE 509 | "Taking the car out of the garage.wav","What was in the garage ?",VEHICLE 510 | "Shower 2.wav","What is going down the drain?",WATER 511 | "Shower 2.wav","Which room does this sound occur in?",BATHROOM 512 | "005_musesdelight_charismatic-african-preacher.wav","What is the gender of the person speaking?",MALE 513 | "005_musesdelight_charismatic-african-preacher.wav","What place are the people in?",CHURCH 514 | "Boiling a cup of water.wav","What process needs to occur to the liquid, in order for it to make that sound?",BOILING 515 | "Toilet Shuffling.wav","What object was just used?",TOILET 516 | "down stars running 3.wav","What final action is the person doing at the very end of the sound?",STOMPING 517 | "PIT-ROIG 0.12-0.17.wav","What material are the boots made from?",RUBBER 518 | "Short Hailstorm.wav","What is pouring down in the sound?",RAIN 519 | "Short Hailstorm.wav","What material is the rain beating against?",METAL 520 | "Face slap CsG.wav","How many times is the slapping sound heard?",TWENTY 521 | "Face slap CsG.wav","What item is being hit against the object?",HAND 522 | "nxSample010.wav","What is banging?",METAL 523 | "nxSample010.wav","What is running?",WATER 524 | "Bubbles water.wav","What object is making the liquid bubble?",STRAW 525 | "squeaking wooden 
floor.wav","What is the final sound heard called?",SQUEAKING 526 | "fan_2_300513.wav","What is blowing out air?",FAN 527 | "Jumping onto a hard floor with shoes and some walking sounds.wav","What sport is associated with this sound?",TENNIS 528 | "20130723_Rain1.wav","How is the traffic?",SLOW 529 | "Kali Temple Soundscape.wav","What are the people doing?",TALKING 530 | "Parking Garage - Ambiance, Electrical Hum 1.wav","How many times does the buzzing pause?",ZERO 531 | "Parking Garage - Ambiance, Electrical Hum 1.wav","What is the sound that heard called?",BUZZING 532 | "knocking on a window or glass.wav","What is the person doing?",KNOCKING 533 | "cowshed.wav","How many times do the cows moo?",SIX 534 | "cowshed.wav","What animal is making the noise?",COW 535 | "Pag_Starigrad_crickets_birds_2.wav","How many different bird calls are heard?",THREE 536 | "Pag_Starigrad_crickets_birds_2.wav","What kind of animals are nearby?",BIRDS 537 | "Ext, distance village, light wind in tree-01.wav","How many different types of animals are there?",FOUR 538 | "microphonecontact_stereo.wav","What action is being done to one of the objects in this clip?",GRINDING 539 | "microphonecontact_stereo.wav","What is being eaten?",ICE 540 | "Small Boat Engine.wav","What does this machine do?",CONSTRUCTION 541 | "Small Boat Engine.wav","what gender of human speaks?",MALE 542 | "footsteps on beach.wav","How is the person travelling?",WALKING 543 | "door.of.bar.raining2.wav","What is heard over over talking crowd?",RAIN 544 | "door.of.bar.raining2.wav","What living mammal is making the sounds?",HUMAN 545 | "PassingMoped01.wav","What sound are the birds making?",CHIRPING 546 | "rain2.wav","What can be heard falling?",WATER 547 | "rain2.wav","What speed is the rain falling?",FAST 548 | "Hamamatsu-traffic-light-1.wav","the chiming usually happens when your car door is what?",OPEN 549 | "coastal road on the beach, scooter, motorcycle, car.wav","Around what kind of building would noises like these 
be a common?",AIRPORT 550 | "coastal road on the beach, scooter, motorcycle, car.wav","Which vehicle has the noisiest engine in this clip?",MOTORCYCLE 551 | "windup_dino_slow.wav","What is the person doing?",TYPING 552 | "radio tuning 2.wav","How would one describe the quality of the conversation?",BAD 553 | "radio tuning 2.wav","What electronic is being dialed?",RADIO 554 | "md1trk33-34.wav","To what does the hinges need?",OIL 555 | "md1trk33-34.wav","What rectangular object that opens and closes are the hinges likely attached to?",DOOR 556 | "washcloth.wav","The sound is of a ball hitting what?",FLOOR 557 | "Blackbird sounds.wav","In what alive thing does this creature usually build a nest?",TREE 558 | "Blackbird sounds.wav","What kind of animal is that?",BIRD 559 | "20100110.kitchen.wav","What is beeping?",MICROWAVE 560 | "20100110.kitchen.wav","what item makes the ding sound?",ALARM 561 | "Sword clanks 3.wav","how many bangs are there?",ELEVEN 562 | "Sword clanks 3.wav","What is the material that is making the sounds?",METAL 563 | "ambient text.wav","What genre of movie could this sound be played in?",HORROR 564 | "ambient text.wav","What style does the music belong to?",HORROR 565 | "Subping03.wav","What machine creates this noise?",ELECTRONIC 566 | "Subping03.wav","What shell allows the sound to reverberate?",SEA 567 | "STE-017 traffic bus stop.wav","How many cars pass by?",THREE 568 | "STE-017 traffic bus stop.wav","What are the cars doing?",DRIVING 569 | "Anti Air Gun (3 Sounds).wav","Name one of the guns that was fired?",MACHINE 570 | "Anti Air Gun (3 Sounds).wav","what does the gun do?",FIRE 571 | "Wet Footstpes Sidewalk . 
Metro Pass in Distance.wav","how many steps are taken?",FIFTY 572 | "Construction Sounds.wav","What is rattling here?",MACHINE 573 | "Little sreet behind a terrasse cafe.wav","What kind of noises are the animals making?",BARKING 574 | "131227_strumyk_1.wav","What is running?",WATER 575 | "131227_strumyk_1.wav","what is the water doing?",FLOWING 576 | "Wind_Whistling_Dorm_Window.wav","How long does it beep?",LONG 577 | "Wind_Whistling_Dorm_Window.wav","What type of sound is playing though out the clip?",WHISTLE 578 | "RKeaton_EMF366_12_Tearing Thick Paper.wav","How many tearing sequences are heard throughout?",FOUR 579 | "RKeaton_EMF366_12_Tearing Thick Paper.wav","What is the person doing to the paper?",TEARING 580 | "Gibbons of Dusit.wav","What animal is making these noises?",BIRD 581 | "Gibbons of Dusit.wav","Which instance of the whooping sound is the longest?",LAST 582 | "bebops_water1.wav","how many times does a wave collide with the shore?",FIVE 583 | "obresAranyo_trepant2.wav","What would this sound likely be made by if it were heard in a dentist's office?",DRILL 584 | "pr#6F9A9E.wav","how many times does the item drop?",EIGHT 585 | "Llantas_rechinando.wav","How many times is a screeching sound made?",TWELVE --------------------------------------------------------------------------------