├── nets
    ├── __init__.py
    ├── multi_attention_encoder_decoder.py
    ├── sliding_chunks.py
    ├── ours_MWAFM_Net.py
    ├── multi_attention.py
    └── diagonaled_mm_tvm.py
├── scripts
    ├── word_numbers.py
    ├── split_dataset.py
    └── data_clean.py
├── utils.py
├── readme.md
├── conifg.py
├── data_generator.py
├── metadata
    ├── output_classes_clean.json
    ├── output_classes.json
    ├── metadata_orig
    │   └── output_classes.json
    ├── wordst.txt
    ├── single_word_val_clean.csv
    └── single_word_test_clean.csv
└── main_MWAFM.py


/nets/__init__.py:
--------------------------------------------------------------------------------
1 | from nets.ours_MWAFM_Net import MWAFM_Net, MultiAttnLayer
2 | 


--------------------------------------------------------------------------------
/scripts/word_numbers.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import csv
 3 | import pandas as pd
 4 | 
 5 | def read_metadata(csv_file_path):
 6 |     csv_data = pd.read_csv(csv_file_path, encoding='latin1')
 7 |     audio_fnames = list(csv_data['file_name'])
 8 |     questions = list(csv_data['QuestionText'])
 9 |     answers = list(csv_data['answer'])
10 |     return audio_fnames, questions, answers
11 | 
def StatWorsNums(csv_file):
    """Print the distinct question words of a metadata CSV and how many there are.

    Commas and question marks are removed from every question, which is then
    split on single spaces. Words are listed in order of first appearance.

    Args:
        csv_file: path of a metadata CSV readable by ``read_metadata``.
    """
    _, questions, _ = read_metadata(csv_file)

    words = []
    # A set gives O(1) membership tests; the list keeps first-seen order so
    # the printed output is the same as a plain list scan would produce.
    seen = set()
    for qst in questions:
        for wd in qst.replace(",", "").replace("?", "").split(" "):
            if wd not in seen:
                seen.add(wd)
                words.append(wd)

    print(words)
    print(len(words))
30 | 
31 | 
32 | 
33 | 
if __name__ == "__main__":
    # The path is relative to the working directory, so this is expected to
    # be launched from inside the scripts/ folder.
    StatWorsNums("../metadata/single_word_train.csv")


--------------------------------------------------------------------------------
/scripts/split_dataset.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import json
 3 | 
 4 | 
 5 | def split_data_into_binary_and_single_word_answers(csv_file, split):
 6 |     csv_data = pd.read_csv(csv_file, encoding='latin1', usecols=['file_name', 'QuestionText', 'answer' ])
 7 | 
 8 |     csv_data['answer'] = csv_data['answer'].str.upper()
 9 | 
10 |     select = ['YES', 'NO']
11 |     data_single_word = csv_data[~csv_data['answer'].isin(select)]
12 |     data_binary = csv_data[csv_data['answer'].isin(select)]
13 | 
14 |     with open('metadata/single_word_{}.csv'.format(split), 'wb') as f:
15 |         data_single_word.to_csv(f, index=False)
16 |     
17 |     with open('metadata/binary_{}.csv'.format(split), 'wb') as f:
18 |         data_binary.to_csv(f, index=False)
19 | 
20 | 
# Split every official Clotho-AQA metadata file into binary / single-word parts.
csv_file_train = 'metadata/clotho_aqa_train.csv'
csv_file_val = 'metadata/clotho_aqa_val.csv'
csv_file_test = 'metadata/clotho_aqa_test.csv'

split_data_into_binary_and_single_word_answers(csv_file_train, 'train')
split_data_into_binary_and_single_word_answers(csv_file_val, 'val')
split_data_into_binary_and_single_word_answers(csv_file_test, 'test')

# create word-index for single word answers
csv_file = 'metadata/single_word_train.csv'
csv_data = pd.read_csv(csv_file, encoding='latin1', usecols=['file_name', 'QuestionText', 'answer'])

csv_data['answer'] = csv_data['answer'].str.upper()

# Sort the distinct answers before numbering them: iterating a bare set of
# strings can yield a different order on every interpreter run, which would
# silently change the class indices each time this script is executed.
answers_set = sorted(set(csv_data['answer']), key=str)
answers_dict = dict(zip(answers_set, range(len(answers_set))))

with open("metadata/output_classes.json", "w") as outfile:
    json.dump(answers_dict, outfile)
40 | 


--------------------------------------------------------------------------------
/scripts/data_clean.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import csv
 3 | import pandas as pd
 4 | import json
 5 | 
 6 | 
 7 | 
 8 | def read_metadata(csv_file_path):
 9 |     csv_data = pd.read_csv(csv_file_path, encoding='latin1')
10 |     audio_fnames = list(csv_data['file_name'])
11 |     questions = list(csv_data['QuestionText'])
12 |     answers = list(csv_data['answer'])
13 |     return audio_fnames, questions, answers
14 | 
def QAClean(csv_file):
    """Print every question whose three annotations contain a repeated answer.

    Rows are consumed in consecutive groups of three. The file name and
    question text of the first row of each group are used for the printed
    line. A group is reported when its second answer equals the first, or
    otherwise when its third answer equals the first or the second. The
    number of reported groups is printed at the end.

    Args:
        csv_file: path of a metadata CSV readable by ``read_metadata``.
    """
    wave_name, question, answer = read_metadata(csv_file)

    cnt = 0
    # Stop before a trailing incomplete group so that a row count which is
    # not a multiple of three cannot index past the end of the lists.
    for index in range(0, len(question) - 2, 3):
        first, second, third = answer[index], answer[index + 1], answer[index + 2]

        if second in (first,):
            print(wave_name[index], ', "', question[index], '", ', second)
            cnt += 1
        elif third in (first, second):
            print(wave_name[index], ',"', question[index], '",', third)
            cnt += 1
    print("cnt: ", cnt)
37 | 
38 | 
def AnswerGen(csv_file, out_file="../metadata/output_classes_clean.json"):
    """Write a JSON mapping from each distinct upper-cased answer to a class index.

    Args:
        csv_file: metadata CSV with the columns 'file_name', 'QuestionText'
            and 'answer' (decoded as latin1).
        out_file: destination of the JSON mapping. The default is the path
            this function always wrote to before the parameter existed.
    """
    csv_data = pd.read_csv(csv_file, encoding='latin1',
                           usecols=['file_name', 'QuestionText', 'answer'])
    csv_data['answer'] = csv_data['answer'].str.upper()

    # Number the answers in sorted order; iterating a bare set of strings can
    # give a different order, and therefore different indices, on every run.
    answers = sorted(set(csv_data['answer']), key=str)
    answers_dict = {answer: index for index, answer in enumerate(answers)}

    with open(out_file, "w") as outfile:
        json.dump(answers_dict, outfile)
50 | 
51 | 
52 | 
if __name__ == "__main__":
    # The path is relative to the working directory (run from scripts/).
    # AnswerGen(...) can be called on a metadata CSV instead to produce the
    # answer-to-index JSON file.
    QAClean("../metadata/binary_test.csv")


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import io
 3 | import numpy as np
 4 | import json
 5 | 
 6 | 
 7 | def load_vectors(embedding_file):
 8 |     fin = io.open(embedding_file, 'r', encoding='utf-8', newline='\n', errors='ignore')
 9 |     data = {}
10 |     for line in fin:
11 |         tokens = line.rstrip().split(' ')
12 |         data[tokens[0]] = list(map(float, tokens[1:]))
13 |     return data
14 | 
15 | 
def read_metadata(csv_file_path):
    """Return the file names, questions and answers stored in a metadata CSV.

    The CSV is decoded as latin1. The three returned lists come from the
    columns 'file_name', 'QuestionText' and 'answer' and share row order.
    """
    rows = pd.read_csv(csv_file_path, encoding='latin1')
    return (
        list(rows['file_name']),
        list(rows['QuestionText']),
        list(rows['answer']),
    )
22 | 
23 | 
def binary_classification_accuracy(pred, ground_truth):
    """Return the fraction of samples whose prediction matches the label.

    Args:
        pred: array of hard predictions, one row per sample; may be shaped
            ``(n,)`` or ``(n, 1)``.
        ground_truth: array holding one label per sample.

    Returns:
        ``(n - mismatches) / n`` as a float, or 0.0 when there are no samples.
    """
    pred = np.asarray(pred)
    ground_truth = np.asarray(ground_truth)
    n_samples = pred.shape[0]
    if n_samples == 0:
        return 0.0

    # Put both operands in one row per sample before comparing. A flat
    # (n,) prediction compared against an (n, 1) label column would
    # otherwise broadcast to (n, n) and count unrelated pairs.
    mismatches = pred.reshape(n_samples, -1) != ground_truth.reshape(n_samples, 1)
    n_wrong_predictions = np.count_nonzero(mismatches)
    return (n_samples - n_wrong_predictions) / n_samples
31 | 
32 | 
def multiclass_classification_accuracy(logits, ground_truth, k=1):
    """Return top-k accuracy of class scores against integer labels.

    Args:
        logits: ``(n_samples, n_classes)`` array of class scores.
        ground_truth: one class index per sample, shaped ``(n,)`` or ``(n, 1)``.
        k: a sample counts as correct when its label is among the ``k``
            highest-scoring classes.

    Returns:
        Fraction of correct samples as a float, or 0.0 when there are none.
    """
    logits = np.asarray(logits)
    n_samples = logits.shape[0]
    if n_samples == 0:
        return 0.0

    # Flatten the labels once so both branches treat (n,) and (n, 1) inputs
    # alike; subtracting an (n, 1) column from the (n,) argmax vector would
    # broadcast to (n, n) and miscount.
    targets = np.asarray(ground_truth).reshape(n_samples)

    if k == 1:
        hits = np.argmax(logits, axis=1) == targets
    else:
        # np.argsort sorts ascending, so negate the scores to rank the
        # highest-scoring classes first.
        top_k = np.argsort(-1 * logits, 1)[:, :k]
        hits = top_k == targets.reshape(n_samples, 1)

    return np.count_nonzero(hits) / n_samples
48 | 
49 | 
50 | 
def load_answers_dict(answers_dict_file):
    """Load the answer-to-class-index mapping stored as JSON.

    Args:
        answers_dict_file: path of the JSON file.

    Returns:
        The decoded JSON object.
    """
    # Use a context manager so the file handle is closed after parsing.
    with open(answers_dict_file) as f:
        return json.load(f)
55 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # Audio Question Answering (AQA)
 4 | 
 5 | PyTorch code accompanying our Interspeech 2023 paper:
 6 | 
 7 | **Multi-Scale Attention for Audio Question Answering** \[[arXiv](https://arxiv.org/abs/2305.17993)\]
 8 | 
 9 | [Guangyao Li](https://ayameyao.github.io/), Yixin Xu and [Di Hu](https://dtaoo.github.io/index.html)
10 | 
11 | ---
12 | 
13 | ## Requirements
14 | 
15 | ```python
16 | python3.6 +
17 | pytorch1.6.0
18 | tensorboardX
19 | ffmpeg
20 | ```
21 | 
22 | ## Usage
23 | 
24 | 1. **Clone this repo**
25 | 
26 |    ```python
27 |    git clone https://github.com/GeWu-Lab/MWAFM.git
28 |    ```
29 | 
30 | 2. **Download data**
31 | 
32 |    [Clotho-AQA](https://zenodo.org/record/6473207) and [AQA-MUSIC-AVQA](https://gewu-lab.github.io/MUSIC-AVQA/)
33 |    
34 | 3. **Data pre-processing**
35 | 
36 |    We follow exactly the same data format as [MUSIC AVQA](https://gewu-lab.github.io/MUSIC-AVQA/).
37 | 
38 |    **Notice:** We examined the original annotation files of Clotho-AQA and found that the official open-source annotations were not cleaned, resulting in discrepancies where different annotators provided different answers for the same question. We therefore applied a simple filter: a question is considered to have a correct answer only if at least two of its annotated answers are identical. Based on this filtering process, we obtained a new and more accurate annotation file. The files in the 'metadata' folder are described as follows:
39 | 
40 |    - 'single_word\_[train/val/test].csv', Does not contain samples with answers *yes* and *no*.
41 |    - 'single_word\_[train/val/test]\_clean.csv', Does not contain samples with answers *yes* and *no*. (Cleaned data)
42 |    - 'clotho_aqa\_[train/val/test]\_clean.csv', Contains samples with answers *yes* and *no*. (Cleaned data)
43 |    - 'binary\_[train/val/test]\_clean.csv', Include only samples with answers *yes* and *no*. (Cleaned data)
44 | 
45 |    
46 | 
47 | 4. **Train and evaluate**
48 | 
49 |    Training
50 | 
51 |    ```python
52 |    python main_MWAFM.py --mode train
53 |    ```
54 | 
55 |    Testing
56 | 
57 |    ```python
58 |    python main_MWAFM.py --mode test
59 |    ```
60 | 
61 | 
62 | ## Citation
63 | 
64 | If you find this work useful, please consider citing it.
65 | 
66 | <pre><code>
67 | @ARTICLE{Li2023MultiScale,
68 |   title	= {Multi-Scale Attention for Audio Question Answering},
69 |   author	= {Guangyao Li and Yixin Xu and Di Hu},
70 |   journal	= {Proc. INTERSPEECH},
71 |   year	= {2023},
72 | }
73 | </code></pre>
74 | 
75 | 
76 | 
77 | ## Acknowledgement
78 | 
79 | This research was supported by Public Computing Cloud, Renmin University of China.
80 | 
81 | 


--------------------------------------------------------------------------------
/conifg.py:
--------------------------------------------------------------------------------
 1 | data_config = {
 2 | 
 3 |     # 'train_metadata_path': 'metadata/clotho_aqa_train_clean.csv',  # CSV containing audio URLs, Questions, Answers,filenames
 4 |     # 'val_metadata_path': 'metadata/clotho_aqa_val_clean.csv',
 5 |     # 'test_metadata_path': 'metadata/clotho_aqa_test_clean.csv',
 6 |     # 'output_classes_file': 'metadata/output_classes_clean.json',
 7 | 
 8 |     'train_metadata_path': 'metadata/single_word_train.csv',  # CSV containing audio URLs, Questions, Answers,filenames
 9 |     'val_metadata_path': 'metadata/single_word_val.csv',
10 |     'test_metadata_path': 'metadata/single_word_test.csv',
11 |     'output_classes_file': 'metadata/output_classes.json',
12 | 
13 |     # 'data_dir': '/home/data/clotho-aqa/audio_16kHz',  # path to store downloaded data
14 |     'feat_dir': '/home/data/clotho-aqa/vggish',
15 |     # 'feat_ast_dir': '/home/data/clotho-aqa/feats/ast',
16 |     # 'feat_dir': '/home/data/clotho-aqa/audio_spec',
17 |     'question_dir': './metadata/questions.csv',
18 |     'pre_trained_word_embeddings_file': './pretrained/wiki-news-300d-1M.vec',
19 |     'audio_embedding_size': 512,
20 | 
21 |     # audio length
22 |     'audio_length': 24,
23 |     'quest_length': 22,
24 | }
25 | 
26 | model_config = {
27 | 
28 |     'learning_rate': 1e-4,
29 |     'batch_size': 64,
30 |     'num_workers': 12,
31 |     'num_epochs': 50,
32 |     'log_interval': 10,
33 | 
34 | 
35 |     # audio network
36 |     'audio_input_size': data_config['audio_embedding_size'],
37 |     'audio_lstm_n_layers': 2,
38 |     'audio_lstm_hidden_size': 128,
39 |     'audio_bidirectional': True,
40 |     'audio_lstm_dropout': 0.2,
41 | 
42 | 
43 |     # NLP network
44 |     'text_input_size': 300,  # pretrained embedding size from fasttext
45 |     'text_lstm_n_layers': 2,
46 |     'text_lstm_hidden_size': 128,
47 |     'text_bidirectional': True,
48 |     'text_lstm_dropout': 0.2,
49 | 
50 |     # classification
51 |     'n_dense1_units': 256,
52 |     'n_dense2_units': 128,
53 | }
54 | 
55 | 
56 | 
57 | if 'binary' in data_config['train_metadata_path']:
58 |     model_config['n_classes'] = 1
59 | else:
60 |     model_config['n_classes'] = 828                 # Notice! output dim!
61 |     model_config['audio_lstm_hidden_size'] = 512
62 |     model_config['text_lstm_hidden_size'] = 512
63 | 
64 | dense1_input = 0
65 | if model_config['audio_bidirectional']:
66 |     dense1_input = dense1_input + 2 * model_config['audio_lstm_hidden_size']
67 | else:
68 |     dense1_input = dense1_input + model_config['audio_lstm_hidden_size']
69 | 
70 | if model_config['text_bidirectional']:
71 |     dense1_input = dense1_input + 2 * model_config['text_lstm_hidden_size']
72 | else:
73 |     dense1_input = dense1_input + model_config['text_lstm_hidden_size']
74 | 
75 | model_config['dense1_input'] = dense1_input
76 | 


--------------------------------------------------------------------------------
/nets/multi_attention_encoder_decoder.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional, Tuple, Dict
 2 | from torch import nn, Tensor
 3 | from nets.multi_attention import MultiScaleSelfAttention
 4 | # from transformers.modeling_bart import BartConfig, BartForConditionalGeneration
 5 | # from transformers import BartConfig, BartForConditionalGeneration
 6 | 
 7 | 
 8 | class LongformerEncoderDecoderForConditionalGeneration(BartForConditionalGeneration):
 9 |     def __init__(self, config):
10 |         super().__init__(config)
11 |         if config.attention_mode == 'n2':
12 |             pass  # do nothing, use BertSelfAttention instead
13 |         else:
14 |             for i, layer in enumerate(self.model.encoder.layers):
15 |                 layer.self_attn = MultiScaleSelfAttentionForBart(config, layer_id=i)
16 | 
17 | 
class LongformerEncoderDecoderConfig(BartConfig):
    # NOTE(review): `BartConfig` is not imported in the visible part of this
    # module (both candidate imports are commented out) - confirm the import.
    def __init__(self, attention_window: Optional[List[int]] = None, attention_dilation: Optional[List[int]] = None,
                 autoregressive: bool = False, attention_mode: str = 'sliding_chunks',
                 gradient_checkpointing: bool = False, **kwargs):
        """
        Store the attention settings on top of the base configuration.

        Args:
            attention_window: list of attention window sizes of length = number of layers.
                window size = number of attention locations on each side.
                For an effective window size of 512, use `attention_window=[256]*num_layers`
                which is 256 on each side.
            attention_dilation: list of attention dilation of length = number of layers.
                attention dilation of `1` means no dilation.
            autoregressive: do autoregressive attention or have attention of both sides
            attention_mode: 'n2' for regular n^2 self-attention, 'tvm' for TVM implementation of Longformer
                self-attention, 'sliding_chunks' for another implementation of Longformer self-attention
            gradient_checkpointing: stored on the config as given; not used further in this class.
            **kwargs: forwarded unchanged to the base class constructor.

        Raises:
            AssertionError: if `attention_mode` is not 'tvm', 'sliding_chunks' or 'n2'.
        """
        super().__init__(**kwargs)
        self.attention_window = attention_window
        self.attention_dilation = attention_dilation
        self.autoregressive = autoregressive
        self.attention_mode = attention_mode
        self.gradient_checkpointing = gradient_checkpointing
        # Validated only after the attributes are set, so a bad mode is
        # rejected at the end of construction.
        assert self.attention_mode in ['tvm', 'sliding_chunks', 'n2']
41 | 
42 | 
class MultiScaleSelfAttentionForBart(nn.Module):
    """Wrap ``MultiScaleSelfAttention`` behind a sequence-first attention call.

    The wrapped module is followed by a square linear projection of width
    ``config.d_model``.
    """

    def __init__(self, config, layer_id):
        super().__init__()
        self.embed_dim = config.d_model
        self.multi_scale_self_attn = MultiScaleSelfAttention(config, layer_id=layer_id)
        self.output = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        query,
        key: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
        attn_mask: Optional[Tensor] = None,
        need_weights=False,
        output_attentions=False,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Run multi-scale self-attention on ``query`` and project the result.

        ``key``, ``layer_state`` and ``need_weights`` are accepted but not
        read; ``attn_mask`` must be None.

        NOTE(review): ``key_padding_mask`` defaults to None but is
        dereferenced unconditionally, so a call without a mask raises
        AttributeError - confirm that callers always pass one.

        Returns:
            ``(attn_output, extra)`` where ``extra`` is the second element
            returned by the wrapped module when it returns exactly two
            elements, otherwise None.
        """
        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        assert attn_mask is None

        # MultiScaleSelfAttention expects (bsz, seqlen, embd_dim)
        batch_first_query = query.transpose(0, 1)
        # Two singleton axes are inserted after the batch axis and the sign
        # of the mask is flipped before it is handed to the wrapped module.
        inverted_mask = key_padding_mask.unsqueeze(dim=1).unsqueeze(dim=1) * -1

        outputs = self.multi_scale_self_attn(
            batch_first_query,
            attention_mask=inverted_mask,
            head_mask=None,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            output_attentions=output_attentions,
        )

        # Back to the sequence-first layout of ``query`` before projecting.
        attn_output = self.output(outputs[0].transpose(0, 1))

        if len(outputs) == 2:
            return (attn_output,) + outputs[1:]
        return (attn_output, None)
78 | 


--------------------------------------------------------------------------------
/data_generator.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | from torch.utils.data.dataset import Dataset
  4 | from utils import load_vectors, read_metadata, load_answers_dict
  5 | import numpy as np
  6 | 
  7 | import torch.nn.functional as F
  8 | from conifg import data_config, model_config
  9 | 
 10 | 
 11 | class DataGenerator(Dataset):
 12 | 
 13 |     def __init__(self, meta_file):
 14 |         super(DataGenerator, self).__init__()
 15 | 
 16 |         self.meta_file = meta_file
 17 | 
 18 |         self.feat_dir = data_config['feat_dir']
 19 |         # self.feat_ast_dir = data_config['feat_ast_dir']
 20 | 
 21 |         self.audio_fnames, self.qs, self.ans = read_metadata(self.meta_file)
 22 | 
 23 |         self.batch_size = model_config['batch_size']
 24 |         self.audio_length = data_config['audio_length']
 25 |         self.qust_max_len = data_config['quest_length']
 26 | 
 27 |         self.word_embedding_path = data_config['pre_trained_word_embeddings_file']
 28 |         self.word_embeddings = load_vectors(self.word_embedding_path)  # dict of all the {'word': [vector]} pairs
 29 |         self.answers_dict = load_answers_dict(data_config['output_classes_file'])
 30 | 
 31 |     def __getitem__(self, item):
 32 | 
 33 |         audio_feat = self.load_audio_features(item)
 34 | 
 35 |         audio_name = self.audio_fnames[item][:-3] + 'npy'
 36 |         # audio_ast_feat = np.load(os.path.join(self.feat_ast_dir, audio_name))
 37 | 
 38 |         question_text = self.qs[item]
 39 |         answer_text = self.ans[item]
 40 |         question_embedding = self.get_word_embeddings(question_text)
 41 | 
 42 |         if 'binary' in self.meta_file:
 43 |             if answer_text == 'YES':
 44 |                 label = 0
 45 |             else:
 46 |                 label = 1
 47 |         else:
 48 |             label = self.answers_dict[answer_text]
 49 | 
 50 |         # return audio_feat, audio_ast_feat, question_embedding, label
 51 |         return audio_feat, question_embedding, label
 52 | 
 53 |     def load_audio_features(self, idx):
 54 |         # audio_feat_file = self.audio_fnames[idx][:-3] + 'npz'
 55 |         audio_feat_file = self.audio_fnames[idx][:-3] + 'npy'
 56 |         data = np.load(os.path.join(self.feat_dir, audio_feat_file))
 57 |         # return data['embedding']
 58 | 
 59 |         ## -------------------------------------------------------------------------------
 60 |         ## ensure audio length equal
 61 |         if self.batch_size != 1:
 62 |             data1 = torch.from_numpy(data)
 63 |             data2 = data1.unsqueeze(0).permute(0, 2, 1).contiguous()
 64 |             data3 = F.interpolate(data2, size=self.audio_length, mode='linear', align_corners=False)
 65 |             data4 = data3.permute(0, 2, 1).contiguous()
 66 |             data5 = data4.squeeze()
 67 |             data = data5.numpy()
 68 |         ## -------------------------------------------------------------------------------
 69 | 
 70 |         return data
 71 | 
 72 |     def get_word_embeddings(self, input_text):
 73 | 
 74 |         words = input_text.split(' ')
 75 |         words[-1] = words[-1][:-1]  # removing '?' from the question, repetitive in all the Qs, so adds no value
 76 | 
 77 |         ## -------------------------------------------------------------------------------
 78 |         if len(words) < self.qust_max_len:
 79 |             dn = self.qust_max_len - len(words)
 80 |             for index in range(dn):
 81 |                 words.append("0")
 82 |         else:
 83 |             words = words[0:self.qust_max_len]
 84 |         ## -------------------------------------------------------------------------------
 85 | 
 86 |         text_embedding = []
 87 |         for word in words:
 88 |             # word = word.split(",")[0]
 89 |             try:
 90 |                 embedding = self.word_embeddings[word]
 91 |             except KeyError:
 92 |                 continue
 93 |             text_embedding.append(embedding)
 94 | 
 95 |         text_embedding = np.array(text_embedding)
 96 | 
 97 |         ## -------------------------------------------------------------------------------
 98 |         # if text_embedding.shape[0] < self.qust_max_len:
 99 |         #     ddn = self.qust_max_len - text_embedding.shape[0]
100 |         #     pad_value = np.repeat(text_embedding[-1], ddn)
101 |         #     text_embedding = np.append(text_embedding, pad_value)
102 |         #     text_embedding = text_embedding.reshape(self.qust_max_len, -1)
103 |         ## -------------------------------------------------------------------------------
104 | 
105 |         ## -------------------------------------------------------------------------------
106 |         text_embedding1 = torch.from_numpy(text_embedding)
107 |         text_embedding2 = text_embedding1.unsqueeze(0).permute(0, 2, 1).contiguous()
108 |         text_embedding3 = F.interpolate(text_embedding2, size=self.qust_max_len, mode='linear', align_corners=False)
109 |         text_embedding4 = text_embedding3.permute(0, 2, 1).contiguous()
110 |         text_embedding5 = text_embedding4.squeeze()
111 |         text_embedding = text_embedding5.numpy()
112 |         ## -------------------------------------------------------------------------------
113 | 
114 |         return text_embedding
115 | 
116 |     def __len__(self):
117 |         return len(self.qs)
118 | 


--------------------------------------------------------------------------------
/metadata/output_classes_clean.json:
--------------------------------------------------------------------------------
1 | {"RAIN": 0, "CROWD": 1, "GRINDING": 2, "SHOES": 3, "CRINKLING": 4, "CLASSICAL": 5, "HOWLING": 6, "SQUEAKING": 7, "BASKETBALL": 8, "SCRAPING": 9, "BLOWING": 10, "PAPER": 11, "PLAYGROUND": 12, "BOWL": 13, "STYROFOAM": 14, "CHILD": 15, "CUTTER": 16, "RATCHET": 17, "TABLE": 18, "PARK": 19, "MOTOR": 20, "TRACKS": 21, "BOILING": 22, "DOWN": 23, "OUTSIDE": 24, "CLAP": 25, "ELECTRICITY": 26, "BALLOON": 27, "MICROWAVE": 28, "GRINDER": 29, "TRUMPET": 30, "SQUEAK": 31, "AUDIENCE": 32, "SLAMMING": 33, "BEACH": 34, "GOOSE": 35, "CHEER": 36, "PEOPLE": 37, "HONKING": 38, "LONG": 39, "KETTLE": 40, "POPCORN": 41, "OFFICE": 42, "PIPE": 43, "RINGING": 44, "TOY": 45, "CHICKEN": 46, "NIGHT": 47, "HARD": 48, "HAMMER": 49, "NEVER": 50, "CONSTRUCTION": 51, "DOG": 52, "CLOSE": 53, "MARKET": 54, "STOP": 55, "LOCK": 56, "HEN": 57, "STORMY": 58, "ELECTRONIC": 59, "SEVEN": 60, "SPLASH": 61, "OCEAN": 62, "THIRTEEN": 63, "BATHING": 64, "PEBBLES": 65, "DRYER": 66, "VERY": 67, "SEVENTEEN": 68, "WORKING": 69, "SEAGULLS": 70, "VEHICLE": 71, "AWAY": 72, "TWENTYTWO": 73, "POLICE": 74, "PULLED": 75, "BOOK": 76, "CAT": 77, "LIGHT": 78, "ICE": 79, "BEES": 80, "BAND": 81, "HORROR": 82, "BOTTLES": 83, "FAR": 84, "FISH": 85, "PIANO": 86, "CHANGING": 87, "SHAKING": 88, "WHITE": 89, "READING": 90, "LAST": 91, "BELLS": 92, "FOREST": 93, "COUGHING": 94, "BOARD": 95, "BAG": 96, "CHEERING": 97, "ACCELERATES": 98, "BANG": 99, "SIXTY": 100, "EGGS": 101, "MAT": 102, "HORN": 103, "MIDDLE": 104, "FLOORBOARD": 105, "TWENTYSIX": 106, "FROG": 107, "ONCE": 108, "STAPLER": 109, "CRASHING": 110, "TEN": 111, "KNIFE": 112, "POP": 113, "BATHROOM": 114, "FIREWORKS": 115, "TREE": 116, "FEET": 117, "AIRPORT": 118, "SMALL": 119, "TYPEWRITER": 120, "COIN": 121, "JOGGING": 122, "WINDER": 123, "GROWLING": 124, "SPINNER": 125, "MOWER": 126, "FIFTY": 127, "HORSE": 128, "KNOCKING": 129, "BIKE": 130, "TAPPING": 131, "RAINY": 132, "ONE": 133, "SPILLING": 134, "NINETEEN": 135, "SPEAKING": 136, "TAP": 137, "HIKING": 138, "BELL": 139, 
"WATER": 140, "SINGING": 141, "KEYS": 142, "GONG": 143, "DRAIN": 144, "DRILL": 145, "THUNDERSTORM": 146, "LIGHTENING": 147, "DRUMS": 148, "FLOWING": 149, "RAILROAD": 150, "CLOCK": 151, "BANGING": 152, "SHOWER": 153, "PRINTING": 154, "SECOND": 155, "AIR": 156, "STATIC": 157, "MOUTH": 158, "PURRING": 159, "STEPS": 160, "EXHAUST": 161, "METAL": 162, "LEAVES": 163, "BABBLING": 164, "WHISTLING": 165, "RUNNING": 166, "CONSISTENT": 167, "STREAM": 168, "TWICE": 169, "AMBULANCE": 170, "RADAR": 171, "WINTER": 172, "INSECT": 173, "HUMANS": 174, "GRASS": 175, "LIQUID": 176, "SIREN": 177, "SIX": 178, "DRIPPING": 179, "KIDS": 180, "BOAT": 181, "ROOSTER": 182, "NONE": 183, "FLYING": 184, "FEMALE": 185, "SLOW": 186, "FADES": 187, "COINS": 188, "HUNDRED": 189, "KEYBOARD": 190, "SHEEP": 191, "FLY": 192, "DRINK": 193, "MARKER": 194, "CABINET": 195, "SODA": 196, "TRAIN": 197, "ALARM": 198, "NOTHING": 199, "ROCK": 200, "BEAR": 201, "CHIRPING": 202, "PULSING": 203, "HEAVY": 204, "UP": 205, "TRUCK": 206, "HUMAN": 207, "DISTORTION": 208, "BEE": 209, "SWEEPING": 210, "GLASS": 211, "MEDIUM": 212, "SINK": 213, "CARDS": 214, "DRESS": 215, "ADULT": 216, "RAINSTORM": 217, "TWENTYEIGHT": 218, "GARBAGE": 219, "TREES": 220, "SHORE": 221, "ROOF": 222, "OPEN": 223, "FIVE": 224, "CHIPS": 225, "SNAKE": 226, "OPENING": 227, "MACHINERY": 228, "WHEEL": 229, "ALOT": 230, "ZOO": 231, "FLUTE": 232, "PAVEMENT": 233, "MUD": 234, "FOURTEEN": 235, "CRICKET": 236, "DUCKS": 237, "SEA": 238, "PERSON": 239, "TWENTY": 240, "PINGPONG": 241, "PULLOVER": 242, "SWIMMING": 243, "WATCH": 244, "HIGHWAY": 245, "HAPPY": 246, "DOOR": 247, "TELEPHONE": 248, "SPEED": 249, "WHISTLE": 250, "TWENTYONE": 251, "MIXING": 252, "TENNIS": 253, "ANNOUNCEMENT": 254, "LAUGHING": 255, "TORNADO": 256, "SEESAW": 257, "START": 258, "SEVENTYTHREE": 259, "SHOWERING": 260, "UNKNOWN": 261, "COFFEEMAKER": 262, "SOFT": 263, "TOP": 264, "PIG": 265, "THROWING": 266, "HALL": 267, "FEATHERS": 268, "THIRTY": 269, "MOVING": 270, "MALE": 271, "CRICKETS": 
272, "TV": 273, "TALKING": 274, "KEY": 275, "TRAFFIC": 276, "LOUD": 277, "MEETING": 278, "SNIFFING": 279, "PILOT": 280, "EXCITEMENT": 281, "OIL": 282, "FALLING": 283, "SCRIBBLING": 284, "CROW": 285, "WOOD": 286, "HITTING": 287, "LOUDSPEAKER": 288, "STONE": 289, "SHOOTING": 290, "MOTORCYCLE": 291, "WRITING": 292, "HAY": 293, "BRUSH": 294, "TWELVE": 295, "FINGERS": 296, "THUNDER": 297, "TEARING": 298, "WASHER": 299, "JACKHAMMER": 300, "TILE": 301, "MAN": 302, "NESTS": 303, "FIREPLACE": 304, "SAND": 305, "FAST": 306, "CRYING": 307, "FACTORY": 308, "MOUSE": 309, "SCHOOL": 310, "WET": 311, "DEEP": 312, "PLANE": 313, "STOPS": 314, "FLOOR": 315, "GOOD": 316, "HAND": 317, "THIRTYONE": 318, "CUP": 319, "MANY": 320, "WHIRRING": 321, "FRYING": 322, "CYMBAL": 323, "PLAYING": 324, "GRAVEL": 325, "BUBBLING": 326, "NOTEBOOK": 327, "KITCHEN": 328, "ELEVEN": 329, "LAWNMOWER": 330, "FIRE": 331, "SUBWAY": 332, "UTENSIL": 333, "MACHINE": 334, "WINDOW": 335, "BOTTLE": 336, "DIESEL": 337, "END": 338, "DROPPING": 339, "TIMER": 340, "CHOPPING": 341, "FAUCET": 342, "CHATTING": 343, "FIRST": 344, "TEETH": 345, "CHAIR": 346, "CLANKING": 347, "BABY": 348, "LION": 349, "SLOWLY": 350, "STRAW": 351, "FOUR": 352, "BRAKES": 353, "WATERFALL": 354, "NINE": 355, "CARS": 356, "PARTY": 357, "WAVES": 358, "DRIVE": 359, "UMBRELLA": 360, "EVENING": 361, "SILENT": 362, "HIGH": 363, "WHISPERING": 364, "ENGINE": 365, "CONTINUOUSLY": 366, "SHAKER": 367, "DRUM": 368, "CLOUDS": 369, "CLAPPING": 370, "DUCK": 371, "EIGHTEEN": 372, "COUGH": 373, "SHARPENING": 374, "THREE": 375, "BREATHING": 376, "GROWL": 377, "PENCIL": 378, "BAD": 379, "DRAINING": 380, "GROUND": 381, "STEAM": 382, "COCK": 383, "BALL": 384, "BUZZING": 385, "LOW": 386, "SPEAKER": 387, "TIRES": 388, "RAKE": 389, "EIGHT": 390, "CAR": 391, "BIRDS": 392, "RAILWAYSTATION": 393, "COFFEE": 394, "WIND": 395, "WOMAN": 396, "HANDS": 397, "RACING": 398, "FABRIC": 399, "CART": 400, "SLEEPING": 401, "BEGINNING": 402, "GUITAR": 403, "DRILLING": 404, "FOURTY": 
405, "POURING": 406, "SOUND": 407, "EMERGENCY": 408, "LISTENING": 409, "STOMPING": 410, "STORM": 411, "AIRPLANE": 412, "CHAINSAW": 413, "INCREASE": 414, "WOMEN": 415, "STEADY": 416, "GATE": 417, "PIPES": 418, "SIXTEEN": 419, "VIBRATING": 420, "FORTY": 421, "STATION": 422, "COMPUTER": 423, "XYLOPHONE": 424, "RUBBER": 425, "PLASTIC": 426, "SAW": 427, "SEWING": 428, "BIRD": 429, "CLUCKING": 430, "RADIO": 431, "TOILET": 432, "BIG": 433, "BOOTS": 434, "EATING": 435, "WINDY": 436, "MOWING": 437, "CHILDREN": 438, "SPLASHING": 439, "FARM": 440, "TWENTYFOUR": 441, "MICROPHONE": 442, "TWENTYFIVE": 443, "LAUGH": 444, "OWL": 445, "VIOLIN": 446, "RESTAURANT": 447, "FIFTEEN": 448, "HELICOPTER": 449, "ROAD": 450, "DISHES": 451, "DRIVING": 452, "RINSING": 453, "MORNING": 454, "STARTING": 455, "ACCELERATING": 456, "CHURCH": 457, "ZERO": 458, "COW": 459, "SCRATCHING": 460, "CHIRP": 461, "BEEPING": 462, "TAPE": 463, "TIN": 464, "TWO": 465, "REFRIGERATOR": 466, "TUNNEL": 467, "REVVING": 468, "TYPING": 469, "RAINING": 470, "CHIMES": 471, "BUS": 472, "BUBBLEWRAP": 473, "THIRTYFOUR": 474, "CONSTANT": 475, "PULLING": 476, "FAN": 477, "RAINFALL": 478, "ALWAYS": 479, "SAWING": 480, "BUCKET": 481, "BARKING": 482, "SYNTHESIZER": 483, "GLASSES": 484, "KNOCK": 485, "LAUNDRY": 486, "SANDER": 487, "TELEVISION": 488, "GAS": 489, "MUSIC": 490, "SWING": 491, "SPARROW": 492, "STAIRS": 493, "FLOOD": 494, "ORGAN": 495, "JAR": 496, "WALKING": 497, "SNOW": 498, "YES": 499, "NO": 500}


--------------------------------------------------------------------------------
/nets/sliding_chunks.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn.functional as F
  3 | from nets.diagonaled_mm_tvm import mask_invalid_locations
  4 | 
  5 | 
  6 | def _skew(x, direction, padding_value):
  7 |     '''Convert diagonals into columns (or columns into diagonals depending on `direction`'''
  8 |     x_padded = F.pad(x, direction, value=padding_value)
  9 |     x_padded = x_padded.view(*x_padded.size()[:-2], x_padded.size(-1), x_padded.size(-2))
 10 |     return x_padded
 11 | 
 12 | 
 13 | def _skew2(x, padding_value):
 14 |     '''shift every row 1 step to right converting columns into diagonals'''
 15 |     # X = B x C x M x L
 16 |     B, C, M, L = x.size()
 17 |     x = F.pad(x, (0, M + 1), value=padding_value)  # B x C x M x (L+M+1)
 18 |     x = x.view(B, C, -1)  # B x C x ML+MM+M
 19 |     x = x[:, :, :-M]  # B x C x ML+MM
 20 |     x = x.view(B, C, M, M + L)  # B x C, M x L+M
 21 |     x = x[:, :, :, :-1]
 22 |     return x
 23 | 
 24 | 
 25 | def _chunk(x, w):
 26 |     '''convert into overlapping chunkings. Chunk size = 2w, overlap size = w'''
 27 | 
 28 |     # non-overlapping chunks of size = 2w
 29 |     x = x.view(x.size(0), x.size(1) // (w * 2), w * 2, x.size(2))
 30 | 
 31 |     # use `as_strided` to make the chunks overlap with an overlap size = w
 32 |     chunk_size = list(x.size())
 33 |     chunk_size[1] = chunk_size[1] * 2 - 1
 34 | 
 35 |     chunk_stride = list(x.stride())
 36 |     chunk_stride[1] = chunk_stride[1] // 2
 37 |     return x.as_strided(size=chunk_size, stride=chunk_stride)
 38 | 
 39 | 
def sliding_chunks_matmul_qk(q: torch.Tensor, k: torch.Tensor, w: int, padding_value: float):
    '''Matrix multiplication of query x key tensors using a sliding window attention pattern.
    This implementation splits the input into overlapping chunks of size 2w (e.g. 512 for pretrained Longformer)
    with an overlap of size w.

    Input:
        q, k = tensors of identical size (bsz, seqlen, num_heads, head_dim); seqlen must be a multiple of 2w
        w = int: one-sided window size
        padding_value = float: fill value used when skewing the per-chunk attention scores
    Returns
        tensor of size (bsz, seqlen, num_heads, 2w + 1)
    '''

    bsz, seqlen, num_heads, head_dim = q.size()

    assert seqlen % (w * 2) == 0
    assert q.size() == k.size()

    chunks_count = seqlen // w - 1

    # group bsz and num_heads dimensions into one, then chunk seqlen into chunks of size w * 2
    q = q.transpose(1, 2).reshape(bsz * num_heads, seqlen, head_dim)
    k = k.transpose(1, 2).reshape(bsz * num_heads, seqlen, head_dim)

    chunk_q = _chunk(q, w)
    chunk_k = _chunk(k, w)

    # matrix multiplication
    # bcxd: bsz*num_heads x chunks x 2w x head_dim
    # bcyd: bsz*num_heads x chunks x 2w x head_dim
    # bcxy: bsz*num_heads x chunks x 2w x 2w
    chunk_attn = torch.einsum('bcxd,bcyd->bcxy', (chunk_q, chunk_k))  # multiply

    # convert diagonals into columns
    diagonal_chunk_attn = _skew(chunk_attn, direction=(0, 0, 0, 1), padding_value=padding_value)

    # allocate space for the overall attention matrix where the chunks are combined. The last dimension
    # has (w * 2 + 1) columns. The first (w) columns are the w lower triangles (attention from a word to
    # w previous words). The following column is attention score from each word to itself, then
    # followed by w columns for the upper triangle.
    # NOTE: `new_empty` does not initialise the buffer; the assignments below do not write every entry.

    diagonal_attn = diagonal_chunk_attn.new_empty((bsz * num_heads, chunks_count + 1, w, w * 2 + 1))

    # copy parts from diagonal_chunk_attn into the combined matrix of attentions
    # - copying the main diagonal and the upper triangle
    diagonal_attn[:, :-1, :, w:] = diagonal_chunk_attn[:, :, :w, :w + 1]
    diagonal_attn[:, -1, :, w:] = diagonal_chunk_attn[:, -1, w:, :w + 1]
    # - copying the lower triangle
    diagonal_attn[:, 1:, :, :w] = diagonal_chunk_attn[:, :, - (w + 1):-1, w + 1:]
    diagonal_attn[:, 0, 1:w, 1:w] = diagonal_chunk_attn[:, 0, :w - 1, 1 - w:]

    # separate bsz and num_heads dimensions again
    diagonal_attn = diagonal_attn.view(bsz, num_heads, seqlen, 2 * w + 1).transpose(2, 1)

    # NOTE(review): presumably overwrites, in place, the window positions that fall outside the
    # sequence (including the entries left uninitialised above) -- confirm in nets/diagonaled_mm_tvm.py
    mask_invalid_locations(diagonal_attn, w, 1, False)
    return diagonal_attn
 92 | 
 93 | 
def sliding_chunks_matmul_pv(prob: torch.Tensor, v: torch.Tensor, w: int):
    '''Same as sliding_chunks_matmul_qk but for prob and value tensors. It is expecting the same output
    format from sliding_chunks_matmul_qk.

    Input:
        prob = tensor of size (bsz, seqlen, num_heads, 2w + 1)
        v = tensor of size (bsz, seqlen, num_heads, head_dim); seqlen must be a multiple of 2w
        w = int: one-sided window size
    Returns
        tensor of size (bsz, seqlen, num_heads, head_dim)
    '''
    bsz, seqlen, num_heads, head_dim = v.size()
    assert seqlen % (w * 2) == 0
    assert prob.size()[:3] == v.size()[:3]
    assert prob.size(3) == 2 * w + 1
    chunks_count = seqlen // w - 1
    # group bsz and num_heads dimensions into one, then split seqlen into (seqlen // w) chunks of size w
    chunk_prob = prob.transpose(1, 2).reshape(bsz * num_heads, seqlen // w, w, 2 * w + 1)

    # group bsz and num_heads dimensions into one
    v = v.transpose(1, 2).reshape(bsz * num_heads, seqlen, head_dim)

    # pad seqlen with w at the beginning of the sequence and another w at the end (fill value -1)
    padded_v = F.pad(v, (0, 0, w, w), value=-1)

    # view padded_v as (chunks_count + 1) chunks of size 3w; the stride below makes consecutive
    # chunks start w positions apart, so each chunk covers the w positions before and after it
    chunk_v_size = (bsz * num_heads, chunks_count + 1, 3 * w, head_dim)
    chunk_v_stride = padded_v.stride()
    chunk_v_stride = chunk_v_stride[0], w * chunk_v_stride[1], chunk_v_stride[1], chunk_v_stride[2]
    chunk_v = padded_v.as_strided(size=chunk_v_size, stride=chunk_v_stride)

    # shift row i of each chunk i steps to the right (zero filled): (2w + 1) columns become 3w columns,
    # lining every probability up with its position inside the 3w-long value chunk
    skewed_prob = _skew2(chunk_prob, padding_value=0)

    context = torch.einsum('bcwd,bcdh->bcwh', (skewed_prob, chunk_v))
    # separate bsz and num_heads dimensions again
    return context.view(bsz, num_heads, seqlen, head_dim).transpose(1, 2)
121 | 
122 | 
def pad_to_window_size(input_ids: torch.Tensor, attention_mask: torch.Tensor,
                       one_sided_window_size: int, pad_token_id: int):
    '''Right-pad token ids and their attention mask so the sequence length becomes a multiple of
    twice the one-sided window size, as required by the sliding_chunks attention functions.
    Input:
        input_ids = torch.Tensor(bsz x seqlen): ids of wordpieces
        attention_mask = torch.Tensor(bsz x seqlen): attention mask
        one_sided_window_size = int: window size on one side of each token
        pad_token_id = int: tokenizer.pad_token_id
    Returns
        (input_ids, attention_mask) padded to length divisible by 2 * one_sided_window_size
    '''
    full_window = int(2 * one_sided_window_size)
    # Distance to the next multiple of the full window (0 when already aligned).
    missing = -input_ids.size(1) % full_window
    right_pad = (0, missing)
    padded_ids = F.pad(input_ids, right_pad, value=pad_token_id)
    # no attention on the padding tokens
    padded_mask = F.pad(attention_mask, right_pad, value=False)
    return padded_ids, padded_mask
140 | 
141 | 
142 | # ========= "sliding_chunks_no_overlap": alternative implementation of the sliding window attention =========
143 | # This implementation uses non-overlapping chunks (or blocks) of size `w` with number of local attention = 3xw
144 | # To make this implementation comparable to "sliding_chunks" set w such that
145 | #       w_of_sliding_chunks_no_overlap = w_of_sliding_chunks * 2 / 3
146 | # For example,
147 | #    w_of_sliding_chunks = 256 (this is one sided. Total attention size = 512)
148 | #    w_of_sliding_chunks_no_overlap = 170 (Total attention size = 510)
149 | # Performance:
150 | # - Speed: 30% faster than "sliding_chunks"
151 | # - Memory: 95% of the memory usage of "sliding_chunks"
152 | # The windows are asymmetric where number of attention on each side of a token ranges between w to 2w
153 | # while "sliding_chunks" has a symmetric window around each token.
154 | 
155 | 
def sliding_chunks_no_overlap_matmul_qk(q: torch.Tensor, k: torch.Tensor, w: int, padding_value: float):
    '''Query x key scores for the non-overlapping-block variant of sliding window attention.

    The sequence is cut into blocks of size w, and every query is scored against the keys of
    the previous, the current and the next block. Missing neighbours at the sequence edges are
    zero-filled key blocks. `padding_value` is accepted but not used by this function.

    Input:
        q, k = tensors of identical size (bsz, seqlen, num_heads, head_dim); seqlen must be a multiple of w
    Returns
        tensor of size (bsz, seqlen, num_heads, 3 * w)
    '''
    bsz, seqlen, num_heads, head_dim = q.size()
    assert seqlen % w == 0
    assert q.size() == k.size()

    # cut seqlen into non-overlapping blocks of size w
    blocked_shape = (bsz, seqlen // w, w, num_heads, head_dim)
    q_blocks = q.view(*blocked_shape)
    k_blocks = k.view(*blocked_shape)

    # pad specs act on the block dimension only: one zero block in front / at the back
    shift_forward = (0, 0, 0, 0, 0, 0, 1, 0)
    shift_backward = (0, 0, 0, 0, 0, 0, 0, 1)
    k_previous = F.pad(k_blocks[:, :-1], shift_forward, value=0.0)
    k_following = F.pad(k_blocks[:, 1:], shift_backward, value=0.0)

    # the new last dimension indexes (previous, current, next)
    k_neighbourhood = torch.stack((k_previous, k_blocks, k_following), dim=-1)

    scores = torch.einsum('bcxhd,bcyhde->bcxhey', q_blocks, k_neighbourhood)
    return scores.reshape(bsz, seqlen, num_heads, 3 * w)
170 | 
171 | 
def sliding_chunks_no_overlap_matmul_pv(prob: torch.Tensor, v: torch.Tensor, w: int):
    '''Probability x value product for the non-overlapping-block variant of sliding window attention.

    `prob` is read in the layout produced by sliding_chunks_no_overlap_matmul_qk: for every
    position, 3 * w weights ordered as (previous block, current block, next block). Missing
    neighbour blocks at the sequence edges are zero-filled value blocks.

    Input:
        prob = tensor viewable as (bsz, seqlen // w, w, num_heads, 3, w)
        v = tensor of size (bsz, seqlen, num_heads, head_dim)
    Returns
        tensor of size (bsz, seqlen, num_heads, head_dim)
    '''
    bsz, seqlen, num_heads, head_dim = v.size()
    n_blocks = seqlen // w

    prob_blocks = prob.view(bsz, n_blocks, w, num_heads, 3, w)
    v_blocks = v.view(bsz, n_blocks, w, num_heads, head_dim)

    # pad specs act on the block dimension only: one zero block in front / at the back
    shift_forward = (0, 0, 0, 0, 0, 0, 1, 0)
    shift_backward = (0, 0, 0, 0, 0, 0, 0, 1)
    v_previous = F.pad(v_blocks[:, :-1], shift_forward, value=0.0)
    v_following = F.pad(v_blocks[:, 1:], shift_backward, value=0.0)

    # the new last dimension indexes (previous, current, next)
    v_neighbourhood = torch.stack((v_previous, v_blocks, v_following), dim=-1)

    mixed = torch.einsum('bcwhpd,bcdhep->bcwhe', prob_blocks, v_neighbourhood)
    return mixed.reshape(bsz, seqlen, num_heads, head_dim)
183 | 


--------------------------------------------------------------------------------
/main_MWAFM.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import pickle
  4 | import utils
  5 | import torch
  6 | import torch.nn as nn
  7 | import numpy as np
  8 | from torch.utils.data import DataLoader
  9 | import torch.optim as optim
 10 | import argparse
 11 | 
 12 | from data_generator import *
 13 | from nets.ours_MWAFM_Net import MWAFM_Net
 14 | from conifg import data_config, model_config
 15 | 
 16 | 
 17 | def train(model, train_iterator, optimizer, criterion, epoch):
 18 | 
 19 |     model.train()
 20 | 
 21 |     # for batch_idx, (audio_feat, audio_ast_feat, question, label) in enumerate(train_iterator):
 22 |     for batch_idx, (audio_feat, question, label) in enumerate(train_iterator):
 23 | 
 24 |         audio_feat = audio_feat.to(dtype=torch.float)
 25 |         # audio_ast_feat = audio_ast_feat.to(dtype=torch.float)
 26 |         question = question.to(dtype=torch.float)
 27 |         label = label.to('cuda', dtype=torch.long)
 28 | 
 29 |         question_len = torch.ones((question.size(0),), dtype=torch.int8).to('cuda')
 30 | 
 31 |         optimizer.zero_grad()
 32 |         # logits_output = model(audio_feat, audio_ast_feat, question)
 33 |         logits_output = model(audio_feat, question)
 34 | 
 35 |         loss = criterion(logits_output, label)
 36 |         loss.backward()
 37 |         optimizer.step()
 38 | 
 39 |         if batch_idx % model_config['log_interval'] == 0:
 40 |             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch, batch_idx * len(audio_feat), len(train_iterator.dataset), 100. * batch_idx / len(train_iterator), loss.item()))
 41 | 
 42 | 
 43 | def eval(model, val_iterator, optimizer, criterion, epoch):
 44 | 
 45 |     model.eval()
 46 |     val_acc_top_01 = 0
 47 |     val_acc_top_05 = 0
 48 |     val_acc_top_10 = 0
 49 | 
 50 |     total = 0
 51 |     correct_top_01 = 0
 52 |     correct_top_05 = 0
 53 |     correct_top_10 = 0
 54 | 
 55 |     with torch.no_grad():
 56 |         # for batch_idx, (audio_feat, audio_ast_feat, question, label) in enumerate(val_iterator):
 57 |         for batch_idx, (audio_feat, question, label) in enumerate(val_iterator):
 58 |         
 59 |             audio_feat = audio_feat.to(dtype=torch.float)
 60 |             # audio_ast_feat = audio_ast_feat.to(dtype=torch.float)
 61 |             question = question.to(dtype=torch.float)
 62 |             label = label.to('cuda', dtype=torch.long)
 63 |             question_len = torch.ones((question.size(0),), dtype=torch.int8).to('cuda')
 64 |         
 65 |             # logits_output = model(audio_feat, audio_ast_feat, question)
 66 |             logits_output = model(audio_feat, question)
 67 | 
 68 | 
 69 |             total += logits_output.size(0)
 70 | 
 71 |             # top-01 accuracy
 72 |             _, predicted_top_01 = torch.max(logits_output.data, 1)
 73 |             correct_top_01 += (predicted_top_01 == label).sum().item()
 74 | 
 75 |             # top-05 and top-20 accuracy
 76 |             _, predicted_top_n = torch.sort(logits_output.data, dim=1, descending=True)
 77 |             
 78 |             predicted_top_05 = predicted_top_n[:, :5].detach().cpu().numpy()
 79 |             predicted_top_10 = predicted_top_n[:, :10].detach().cpu().numpy()
 80 | 
 81 |             ground_truth = label.detach().cpu().numpy()
 82 |             n_batch = ground_truth.shape[0]
 83 |             ground_truth = ground_truth.reshape(n_batch, 1)
 84 | 
 85 |             correct_top_05 += np.count_nonzero((predicted_top_05-ground_truth)==0)
 86 |             correct_top_10 += np.count_nonzero((predicted_top_10-ground_truth)==0)
 87 | 
 88 | 
 89 |     val_top_01 = 100 * correct_top_01 / total
 90 |     val_top_05 = 100 * correct_top_05 / total
 91 |     val_top_10 = 100 * correct_top_10 / total
 92 | 
 93 |     print("\nTop-01 Validation set accuracy = %.2f %%" % val_top_01)
 94 |     print("Top-05 Validation set accuracy = %.2f %%" % val_top_05)
 95 |     print("Top-10 Validation set accuracy = %.2f %%" % val_top_10)
 96 | 
 97 |     return val_top_01
 98 | 
 99 | 
def test(model, test_iterator):
    """Evaluate `model` on `test_iterator` and print top-1 / top-5 / top-10 accuracy.

    Args:
        model: network called as `model(audio_feat, question)`; returns per-class logits.
        test_iterator: iterable of batches whose last two elements are `(question, label)`
            and whose element just before them is the audio feature passed to the model.

    Returns:
        None; the accuracies are only printed. An iterator that yields no samples reports
        0.00 % instead of raising ZeroDivisionError.
    """
    model.eval()

    total = 0
    correct_top_01 = 0
    correct_top_05 = 0
    correct_top_10 = 0

    with torch.no_grad():
        for batch in test_iterator:

            # The original loop unpacked four names with `audio_feat` listed twice, which fails on
            # the three-element batches used by train()/eval(). Taking the last leading element
            # accepts three-element batches and still picks the same tensor for four-element ones.
            *audio_feats, question, label = batch
            audio_feat = audio_feats[-1]

            audio_feat = audio_feat.to(dtype=torch.float)
            question = question.to(dtype=torch.float)
            label = label.to('cuda', dtype=torch.long)

            logits_output = model(audio_feat, question)

            total += logits_output.size(0)

            # top-01 accuracy
            _, predicted_top_01 = torch.max(logits_output, 1)
            correct_top_01 += (predicted_top_01 == label).sum().item()

            # top-05 and top-10 accuracy: a class index occurs at most once per ranked row,
            # so counting the matches counts the samples whose label is among the top k
            _, predicted_top_n = torch.sort(logits_output, dim=1, descending=True)
            hits_top_10 = predicted_top_n[:, :10] == label.unsqueeze(1)

            correct_top_05 += hits_top_10[:, :5].sum().item()
            correct_top_10 += hits_top_10.sum().item()

    if total == 0:
        print("\nNo test samples were evaluated; reporting 0.00 % accuracy.")
    sample_count = max(total, 1)  # avoid dividing by zero on an empty iterator

    test_top_01 = 100 * correct_top_01 / sample_count
    test_top_05 = 100 * correct_top_05 / sample_count
    test_top_10 = 100 * correct_top_10 / sample_count

    # These figures come from the test split, so they are labelled as such.
    print("\nTop-01 Test set accuracy = %.2f %%" % test_top_01)
    print("Top-05 Test set accuracy = %.2f %%" % test_top_05)
    print("Top-10 Test set accuracy = %.2f %%" % test_top_10)
    print('\n***********************************************************\n')
156 | 
157 | 
158 | 
if __name__ == '__main__':
    # Command-line entry point: trains the model (default) or evaluates a saved checkpoint.

    parser = argparse.ArgumentParser(description='PyTorch Implementation of Audio Question Answering')
    parser.add_argument(
        "--mode", type=str, default='train', help="with mode to use")
    parser.add_argument(
        "--model_save_dir", type=str, default='./checkpoints/', help="model save dir")
    parser.add_argument(
        "--checkpoint", type=str, default='MWAFM_Net',help="save model name")
    parser.add_argument(
        '--seed', type=int, default=8888, metavar='S',help='random seed (default: 8888)')
    parser.add_argument(
        '--gpu', type=str, default='0, 1', help='gpu device number')

    args = parser.parse_args()
    # Strip whitespace so values such as the default '0, 1' become a plain comma-separated list.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu.replace(' ', '')
    torch.manual_seed(args.seed)

    # Joining the parts gives the same path as plain concatenation when the directory ends with a
    # separator (as the default does) and a correct one when it does not.
    checkpoint_path = os.path.join(args.model_save_dir, args.checkpoint + ".pt")

    model = MWAFM_Net().to('cuda')
    model = nn.DataParallel(model)

    if args.mode == "train":
        print("\n-------------------- Multi-scale Window-size Attention Model Training --------------------")
        # create data iterators
        train_dataset = DataGenerator(data_config['train_metadata_path'])
        train_iterator = DataLoader(dataset=train_dataset, batch_size=model_config['batch_size'], num_workers=model_config['num_workers'],
                                    pin_memory=True, shuffle=True)
        val_dataset = DataGenerator(data_config['val_metadata_path'])
        val_iterator = DataLoader(dataset=val_dataset, batch_size=model_config['batch_size'], num_workers=model_config['num_workers'],
                                  pin_memory=True, shuffle=True)

        optimizer = torch.optim.Adam(params=model.parameters(), lr=model_config['learning_rate'])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=8, gamma=0.1)
        criterion = nn.CrossEntropyLoss()

        # torch.save does not create missing directories, so make sure the target exists up front
        # rather than failing after the first epoch has already been trained.
        if args.model_save_dir:
            os.makedirs(args.model_save_dir, exist_ok=True)

        start_epoch = 0
        best_acc = 0
        best_epoch = 0
        for epoch in range(start_epoch, model_config['num_epochs']):
            train(model, train_iterator, optimizer, criterion, epoch=epoch)
            # NOTE(review): passing `epoch` to step() is deprecated in PyTorch; it is kept because
            # dropping the argument would move every learning-rate decay one epoch earlier.
            scheduler.step(epoch)
            val_acc = eval(model, val_iterator, optimizer, criterion, epoch)
            # `>=` keeps the most recent checkpoint among epochs that tie for the best accuracy.
            if val_acc >= best_acc:
                best_acc = val_acc
                best_epoch = epoch
                torch.save(model.state_dict(), checkpoint_path)
            print("\nTop-01 training-val best: epoch {},  acc: {:.2f}%".format(best_epoch, best_acc))
            print('\n***********************************************************\n')
    else:
        print("\n-------------------- Multi-scale Window-size Attention Model Testing --------------------")
        # NOTE(review): the train/val datasets above are built from a single metadata path, while
        # this call passes (data_config, model_config, mode='test') -- confirm which signature
        # DataGenerator actually accepts.
        test_dataset = DataGenerator(data_config, model_config, mode='test')
        test_iterator = DataLoader(dataset=test_dataset, batch_size=model_config['batch_size'], num_workers=model_config['num_workers'],
                                   pin_memory=True, shuffle=True)
        model.load_state_dict(torch.load(checkpoint_path))
        test(model, test_iterator)
218 | 
219 | 


--------------------------------------------------------------------------------
/metadata/output_classes.json:
--------------------------------------------------------------------------------
1 | {"WATERFALL": 0, "SHARPENING": 1, "REVVING": 2, "MARKER": 3, "CARDS": 4, "RAINDROPS": 5, "SNAKE": 6, "DIGGING": 7, "READING": 8, "FIGHTING": 9, "SCHOOL": 10, "PLASTIC": 11, "KETTLE": 12, "SECOND": 13, "LIQUID": 14, "TREE": 15, "GLASSES": 16, "THIRD": 17, "ALUMINUM": 18, "FILLING": 19, "AWAY": 20, "LIGHTENING": 21, "ADULT": 22, "STONE": 23, "EXHAUST": 24, "SHOWER": 25, "MIXER": 26, "DRIP": 27, "VIBRATING": 28, "LOCUST": 29, "LISTENING": 30, "NEVER": 31, "TRUCK": 32, "LOW": 33, "RACETRACK": 34, "PIG": 35, "MOWER": 36, "COOKER": 37, "FEMALE": 38, "CRINKLING": 39, "POP": 40, "FRYINGPAN": 41, "BLUE": 42, "AUDIENCE": 43, "GOOSE": 44, "PULLED": 45, "BUZZING": 46, "FAUCET": 47, "STREAM": 48, "DOLPHIN": 49, "CART": 50, "GLASS": 51, "XYLOPHONE": 52, "MATCH": 53, "TWELVE": 54, "FIREPLACE": 55, "CHAINSAW": 56, "DISHWASHER": 57, "LAUNDRY": 58, "FREIGHT": 59, "STATIC": 60, "ALARM": 61, "DROP": 62, "NIGHT": 63, "TWENTYFIVE": 64, "ELEVEN": 65, "SQUEAK": 66, "TWENTY": 67, "FUEL": 68, "HELICOPTER": 69, "FLUSHING": 70, "TRAFFIC": 71, "TWENTYFOUR": 72, "ROLLER": 73, "FLY": 74, "BRANCHES": 75, "COW": 76, "HORSE": 77, "HUMANS": 78, "DOWN": 79, "SONG": 80, "DRUMSTICK": 81, "DUCK": 82, "PAPER": 83, "HANDS": 84, "RICE": 85, "WOOD": 86, "ELECTRICITY": 87, "FLOW": 88, "BUBBLEWRAP": 89, "CLASSICAL": 90, "WOMEN": 91, "PERSON": 92, "RADIO": 93, "CHIRPS": 94, "MICROPHONE": 95, "SLAMMING": 96, "DRIVING": 97, "HIGH": 98, "CHANGING": 99, "BALL": 100, "CHILD": 101, "SEAWAVE": 102, "SINK": 103, "PRESS": 104, "AIRPLANE": 105, "STORMY": 106, "MUD": 107, "TELEPHONE": 108, "RUNNING": 109, "CRUSHING": 110, "FLOORBOARD": 111, "RAINY": 112, "RACE": 113, "MICROWAVE": 114, "DOVE": 115, "VEHICLE": 116, "RAPIDLY": 117, "BOAT": 118, "DRUM": 119, "BELLS": 120, "BUGS": 121, "TEN": 122, "QUIET": 123, "SEVENTEEN": 124, "BATHROOM": 125, "STEADY": 126, "WORK": 127, "VOICES": 128, "SUMMER": 129, "TRACTOR": 130, "PAVEMENT": 131, "TWEET": 132, "VACUUMING": 133, "SPRAYER": 134, "NOTEBOOK": 135, "HEN": 136, 
"SCRATCHING": 137, "STATION": 138, "FOURTEEN": 139, "SPARROW": 140, "GUTTERS": 141, "BUBBLING": 142, "KNOCKING": 143, "WAVE": 144, "STOPS": 145, "WHEELS": 146, "CONSTANT": 147, "INSECT": 148, "PARTY": 149, "SHREDDING": 150, "AIRSOUND": 151, "HAMMER": 152, "MEN": 153, "HAPPINESS": 154, "FIREWORKS": 155, "WATERS": 156, "PENCIL": 157, "FLOOD": 158, "RIVER": 159, "GOOD": 160, "TIN": 161, "KITCHEN": 162, "CHICKEN": 163, "KNIFE": 164, "FENCE": 165, "ELECTRONIC": 166, "SHIP": 167, "CAR": 168, "PACKAGE": 169, "SIXTH": 170, "CLOCK": 171, "GROWLING": 172, "DISTORTION": 173, "CREAKING": 174, "EGG": 175, "SCRAPING": 176, "SILVERWARE": 177, "SEVENTYTHREE": 178, "FOURTY": 179, "HARD": 180, "HIGHWAY": 181, "FLOOR": 182, "TUBA": 183, "CHURCH": 184, "BASKETBALL": 185, "PUMP": 186, "TELEVISION": 187, "SLEEPING": 188, "SHORT": 189, "DRIBBLING": 190, "WRITING": 191, "MANY": 192, "LAKE": 193, "MUG": 194, "WAKING": 195, "FLIGHT": 196, "SPOON": 197, "HEATER": 198, "ROCKS": 199, "SIZZLING": 200, "STARTING": 201, "HAIL": 202, "DRINK": 203, "WHIRRING": 204, "FIVE": 205, "PARROT": 206, "MOVEMENT": 207, "PEBBLES": 208, "GUITAR": 209, "MACHINERY": 210, "WINDY": 211, "CONSISTENT": 212, "SCRIBBLING": 213, "CHIRP": 214, "COFFEE": 215, "FISH": 216, "SHAKING": 217, "PLANE": 218, "DUCKS": 219, "FEET": 220, "STOMPING": 221, "SITTING": 222, "COINS": 223, "MONKEY": 224, "EVENING": 225, "GRINDER": 226, "SCOOTER": 227, "ROAD": 228, "MEETING": 229, "RAINING": 230, "SAND": 231, "COCK": 232, "RATCHET": 233, "MILD": 234, "CONTINUOUSLY": 235, "FOOT": 236, "WHITE": 237, "PILOT": 238, "MIXING": 239, "SEA": 240, "OBJECT": 241, "PIGEON": 242, "LATCH": 243, "UP": 244, "THIRTYONE": 245, "SHOP": 246, "WALK": 247, "RAT": 248, "LANDING": 249, "SHARPENER": 250, "WASHER": 251, "FOUR": 252, "PLATE": 253, "ROLLERCOASTER": 254, "VIOLIN": 255, "THIRTYFIVE": 256, "CRICKET": 257, "BOOTS": 258, "STRAW": 259, "VACUUM": 260, "MULTIPLE": 261, "ZOO": 262, "WHISPERING": 263, "SCREAMING": 264, "SHOUTING": 265, "CAN": 266, "BLENDER": 
267, "TIMER": 268, "CRUNCH": 269, "MARBLE": 270, "CRYING": 271, "ROOSTER": 272, "ROCK": 273, "WINDCHIME": 274, "PRINTER": 275, "HOWLING": 276, "STICK": 277, "SILENT": 278, "NORMAL": 279, "FALLS": 280, "HONKING": 281, "UTENSILS": 282, "COFFEEMAKER": 283, "DOOR": 284, "SQUEAKING": 285, "CARDBOARD": 286, "TIRES": 287, "CLOSE": 288, "COPIER": 289, "STAPLER": 290, "COIN": 291, "AEROPLANE": 292, "LEAVES": 293, "PASSING": 294, "PIPE": 295, "SPINNER": 296, "RINGING": 297, "SHOES": 298, "FACTORY": 299, "STEPS": 300, "ORGAN": 301, "TREES": 302, "HAND": 303, "BEADS": 304, "UNKNOWN": 305, "CITY": 306, "LAWNMOWER": 307, "MOTORCYCLE": 308, "LEFT": 309, "DROPPING": 310, "PULSING": 311, "COMPLETION": 312, "JOGGING": 313, "DISHES": 314, "CAT": 315, "OUTDOORS": 316, "THIRTYFOUR": 317, "DEEP": 318, "HALL": 319, "POUR": 320, "SCREAM": 321, "SIX": 322, "DIRT": 323, "HISSING": 324, "AC": 325, "STEAL": 326, "SEWING": 327, "PIANO": 328, "BATMINTON": 329, "WIND": 330, "RAPID": 331, "SIXTY": 332, "TAPPING": 333, "RAIN": 334, "USEFUL": 335, "CUTTING": 336, "PHONE": 337, "HEAVY": 338, "SANDER": 339, "WHISTLING": 340, "INTERCOM": 341, "DIESEL": 342, "RAILROAD": 343, "FIRST": 344, "LAUGHTER": 345, "DEBRIS": 346, "WHEEL": 347, "SINGING": 348, "FIRE": 349, "FALLING": 350, "OIL": 351, "SHAKER": 352, "ENGINE": 353, "ACCELERATES": 354, "STOP": 355, "CANVAS": 356, "MOUTH": 357, "RABIT": 358, "BUS": 359, "DOG": 360, "SPEAKING": 361, "WD-40": 362, "AUDITORIUM": 363, "DRILLING": 364, "LORRY": 365, "DRIZZLING": 366, "BEAR": 367, "SCRAPER": 368, "SLOWLY": 369, "FEATHERS": 370, "GARBAGE": 371, "SPLASHING": 372, "BOOKS": 373, "PARK": 374, "OWL": 375, "GRINDING": 376, "POLICE": 377, "GONG": 378, "MARBLES": 379, "WALKING": 380, "FIFTY": 381, "WINCH": 382, "PEOPLE": 383, "HUMAN": 384, "CICADAS": 385, "METAL": 386, "URINATING": 387, "DRAIN": 388, "ONCE": 389, "SPILLING": 390, "STORM": 391, "CLANKING": 392, "TAMBOURINE": 393, "AQUARIUM": 394, "SAWING": 395, "NINETEEN": 396, "CHIME": 397, "SCREECHING": 398, 
"CROWD": 399, "SPLASH": 400, "ONE": 401, "TOP": 402, "ACCELERATING": 403, "MARKET": 404, "DRIPPING": 405, "EMERGENCY": 406, "SEMI": 407, "THUNDER": 408, "RAINFALL": 409, "HORROR": 410, "BIKE": 411, "FRYING": 412, "TAPE": 413, "PURRING": 414, "THROWING": 415, "TYPING": 416, "FINGERS": 417, "KNOCK": 418, "WATER": 419, "CHOPPING": 420, "ICE": 421, "RINSING": 422, "BICYCLE": 423, "GOLF": 424, "FIFTEEN": 425, "BUILDING": 426, "GHOST": 427, "NOISE": 428, "CREEK": 429, "WAVES": 430, "KEYS": 431, "THUD": 432, "TV": 433, "BOOK": 434, "HURRICANE": 435, "HAY": 436, "MANUFACTURING": 437, "MOVING": 438, "BOWL": 439, "TORNADO": 440, "SING": 441, "ARGUING": 442, "TRAIN": 443, "HONK": 444, "HARDWOOD": 445, "DRAWING": 446, "SPEAKER": 447, "OCEAN": 448, "CONFIGURATION": 449, "BIRD": 450, "WORKING": 451, "CRICKETS": 452, "BANG": 453, "ROARING": 454, "TRAVEL": 455, "MOTOR": 456, "TYPEWRITER": 457, "START": 458, "FAR": 459, "SEMITRUCK": 460, "WRENCH": 461, "RUSHING": 462, "SODA": 463, "MIKE": 464, "LIGHTNING": 465, "CRUNCHY": 466, "NESTS": 467, "TOOLS": 468, "TRAVELLING": 469, "WINDOW": 470, "STAIRS": 471, "WHISTLE": 472, "HUNDRED": 473, "SQUAWKING": 474, "OPEN": 475, "PIPES": 476, "SKY": 477, "INTERESTED": 478, "TRUMPET": 479, "BALLOON": 480, "STORE": 481, "SKATEBOARD": 482, "VEHICLES": 483, "TILE": 484, "GENERATOR": 485, "DRILLER": 486, "PINGPONG": 487, "CHIRPING": 488, "MAN": 489, "THIRTYSEVEN": 490, "SPEED": 491, "TUMBLING": 492, "HIKING": 493, "TANK": 494, "SLOW": 495, "COUGHING": 496, "FOOD": 497, "ALWAYS": 498, "NOISES": 499, "BABBLING": 500, "MAT": 501, "COUGH": 502, "ELEVATOR": 503, "FADES": 504, "KEY": 505, "PULLOVER": 506, "CLOUDS": 507, "MACHINE": 508, "TRIANGLE": 509, "CHIMING": 510, "DRAINING": 511, "GATE": 512, "LAUGH": 513, "MAGAZINE": 514, "BAT": 515, "ANNOUNCER": 516, "TRACKS": 517, "RACECAR": 518, "EXCITEMENT": 519, "VERY": 520, "BELL": 521, "THIRTEEN": 522, "CHIPS": 523, "THIRTY": 524, "ZERO": 525, "CHAIN": 526, "STEAM": 527, "FLYING": 528, "PLAYGROUND": 529, 
"FLOWING": 530, "LOCK": 531, "SNIFFING": 532, "GROUND": 533, "CONSTANTLY": 534, "CLOSING": 535, "SHEEP": 536, "END": 537, "DICE": 538, "TEAPOT": 539, "FROGS": 540, "BASS": 541, "BOARD": 542, "BRIDGE": 543, "TOOL": 544, "FARM": 545, "CYCLE": 546, "TAP": 547, "JACKHAMMER": 548, "GRAVEL": 549, "STYROFOAM": 550, "VOICE": 551, "BLOW": 552, "BLOWING": 553, "ANNOUNCEMENT": 554, "GAS": 555, "BUCKET": 556, "MIDDLE": 557, "AIRPORT": 558, "LAST": 559, "EAST": 560, "NONE": 561, "QUICKLY": 562, "BEES": 563, "STEELDRUM": 564, "POOL": 565, "TALKING": 566, "TWENTYEIGHT": 567, "DRYER": 568, "BEGINNING": 569, "SNOW": 570, "WATERFLOW": 571, "UTENSIL": 572, "BEACH": 573, "MALE": 574, "TWENTYNINE": 575, "BROOK": 576, "CROW": 577, "HAPPY": 578, "GAME": 579, "PLAYING": 580, "FIRETRUCK": 581, "CHATTING": 582, "LION": 583, "BREATHING": 584, "WOODPECKER": 585, "RADAR": 586, "WINDER": 587, "FLIES": 588, "JAR": 589, "BEEPING": 590, "HAMMERING": 591, "MOCKINGBIRD": 592, "WATCH": 593, "SOFT": 594, "WASHING": 595, "SEAGULLS": 596, "FOREST": 597, "CHEER": 598, "MOTORBIKE": 599, "TWO": 600, "DRILL": 601, "NOTHING": 602, "MOWING": 603, "GRASSHOPPERS": 604, "HOME": 605, "STORMING": 606, "FAST": 607, "BIRDS": 608, "HITTING": 609, "CLUCKING": 610, "BOX": 611, "IRON": 612, "FORTY": 613, "LOUDSPEAKER": 614, "INCREASE": 615, "BOTTLES": 616, "SEAGULL": 617, "BOILING": 618, "UMBRELLA": 619, "MOSQUITO": 620, "BRAKES": 621, "SHORE": 622, "TEETH": 623, "HE": 624, "POPCORN": 625, "SHOVEL": 626, "BAND": 627, "PEEING": 628, "MORE": 629, "EIGHTEEN": 630, "SUNNY": 631, "CRACKERS": 632, "SEVERAL": 633, "COOKING": 634, "8TIMES": 635, "CUT": 636, "DIRTBIKE": 637, "SWINGING": 638, "OFFICE": 639, "SIGNAL": 640, "INDUSTRIAL": 641, "THEREMIN": 642, "CELERY": 643, "SOUND": 644, "SCANNER": 645, "CLEANING": 646, "CICADA": 647, "TIRE": 648, "BANGING": 649, "SIXTEEN": 650, "MOUSE": 651, "MEDIUM": 652, "THREE": 653, "MORNING": 654, "WRAPPING": 655, "KIDS": 656, "SIRENS": 657, "EGGS": 658, "BOTTLE": 659, "SEVEN": 660, 
"WHEELBARROW": 661, "PLAIN": 662, "DRIVE": 663, "SYNTHESIZER": 664, "ENGLISH": 665, "VACCUM": 666, "CHEWING": 667, "GRASS": 668, "HOSE": 669, "LONG": 670, "CRUNCHING": 671, "WHISPER": 672, "SWEEPING": 673, "COMPUTER": 674, "POURING": 675, "SHOE": 676, "GARAGE": 677, "DRUMS": 678, "FOOTSTEPS": 679, "CLANGING": 680, "RUBBER": 681, "CUTTER": 682, "AMBULANCE": 683, "AIR": 684, "KEYBOARD": 685, "FALL": 686, "SWING": 687, "ROD": 688, "PULLING": 689, "STIR": 690, "BELT": 691, "METALS": 692, "TWENTYTWO": 693, "FAN": 694, "SEESAW": 695, "TWENTYTHREE": 696, "SIREN": 697, "STIRRING": 698, "CHAIR": 699, "RAILWAYSTATION": 700, "MODERATELY": 701, "SHOWERING": 702, "GRAINS": 703, "REFRIGERATOR": 704, "WINTER": 705, "GEESE": 706, "PAN": 707, "BAD": 708, "CONSTRUCTION": 709, "GROWL": 710, "BUZZSAW": 711, "THIRTYEIGHT": 712, "FOUNTAIN": 713, "PEN": 714, "CRASHING": 715, "INSECTS": 716, "JET": 717, "CLAPPING": 718, "BRUSH": 719, "RAILS": 720, "PRINTING": 721, "SWIMMING": 722, "CLOSET": 723, "CLUCK": 724, "BATHING": 725, "AUTOMOBILES": 726, "CONCRETE": 727, "LAUGHING": 728, "TWENTYSEVEN": 729, "FROG": 730, "HINGE": 731, "RESTAURANT": 732, "RAKE": 733, "CRACKER": 734, "HOG": 735, "BREAKS": 736, "LOUDER": 737, "STREET": 738, "CHEERING": 739, "HUMMING": 740, "CLAP": 741, "SORTER": 742, "SHOOTING": 743, "EIGHT": 744, "TENNIS": 745, "PRAYER": 746, "WELDING": 747, "PAPERS": 748, "BALLS": 749, "CUP": 750, "BROOM": 751, "BABY": 752, "NICE": 753, "INSTRUMENT": 754, "TEARING": 755, "HORN": 756, "CLICKING": 757, "SMALL": 758, "CHIMES": 759, "VIBRATION": 760, "GOAT": 761, "LOCUSTS": 762, "HOUSE": 763, "TWICE": 764, "THIRTYTHREE": 765, "ANIMAL": 766, "CARS": 767, "RACING": 768, "SIDEWALK": 769, "FABRIC": 770, "PARAKEET": 771, "DRESS": 772, "FLUTE": 773, "WOMAN": 774, "SCREECH": 775, "CRANK": 776, "MUSIC": 777, "CHILDREN": 778, "ELEPHANT": 779, "SPRING": 780, "SILENCE": 781, "TOILET": 782, "WHIR": 783, "BIG": 784, "SAW": 785, "BREEZY": 786, "TWENTYSIX": 787, "BUZZ": 788, "SOUNDS": 789, "SUBWAY": 
790, "RATTLING": 791, "EATING": 792, "TWENTYONE": 793, "NINE": 794, "TUNNEL": 795, "OPENING": 796, "CYMBAL": 797, "OUTSIDE": 798, "BREAKING": 799, "FOLDING": 800, "WET": 801, "TUB": 802, "RAINSTORM": 803, "STEEL": 804, "BARKING": 805, "THUNDERSTORM": 806, "FREEWAY": 807, "ELECTRIC": 808, "BAG": 809, "CABINET": 810, "ARCADE": 811, "POT": 812, "LIGHT": 813, "LOUD": 814, "CRISPY": 815, "NOISY": 816, "TARP": 817, "CUCKOO": 818, "TABLE": 819, "YELLING": 820, "ALOT": 821, "BEE": 822, "CROWS": 823, "TOY": 824, "ROOF": 825, "GASOLINE": 826, "SMOOTH": 827}


--------------------------------------------------------------------------------
/metadata/metadata_orig/output_classes.json:
--------------------------------------------------------------------------------
1 | {"CRUNCHING": 0, "SHEEP": 1, "HAMMERING": 2, "THIRD": 3, "SHOP": 4, "STORMY": 5, "BATHROOM": 6, "BOTTLES": 7, "FOURTY": 8, "CICADAS": 9, "OIL": 10, "FLOORBOARD": 11, "HITTING": 12, "CONSTANT": 13, "SHOOTING": 14, "AMBULANCE": 15, "SPRING": 16, "TREE": 17, "CHOPPING": 18, "STEELDRUM": 19, "URINATING": 20, "WATER": 21, "GARAGE": 22, "DOVE": 23, "RAILROAD": 24, "LOUD": 25, "LAUNDRY": 26, "PINGPONG": 27, "DRESS": 28, "SHARPENER": 29, "CLOCK": 30, "BUBBLING": 31, "DIGGING": 32, "FLUSHING": 33, "HEATER": 34, "TRAVEL": 35, "RATCHET": 36, "WOMAN": 37, "MOSQUITO": 38, "DRUMSTICK": 39, "HORN": 40, "CRYING": 41, "STAPLER": 42, "CHEWING": 43, "SCOOTER": 44, "SMALL": 45, "CONSTRUCTION": 46, "MORNING": 47, "START": 48, "STOPS": 49, "AEROPLANE": 50, "SOUNDS": 51, "HOWLING": 52, "WHEEL": 53, "SHOES": 54, "BIRD": 55, "SLOWLY": 56, "SPEED": 57, "PARK": 58, "GUITAR": 59, "BOTTLE": 60, "CLUCK": 61, "BABY": 62, "SLAMMING": 63, "TUMBLING": 64, "SHARPENING": 65, "ELECTRICITY": 66, "RACETRACK": 67, "DROPPING": 68, "COMPLETION": 69, "DOLPHIN": 70, "ELEPHANT": 71, "WHEELBARROW": 72, "CHEER": 73, "TWICE": 74, "MICROWAVE": 75, "BELLS": 76, "GENERATOR": 77, "WHISTLING": 78, "LIGHT": 79, "GONG": 80, "TENNIS": 81, "FROG": 82, "HARDWOOD": 83, "DRYER": 84, "WATERS": 85, "TWENTYEIGHT": 86, "FOURTEEN": 87, "CLANGING": 88, "TWENTYFIVE": 89, "CELERY": 90, "PEBBLES": 91, "SHIP": 92, "TOOL": 93, "OPENING": 94, "WRENCH": 95, "CROWS": 96, "MONKEY": 97, "LAWNMOWER": 98, "MATCH": 99, "THIRTEEN": 100, "ANIMAL": 101, "ONE": 102, "MIXER": 103, "CHEERING": 104, "AIRSOUND": 105, "HE": 106, "FAST": 107, "RAPID": 108, "CHILD": 109, "DOG": 110, "THEREMIN": 111, "GOLF": 112, "WOMEN": 113, "SILENT": 114, "KEYS": 115, "SITTING": 116, "WAVES": 117, "BEAR": 118, "ALOT": 119, "RAILWAYSTATION": 120, "FORTY": 121, "MACHINE": 122, "DRIZZLING": 123, "CICADA": 124, "DEEP": 125, "BOX": 126, "MAT": 127, "DUCK": 128, "DRUMS": 129, "MOWER": 130, "SINGING": 131, "ROAD": 132, "EVENING": 133, "METALS": 134, "CHIPS": 135, "PEN": 
136, "STEEL": 137, "CHAIN": 138, "SUMMER": 139, "BUZZING": 140, "FILLING": 141, "ARGUING": 142, "SEMI": 143, "RAIN": 144, "TV": 145, "MAGAZINE": 146, "SPLASHING": 147, "DROP": 148, "BANGING": 149, "KNIFE": 150, "TOY": 151, "COOKING": 152, "YELLING": 153, "RAT": 154, "SEWING": 155, "STOP": 156, "HORSE": 157, "JOGGING": 158, "VIBRATION": 159, "CRACKER": 160, "FAUCET": 161, "VACCUM": 162, "HUMMING": 163, "BEADS": 164, "HAMMER": 165, "JET": 166, "DRIVING": 167, "CROW": 168, "PRINTING": 169, "SKY": 170, "TOP": 171, "GAME": 172, "SLEEPING": 173, "PASSING": 174, "MARKET": 175, "BABBLING": 176, "MUD": 177, "BARKING": 178, "CHIRPS": 179, "BOARD": 180, "FACTORY": 181, "UNKNOWN": 182, "AWAY": 183, "WORKING": 184, "DIRT": 185, "GRASS": 186, "ENGLISH": 187, "INSECTS": 188, "TEARING": 189, "HAIL": 190, "STREAM": 191, "DISTORTION": 192, "LISTENING": 193, "FABRIC": 194, "BREATHING": 195, "DRILLER": 196, "TWENTYFOUR": 197, "RADIO": 198, "DRIP": 199, "CYCLE": 200, "STYROFOAM": 201, "LOCUSTS": 202, "ALARM": 203, "WALKING": 204, "TWENTYSIX": 205, "BLENDER": 206, "SCREECH": 207, "RAINFALL": 208, "PAVEMENT": 209, "CLOSE": 210, "EIGHTEEN": 211, "SEAGULLS": 212, "CHANGING": 213, "LONG": 214, "SWIMMING": 215, "WINDY": 216, "MEETING": 217, "ELEVATOR": 218, "WET": 219, "GOOSE": 220, "CUT": 221, "DIRTBIKE": 222, "GRASSHOPPERS": 223, "END": 224, "TWENTY": 225, "8TIMES": 226, "FENCE": 227, "CONCRETE": 228, "SILVERWARE": 229, "THREE": 230, "THUD": 231, "COFFEEMAKER": 232, "SECOND": 233, "AIR": 234, "BICYCLE": 235, "BUGS": 236, "FLY": 237, "TAPE": 238, "CYMBAL": 239, "COIN": 240, "TWENTYONE": 241, "DOWN": 242, "HAND": 243, "FLUTE": 244, "CUP": 245, "RABIT": 246, "CANVAS": 247, "FLOW": 248, "GRINDER": 249, "GRINDING": 250, "GHOST": 251, "GROWL": 252, "BEE": 253, "CREAKING": 254, "MAN": 255, "PLATE": 256, "SQUAWKING": 257, "PAN": 258, "MANUFACTURING": 259, "RAKE": 260, "PARAKEET": 261, "STIR": 262, "CART": 263, "EMERGENCY": 264, "DISHES": 265, "NOISY": 266, "SQUEAK": 267, "FAN": 268, "OWL": 269, 
"PULLOVER": 270, "GAS": 271, "SCRATCHING": 272, "CARS": 273, "WRAPPING": 274, "FLOOD": 275, "SQUEAKING": 276, "PULSING": 277, "WATERFALL": 278, "TIN": 279, "ELEVEN": 280, "STARTING": 281, "PAPERS": 282, "WD-40": 283, "WINDER": 284, "PRAYER": 285, "WRITING": 286, "PLAYGROUND": 287, "RUBBER": 288, "EAST": 289, "CROWD": 290, "ROOF": 291, "FLYING": 292, "FOUR": 293, "TAMBOURINE": 294, "HARD": 295, "COCK": 296, "RAINY": 297, "MOCKINGBIRD": 298, "ROD": 299, "BELT": 300, "TUBA": 301, "TRACTOR": 302, "ROLLERCOASTER": 303, "WHISPER": 304, "ROOSTER": 305, "BASKETBALL": 306, "SPEAKING": 307, "LAKE": 308, "FAR": 309, "HOSE": 310, "STREET": 311, "SHREDDING": 312, "TRACKS": 313, "RUNNING": 314, "SHOWERING": 315, "SNAKE": 316, "CLAP": 317, "HORROR": 318, "WATERFLOW": 319, "FOOD": 320, "RADAR": 321, "AUDIENCE": 322, "EGG": 323, "CRICKETS": 324, "RINGING": 325, "SPINNER": 326, "SWEEPING": 327, "STATIC": 328, "LIQUID": 329, "MOUTH": 330, "GLASSES": 331, "HURRICANE": 332, "CRINKLING": 333, "LOW": 334, "TELEPHONE": 335, "SEMITRUCK": 336, "JACKHAMMER": 337, "BEACH": 338, "FRYING": 339, "WHEELS": 340, "FREIGHT": 341, "LAST": 342, "DRIVE": 343, "FALLS": 344, "FARM": 345, "INDUSTRIAL": 346, "ROARING": 347, "POPCORN": 348, "SNIFFING": 349, "MODERATELY": 350, "RAILS": 351, "PHONE": 352, "BREAKING": 353, "TANK": 354, "RACING": 355, "CHATTING": 356, "SMOOTH": 357, "DISHWASHER": 358, "THIRTYFOUR": 359, "COUGH": 360, "LIGHTENING": 361, "POURING": 362, "ENGINE": 363, "CHIME": 364, "RAINSTORM": 365, "MARBLE": 366, "RESTAURANT": 367, "WASHING": 368, "FISH": 369, "FIVE": 370, "ANNOUNCEMENT": 371, "CRUNCHY": 372, "WHISTLE": 373, "MOTORCYCLE": 374, "SCREAMING": 375, "BOOK": 376, "POUR": 377, "BLOW": 378, "CRUNCH": 379, "WAVE": 380, "PARROT": 381, "PIANO": 382, "SIXTY": 383, "FALLING": 384, "ROLLER": 385, "STATION": 386, "BLUE": 387, "BAND": 388, "AIRPORT": 389, "SWING": 390, "MOUSE": 391, "CONSTANTLY": 392, "THIRTYFIVE": 393, "VOICES": 394, "BALL": 395, "GRAVEL": 396, "FLIES": 397, "JAR": 398, 
"NORMAL": 399, "POOL": 400, "KITCHEN": 401, "PRINTER": 402, "NICE": 403, "TAP": 404, "FIFTY": 405, "SONG": 406, "ACCELERATING": 407, "THIRTY": 408, "LEFT": 409, "BROOK": 410, "WASHER": 411, "TALKING": 412, "INTERESTED": 413, "FEATHERS": 414, "WELDING": 415, "SIDEWALK": 416, "NINE": 417, "ROCK": 418, "FIREPLACE": 419, "AQUARIUM": 420, "SEVENTEEN": 421, "OCEAN": 422, "SCRIBBLING": 423, "EXHAUST": 424, "ROCKS": 425, "NIGHT": 426, "EIGHT": 427, "AIRPLANE": 428, "HEAVY": 429, "FUEL": 430, "CLAPPING": 431, "RICE": 432, "NONE": 433, "SEVENTYTHREE": 434, "BUZZ": 435, "RAINDROPS": 436, "MIDDLE": 437, "BOAT": 438, "ANNOUNCER": 439, "HOME": 440, "SPLASH": 441, "HAPPY": 442, "BUBBLEWRAP": 443, "MOTOR": 444, "HIKING": 445, "WAKING": 446, "STEAL": 447, "PULLED": 448, "LAUGH": 449, "BELL": 450, "OBJECT": 451, "FINGERS": 452, "TUB": 453, "TRUMPET": 454, "SHORE": 455, "QUIET": 456, "MOTORBIKE": 457, "HOUSE": 458, "SPRAYER": 459, "FOOTSTEPS": 460, "CHIMES": 461, "HUMANS": 462, "PLAIN": 463, "TWELVE": 464, "WINTER": 465, "VIBRATING": 466, "BEES": 467, "CONSISTENT": 468, "FRYINGPAN": 469, "GROWLING": 470, "BLOWING": 471, "CUCKOO": 472, "SCRAPING": 473, "KIDS": 474, "NOISE": 475, "CLUCKING": 476, "STORMING": 477, "POP": 478, "SIXTH": 479, "CRASHING": 480, "PARTY": 481, "PEOPLE": 482, "DRILLING": 483, "STIRRING": 484, "COOKER": 485, "PEEING": 486, "LIGHTNING": 487, "ZERO": 488, "EXCITEMENT": 489, "STEPS": 490, "SCHOOL": 491, "READING": 492, "CLASSICAL": 493, "INCREASE": 494, "HISSING": 495, "SEESAW": 496, "FOUNTAIN": 497, "HEN": 498, "KEYBOARD": 499, "CHIRPING": 500, "SODA": 501, "USEFUL": 502, "STICK": 503, "CONFIGURATION": 504, "TWENTYSEVEN": 505, "CRUSHING": 506, "PUMP": 507, "LOCK": 508, "SING": 509, "INSTRUMENT": 510, "DRAWING": 511, "GUTTERS": 512, "CLICKING": 513, "ADULT": 514, "EGGS": 515, "CLOSING": 516, "HIGHWAY": 517, "STOMPING": 518, "DRIBBLING": 519, "GROUND": 520, "DUCKS": 521, "PLANE": 522, "PILOT": 523, "WHIRRING": 524, "MOVING": 525, "MIKE": 526, "ORGAN": 527, 
"CUTTING": 528, "TIRES": 529, "BUILDING": 530, "RAINING": 531, "KEY": 532, "MUG": 533, "LOUDER": 534, "CHILDREN": 535, "HAPPINESS": 536, "FIRST": 537, "WALK": 538, "NOTEBOOK": 539, "SAND": 540, "ELECTRONIC": 541, "AUTOMOBILES": 542, "STONE": 543, "THIRTYTHREE": 544, "VACUUMING": 545, "GOOD": 546, "TEN": 547, "BRAKES": 548, "TWENTYTHREE": 549, "SEAGULL": 550, "CRISPY": 551, "TAPPING": 552, "SPOON": 553, "WIND": 554, "AUDITORIUM": 555, "SIRENS": 556, "SAW": 557, "OUTDOORS": 558, "SCREECHING": 559, "EATING": 560, "RACE": 561, "BIG": 562, "OUTSIDE": 563, "FOREST": 564, "HELICOPTER": 565, "SNOW": 566, "CLEANING": 567, "PIG": 568, "BAT": 569, "CONTINUOUSLY": 570, "THUNDERSTORM": 571, "LION": 572, "WOODPECKER": 573, "DOOR": 574, "DRINK": 575, "BATMINTON": 576, "OFFICE": 577, "XYLOPHONE": 578, "SCREAM": 579, "NEVER": 580, "FLOOR": 581, "MUSIC": 582, "WOOD": 583, "WHITE": 584, "COUGHING": 585, "FIGHTING": 586, "NESTS": 587, "TWO": 588, "SHOWER": 589, "ELECTRIC": 590, "SAWING": 591, "CRICKET": 592, "MALE": 593, "MOVEMENT": 594, "SIREN": 595, "SHOUTING": 596, "COFFEE": 597, "GATE": 598, "MEN": 599, "WATCH": 600, "BROOM": 601, "CITY": 602, "WORK": 603, "BIRDS": 604, "SHORT": 605, "CRACKERS": 606, "THIRTYEIGHT": 607, "GASOLINE": 608, "FOOT": 609, "VEHICLE": 610, "DEBRIS": 611, "SPILLING": 612, "RATTLING": 613, "WINDCHIME": 614, "SWINGING": 615, "RINSING": 616, "FIRE": 617, "WINDOW": 618, "FIFTEEN": 619, "SOFT": 620, "SHAKING": 621, "LAUGHING": 622, "TRAIN": 623, "ZOO": 624, "BRUSH": 625, "SYNTHESIZER": 626, "PENCIL": 627, "FLOWING": 628, "SIZZLING": 629, "METAL": 630, "LORRY": 631, "MOWING": 632, "PACKAGE": 633, "TRAVELLING": 634, "TARP": 635, "BREEZY": 636, "ICE": 637, "RAPIDLY": 638, "PLASTIC": 639, "VEHICLES": 640, "NOTHING": 641, "ALWAYS": 642, "BANG": 643, "HUMAN": 644, "CHICKEN": 645, "TREES": 646, "FREEWAY": 647, "AC": 648, "TRAFFIC": 649, "BOOTS": 650, "NOISES": 651, "CLANKING": 652, "LEAVES": 653, "BOILING": 654, "STORE": 655, "GEESE": 656, "DRAIN": 657, "VERY": 658, 
"SPARROW": 659, "HINGE": 660, "KNOCKING": 661, "BASS": 662, "SLOW": 663, "GRAINS": 664, "CARDS": 665, "LAUGHTER": 666, "MULTIPLE": 667, "UP": 668, "FALL": 669, "HALL": 670, "HONK": 671, "FEMALE": 672, "CHIRP": 673, "DICE": 674, "CRANK": 675, "WHISPERING": 676, "HIGH": 677, "TWEET": 678, "CREEK": 679, "CARDBOARD": 680, "SPEAKER": 681, "PIPES": 682, "PERSON": 683, "STEADY": 684, "VACUUM": 685, "SUNNY": 686, "SHAKER": 687, "SKATEBOARD": 688, "HUNDRED": 689, "MEDIUM": 690, "TEETH": 691, "TOILET": 692, "TRUCK": 693, "KETTLE": 694, "VOICE": 695, "BALLS": 696, "FEET": 697, "QUICKLY": 698, "CAR": 699, "SEAWAVE": 700, "ARCADE": 701, "BAG": 702, "BUZZSAW": 703, "LATCH": 704, "STEAM": 705, "SEVEN": 706, "DRAINING": 707, "SCRAPER": 708, "PIPE": 709, "UMBRELLA": 710, "THROWING": 711, "INSECT": 712, "REFRIGERATOR": 713, "THIRTYONE": 714, "POT": 715, "CUTTER": 716, "MANY": 717, "BRANCHES": 718, "SEVERAL": 719, "THIRTYSEVEN": 720, "WHIR": 721, "RUSHING": 722, "STRAW": 723, "CLOUDS": 724, "FLIGHT": 725, "PRESS": 726, "TYPING": 727, "HAY": 728, "SCANNER": 729, "PLAYING": 730, "MACHINERY": 731, "BUCKET": 732, "SIGNAL": 733, "MILD": 734, "LOUDSPEAKER": 735, "TILE": 736, "SANDER": 737, "SHOE": 738, "ALUMINUM": 739, "GOAT": 740, "BRIDGE": 741, "NINETEEN": 742, "COW": 743, "MICROPHONE": 744, "COPIER": 745, "SIX": 746, "BEEPING": 747, "RIVER": 748, "PULLING": 749, "BUS": 750, "TELEVISION": 751, "DRUM": 752, "TOOLS": 753, "CLOSET": 754, "SOUND": 755, "CAT": 756, "CHAIR": 757, "PIGEON": 758, "HONKING": 759, "RACECAR": 760, "BATHING": 761, "CABINET": 762, "KNOCK": 763, "UTENSIL": 764, "INTERCOM": 765, "TUNNEL": 766, "HOG": 767, "COMPUTER": 768, "HANDS": 769, "SEA": 770, "COINS": 771, "SIXTEEN": 772, "DIESEL": 773, "TEAPOT": 774, "THUNDER": 775, "TWENTYTWO": 776, "VIOLIN": 777, "ACCELERATES": 778, "MORE": 779, "POLICE": 780, "GARBAGE": 781, "SUBWAY": 782, "BAD": 783, "MARKER": 784, "PURRING": 785, "LOCUST": 786, "LANDING": 787, "ONCE": 788, "DRILL": 789, "TYPEWRITER": 790, "TWENTYNINE": 791, 
"IRON": 792, "SORTER": 793, "SINK": 794, "BOWL": 795, "SILENCE": 796, "TIRE": 797, "TORNADO": 798, "CHURCH": 799, "STORM": 800, "TIMER": 801, "MIXING": 802, "FIREWORKS": 803, "TRIANGLE": 804, "SHOVEL": 805, "FROGS": 806, "FIRETRUCK": 807, "PAPER": 808, "DRIPPING": 809, "BREAKS": 810, "CAN": 811, "GLASS": 812, "OPEN": 813, "BALLOON": 814, "CHIMING": 815, "FADES": 816, "STAIRS": 817, "WINCH": 818, "BOOKS": 819, "CHAINSAW": 820, "UTENSILS": 821, "BIKE": 822, "BEGINNING": 823, "MARBLES": 824, "REVVING": 825, "TABLE": 826, "FOLDING": 827}


--------------------------------------------------------------------------------
/nets/ours_MWAFM_Net.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from torch.autograd import Variable
  5 | import numpy
  6 | import copy
  7 | import math
  8 | from nets.multi_attention import MultiScaleSelfAttention
  9 | 
 10 | 
 11 | class QstEncoder(nn.Module):
 12 | 
 13 |     def __init__(self, qst_vocab_size, word_embed_size, embed_size, num_layers, hidden_size):
 14 | 
 15 |         super(QstEncoder, self).__init__()
 16 |         self.word2vec = nn.Embedding(qst_vocab_size, word_embed_size)
 17 |         self.tanh = nn.Tanh()
 18 |         self.lstm = nn.LSTM(word_embed_size, hidden_size, num_layers)
 19 |         self.fc = nn.Linear(2*num_layers*hidden_size, embed_size)     # 2 for hidden and cell states
 20 | 
 21 |     def forward(self, question):
 22 | 
 23 |         qst_vec = self.word2vec(question)                             # [batch_size, max_qst_length=30, word_embed_size=300]
 24 |         qst_vec = self.tanh(qst_vec)
 25 |         qst_vec = qst_vec.transpose(0, 1)                             # [max_qst_length=30, batch_size, word_embed_size=300]
 26 |         self.lstm.flatten_parameters()
 27 |         _, (hidden, cell) = self.lstm(qst_vec)                        # [num_layers=2, batch_size, hidden_size=512]
 28 |         qst_feature = torch.cat((hidden, cell), 2)                    # [num_layers=2, batch_size, 2*hidden_size=1024]
 29 |         qst_feature = qst_feature.transpose(0, 1)                     # [batch_size, num_layers=2, 2*hidden_size=1024]
 30 |         qst_feature = qst_feature.reshape(qst_feature.size()[0], -1)  # [batch_size, 2*num_layers*hidden_size=2048]
 31 |         qst_feature = self.tanh(qst_feature)
 32 |         qst_feature = self.fc(qst_feature)                            # [batch_size, embed_size]
 33 | 
 34 |         return qst_feature
 35 | 
 36 | 
 37 | def _get_clones(module, N):
 38 |     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
 39 | 
 40 | class Encoder(nn.Module):
 41 | 
 42 |     def __init__(self, encoder_layer, num_layers, norm=None):
 43 |         super(Encoder, self).__init__()
 44 |         self.layers = _get_clones(encoder_layer, num_layers)
 45 |         self.num_layers = num_layers
 46 |         self.norm1 = nn.LayerNorm(512)
 47 |         self.norm2 = nn.LayerNorm(512)
 48 |         self.norm = norm
 49 | 
 50 |     def forward(self, src_a, mask=None, src_key_padding_mask=None):
 51 |         output_a = src_a
 52 | 
 53 |         for i in range(self.num_layers):
 54 |             output_a = self.layers[i](src_a, src_a, src_mask=mask,src_key_padding_mask=src_key_padding_mask)
 55 | 
 56 |         if self.norm:
 57 |             output_a = self.norm1(output_a)
 58 | 
 59 |         return output_a
 60 | 
 61 | 
 62 | 
 63 | class MultiAttnLayer(nn.Module):
 64 | 
 65 |     # d_model=512, nhead=1, dim_feedforward=512), num_layers=1
 66 |     def __init__(self, d_model, nhead, window_size, dim_feedforward=512, dropout=0.1):
 67 |         super(MultiAttnLayer, self).__init__()
 68 | 
 69 | 
 70 |         self.self_attn = MultiScaleSelfAttention(num_attention_heads = nhead, 
 71 |                                                  hidden_size = d_model,
 72 |                                                  attention_probs_dropout_prob = 0.0,
 73 |                                                  attention_window = [window_size],
 74 |                                                  attention_dilation = [1],
 75 |                                                  attention_mode = 'sliding_chunks',
 76 |                                                  autoregressive = False, 
 77 |                                                  layer_id=0)
 78 | 
 79 |         self.cm_attn = MultiScaleSelfAttention(num_attention_heads = nhead,   
 80 |                                                  hidden_size = d_model,
 81 |                                                  attention_probs_dropout_prob = 0.0,
 82 |                                                  attention_window = [window_size],
 83 |                                                  attention_dilation = [1],
 84 |                                                  attention_mode = 'sliding_chunks',
 85 |                                                  autoregressive = False, 
 86 |                                                  layer_id=0)
 87 | 
 88 |         # Implementation of Feedforward model
 89 |         self.linear1 = nn.Linear(d_model, dim_feedforward)
 90 |         self.dropout = nn.Dropout(dropout)
 91 |         self.linear2 = nn.Linear(dim_feedforward, d_model)
 92 | 
 93 |         self.norm1 = nn.LayerNorm(d_model)
 94 |         self.norm2 = nn.LayerNorm(d_model)
 95 |         self.dropout11 = nn.Dropout(dropout)
 96 |         self.dropout12 = nn.Dropout(dropout)
 97 |         self.dropout2 = nn.Dropout(dropout)
 98 | 
 99 |         
100 | 
101 |     def forward(self, src_q, src_kv, src_mask=None, src_key_padding_mask=None):
102 | 
103 |         src_lf_self = self.self_attn(src_q, src_q, src_q)[0]
104 | 
105 |         src_q = src_q + self.dropout12(src_lf_self)
106 |         src_q = self.norm1(src_q)
107 | 
108 |         # src_lf_self = self.linear2(self.dropout(F.relu(self.linear1(src_q))))
109 |         # src_q = src_q + self.dropout2(src_lf_self)
110 |         # src_q = self.norm2(src_q)
111 | 
112 |         return src_q
113 | 
114 | 
115 | 
class Encoder_QA(nn.Module):
    """Stack of ``num_layers`` independent copies of ``encoder_layer``.

    Every layer is invoked as
    ``layer(x, src_mask=mask, src_key_padding_mask=src_key_padding_mask)``
    and the output of one layer is the input of the next.

    Args:
        encoder_layer: prototype layer; it is deep-copied ``num_layers`` times.
        num_layers: number of stacked copies.
        norm: acts purely as a flag. When truthy, ``self.norm1`` is applied
            to the final output; the object passed in is never called.

    Note:
        ``norm1`` and ``norm2`` are fixed to a feature width of 512.
        ``norm2`` is not used in ``forward``; it is kept so the parameter
        names of existing checkpoints stay loadable.

        Earlier revisions fed the untouched input to every layer, so only
        the last layer influenced the output. Checkpoints trained that way
        with ``num_layers > 1`` will produce different outputs now and
        should be retrained.
    """

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm1 = nn.LayerNorm(512)
        self.norm2 = nn.LayerNorm(512)
        self.norm = norm

    def forward(self, src_a, mask=None, src_key_padding_mask=None):
        """Run ``src_a`` through all layers in order and return the result.

        With zero layers the input is returned unchanged.
        """
        output_a = src_a

        # Chain the layers: each one consumes the previous layer's output.
        for layer in self.layers:
            output_a = layer(output_a, src_mask=mask,
                             src_key_padding_mask=src_key_padding_mask)

        if self.norm:
            output_a = self.norm1(output_a)

        return output_a
136 | 
137 | 
class QAHanLayer(nn.Module):
    """Post-norm Transformer encoder layer operating on batch-first input.

    ``forward`` applies multi-head self-attention and a two-layer ReLU
    feed-forward network, each wrapped in a residual connection followed by
    LayerNorm. ``cm_attn``, ``dropout11`` and ``activation`` are constructed
    but not used by ``forward``.

    Args:
        d_model: feature width of the input and output.
        nhead: number of attention heads.
        dim_feedforward: hidden width of the feed-forward network.
        dropout: dropout probability used throughout the layer.
    """

    def __init__(self, d_model, nhead, dim_feedforward=512, dropout=0.1):
        super().__init__()

        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.cm_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)

        # Feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout11 = nn.Dropout(dropout)
        self.dropout12 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = nn.ReLU()

    def forward(self, src_a, src_mask=None, src_key_padding_mask=None):
        """Transform ``src_a`` and return a tensor with the same axis layout.

        The first two axes are swapped before attention, because the
        attention modules are built with the default sequence-first layout,
        and swapped back before returning.
        """
        seq_first = src_a.permute(1, 0, 2)

        # Self-attention sub-block.
        attn_out = self.self_attn(
            seq_first, seq_first, seq_first,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )[0]
        hidden = self.norm1(seq_first + self.dropout12(attn_out))

        # Feed-forward sub-block.
        expanded = F.relu(self.linear1(hidden))
        ff_out = self.linear2(self.dropout(expanded))
        hidden = self.norm2(hidden + self.dropout2(ff_out))

        return hidden.permute(1, 0, 2)
170 | 
171 | 
class MWAFM_Net(nn.Module):
    """Audio question-answering network that fuses audio and question features.

    ``forward`` projects audio frames (last dim 128) and question word
    features (last dim 300) to 512 channels, applies self-attention to the
    question and self+cross attention to the audio, pools both over time,
    multiplies them element-wise and classifies into 828 outputs.

    Args:
        d_model: width used by the ``self_attn`` / ``cm_attn`` block of
            ``SelfCrossAttn``. Most other layers are hard-coded to 512.
        nhead: number of heads for ``self_attn`` / ``cm_attn``.
        dropout: dropout probability for the ``SelfCrossAttn`` block.
        dim_feedforward: hidden width of the ``SelfCrossAttn`` feed-forward.

    NOTE(review): several submodules (``question_fc2``, ``question_encoder``,
    ``word2vec``, ``multi_scale_norm``, ``combine_fc1``, ``activation``,
    ``tanh``) are created here but not referenced by ``forward``. Removing
    them would change the parameter names of saved checkpoints.
    """

    def __init__(self, d_model=512, nhead=1, dropout=0.1, dim_feedforward=512):
        super(MWAFM_Net, self).__init__()


        # self.audio_ast_fc = nn.Linear(768, 512)
        # self.fusion_ast_fc = nn.Linear(1024, 512)
 
        # Input projections: audio frames 128 -> 512, question words 300 -> 512.
        self.audio_fc =  nn.Linear(128, 512)

        self.question_fc = nn.Linear(300, 512)
        self.question_fc2 = nn.Linear(512, 512)

        self.question_encoder = QstEncoder(2000, 512, 512, 1, 512)
        self.word2vec = nn.Embedding(2000, 512)

        # One single-layer windowed-attention encoder per window size.
        self.multi_scale_encoder_2 = Encoder(MultiAttnLayer(d_model=512, nhead=4, window_size=2, dim_feedforward=512), num_layers=1)
        self.multi_scale_encoder_4 = Encoder(MultiAttnLayer(d_model=512, nhead=4, window_size=4, dim_feedforward=512), num_layers=1)
        self.multi_scale_encoder_6 = Encoder(MultiAttnLayer(d_model=512, nhead=4, window_size=6, dim_feedforward=512), num_layers=1)
        self.multi_scale_encoder_12 = Encoder(MultiAttnLayer(d_model=512, nhead=4, window_size=12, dim_feedforward=512), num_layers=1)

        # Shared post-processing applied to every window-size branch.
        self.multi_scale_linear = nn.Linear(512, 512)
        self.multi_scale_dropout = nn.Dropout(0.1)
        self.multi_scale_norm = nn.LayerNorm(512)

        # question as query on audio and visual_feat_grd
        # (shared by SelfAttn and QuestionQuereidAttn)
        self.attn_qst_query = nn.MultiheadAttention(512, 4, dropout=0.1)
        self.qst_query_linear1 = nn.Linear(512, 512)
        self.qst_query_relu = nn.ReLU()
        self.qst_query_dropout1 = nn.Dropout(0.1)
        self.qst_query_linear2 = nn.Linear(512, 512)
        self.qst_query_dropout2 = nn.Dropout(0.1)
        self.qst_query_norm = nn.LayerNorm(512)

        # self-cross
        # (modules used by SelfCrossAttn)
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.cm_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout11 = nn.Dropout(dropout)
        self.dropout12 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.activation = nn.ReLU()

        self.tanh = nn.Tanh()

        
        # Classification head. Only combine_fc2 and pred_fc are used in forward;
        # combine_fc1 (1024 -> 512) is not.
        self.combine_fc1 = nn.Linear(1024, 512)
        self.combine_fc2 = nn.Linear(512, 256)
        # 828 outputs; presumably one per answer class — TODO confirm against
        # the answer vocabulary used for training.
        self.pred_fc = nn.Linear(256, 828)



        # Four stacked QAHanLayer copies applied to the audio features.
        self.multi_layers = Encoder_QA(QAHanLayer(d_model=512, 
                                                    nhead=1, 
                                                    dim_feedforward=512), 
                                                    num_layers=4)

    

    ### attention, question as query on visual_feat and audio_feat
    def SelfAttn(self, quests_feat_input, key_value_feat):
        """Self-attend ``key_value_feat`` and refine it with a feed-forward block.

        ``key_value_feat`` is batch-first ([B, T, C]) and serves as query, key
        and value. ``quests_feat_input`` is accepted but not read. Returns a
        batch-first tensor.
        """
        
        ### input Q, K, V: [T, B, C]

        key_value_feat_grd = key_value_feat.permute(1, 0, 2)
        qst_feat_query = key_value_feat_grd
        key_value_feat_att = self.attn_qst_query(qst_feat_query, key_value_feat_grd, key_value_feat_grd, 
                                                 attn_mask=None, key_padding_mask=None)[0]
        src = self.qst_query_linear1(key_value_feat_att)
        src = self.qst_query_relu(src)
        src = self.qst_query_dropout1(src)
        src = self.qst_query_linear2(src)
        src = self.qst_query_dropout2(src)
        
        # Residual is taken around the feed-forward block only (the attention
        # output is the residual base, not the raw input).
        key_value_feat_att = key_value_feat_att + src
        key_value_feat_att = self.qst_query_norm(key_value_feat_att)

        return key_value_feat_att.permute(1, 0, 2)


    def SelfCrossAttn(self, src_q, src_v, src_mask=None, src_key_padding_mask=None):
        """Update ``src_q`` with self-attention plus cross-attention over ``src_v``.

        Both inputs are batch-first ([B, T, C]). The cross-attention and
        self-attention outputs are added to ``src_q`` in a single residual
        step, followed by LayerNorm and a feed-forward block. The same
        ``src_mask`` / ``src_key_padding_mask`` are passed to both attention
        calls. Returns a batch-first tensor.
        """
        # src_q = src_q.unsqueeze(0)
        src_q = src_q.permute(1, 0, 2)
        src_v = src_v.permute(1, 0, 2)
        src1 = self.cm_attn(src_q, src_v, src_v, attn_mask=src_mask,key_padding_mask=src_key_padding_mask)[0]
        src2 = self.self_attn(src_q, src_q, src_q, attn_mask=src_mask,key_padding_mask=src_key_padding_mask)[0]
        src_q = src_q + self.dropout11(src1) + self.dropout12(src2)
        src_q = self.norm1(src_q)

        src2 = self.linear2(self.dropout(F.relu(self.linear1(src_q))))
        src_q = src_q + self.dropout2(src2)
        src_q = self.norm2(src_q)
        return src_q.permute(1, 0, 2)


    ### attention, question as query on visual_feat and audio_feat
    def QuestionQuereidAttn(self, quests_feat_input, key_value_feat):
        """Attend from ``key_value_feat`` (query) over the question (key/value).

        Both inputs are batch-first ([B, T, C]). Uses the same attention and
        feed-forward modules as ``SelfAttn``. Not called by ``forward``.
        Returns a batch-first tensor.
        """

        # qst_feat_query = quests_feat_input.unsqueeze(0)          # [1, B, C], [1, 2, 512]
        qst_feat_query = quests_feat_input.permute(1, 0, 2)

        ### input Q, K, V: [T, B, C]
        key_value_feat_grd = key_value_feat.permute(1, 0, 2)
        key_value_feat_att = self.attn_qst_query(key_value_feat_grd, qst_feat_query, qst_feat_query,  
                                                 attn_mask=None, key_padding_mask=None)[0]
        src = self.qst_query_linear1(key_value_feat_att)
        src = self.qst_query_relu(src)
        src = self.qst_query_dropout1(src)
        src = self.qst_query_linear2(src)
        src = self.qst_query_dropout2(src)
        
        key_value_feat_att = key_value_feat_att + src
        key_value_feat_att = self.qst_query_norm(key_value_feat_att)

        return key_value_feat_att.permute(1, 0, 2)


    # def forward(self, audio, audio_ast_feat, question):
    def forward(self, audio, question):
        """Compute answer scores for a batch of audio/question pairs.

        Args:
            audio: audio features with last dimension 128, batch-first.
            question: question word features with last dimension 300,
                batch-first.

        Returns:
            Tensor whose last dimension is 828 (output of ``pred_fc``).
        """

        ### feature input
        audio_feat = self.audio_fc(audio)               # [B, T, C]
        qst_feat = self.question_fc(question)

        # audio_ast_feat = self.audio_ast_fc(audio_ast_feat)
        # audio_ast_feat = F.relu(audio_ast_feat)
        
        # audio_feat_grd is not read again below; qst_feat_grd keeps the
        # pre-attention question features for the cross-attention call.
        audio_feat_grd = audio_feat
        qst_feat_grd = qst_feat
    
        ### --------------- Hybrid Attention Module start --------------- 
        qst_feat = self.SelfAttn(qst_feat, qst_feat)
        audio_feat = self.SelfCrossAttn(audio_feat, qst_feat_grd)

        ### --------------- Multi-scale Window attention start --------------- 
        ## input: [B, T, C], output: [B, T, C]
        # The second positional argument binds to Encoder.forward's `mask`
        # parameter, which MultiAttnLayer.forward does not read.
        aud_feat_scale_2 = self.multi_scale_encoder_2(audio_feat, audio_feat)
        aud_feat_scale_4 = self.multi_scale_encoder_4(audio_feat, audio_feat)
        aud_feat_scale_6 = self.multi_scale_encoder_6(audio_feat, audio_feat)
        aud_feat_scale_12 = self.multi_scale_encoder_12(audio_feat, audio_feat)

        audio_feat_kv2 = aud_feat_scale_2.permute(1, 0, 2)
        audio_feat_kv4 = aud_feat_scale_4.permute(1, 0, 2)
        audio_feat_kv6 = aud_feat_scale_6.permute(1, 0, 2)
        audio_feat_kv12 = aud_feat_scale_12.permute(1, 0, 2)

        # The same linear + dropout is applied to every branch.
        audio_feat_kv2 = self.multi_scale_dropout(F.relu(self.multi_scale_linear(audio_feat_kv2)))
        audio_feat_kv4 = self.multi_scale_dropout(F.relu(self.multi_scale_linear(audio_feat_kv4)))
        audio_feat_kv6 = self.multi_scale_dropout(F.relu(self.multi_scale_linear(audio_feat_kv6)))
        audio_feat_kv12 = self.multi_scale_dropout(F.relu(self.multi_scale_linear(audio_feat_kv12)))

        audio_feat_ws_sum = audio_feat_kv2 + audio_feat_kv4 + audio_feat_kv6 + audio_feat_kv12
        audio_feat_kv = audio_feat + audio_feat_ws_sum.permute(1, 0, 2)
        # audio_feat_kv = self.multi_scale_norm(audio_feat_kv)

        ### --------------- Multi-scale Window attention end --------------- 

        # NOTE(review): this assignment overwrites the multi-scale result
        # computed just above, and multi_layers receives `audio_feat`, not
        # `audio_feat_kv`. As written, the window-attention branch does not
        # reach the output — confirm whether `audio_feat_kv` was intended here.
        audio_feat_kv = self.multi_layers(audio_feat)
        # Average over the time axis.
        audio_feat_kv = audio_feat_kv.mean(dim=1)

        # cat
        # audio_feat_kv = torch.cat([audio_ast_feat.mean(-2), audio_feat_kv], dim=-1)
        # audio_feat_kv = self.fusion_ast_fc(audio_feat_kv)
        # # audio_feat_kv = F.relu(audio_feat_kv)

        # add
        # audio_feat_kv = audio_feat_kv + audio_ast_feat.mean(-2)
        qst_feat = qst_feat.mean(dim=1)
        # Element-wise product fuses the pooled audio and question features.
        combine_feat = torch.mul(audio_feat_kv, qst_feat)
        
        combine_feat = F.relu(self.combine_fc2(combine_feat))
        feat_output = self.pred_fc(combine_feat)

        return feat_output
352 | 
353 | 


--------------------------------------------------------------------------------
/nets/multi_attention.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | import math
  3 | import torch
  4 | from torch import nn
  5 | import torch.nn.functional as F
  6 | from nets.diagonaled_mm_tvm import diagonaled_mm as diagonaled_mm_tvm, mask_invalid_locations
  7 | from nets.sliding_chunks import sliding_chunks_matmul_qk, sliding_chunks_matmul_pv
  8 | from nets.sliding_chunks import sliding_chunks_no_overlap_matmul_qk, sliding_chunks_no_overlap_matmul_pv
  9 | 
 10 | 
 11 | class MultiScaleSelfAttention(nn.Module):
 12 |     def __init__(self, num_attention_heads, 
 13 |                        hidden_size,
 14 |                        attention_probs_dropout_prob,
 15 |                        attention_window,
 16 |                        attention_dilation,
 17 |                        attention_mode,
 18 |                        autoregressive, 
 19 |                        layer_id):
 20 |         super(MultiScaleSelfAttention, self).__init__()
 21 |         if hidden_size % num_attention_heads != 0:
 22 |             raise ValueError(
 23 |                 "The hidden size (%d) is not a multiple of the number of attention "
 24 |                 "heads (%d)" % (hidden_size, num_attention_heads))
 25 |         self.num_heads = num_attention_heads
 26 |         self.head_dim = int(hidden_size / num_attention_heads)
 27 |         self.embed_dim = hidden_size
 28 | 
 29 |         self.query = nn.Linear(hidden_size, self.embed_dim)
 30 |         self.key = nn.Linear(hidden_size, self.embed_dim)
 31 |         self.value = nn.Linear(hidden_size, self.embed_dim)
 32 | 
 33 |         self.query_global = nn.Linear(hidden_size, self.embed_dim)
 34 |         self.key_global = nn.Linear(hidden_size, self.embed_dim)
 35 |         self.value_global = nn.Linear(hidden_size, self.embed_dim)
 36 | 
 37 |         self.dropout = attention_probs_dropout_prob
 38 | 
 39 |         self.layer_id = layer_id
 40 |         self.attention_window = attention_window[self.layer_id]
 41 |         self.attention_dilation = attention_dilation[self.layer_id]
 42 |         self.attention_mode = attention_mode
 43 |         self.autoregressive = autoregressive
 44 |         assert self.attention_window > 0
 45 |         assert self.attention_dilation > 0
 46 |         assert self.attention_mode in ['tvm', 'sliding_chunks', 'sliding_chunks_no_overlap']
 47 |         if self.attention_mode in ['sliding_chunks', 'sliding_chunks_no_overlap']:
 48 |             assert not self.autoregressive  # not supported
 49 |             assert self.attention_dilation == 1  # dilation is not supported
 50 |             # assert self.attention_dilation == 2  # dilation is not supported
 51 | 
 52 |     def forward(
 53 |         self,
 54 |         hidden_states_q,  # [B, T, C]
 55 |         hidden_states_k,
 56 |         hidden_states_v,
 57 |         attention_mask=None,
 58 |         head_mask=None,
 59 |         encoder_hidden_states=None,
 60 |         encoder_attention_mask=None,
 61 |         output_attentions=False,
 62 |     ):
 63 | 
 64 |         # print("\n--->> Forward Info: ")
 65 |         # hidden states input: [B, T, C]
 66 | 
 67 |         '''
 68 |         The `attention_mask` is changed in `BertModel.forward` from 0, 1, 2 to
 69 |             -ve: no attention
 70 |               0: local attention
 71 |             +ve: global attention
 72 |         '''
 73 |         # print("Attention mask: ", attention_mask)
 74 | 
 75 |         assert encoder_hidden_states is None, "`encoder_hidden_states` is not supported and should be None"
 76 |         assert encoder_attention_mask is None, "`encoder_attention_mask` is not supported and shiould be None"
 77 | 
 78 |         if attention_mask is not None:
 79 |             # print("attention_mask is not None")
 80 |             attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1)
 81 |             key_padding_mask = attention_mask < 0
 82 |             extra_attention_mask = attention_mask > 0
 83 |             remove_from_windowed_attention_mask = attention_mask != 0
 84 | 
 85 |             num_extra_indices_per_batch = extra_attention_mask.long().sum(dim=1)
 86 |             max_num_extra_indices_per_batch = num_extra_indices_per_batch.max()
 87 |             
 88 |             if max_num_extra_indices_per_batch <= 0:
 89 |                 extra_attention_mask = None
 90 |             else:
 91 |                 # To support the case of variable number of global attention in the rows of a batch,
 92 |                 # we use the following three selection masks to select global attention embeddings
 93 |                 # in a 3d tensor and pad it to `max_num_extra_indices_per_batch`
 94 |                 # 1) selecting embeddings that correspond to global attention
 95 |                 extra_attention_mask_nonzeros = extra_attention_mask.nonzero(as_tuple=True)
 96 |                 zero_to_max_range = torch.arange(0, max_num_extra_indices_per_batch,
 97 |                                                  device=num_extra_indices_per_batch.device)
 98 |                 # mask indicating which values are actually going to be padding
 99 |                 selection_padding_mask = zero_to_max_range < num_extra_indices_per_batch.unsqueeze(dim=-1)
100 |                 # 2) location of the non-padding values in the selected global attention
101 |                 selection_padding_mask_nonzeros = selection_padding_mask.nonzero(as_tuple=True)
102 |                 # 3) location of the padding values in the selected global attention
103 |                 selection_padding_mask_zeros = (selection_padding_mask == 0).nonzero(as_tuple=True)
104 |         else:
105 |             remove_from_windowed_attention_mask = None
106 |             extra_attention_mask = None
107 |             key_padding_mask = None
108 | 
109 |         
110 |         hidden_states_q = hidden_states_q.transpose(0, 1)
111 |         hidden_states_k = hidden_states_k.transpose(0, 1)
112 |         hidden_states_v = hidden_states_v.transpose(0, 1)
113 | 
114 |         seq_len, bsz, embed_dim = hidden_states_q.size()
115 | 
116 |         assert embed_dim == self.embed_dim
117 |         q = self.query(hidden_states_q)
118 |         k = self.key(hidden_states_k)
119 |         v = self.value(hidden_states_v)
120 |         q /= math.sqrt(self.head_dim)
121 | 
122 |         # print("num head: ", self.num_heads)
123 |         # print("head dim: ", self.head_dim)   # int(hidden_size / num_attention_heads)
124 | 
125 |         q = q.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1)
126 |         k = k.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1)
127 | 
128 |         # print("long q: ", q.shape)
129 |         # print("long k: ", k.shape)
130 |         # print("long v: ", v.shape)
131 | 
132 |         # attn_weights = (bsz, seq_len, self.num_heads, window*2+1)
133 |         if self.attention_mode == 'tvm':
134 |             q = q.float().contiguous()
135 |             k = k.float().contiguous()
136 |             attn_weights = diagonaled_mm_tvm(q, k, self.attention_window, self.attention_dilation, False, 0, False)
137 |         elif self.attention_mode == "sliding_chunks":
138 |             attn_weights = sliding_chunks_matmul_qk(q, k, self.attention_window, padding_value=0)
139 |         elif self.attention_mode == "sliding_chunks_no_overlap":
140 |             attn_weights = sliding_chunks_no_overlap_matmul_qk(q, k, self.attention_window, padding_value=0)
141 |         else:
142 |             raise False
143 | 
144 |         # attn_weights = (bsz, seq_len, self.num_heads, window*2+1)
145 |         mask_invalid_locations(attn_weights, self.attention_window, self.attention_dilation, False)
146 | 
147 |         if remove_from_windowed_attention_mask is not None:
148 |             # This implementation is fast and takes very little memory because num_heads x hidden_size = 1
149 |             # from (bsz x seq_len) to (bsz x seq_len x num_heads x hidden_size)
150 |             remove_from_windowed_attention_mask = remove_from_windowed_attention_mask.unsqueeze(dim=-1).unsqueeze(dim=-1)
151 |             # remove_from_windowed_attention_mask = remove_from_windowed_attention_mask.unsqueeze(dim=-1)
152 |             # print("remove_from_windowed_attention_mask: ", remove_from_windowed_attention_mask.shape)
153 |             # cast to float/half then replace 1's with -inf
154 |             float_mask = remove_from_windowed_attention_mask.type_as(q).masked_fill(remove_from_windowed_attention_mask, -10000.0)
155 |             repeat_size = 1 if isinstance(self.attention_dilation, int) else len(self.attention_dilation)
156 |             float_mask = float_mask.repeat(1, 1, repeat_size, 1)
157 |             ones = float_mask.new_ones(size=float_mask.size())  # tensor of ones
158 |             # diagonal mask with zeros everywhere and -inf inplace of padding
159 |             if self.attention_mode == 'tvm':
160 |                 d_mask = diagonaled_mm_tvm(ones, float_mask, self.attention_window, self.attention_dilation, False, 0, False)
161 |             elif self.attention_mode == "sliding_chunks":
162 |                 d_mask = sliding_chunks_matmul_qk(ones, float_mask, self.attention_window, padding_value=0)
163 |             elif self.attention_mode == "sliding_chunks_no_overlap":
164 |                 d_mask = sliding_chunks_no_overlap_matmul_qk(ones, float_mask, self.attention_window, padding_value=0)
165 |             attn_weights += d_mask
166 | 
167 | 
168 |         assert list(attn_weights.size())[:3] == [bsz, seq_len, self.num_heads]
169 |         assert attn_weights.size(dim=3) in [self.attention_window * 2 + 1, self.attention_window * 3]
170 | 
171 |         # the extra attention
172 |         if extra_attention_mask is not None:
173 |             selected_k = k.new_zeros(bsz, max_num_extra_indices_per_batch, self.num_heads, self.head_dim)
174 |             selected_k[selection_padding_mask_nonzeros] = k[extra_attention_mask_nonzeros]
175 |             # (bsz, seq_len, num_heads, max_num_extra_indices_per_batch)
176 |             selected_attn_weights = torch.einsum('blhd,bshd->blhs', (q, selected_k))
177 |             selected_attn_weights[selection_padding_mask_zeros[0], :, :, selection_padding_mask_zeros[1]] = -10000
178 |             # concat to attn_weights
179 |             # (bsz, seq_len, num_heads, extra attention count + 2*window+1)
180 |             attn_weights = torch.cat((selected_attn_weights, attn_weights), dim=-1)
181 |         attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32)  # use fp32 for numerical stability
182 | 
183 |         if key_padding_mask is not None:
184 |             # softmax sometimes inserts NaN if all positions are masked, replace them with 0
185 |             attn_weights_float = torch.masked_fill(attn_weights_float, key_padding_mask.unsqueeze(-1).unsqueeze(-1), 0.0)
186 | 
187 |         attn_weights = attn_weights_float.type_as(attn_weights)
188 |         attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training)
189 |         v = v.view(seq_len, bsz, self.num_heads, self.head_dim).transpose(0, 1)
190 | 
191 |         attn = 0
192 |         if extra_attention_mask is not None:
193 |             selected_attn_probs = attn_probs.narrow(-1, 0, max_num_extra_indices_per_batch)
194 |             selected_v = v.new_zeros(bsz, max_num_extra_indices_per_batch, self.num_heads, self.head_dim)
195 |             selected_v[selection_padding_mask_nonzeros] = v[extra_attention_mask_nonzeros]
196 |             # use `matmul` because `einsum` crashes sometimes with fp16
197 |             # attn = torch.einsum('blhs,bshd->blhd', (selected_attn_probs, selected_v))
198 |             attn = torch.matmul(selected_attn_probs.transpose(1, 2), selected_v.transpose(1, 2).type_as(selected_attn_probs)).transpose(1, 2)
199 |             attn_probs = attn_probs.narrow(-1, max_num_extra_indices_per_batch, attn_probs.size(-1) - max_num_extra_indices_per_batch).contiguous()
200 | 
201 |         if self.attention_mode == 'tvm':
202 |             v = v.float().contiguous()
203 |             attn += diagonaled_mm_tvm(attn_probs, v, self.attention_window, self.attention_dilation, True, 0, False)
204 |         elif self.attention_mode == "sliding_chunks":
205 |             attn += sliding_chunks_matmul_pv(attn_probs, v, self.attention_window)
206 |         elif self.attention_mode == "sliding_chunks_no_overlap":
207 |             attn += sliding_chunks_no_overlap_matmul_pv(attn_probs, v, self.attention_window)
208 |         else:
209 |             raise False
210 | 
211 |         attn = attn.type_as(hidden_states_q)    # 将attn类型转为hidden_states类型
212 |         assert list(attn.size()) == [bsz, seq_len, self.num_heads, self.head_dim]
213 |         attn = attn.transpose(0, 1).reshape(seq_len, bsz, embed_dim).contiguous()
214 | 
215 |         # For this case, we'll just recompute the attention for these indices
216 |         # and overwrite the attn tensor. TODO: remove the redundant computation
217 |         if extra_attention_mask is not None:
218 |             selected_hidden_states = hidden_states.new_zeros(max_num_extra_indices_per_batch, bsz, embed_dim)
219 |             selected_hidden_states[selection_padding_mask_nonzeros[::-1]] = hidden_states_q[extra_attention_mask_nonzeros[::-1]]
220 | 
221 |             q = self.query_global(selected_hidden_states)
222 |             k = self.key_global(hidden_states_k)
223 |             v = self.value_global(hidden_states_v)
224 |             q /= math.sqrt(self.head_dim)
225 | 
226 |             q = q.contiguous().view(max_num_extra_indices_per_batch, bsz * self.num_heads, self.head_dim).transpose(0, 1)  # (bsz*self.num_heads, max_num_extra_indices_per_batch, head_dim)
227 |             k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)  # bsz * self.num_heads, seq_len, head_dim)
228 |             v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)  # bsz * self.num_heads, seq_len, head_dim)
229 |             attn_weights = torch.bmm(q, k.transpose(1, 2))
230 |             assert list(attn_weights.size()) == [bsz * self.num_heads, max_num_extra_indices_per_batch, seq_len]
231 | 
232 |             attn_weights = attn_weights.view(bsz, self.num_heads, max_num_extra_indices_per_batch, seq_len)
233 |             attn_weights[selection_padding_mask_zeros[0], :, selection_padding_mask_zeros[1], :] = -10000.0
234 |             if key_padding_mask is not None:
235 |                 attn_weights = attn_weights.masked_fill(
236 |                     key_padding_mask.unsqueeze(1).unsqueeze(2),
237 |                     -10000.0,
238 |                 )
239 |             attn_weights = attn_weights.view(bsz * self.num_heads, max_num_extra_indices_per_batch, seq_len)
240 |             attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32)  # use fp32 for numerical stability
241 |             attn_probs = F.dropout(attn_weights_float.type_as(attn_weights), p=self.dropout, training=self.training)
242 |             selected_attn = torch.bmm(attn_probs, v)
243 |             assert list(selected_attn.size()) == [bsz * self.num_heads, max_num_extra_indices_per_batch, self.head_dim]
244 | 
245 |             selected_attn_4d = selected_attn.view(bsz, self.num_heads, max_num_extra_indices_per_batch, self.head_dim)
246 |             nonzero_selected_attn = selected_attn_4d[selection_padding_mask_nonzeros[0], :, selection_padding_mask_nonzeros[1]]
247 |             attn[extra_attention_mask_nonzeros[::-1]] = nonzero_selected_attn.view(len(selection_padding_mask_nonzeros[0]), -1).type_as(hidden_states_q)
248 | 
249 |         context_layer = attn.transpose(0, 1)
250 |         if output_attentions:
251 |             if extra_attention_mask is not None:
252 |                 # With global attention, return global attention probabilities only
253 |                 # batch_size x num_heads x max_num_global_attention_tokens x sequence_length
254 |                 # which is the attention weights from tokens with global attention to all tokens
255 |                 # It doesn't not return local attention
256 |                 # In case of variable number of global attantion in the rows of a batch,
257 |                 # attn_weights are padded with -10000.0 attention scores
258 |                 attn_weights = attn_weights.view(bsz, self.num_heads, max_num_extra_indices_per_batch, seq_len)
259 |             else:
260 |                 # without global attention, return local attention probabilities
261 |                 # batch_size x num_heads x sequence_length x window_size
262 |                 # which is the attention weights of every token attending to its neighbours
263 |                 attn_weights = attn_weights.permute(0, 2, 1, 3)
264 |         outputs = (context_layer, attn_weights) if output_attentions else (context_layer,)
265 | 
266 |         
267 |         return outputs
268 | 
269 | 
270 | # if __name__ == "__main__":
271 | 
272 | #     model = MultiScaleSelfAttention(config, layer_id)


--------------------------------------------------------------------------------
/nets/diagonaled_mm_tvm.py:
--------------------------------------------------------------------------------
  1 | from typing import Union
  2 | from functools import lru_cache
  3 | 
  4 | import torch
  5 | import os.path
  6 | 
  7 | 
  8 | class DiagonaledMM(torch.autograd.Function):
  9 |     '''Class to encapsulate tvm code for compiling a diagonal_mm function, in addition to calling
 10 |     this function from PyTorch
 11 |     '''
 12 | 
 13 |     function_dict = {}  # save a list of functions, each has a different set of parameters
 14 | 
 15 |     @staticmethod
 16 |     def _compile_function(dtype: str, device: str, b0: int = 4, b1: int = 4, b2: int = 16):
 17 |         '''Compiles a tvm function that computes diagonal_mm
 18 |         args:
 19 |         dtype: str in ['float64', 'float32', 'float16']
 20 |         device: str in ['cpu' or 'cuda']
 21 |         b0, b1, b2: size of tensor tiles. Very important for good performance
 22 | 
 23 |         '''
 24 |         import tvm  # import the full tvm library here for compilation. Don't import at the top of the file in case we don't need to compile
 25 |         from tvm.contrib import nvcc
 26 |         @tvm.register_func
 27 |         def tvm_callback_cuda_compile(code):
 28 |             """Use nvcc compiler for better perf."""
 29 |             ptx = nvcc.compile_cuda(code, target="ptx", arch='sm_52')  # use old arch for this to work on old GPUs
 30 |             return ptx
 31 | 
 32 |         assert dtype in ['float16', 'float32', 'float64']
 33 |         assert device in ['cpu', 'cuda']
 34 |         device = None if device == 'cpu' else device
 35 |         tgt_host="llvm"
 36 | 
 37 |         b = tvm.var('b')  # batch size
 38 |         n = tvm.var('n')  # sequence length
 39 |         h = tvm.var('h')  # number of heads
 40 |         m = tvm.var('m')  # hidden dimension
 41 |         w = tvm.var('w')  # window size
 42 |         w_upper = tvm.var('w_upper')  # window size to the right of the word. Should be `0` or `w`
 43 |         padding = tvm.var('padding')  # padding
 44 |         transpose_t1 = tvm.var('transpose_t1')  # t1 should be transposed
 45 |         t1d3 = tvm.var('t1d3')  # last dimension of t1
 46 |         t3d3 = tvm.var('t3d3')  # last dimension of t3 (the result tensor)
 47 |         X = tvm.placeholder((b, n, h, t1d3), name='X', dtype=dtype)  # first tensor
 48 |         Y = tvm.placeholder((b, n, h, m), name='Y', dtype=dtype)  # second tensor
 49 |         k = tvm.reduce_axis((0, t1d3), name='k')  # dimension to sum over
 50 |         D = tvm.placeholder((h), name='D', dtype='int')  # dilation per head
 51 |         output_shape = (b, n, h, t3d3)  # shape of the result tensor
 52 |         algorithm = lambda l, i, q, j: tvm.sum(
 53 |             tvm.if_then_else(
 54 |                 t3d3 == m,  # if output dimension == m, then t1 is diagonaled (FIXME: This breaks if t3d3 == m == t1d3)
 55 |                 tvm.if_then_else(
 56 |                     transpose_t1 == 0,
 57 |                     tvm.if_then_else(
 58 |                         tvm.all(
 59 |                             i + D[q] * (k - w) >= 0,
 60 |                             i + D[q] * (k - w) < n,
 61 |                         ),
 62 |                         X[l, i, q, k] * Y[l, i + D[q] * (k - w), q, j],  # t1 is diagonaled
 63 |                         padding
 64 |                     ),
 65 |                     tvm.if_then_else(
 66 |                         tvm.all(
 67 |                             i + D[q] * (k - w_upper) >= 0,  # `w_upper` to handle the case `autoregressive=True`
 68 |                             i + D[q] * (k - w_upper) < n,
 69 |                         ),
 70 |                         X[l, i + D[q] * (k - w_upper), q, (w_upper + w) - k] * Y[l, i + D[q] * (k - w_upper), q, j],  # # t1 is diagonaled and should be transposed
 71 |                         padding
 72 |                     ),
 73 |                 ),
 74 |                 tvm.if_then_else(
 75 |                     tvm.all(
 76 |                         i + D[q] * (j - w) >= 0,
 77 |                         i + D[q] * (j - w) < n,
 78 |                     ),
 79 |                     X[l, i, q, k] * Y[l, i + D[q] * (j - w), q, k],  # t1 is not diagonaled, but the output tensor is going to be
 80 |                     padding
 81 |                 )
 82 |             ), axis=k)
 83 | 
 84 |         Z = tvm.compute(output_shape, algorithm, name='Z')  # automatically generate cuda code
 85 |         s = tvm.create_schedule(Z.op)
 86 | 
 87 |         print('Lowering: \n ===================== \n{}'.format(tvm.lower(s, [X, Y, D], simple_mode=True)))
 88 | 
 89 |         # split long axis into smaller chunks and assing each one to a separate GPU thread/block
 90 |         ko, ki = s[Z].split(Z.op.reduce_axis[0], factor=b0)
 91 |         ZF = s.rfactor(Z, ki)
 92 | 
 93 |         j_outer, j_inner = s[Z].split(s[Z].op.axis[-1], factor=b1)
 94 |         i_outer, i_inner = s[Z].split(s[Z].op.axis[1], factor=b2)
 95 | 
 96 |         s[Z].bind(j_outer, tvm.thread_axis("blockIdx.x"))
 97 |         s[Z].bind(j_inner, tvm.thread_axis("threadIdx.y"))
 98 | 
 99 |         s[Z].bind(i_outer, tvm.thread_axis("blockIdx.y"))
100 |         s[Z].bind(i_inner, tvm.thread_axis("threadIdx.z"))
101 | 
102 |         tx = tvm.thread_axis("threadIdx.x")
103 |         s[Z].bind(s[Z].op.reduce_axis[0], tx)
104 |         s[ZF].compute_at(s[Z], s[Z].op.reduce_axis[0])
105 |         s[Z].set_store_predicate(tx.var.equal(0))
106 | 
107 |         print('Lowering with GPU splits: \n ===================== \n{}'.format(tvm.lower(s, [X, Y, D], simple_mode=True)))
108 | 
109 |         # compiling the automatically generated cuda code
110 |         diagonaled_mm = tvm.build(s, [X, Y, Z, D, w, w_upper, padding, transpose_t1, t3d3], target=device, target_host=tgt_host, name='diagonaled_mm')
111 |         return diagonaled_mm
112 | 
113 |     @staticmethod
114 |     def _get_lib_filename(dtype: str, device: str):
115 |         base_filename = 'multi_scale/lib/lib_diagonaled_mm'
116 |         return '{}_{}_{}.so'.format(base_filename, dtype, device)
117 | 
118 |     @staticmethod
119 |     def _save_compiled_function(f, dtype: str, device: str):
120 |         if not os.path.exists('multi_scale/lib/'):
121 |             os.makedirs('multi_scale/lib/')
122 |         f.export_library(DiagonaledMM._get_lib_filename(dtype, device))
123 | 
124 |     @staticmethod
125 |     def _load_compiled_function(dtype: str, device: str):
126 |         from tvm.module import load  # this can be the small runtime python library, and doesn't need to be the whole thing
127 |         filename = DiagonaledMM._get_lib_filename(dtype, device)
128 |         current_dir = os.path.dirname(os.path.abspath(__file__))
129 |         potential_dirs = ['../../', '../', './', f'{current_dir}/', f'{current_dir}/../']
130 |         for potential_dir in  potential_dirs:
131 |             filepath = '{}{}'.format(potential_dir, filename)
132 |             if os.path.isfile(filepath):
133 |                 print('Loading tvm binary from: {}'.format(filepath))
134 |                 return load(filepath)
135 |         return None
136 | 
137 |     @staticmethod
138 |     def _get_function(dtype: str, device: str):
139 |         '''Loads the function from the disk or compile it'''
140 |         # A list of arguments that define the function
141 |         args = (dtype, device)
142 |         if args not in DiagonaledMM.function_dict:
143 |             diagonaled_mm = DiagonaledMM._load_compiled_function(dtype, device)  # try to load from disk
144 |             if not diagonaled_mm:
145 |                 print('Tvm binary not found. Compiling ...')
146 |                 diagonaled_mm = DiagonaledMM._compile_function(dtype, device)  # compile
147 |                 DiagonaledMM._save_compiled_function(diagonaled_mm, dtype, device)  # save to disk
148 |             # convert the tvm function into a pytorch function
149 |             from tvm.contrib import dlpack
150 |             diagonaled_mm_pytorch = dlpack.to_pytorch_func(diagonaled_mm)  # wrap it as a pytorch function
151 |             # save the function into a dictionary to be reused
152 |             DiagonaledMM.function_dict[args] = diagonaled_mm_pytorch  # save it in a dictionary for next time
153 |         return DiagonaledMM.function_dict[args]
154 | 
155 |     @staticmethod
156 |     def _diagonaled_mm(t1: torch.Tensor, t2: torch.Tensor, w: int, d: Union[torch.Tensor,int],
157 |                        is_t1_diagonaled: bool = False, transpose_t1: bool = False, padding: int = 0,
158 |                        autoregressive: bool = False):
159 |         '''Calls the compiled function after checking the input format. This function is called in three different modes.
160 |         t1 x t2 = r ==> t1 and t2 are not diagonaled, but r is. Useful for query x key = attention_scores
161 |         t1 x t2 = r ==> t1 is diagonaled, but t2 and r are not. Useful to compuate attantion_scores x value = context
162 |         t1 x t2 = r ==> t1 is diagonaled and it should be transposed, but t2 and r are not diagonaled. Useful in some of
163 |                             the calculations in the backward pass.
164 |         '''
165 |         dtype = str(t1.dtype).split('.')[1]
166 |         device = t1.device.type
167 |         assert len(t1.shape) == 4
168 |         assert len(t1.shape) == len(t2.shape)
169 |         assert t1.shape[:3] == t2.shape[:3]
170 |         if isinstance(d, int):  # if d is an integer, replace it with a tensor of the same length
171 |                                 # as number of heads, and it is filled with the same dilation value
172 |             d = t1.new_full(size=(t1.shape[2],), fill_value=d, dtype=torch.int, requires_grad=False)
173 | 
174 |         assert len(d.shape) == 1
175 |         assert d.shape[0] == t1.shape[2]  # number of dilation scores should match number of heads
176 |         b = t1.shape[0]  # batch size
177 |         n = t1.shape[1]  # sequence length
178 |         h = t1.shape[2]  # number of heads
179 |         m = t2.shape[3]  # hidden dimension
180 |         w_upper = 0 if autoregressive else w
181 |         c = w_upper + w + 1  # number of diagonals
182 |         if is_t1_diagonaled:
183 |             assert t1.shape[3] == c
184 |             r = t1.new_empty(b, n, h, m)  # allocate spase for the result tensor
185 |         else:
186 |             assert not transpose_t1
187 |             assert t1.shape[3] == m
188 |             r = t1.new_empty(b, n, h, c)  # allocate spase for the result tensor
189 | 
190 |         # gets function from memory, from disk or compiles it from scratch
191 |         _diagonaled_mm_function = DiagonaledMM._get_function(dtype=dtype, device=device)
192 | 
193 |         # The last argument to this function is a little hacky. It is the size of the last dimension of the result tensor
194 |         # We use it as a proxy to tell if t1_is_diagonaled or not (if t1 is diagonaled, result is not, and vice versa).
195 |         # The second reason is that the lambda expression in `_compile_function` is easier to express when the shape
196 |         # of the output is known
197 |         # This functions computes diagonal_mm then saves the result in `r`
198 |         if m == c:
199 |             # FIXME
200 |             print('Error: the hidden dimension {m} shouldn\'t match number of diagonals {c}')
201 |             assert False
202 |         _diagonaled_mm_function(t1, t2, r, d, w, w_upper, padding, transpose_t1, m if is_t1_diagonaled else c)
203 |         return r
204 | 
205 |     @staticmethod
206 |     def _prepare_tensors(t):
207 |         '''Fix `stride()` information of input tensor. This addresses some inconsistency in stride information in PyTorch.
208 |         For a tensor t, if t.size(0) == 1, then the value of t.stride()[0] doesn't matter.
209 |         TVM expects this value to be the `product(t.size()[1:])` but PyTorch some times sets it to `t.stride()[1]`.
210 |         Here's an example to reporduce this issue:
211 |             import torch
212 |             print(torch.randn(1, 10).stride())
213 |             > (10, 1)
214 |             print(torch.randn(10, 1).t().contiguous().stride())
215 |             > (1, 1)  # expected it to be (10, 1) as above
216 |             print(torch.randn(10, 2).t().contiguous().stride())
217 |             > (10, 1) # but gets the expected stride if the first dimension is > 1
218 |         '''
219 |         assert t.is_contiguous()
220 |         t_stride = list(t.stride())
221 |         t_size = list(t.size())
222 |         # Fix wrong stride information for the first dimension. This occures when batch_size=1
223 |         if t_size[0] == 1 and t_stride[0] == t_stride[1]:
224 |             # In this case, the stride of the first dimension should be the product
225 |             # of the sizes  of all other dimensions
226 |             t_stride[0] = t_size[1] * t_size[2] * t_size[3]
227 |             t = t.as_strided(size=t_size, stride=t_stride)
228 |         return t
229 | 
230 |     min_seq_len = 16  # unexpected output if seq_len < 16
231 | 
232 |     @staticmethod
233 |     def forward(ctx, t1: torch.Tensor, t2: torch.Tensor, w: int, d: Union[torch.Tensor,int], is_t1_diagonaled: bool = False, padding: int = 0, autoregressive: bool = False) -> torch.Tensor:
234 |         '''Compuates diagonal_mm of t1 and t2.
235 |         args: 
236 |         t1: torch.Tensor = (batch_size, seq_len, num_attention_heads, hidden_size|number_of_diagonals).
237 |             t1 can be a regular tensor (e.g. `query_layer`) or a diagonaled one (e.g. `attention_scores`)
238 |         t2: torch.Tensor = (batch_size, seq_len, num_attention_heads, hidden_size). This is always a non-diagonaled
239 |             tensor, e.g. `key_layer` or `value_layer`
240 |         w: int = window size; number of attentions on each side of the word
241 |         d: torch.Tensor or int = dilation of attentions per attention head. If int, the same dilation value will be used for all
242 |             heads. If torch.Tensor, it should be 1D of lenth=number of attention heads
243 |         is_t1_diagonaled: is t1 a diagonaled or a regular tensor
244 |         padding: the padding value to use when accessing invalid locations. This is mainly useful when the padding
245 |             needs to be a very large negative value (to compute softmax of attentions). For other usecases,
246 |             please use zero padding.
247 |         autoregressive: if true, return only the lower triangle
248 |         returns: torch.Tensor = (batch_size, seq_len, num_attention_heads, hidden_size|number_of_diagonals)
249 |             if t1 is diagonaed, result is non-diagonaled, and vice versa
250 |         '''
251 |         batch_size, seq_len, num_attention_heads, hidden_size = t1.size()
252 |         assert seq_len >= DiagonaledMM.min_seq_len, 'avoid splitting errors by using seq_len >= {}'.format(DiagonaledMM.min_seq_len)  # FIXME
253 |         ctx.save_for_backward(t1, t2)
254 |         ctx.w = w
255 |         ctx.d = d
256 |         ctx.is_t1_diagonaled = is_t1_diagonaled
257 |         ctx.autoregressive = autoregressive
258 |         t1 = DiagonaledMM._prepare_tensors(t1)
259 |         t2 = DiagonaledMM._prepare_tensors(t2)
260 |         # output = t1.mm(t2)  # what would have been called if this was a regular matmul
261 |         output = DiagonaledMM._diagonaled_mm(t1, t2, w, d, is_t1_diagonaled=is_t1_diagonaled, padding=padding, autoregressive=autoregressive)
262 |         return output
263 | 
264 |     @staticmethod
265 |     def backward(ctx, grad_output):
266 |         t1, t2 = ctx.saved_tensors
267 |         w = ctx.w
268 |         d = ctx.d
269 |         is_t1_diagonaled = ctx.is_t1_diagonaled
270 |         autoregressive = ctx.autoregressive
271 |         if not grad_output.is_contiguous():
272 |             grad_output = grad_output.contiguous()  # tvm requires all input tensors to be contiguous
273 |         grad_output = DiagonaledMM._prepare_tensors(grad_output)
274 |         t1 = DiagonaledMM._prepare_tensors(t1)
275 |         t2 = DiagonaledMM._prepare_tensors(t2)
276 |         # http://cs231n.github.io/optimization-2/
277 |         # https://pytorch.org/docs/master/notes/extending.html
278 |         # grad_t1 = grad_output.mm(t2)  # what would have been called if this was a regular matmul
279 |         grad_t1 = DiagonaledMM._diagonaled_mm(grad_output, t2, w, d, is_t1_diagonaled=not is_t1_diagonaled, autoregressive=autoregressive)
280 |         # grad_t2 = grad_output.t().mm(t1)  # or `grad_t2 = t1.t().mm(grad_output).t()` because `(AB)^T = B^TA^T`
281 |         if is_t1_diagonaled:
282 |             grad_t2 = DiagonaledMM._diagonaled_mm(t1, grad_output, w, d, is_t1_diagonaled=True, transpose_t1=True, autoregressive=autoregressive)
283 |         else:
284 |             grad_t2 = DiagonaledMM._diagonaled_mm(grad_output, t1, w, d, is_t1_diagonaled=True, transpose_t1=True, autoregressive=autoregressive)
285 |         return grad_t1, grad_t2, None, None, None, None, None
286 | 
287 | 
288 | def _get_invalid_locations_mask_fixed_dilation(seq_len: int, w: int, d: int):
289 |     diagonals_list = []
290 |     for j in range(-d * w, d, d):
291 |         diagonal_mask = torch.zeros(seq_len, device='cpu', dtype=torch.uint8)
292 |         diagonal_mask[:-j] = 1
293 |         diagonals_list.append(diagonal_mask)
294 |     return torch.stack(diagonals_list, dim=-1)
295 | 
@lru_cache()
def _get_invalid_locations_mask(w: int, d: Union[torch.Tensor,int], autoregressive: bool, device: str):
    """Return the (cached) boolean masks of invalid attention locations.

    Args:
        w: window size.
        d: dilation; either one int, or a tensor holding one dilation value
            per entry (one mask is built for each entry).
        autoregressive: when truthy, no ending mask is produced.
        device: device the resulting masks are moved to with `.to(device)`.

    Returns:
        A tuple `(affected_seq_len, beginning_mask, ending_mask)`:
        `affected_seq_len` is `w * d` (or `w * d.max()` for a tensor `d`),
        `beginning_mask` is the 4-D boolean mask on `device`, and
        `ending_mask` is that mask flipped along dims 1 and 3, or `None`
        when `autoregressive` is set.
    """
    if isinstance(d, int):
        affected_seq_len = w * d
        base = _get_invalid_locations_mask_fixed_dilation(affected_seq_len, w, d)
        # Add a leading singleton dim and a singleton dim before the last one.
        mask = base[None, :, None, :]
    else:
        affected_seq_len = w * d.max()
        per_entry_masks = [
            _get_invalid_locations_mask_fixed_dilation(affected_seq_len, w, dilation)
            for dilation in d.cpu().numpy().tolist()
        ]
        # Stack the per-dilation masks along dim -2, then add a leading singleton dim.
        mask = torch.stack(per_entry_masks, dim=-2)[None, :, :, :]

    if autoregressive:
        ending_mask = None
    else:
        ending_mask = mask.flip(dims=(1, 3)).bool().to(device)
    beginning_mask = mask.bool().to(device)
    return affected_seq_len, beginning_mask, ending_mask
314 | 
def mask_invalid_locations(input_tensor: torch.Tensor, w: int, d: Union[torch.Tensor, int], autoregressive: bool) -> None:
    """Overwrite invalid attention locations of `input_tensor` with -inf, in place.

    The masks come from `_get_invalid_locations_mask(w, d, autoregressive,
    input_tensor.device)`. The beginning mask is applied to the first
    `affected_seq_len` positions along dim 1 and the first `w + 1` entries of
    the last dim; unless `autoregressive` is set, the ending mask is applied
    to the last `affected_seq_len` positions and the last `w + 1` entries.

    Args:
        input_tensor: 4-D tensor that is modified in place; dim 1 is treated
            as the sequence dimension.
            NOTE(review): presumably (batch, seq_len, heads, 2*w + 1) -- confirm
            against callers.
        w: window size.
        d: dilation, a single int or a tensor of dilation values.
        autoregressive: when truthy, only the beginning of the sequence is masked.

    Returns:
        None. The result is written into `input_tensor`; the previous
        `-> torch.Tensor` annotation did not match the code, which never
        returned a value.
    """
    affected_seq_len, beginning_mask, ending_mask = _get_invalid_locations_mask(w, d, autoregressive, input_tensor.device)
    seq_len = input_tensor.size(1)

    # The slices below are views, so `masked_fill_` writes through to `input_tensor`.
    beginning_input = input_tensor[:, :affected_seq_len, :, :w+1]
    # Trim the mask to the actual sequence length before expanding it over the view.
    beginning_mask = beginning_mask[:, :seq_len].expand(beginning_input.size())
    beginning_input.masked_fill_(beginning_mask, -float('inf'))

    if not autoregressive:
        ending_input = input_tensor[:, -affected_seq_len:, :, -(w+1):]
        ending_mask = ending_mask[:, -seq_len:].expand(ending_input.size())
        ending_input.masked_fill_(ending_mask, -float('inf'))
325 | 
326 | 
# Module-level functional alias: callers use `diagonaled_mm(...)` rather than
# spelling out `DiagonaledMM.apply(...)`.
diagonaled_mm = DiagonaledMM.apply

# The non-tvm implementation is the default, we don't need to load the kernel at loading time.
# DiagonaledMM._get_function('float32', 'cuda')
331 | 


--------------------------------------------------------------------------------
/metadata/wordst.txt:
--------------------------------------------------------------------------------
1 | ['QuestionText', 'answer', 'confidence', 'Are', 'there', 'more', 'than', 'one', 'bird', 'squawking', 'yes', 'people', 'having', 'a', 'conversation', 'no', 'these', 'animals', 'outside', 'maybe', 'How', 'many', 'birds', 'are', 'three', 'Maybe', 'two', 'cars', 'honk', 'their', 'horn', 'zero', 'Yes', 'Is', 'only', 'sqauwking', 'making', 'noise', 'several', 'it', 'dog', 'the', 'close', 'and', 'far', 'away', 'person', 'screaming', 'this', 'What', 'species', 'of', 'animal', 'can', 'be', 'heard', 'seagull', 'they', 'listening', 'to', 'radio', 'is', 'an', 'object', 'being', 'dropped', 'someone', 'working', 'forklift', 'present', 'placed', 'on', 'platform', 'towards', 'end', 'clip', 'board', 'wood', 'type', 'material', 'item', 'made', 'any', 'in', 'distance', 'does', 'inhale', 'sharply', 'strong', 'wind', 'Was', 'recording', 'what', 'causing', 'rasping', 'sound', 'running', 'nearby', 'river', 'water', 'No', 'hawks', 'around', 'Does', 'involve', 'motion', 'occur', 'indoors', 'raining', 'crashing', 'against', 'beach', 'waves', 'that', 'rain', 'talking', 'music', 'accompanying', 'song', 'home', 'alone', 'well-attended', 'event', 'briefly', 'prayer', 'clapping', 'singing', 'mood', 'portrayed', 'from', 'mammals', 'happy', 'excitement', 'different', 'sounds', 'five', 'consistent', 'melodic', 'muffled', 'coming', 'train', 'factory', 'Can', 'seagulls', 'audio', 'device', 'moved', 'typically', 'by', 'vocalization', 'something', 'usually', 'desert', 'blowing', 'natural', 'element', 'besides', 'trees', 'all', 'same', 'source', 'bouncing', 'separate', 'creaking', 'noises', 'were', 'knocking', 'door', 'opening', 'closing', 'window', 'lots', 'little', 'pieces', 'mess', 'sweeping', 'yelling', 'name', 'used', 'complete', 'action', 'glass', 'Which', 'part', 'room', 'affected', 'activity', 'taking', 'place', 'floor', 'liquid', 'dripping', 'thunder', 'falling', 'most', 'pig', 'background', 'crowd', 'moving', 'quickly', 'nature', 'constant', 'aeroplane', 'match', 'car', 'airplane', 'here', 
'live', 'distortion', 'color', 'unknown', 'white', 'kind', 'chicken', 'duck', 'rooster', 'authorities', 'voices', 'vehicle', 'outer', 'space', 'The', 'lengthy', 'high-pitched', 'known', 'as', 'siren', 'six', 'barking', 'hen', 'geese', 'fighting', 'each', 'other', '', 'going', 'would', 'expect', 'hear', 'at', 'shopping', 'mall', 'first', 'half', 'fast', 'freight', 'times', 'struck', 'nineteen', 'eleven', 'twenty', 'hitting', 'metal', 'multiple', 'musical', 'instrument', 'drum', 'pause', 'repetitive', 'four', 'speaking', 'themselves', 'created', 'handling', 'once', 'stapler', 'conversing', 'anyone', 'how', 'Who', 'men', 'get', 'louder', 'change', 'pitch', 'ever', 'stop', 'middle', 'objects', 'mechanical', 'synthesizer', "people's", 'growing', 'over', 'time', 'stray', 'cats', 'crying', 'motorcycle', 'start', 'up', 'chatting', 'clanging', 'driving', 'dead', 'chirping', 'faintly', 'thoroughly', 'traffic', 'doing', 'produced', 'blow', "it's", 'manually', 'rhythm', 'class', 'instance', 'softest', 'guitar', 'stick', 'bell', 'area', 'dry', 'cat', 'lower', 'higher', 'some', 'points', 'again', 'none', 'string', 'plucked', 'squealing', 'machine', 'belt', 'motor', 'Do', 'machines', 'have', 'wheels', 'take', '"If', 'vehicles', 'traveling', 'high', 'speed', 'likely', 'on"', 'freeway', 'highway', 'racetrack', 'meowing', 'machinery', 'engine', 'plane', 'tractor', 'recorded', 'messages', 'man', 'make', 'announcement', 'with', 'chime', 'for', 'passengers', 'gender', 'voice', 'female', 'announcer', 'do', 'flowing', 'pouring', 'heavier', 'weather', 'rainy', 'thundering', 'hollow', 'bucket', 'roof', 'communicating', '"Is', 'flow', 'pretty', 'much', 'unaltered', 'not', 'increased', 'or', 'decreased"', 'throughout', 'into', 'puddle', 'producing', 'substance', 'faucet', 'large', 'through', 'grass', 'beginning', "What's", 'tapping', 'TV', 'clanking', 'tin', 'pencil', 'shatter', 'rollers', 'nine', "someone's", 'speech', 'amplified', 'vibrating', 'humming', 'bus', 'heater', 'industrial', 
'printing', 'birds).wav"', 'shoe', 'squeak', 'helicopter', 'passing', 'crow', 'ducks', 'clinking', 'toys', 'dining', 'participating', 'marathon', 'group', 'eating', 'sharp', 'utensils', 'dishes', 'whispering', 'hooting', 'repeatedly', 'dove', 'pigeon', 'owl', 'come', 'whistling', 'chiming', 'chimes', 'ringing', 'Where', 'church', 'house', 'fly', 'digging', 'gravel', 'outdoors', 'sort', 'located', 'zoo', 'elevator', 'steps', 'did', 'step', 'twentyseven', 'twentysix', 'walking', 'stairs', 'down', 'slide', 'inside', 'ticking', 'getting', 'trash', 'somebody', 'laying', 'plastic', 'crinkling', 'wrapping', 'fishing', 'Has', 'tap', 'been', 'left', 'stream', 'rapid', 'waterfall', 'like', 'scraped', 'silent', 'ball', 'sport', 'played', 'pingpong', 'children', 'playing', 'Did', 'pass', 'slowly', 'iron', 'steel', 'human', 'kinds', 'flapping', 'its', 'wings', 'insect', 'cricket', 'laughing', 'continuous', 'hissing', 'locust', 'woman', 'chirps', 'remaining', 'quiet', 'scratching', 'arguing', 'elephant', 'fish', 'alarm', 'go', 'off', 'lever', 'pushed', 'frozen', 'solid', 'tank', 'toilet', 'operating', 'revving', 'stopped', 'counting', 'race', 'thing', 'speaker', 'truck', 'loud', 'shore', 'operated', 'mopping', 'metallic', 'crumpled', 'paper', 'widely', 'kitchen', 'rustling', 'dryer', 'aluminum', 'foil', 'crushing', 'he', 'rattle', 'chipping', '8times', 'air', 'fourteen', 'thrown', 'repeated', 'back', 'forth', 'rake', 'books', 'rotating', 'slow', 'clothes', 'living', 'apart', 'main', 'microwaved', 'food', 'clustered', 'remind', 'popcorn', 'lion', 'growling', 'tarp', 'summer', 'vary', 'crunchy', 'cleaned', 'washing', 'hose', 'washed', 'rubbed', 'out', 'interested', 'rubber', 'speak', 'breaking', 'continue', 'until', 'seven', 'piece', 'shattering', 'brushing', 'teeth', 'bubbling', 'ordinary', 'emergency', 'somewhere', 'shrill', 'ambulance', 'direction', 'insects', 'tree', 'chopped', 'buzzing', 'bee', 'loudest', "truck's", 'ship', 'items', 'sliced', 'receptacles', 'drinks', 
'poured', 'bacon', 'cooking', 'serving', 'jar', 'cup', 'papers', 'torn', 'folding', 'touching', 'bubblewrap', 'chickens', 'roosting', 'hay', 'straw', 'hit', 'surfaces', 'violent', 'storm', 'precipitation', 'produces', 'composed', 'electrical', 'giving', 'television', "Who's", 'audience', 'soft', 'way', 'where', 'behave', 'goat', 'makes', 'sheep', 'non', 'creating', 'was', 'called"', 'ice', 'baby', 'spraying', 'held', 'cooing', 'say', 'word', 'drums', 'farm', 'crowing', 'using', 'sometimes', 'scrubbing', 'point', 'unconscious', 'clean', 'brush', 'broom', 'mammal', 'low', 'flying', 'jets', 'ocean', 'electricity', 'steady', 'forest', 'tv', 'static', 'whisper', 'frogs', 'croaking', 'flushed', 'rushing', 'rolled', 'staticy', 'electric', 'crackling', 'welding', 'dogs', "person's", 'words', 'wildly', 'quieter', 'interrupting', 'another', 'completion', 'purpose', 'meeting', 'useful', 'overpower', 'bangs', 'done', 'shooting', 'engines', 'accelerating', 'bike', 'die', 'problem', 'electronic', 'chattering', 'keep', 'changing', 'scraping', 'deep', 'base', 'stay', 'dissipate', 'peoples', 'rainfall', 'bass', 'called', 'rumbling', 'squished', 'splashing', 'pacing', 'manuevered', 'cleaning', 'wheelbarrow', 'washer', 'street', 'celebrating', 'bonfire', 'fire', 'burning', 'fuel', 'gasoline', 'jet', 'transportation', 'eight', 'paved', 'road', 'double', 'creatures', 'sneezing', 'swimming', 'lay', 'eggs', 'egg', 'chewing', 'implement', 'artificial', 'turning', 'tool', 'starting', 'crank', 'wrench', 'wound', 'mower', '"Apart', 'heard"', 'whir', 'revved', 'uncovered', 'dull', 'thud', 'mariachi', 'practicing', 'lines', 'play', 'grated', 'grinding', 'across', 'light', 'cage', 'twentytwo', 'ten', 'chopping', 'flooring', 'tile', 'cutting', 'knife', 'bounce', 'cheering', 'rolling', 'throwing', 'breath', 'during', 'whole', 'breathe', 'entire', 'breathing', 'hard', 'quietly', 'behind', 'long', 'echo', 'short', 'tornado', 'thunderstorm', 'rapidly', 'caused', 'tightly', 'enclosed', 'steadily', 
'broken', 'pipe', 'calling', 'rat', 'pour', 'tape', 'calm', 'draining', 'boiling', 'creature', 'chirp', 'furry', 'thirty', 'twentynine', 'before', 'concert', 'just', 'tested', 'loudspeaker', 'signal', 'Could', 'indicate', 'volume', 'fades', 'When', 'lowest', 'small', 'impacts', 'state', 'matter', 'periods', 'new', 'stirring', 'conclusion', 'feature', 'footsteps', 'fifty', 'fourty', 'jogging', 'initially', 'stepping', 'leaves', 'work', 'mat', 'putty', 'fart', 'boy', 'about', 'pants', 'crinkled', 'scrunched', 'grilling', 'his', 'foot', 'gas', 'production', 'turned', 'generator', 'within', 'wheel', 'child', 'crowded', 'drumming', 'repeat', 'could', 'knocked', 'together', 'breakable', 'anything', 'fall', 'things', 'In', 'found', 'seem', 'dings', 'typing', 'very', 'writing', 'pen', 'typewriter', 'keyboard', 'Aalborg', 'DK', '1900hrs.wav"', 'near', 'bells', '5', 'seconds', 'male', 'happening', 'hotel', 'bathroom', 'seventeen', 'cuckoo', 'machine.wav"', 'often', 'twice', 'level', 'sewing', 'sander', 'shaking', 'sheet', 'shaken', 'canvas', 'good', 'clucking', 'wolf', 'howling', 'rocks', 'fabrics', 'rubbing', 'kicked', 'marbles', 'clashing', 'pebbles', 'walks', 'shoes', 'boats', 'till', 'squeaking', 'always', 'constantly', 'swinging', 'rusty', 'swing', 'gate', 'surface', 'thumping', 'wearing', 'feet', 'boots', 'Voices', 'Traffic).wav"', 'talk', 'shots', 'exhilarating', 'military', 'operation', 'clearly', 'women', 'remain', 'mike', 'quietest', 'flap', 'sixteen', 'moment', 'surfers', 'continually', 'decrease', 'intensity', 'covering', 'ground', 'directly', 'front', 'tub', 'monkey', 'subway', 'goes', 'flashing', 'station', 'wolves', 'night', 'day', 'morning', 'airport', 'various', 'bumps', 'twelve', 'rodent', 'turn', 'jackhammer', 'power', 'drill', 'beep', 'fluctuate', 'weather-related', 'grizzly', 'bear', 'repetetive', "'howling'", 'parakeet', 'impatient', 'gradually', 'becomes', 'intense', 'thirtyfive', 'Whats', 'banged', 'fan', 'speeding', 'number', 'lake', 'pop', 
'dropping', 'drip', 'opened', 'park', 'fight', 'fence', 'violently', 'horse', 'escape', 'delicate', 'latch', 'box', 'dishwasher', 'performs', 'actions', 'lawn', 'lot', 'tearing', 'package', 'crickets', 'panting', 'badminton', 'past', 'babies', 'repetitions', 'cyclical', 'fifteen', 'sprinkler', 'man-made', 'comes', 'fabric', 'Would', 'commonplace', 'golf', 'course', 'idle', 'drive', 'grinder', 'idling', 'dirt', 'scooter', 'dirtbike', 'brass', 'plays', 'trumpet', 'crack', 'patient', 'lidocaine', 'spinning', 'distorted', 'On', 'coin', 'piano', 'ambience.wav"', 'clinging', 'silverware', 'tools', 'lightning', 'ignite', 'safe', 'emitted', 'cow', 'messing', 'mouth', 'whistle', 'tweeting', 'warning', 'blaring', 'occuring', 'naturally', 'occurring', 'phenomenon', 'clouds', 'occurs', 'without', 'intervention', 'consistently', 'wave', 'twentyone', 'twentyfive', 'involved', 'jungle', 'pitched', 'giggling', 'drizzling', 'cough', 'nothing', 'fireplace', 'uneven', 'polishing', 'sand', 'shut', 'associated', 'movement', 'polished', 'hinge', 'only.wav"', 'need', 'maintenance', 'should', 'applied', 'eliminate', 'oil', 'shouting', 'taps', 'open', 'teaspoon', 'continuously', 'slam', 'powerful', 'chips', 'bowl', 'scraper', 'hammer', 'wind-up', 'toy', 'fail', 'tension', 'released', 'movements', 'parts', 'pulling', 'while', 'winding-up', 'tapped', 'hands', 'indoor', 'location', '2', 'pounds', 'booming', 'weapon', 'faster', 'screeching', 'speeds', 'doorbell', 'concrete', 'raindrops', 'signs', 'pleasant', 'sunny', 'when', 'taken', 'stormy', 'grumbles', 'growls', 'auditorium', 'colliding', 'sounding', 'instances', 'collide', 'metals', 'spoon', 'city', 'crows', 'waiting', 'From', 'face', 'after', 'sniffing', 'racing', 'switch', 'hurry', 'sirens', 'slowing', 'silence', 'tracks', 'cicadas', 'softly', 'overhead', 'who', 'Didgeridoo', 'instrumental', 'rock', 'theremin', 'chiansaw', 'might', 'humans', 'frog', 'clicking', 'clicks', 'twentyfour', 'twentythree', 'clock', 'fruit', 'blender', 'watch', 
'started', 'winning', 'bathing', 'walked', 'chair', 'every', 'unnatural', 'flipped', 'key', 'contact', 'involvement', 'swearing', 'body', 'character', 'obstructed', 'audible', 'pauses', 'between', "'breathing'", 'centered', 'fading', 'out"', 'itself', 'cutter', 'medium', 'heavy', 'fountains', 'fountain', 'office', 'environment', 'speaks', 'closest', 'mic', 'strike', 'chainsaw', 'locomotive', 'flashes', 'creates', 'lightening', 'quacking', 'nearer', 'everyone', 'fighter', 'sky', 'banging', 'fired', 'compacted', 'birdsong', 'crunching', 'yell', 'pace', 'increase', 'upon', 'snow', 'trotting', 'else', "child's", 'punk', 'sobbing', 'spinner', 'driller', 'saw', 'break', 'variations', 'Were', 'mean', 'firetruck', 'mooing', '"What', 'voices"', 'drain', 'gurgle', 'duckie', 'filling', 'sink', 'basin', 'facet', 'pounding', 'wall', 'monkeys', 'those', 'due', 'deliberate', 'This', 'debris', 'hockey', 'path', 'cause', 'airline', 'setting', 'phone', 'clicked', 'mouse', 'shower', 'bedroom', 'pan', 'jostled', 'whale', 'best', 'describes', 'buzz', 'cut', 'eventually', 'sew', 'construction', 'cold', 'forefront', 'horns', 'llamas', 'chomping', 'heavily', 'sleeping', 'tuba', 'table', 'whining', 'cane', '"After', 'runs', 'off"', 'then', 'seat', 'gush', 'urinating', 'Kind', 'drilling', 'vibrations', 'regular', 'follow', 'tones', 'noisy', 'filming', 'flood', 'sing', 'closed', 'fluid', 'wd-40', 'squeaky', 'recycled', 'styrofoam', 'vessel', 'commonly', 'bottle', 'tropical', 'also', 'crash', 'seawave', 'sea', 'applauding', 'storming', 'calls', 'cleaver', 'steak', 'hum', 'trying', 'communicate', 'parrot', 'happiness', 'slurping', 'consistant', 'dribbling', 'wlking', 'next', 'habour.wav"', 'resistance.wav"', 'thuds', 'celtic', 'genre', 'creak', 'slamming', 'gates', 'push', 'motored', 'windy', 'airplane"', 'last', 'riding', 'rickshaw', 'beeps', 'stuck', 'household', 'tunnel', 'task', 'percussion', 'spitting', 'care', 'oral', 'hygiene', 'drinking', 'cooked', 'slight', 'clang', 'fryingpan', 
'normal', 'grill', 'sizzling', 'become', 'shakes', 'vibrates', 'thin', 'impacting', 'intermittently', 'rod', 'composition', 'rattles', 'inelegant', 'hawk', 'jump', 'dunked', 'receptacle', 'bugs', 'longer', 'couple', 'flute', 'organ', 'emitting', 'songs', 'band', 'party', 'windchime', 'xylophone', 'outdoor', 'characterized', 'lively', 'zooming', 'torque', 'per', 'rotation', 'jam', 'manual', 'labor', 'hammering', 'rotated', 'crunch', 'ratchet', 'beings', 'amusement', 'ride', 'transport', 'bison', 'moaning', 'semi', 'hand', 'strokes', 'crackled', 'rise', 'if', 'container', 'filled', 'term', 'wheeled', 'pilot', 'loudness', 'stomping', 'video', 'chopsticks', 'thirtyseven', 'forty', 'frying', 'scene', 'canopy.wav"', 'wet', 'sun', 'shining', 'carry', 'them', 'umbrella', 'inconsistent', 'jumping', 'woodpecker', 'second', 'suddenly', 'which', 'component', 'attacking', 'nests', 'traverse', 'ticks', 'seventythree', 'rhythmic', 'chatter', 'nice', 'legs', 'hundred', 'dat', 'trein', 'op', 'alle', 'tussengelegen', 'stations', 'zal', 'stoppen_100311.wav"', 'preceded', 'tone', 'railwaystation', 'Why', 'tiger', 'vocalizations', 'god', 'bark', 'side', 'attempts', 'unlock', 'turing', 'lock', 'rainforest', 'driver', 'pullover', 'trunk', 'slammed', 'sut', 'old', 'limp', 'as"', 'briskly', 'pavement', 'drops', 'abundant', 'splash', 'loudly', 'instruments', 'accompany', 'singer', 'neighing', "man's", 'hoarse', 'shaker', 'snake', 'speakers', 'steeldrum', 'building', 'arcade', 'types', 'karate', 'purring', "cat's", 'strumming', 'top', 'fingers', 'copy', 'copies', 'print', 'status', 'shuffling', 'crisps', 'ring', 'mode', 'perform', 'reacting', 'interacted', 'up"', 'uninterrupted', 'ripped', 'steam', 'cap', 'let', 'interrupts', 'bang', 'airplanes', 'honking', '"When', 'doing"', 'slamping', 'striking', 'dried', 'drying', 'meant', 'right', 'now', 'mixer', 'landing', 'finish', 'dress', 'alive', 'huge', 'appliance', 'refrigerator', 'coffeemaker', 'both', 'utter', 'scream', 'help', 'gasps', 
"woman's", 'gasp', 'police', '"Does', 'though', 'meaning', 'starts', 'stops"', '"How', 'hearing', 'end"', 'covers', 'feathers', 'At', 'sandy', 'workshop', 'boat', 'growl', 'showers', 'faucets', "aren't", 'branches', 'aircraft', 'aircrafts', 'whooshing', 'thumps', 'sawmill', 'stepped', 'grains', 'spring', 'pot', 'wash', 'sprayer', 'plopping', 'happen', 'sample', 'pool', 'asking', 'questions', 'waters', 'program', 'occasionally', 'bad', 'sifted', 'book', 'snapped', 'selling', 'cleaner', 'handled', 'girl', 'fell', 'her', 'jingle', 'breaks', 'least', 'beat', 'began', 'configuration', 'rattling', 'evening', 'causes', 'rustle', 'ghost', 'tranquil', 'creek', 'milling', 'ac', 'breed', 'big', 'fade', 'flush', 'USA.wav"', 'swishes', 'sigh', 'ongoing', 'run', 'cabinet', 'drawer', 'repetitively', 'midway', 'powered', 'Wat', 'hail', 'cracking', 'Dies', 'even', 'slack', 'altogether', 'fairly', 'basketball', 'tires', 'laundry', 'sandwich', 'dominate', 'taping', 'packing', 'pattern', 'detected', 'South', 'Germany.wav"', 'english', 'store', 'cooker', 'A', "I'd", 'eighteen', 'gong', 'jelly', 'triangle', 'age', 'groups', 'kids', 'pound', 'ambient', 'never', 'among', 'stage', 'life', 'adult', 'sources', 'funeral', 'restaurant', 'prepared', 'busy', 'sensation', 'conduct', 'devices', '"In', 'garden', 'hold', 'whilst', 'watering', 'plants"', 'choir', 'sprayed', 'touches', 'create', 'softer', 'whirring', 'squeaks', 'lying', 'cease', 'beaten', 'exclaim', 'chant', 'unison', 'sporting', 'mostly', 'will', 'crew', 'cabin', 'night.wav"', 'rather', 'exhaling', 'shortest', 'sixth', 'pitches', 'art', 'form', 'gets', 'happens', 'initial', 'carnival', 'driven', 'skateboarding', 'bicycle', 'staticky', 'standing', 'still', 'library', 'squawks', 'squawk', 'snoring', 'deeper', 'vehical', 'mechicanal', 'charge', 'blinds', 'drawn', 'completing', 'single', 'Towards', 'introduced', 'beeping', 'stopping', 'bathwater', 'east', 'meow', 'moderately', 'crackers', 'flock', 'everything', 'bumping', 'snorting', 
'roaring', 'closer', 'further', 'gentle', 'somewhat', 'startling', 'albeit', 'so', 'often"', 'oiled', 'recently', 'has', 'variety', 'equipment', 'such', 'playground', 'garage', 'bug', 'public', 'intercom', 'show', 'joy', 'laugh', 'vocalizing', 'airsound', 'blue', 'asleep', 'bids', 'RPM', 'talking.wav"', 'cows', 'cawing', 'school', 'range', 'market', 'shouted', 'forcefully', 'arid', 'recreational', 'drink', 'alternate', 'irrigation', 'third', 'green', 'impact', 'Hi', 'Mom.', 'I', 'Crashed', 'Car.wav"', 'hesitating', 'sniffling', 'daytime', 'cicada', 'distinctive', 'womanly', 'screams', 'ordering', 'ham', 'melody', 'panthers', 'creaks', 'hiking', 'bumble', 'bees.wav"', 'non-buzzing', 'adults', 'louder"', 'split', 'blown', 'tuning', 'specific', 'broadcast', "that's", 'entertainment', 'purposes', 'snippets', 'channels', 'changed', 'result', 'knock', 'keys', 'bumped', 'safety', 'handsaw', 'vacuuming', 'operate', 'click', 'pet', 'oinking', 'saxophone', 'letter', 'common', 'black', 'prying', 'sitting', 'celaring', 'glasscontainer.wav"', 'locked', 'barefooted', 'holding', 'smooth', 'in-between', 'press', 'mumbling', 'Wood', 'Barefoot', 'Jumps', '&amp;', 'Scuffs.wav"', 'cream', 'trucks', 'honks', 'requires', 'immediate', 'attention', '"The', 'en', 'route', 'general"', 'interacting', 'hall', 'roll', 'rolls', 'bag', 'slows', 'completely', 'bowling', 'alley', 'bees', 'terrain', 'plain', 'dogs.wav"', 'rainning', 'intensify', 'foreground', 'walk', 'slicing', 'carrots', 'siren.wav"', 'interrupted', 'few', 'interference', 'laser', 'blasts', 'thirteen', 'spilling', 'game', 'basement', 'Soft.', 'Crickets.wav"', 'frequency', 'dinner', 'tweet', 'fixing', 'pipes', 'drainage', 'mechanism', 'stairway', 'stomp', 'may', 'reply', 'bath', "windchime's", 'sqeaking', 'breezy', 'annoying', 'shredded', 'dragging', 'coffee', 'farther', 'accelerates', 'return', 'piercing', 'sound"', '4', 'elements', 'tire', 'tick', 'diner', 'cock', 'goose', 'because', 'showering', 'condition', 'caught', 
'ambience', '.wav"', 'reversing', 'spit', 'crunched', 'claps', 'gutters', 'screech', 'lawnmower', 'diesel', 'move', 'quickness', 'sixty', 'simply', 'animalistic', 'considered', 'hog', 'propellers', 'belong', 'distinct', 'NOT', 'rain"', 'tan', 'sliding', 'sneeze', '"who', 'talking"', 'try', 'comfort', 'barks', 'row', 'movie', 'horror', 'describe', 'rotate', 'conditioner', 'worn', 'mild', 'brakes', 'accelerate', 'buzzsaw', 'rural', 'wrapper', 'pages', 'twentyeight', 'struggling', 'magazine', 'wooden', 'plank', 'bubble', 'gum', 'peeing', 'logs', 'barefoot', 'staircase', 'coughing', 'rim', 'enables', 'leave', 'bare', "It's", 'Will', 'harder', 'heated', 'Sans', 'Feet.wav"', 'surfaced', 'knocks', 'thirtyeight', 'thirtyone', 'shaped', 'circles', 'bills', 'coins', 'skiing', 'tons', 'motors', 'neigh', 'shaving', 'splashed', 'dinging', 'winch', 'consistency', 'creaky', 'rhythmically', 'happened', 'spoons', 'plate', 'stops', 'overall', 'wind.wav"', 'halt', 'noticeably', 'frightened', 'repeating', 'brook', 'exhaust', 'decision', 'horses', 'flipping', 'rigid', 'sorter', 'violin', 'racket', 'spectator', 'watching', 'tennis', 'jingling', 'apparently', 'pleasure', 'listener', 'clap', 'stretched', 'capacity', 'stretching', 'limit', 'chain', 'feel', 'cloudy', 'football', 'initiated', 'cry', 'sidewalk', 'mug', 'performing', 'job', 'males', 'females', 'free', 'requests', 'probably', 'microphone', 'related', 'travelling', 'mugs', 'turns', 'grumbling', 'automobiles', 'floorboard', 'travel', 'flight', 'closet', 'passed', 'track', '18', 'respond', 'onto', 'chirpping', 'circuit', 'less', 'microwave', 'punched', 'streaming', 'random', 'locusts', 'discernible', 'distant', 'underlying', 'crossing', 'slower', 'ding', 'objected', 'picnic', 'rushes', 'excessively', 'referred', 'what"', 'literally', 'hat', 'today', 'ages', 'mosquitoes', 'mosquito', 'kettle', 'boiled', 'slithering', 'gurgling', 'pick', 'wild', 'ziplining', 'stapled', 'stapling', 'rollercoaster', 'intervals', 'set', 'contain', 
'individual', 'bathtub', 'splashes', 'bigger', 'roosters', 'scarping', 'stationary', 'lighting', 'shinning', 'fireworks', 'structure', 'enjoy', "children's", 'seesaw', 'computer', 'printer', 'alot', 'whales', 'cliff', 'razor', 'therapeutic', 'year', 'winter', 'rocket', 'enter', 'measuring', 'amount', 'waterflow', 'mountaintop', 'interrupt', 'roadway', 'zipping', 'zipper', 'foghorn', 'sounded', 'able', 'streets', 'highways', 'nose', 'vibration', 'pressing', 'drawing', 'squishing', 'pizza', 'woods', 'along', '"Besides', 'sharpened', 'too', 'sharpening', 'emit', 'intermittent', 'powering', 'background.wav"', 'jigsaw', 'above', 'clear', 'takes', 'activated', 'stone', 'flushing', 'normally', 'again"', 'audibly', 'railroad', 'cardboard', 'Any', 'tambourine', 'sung', 'rev', 'keychain', 'jars', 'crane', 'site', 'semitruck', 'reverberate', 'stacked', 'lifted', 'venue', 'cheer', 'honked', 'drilled', 'babbling', 'hte', 'sharpener', 'vaccum', 'Approximately', 'whacking', 'gardening', 'reaches', 'sunset', 'gears', 'stall', 'ripping', 'smash', 'send', 'Summer', 'Country.wav"', 'lions', 'wildlife', 'below', 'sparrow', 'quick', 'irregularly', 'tinny', 'performed', 'require', 'radial', 'arm', 'vacuum', 'tell', 'completed', 'anybody', 'breeze', 'sealed', 'booth', 'resemble', 'motorboat', 'marker', 'gongs', 'hits', 'batminton', 'liquids', 'tugboat', 'rabit', 'empty', 'congregating', 'rinsing', 'chimpanzees', 'sure', 'plucking', 'strings', 'provides', 'backing', 'hiss', "vehicle's", 'echoing', 'maintain', 'thrum', "what's", 'trilling', 'sets', 'sand)_01.wav"', 'waking', 'squirrel', 'opens', 'fastened', 'zippers', 'zipped', 'coaxing', 'licking', 'pouting', 'baker', 'slapping', 'dough', 'patting', 'hardness', 'rails', 'RPMs', 'squeek', 'physical', 'mountains', 'extended', 'rings', 'sprinkling', 'accompanies', 'transmission', 'drizzle', 'gently', 'hurricane', 'documents', 'notebook', 'stacking', 'fill', 'flushes', 'announce', "he's", 'use', 'penguin', 'adjusting', 'hears', 'things"', 
'drives', 'order', 'slid"', 'quicker', 'generated', 'pushing', 'slid', 'songbird', 'beautiful', 'sheeps', 'bleat', 'baa', 'plants', 'coolant', 'release', 'percolating', 'handle', 'cards', 'vibrate', 'GREENFIELD', 'PLACE', '(Snippet).wav"', 'means', 'effect', 'typical', 'warm', 'popping', 'actively', 'record', 'gushing', 'Praia', 'Grande.wav"', 'manipulating', 'ukulele', 'bellowing', 'scurrying', 'During', 'non-living', 'Cuba', '2008.wav"', 'field', 'severe', 'healthy', 'sized', 'eat', 'flies', 'beverage', 'dispensing', 'baking', 'falls', 'haul', 'garbage', 'planes', 'ajo', 'vedessa', 'maastossa', '_', 'jeep', 'atv', 'difficult', 'brief', 'stop.wav"', 'crush', 'crushed', 'shredding', 'roller', '"At', 'seems', 'analog', "isn't", 'tuned', 'in"', 'coalesce', 'singular', 'toward', 'technological', 'clichМ©d', "'old-fashioned'", 'copier', 'rainstorm', 'fizzing', 'soda', 'popped', 'accident', 'dark', 'monsters/', 'eachother', 'monster/animal', 'monster/animals', 'rush', 'hour', 'intensely', 'shop', 'total', 'spoke', 'full', 'sentence', 'fowl', 'surrounded', 'involving', 'ring.wav"', 'degree', 'pressure', 'stilettos', 'construction.wav"', 'abrasive', 'powers', 'rewind', 'opendoor.wav"', 'staying', 'increasing', 'herd', 'propeller', 'majority', 'travels', 'slowest', 'hardwood', 'faint', 'traffic)', '-', 'DR-100', '(omni)', 'audacity.wav"', 'rats', 'touch', 'hurt', 'heat', 'cycle', 'lorry', 'compressed', 'motorbike', 'racecar', 'galloping', 'rail', 'tempo', 'quaking', 'unscrewed', 'longest', 'thump', 'conductor', 'beads', 'stir', 'cart', 'write', 'calmly', 'utensil', 'scribbling', 'cracks', 'private', 'sportscar', 'zoom', 'alert', 'boxes', 'angry', 'grow', 'sawing', 'build', 'smashed', 'clinks', 'clink', 'fluctuating', 'quality', 'actual', 'reminiscent', 'harmony', 'settings', 'rinse', 'spoken', 'sloshed', 'mowing', 'airbus', 'mowed', 'ignited', 'packet', 'professional', 'industry', 'works', 'depart.wav"', 'bumpy', 'undone', 'nail', 'balloons', 'dirty', 'spewed', 'knob', 
'adjusted', 'page', 'repititious', 'sex', 'expectorates', 'whimpering', 'J', 'ends', 'R', 'blower', 'vegetables', 'appear', 'puppy', 'howl', 'IS', 'domesticated', 'shirtless', 'underground', 'establishment', 'pulls', 'object(s)', 'explosions', 'firework', 'explode', 'exploding', 'aquarium', 'absent', 'cadence', 'patterns', "UFO's", '50.wav"', 'glasses', 'non-glass', 'reading', 'keeping', 'timer', 'yapping', 'Giggleswick', 'England.wav"', 'tower', 'saying', 'hurts', 'swam', 'beating', 'orchestra', 'rope', 'wheeling', 'suitcase', 'shaved', 'puddles', 'manufactured', 'cookie', 'urgent', 'mud', 'carboard', 'vocal', 'clip"', 'laughter', 'rung', 'final', 'previous', 'ones', 'tent', "bird's", 'toothbrush', 'wtaer', 'bunch', 'radar', 'scanner', 'genders', 'deserted', 'easy', 'domestic', 'include', 'squishy', 'cracker', 'texture', 'crispy', 'bounced', 'celery', 'quack', 'harsh', 'general', 'process', 'carried', "engine's", 'sputtering', 'Put', 'pills', 'dice', 'sick', 'ingest', 'thunderclaps', 'thunderclap', 'frequent', 'discussion', 'really', 'exterior', 'reverberating', 'drop', 'burbling', 'sucking', 'finishes', 'cussing', 'blended', 'scratched', 'shape', 'size', 'tossing', 'needed', 'owls', 'ceramic', 'bashed', 'Fountain', 'fountains.wav"', '"Of', 'closer"', 'phenomena', 'behaving', 'balloon', 'inflated', 'beaks', 'floating', 'striking"', 'raid', 'hapening', 'scrapping', 'thirtyfour', 'thirtythree', 'swooshing', 'possible', 'gun', 'series', 'skateboard', 'friction', 'resting', 'marble', 'put', 'pulse', 'vampire', 'maliciously', 'scifi', 'noticeable', 'pulsing', 'looking', 'slams', 'touched', 'injury', 'alight', 'rice', 'notes', 'shout', 'communication', 'control', 'utilized', 'mainland', 'rifle', 'circus', 'raise', 'young"', 'ping', 'snowing', 'pulled', 'drug', 'smoothly', 'relaxing', 'blare', 'utility', 'reverse', 'organically', 'grasshoppers', 'bottles', 'individuals', 'Frequently', 'improve', 'aspect', 'exercise', 'strart', 'forrest', 'head', 'protection', 
'disturbed', 'real', 'identifiable', 'unchanged', 'tumbling', 'steal', 'swim', 'drained', 'pump', 'wetness', 'conveyed', 'context', "one's", 'interruption', 'cluck', 'roar', 'seemingly', 'young', 'asked', 'agitated', 'funny', 'People', 'rains', 'scarf', 'tongue', "animal's", 'grunts', 'mill', 'mixing', '"Are', 'fingernail', 'canvas"', '2.wav"', 'criminal', 'damage', 'meddling', 'attached', 'stones', 'spark', 'variation', 'bash', 'runner', 'motionless', 'accurate', 'granules', 'dust', 'collect', 'shovel', 'creepy', 'film', 'featured', 'cymbal', 'telephone', 'boil', 'whistle_1.wav"', 'teapot', 'stove', 'precedingly', 'distress', 'distinctly', 'Greece).wav"', 'hoot', 'camden', 'arkansas.wav"', 'These', 'indicative', 'pigeons', 'sifting', 'clinked', 'bodily', 'passes', 'knuckles', 'pots', 'pans', 'dolphin', 'leaf', 'blowers', 'switched', 'cascade', 'others', 'pogo', 'dancing', 'invention', 'organic', 'shovelling', 'ominous', 'flyover.wav"', 'squish', 'sticky', 'cement', 'stand', 'project', 'chanting', 'meditating', 'squeeze', 'category', 'flowers', 'seal', 'spray', 'latest', 'popular', 'classical', 'furniture', 'bubbles', 'repetitious', 'expelling', 'balls', 'larger', 'smaller', 'cur', 're', 'applaud', 'booing', 'operates', 'manufacturing', 'picked', 'ponged', 'blasting', 'screamo', 'jangled', 'visual', 'appearance', 'bat', 'whether', 'droning', 'slimly', 'bridge', 'season', 'eyebrows', 'twisty', 'warping', 'distorts', 'destructible', 'unchanging', '17', 'paddling', 'canoe', 'prevalent', 'hundreds', '"Instead', 'caged', 'mockingbird', 'assembling', 'disassembling', 'crumpling', 'non-stop', 'endure', "can't", 'drumstick', 'unloading', 'sports', 'squek', 'putting', 'lotion', 'outset', 'winder', 'applause']


--------------------------------------------------------------------------------
/metadata/single_word_val_clean.csv:
--------------------------------------------------------------------------------
  1 | file_name,QuestionText,answer
  2 | "WavesOnTheShore.wav","What can be heard being moved?",WATER
  3 | "WavesOnTheShore.wav","What is the person moving in?",WATER
  4 | "Footsteps on Rocky Terrain.wav","What does it sound like?",WALKING
  5 | "nightinggale2.wav","What could cause this type of alarm?",FIRE
  6 | "nightinggale2.wav","What kind of siren is sounding?",ALARM
  7 | "Lluvia agosto 2011.wav","How many cars pass by?",ONE
  8 | "Lluvia agosto 2011.wav","What is moving?",CAR
  9 | "watertab.wav","What is the water pouring into?",SINK
 10 | "watertab.wav","What type of surface is the water being poured on?",METAL
 11 | "miniature goats and sheep.wav","What animal is making the noise?",SHEEP
 12 | "0211_170236 walk downstairs.wav","What kind of surface was the person treading on?",WOOD
 13 | "rotatingdome.wav","What kind of vehicle  is passing through the tunnel?",TRAIN
 14 | "rotatingdome.wav","What object makes the loud noise at the end?",DOOR
 15 | "screen-door-slam.wav","How many times can the sound be heard?",EIGHT
 16 | "Doorbell harsh.wav","How many times is the buzz heard?",THREE
 17 | "bird in the Hague at dawn 5.wav","What animal is there?",BIRD
 18 | "bird in the Hague at dawn 5.wav","What type of location can the bird sounds be heard?",PARK
 19 | "Caltrain Pushing Caltrain.wav","What is the vehicle that is making noise?",TRAIN
 20 | "Breaking Glass .wav","What is being smashed multiple times?",GLASS
 21 | "Elizabeth Evans Park - Mount Dora - June.wav","How many types of animals can be heard in this clip?",TWO
 22 | "Elizabeth Evans Park - Mount Dora - June.wav","What is making the animal noise heard in this clip?",BIRD
 23 | "Evening Atmosphere #2.wav","How many times does the dog bark?",NINE
 24 | "Evening Atmosphere #2.wav","What is the dog barking at?",BIRDS
 25 | "stclaude.wav","How many times does the thumping noise repeat itself?",TWO
 26 | "Night Frogs.wav","What time of day is associated with these sounds?",NIGHT
 27 | "a boy and 2 pigs.wav","How many people are talking to one another?",FOUR
 28 | "a boy and 2 pigs.wav","What type of animal is making the animal noise?",PIG
 29 | "International Harvester Scout II.wav","How many times does the engine stall?",THREE
 30 | "Wind moaning through gap in door and house noises.wav","How is the weather?",RAINY
 31 | "Cafeteria Ambience.wav","What is clanking in the scene?",DISHES
 32 | "20130418_stream.09.wav","What is making the sound?",WATER
 33 | "20130418_stream.09.wav","what type of animal can be found under the sound producing thing?",FISH
 34 | "wind-sound-from-inside-car.wav","How many voices can be heard yelling in the inclement weather?",ZERO
 35 | "wind-sound-from-inside-car.wav","What is blowing outside?",WIND
 36 | "2012check_run.wav","How many vehicles are there?",TWO
 37 | "2012check_run.wav","What are the formula one cars doing?",RACING
 38 | "stone_well.wav","how many times does the machine try to start?",THREE
 39 | "Uppsala Streetbusker accordion 1.wav","How many instruments are being played?",ONE
 40 | "Uppsala Streetbusker accordion 1.wav","Which instrument is being played?",KEYBOARD
 41 | "water slushing moderate speed.wav","What liquid is being played with?",WATER
 42 | "water slushing moderate speed.wav","Where is this liquid located?",BUCKET
 43 | "greece_melanes_cofee_1.wav","What is singing?",BIRD
 44 | "LightRaininPinesMarch302013.wav","How is the weather here?",RAINY
 45 | "storm is coming 15-11-2012.wav","What is the animal heard?",DOG
 46 | "storm is coming 15-11-2012.wav","What type of weather is it?",RAIN
 47 | "Walking_on_tarmac.wav","How many people are moving?",ONE
 48 | "Walking_on_tarmac.wav","What's the person doing?",WALKING
 49 | "20070824.supper.wav","How many people are heard speaking?",TWO
 50 | "20070824.supper.wav","What is a person doing?",POURING
 51 | "STE-027FIRE.wav","What is crackling in the background?",FIRE
 52 | "glass (2).wav","How many silent lapses are during the buzzing sound?",ZERO
 53 | "glass (2).wav","What sound is it?",ALARM
 54 | "160917-eichelherr000.wav","What is volume of the bird sounds?",LOUD
 55 | "160917-eichelherr000.wav","What kind of animal is this?",BIRD
 56 | "Birds of Klein Profijt.wav","What kind of animals are there?",BIRDS
 57 | "Drumming on a wine glass.wav","What is the object made of that is making the sound?",GLASS
 58 | "greece_naxos_cicadas_4.wav","what animal is making the sound?",CRICKET
 59 | "Los Angeles Bus Ride.wav","What is approaching to the people?",BUS
 60 | "Los Angeles Bus Ride.wav","What is making the sound?",BUS
 61 | "upanddownstairs.wav","What is the person walking on?",STAIRS
 62 | "train.wav","What is the source of this noise?",TRAIN
 63 | "train.wav","What type of vehicle makes this sound?",TRAIN
 64 | "Avion.wav","Where does the object depart from and arrive to?",AIRPORT
 65 | "vague_sable.wav","What is it water called when it builds up and crashes on the sand?",WAVES
 66 | "vague_sable.wav","What is the the object making the noise?",WATER
 67 | "bridge demolition pounding.wav","How many impacts can we hear ?",NINETEEN
 68 | "Babble of Frogs 001.wav","Unlike domesticated pets, these animals are what?",DUCKS
 69 | "Night Ambient.wav","How many crickets are there?",TWO
 70 | "Night Ambient.wav","What is the insect doing?",CHIRPING
 71 | "Spring Birds Raw (New Jersey).wav","What domestic animal likes to catch these creatures?",BIRDS
 72 | "Spring Birds Raw (New Jersey).wav","What type of animal is making this sound?",BIRD
 73 | "bathtub drain.wav","what is the liquid doing?",DRIPPING
 74 | "bathtub drain.wav","what noise it the liquid making upon hitting the surface?",SPLASHING
 75 | "Night in nature.wav","What are the animals doing?",CHIRPING
 76 | "CoffeeShopChatter.wav","What are the people doing?",TALKING
 77 | "espresso-maschine.wav","How many times does a person tap on wood at the beginning ?",TWO
 78 | "glas-bubbels-def01.wav","what is the water doing?",BUBBLING
 79 | "glas-bubbels-def01.wav","What plastic item is the person drinking from?",STRAW
 80 | "WATER DRIP ECHO LOW PITCH COMPRESSED.wav","What is falling?",WATER
 81 | "small dog leaves.wav","What is being crumpled?",PAPER
 82 | "village bar.wav","Besides talking, what are the people doing?",EATING
 83 | "village bar.wav","What gets closed at the end of the recording?",DOOR
 84 | "fireworks1.wav","What is making the popping noise?",FIREWORKS
 85 | "Thunder Outside.wav","How many strikes of thunder are there?",TWO
 86 | "car-radio-am-noise2.wav","What is the pitch of the loudest sound?",HIGH
 87 | "bellaromani.wav","How many times is the bell struck?",SIX
 88 | "bellaromani.wav","What is being struck to make the sound?",GONG
 89 | "laundry.machine.wav","What is the object making the rattling noise?",DRYER
 90 | "FM Radio Tuning Sweep.wav","What object is having its channels being changed?",RADIO
 91 | "machine1.wav","What is driving by?",CAR
 92 | "filling-ice-cup.wav","What can the person do with the liquid next?",DRINK
 93 | "filling-ice-cup.wav","What is the person doing with a liquid?",POURING
 94 | "Rain drops on marquee.wav","what is the rain hitting?",ROOF
 95 | "Rain drops on marquee.wav","Where is it raining?",ROOF
 96 | "channel 2 now concludes its broadcast day.wav","Where might one see color bars accompanying this noise?",TELEVISION
 97 | "metal-bell-percussion.wav","how many pauses are between each set of ringing?",TWO
 98 | "metal-bell-percussion.wav","How many times does the bell ring?",THREE
 99 | "sizzle 4.wav","how many times does the bottle pop?",ONE
100 | "0221 Bar_terrace.wav","What are the people doing?",TALKING
101 | "Opening and Closing Bolt Door.wav","What activity involving the feet and legs can be heard?",WALKING
102 | "Deplacez-vous.wav","To what emergency vehicle does the siren belong to?",AMBULANCE
103 | "Sukhapha anchor chain.wav","What material is making the rattling?",METAL
104 | "Nord_Odal_Nyhus_04_juni_2011_quiet_forest_birds_insects_leaf_rustle_03.wav","What animal can be heard?",BIRD
105 | "Nord_Odal_Nyhus_04_juni_2011_quiet_forest_birds_insects_leaf_rustle_03.wav","What do background birds do?",CHIRP
106 | "Water_Drops_Falling.wav","What is making the crackling noise?",FIRE
107 | "Water_Drops_Falling.wav","What liquid is splashing on the ground ?",WATER
108 | "clanking lid.wav","what is the metal object doing?",MOVING
109 | "April dawn chorus Sydenham Hill.wav","What type of animal is heard?",BIRD
110 | "crumpleTissuePaper.wav","What creature is likely making this sound?",HUMAN
111 | "Distorted AM Radio noise.wav","What is that sound?",STATIC
112 | "pc_mouse.wav","How many different types of noises can be heard?",FIVE
113 | "pc_mouse.wav","How many times does the mouse click?",THREE
114 | "Running.wav","What does the person do at the end?",COUGH
115 | "Running.wav","what sound comes from the person's mouth as they run?",COUGHING
116 | "Squeaky Wood (Compilation).wav","How many times does the door creak?",TWELVE
117 | "Walking across carpeted floor with slippers.wav","What is the person walking on?",GRAVEL
118 | "quedlinburg castle.wav","What kind of animal is in the area?",BIRD
119 | "Walking alongside the road.wav","How is the person moving?",WALKING
120 | "Walking alongside the road.wav","How many cars passed by?",TWO
121 | "Building Site.wav","How many times is the metallic beating sound in the distance made?",FOUR
122 | "Building Site.wav","What gender are the humans making noise?",MALE
123 | "eraser.wav","What tool is being used?",HAMMER
124 | "Charleston Campus Summer.wav","What are the people doing?",TALKING
125 | "Big_Roundabout_Traffic.wav","How many vehicles can be heard?",FIVE
126 | "Big_Roundabout_Traffic.wav","What is the car driving on?",ROAD
127 | "AC Unit.wav","What happens to the sound at the end?",STOPS
128 | "Bathroom.wav","what is dripping?",WATER
129 | "Airplane Overhead.wav","What is flying overhead?",PLANE
130 | "Clothing_ShirtsandPants_Rustling.wav","What is the item being put into?",BAG
131 | "05769 carpenter&#39;s workshop ambience.wav","What is humming?",SAW
132 | "05769 carpenter&#39;s workshop ambience.wav","What type of work is being done?",SAWING
133 | "CicadasAPedreira.wav","What insect is making a sound?",CRICKETS
134 | "CicadasAPedreira.wav","What is making faint ripple sounds ever so slightly?",DOG
135 | "Walking in Grass in Evening with Loud Bird.wav","What is the person doing?",WALKING
136 | "Small growling dog.wav","How many dogs are growling ?",THREE
137 | "Small growling dog.wav","What is the animal doing?",GROWLING
138 | "Teluk Nipah 01.wav","What is making the noise in the water?",BOAT
139 | "Teluk Nipah 01.wav","What liquid can be heard?",WATER
140 | "20090407.cricket.real.close.wav","How can the pitch of this sound be described?",LOUD
141 | "Metal clatter drop.wav","How many times is the object dropped?",SIX
142 | "interference from wireless mouse on am radio.wav","How many 'bash' sounds are heard in the last two seconds?",SEVEN
143 | "interference from wireless mouse on am radio.wav","What is buzzing?",RADIO
144 | "outdoors ambient  village bird distant neighbours children car.wav","How many times can you hear the inhale or exhale of a person's breath?",ONCE
145 | "17-Year Cicada Mating Call.wav","How many times does the alarm go off ?",TEN
146 | "FOLEY_Ext_Garbage_Hauling_001.wav","What was heard at the start of the audio recording?",DOOR
147 | "LOUD THUNDER - WITH RAIN HITTING UMBRELLA.wav","What booming noise can be heard coming from the sky?",THUNDER
148 | "CR FunnyMachine.wav","What kind of animals are around?",BIRDS
149 | "Bell_Hotel_desk.wav","On how many separate occasions was there a pause between bell ringing multiple times?",THREE
150 | "slam.wav","How many times is the ball audibly smacked?",SIX
151 | "metal rain.wav","What instrument is being played?",SYNTHESIZER
152 | "metal rain.wav","What kind of music is this?",GUITAR
153 | "13. Crushing tin can.wav","How many milk cartons does this person open ?",TWO
154 | "3trump.wav","What element, when forced through this device, makes the sound?",AIR
155 | "3trump.wav","What part of the face is used to play this instrument?",MOUTH
156 | "Fence Hit_City ambience night.wav","How many times is there a banging noise?",TWELVE
157 | "Car starting (open hood).wav","What is being started?",CAR
158 | "Car starting (open hood).wav","What is the girl talking about?",CAR
159 | "Thunder Storm Daytona 3.wav","What was falling from the sky?",THUNDER
160 | "AGFA_1.wav","How many times does the same sound pattern repeat itself?",SIX
161 | "amazon 04.wav","What appendages do these creatures have, instead of arms?",FEATHERS
162 | "amazon 04.wav","What is the predominant animal heard?",BIRD
163 | "Davis.wav","If one were caught outside, how would they end up being?",WET
164 | "Davis.wav","What is happening?",WIND
165 | "20100515.park.ambiance.02.wav","What are the birds doing?",CHIRPING
166 | "SinkWater.wav","What closes and stops the flow of water?",FAUCET
167 | "SinkWater.wav","What's going down the drain?",WATER
168 | "Room tone for quiet bathroom.wav","What is rushing?",WATER
169 | "Room tone for quiet bathroom.wav","What is the sound quality?",GOOD
170 | "crowd booing.wav","What are the people doing?",CHEERING
171 | "crowd booing.wav","what noise do the people make?",CHEERING
172 | "Walla chatter, adults and children in auditorium.wav","What are the people doing?",TALKING
173 | "Walla chatter, adults and children in auditorium.wav","What type of animal is making the biological sounds?",HUMANS
174 | "020220_00.wav","how many people are speaking?",ONE
175 | "020220_00.wav","How many times does the man sneeze?",ZERO
176 | "Waterfall close.wav","If a radio was making this sound, what would that sound be called?",STATIC
177 | "Waterfall close.wav","What kind of noise is this?",RAIN
178 | "Organic sound.wav","The sound indicates that the object is doing what?",OPENING
179 | "water_stream_001.wav","What is running?",WATER
180 | "street steps child car.wav","How many people are chatting?",THREE
181 | "20091224.bells.01.wav","how many times does the bell ring?",THIRTY
182 | "20091224.bells.01.wav","The items making sounds are made from what material?",BELL
183 | "cat_purr_1.wav","What animal is heard here?",CAT
184 | "Remix of 104372__rutgermuller__Metal_Tube_Rolling_www.wav","What is making the noise?",BELLS
185 | "Birdsong2.wav","How many birds are there?",FOUR
186 | "Birdsong2.wav","What is making the chirping noise?",BIRD
187 | "creaky.wav","What sound does the radio make?",STATIC
188 | "Calle.wav","what are the cars doing?",MOVING
189 | "gasBubblesNoise.wav","How many varieties of sound can be heard?",THREE
190 | "fountain in store 001.wav","What luxury item is filled with this in a rich person's backyard?",WATER
191 | "fountain in store 001.wav","What substance is being heard?",WATER
192 | "Beer Pong Sounds ball table and cups.wav","What falls before the tapping begins?",BALL
193 | "Beer Pong Sounds ball table and cups.wav","What object is being hit?",BALL
194 | "Walking On Dry Leaves Normalised.wav","How many people can be heard walking?",ONE
195 | "Walking On Dry Leaves Normalised.wav","What is happening here?",WALKING
196 | "Ambience - St Kilda Beach - waves lapping rocks, people nearby, seagulls.wav","What might the person be doing?",DRIVING
197 | "NYC Subway Train Approach Doors Announce Depart.wav","What does the carriage train do towards the end of the clip?",STOP
198 | "NYC Subway Train Approach Doors Announce Depart.wav","what opens at the end?",SUBWAY
199 | "UnionStation06OutBack_BusyOutside.wav","What is the person doing?",DRIVING
200 | "Wind Noise Backyard.wav","What is dropping the debris?",PERSON
201 | "waves_1.wav","What body of water makes this sound?",OCEAN
202 | "windscreen wipers heavy rain.wav","What is the weather like?",RAINY
203 | "windscreen wipers heavy rain.wav","What is the windshield wipers wiping away off the car?",RAIN
204 | "110422_village_dusk.wav","The staccato sound heard at the beginning and the end is made by what kind of animal?",DOG
205 | "110422_village_dusk.wav","Whose voices can be faintly heard?",KIDS
206 | "street_ambience_day.wav","What are the people doing?",TALKING
207 | "CrunchingHinge.wav","How many screws is this person driving in to wood ?",THREE
208 | "CrunchingHinge.wav","What device attached to a frame is making this sound?",DOOR
209 | "Conversacion Punjabi.wav","What are the people doing?",TALKING
210 | "Koeien, R4 en riet Lichterveldestraat.wav","How many tires do these vehicles, when passenger sized, typically have?",FOUR
211 | "Koeien, R4 en riet Lichterveldestraat.wav","The objects in this location are usually powered by what?",GAS
212 | "New Lift.wav","How many times did a bell ring?",ONCE
213 | "New Lift.wav","What sound alerts that a door was opened?",BELL
214 | "Atlantic Ocean Waves.wav","How many cycles of the same noise can be heard?",FOUR
215 | "Atlantic Ocean Waves.wav","What can be heard crashing into the shore?",WAVES
216 | "CAGE ELAVATOR MUMBAI.wav","How many voices can be heard?",THREE
217 | "CAGE ELAVATOR MUMBAI.wav","What is making the loud clanging noises?",DOOR
218 | "corneille_city01.wav","How many birds are there?",TWO
219 | "corneille_city01.wav","What kind of animal is making noise?",BIRD
220 | "HammerDrill.wav","How many times does the wooden figure croak?",SIX
221 | "13_waiting_chitwan.wav","how many steps are taken?",THREE
222 | "13_waiting_chitwan.wav","What animal is chirping?",BIRD
223 | "outdoors ambient windy wind leaves rustle hum.wav","What is in the sky during this sound?",CLOUDS
224 | "Crunchy walk on pebbles.wav","What is the person doing?",WALKING
225 | "La Barca i La Tempesta.wav","What animal is making noise?",MOUSE
226 | "La Barca i La Tempesta.wav","What is making the tapping sound?",BIRD
227 | "Forest Birds .wav","How many times does someone laugh?",ZERO
228 | "Forest Birds .wav","What animal is singing?",BIRD
229 | "DoorSqueak.wav","Which door creak is the loudest?",LAST
230 | "pencil on paper.wav","People commonly do this action upon a sheet of what?",PAPER
231 | "pencil on paper.wav","What is someone writing on?",PAPER
232 | "indoors dorm dormitory ambient room tone distant traffic in street.wav","What is honking?",CAR
233 | "RBH_Household_shower 03.wav","What is person about to do?",SHOWER
234 | "RBH_Household_shower 03.wav","What room is this?",BATHROOM
235 | "Crunching sticks with feet.wav","Who is crinkling this object?",PERSON
236 | "driving_buying_beer_on_sunday.wav","What are the kids doing in the background?",PLAYING
237 | "driving_buying_beer_on_sunday.wav","where does the man get into after opening the door?",CAR
238 | "080902_00_machine_generators.wav","How many vehicles are there?",ONE
239 | "Fitness studio _ Gym ambience. Weights and equipment.wav","what is making the scrubbing noise?",MACHINE
240 | "Fitness studio _ Gym ambience. Weights and equipment.wav","What is the metal object?",HAMMER
241 | "Door Slam.wav","How many human voices are there?",ONE
242 | "Door Slam.wav","What is happening here?",SLAMMING
243 | "creeeeek-GAIN_01.wav","How many large creaks are there?",FOUR
244 | "070821_flsp_trail03.wav","What are the birds doing?",CHIRPING
245 | "Kitchen Noise From Distance.wav","how many footsteps are there?",ZERO
246 | "Kitchen Noise From Distance.wav","These noises from the person's shoes indicate that the person is doing what?",WALKING
247 | "Page turns and book close_open.wav","Of what man-made material are the pages made of?",PAPER
248 | "Page turns and book close_open.wav","What does the person close shut at the end?",BOOK
249 | "Dinosaur Footsteps-01.wav","How many thumps take place?",EIGHT
250 | "Morning Ride 2.wav","How many times does the vehicle change gear?",FOUR
251 | "Morning Ride 2.wav","What machine is making the sound?",MOTORCYCLE
252 | "Bathtub_Water-drain.wav","what is making the gurgling sound?",DRAIN
253 | "windy rain.wav","What is making the constant sound?",RAIN
254 | "windy rain.wav","What is the weather like?",RAINING
255 | "SheryT_mixdown.wav","What gender is mostly speaking?",MALE
256 | "SheryT_mixdown.wav","Where have all the people congregated?",RESTAURANT
257 | "Motor boat.wav","What is the large machine or vehicle doing?",RUNNING
258 | "paper_bag.wav","What is being ripped?",PAPER
259 | "Fuente Cotino 2.wav","What is splashing?",WATER
260 | "bandung-taxiradio-1.wav","How many people are talking?",THREE
261 | "bandung-taxiradio-1.wav","What are the people riding in?",CAR
262 | "20110422_shower.wav","What direction is the water moving?",DOWN
263 | "20110422_shower.wav","What kind of animals can be heard?",BIRDS
264 | "Atmos beach regular waves hit shore, birds mono.wav","How loud is the noise?",MEDIUM
265 | "110724_inriversidemus1.wav","What gender is the person talking?",FEMALE
266 | "110724_inriversidemus1.wav","What kind of noise are the people making?",TALKING
267 | "Crunchy Footsteps.wav","What is the condition of ground?",WET
268 | "Crunchy Footsteps.wav","What part of the body is striking the surface?",FEET
269 | "Walking on pebble beach.wav","What is the person walking on?",GRAVEL
270 | "Flipping Coin Can.wav","What is the size of the container used?",SMALL
271 | "Flipping Coin Can.wav","What kind of object is it?",TOY
272 | "birds_late_morning.wav","What closure is opened at the end?",CABINET
273 | "birds_late_morning.wav","What is the name of the animal that is audible?",BIRD
274 | "Rain with thunder in a city.wav","What sound are the birds making?",CHIRPING
275 | "Atmosfera Miasto Spokojna dzielnica rano.wav","what does the car do loudly?",ACCELERATING
276 | "Atmosfera Miasto Spokojna dzielnica rano.wav","What is speeding up and passing by?",CARS
277 | "Opening and closing curtain.wav","How many times is the metal dragged across the ground?",SEVEN
278 | "Opening and closing curtain.wav","what is being moved?",FABRIC
279 | "walking_on_snow_and_light_wind.wav","What is blowing?",WIND
280 | "walking_on_snow_and_light_wind.wav","What is someone doing?",WALKING
281 | "auto-rickshaw-trip.wav","What part of the body is used to initiate the beeping sound?",HAND
282 | "auto-rickshaw-trip.wav","What part of the car beeps?",HORN
283 | "Bath 01.wav","What is the liquid doing?",SPILLING
284 | "engine_vibrations_of_ferry_1.wav","For how much of the clip does the machine run?",MIDDLE
285 | "Strong wind in trees.wav","How many vehicles drive past?",TWO
286 | "Strong wind in trees.wav","What machine is heard?",ENGINE
287 | "Chopping Vegetables.wav","What item is used to make the cuts?",KNIFE
288 | "Chopping Vegetables.wav","What scrapes against the surface as a cut is made?",KNIFE
289 | "tram_prague_2stops_veryfewpeople_AMB_INT.wav","What gender is the person speaking?",FEMALE
290 | "2013-03-28 rain in the rainforest.wav","What is the rate of the rainfall?",HEAVY
291 | "2013-03-28 rain in the rainforest.wav","What weather condition is heard?",RAINFALL
292 | "elk car.wav","What is used to steer the machine?",WHEEL
293 | "Vomit, puking spilling water onto grass splat.wav","What body part is the person using to pour the liquid out?",HAND
294 | "Vomit, puking spilling water onto grass splat.wav","What liquid is repeatedly spilled?",WATER
295 | "Beach Wave Ambince .wav","Where is the water coming from?",OCEAN
296 | "20150720_boat.engine.wav","How many different noises can be heard?",THREE
297 | "20150720_boat.engine.wav","How many people can be heard talking ?",TWO
298 | "night in the countryside.wav","How many dogs are barking?",ONE
299 | "night in the countryside.wav","What animal is present?",DOG
300 | "living room tone  ambient distant noises neighbours.wav","how many times is there a tap?",ZERO
301 | "110709_05 goma exhibit.wav","What is making the beating sound?",DRUM
302 | "Rusty old boat.wav","What high pitched noise is being made at the start?",SQUEAK
303 | "sparrows.wav","What do these creatures do that humans cannot?",FLY
304 | "sparrows.wav","What is the bird doing?",CHIRPING
305 | "glass d.wav","What object is making the high pitched noise?",HORN
306 | "Heavy Rain 1.wav","What is making the dripping noise?",RAIN
307 | "Kocking door and open door.wav","What material does it sound like the object being knocked on is made of?",WOOD
308 | "Curtain.wav","How many times is the object moved?",SIX
309 | "Lambs.wav","How many different animals are heard?",THREE
310 | "Lambs.wav","How many times does the main, loudest animal call?",THREE
311 | "20100801.wharf.silence.night.wav","What is the animal doing?",BARKING
312 | "INT Factory budimka, pozega.wav","How many times is there an air pressure release?",TWO
313 | "electric-screwdriver.wav","What kind of material is being dropped again and again?",METAL
314 | "More Amphitheatre Birds. Wav.wav","what could be used to reduce the noise?",STYROFOAM
315 | "More Amphitheatre Birds. Wav.wav","What type of sounds are there?",BIRDS
316 | "Writing with Pen.wav","What's the person doing?",SCRATCHING
317 | "WasherSpinCycle.wav","What is making that sound?",MACHINE
318 | "Saas-Fee Village Atmosphere and Church 100611.wav","How many instances of a bird chirping are there?",EIGHT
319 | "Saas-Fee Village Atmosphere and Church 100611.wav","How many times does the bell ring?",EIGHT
320 | "Fast stream _ small river.wav","What liquid is making the rushing noise?",WATER
321 | "Hanoi street walking.wav","how many high pitched brake squeaks are there?",ONE
322 | "Garage Doors Opening_Closing.wav","How many pauses in the sound are there?",ONE
323 | "Garage Doors Opening_Closing.wav","What animal sound can be heard when the grinding stops?",BIRD
324 | "Elysian Park - Picnic Area 2.wav","What animal can be heard?",DOG
325 | "Elysian Park - Picnic Area 2.wav","What noise does the dog make?",BARKING
326 | "rain_medium_thunders.wav","How many times can thunder be heard rumbling?",THREE
327 | "rain_medium_thunders.wav","What is booming in the background?",THUNDER
328 | "Library with Light Chatter.wav","What type of building are they located in?",OFFICE
329 | "heating_far away.wav","What is the water doing?",FLOWING
330 | "Hitting baseball w. wooden bat.wav","How many whacks can be heard?",FOUR
331 | "background of the side streets of Rhodes, scooter, tourists French and American, grinder.wav","What tool is being used to trim the trees?",CHAINSAW
332 | "20141026 Bangkok House Traffic Thunder Bird 01.wav","At the beginning what is the gender of the voice heard?",MALE
333 | "20141026 Bangkok House Traffic Thunder Bird 01.wav","What sort of animal can be heard calling?",BIRD
334 | "traffic stereo.wav","What are people doing?",DRIVING
335 | "traffic stereo.wav","Where are the cars driving?",HIGHWAY
336 | "20090407.toy.train.01.wav","What is making the animal sounds in the background?",BIRDS
337 | "second_floor_lav.wav","What pitch  tone is the object making?",LOW
338 | "Machete vs frying pan 2.wav","How many times are things banged together?",NINE
339 | "Machete vs frying pan 2.wav","What is someone using?",HAMMER
340 | "Senseo_boil_norm.wav","how many rotations does object making the sound complete?",ZERO
341 | "Truck starts and stops_edit.wav","How many times does an engine start?",ONE
342 | "Truck starts and stops_edit.wav","What kind of vehicle is starting up?",CAR
343 | "LondonTraffic.wav","What is this activity called?",DRIVING
344 | "Ronda - Fountain near the Town Hall (general) - Fuente cerca del Ayuntamiento (general).wav","What object is sometimes carried by a person to stay dry when this is happening?",UMBRELLA
345 | "Ronda - Fountain near the Town Hall (general) - Fuente cerca del Ayuntamiento (general).wav","What weather event is taking place outside?",RAIN
346 | "nxSample012.wav","What object is creating the loud noise?",RADIO
347 | "peanutFarmDawnShort.wav","What animal does the singing?",BIRD
348 | "peanutFarmDawnShort.wav","What sound is the bird making?",CHIRP
349 | "NY subway.wav","How many people are talking?",ONE
350 | "NY subway.wav","What are the people doing?",TALKING
351 | "People talking while waiting the bus.wav","How many different people are speaking?",THREE
352 | "People talking while waiting the bus.wav","How many different woman are speaking?",THREE
353 | "Walking on dry grass.wav","How many steps does the person take?",FOURTY
354 | "Walking on dry grass.wav","What type of footwear are they wearing?",BOOTS
355 | "Elbe near Ovelgoenne.wav","How many times does the sea waves hit the shore ?",FIVE
356 | "Elbe near Ovelgoenne.wav","What covers the immediate area?",WATER
357 | "AMB_COLE.wav","What gathering occasion caused all these people in the clip to appear together in one place?",PARTY
358 | "AMB_COLE.wav","What rhythmic sound is heard at the beginning of the clip?",RUNNING
359 | "trafficrain.wav","In which instance of a passing vehicle is the engine louder?",SECOND
360 | "trafficrain.wav","What animal is making the most noise?",DOG
361 | "Tibetan Bells 192kHz Original.wav","What is the instrument being struck?",BELL
362 | "Hyeres street voices ambience f.wav","What is likely to be the gender of the person making loud footsteps?",FEMALE
363 | "Steam 20.wav","How many times does the MRI thud?",FOURTEEN
364 | "Steam 20.wav","What instrument is being played?",DRUM
365 | "Birds-Crow &amp; Song Birds.wav","What type of bird is making this noise at the beginning ?",CROW
366 | "Subway_departure_from_station.wav","What energy source does this mode of transport use?",ELECTRICITY
367 | "metal_bowls_altered.wav","What kind of pitch does the sound have?",MEDIUM
368 | "DR0000_0020.wav","how is the weather?",RAINY
369 | "birdy.wav","how many birds are there?",TWO
370 | "birdy.wav","What are the birds doing?",CHIRPING
371 | "morning in the countryside.wav","What is the female called, of the bird first heard?",HEN
372 | "morning in the countryside.wav","Which animal is making the loudest sound?",ROOSTER
373 | "Hanoi streets.wav","What is heard beeping halfway through?",CAR
374 | "Hanoi streets.wav","What type of music genre is played in the middle ?",POP
375 | "CityPark Evening Moerputten NL 130510_01.wav","What animal is this?",BIRD
376 | "rain_on_a_roof_01.wav","What liquid is falling from the sky?",RAIN
377 | "rain_on_a_roof_01.wav","What weather phenomenon is taking place?",RAIN
378 | "Autumnal Ambient 24 Bits 48 Khz.wav","What loud sound is made toward the end?",BOAT
379 | "babbling brook 2.wav","What is running?",WATER
380 | "babbling brook 2.wav","what is the water doing?",FLOWING
381 | "12 noon church-bell 140310_0121.wav","what kind of building is nearby?",CHURCH
382 | "12 noon church-bell 140310_0121.wav","What sound can be heard other than bird song?",BELL
383 | "Whalesong.wav","What is this aniimal?",BEAR
384 | "Whalesong.wav","What makes the sound get louder?",CLOSE
385 | "HeavyRain.wav","How is the weather now?",RAINING
386 | "HeavyRain.wav","What is the rain falling on?",GROUND
387 | "Serving Water Quickly.wav","How many times can water be heard being poured?",TWO
388 | "Serving Water Quickly.wav","What is being poured?",WATER
389 | "Centurion Suburb Evening.wav","What can be heard falling?",WATER
390 | "sizzling oil.wav","How many taps are there?",ONE
391 | "sizzling oil.wav","What is the water doing?",FLOWING
392 | "20081130_walking_in_snow.wav","What are they walking in?",SAND
393 | "open and close pen.wav","How many times is the pen clicked?",TWENTY
394 | "open and close pen.wav","What useful activity can be done with this object, besides clicking it?",WRITING
395 | "100121.wav","How many separate explosion sounds were there?",ONE
396 | "100121.wav","What is causing that loud sound?",THUNDER
397 | "Rain WashingtonSt 1.wav","What can be heard falling?",RAIN
398 | "Rain WashingtonSt 1.wav","What sort of weather is heard?",RAIN
399 | "20090501.horse.neigh.wav","What animal that makes a sound is useful for home security?",DOG
400 | "20090501.horse.neigh.wav","What is the larger animal at the beginning and then at the end?",HORSE
401 | "Edit Radio .wav","What is the object that is being used?",RADIO
402 | "Menziken Sawmill.wav","how many wheel turns are audible?",FIFTY
403 | "Menziken Sawmill.wav","What type of vehicle?",TRAIN
404 | "Rain falling on a metal roof - 96 kHz _ 24 Bit.wav","What kind of object is the rain hitting?",ROOF
405 | "Rain falling on a metal roof - 96 kHz _ 24 Bit.wav","Where is this occuring?",OUTSIDE
406 | "invexdpo.wav","What instrument is being played?",ORGAN
407 | "invexdpo.wav","What kind of movies' can they use this instrumental?",HORROR
408 | "carpet_on_carpet.wav","The hammering sound is interspersed with what other noise?",SWEEPING
409 | "carpet_on_carpet.wav","What is one of the tools being used?",HAMMER
410 | "Song Birds-Lighthouse Park-March.wav","How does these types of creatures typically travel?",FLY
411 | "Song Birds-Lighthouse Park-March.wav","What the birds are doing?",CHIRPING
412 | "md1trk11.wav","How many human voices can be heard?",ZERO
413 | "Dog escapes from the room.wav","What is squeeling?",DOG
414 | "Dog escapes from the room.wav","What slams closed?",DOOR
415 | "CalmWaves SandBeach 03 EQ 130430_03.wav","What is crashing against the surface?",WATER
416 | "CalmWaves SandBeach 03 EQ 130430_03.wav","What is making the noise?",WAVES
417 | "medical car horn EGYPT Alexandria.wav","What is making the wa sound throughout the recording?",SIREN
418 | "medical car horn EGYPT Alexandria.wav","What type of siren is being used?",AMBULANCE
419 | "Heavy Wind on Microphone.wav","What speed is the wind blowing?",FAST
420 | "Basement Water Pump.wav","what sound is being made by the motor?",SCRAPING
421 | "Hallway Room Tone with shower in background.wav","What is the person doing in this clip?",SHOWERING
422 | "Ambience - Merri path, trees and birds, gentle wind take 2.wav","How do the creatures that are heard move about in the air, they do what?",FLY
423 | "Metal_On_Wood_Hits_Axe.wav","how many times does the striking sound occur?",THIRTEEN
424 | "Metal_On_Wood_Hits_Axe.wav","What activity is taking place?",TENNIS
425 | "waterspalsh_in_glass_pitcher.wav","What is the water doing?",DRIPPING
426 | "Washing Machine Spins.wav","Where is this machine located?",KITCHEN
427 | "Traffic Light.wav","how many cars pass by?",TWO
428 | "Traffic Light.wav","What are the people doing in the background?",TALKING
429 | "SPilling Water.wav","How many fires have been lit?",ONE
430 | "Vending Machines - Room Tone.wav","What are the people doing?",LAUGHING
431 | "1122thrum.wav","How many times is there static?",TWO
432 | "steam train from 1912 locomotive.wav","How many times does the horn sound?",ONE
433 | "steam train from 1912 locomotive.wav","What kind of automobile is this?",TRAIN
434 | "Coffeehouse Ambience Burlington VT 0112xx.wav","What are people doing?",TALKING
435 | "Coffeehouse Ambience Burlington VT 0112xx.wav","What opens?",DOOR
436 | "underpass.wav","How many people are singing ?",ONE
437 | "underpass.wav","What is the gender of the person singing?",FEMALE
438 | "Kortedala, Gothenburg - By night - Police sirens and surroundings.wav","What sound is loudest?",CAR
439 | "clock_raw.wav","how many beeps are there?",TWENTYFIVE
440 | "clock_raw.wav","How many times does the beating sound pause?",TWENTYFIVE
441 | "Tools .wav","what are the items doing with each other?",CLANKING
442 | "Metal objects in bowl.wav","Of what substance are the objects likely made?",METAL
443 | "Metal objects in bowl.wav","Where are the coins being put in?",JAR
444 | "audience final applause 01.wav","what are people using to make sounds?",HANDS
445 | "audience final applause 01.wav","when does the clapping die down?",END
446 | "box.wav","How many times did the sound repeat?",EIGHTEEN
447 | "box.wav","What type of surface is being scraped ?",HARD
448 | "Plane crash - black box.wav","How many different voices are there?",TWO


--------------------------------------------------------------------------------
/metadata/single_word_test_clean.csv:
--------------------------------------------------------------------------------
  1 | file_name,QuestionText,answer
  2 | "river_mouth3.wav","How many times does the water splash?",ELEVEN
  3 | "river_mouth3.wav","What is flowing?",WATER
  4 | "Creaking pier.wav","What type of animal is making the light sound in the background?",BIRD
  5 | "Mug in sink.wav","What is being poured?",WATER
  6 | "20061124ParadeCHS.wav","Where is the music coming from?",BAND
  7 | "Sizzling Bacon.wav","What is open?",FAUCET
  8 | "Sizzling Bacon.wav","what type of precipitation causes this sound?",RAIN
  9 | "Rain and Storm.wav","How many strikes of lightning can be heard?",ONE
 10 | "Rain and Storm.wav","How many times does it thunder?",ONE
 11 | "20160718_fountain.03.wav","What is the water doing?",RUNNING
 12 | "Inner City Bees.wav","What is buzzing around the microphone?",BEE
 13 | "Inner City Bees.wav","What is buzzing?",BEES
 14 | "forest_ambience_chepachet_spring_1.wav","How many different types of animals are here?",ONE
 15 | "Arch Leaf.wav","What are they walking on?",GRASS
 16 | "20130326_caged.birds.01.wav","what are the birds doing?",SINGING
 17 | "20130326_caged.birds.01.wav","What size are the birds?",SMALL
 18 | "crowdfree.wav","What loud expression does the crowd make?",CHEER
 19 | "Creaky wooden steps, down and up.wav","How many people are there?",TWO
 20 | "20080416.bunting.wav","How many times does the cricket insect make noise ?",FIVE
 21 | "folding and crumpling paper.wav","what gets crumbled up?",PAPER
 22 | "bar crowd.wav","how many questions are heard?",THREE
 23 | "Thunder Rain Cars Driving By.wav","How is the weather?",RAINY
 24 | "Thunder Rain Cars Driving By.wav","What is booming behind the rain?",THUNDER
 25 | "WM_switch_fill_to_wash_cycle_24_96_mono.wav","What item is making the repeated noise?",WASHER
 26 | "Mechanical paper cutter.wav","What is the source of the sound not coming from a machine?",PEOPLE
 27 | "Tall Wine glass hits.wav","How many times does the tapping repeat?",NINE
 28 | "je_campuswalk.wav","What is the rough irregular sound heard throughout?",WIND
 29 | "Ambience with Train.wav","What vehicle is passing by?",TRAIN
 30 | "Tub Draining.wav","What part does it start draining violently?",MIDDLE
 31 | "winter wren wind leaves.wav","What sound does the bird make?",CHIRP
 32 | "restaurant wood floor.wav","what are the people continuously doing?",TALKING
 33 | "restaurant wood floor.wav","What type of business is this?",RESTAURANT
 34 | "20101228.teens.wav","What do the people do as a group?",SINGING
 35 | "20101228.teens.wav","What noise does the group make with their hands?",CLAPPING
 36 | "rhythm of the falling drops.wav","What gender is the first person that talks?",FEMALE
 37 | "broken comms2.wav","What happens to the communication signal?",DISTORTION
 38 | "20160922_passing.lorry.marshes.wav","How many times does the bird caw out loud?",THREE
 39 | "20160922_passing.lorry.marshes.wav","what are the birds doing?",CHIRPING
 40 | "Water in a canal.wav","What is making the noise?",FAN
 41 | "Wipers .wav","How many times do the car wipers go across the windscreen?",TWELVE
 42 | "Wipers .wav","What part of the vehicle is making the sound?",WINDOW
 43 | "RemoteControl.Antique.Zenith.wav","What general material is the object that is being struck made out of?",METAL
 44 | "Brush 01.wav","How many times does the person sweep?",TWENTYTWO
 45 | "Brush 01.wav","What is this person cleaning?",FLOOR
 46 | "Beach_SaintJeanDeLuz_France_Waves_Kids_People_xystereo.wav","what place are they playing on?",PARK
 47 | "Beach_SaintJeanDeLuz_France_Waves_Kids_People_xystereo.wav","Who is playing outside?",CHILDREN
 48 | "Train sound.wav","What direction is the sound moving?",CLOSE
 49 | "Train sound.wav","what transportation vehicle is there?",TRAIN
 50 | "Steps Indoor medium soft Shoe Sole accompanying wooden Floor hollow Big Room 5mx10mx6m.wav","What kind of shoes is the person wearing?",DRESS
 51 | "Steps Indoor medium soft Shoe Sole accompanying wooden Floor hollow Big Room 5mx10mx6m.wav","What's the person doing?",WALKING
 52 | "Cicadas .wav","What instrument is heard?",SHAKER
 53 | "Cicadas .wav","What is the liquid heard in the background?",WATER
 54 | "trail_footsteps_1_0725_102951.wav","How many footsteps are there?",FORTY
 55 | "MISC_Int_Cat_Purring_002.wav","What is being used to groom the cat?",BRUSH
 56 | "MISC_Int_Cat_Purring_002.wav","What sound is the cat making?",PURRING
 57 | "dripping.wav","What plumbing feature is being used?",FAUCET
 58 | "Birds in Pujipor.wav","What is cawing?",CROW
 59 | "turning pages book slow quickly.wav","How many times is a page turned in the book?",THIRTYONE
 60 | "turning pages book slow quickly.wav","What item is having its pages turned?",BOOK
 61 | "autospasandociudadnoche1.wav","How many different cars can be heard?",THREE
 62 | "Footsteps On Squeaky Wood Floor.wav","What does the person walking wear on their feet?",BOOTS
 63 | "wheaten field.wav","What is the long continuous sound?",RAIN
 64 | "weather_wind_strong_trees.wav","What is blowing all around?",WIND
 65 | "weather_wind_strong_trees.wav","What is crashing against the sand?",WAVES
 66 | "HarleyDavidson.wav","what is making the rumbling sound?",MOTORCYCLE
 67 | "20140210FallingIce.wav","What sounds like it's popping?",BUBBLEWRAP
 68 | "Plaza_de_la_Revolucion_risa.wav","How many times does a dog bark?",TWICE
 69 | "Plaza_de_la_Revolucion_risa.wav","What animal is barking?",DOG
 70 | "quick walk.wav","what activity is the person doing?",WALKING
 71 | "quick walk.wav","What form of transport is the person using?",WALKING
 72 | "20110805_forest.crows.07.wav","What animal is making that sound?",DOG
 73 | "fdv_orage_26082011.wav","What is the weather like?",RAIN
 74 | "fdv_orage_26082011.wav","What item should be used above there head to keep dry in rain?",UMBRELLA
 75 | "Rave1.wav","What instrument is producing this sound?",KEYBOARD
 76 | "ieai.wav","how many waves crash?",SEVEN
 77 | "ieai.wav","What is the water hitting?",SHORE
 78 | "room-tone theater with silent woman 130525_07.wav","what is the machine doing?",BLOWING
 79 | "Buddhist Bells.wav","How many times is the object hit?",THREE
 80 | "Buddhist Bells.wav","What pitch would the sound be considered?",HIGH
 81 | "h907 boules pologna clap f.wav","How many people speak on a microphone?",TWO
 82 | "Wind-up Toy Motorbike SFX.wav","If this sound was coming from a toy, what would that toy be called?",SPINNER
 83 | "Wind-up Toy Motorbike SFX.wav","What is making the sound?",TOY
 84 | "breakfast ambience.wav","How are the people communicating with each other?",TALKING
 85 | "breakfast ambience.wav","how many people are speaking?",TWO
 86 | "Machetes sliding 2.wav","How many times does the person make the object sound off?",TEN
 87 | "Afternoon Suburb Calm.wav","What kind of animal is making noise?",BIRD
 88 | "Mulholland Memorial Fountain Los Angeles.wav","What object is making this static noise?",TELEVISION
 89 | "Mulholland Memorial Fountain Los Angeles.wav","Where is the water going?",DOWN
 90 | "Morsecode - SOS MAYDAY - 988 Hz Tone.wav","What kind of code can be heard?",BEEPING
 91 | "Mariehamn_frogs.wav","how many quacks are there?",TWENTYFIVE
 92 | "bands_and_motorbike.wav","How many times are the cymbals played?",SIX
 93 | "metal plate striking wall.wav","how many times does the metal fall?",FIVE
 94 | "FOREST-SOUNDS.wav","How many times is there a loud 'caw' sound?",SEVEN
 95 | "FOREST-SOUNDS.wav","What kind of animal can be heard throughout the sound?",BIRD
 96 | "wind and birds in the delta of the River Po 2.wav","What are the birds doing?",CHIRPING
 97 | "wind and birds in the delta of the River Po 2.wav","What is blowing?",WIND
 98 | "Rain on Car Roof 2.wav","What are the object falling on that makes the banging noise?",ROOF
 99 | "Grackles.wav","What kind of animals can be heard?",BIRDS
100 | "Grackles.wav","what type of building where people pay to see animals could this be located in?",ZOO
101 | "country-ambiance-01.wav","what is making the chirping noise?",CRICKETS
102 | "Morning Birds 001.wav","These creatures live in homes they build themselves which are known as what?",NESTS
103 | "Morning Birds 001.wav","What animals are making noise?",BIRDS
104 | "WaterBottle.wav","How many knocks can be heard?",ONE
105 | "rainy stream 22 sec.wav","What is falling?",RAIN
106 | "Krankenwagen _ German Ambulances Passing by...wav","What makes a siren sound?",AMBULANCE
107 | "air bubbles on the surface of the water.wav","What is boiling?",WATER
108 | "air bubbles on the surface of the water.wav","What type of water device is in operation?",TAP
109 | "Ambience - Cattle Barn - Busy - 96kHzhg.wav","How many times does the cow moo?",FOUR
110 | "Ambience - Cattle Barn - Busy - 96kHzhg.wav","How many types of animals can be heard making noise ?",ONE
111 | "Clatter.wav","Who's making the sound?",PERSON
112 | "Paris to Germany Train Announcement.wav","What is the man's voice being transmitted through?",SPEAKER
113 | "Paris to Germany Train Announcement.wav","what kind of transportation is the speaker on?",TRAIN
114 | "night ambient crickets bugs white noise.wav","What insect is heard in the background?",CRICKETS
115 | "Rain - 1.wav","how is the water flowing?",FAST
116 | "spring morning birds oiseaux reveil printemps #1.wav","How many times does the rooster crow?",FOUR
117 | "divide lake.wav","What is crashing against the sand?",WAVES
118 | "divide lake.wav","What is falling?",WATER
119 | "Old moped.wav","What is the machine being used?",SEWING
120 | "rain_on_tin_roof.wav","what is the rain hitting?",ROOF
121 | "rain_on_tin_roof.wav","What is the weather like?",RAINY
122 | "Footsteps on Wet Pavement_1-2.wav","What is the person stepping on?",LEAVES
123 | "hamlet Haanwijk autumn NL 03 151003_0804 ST.wav","What are the birds doing?",CHIRPING
124 | "hamlet Haanwijk autumn NL 03 151003_0804 ST.wav","What winged creatures are heard?",BIRDS
125 | "Hotel automatic skylight open and close, faint sirens nearby.wav","How many times does the machine noise pause?",TWO
126 | "Birds in the city 1.wav","How do the animals that are audible usually get around?",FLY
127 | "Birds in the city 1.wav","What is whistling?",BIRD
128 | "faucet3.wav","How many animals can be heard?",NONE
129 | "Garden Birds 3.wav","How many times does a bird make a whistling sound?",SEVEN
130 | "Garden Birds 3.wav","What are these creatures?",BIRDS
131 | "bagpipe_on_street_BA.wav","How many dogs bark?",ONE
132 | "bagpipe_on_street_BA.wav","What type of instrument produces this sound?",PIPES
133 | "024_House_InsideCarEngineStart.wav","What does the engine do before the end?",ACCELERATES
134 | "27 hn_birdspecking.wav","What weather condition can be heard in the recording?",RAIN
135 | "Garbage Truck.wav","How many machines are being operated?",FOUR
136 | "Garbage Truck.wav","How many times is there a sound of an object being compressed?",ONE
137 | "Curtains.wav","What tool is being used to clean up the leaves?",RAKE
138 | "THE_RATT23_1.wav","What is the person doing?",WALKING
139 | "THE_RATT23_1.wav","What is the person walking on?",GRAVEL
140 | "WalkingInSnowCrunchingIce.wav","How many people are walking?",ONE
141 | "LightRainOctober31st2015.wav","What is on fire?",FIREPLACE
142 | "PauseConference_youngerPeople.wav","What are all the people in this area doing?",TALKING
143 | "Siren Milan.wav","How many sirens are going off?",TWO
144 | "Siren Milan.wav","In what type of situation would this noise occur?",EMERGENCY
145 | "anykeystudio_apocalypse.wav","What object is producing the high pitched noise?",SIREN
146 | "Fowl - Chatter 1 - 96kHz.wav","What animal is making the most noise?",CHICKEN
147 | "IKEA_Cafeteria.wav","In what location are the people?",RESTAURANT
148 | "dragged-glass-object.wav","What is the person doing?",WRITING
149 | "Erny vs Deadman4.wav","What are the people doing?",TALKING
150 | "paper_cut.wav","What is making the cutting sound?",CUTTER
151 | "paper_cut.wav","What kind of substance is being cut into?",PAPER
152 | "20070303.duck.wav","How many times does the bird quack?",TWENTYEIGHT
153 | "Cooking rice.wav","What is making this noise?",WIND
154 | "hissy fizz.wav","How many times does the static pause?",FIVE
155 | "Kitchen faucet running fast and slow and filling glass of water.wav","What plumbing device is emitting water?",FAUCET
156 | "20080918.boots.door.wav","What is the person doing inside of the building?",WALKING
157 | "Boom_Folie_NoiseOnGlass.wav","How many people are speaking?",ZERO
158 | "upstairs.wav","How many people are going down the stairs ?",TWO
159 | "upstairs.wav","When does the door open?",END
160 | "drunk_teenagers_1.wav","who is talking besides men?",WOMEN
161 | "CAR_WASH.wav","What is an example of something carried by this vehicle?",PEOPLE
162 | "CAR_WASH.wav","What kind of vehicle is moving on the track?",TRAIN
163 | "Train Pass Koln.wav","what type of vehicle can be heard?",TRAIN
164 | "RadioFan.wav","What is being played in this clip?",TV
165 | "WeddingClap.wav","How is the crowd feeling?",HAPPY
166 | "WeddingClap.wav","What is the crowd doing?",CLAPPING
167 | "light suburban ambiance.wav","What animal can be heard?",BIRD
168 | "WaterOnMetal.wav","What is making the noise?",MACHINE
169 | "20081130_walking_in_snow_with_snowshoes.wav","how many footsteps are there?",TWENTYFOUR
170 | "20081130_walking_in_snow_with_snowshoes.wav","What is the person walking through?",SNOW
171 | "Airplane indoor ambience .wav","What is this sound?",AIRPLANE
172 | "Theater Chatter.wav","What are the people doing?",TALKING
173 | "Theater Chatter.wav","What part of their bodies are people using to make noise?",MOUTH
174 | "public.wav","What are they doing ?",CHEERING
175 | "public.wav","What type of mammal is screaming?",HUMAN
176 | "Oppedette cafe #1.wav","What is the dog doing?",HOWLING
177 | "Oppedette cafe #1.wav","What song is the man singing?",MUSIC
178 | "adw018raw.wav","How many times does the sound repeat itself?",FIFTEEN
179 | "adw018raw.wav","What is ringing?",BELL
180 | "Icy rain.wav","How many dogs can be heard barking?",ONE
181 | "Icy rain.wav","What animal is making noise in the background?",DOG
182 | "Index Card Flips (handle business paper mvmt) 02.wav","How many distinct snaps can be heard?",THIRTEEN
183 | "Index Card Flips (handle business paper mvmt) 02.wav","What is being shuffled?",CARDS
184 | "Waterfalls_00216.wav","How many distortions can be heard in the spraying sound?",ONE
185 | "WindInPylons.wav","What item can be heard blowing in the wind?",LEAVES
186 | "Art Gallery Tone.wav","What type of gallery did he say it was?",MUSIC
187 | "rain in tent.wav","What is falling?",RAIN
188 | "Lots of Geese.wav","Where is this noise coming from?",FARM
189 | "birds chirping 03 short.wav","How many birds are nearby?",TWO
190 | "birds chirping 03 short.wav","What sound does a bird make?",CHIRP
191 | "THE_RATT12_1.wav","Who is speaking?",MAN
192 | "food_prep_1_cw.wav","What activity is causing the sizzling?",FRYING
193 | "STE-008.wav","What sound is the train making?",HONKING
194 | "graffiti artist spraying NL 130611_02.wav","What is being sprayed on?",LIQUID
195 | "Sound_FX_Ambient_Street_cars+passing_by.wav","How many different vehicles can be heard?",TWO
196 | "bierfest_atmosphere.wav","What are the people doing?",CHATTING
197 | "bierfest_atmosphere.wav","What does the child do at the end?",LAUGH
198 | "MVI_4002-B.wav","What is driving over the tracks?",TRAIN
199 | "MVI_4002-B.wav","What type of train is this ?",SUBWAY
200 | "UrbanHerringGulls.wav","How many wings do each of these animals have?",TWO
201 | "Bicycle Chain Accel Crash.wav","What object just broke?",GLASS
202 | "Tunnel Creek.wav","Where is the man talking?",BATHROOM
203 | "Metallic  Gate.wav","What does the door keep doing?",SQUEAKING
204 | "Metallic  Gate.wav","What is creaking?",DOOR
205 | "cafecarusel_fan_hizz_EQ2.wav","What happens to the screeching sound in the middle?",INCREASE
206 | "walking indoors footsteps tap tap tapping foley.wav","What material are they stepping on?",WOOD
207 | "walking indoors footsteps tap tap tapping foley.wav","Where is this person walking?",HALL
208 | "Chainsaw Crosscutting  3.wav","What is the person doing with the lawn equipment?",MOWING
209 | "Chainsaw Crosscutting  3.wav","What tool is being used?",CHAINSAW
210 | "Room-tone rain-drips 1m 161015_1013.wav","How many types of sounds can be heard?",TWO
211 | "Room-tone rain-drips 1m 161015_1013.wav","What is the sound coming from?",PIPE
212 | "Water Driping 7.wav","what is the water doing?",TAP
213 | "street works_pressure_low rumble.wav","who speaks?",MAN
214 | "Street Noise - Cars - Ball Bouncing indistinct voices.wav","What is the vehicle moving along?",BIKE
215 | "Atmosphere on road in London.wav","What can be heard in the distance in the first half of the clip?",CAR
216 | "walking in gravel 2.wav","How many steps does the person take?",TWENTY
217 | "walking in gravel 2.wav","What is the person doing?",WALKING
218 | "AMB_swamp_summer_night_fish_insects_00.wav","How many times is it possible to hear an object moving in the water?",THREE
219 | "AMB_swamp_summer_night_fish_insects_00.wav","What type of creature is calling out?",CRICKET
220 | "LA Rain.wav","How many footsteps can be heard in the clip?",ZERO
221 | "Swings in Mauerpark, Berlin.wav","What device is moving?",SWING
222 | "Rain recording.wav","What element is coming from the sky?",WATER
223 | "Rain recording.wav","What kind of storm is it?",RAINSTORM
224 | "aftertherain.wav","What animal can be heard in the background?",BIRD
225 | "Stovetop Range w. City Ambience_1-2.wav","how often does the sound happen?",CONTINUOUSLY
226 | "Stovetop Range w. City Ambience_1-2.wav","What is leaking from a tube?",AIR
227 | "outdoors ambient distant village 3.wav","How many animals can be heard?",TWO
228 | "outdoors ambient distant village 3.wav","What animal can clearly be heard in the background at one point in the audio?",DOG
229 | "Office Lift 2.wav","What gender is the human?",FEMALE
230 | "Office Lift 2.wav","What is opened?",DOOR
231 | "20110220_churchbell.wav","How many times is the gong struck?",FOUR
232 | "20110220_churchbell.wav","What is being struck?",BELL
233 | "Nightingale.wav","How many chirps are there?",FIFTY
234 | "worktoilet.wav","What is being flushed?",WATER
235 | "worktoilet.wav","What is going down the drain?",WATER
236 | "cupboard door squeaks.wav","What is being opened and closed?",DOOR
237 | "river_mouth1.wav","what does the water make when it collides with itself?",SPLASH
238 | "fountains-Udaipur-Saheliyon-Ki-Bari-4.wav","What is falling to the ground?",RAIN
239 | "fountains-Udaipur-Saheliyon-Ki-Bari-4.wav","What is the ground?",PAVEMENT
240 | "Padlock.wav","What does the gadget seem to be made from?",METAL
241 | "Footsteps, Muddy, E.wav","How many steps does the person take?",TWENTY
242 | "Footsteps, Muddy, E.wav","What is being walked on?",MUD
243 | "squeaky metal swing.wav","What item is being moved back and forth to create the noise?",CAT
244 | "squeaky metal swing.wav","What playground feature makes this sound when in use?",SEESAW
245 | "Fairground 2 Ghost ride.wav","How many screams are there?",FOUR
246 | "SCC CLAPTER 20101210.wav","What are the people doing?",CLAPPING
247 | "SCC CLAPTER 20101210.wav","What emotion are the people expressing in the clip?",EXCITEMENT
248 | "FOOTSTEPS_005.wav","How is the person traveling?",WALKING
249 | "Sink_Running.wav","What is flowing?",WATER
250 | "Sink_Running.wav","what is the water doing?",DRAINING
251 | "F907 Church prayer f.wav","How many women speak?",ONE
252 | "F907 Church prayer f.wav","Who is responding to a single person?",CROWD
253 | "water dripping 2.wav","How many faucets are dripping water?",ONE
254 | "water dripping 2.wav","What is dripping?",WATER
255 | "20121014_boat_tour_01.wav","How many kinds of creatures are making noise?",TWO
256 | "20121014_boat_tour_01.wav","What are the people doing?",TALKING
257 | "Alps village field-recording distance.wav","What is making the mechanical noise?",TRUCK
258 | "moving glass pieces.wav","How many times is something struck?",TWELVE
259 | "moving glass pieces.wav","what is moving around?",GLASS
260 | "AMB_EXT_PARK_SUMMER_DAY_LOOP.wav","What is chirping?",CRICKET
261 | "AMB_EXT_PARK_SUMMER_DAY_LOOP.wav","What other type of insect can make this sound?",CRICKET
262 | "Rain_inside_of_a_Car.wav","What is be boiling?",POPCORN
263 | "morning breeze and birds.wav","What kind of animals are making noise?",BIRDS
264 | "morning breeze and birds.wav","What sound are the birds making?",CHIRPING
265 | "Corn Husking Sequence x2.wav","what happens to the tape?",PULLED
266 | "radio_static.wav","What machine is making the noise?",RADIO
267 | "md3trk2.wav","How many times does the noise repeat?",SIX
268 | "md3trk2.wav","What is spinning?",MOTOR
269 | "md4trk10.wav","What manual action creates this sound?",SHAKING
270 | "Bus Pulls Away.wav","What part of a car can be heard?",ENGINE
271 | "Cat Meowing.wav","What does the puppy keep doing?",GROWLING
272 | "Cat Meowing.wav","What is growling?",CAT
273 | "Household - Atmos - Wind Through Window.wav","How is the weather here?",WINDY
274 | "tentrain.wav","What is falling?",RAIN
275 | "Small Falling Water Onto Stones.wav","what is the water doing?",FLOWING
276 | "WOOD CHOPPING_ Chopping hard wood with metal Axe (SFX).wav","How many times is the object smacked?",SIXTEEN
277 | "WOOD CHOPPING_ Chopping hard wood with metal Axe (SFX).wav","Which time is the object smacked the quietest?",NONE
278 | "Wet_Soggy_Squishy_Footsteps.wav","When is the object being handled the gentlest?",BEGINNING
279 | "Air raid siren_rising.wav","What animal is chirping?",BIRD
280 | "Old metal window open and close.wav","How many times can the scraping sound be heard?",SIX
281 | "Old metal window open and close.wav","What does the general weight of this object seem to be?",LIGHT
282 | "bombolles.wav","How many times does the sound stop and start again?",THREE
283 | "bombolles.wav","What is making the bubbling noise?",LIQUID
284 | "vending machine action.wav","What does the machine being operated do?",LAUNDRY
285 | "vending machine action.wav","What is the person putting into the machine?",COINS
286 | "Marker Writing on Paper.wav","What is the person writing with?",PENCIL
287 | "crickets and owls.wav","how many times is there a loud screeching sound?",THREE
288 | "BobKessler-Spinning Tin Top.wav","How many times was the thing rolled down?",FOUR
289 | "Gravel_Sand Walking 1.wav","How many people are walking?",ONE
290 | "foley footsteps - raw.wav","Where is he?",OFFICE
291 | "OrchestraTuning2.wav","What are the people doing with their instruments?",PLAYING
292 | "OrchestraTuning2.wav","What is the tone of the music?",CLASSICAL
293 | "SonicSnap_GPSUK_Cockerel.wav","What bird is making a sound near the end?",ROOSTER
294 | "SonicSnap_GPSUK_Cockerel.wav","What is the sound that is coming from a vehicle?",SIREN
295 | "Muddy_steps_bush_birds_singing.wav","What animal is making noise?",BIRD
296 | "Muddy_steps_bush_birds_singing.wav","What is the person doing?",WALKING
297 | "Liverpool St Station main hall.wav","What device, on a vehicle, makes the high pitched squeal sound?",BRAKES
298 | "Cooking on Gas.wav","when does the sound cease?",NEVER
299 | "Blackbird tweet with waterfall background.wav","What is making the chirping noise?",BIRD
300 | "Greek Habitues - (Evosmos - Salonika) 16.18 28.09.wav","What is moving in the sky?",HELICOPTER
301 | "Greek Habitues - (Evosmos - Salonika) 16.18 28.09.wav","what is the helicopter doing?",FLYING
302 | "Chicharra1.wav","How many people can be heard moving around?",ONE
303 | "Watering Can.wav","What is turned on to release the water?",FAUCET
304 | "Car vs. Freight Train.wav","What causes the crashing noise?",TRAIN
305 | "Single cricket chirping during a summer evening in the city (with traffic noise).wav","what is making the high pitch sound?",INSECT
306 | "RG Large Old Dog Snoring.wav","How many times does the person breathe?",SIX
307 | "RG Large Old Dog Snoring.wav","What is the person doing?",SLEEPING
308 | "20160506_sharpening.02.wav","How many times does the sound repeat?",SIXTEEN
309 | "20160506_sharpening.02.wav","What object is being used to make the noise?",KNIFE
310 | "Thunder_01.wav","how many times does the thunder crack?",TWICE
311 | "ortam.wav","What object is making this deep noise?",GONG
312 | "ortam.wav","When is the pitch of the rumbling sound the highest?",CONSTANT
313 | "Brushing teeth.wav","What does the person spit out?",LIQUID
314 | "Brushing teeth.wav","What was the person cleaning?",TEETH
315 | "Fantasy Ambience.wav","What instrument is being played?",ORGAN
316 | "Glass bottles in and out of a basket.wav","How many taps are there?",THREE
317 | "Sound_FX_Kitchen_wash dish.wav","In what room is the person using a sink?",KITCHEN
318 | "Sound_FX_Kitchen_wash dish.wav","What fixture is the water coming out of?",TAP
319 | "20091211.barking.stairs.wav","What animal is making loud noise?",DOG
320 | "20091211.barking.stairs.wav","What is the dog doing continuously?",BARKING
321 | "Kiddie Train.wav","What is being expelled at the beginning?",AIR
322 | "Kiddie Train.wav","What type of vehicle can be heard?",TRAIN
323 | "Fryers Forest - Powerful Owl (Ninox Stenua).wav","how many times does the animal hoot?",EIGHT
324 | "Fryers Forest - Powerful Owl (Ninox Stenua).wav","What animal can be heard?",OWL
325 | "Boulevard SummerRiver calm 01 NL 160905_0961 0962.wav","What animal is chirping in the background?",BIRD
326 | "Boulevard SummerRiver calm 01 NL 160905_0961 0962.wav","What are the people doing?",TALKING
327 | "mab-kite-spool-20080727.wav","What gas is escaping from the appliance?",STEAM
328 | "20130723_Rain2.wav","what is driving in the rain?",VEHICLE
329 | "Cardiff Bay fireworks.wav","What is being celebrated?",PARTY
330 | "Cardiff Bay fireworks.wav","What is making the load bangs?",FIREWORKS
331 | "draining board metal drip on metal.wav","What is the water falling into?",BUCKET
332 | "drip rhythm1.wav","What liquid element do the drops fall into?",WATER
333 | "a gentle breeze, wind 4.wav","How soft is this sound?",SOFT
334 | "Electric Train Interior Atmos.wav","How many engines are running?:",TWO
335 | "atmo_kenting_national_park.wav","What covers the outside of the animal heard here?",FEATHERS
336 | "Tree Bark Cracks.wav","What can be heard breaking?",WOOD
337 | "tua-mirandela_train_arrival_march2007.wav","What kind of animal is in the background?",DOG
338 | "Outside wind.wav","What type of rainfall is this?",HEAVY
339 | "Outside wind.wav","What type of weather is this?",RAINY
340 | "Metal pipe hitting the ground.wav","How many times is something dropped?",SEVEN
341 | "Metal pipe hitting the ground.wav","What material is the object made from?",METAL
342 | "tornado day 1.wav","What is the weather like?",RAINY
343 | "1400 am static.wav","Where do the wheels of the vehicle rotate?",LOW
344 | "Busy Playground.wav","What animal is singing in the background?",BIRD
345 | "Busy Playground.wav","Where would these kids be playing at?",PLAYGROUND
346 | "Bizzare Atmosphere.wav","How many times is the first noise repeated?",FIFTEEN
347 | "Fast food soda with ice, sip slurp straw.wav","What is the person drinking?",SODA
348 | "Footsteps Dress Shoes Wood Floor.wav","How many steps does the person take?",FORTY
349 | "Footsteps Dress Shoes Wood Floor.wav","What is the activity that's taking place?",WALKING
350 | "fresound sample 2.wav","What type of instrument is making the main sound?",PIANO
351 | "slupia river.wav","How is the weather here?",RAINING
352 | "slupia river.wav","How many air bubbles can be heard?",ALOT
353 | "at the edge of the forest.wav","How many different noises can be heard?",THREE
354 | "20130405_wooden.stairs.floor.01.wav","What material is the floor?",WOOD
355 | "53 blackhead_minifjord_closeup.wav","What is making the engine noise?",AIRPLANE
356 | "53 blackhead_minifjord_closeup.wav","What is the occupation of the person operating the thing making the engine noise?",PILOT
357 | "plastic-straw-whistles.wav","What general art form is this?",MUSIC
358 | "plastic-straw-whistles.wav","What instrument is being played?",FLUTE
359 | "river + waterfall 2 .wav","what is the water doing?",RUNNING
360 | "By ther blacksmith-002.wav","How many times has the object been hit?",SEVENTYTHREE
361 | "By ther blacksmith-002.wav","What material is being hit?",METAL
362 | "Street market.wav","What are the people doing?",TALKING
363 | "Street market.wav","what gender of people are speaking?",FEMALE
364 | "Construction Zone.wav","From which part of the car do the noise come from?",ENGINE
365 | "d0_drips_04.wav","What is dripping into glass?",WATER
366 | "d0_drips_04.wav","What is the water being poured in?",BOWL
367 | "Ambience_hum_tuning fork.wav","How many times does the tone sound?",ONCE
368 | "Ambience_hum_tuning fork.wav","What repetitive miniscule movement is the item making that is struck, in order to make the sound?",VIBRATING
369 | "cricket chirp.wav","How many times does the insect chirp?",THIRTYFOUR
370 | "cricket chirp.wav","What insect is making the sound?",CRICKET
371 | "Omsk_Victory_park_1.wav","Which of the distant banging sounds is the loudest?",MIDDLE
372 | "windroar_constant_1m12s.wav","What other weather condition is occurring in the recording?",RAIN
373 | "Growing Hum.wav","How many times is the clicking heard at the beginning?",SIX
374 | "Growing Hum.wav","what type of material are the scissor blades made of?",METAL
375 | "Watervogels en riet Lichterveldestraat (HiPass).wav","What is the object that is travelling in the clip?",CAR
376 | "urinating on a wall.wav","How many water taps are open ?",ONE
377 | "urinating on a wall.wav","what is coming out of the hose?",WATER
378 | "Large Splashes.wav","How many splashes are there?",FIVE
379 | "Large Splashes.wav","What are things being dropped into?",WATER
380 | "20060523.grassland.wav","What animal is heard?",BIRDS
381 | "20060523.grassland.wav","What is blowing though the trees?",WIND
382 | "car alarm 130603.wav","What kind of alert is going off?",CAR
383 | "20080505_1306playground01.wav","What type of animal can be heard occasionally?",BIRD
384 | "porto_morning_tropical_birds_market_20.wav","What is making the most noise?",BIRDS
385 | "remix of 130879__frederic-font__05-hang-song-1.wav","How many instruments are there?",TWO
386 | "remix of 130879__frederic-font__05-hang-song-1.wav","What kind of instrument are they using?",PIANO
387 | "Thunder 03.wav","What is making that sound?",WIND
388 | "Canada Geese Squawk on a Pond with a Fountain.wav","What animal is making sounds?",BIRDS
389 | "indoors house ambient room tone distant neighbours 1.wav","What age range do the voices belong to at the beginning?",CHILDREN
390 | "indoors house ambient room tone distant neighbours 1.wav","What animal is making a sound at the end?",BIRD
391 | "Cityscape 04 090617.wav","What is rushing past?",TRAIN
392 | "Lluvia 1.wav","If there is too much of this, what disaster can it cause?",FLOOD
393 | "Lluvia 1.wav","What hits the ground?",RAIN
394 | "walkingondirtpath.wav","How many times do they kick forward?",TWO
395 | "walkingondirtpath.wav","What is the man stepping on?",GRAVEL
396 | "Shower Running 02.wav","What is constantly streaming?",WATER
397 | "Shower Running 02.wav","What touches the water as it drips?",HANDS
398 | "fireTruckFar NL 140109_00.wav","How many ambulances are heard passing by in the background ?",ONE
399 | "fireTruckFar NL 140109_00.wav","What vehicle is getting closer?",AMBULANCE
400 | "abandoned-ballroom-big-metal.wav","What is the pace of the person's movement?",SLOW
401 | "basement-stairs.wav","What is happening here?",WALKING
402 | "FP_Refrigerator_Door_Squeak.wav","How many times does the door open after being shut?",SIX
403 | "FP_Refrigerator_Door_Squeak.wav","What is creaking?",DOOR
404 | "10_lightning_kohchang.wav","How many times does thunder clap?",ONE
405 | "Birds Singing in a Small Town During Morning.wav","what hits the mic at the end?",WIND
406 | "Birds Singing in a Small Town During Morning.wav","What is the bird doing?",CHIRPING
407 | "coffee can.wav","What tool is being used?",SAW
408 | "BulletJuneEdited192012.wav","How many times does the machine rev up at the start of the clip?",ONE
409 | "Neighbourhood evening ambience.wav","What is the person doing?",WALKING
410 | "Neighbourhood evening ambience.wav","What kind of creature is chirping?",CRICKETS
411 | "ambience car.wav","What are the people who operate these modes of transportation known as?",DRIVE
412 | "ambience car.wav","What is making the noise?",CAR
413 | "20090412.bell.strikes.12.wav","How many people can be heard talking?",MANY
414 | "20090412.bell.strikes.12.wav","How many times does the bell ring?",EIGHT
415 | "mediterranean_sea_porticcio.wav","what is the water doing?",CRASHING
416 | "Bicin_Diputacion_Day_22-03-2009.wav","How many clicks are heard?",THIRTY
417 | "WasherEndofRestCycleStartFill-WashCycle.wav","where does this noise occur?",FACTORY
418 | "police_car_siren-esp.wav","What law enforcement would use this sound?",POLICE
419 | "Kung Fu Clothes Hits and Clothing Sounds.wav","how many taps are heard?",EIGHT
420 | "Kung Fu Clothes Hits and Clothing Sounds.wav","What is hit?",MAT
421 | "Library Ambience_large space.wav","What is the last sound called?",KNOCK
422 | "Library Ambience_large space.wav","What sort of noise coming from a human can briefly be heard in the background?",TALKING
423 | "Slushing in mouth.wav","What fluid does the man shake around?",WATER
424 | "Cualquiera.wav","What makes the squeaking sound?",DOOR
425 | "box_open_hit.wav","What is the sound being made repeatedly?",HITTING
426 | "Rainforest Morning Chorus.wav","What are communicating with each other?",BIRDS
427 | "Rainforest Morning Chorus.wav","What is chirping in the background?",BIRD
428 | "Tiergarten birds early morning.wav","what are the birds doing?",SINGING
429 | "Tiergarten birds early morning.wav","Where are the birds making this noise?",OUTSIDE
430 | "WalkingGravelTrailAugust2015.wav","How many people are present?",ONE
431 | "WalkingGravelTrailAugust2015.wav","What type of surface are they walking on?",GRAVEL
432 | "Ship Fender.wav","What are they using to start up the engine?",START
433 | "Ship Fender.wav","What tool is making the noise?",MOWER
434 | "JM_HOME&amp;OFFICE_Shower 01 - Taking a shower.wav","How many people can be heard?",NONE
435 | "Typing 5 lines.wav","What is the person using to type?",TYPEWRITER
436 | "20090407.airplane.wav","How many planes can be heard?",ONE
437 | "20091217.18.chains.wav","How many times can the clinking sound be heard?",EIGHT
438 | "20091217.18.chains.wav","What material is the object being manipulated made out of?",METAL
439 | "Light to heavy Rain.wav","What noise can be heard?",WATER
440 | "Cityscape construction site 2 100304.wav","How many cutting strokes does this person make ?",TWO
441 | "earth_movement.wav","This sound is reminiscent of what domesticated animal when it's happy?",CAT
442 | "Horse_Hooves_Hard_Floor_Interior.wav","how many sets of clomping noises are there?",THREE
443 | "Horse_Hooves_Hard_Floor_Interior.wav","Which part of their body is used to make the sound?",FINGERS
444 | "Kaffemaschine_1.wav","What is been used?",SAW
445 | "Kaffemaschine_1.wav","What is the sound the machine makes?",DRILLING
446 | "BoyRacer.wav","what is the engine doing?",REVVING
447 | "campanas.wav","what are the bells doing?",RINGING
448 | "rain_near_smooth.wav","How is the weather?",RAINY
449 | "rain_near_smooth.wav","how many times is thunder heard?",ONE
450 | "treefrogs.wav","What type of machine does this sound come from?",ENGINE
451 | "Quill pen writing on hard paper various speed.wav","What is someone doing?",WRITING
452 | "Waiting at a Montreal Subway Station.wav","Where is this sound made?",TUNNEL
453 | "Waiting at a Montreal Subway Station.wav","Which form of transportation is heard?",TRAIN
454 | "circuitbend03.wav","How many times does the buzzing stop?",TEN
455 | "natureatmosphere.wav","What are these people doing?",LISTENING
456 | "natureatmosphere.wav","what sense does the man refer to?",LISTENING
457 | "Pigeon Temple.wav","What type of gender voice preceded the laugh?",MALE
458 | "Pigeon Temple.wav","When does the person laugh?",END
459 | "London Overground train (interior) approaches Victoria Station.wav","What are the people on the train doing?",TALKING
460 | "London Overground train (interior) approaches Victoria Station.wav","What is this mode of transportation traveling on?",TRAIN
461 | "Storm sirens with dog bark at end 050627 24 bit.wav","How is the weather?",RAINY
462 | "Storm sirens with dog bark at end 050627 24 bit.wav","What type of weather can be heard?",RAIN
463 | "spring rain in the woods.wav","How many birds are singing?",ONE
464 | "spring rain in the woods.wav","Where are the birds?",OUTSIDE
465 | "water_boil_pour_stir-96.wav","How many times is there a different sound other than the sound that occurs the most?",ONCE
466 | "wooden barndoor.wav","How many clacks are heard?",FOUR
467 | "wooden barndoor.wav","What is the person opening?",DOOR
468 | "gargnano-sounds.wav","How many cars pass by?",ONE
469 | "gargnano-sounds.wav","Who is making the noise?",HUMAN
470 | "paper01.wav","What is the person doing?",READING
471 | "Rushing_water+wind-Rec_Samsung_HMX-F80_Camcorder.wav","What is making the noise?",WATER
472 | "RunningWater_BathTub_01.wav","What is the water going down?",DRAIN
473 | "Jet Engine 1.wav","What product is being processed?",METAL
474 | "Trompetistas.wav","How many trumpets are being played?",TWO
475 | "Trompetistas.wav","What is the person playing?",TRUMPET
476 | "TIKTOK_1.wav","How many times does the clock tick?",SIXTY
477 | "TIKTOK_1.wav","What is making the ticking noise?",CLOCK
478 | "OiseauNuit1.wav","What is making the chirping?",BIRDS
479 | "OiseauNuit1.wav","What is singing?",BIRDS
480 | "meadow brook bees.wav","What sweet product do the insects that make these sounds produce?",LIQUID
481 | "thunder-distant-20120709-s3.wav","What is making this sound?",THUNDER
482 | "ambience winter fountain birds .wav","What kind of vessel would be used in this substance?",BOAT
483 | "abandoned-ballroom-radiators.wav","What instrument is being played?",XYLOPHONE
484 | "abandoned-ballroom-radiators.wav","What is making the music?",XYLOPHONE
485 | "Wisper1.wav","What is the person doing?",WHISPERING
486 | "BlueJay.wav","How many single squawks does the loudest bird make?",SEVEN
487 | "California morning birds singing.wav","What animals are making the most sounds?",BIRDS
488 | "California morning birds singing.wav","What breed of bird is chirping?",SPARROW
489 | "steps_snow.wav","What cold substance is someone walking through?",SNOW
490 | "steps_snow.wav","What does the group keep doing?",WALKING
491 | "Lincoln Nebraska Tornado 5 9 2016.wav","What is creaking?",RADAR
492 | "Gentle rain outside balcony street noise.wav","How many engines can be heard?",TWO
493 | "Water Faucet HQ Stereo.wav","What moving substance is causing this sound?",WATER
494 | "mercury topaz starting.wav","What is the person doing before starting the car?",OPENING
495 | "mercury topaz starting.wav","What is the person doing in the vehicle?",PULLING
496 | "Iceland2013_Stokkur.wav","What are the people doing?",LAUGHING
497 | "Iceland2013_Stokkur.wav","Whos is laughing?",WOMAN
498 | "Paper_Parchment_Rustling.wav","What is the person doing with the bag?",OPENING
499 | "Paper_Parchment_Rustling.wav","What material is the rustled bag made of?",PAPER
500 | "Balloon Game at Arlington Heights Carnival.wav","How many children speak?",TWO
501 | "Balloon Game at Arlington Heights Carnival.wav","What kind of toy is the child asking for?",DOG
502 | "branch and wind in wood 1.wav","What can be heard blowing in the background?",WIND
503 | "spring, road.wav","how many times does the wood hit another surface?",FIVE
504 | "spring, road.wav","What type of material is making that impact noise ?",WOOD
505 | "Rishikesh Aarati.wav","How many times does this person cough ?",ONE
506 | "Rishikesh Aarati.wav","How many times is a horn honked?",FOUR
507 | "Stream # 2.wav","Name the fluid that is churning away?",WATER
508 | "Taking the car out of the garage.wav","What is making the noise?",MACHINE
509 | "Taking the car out of the garage.wav","What was in the garage ?",VEHICLE
510 | "Shower 2.wav","What is going down the drain?",WATER
511 | "Shower 2.wav","Which room does this sound occur in?",BATHROOM
512 | "005_musesdelight_charismatic-african-preacher.wav","What is the gender of the person speaking?",MALE
513 | "005_musesdelight_charismatic-african-preacher.wav","What place are the people in?",CHURCH
514 | "Boiling a cup of water.wav","What process needs to occur to the liquid, in order for it to make that sound?",BOILING
515 | "Toilet Shuffling.wav","What object was just used?",TOILET
516 | "down stars running 3.wav","What final action is the person doing at the very end of the sound?",STOMPING
517 | "PIT-ROIG 0.12-0.17.wav","What material are the boots made from?",RUBBER
518 | "Short Hailstorm.wav","What is pouring down in the sound?",RAIN
519 | "Short Hailstorm.wav","What material is the rain beating against?",METAL
520 | "Face slap CsG.wav","How many times is the slapping sound heard?",TWENTY
521 | "Face slap CsG.wav","What item is being hit against the object?",HAND
522 | "nxSample010.wav","What is banging?",METAL
523 | "nxSample010.wav","What is running?",WATER
524 | "Bubbles water.wav","What object is making the liquid bubble?",STRAW
525 | "squeaking wooden floor.wav","What is the final sound heard called?",SQUEAKING
526 | "fan_2_300513.wav","What is blowing out air?",FAN
527 | "Jumping onto a hard floor with shoes and some walking sounds.wav","What sport is associated with this sound?",TENNIS
528 | "20130723_Rain1.wav","How is the traffic?",SLOW
529 | "Kali Temple Soundscape.wav","What are the people doing?",TALKING
530 | "Parking Garage - Ambiance, Electrical Hum 1.wav","How many times does the buzzing pause?",ZERO
531 | "Parking Garage - Ambiance, Electrical Hum 1.wav","What is the sound that heard called?",BUZZING
532 | "knocking on a window or glass.wav","What is the person doing?",KNOCKING
533 | "cowshed.wav","How many times do the cows moo?",SIX
534 | "cowshed.wav","What animal is making the noise?",COW
535 | "Pag_Starigrad_crickets_birds_2.wav","How many different bird calls are heard?",THREE
536 | "Pag_Starigrad_crickets_birds_2.wav","What kind of animals are nearby?",BIRDS
537 | "Ext, distance village, light wind in tree-01.wav","How many different types of animals are there?",FOUR
538 | "microphonecontact_stereo.wav","What action is being done to one of the objects in this clip?",GRINDING
539 | "microphonecontact_stereo.wav","What is being eaten?",ICE
540 | "Small Boat Engine.wav","What does this machine do?",CONSTRUCTION
541 | "Small Boat Engine.wav","what gender of human speaks?",MALE
542 | "footsteps on beach.wav","How is the person travelling?",WALKING
543 | "door.of.bar.raining2.wav","What is heard over over talking crowd?",RAIN
544 | "door.of.bar.raining2.wav","What living mammal is making the sounds?",HUMAN
545 | "PassingMoped01.wav","What sound are the birds making?",CHIRPING
546 | "rain2.wav","What can be heard falling?",WATER
547 | "rain2.wav","What speed is the rain falling?",FAST
548 | "Hamamatsu-traffic-light-1.wav","the chiming usually happens when your car door is what?",OPEN
549 | "coastal road on the beach, scooter, motorcycle, car.wav","Around what kind of building would noises like these be a common?",AIRPORT
550 | "coastal road on the beach, scooter, motorcycle, car.wav","Which vehicle has the noisiest engine in this clip?",MOTORCYCLE
551 | "windup_dino_slow.wav","What is the person doing?",TYPING
552 | "radio tuning 2.wav","How would one describe the quality of the conversation?",BAD
553 | "radio tuning 2.wav","What electronic is being dialed?",RADIO
554 | "md1trk33-34.wav","To what does the hinges need?",OIL
555 | "md1trk33-34.wav","What rectangular object that opens and closes are the hinges likely attached to?",DOOR
556 | "washcloth.wav","The sound is of a ball hitting what?",FLOOR
557 | "Blackbird sounds.wav","In what alive thing does this creature usually build a nest?",TREE
558 | "Blackbird sounds.wav","What kind of animal is that?",BIRD
559 | "20100110.kitchen.wav","What is beeping?",MICROWAVE
560 | "20100110.kitchen.wav","what item makes the ding sound?",ALARM
561 | "Sword clanks 3.wav","how many bangs are there?",ELEVEN
562 | "Sword clanks 3.wav","What is the material that is making the sounds?",METAL
563 | "ambient text.wav","What genre of movie could this sound be played in?",HORROR
564 | "ambient text.wav","What style does the music belong to?",HORROR
565 | "Subping03.wav","What machine creates this noise?",ELECTRONIC
566 | "Subping03.wav","What shell allows the sound to reverberate?",SEA
567 | "STE-017 traffic bus stop.wav","How many cars pass by?",THREE
568 | "STE-017 traffic bus stop.wav","What are the cars doing?",DRIVING
569 | "Anti Air Gun (3 Sounds).wav","Name one of the guns that was fired?",MACHINE
570 | "Anti Air Gun (3 Sounds).wav","what does the gun do?",FIRE
571 | "Wet Footstpes Sidewalk . Metro Pass in Distance.wav","how many steps are taken?",FIFTY
572 | "Construction Sounds.wav","What is rattling here?",MACHINE
573 | "Little sreet behind a terrasse cafe.wav","What kind of noises are the animals making?",BARKING
574 | "131227_strumyk_1.wav","What is running?",WATER
575 | "131227_strumyk_1.wav","what is the water doing?",FLOWING
576 | "Wind_Whistling_Dorm_Window.wav","How long does it beep?",LONG
577 | "Wind_Whistling_Dorm_Window.wav","What type of sound is playing though out the clip?",WHISTLE
578 | "RKeaton_EMF366_12_Tearing Thick Paper.wav","How many tearing sequences are heard throughout?",FOUR
579 | "RKeaton_EMF366_12_Tearing Thick Paper.wav","What is the person doing to the paper?",TEARING
580 | "Gibbons of Dusit.wav","What animal is making these noises?",BIRD
581 | "Gibbons of Dusit.wav","Which instance of the whooping sound is the longest?",LAST
582 | "bebops_water1.wav","how many times does a wave collide with the shore?",FIVE
583 | "obresAranyo_trepant2.wav","What would this sound likely be made by if it were heard in a dentist's office?",DRILL
584 | "pr#6F9A9E.wav","how many times does the item drop?",EIGHT
585 | "Llantas_rechinando.wav","How many times is a screeching sound made?",TWELVE


--------------------------------------------------------------------------------