├── .gitignore
├── LICENSE
├── README.md
├── day01
│   ├── README.md
│   └── Seminar.ipynb
├── day02
│   ├── README.md
│   └── Seminar_Models_and_Pipeline.ipynb
├── day03
│   ├── README.md
│   ├── Seminar_WandB_and_Coding.ipynb
│   ├── Transformers-LauzHack-SummerBootcamp.pdf
│   ├── notebook_problems_examples
│   │   ├── Example_Structured.ipynb
│   │   ├── Example_Structured_Simple.ipynb
│   │   ├── Example_Unstructured_Ordered.ipynb
│   │   └── Example_Unstructured_Unordered.ipynb
│   └── pics
│       ├── git_four.png
│       ├── git_one.png
│       ├── git_three.png
│       └── git_two.png
├── day06
│   └── README.md
├── day07
│   ├── GNN_lecture_Ali_Hariri.pdf
│   └── README.md
├── day08
│   ├── Computer_Vision_13_02_2025.pdf
│   └── README.md
└── docs
    └── logo.png
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # seminar data and logs
10 | *mnist
11 | *speech_commands
12 | *wandb
13 | 
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | pip-wheel-metadata/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 | 
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 | 
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 | 
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 | 
59 | # Translations
60 | *.mo
61 | *.pot
62 | 
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 | 
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 | 
73 | # Scrapy stuff:
74 | .scrapy
75 | 
76 | # Sphinx documentation
77 | docs/_build/
78 | 
79 | # PyBuilder
80 | target/
81 | 
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 | 
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 | 
89 | # pyenv
90 | .python-version
91 | 
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 | 
99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100 | __pypackages__/
101 | 
102 | # Celery stuff
103 | celerybeat-schedule
104 | celerybeat.pid
105 | 
106 | # SageMath parsed files
107 | *.sage.py
108 | 
109 | # Environments
110 | .env
111 | .venv
112 | env/
113 | venv/
114 | ENV/
115 | env.bak/
116 | venv.bak/
117 | 
118 | # Spyder project settings
119 | .spyderproject
120 | .spyproject
121 | 
122 | # Rope project settings
123 | .ropeproject
124 | 
125 | # mkdocs documentation
126 | /site
127 | 
128 | # mypy
129 | .mypy_cache/
130 | .dmypy.json
131 | dmypy.json
132 | 
133 | # Pyre type checker
134 | .pyre/
135 | 
136 | .DS_Store
137 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 LauzHack
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![logo](docs/logo.png)
2 | 
3 | # Deep Learning Bootcamp
4 | 
5 | Since 2016, LauzHack has organized hackathons at EPFL in Lausanne, Switzerland. We also organize tech talks during the school year.
6 | 
7 | This is a repository for our Deep Learning Bootcamp (Winter 2025 Edition). For previous editions, see the [Previous Editions](#previous-editions) section.
8 | 
9 | # Syllabus
10 | 
11 | - [**day01**](./day01) Introduction to Deep Learning and PyTorch
12 |   - Lecture: Introduction to bootcamp and Deep Learning
13 |   - Seminar: Introduction to `pytorch`
14 | - [**day02**](./day02) Basic Model Architectures
15 |   - Lecture: Fully-connected and Convolutional Neural Networks, ResNet
16 |   - Seminar: Models in `pytorch` and training pipeline
17 | - [**day03**](./day03) Transformer and R&D Coding
18 |   - Lecture: Recurrent Neural Networks, BatchNorm, LayerNorm
19 |   - Seminar: RNN, LSTM, GRU example
20 |   - Lecture 2: Transformer
21 |   - Seminar 2: Implementation of Transformer in `pytorch`
22 | - [**day06**](./day06) Deep Learning for Audio
23 |   - Lecture: Representing sound digitally, tasks (denoising, speech recognition, text-to-speech, voice conversion, lip-sync)
24 | - [**day07**](./day07) Graph Neural Networks
25 |   - Lecture: Graph learning, applications, limitations
26 |   - Seminar: PyTorch-based examples of training GCN and SAGE architectures
27 | - [**day08**](./day08) Computer Vision
28 |   - Lecture: Diffusion models, Vision Transformers, Object Detection, Generalizability, Test-Time Training
29 |   - Seminar: Diffusion models and test-time training with MNIST
30 | 
31 | # Resources
32 | 
33 | - [Recordings on YouTube](https://youtube.com/playlist?list=PLpYenI2Zwc7ZpUcnP18vDOD__wiwbfoua)
34 | 
35 | # Contributors & bootcamp staff
36 | 
37 | Bootcamp materials and teaching were delivered by:
38 | 
39 | - Petr Grinberg
40 | - Seyed Parsa Neshaei
41 | - Eric Bezzam
42 | - Ali Hariri
43 | - Nikita Durasov
44 | - Federico Stella (Previously)
45 | - Atli Kosson (Previously)
46 | - Cristian Cioflan (Previously)
47 | - Skander Moalla (Previously)
48 | - Vinitra Swamy (Previously)
49 | 
50 | # Previous Editions
51 | 
52 | - [Summer 2024](https://github.com/LauzHack/deep-learning-bootcamp/tree/summer24/)
53 | 
--------------------------------------------------------------------------------
/day01/README.md:
--------------------------------------------------------------------------------
1 | # Day 01
2 | 
3 | Introduction to Deep Learning, Petr Grinberg
4 | 
5 | - [Lecture slides](https://docs.google.com/presentation/d/1K0NJLjn4aocm723y0oCfdeu-cIbUE-PUcrU_3dvPLe0/edit?usp=sharing)
6 | - [Seminar](Seminar.ipynb)
7 | - [Recording on YouTube](https://youtu.be/qtY6RTqKOsE)
8 | 
--------------------------------------------------------------------------------
/day02/README.md:
--------------------------------------------------------------------------------
1 | # Day 02
2 | 
3 | Fully-Connected and Convolutional Neural Networks, Petr Grinberg
4 | 
5 | - [Lecture slides](https://docs.google.com/presentation/d/1gtTyTvXqCscoXwGfAfIUhafLTKVKbLmUUNH93LfMbqo/edit?usp=sharing)
6 | - [Seminar](Seminar_Models_and_Pipeline.ipynb)
7 | - [Recording on YouTube](https://youtu.be/3EDUGwWxZFw)
8 | 
--------------------------------------------------------------------------------
/day03/README.md:
--------------------------------------------------------------------------------
1 | # Day 03
2 | 
3 | Part One: Transformer, Seyed Parsa Neshaei
4 | * [Lecture slides](Transformers-LauzHack-SummerBootcamp.pdf)
5 | * [Seminar](https://github.com/hermanmichaels/transformer_example)
6 | * [Recording on YouTube](https://youtu.be/SleVoySdPoA)
7 | 
8 | Part Two: Improved logging and R&D coding techniques, Petr Grinberg
9 | * [Seminar](Seminar_WandB_and_Coding.ipynb)
10 | * [Project template](https://github.com/Blinorot/pytorch_project_template)
11 | * [Recording on YouTube](https://youtu.be/sEA-Js5ZHxU)
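
A minimal sketch of the logging pattern that Part Two and the example notebooks in `notebook_problems_examples/` build on is shown below. It is not part of the seminar materials: the model, data, learning rate, and project/run names are toy placeholders, and running it requires a Weights & Biases account (`wandb login`). The idea is simply to open a run with `wandb.init`, log a metric dictionary with `wandb.log` at each training step, and let the context manager close the run.

```python
# Hypothetical minimal example of per-step W&B logging in a training loop.
# Model, data, and project/run names are placeholders, not the seminar's setup.
import torch
from torch import nn
import wandb

model = nn.Linear(10, 2)                                   # stand-in model
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

with wandb.init(project="logging-demo", name="toy-run"):   # opens and auto-closes a run
    for step in range(100):
        x = torch.randn(32, 10)                            # toy batch of features
        y = torch.randint(0, 2, (32,))                     # toy integer labels
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # log scalar metrics against the global step, as in the example notebooks
        wandb.log({"train_step_loss": loss.item()}, step=step)
```

The structured example notebook in `notebook_problems_examples/` wraps the same calls around a full keyword-spotting pipeline, logging per-step loss, learning rate, and accuracy plus per-epoch validation metrics.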
-------------------------------------------------------------------------------- /day03/Transformers-LauzHack-SummerBootcamp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LauzHack/deep-learning-bootcamp/40fc389c0b1e6639b7525dc33a54e5fcb0563c30/day03/Transformers-LauzHack-SummerBootcamp.pdf -------------------------------------------------------------------------------- /day03/notebook_problems_examples/Example_Structured_Simple.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ad755e7f-097a-47ef-af96-db14f4c617e5", 6 | "metadata": {}, 7 | "source": [ 8 | "This notebook is based on [DLA Seminar](https://github.com/markovka17/dla/blob/2022/week06/seminar.ipynb)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "1a09c197-776c-433f-9275-9561f2000d0e", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import dataclasses\n", 19 | "import torch\n", 20 | "from typing import Tuple, Union, List, Callable, Optional\n", 21 | "\n", 22 | "@dataclasses.dataclass\n", 23 | "class TaskConfig:\n", 24 | " keyword: str = 'sheila' # We will use 1 key word -- 'sheila'\n", 25 | " batch_size: int = 128\n", 26 | " learning_rate: float = 3e-4\n", 27 | " weight_decay: float = 1e-5\n", 28 | " num_epochs: int = 20\n", 29 | " n_mels: int = 40\n", 30 | " cnn_out_channels: int = 8\n", 31 | " kernel_size: Tuple[int, int] = (5, 20)\n", 32 | " stride: Tuple[int, int] = (2, 8)\n", 33 | " hidden_size: int = 64\n", 34 | " gru_num_layers: int = 2\n", 35 | " bidirectional: bool = False\n", 36 | " num_classes: int = 2\n", 37 | " sample_rate: int = 16000\n", 38 | " device: torch.device = torch.device(\n", 39 | " 'cuda:0' if torch.cuda.is_available() else 'cpu')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "d3dc86bd-98f1-498c-8d92-17ff3e8769f7", 45 | "metadata": {}, 46 | "source": [ 47 | "# Dataset" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "id": "2bc4b4bf-3687-4885-a826-0beaae4b0146", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# !wget http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz -O speech_commands_v0.01.tar.gz\n", 58 | "# !mkdir speech_commands && tar -C speech_commands -xvzf speech_commands_v0.01.tar.gz 1> log" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "id": "39dee0e1-4b94-44e1-ab94-a471ec5d028d", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "from torch.utils.data import Dataset, DataLoader" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "id": "a3125256-a635-4a7f-86dc-175bedcdd3dc", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "import pandas as pd\n", 79 | "from pathlib import Path\n", 80 | "import json\n", 81 | "from collections import OrderedDict\n", 82 | "\n", 83 | "def read_json(fname):\n", 84 | " fname = Path(fname)\n", 85 | " with fname.open(\"rt\") as handle:\n", 86 | " return json.load(handle, object_hook=OrderedDict)\n", 87 | "\n", 88 | "\n", 89 | "def write_json(content, fname):\n", 90 | " fname = Path(fname)\n", 91 | " with fname.open(\"wt\") as handle:\n", 92 | " json.dump(content, handle, indent=4, sort_keys=False)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "id": "f46b0a0b-40b9-44e4-b784-df2a8443cb57", 99 | "metadata": {}, 100 | "outputs": [], 101 | 
"source": [ 102 | "class SpeechCommandDataset(Dataset):\n", 103 | "\n", 104 | " def __init__(\n", 105 | " self,\n", 106 | " transform: Optional[Callable] = None,\n", 107 | " path2dir: str = None,\n", 108 | " keywords: Union[str, List[str]] = None,\n", 109 | " csv: Optional[pd.DataFrame] = None,\n", 110 | " part: \"str\" = \"train\",\n", 111 | " ): \n", 112 | " self.transform = transform\n", 113 | "\n", 114 | " self.path2dir = path2dir\n", 115 | " self.keywords = keywords\n", 116 | " self.index = self.create_or_load_index(part)\n", 117 | "\n", 118 | " def create_or_load_index(self, part):\n", 119 | " index_path = Path(f\"{part}_index.json\")\n", 120 | " \n", 121 | " if not index_path.exists():\n", 122 | " self.create_index(part)\n", 123 | " \n", 124 | " return read_json(index_path)\n", 125 | "\n", 126 | " def create_index(self, part):\n", 127 | " path2dir = Path(self.path2dir)\n", 128 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 129 | " \n", 130 | " all_keywords = [\n", 131 | " p.stem for p in path2dir.glob('*')\n", 132 | " if p.is_dir() and not p.stem.startswith('_')\n", 133 | " ]\n", 134 | "\n", 135 | " index = []\n", 136 | " for keyword in all_keywords:\n", 137 | " paths = (path2dir / keyword).rglob('*.wav')\n", 138 | " if keyword in keywords:\n", 139 | " for path2wav in paths:\n", 140 | " index.append({\n", 141 | " \"path\": path2wav.as_posix(),\n", 142 | " \"keyword\": keyword,\n", 143 | " \"label\": 1\n", 144 | " })\n", 145 | " else:\n", 146 | " for path2wav in paths:\n", 147 | " index.append({\n", 148 | " \"path\": path2wav.as_posix(),\n", 149 | " \"keyword\": keyword,\n", 150 | " \"label\": 0\n", 151 | " })\n", 152 | "\n", 153 | " torch.manual_seed(0)\n", 154 | " indexes = torch.randperm(len(index))\n", 155 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 156 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 157 | "\n", 158 | " train_index = [index[i] for i in train_indexes]\n", 159 | " val_index = [index[i] for i in val_indexes]\n", 160 | "\n", 161 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 162 | " write_json(train_index, str(train_index_path))\n", 163 | " \n", 164 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 165 | " write_json(val_index, str(val_index_path))\n", 166 | "\n", 167 | " def __getitem__(self, ind: int):\n", 168 | " instance = self.index[ind]\n", 169 | "\n", 170 | " path2wav = instance['path']\n", 171 | " wav, sr = torchaudio.load(path2wav)\n", 172 | " wav = wav.sum(dim=0)\n", 173 | " \n", 174 | " if self.transform:\n", 175 | " wav = self.transform(wav)\n", 176 | "\n", 177 | " return {\n", 178 | " 'wav': wav,\n", 179 | " 'keywors': instance['keyword'],\n", 180 | " 'label': instance['label']\n", 181 | " }\n", 182 | "\n", 183 | " def __len__(self):\n", 184 | " return len(self.index)\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "adc47254-b50a-428f-a1f0-c79d4f2348a9", 190 | "metadata": {}, 191 | "source": [ 192 | "## Augmentations" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 6, 198 | "id": "07b777e5-5a57-4251-90e5-f291a540a935", 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "import torchaudio\n", 203 | "\n", 204 | "class AugsCreation:\n", 205 | "\n", 206 | " def __init__(self):\n", 207 | " self.background_noises = [\n", 208 | " 'speech_commands/_background_noise_/white_noise.wav',\n", 209 | " 'speech_commands/_background_noise_/dude_miaowing.wav',\n", 210 | " 
'speech_commands/_background_noise_/doing_the_dishes.wav',\n", 211 | " 'speech_commands/_background_noise_/exercise_bike.wav',\n", 212 | " 'speech_commands/_background_noise_/pink_noise.wav',\n", 213 | " 'speech_commands/_background_noise_/running_tap.wav'\n", 214 | " ]\n", 215 | "\n", 216 | " self.noises = [\n", 217 | " torchaudio.load(p)[0].squeeze()\n", 218 | " for p in self.background_noises\n", 219 | " ]\n", 220 | "\n", 221 | " def add_rand_noise(self, audio):\n", 222 | "\n", 223 | " # randomly choose noise\n", 224 | " noise_num = torch.randint(low=0, high=len(\n", 225 | " self.background_noises), size=(1,)).item()\n", 226 | " noise = self.noises[noise_num]\n", 227 | "\n", 228 | " noise_level = torch.Tensor([1]) # [0, 40]\n", 229 | "\n", 230 | " noise_energy = torch.norm(noise)\n", 231 | " audio_energy = torch.norm(audio)\n", 232 | " alpha = (audio_energy / noise_energy) * \\\n", 233 | " torch.pow(10, -noise_level / 20)\n", 234 | "\n", 235 | " start = torch.randint(\n", 236 | " low=0,\n", 237 | " high=max(int(noise.size(0) - audio.size(0) - 1), 1),\n", 238 | " size=(1,)\n", 239 | " ).item()\n", 240 | " noise_sample = noise[start: start + audio.size(0)]\n", 241 | "\n", 242 | " audio_new = audio + alpha * noise_sample\n", 243 | " audio_new.clamp_(-1, 1)\n", 244 | " return audio_new\n", 245 | "\n", 246 | " def __call__(self, wav):\n", 247 | " aug_num = torch.randint(low=0, high=4, size=(1,)).item() # choose 1 random aug from augs\n", 248 | " augs = [\n", 249 | " lambda x: x,\n", 250 | " lambda x: (x + torch.distributions.Normal(0, 0.01).sample(x.size())).clamp_(-1, 1),\n", 251 | " lambda x: torchaudio.transforms.Vol(.25)(x),\n", 252 | " lambda x: self.add_rand_noise(x)\n", 253 | " ]\n", 254 | "\n", 255 | " return augs[aug_num](wav)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 7, 261 | "id": "3f45ccf7-aafa-46b0-9473-e27fbefae2e0", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "train_dataset = SpeechCommandDataset(\n", 266 | " path2dir='speech_commands', keywords=TaskConfig.keyword, part=\"train\", transform=AugsCreation()\n", 267 | ")\n", 268 | "val_dataset = SpeechCommandDataset(\n", 269 | " path2dir='speech_commands', keywords=TaskConfig.keyword, part=\"val\"\n", 270 | ")" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 8, 276 | "id": "3b336ca8-c552-43d6-8d76-e7398d921f06", 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "[OrderedDict([('path', 'speech_commands/nine/7c75a504_nohash_1.wav'),\n", 283 | " ('keyword', 'nine'),\n", 284 | " ('label', 0)]),\n", 285 | " OrderedDict([('path', 'speech_commands/marvin/f9af823e_nohash_1.wav'),\n", 286 | " ('keyword', 'marvin'),\n", 287 | " ('label', 0)])]" 288 | ] 289 | }, 290 | "execution_count": 8, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "train_dataset.index[:2]" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "id": "80b4e209-71ff-47ff-9dc7-0bd6d601d8b2", 302 | "metadata": {}, 303 | "source": [ 304 | "## Collate Fn" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 9, 310 | "id": "dbb86465-6cf1-40cb-9e45-102843000e70", 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "from torch.nn.utils.rnn import pad_sequence\n", 315 | "\n", 316 | "def collate_fn(data):\n", 317 | " wavs = []\n", 318 | " labels = [] \n", 319 | "\n", 320 | " for el in data:\n", 321 | " wavs.append(el['wav'])\n", 322 | " 
labels.append(el['label'])\n", 323 | "\n", 324 | " # torch.nn.utils.rnn.pad_sequence takes list(Tensors) and returns padded (with 0.0) Tensor\n", 325 | " wavs = pad_sequence(wavs, batch_first=True) \n", 326 | " labels = torch.Tensor(labels).long()\n", 327 | " return wavs, labels" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "id": "d8ab710b-172c-401b-b871-5861b9bb4981", 333 | "metadata": {}, 334 | "source": [ 335 | "## DataLoader" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 10, 341 | "id": "2dc73f86-847c-46cc-bb1c-df557e141748", 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "train_dataloader = DataLoader(train_dataset, batch_size=TaskConfig.batch_size,\n", 346 | " shuffle=False, collate_fn=collate_fn,\n", 347 | " num_workers=2, pin_memory=True)\n", 348 | "\n", 349 | "val_dataloader = DataLoader(val_dataset, batch_size=TaskConfig.batch_size,\n", 350 | " shuffle=False, collate_fn=collate_fn,\n", 351 | " num_workers=2, pin_memory=True)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "id": "052244e7-cf98-49bb-aeea-31eb8c4c970d", 357 | "metadata": {}, 358 | "source": [ 359 | "# Model" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 11, 365 | "id": "3797e444-a0ee-44a9-9d1a-d3c83e29f6f3", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "from torch import nn" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 12, 375 | "id": "130538c5-e66e-411a-9b7d-3ff978948365", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "class LogMelspec(nn.Module):\n", 380 | "\n", 381 | " def __init__(self, config):\n", 382 | " super().__init__()\n", 383 | " self.melspec = torchaudio.transforms.MelSpectrogram(\n", 384 | " sample_rate=config.sample_rate,\n", 385 | " n_fft=400,\n", 386 | " win_length=400,\n", 387 | " hop_length=160,\n", 388 | " n_mels=config.n_mels\n", 389 | " )\n", 390 | "\n", 391 | " self.spec_augs = nn.Sequential(\n", 392 | " torchaudio.transforms.FrequencyMasking(freq_mask_param=15),\n", 393 | " torchaudio.transforms.TimeMasking(time_mask_param=35),\n", 394 | " )\n", 395 | "\n", 396 | "\n", 397 | " def __call__(self, batch):\n", 398 | " x = torch.log(self.melspec(batch).clamp_(min=1e-9, max=1e9))\n", 399 | " if self.training:\n", 400 | " x = self.spec_augs(x)\n", 401 | " return x" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 13, 407 | "id": "adf8be91-a439-42ad-bd77-77b5be595190", 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "CRNN(\n", 414 | " (mel_spec): LogMelspec(\n", 415 | " (melspec): MelSpectrogram(\n", 416 | " (spectrogram): Spectrogram()\n", 417 | " (mel_scale): MelScale()\n", 418 | " )\n", 419 | " (spec_augs): Sequential(\n", 420 | " (0): FrequencyMasking()\n", 421 | " (1): TimeMasking()\n", 422 | " )\n", 423 | " )\n", 424 | " (conv): Sequential(\n", 425 | " (0): Conv2d(1, 8, kernel_size=(5, 20), stride=(2, 8))\n", 426 | " (1): Flatten(start_dim=1, end_dim=2)\n", 427 | " )\n", 428 | " (gru): GRU(144, 64, num_layers=2, batch_first=True, dropout=0.1)\n", 429 | " (attention): Attention(\n", 430 | " (energy): Sequential(\n", 431 | " (0): Linear(in_features=64, out_features=64, bias=True)\n", 432 | " (1): Tanh()\n", 433 | " (2): Linear(in_features=64, out_features=1, bias=True)\n", 434 | " )\n", 435 | " )\n", 436 | " (classifier): Linear(in_features=64, out_features=2, bias=True)\n", 437 | ")" 438 | ] 439 | }, 440 | "execution_count": 13, 441 | 
"metadata": {}, 442 | "output_type": "execute_result" 443 | } 444 | ], 445 | "source": [ 446 | "class Attention(nn.Module):\n", 447 | "\n", 448 | " def __init__(self, hidden_size: int):\n", 449 | " super().__init__()\n", 450 | "\n", 451 | " self.energy = nn.Sequential(\n", 452 | " nn.Linear(hidden_size, hidden_size),\n", 453 | " nn.Tanh(),\n", 454 | " nn.Linear(hidden_size, 1)\n", 455 | " )\n", 456 | " \n", 457 | " def forward(self, input):\n", 458 | " energy = self.energy(input)\n", 459 | " alpha = torch.softmax(energy, dim=-2)\n", 460 | " return (input * alpha).sum(dim=-2)\n", 461 | "\n", 462 | "class CRNN(nn.Module):\n", 463 | "\n", 464 | " def __init__(self, config: TaskConfig):\n", 465 | " super().__init__()\n", 466 | " self.config = config\n", 467 | "\n", 468 | " self.mel_spec = LogMelspec(config)\n", 469 | "\n", 470 | " self.conv = nn.Sequential(\n", 471 | " nn.Conv2d(\n", 472 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 473 | " kernel_size=config.kernel_size, stride=config.stride\n", 474 | " ),\n", 475 | " nn.Flatten(start_dim=1, end_dim=2),\n", 476 | " )\n", 477 | "\n", 478 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 479 | " config.stride[0] + 1\n", 480 | " \n", 481 | " self.gru = nn.GRU(\n", 482 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 483 | " hidden_size=config.hidden_size,\n", 484 | " num_layers=config.gru_num_layers,\n", 485 | " dropout=0.1,\n", 486 | " bidirectional=config.bidirectional,\n", 487 | " batch_first=True\n", 488 | " )\n", 489 | "\n", 490 | " self.attention = Attention(config.hidden_size)\n", 491 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 492 | " \n", 493 | " def forward(self, input):\n", 494 | " input = self.mel_spec(input)\n", 495 | " \n", 496 | " input = input.unsqueeze(dim=1)\n", 497 | " conv_output = self.conv(input).transpose(-1, -2)\n", 498 | " gru_output, _ = self.gru(conv_output)\n", 499 | " contex_vector = self.attention(gru_output)\n", 500 | " output = self.classifier(contex_vector)\n", 501 | " return output\n", 502 | "\n", 503 | "config = TaskConfig()\n", 504 | "model = CRNN(config)\n", 505 | "model" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 14, 511 | "id": "275de3c2-9dfd-477b-98ba-e42e4ceef70d", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "data": { 516 | "text/plain": [ 517 | "tensor([[0.2782, 0.1052]], grad_fn=)" 518 | ] 519 | }, 520 | "execution_count": 14, 521 | "metadata": {}, 522 | "output_type": "execute_result" 523 | } 524 | ], 525 | "source": [ 526 | "model(train_dataset[0][\"wav\"].unsqueeze(0))" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "id": "3109817f-b675-44b8-800a-a1f1cf6b095a", 532 | "metadata": {}, 533 | "source": [ 534 | "# Training pipeline" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 21, 540 | "id": "18e98516-e71b-4afe-9c2c-59ca6cbd7ee0", 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "text/plain": [ 546 | "True" 547 | ] 548 | }, 549 | "execution_count": 21, 550 | "metadata": {}, 551 | "output_type": "execute_result" 552 | } 553 | ], 554 | "source": [ 555 | "from tqdm.auto import tqdm\n", 556 | "import wandb\n", 557 | "\n", 558 | "wandb.login()" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 16, 564 | "id": "f20e5fe8-f64e-43bd-a308-0c504cea4785", 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "criterion = nn.CrossEntropyLoss()" 569 | ] 570 | }, 571 | { 
572 | "cell_type": "code", 573 | "execution_count": 17, 574 | "id": "6dc333e1-65e5-4f2f-8b3b-825801d0c5d8", 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "# FA - true: 0, model: 1\n", 579 | "# FR - true: 1, model: 0\n", 580 | "\n", 581 | "def count_fa(preds, labels):\n", 582 | " preds = torch.argmax(preds, dim=-1)\n", 583 | "\n", 584 | " FA = torch.sum(preds[labels == 0])\n", 585 | " \n", 586 | " # torch.numel - returns total number of elements in tensor\n", 587 | " return FA.item() / torch.numel(preds)\n", 588 | "\n", 589 | "def count_fr(preds, labels):\n", 590 | " preds = torch.argmax(preds, dim=-1)\n", 591 | "\n", 592 | " FR = torch.sum(labels[preds == 0])\n", 593 | " \n", 594 | " # torch.numel - returns total number of elements in tensor\n", 595 | " return FR.item() / torch.numel(preds)\n", 596 | "\n", 597 | "def count_acc(preds, labels):\n", 598 | " preds = torch.argmax(preds, dim=-1)\n", 599 | "\n", 600 | " acc = torch.sum(preds == labels)\n", 601 | " \n", 602 | " # torch.numel - returns total number of elements in tensor\n", 603 | " return acc.item() / torch.numel(preds)" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 18, 609 | "id": "2d87ba59-936c-45b6-8468-2dd88c23a032", 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "def train_one_epoch(model, dataloader, criterion, optimizer, scheduler, device, epoch):\n", 614 | " model.train()\n", 615 | "\n", 616 | " avg_loss = 0\n", 617 | " step = epoch * len(dataloader)\n", 618 | " for batch_idx, (wav, label) in tqdm(enumerate(dataloader), total=len(dataloader)):\n", 619 | " wav, label = wav.to(device), label.to(device)\n", 620 | "\n", 621 | " preds = model(wav)\n", 622 | " loss = criterion(preds, label)\n", 623 | "\n", 624 | " loss.backward()\n", 625 | " optimizer.step()\n", 626 | " optimizer.zero_grad()\n", 627 | " scheduler.step()\n", 628 | "\n", 629 | " avg_loss += loss.item()\n", 630 | " \n", 631 | " wandb.log({\n", 632 | " \"train_step_loss\": loss.item(),\n", 633 | " \"lr\": scheduler.get_last_lr()[0], # get current lr for the 0th param group\n", 634 | " \"acc_step\": count_acc(preds, label),\n", 635 | " \"fa_step\": count_fa(preds, label),\n", 636 | " \"fr_step\": count_fr(preds, label),\n", 637 | " }, step=step + batch_idx)\n", 638 | "\n", 639 | " if batch_idx == 0: \n", 640 | " wandb.log({\"train_image\": wandb.Audio(wav[0].detach().cpu().numpy(), sample_rate=16000,\n", 641 | " caption=f\"Label: {label[0]}, Pred: {preds[0].argmax(-1)}\")},\n", 642 | " step=step+batch_idx)\n", 643 | "\n", 644 | " avg_loss = avg_loss / (batch_idx + 1)\n", 645 | " return avg_loss\n", 646 | "\n", 647 | "\n", 648 | "def evaluate(model, dataloader, criterion, device):\n", 649 | " model.eval()\n", 650 | "\n", 651 | " avg_loss = 0\n", 652 | " accuracy = 0\n", 653 | " fa = 0\n", 654 | " fr = 0\n", 655 | " total_elements = 0\n", 656 | " for batch_idx, (wav, label) in enumerate(dataloader):\n", 657 | " wav, label = wav.to(device), label.to(device)\n", 658 | "\n", 659 | " preds = model(wav)\n", 660 | " loss = criterion(preds, label)\n", 661 | "\n", 662 | " accuracy += count_acc(preds, label)\n", 663 | " fa += count_fa(preds, label)\n", 664 | " fr += count_fr(preds, label)\n", 665 | " avg_loss += loss.item()\n", 666 | " \n", 667 | "\n", 668 | " avg_loss = avg_loss / (batch_idx + 1)\n", 669 | " accuracy = accuracy / (batch_idx + 1)\n", 670 | " fa = fa / (batch_idx + 1)\n", 671 | " fr = fr / (batch_idx + 1)\n", 672 | "\n", 673 | " return avg_loss, accuracy, fa, fr\n", 674 | "\n", 675 | "\n", 
676 | "def train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, device, n_epochs):\n", 677 | "\n", 678 | " train_avg_losses = []\n", 679 | " val_avg_losses = []\n", 680 | " val_accuracy_list = []\n", 681 | "\n", 682 | " for epoch in range(n_epochs): \n", 683 | " train_avg_loss = train_one_epoch(model, train_dataloader, criterion, optimizer, scheduler, device, epoch)\n", 684 | " val_avg_loss, val_accuracy, val_fa, val_fr = evaluate(model, val_dataloader, criterion, device)\n", 685 | "\n", 686 | " wandb.log({\n", 687 | " \"train_avg_loss\": train_avg_loss, \n", 688 | " \"val_avg_loss\": val_avg_loss,\n", 689 | " \"val_accuracy\": val_accuracy,\n", 690 | " \"val_fa\": val_fa,\n", 691 | " \"val_fr\": val_fr,\n", 692 | " }, step=(epoch + 1) * len(train_dataloader))" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 19, 698 | "id": "2d1fb606-b1f6-4f7e-b656-2523402950a8", 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [ 702 | "model.to(config.device)\n", 703 | "\n", 704 | "NUM_EPOCHS = 2\n", 705 | "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)\n", 706 | "scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS * len(train_dataloader), eta_min=1e-4)" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "id": "deffa1c5-bee8-441b-aee7-8d5eba4f9a38", 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [ 716 | "with wandb.init(\n", 717 | " project=\"seminar_wandb_kws\", # project name\n", 718 | " name=\"crnn\" # run name within the project\n", 719 | " ) as run:\n", 720 | " train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, config.device, NUM_EPOCHS)" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": null, 726 | "id": "da9bdc8f-075f-4171-a3bf-3362159c8a6b", 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "id": "dfbf78d5-80dc-417f-b444-5f30a031397f", 735 | "metadata": {}, 736 | "outputs": [], 737 | "source": [] 738 | } 739 | ], 740 | "metadata": { 741 | "kernelspec": { 742 | "display_name": "Python 3 (ipykernel)", 743 | "language": "python", 744 | "name": "python3" 745 | }, 746 | "language_info": { 747 | "codemirror_mode": { 748 | "name": "ipython", 749 | "version": 3 750 | }, 751 | "file_extension": ".py", 752 | "mimetype": "text/x-python", 753 | "name": "python", 754 | "nbconvert_exporter": "python", 755 | "pygments_lexer": "ipython3", 756 | "version": "3.9.7" 757 | } 758 | }, 759 | "nbformat": 4, 760 | "nbformat_minor": 5 761 | } 762 | -------------------------------------------------------------------------------- /day03/notebook_problems_examples/Example_Unstructured_Ordered.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ad755e7f-097a-47ef-af96-db14f4c617e5", 6 | "metadata": {}, 7 | "source": [ 8 | "This notebook is based on [DLA Seminar](https://github.com/markovka17/dla/blob/2022/week06/seminar.ipynb)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "1a09c197-776c-433f-9275-9561f2000d0e", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import dataclasses\n", 19 | "import torch\n", 20 | "from typing import Tuple, Union, List, Callable, Optional\n", 21 | "\n", 22 | "@dataclasses.dataclass\n", 23 
| "class TaskConfig:\n", 24 | " keyword: str = 'sheila' # We will use 1 key word -- 'sheila'\n", 25 | " batch_size: int = 128\n", 26 | " learning_rate: float = 3e-4\n", 27 | " weight_decay: float = 1e-5\n", 28 | " num_epochs: int = 20\n", 29 | " n_mels: int = 40\n", 30 | " cnn_out_channels: int = 8\n", 31 | " kernel_size: Tuple[int, int] = (5, 20)\n", 32 | " stride: Tuple[int, int] = (2, 8)\n", 33 | " hidden_size: int = 64\n", 34 | " gru_num_layers: int = 2\n", 35 | " bidirectional: bool = False\n", 36 | " num_classes: int = 2\n", 37 | " sample_rate: int = 16000\n", 38 | " device: torch.device = torch.device(\n", 39 | " 'cuda:0' if torch.cuda.is_available() else 'cpu')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "2bc4b4bf-3687-4885-a826-0beaae4b0146", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# !wget http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz -O speech_commands_v0.01.tar.gz\n", 50 | "# !mkdir speech_commands && tar -C speech_commands -xvzf speech_commands_v0.01.tar.gz 1> log" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "39dee0e1-4b94-44e1-ab94-a471ec5d028d", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from torch.utils.data import Dataset, DataLoader" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "a3125256-a635-4a7f-86dc-175bedcdd3dc", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import pandas as pd\n", 71 | "from pathlib import Path\n", 72 | "import json\n", 73 | "from collections import OrderedDict\n", 74 | "\n", 75 | "def read_json(fname):\n", 76 | " fname = Path(fname)\n", 77 | " with fname.open(\"rt\") as handle:\n", 78 | " return json.load(handle, object_hook=OrderedDict)\n", 79 | "\n", 80 | "\n", 81 | "def write_json(content, fname):\n", 82 | " fname = Path(fname)\n", 83 | " with fname.open(\"wt\") as handle:\n", 84 | " json.dump(content, handle, indent=4, sort_keys=False)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "f46b0a0b-40b9-44e4-b784-df2a8443cb57", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "class SpeechCommandDataset(Dataset):\n", 95 | "\n", 96 | " def __init__(\n", 97 | " self,\n", 98 | " transform: Optional[Callable] = None,\n", 99 | " path2dir: str = None,\n", 100 | " keywords: Union[str, List[str]] = None,\n", 101 | " csv: Optional[pd.DataFrame] = None,\n", 102 | " part: \"str\" = \"train\",\n", 103 | " ): \n", 104 | " self.transform = transform\n", 105 | "\n", 106 | " self.path2dir = path2dir\n", 107 | " self.keywords = keywords\n", 108 | " self.index = self.create_or_load_index(part)\n", 109 | "\n", 110 | " def create_or_load_index(self, part):\n", 111 | " index_path = Path(f\"{part}_index.json\")\n", 112 | " \n", 113 | " if not index_path.exists():\n", 114 | " self.create_index(part)\n", 115 | " \n", 116 | " return read_json(index_path)\n", 117 | "\n", 118 | " def create_index(self, part):\n", 119 | " path2dir = Path(self.path2dir)\n", 120 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 121 | " \n", 122 | " all_keywords = [\n", 123 | " p.stem for p in path2dir.glob('*')\n", 124 | " if p.is_dir() and not p.stem.startswith('_')\n", 125 | " ]\n", 126 | "\n", 127 | " index = []\n", 128 | " for keyword in all_keywords:\n", 129 | " paths = (path2dir / keyword).rglob('*.wav')\n", 130 | " if keyword in keywords:\n", 131 | " for path2wav in paths:\n", 132 | " 
index.append({\n", 133 | " \"path\": path2wav.as_posix(),\n", 134 | " \"keyword\": keyword,\n", 135 | " \"label\": 1\n", 136 | " })\n", 137 | " else:\n", 138 | " for path2wav in paths:\n", 139 | " index.append({\n", 140 | " \"path\": path2wav.as_posix(),\n", 141 | " \"keyword\": keyword,\n", 142 | " \"label\": 0\n", 143 | " })\n", 144 | "\n", 145 | " torch.manual_seed(0)\n", 146 | " indexes = torch.randperm(len(index))\n", 147 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 148 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 149 | "\n", 150 | " train_index = [index[i] for i in train_indexes]\n", 151 | " val_index = [index[i] for i in val_indexes]\n", 152 | "\n", 153 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 154 | " write_json(train_index, str(train_index_path))\n", 155 | " \n", 156 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 157 | " write_json(val_index, str(val_index_path))\n", 158 | "\n", 159 | " def __getitem__(self, ind: int):\n", 160 | " instance = self.index[ind]\n", 161 | "\n", 162 | " path2wav = instance['path']\n", 163 | " wav, sr = torchaudio.load(path2wav)\n", 164 | " wav = wav.sum(dim=0)\n", 165 | " \n", 166 | " if self.transform:\n", 167 | " wav = self.transform(wav)\n", 168 | "\n", 169 | " return {\n", 170 | " 'wav': wav,\n", 171 | " 'keywors': instance['keyword'],\n", 172 | " 'label': instance['label']\n", 173 | " }\n", 174 | "\n", 175 | " def __len__(self):\n", 176 | " return len(self.index)\n" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "id": "90ce1a7d-14f7-4671-90c8-6d8f1b102adb", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "class SpeechCommandDatasetV2(Dataset):\n", 187 | "\n", 188 | " def __init__(\n", 189 | " self,\n", 190 | " transform: Optional[Callable] = None,\n", 191 | " path2dir: str = None,\n", 192 | " keywords: Union[str, List[str]] = None,\n", 193 | " csv: Optional[pd.DataFrame] = None,\n", 194 | " part: \"str\" = \"train\",\n", 195 | " ): \n", 196 | " self.transform = transform\n", 197 | "\n", 198 | " self.path2dir = path2dir\n", 199 | " self.keywords = keywords\n", 200 | " self.index = self.create_or_load_index(part)\n", 201 | "\n", 202 | " def create_or_load_index(self, part):\n", 203 | " index_path = Path(f\"{part}_index.json\")\n", 204 | " \n", 205 | " if not index_path.exists():\n", 206 | " self.create_index(part)\n", 207 | " \n", 208 | " return read_json(index_path)\n", 209 | "\n", 210 | " def create_index(self, part):\n", 211 | " path2dir = Path(self.path2dir)\n", 212 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 213 | " \n", 214 | " all_keywords = [\n", 215 | " p.stem for p in path2dir.glob('*')\n", 216 | " if p.is_dir() and not p.stem.startswith('_')\n", 217 | " ]\n", 218 | "\n", 219 | " index = []\n", 220 | " for keyword in all_keywords:\n", 221 | " paths = (path2dir / keyword).rglob('*.wav')\n", 222 | " if keyword in keywords:\n", 223 | " for path2wav in paths:\n", 224 | " index.append({\n", 225 | " \"path\": path2wav.as_posix(),\n", 226 | " \"keyword\": keyword,\n", 227 | " \"label\": 1\n", 228 | " })\n", 229 | " else:\n", 230 | " for path2wav in paths:\n", 231 | " index.append({\n", 232 | " \"path\": path2wav.as_posix(),\n", 233 | " \"keyword\": keyword,\n", 234 | " \"label\": 0\n", 235 | " })\n", 236 | "\n", 237 | " torch.manual_seed(0)\n", 238 | " indexes = torch.randperm(len(index))\n", 239 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 240 | " val_indexes = 
indexes[int(len(index) * 0.8):]\n", 241 | "\n", 242 | " train_index = [index[i] for i in train_indexes]\n", 243 | " val_index = [index[i] for i in val_indexes]\n", 244 | "\n", 245 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 246 | " write_json(train_index, str(train_index_path))\n", 247 | " \n", 248 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 249 | " write_json(val_index, str(val_index_path))\n", 250 | "\n", 251 | " def __getitem__(self, ind: int):\n", 252 | " instance = self.index[ind]\n", 253 | "\n", 254 | " path2wav = instance['path']\n", 255 | " wav, sr = torchaudio.load(path2wav)\n", 256 | " wav = wav.sum(dim=0)\n", 257 | " \n", 258 | " if self.transform:\n", 259 | " wav = self.transform(wav)\n", 260 | "\n", 261 | " return {\n", 262 | " 'wav': wav,\n", 263 | " 'keywors': instance['keyword'],\n", 264 | " 'label': instance['label']\n", 265 | " }\n", 266 | "\n", 267 | " def __len__(self):\n", 268 | " return len(self.index)\n" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "id": "2e704499-9c1f-40a0-a3ad-5cde8cdd25f8", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "class SpeechCommandDatasetV3(Dataset):\n", 279 | "\n", 280 | " def __init__(\n", 281 | " self,\n", 282 | " transform: Optional[Callable] = None,\n", 283 | " path2dir: str = None,\n", 284 | " keywords: Union[str, List[str]] = None,\n", 285 | " csv: Optional[pd.DataFrame] = None,\n", 286 | " part: \"str\" = \"train\",\n", 287 | " ): \n", 288 | " self.transform = transform\n", 289 | "\n", 290 | " self.path2dir = path2dir\n", 291 | " self.keywords = keywords\n", 292 | " self.index = self.create_or_load_index(part)\n", 293 | "\n", 294 | " def create_or_load_index(self, part):\n", 295 | " index_path = Path(f\"{part}_index.json\")\n", 296 | " \n", 297 | " if not index_path.exists():\n", 298 | " self.create_index(part)\n", 299 | " \n", 300 | " return read_json(index_path)\n", 301 | "\n", 302 | " def create_index(self, part):\n", 303 | " path2dir = Path(self.path2dir)\n", 304 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 305 | " \n", 306 | " all_keywords = [\n", 307 | " p.stem for p in path2dir.glob('*')\n", 308 | " if p.is_dir() and not p.stem.startswith('_')\n", 309 | " ]\n", 310 | "\n", 311 | " index = []\n", 312 | " for keyword in all_keywords:\n", 313 | " paths = (path2dir / keyword).rglob('*.wav')\n", 314 | " if keyword in keywords:\n", 315 | " for path2wav in paths:\n", 316 | " index.append({\n", 317 | " \"path\": path2wav.as_posix(),\n", 318 | " \"keyword\": keyword,\n", 319 | " \"label\": 1\n", 320 | " })\n", 321 | " else:\n", 322 | " for path2wav in paths:\n", 323 | " index.append({\n", 324 | " \"path\": path2wav.as_posix(),\n", 325 | " \"keyword\": keyword,\n", 326 | " \"label\": 0\n", 327 | " })\n", 328 | "\n", 329 | " torch.manual_seed(0)\n", 330 | " indexes = torch.randperm(len(index))\n", 331 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 332 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 333 | "\n", 334 | " train_index = [index[i] for i in train_indexes]\n", 335 | " val_index = [index[i] for i in val_indexes]\n", 336 | "\n", 337 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 338 | " write_json(train_index, str(train_index_path))\n", 339 | " \n", 340 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 341 | " write_json(val_index, str(val_index_path))\n", 342 | "\n", 343 | " def __getitem__(self, ind: int):\n", 344 | " instance = self.index[ind]\n", 
345 | "\n", 346 | " path2wav = instance['path']\n", 347 | " wav, sr = torchaudio.load(path2wav)\n", 348 | " wav = wav.sum(dim=0)\n", 349 | " \n", 350 | " if self.transform:\n", 351 | " wav = self.transform(wav)\n", 352 | "\n", 353 | " return {\n", 354 | " 'wav': wav,\n", 355 | " 'keywors': instance['keyword'],\n", 356 | " 'label': instance['label']\n", 357 | " }\n", 358 | "\n", 359 | " def __len__(self):\n", 360 | " return len(self.index)\n" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "id": "7a5caeba-a61e-4c9e-b263-b0eb81c0bafa", 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "class SpeechCommandDatasetV4(Dataset):\n", 371 | "\n", 372 | " def __init__(\n", 373 | " self,\n", 374 | " transform: Optional[Callable] = None,\n", 375 | " path2dir: str = None,\n", 376 | " keywords: Union[str, List[str]] = None,\n", 377 | " csv: Optional[pd.DataFrame] = None,\n", 378 | " part: \"str\" = \"train\",\n", 379 | " ): \n", 380 | " self.transform = transform\n", 381 | "\n", 382 | " self.path2dir = path2dir\n", 383 | " self.keywords = keywords\n", 384 | " self.index = self.create_or_load_index(part)\n", 385 | "\n", 386 | " def create_or_load_index(self, part):\n", 387 | " index_path = Path(f\"{part}_index.json\")\n", 388 | " \n", 389 | " if not index_path.exists():\n", 390 | " self.create_index(part)\n", 391 | " \n", 392 | " return read_json(index_path)\n", 393 | "\n", 394 | " def create_index(self, part):\n", 395 | " path2dir = Path(self.path2dir)\n", 396 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 397 | " \n", 398 | " all_keywords = [\n", 399 | " p.stem for p in path2dir.glob('*')\n", 400 | " if p.is_dir() and not p.stem.startswith('_')\n", 401 | " ]\n", 402 | "\n", 403 | " index = []\n", 404 | " for keyword in all_keywords:\n", 405 | " paths = (path2dir / keyword).rglob('*.wav')\n", 406 | " if keyword in keywords:\n", 407 | " for path2wav in paths:\n", 408 | " index.append({\n", 409 | " \"path\": path2wav.as_posix(),\n", 410 | " \"keyword\": keyword,\n", 411 | " \"label\": 1\n", 412 | " })\n", 413 | " else:\n", 414 | " for path2wav in paths:\n", 415 | " index.append({\n", 416 | " \"path\": path2wav.as_posix(),\n", 417 | " \"keyword\": keyword,\n", 418 | " \"label\": 0\n", 419 | " })\n", 420 | "\n", 421 | " torch.manual_seed(0)\n", 422 | " indexes = torch.randperm(len(index))\n", 423 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 424 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 425 | "\n", 426 | " train_index = [index[i] for i in train_indexes]\n", 427 | " val_index = [index[i] for i in val_indexes]\n", 428 | "\n", 429 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 430 | " write_json(train_index, str(train_index_path))\n", 431 | " \n", 432 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 433 | " write_json(val_index, str(val_index_path))\n", 434 | "\n", 435 | " def __getitem__(self, ind: int):\n", 436 | " instance = self.index[ind]\n", 437 | "\n", 438 | " path2wav = instance['path']\n", 439 | " wav, sr = torchaudio.load(path2wav)\n", 440 | " wav = wav.sum(dim=0)\n", 441 | " \n", 442 | " if self.transform:\n", 443 | " wav = self.transform(wav)\n", 444 | "\n", 445 | " return {\n", 446 | " 'wav': wav,\n", 447 | " 'keywors': instance['keyword'],\n", 448 | " 'label': instance['label']\n", 449 | " }\n", 450 | "\n", 451 | " def __len__(self):\n", 452 | " return len(self.index)\n" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 
458 | "id": "c3861ca2-70de-4ed3-9cdd-82d37ad4cc13", 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "class SpeechCommandDatasetV5(Dataset):\n", 463 | "\n", 464 | " def __init__(\n", 465 | " self,\n", 466 | " transform: Optional[Callable] = None,\n", 467 | " path2dir: str = None,\n", 468 | " keywords: Union[str, List[str]] = None,\n", 469 | " csv: Optional[pd.DataFrame] = None,\n", 470 | " part: \"str\" = \"train\",\n", 471 | " ): \n", 472 | " self.transform = transform\n", 473 | "\n", 474 | " self.path2dir = path2dir\n", 475 | " self.keywords = keywords\n", 476 | " self.index = self.create_or_load_index(part)\n", 477 | "\n", 478 | " def create_or_load_index(self, part):\n", 479 | " index_path = Path(f\"{part}_index.json\")\n", 480 | " \n", 481 | " if not index_path.exists():\n", 482 | " self.create_index(part)\n", 483 | " \n", 484 | " return read_json(index_path)\n", 485 | "\n", 486 | " def create_index(self, part):\n", 487 | " path2dir = Path(self.path2dir)\n", 488 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 489 | " \n", 490 | " all_keywords = [\n", 491 | " p.stem for p in path2dir.glob('*')\n", 492 | " if p.is_dir() and not p.stem.startswith('_')\n", 493 | " ]\n", 494 | "\n", 495 | " index = []\n", 496 | " for keyword in all_keywords:\n", 497 | " paths = (path2dir / keyword).rglob('*.wav')\n", 498 | " if keyword in keywords:\n", 499 | " for path2wav in paths:\n", 500 | " index.append({\n", 501 | " \"path\": path2wav.as_posix(),\n", 502 | " \"keyword\": keyword,\n", 503 | " \"label\": 1\n", 504 | " })\n", 505 | " else:\n", 506 | " for path2wav in paths:\n", 507 | " index.append({\n", 508 | " \"path\": path2wav.as_posix(),\n", 509 | " \"keyword\": keyword,\n", 510 | " \"label\": 0\n", 511 | " })\n", 512 | "\n", 513 | " torch.manual_seed(0)\n", 514 | " indexes = torch.randperm(len(index))\n", 515 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 516 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 517 | "\n", 518 | " train_index = [index[i] for i in train_indexes]\n", 519 | " val_index = [index[i] for i in val_indexes]\n", 520 | "\n", 521 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 522 | " write_json(train_index, str(train_index_path))\n", 523 | " \n", 524 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 525 | " write_json(val_index, str(val_index_path))\n", 526 | "\n", 527 | " def __getitem__(self, ind: int):\n", 528 | " instance = self.index[ind]\n", 529 | "\n", 530 | " path2wav = instance['path']\n", 531 | " wav, sr = torchaudio.load(path2wav)\n", 532 | " wav = wav.sum(dim=0)\n", 533 | " \n", 534 | " if self.transform:\n", 535 | " wav = self.transform(wav)\n", 536 | "\n", 537 | " return {\n", 538 | " 'wav': wav,\n", 539 | " 'keywors': instance['keyword'],\n", 540 | " 'label': instance['label']\n", 541 | " }\n", 542 | "\n", 543 | " def __len__(self):\n", 544 | " return len(self.index)\n" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "id": "07b777e5-5a57-4251-90e5-f291a540a935", 551 | "metadata": {}, 552 | "outputs": [], 553 | "source": [ 554 | "import torchaudio\n", 555 | "\n", 556 | "class AugsCreation:\n", 557 | "\n", 558 | " def __init__(self):\n", 559 | " self.background_noises = [\n", 560 | " 'speech_commands/_background_noise_/white_noise.wav',\n", 561 | " 'speech_commands/_background_noise_/dude_miaowing.wav',\n", 562 | " 'speech_commands/_background_noise_/doing_the_dishes.wav',\n", 563 | " 
'speech_commands/_background_noise_/exercise_bike.wav',\n", 564 | " 'speech_commands/_background_noise_/pink_noise.wav',\n", 565 | " 'speech_commands/_background_noise_/running_tap.wav'\n", 566 | " ]\n", 567 | "\n", 568 | " self.noises = [\n", 569 | " torchaudio.load(p)[0].squeeze()\n", 570 | " for p in self.background_noises\n", 571 | " ]\n", 572 | "\n", 573 | " def add_rand_noise(self, audio):\n", 574 | "\n", 575 | " # randomly choose noise\n", 576 | " noise_num = torch.randint(low=0, high=len(\n", 577 | " self.background_noises), size=(1,)).item()\n", 578 | " noise = self.noises[noise_num]\n", 579 | "\n", 580 | " noise_level = torch.Tensor([1]) # [0, 40]\n", 581 | "\n", 582 | " noise_energy = torch.norm(noise)\n", 583 | " audio_energy = torch.norm(audio)\n", 584 | " alpha = (audio_energy / noise_energy) * \\\n", 585 | " torch.pow(10, -noise_level / 20)\n", 586 | "\n", 587 | " start = torch.randint(\n", 588 | " low=0,\n", 589 | " high=max(int(noise.size(0) - audio.size(0) - 1), 1),\n", 590 | " size=(1,)\n", 591 | " ).item()\n", 592 | " noise_sample = noise[start: start + audio.size(0)]\n", 593 | "\n", 594 | " audio_new = audio + alpha * noise_sample\n", 595 | " audio_new.clamp_(-1, 1)\n", 596 | " return audio_new\n", 597 | "\n", 598 | " def __call__(self, wav):\n", 599 | " aug_num = torch.randint(low=0, high=4, size=(1,)).item() # choose 1 random aug from augs\n", 600 | " augs = [\n", 601 | " lambda x: x,\n", 602 | " lambda x: (x + torch.distributions.Normal(0, 0.01).sample(x.size())).clamp_(-1, 1),\n", 603 | " lambda x: torchaudio.transforms.Vol(.25)(x),\n", 604 | " lambda x: self.add_rand_noise(x)\n", 605 | " ]\n", 606 | "\n", 607 | " return augs[aug_num](wav)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "id": "3f45ccf7-aafa-46b0-9473-e27fbefae2e0", 614 | "metadata": {}, 615 | "outputs": [], 616 | "source": [ 617 | "train_dataset = SpeechCommandDataset(\n", 618 | " path2dir='speech_commands', keywords=TaskConfig.keyword, part=\"train\", transform=AugsCreation()\n", 619 | ")\n", 620 | "val_dataset = SpeechCommandDataset(\n", 621 | " path2dir='speech_commands', keywords=TaskConfig.keyword, part=\"val\"\n", 622 | ")" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "id": "3b336ca8-c552-43d6-8d76-e7398d921f06", 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [ 632 | "train_dataset.index[:2]" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "id": "dbb86465-6cf1-40cb-9e45-102843000e70", 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "from torch.nn.utils.rnn import pad_sequence\n", 643 | "\n", 644 | "def collate_fn(data):\n", 645 | " wavs = []\n", 646 | " labels = [] \n", 647 | "\n", 648 | " for el in data:\n", 649 | " wavs.append(el['wav'])\n", 650 | " labels.append(el['label'])\n", 651 | "\n", 652 | " # torch.nn.utils.rnn.pad_sequence takes list(Tensors) and returns padded (with 0.0) Tensor\n", 653 | " wavs = pad_sequence(wavs, batch_first=True) \n", 654 | " labels = torch.Tensor(labels).long()\n", 655 | " return wavs, labels" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "id": "2dc73f86-847c-46cc-bb1c-df557e141748", 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "train_dataloader = DataLoader(train_dataset, batch_size=TaskConfig.batch_size,\n", 666 | " shuffle=False, collate_fn=collate_fn,\n", 667 | " num_workers=2, pin_memory=True)\n", 668 | "\n", 669 | "val_dataloader = 
DataLoader(val_dataset, batch_size=TaskConfig.batch_size,\n", 670 | " shuffle=False, collate_fn=collate_fn,\n", 671 | " num_workers=2, pin_memory=True)" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "id": "3797e444-a0ee-44a9-9d1a-d3c83e29f6f3", 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "from torch import nn" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "id": "130538c5-e66e-411a-9b7d-3ff978948365", 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "class LogMelspec(nn.Module):\n", 692 | "\n", 693 | " def __init__(self, config):\n", 694 | " super().__init__()\n", 695 | " self.melspec = torchaudio.transforms.MelSpectrogram(\n", 696 | " sample_rate=config.sample_rate,\n", 697 | " n_fft=400,\n", 698 | " win_length=400,\n", 699 | " hop_length=160,\n", 700 | " n_mels=config.n_mels\n", 701 | " )\n", 702 | "\n", 703 | " self.spec_augs = nn.Sequential(\n", 704 | " torchaudio.transforms.FrequencyMasking(freq_mask_param=15),\n", 705 | " torchaudio.transforms.TimeMasking(time_mask_param=35),\n", 706 | " )\n", 707 | "\n", 708 | "\n", 709 | " def __call__(self, batch):\n", 710 | " x = torch.log(self.melspec(batch).clamp_(min=1e-9, max=1e9))\n", 711 | " if self.training:\n", 712 | " x = self.spec_augs(x)\n", 713 | " return x" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "id": "adf8be91-a439-42ad-bd77-77b5be595190", 720 | "metadata": {}, 721 | "outputs": [], 722 | "source": [ 723 | "class Attention(nn.Module):\n", 724 | "\n", 725 | " def __init__(self, hidden_size: int):\n", 726 | " super().__init__()\n", 727 | "\n", 728 | " self.energy = nn.Sequential(\n", 729 | " nn.Linear(hidden_size, hidden_size),\n", 730 | " nn.Tanh(),\n", 731 | " nn.Linear(hidden_size, 1)\n", 732 | " )\n", 733 | " \n", 734 | " def forward(self, input):\n", 735 | " energy = self.energy(input)\n", 736 | " alpha = torch.softmax(energy, dim=-2)\n", 737 | " return (input * alpha).sum(dim=-2)\n", 738 | "\n", 739 | "class CRNN(nn.Module):\n", 740 | "\n", 741 | " def __init__(self, config: TaskConfig):\n", 742 | " super().__init__()\n", 743 | " self.config = config\n", 744 | "\n", 745 | " self.mel_spec = LogMelspec(config)\n", 746 | "\n", 747 | " self.conv = nn.Sequential(\n", 748 | " nn.Conv2d(\n", 749 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 750 | " kernel_size=config.kernel_size, stride=config.stride\n", 751 | " ),\n", 752 | " nn.Flatten(start_dim=1, end_dim=2),\n", 753 | " )\n", 754 | "\n", 755 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 756 | " config.stride[0] + 1\n", 757 | " \n", 758 | " self.gru = nn.GRU(\n", 759 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 760 | " hidden_size=config.hidden_size,\n", 761 | " num_layers=config.gru_num_layers,\n", 762 | " dropout=0.1,\n", 763 | " bidirectional=config.bidirectional,\n", 764 | " batch_first=True\n", 765 | " )\n", 766 | "\n", 767 | " self.attention = Attention(config.hidden_size)\n", 768 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 769 | " \n", 770 | " def forward(self, input):\n", 771 | " input = self.mel_spec(input)\n", 772 | " \n", 773 | " input = input.unsqueeze(dim=1)\n", 774 | " conv_output = self.conv(input).transpose(-1, -2)\n", 775 | " gru_output, _ = self.gru(conv_output)\n", 776 | " contex_vector = self.attention(gru_output)\n", 777 | " output = self.classifier(contex_vector)\n", 778 | " return 
output" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": null, 784 | "id": "9732e122-7988-4544-8815-c92de443a72e", 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "class CRNNV2(nn.Module):\n", 789 | "\n", 790 | " def __init__(self, config: TaskConfig):\n", 791 | " super().__init__()\n", 792 | " self.config = config\n", 793 | "\n", 794 | " self.mel_spec = LogMelspec(config)\n", 795 | "\n", 796 | " self.conv = nn.Sequential(\n", 797 | " nn.Conv2d(\n", 798 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 799 | " kernel_size=config.kernel_size, stride=config.stride\n", 800 | " ),\n", 801 | " nn.Flatten(start_dim=1, end_dim=2),\n", 802 | " )\n", 803 | "\n", 804 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 805 | " config.stride[0] + 1\n", 806 | " \n", 807 | " self.gru = nn.GRU(\n", 808 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 809 | " hidden_size=config.hidden_size,\n", 810 | " num_layers=config.gru_num_layers,\n", 811 | " dropout=0.1,\n", 812 | " bidirectional=config.bidirectional,\n", 813 | " batch_first=True\n", 814 | " )\n", 815 | "\n", 816 | " self.attention = Attention(config.hidden_size)\n", 817 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 818 | " \n", 819 | " def forward(self, input):\n", 820 | " input = self.mel_spec(input)\n", 821 | " \n", 822 | " input = input.unsqueeze(dim=1)\n", 823 | " conv_output = self.conv(input).transpose(-1, -2)\n", 824 | " gru_output, _ = self.gru(conv_output)\n", 825 | " contex_vector = self.attention(gru_output)\n", 826 | " output = self.classifier(contex_vector)\n", 827 | " return output" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": null, 833 | "id": "4eab3ad0-c01c-4e2f-be1b-5a3c8c9ae227", 834 | "metadata": {}, 835 | "outputs": [], 836 | "source": [ 837 | "class CRNNV3(nn.Module):\n", 838 | "\n", 839 | " def __init__(self, config: TaskConfig):\n", 840 | " super().__init__()\n", 841 | " self.config = config\n", 842 | "\n", 843 | " self.mel_spec = LogMelspec(config)\n", 844 | "\n", 845 | " self.conv = nn.Sequential(\n", 846 | " nn.Conv2d(\n", 847 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 848 | " kernel_size=config.kernel_size, stride=config.stride\n", 849 | " ),\n", 850 | " nn.Flatten(start_dim=1, end_dim=2),\n", 851 | " )\n", 852 | "\n", 853 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 854 | " config.stride[0] + 1\n", 855 | " \n", 856 | " self.gru = nn.GRU(\n", 857 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 858 | " hidden_size=config.hidden_size,\n", 859 | " num_layers=config.gru_num_layers,\n", 860 | " dropout=0.1,\n", 861 | " bidirectional=config.bidirectional,\n", 862 | " batch_first=True\n", 863 | " )\n", 864 | "\n", 865 | " self.attention = Attention(config.hidden_size)\n", 866 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 867 | " \n", 868 | " def forward(self, input):\n", 869 | " input = self.mel_spec(input)\n", 870 | " \n", 871 | " input = input.unsqueeze(dim=1)\n", 872 | " conv_output = self.conv(input).transpose(-1, -2)\n", 873 | " gru_output, _ = self.gru(conv_output)\n", 874 | " contex_vector = self.attention(gru_output)\n", 875 | " output = self.classifier(contex_vector)\n", 876 | " return output\n", 877 | "\n", 878 | "\n", 879 | "class CRNNV3(nn.Module):\n", 880 | "\n", 881 | " def __init__(self, config: TaskConfig):\n", 882 | " super().__init__()\n", 
883 | " self.config = config\n", 884 | "\n", 885 | " self.mel_spec = LogMelspec(config)\n", 886 | "\n", 887 | " self.conv = nn.Sequential(\n", 888 | " nn.Conv2d(\n", 889 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 890 | " kernel_size=config.kernel_size, stride=config.stride\n", 891 | " ),\n", 892 | " nn.Flatten(start_dim=1, end_dim=2),\n", 893 | " )\n", 894 | "\n", 895 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 896 | " config.stride[0] + 1\n", 897 | " \n", 898 | " self.gru = nn.GRU(\n", 899 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 900 | " hidden_size=config.hidden_size,\n", 901 | " num_layers=config.gru_num_layers,\n", 902 | " dropout=0.1,\n", 903 | " bidirectional=config.bidirectional,\n", 904 | " batch_first=True\n", 905 | " )\n", 906 | "\n", 907 | " self.attention = Attention(config.hidden_size)\n", 908 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 909 | " \n", 910 | " def forward(self, input):\n", 911 | " input = self.mel_spec(input)\n", 912 | " \n", 913 | " input = input.unsqueeze(dim=1)\n", 914 | " conv_output = self.conv(input).transpose(-1, -2)\n", 915 | " gru_output, _ = self.gru(conv_output)\n", 916 | " contex_vector = self.attention(gru_output)\n", 917 | " output = self.classifier(contex_vector)\n", 918 | " return output\n", 919 | "\n", 920 | "\n", 921 | "class CRNNV3(nn.Module):\n", 922 | "\n", 923 | " def __init__(self, config: TaskConfig):\n", 924 | " super().__init__()\n", 925 | " self.config = config\n", 926 | "\n", 927 | " self.mel_spec = LogMelspec(config)\n", 928 | "\n", 929 | " self.conv = nn.Sequential(\n", 930 | " nn.Conv2d(\n", 931 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 932 | " kernel_size=config.kernel_size, stride=config.stride\n", 933 | " ),\n", 934 | " nn.Flatten(start_dim=1, end_dim=2),\n", 935 | " )\n", 936 | "\n", 937 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 938 | " config.stride[0] + 1\n", 939 | " \n", 940 | " self.gru = nn.GRU(\n", 941 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 942 | " hidden_size=config.hidden_size,\n", 943 | " num_layers=config.gru_num_layers,\n", 944 | " dropout=0.1,\n", 945 | " bidirectional=config.bidirectional,\n", 946 | " batch_first=True\n", 947 | " )\n", 948 | "\n", 949 | " self.attention = Attention(config.hidden_size)\n", 950 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 951 | " \n", 952 | " def forward(self, input):\n", 953 | " input = self.mel_spec(input)\n", 954 | " \n", 955 | " input = input.unsqueeze(dim=1)\n", 956 | " conv_output = self.conv(input).transpose(-1, -2)\n", 957 | " gru_output, _ = self.gru(conv_output)\n", 958 | " contex_vector = self.attention(gru_output)\n", 959 | " output = self.classifier(contex_vector)\n", 960 | " return output\n", 961 | "\n", 962 | "\n", 963 | "class CRNNV3(nn.Module):\n", 964 | "\n", 965 | " def __init__(self, config: TaskConfig):\n", 966 | " super().__init__()\n", 967 | " self.config = config\n", 968 | "\n", 969 | " self.mel_spec = LogMelspec(config)\n", 970 | "\n", 971 | " self.conv = nn.Sequential(\n", 972 | " nn.Conv2d(\n", 973 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 974 | " kernel_size=config.kernel_size, stride=config.stride\n", 975 | " ),\n", 976 | " nn.Flatten(start_dim=1, end_dim=2),\n", 977 | " )\n", 978 | "\n", 979 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 980 | " config.stride[0] + 1\n", 981 
| " \n", 982 | " self.gru = nn.GRU(\n", 983 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 984 | " hidden_size=config.hidden_size,\n", 985 | " num_layers=config.gru_num_layers,\n", 986 | " dropout=0.1,\n", 987 | " bidirectional=config.bidirectional,\n", 988 | " batch_first=True\n", 989 | " )\n", 990 | "\n", 991 | " self.attention = Attention(config.hidden_size)\n", 992 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 993 | " \n", 994 | " def forward(self, input):\n", 995 | " input = self.mel_spec(input)\n", 996 | " \n", 997 | " input = input.unsqueeze(dim=1)\n", 998 | " conv_output = self.conv(input).transpose(-1, -2)\n", 999 | " gru_output, _ = self.gru(conv_output)\n", 1000 | " contex_vector = self.attention(gru_output)\n", 1001 | " output = self.classifier(contex_vector)\n", 1002 | " return output\n", 1003 | "\n", 1004 | "\n", 1005 | "class CRNNV3(nn.Module):\n", 1006 | "\n", 1007 | " def __init__(self, config: TaskConfig):\n", 1008 | " super().__init__()\n", 1009 | " self.config = config\n", 1010 | "\n", 1011 | " self.mel_spec = LogMelspec(config)\n", 1012 | "\n", 1013 | " self.conv = nn.Sequential(\n", 1014 | " nn.Conv2d(\n", 1015 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1016 | " kernel_size=config.kernel_size, stride=config.stride\n", 1017 | " ),\n", 1018 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1019 | " )\n", 1020 | "\n", 1021 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1022 | " config.stride[0] + 1\n", 1023 | " \n", 1024 | " self.gru = nn.GRU(\n", 1025 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1026 | " hidden_size=config.hidden_size,\n", 1027 | " num_layers=config.gru_num_layers,\n", 1028 | " dropout=0.1,\n", 1029 | " bidirectional=config.bidirectional,\n", 1030 | " batch_first=True\n", 1031 | " )\n", 1032 | "\n", 1033 | " self.attention = Attention(config.hidden_size)\n", 1034 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1035 | " \n", 1036 | " def forward(self, input):\n", 1037 | " input = self.mel_spec(input)\n", 1038 | " \n", 1039 | " input = input.unsqueeze(dim=1)\n", 1040 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1041 | " gru_output, _ = self.gru(conv_output)\n", 1042 | " contex_vector = self.attention(gru_output)\n", 1043 | " output = self.classifier(contex_vector)\n", 1044 | " return outputclass CRNNV3(nn.Module):\n", 1045 | "\n", 1046 | " def __init__(self, config: TaskConfig):\n", 1047 | " super().__init__()\n", 1048 | " self.config = config\n", 1049 | "\n", 1050 | " self.mel_spec = LogMelspec(config)\n", 1051 | "\n", 1052 | " self.conv = nn.Sequential(\n", 1053 | " nn.Conv2d(\n", 1054 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1055 | " kernel_size=config.kernel_size, stride=config.stride\n", 1056 | " ),\n", 1057 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1058 | " )\n", 1059 | "\n", 1060 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1061 | " config.stride[0] + 1\n", 1062 | " \n", 1063 | " self.gru = nn.GRU(\n", 1064 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1065 | " hidden_size=config.hidden_size,\n", 1066 | " num_layers=config.gru_num_layers,\n", 1067 | " dropout=0.1,\n", 1068 | " bidirectional=config.bidirectional,\n", 1069 | " batch_first=True\n", 1070 | " )\n", 1071 | "\n", 1072 | " self.attention = Attention(config.hidden_size)\n", 1073 | " self.classifier = nn.Linear(config.hidden_size, 
config.num_classes)\n", 1074 | " \n", 1075 | " def forward(self, input):\n", 1076 | " input = self.mel_spec(input)\n", 1077 | " \n", 1078 | " input = input.unsqueeze(dim=1)\n", 1079 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1080 | " gru_output, _ = self.gru(conv_output)\n", 1081 | " contex_vector = self.attention(gru_output)\n", 1082 | " output = self.classifier(contex_vector)\n", 1083 | " return output\n", 1084 | "\n", 1085 | "\n", 1086 | "class CRNNV3(nn.Module):\n", 1087 | "\n", 1088 | " def __init__(self, config: TaskConfig):\n", 1089 | " super().__init__()\n", 1090 | " self.config = config\n", 1091 | "\n", 1092 | " self.mel_spec = LogMelspec(config)\n", 1093 | "\n", 1094 | " self.conv = nn.Sequential(\n", 1095 | " nn.Conv2d(\n", 1096 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1097 | " kernel_size=config.kernel_size, stride=config.stride\n", 1098 | " ),\n", 1099 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1100 | " )\n", 1101 | "\n", 1102 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1103 | " config.stride[0] + 1\n", 1104 | " \n", 1105 | " self.gru = nn.GRU(\n", 1106 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1107 | " hidden_size=config.hidden_size,\n", 1108 | " num_layers=config.gru_num_layers,\n", 1109 | " dropout=0.1,\n", 1110 | " bidirectional=config.bidirectional,\n", 1111 | " batch_first=True\n", 1112 | " )\n", 1113 | "\n", 1114 | " self.attention = Attention(config.hidden_size)\n", 1115 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1116 | " \n", 1117 | " def forward(self, input):\n", 1118 | " input = self.mel_spec(input)\n", 1119 | " \n", 1120 | " input = input.unsqueeze(dim=1)\n", 1121 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1122 | " gru_output, _ = self.gru(conv_output)\n", 1123 | " contex_vector = self.attention(gru_output)\n", 1124 | " output = self.classifier(contex_vector)\n", 1125 | " return output\n", 1126 | "\n", 1127 | "\n", 1128 | "class CRNNV3(nn.Module):\n", 1129 | "\n", 1130 | " def __init__(self, config: TaskConfig):\n", 1131 | " super().__init__()\n", 1132 | " self.config = config\n", 1133 | "\n", 1134 | " self.mel_spec = LogMelspec(config)\n", 1135 | "\n", 1136 | " self.conv = nn.Sequential(\n", 1137 | " nn.Conv2d(\n", 1138 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1139 | " kernel_size=config.kernel_size, stride=config.stride\n", 1140 | " ),\n", 1141 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1142 | " )\n", 1143 | "\n", 1144 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1145 | " config.stride[0] + 1\n", 1146 | " \n", 1147 | " self.gru = nn.GRU(\n", 1148 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1149 | " hidden_size=config.hidden_size,\n", 1150 | " num_layers=config.gru_num_layers,\n", 1151 | " dropout=0.1,\n", 1152 | " bidirectional=config.bidirectional,\n", 1153 | " batch_first=True\n", 1154 | " )\n", 1155 | "\n", 1156 | " self.attention = Attention(config.hidden_size)\n", 1157 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1158 | " \n", 1159 | " def forward(self, input):\n", 1160 | " input = self.mel_spec(input)\n", 1161 | " \n", 1162 | " input = input.unsqueeze(dim=1)\n", 1163 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1164 | " gru_output, _ = self.gru(conv_output)\n", 1165 | " contex_vector = self.attention(gru_output)\n", 1166 | " output = self.classifier(contex_vector)\n", 1167 | " return 
output\n", 1168 | "\n", 1169 | "\n", 1170 | "class CRNNV3(nn.Module):\n", 1171 | "\n", 1172 | " def __init__(self, config: TaskConfig):\n", 1173 | " super().__init__()\n", 1174 | " self.config = config\n", 1175 | "\n", 1176 | " self.mel_spec = LogMelspec(config)\n", 1177 | "\n", 1178 | " self.conv = nn.Sequential(\n", 1179 | " nn.Conv2d(\n", 1180 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1181 | " kernel_size=config.kernel_size, stride=config.stride\n", 1182 | " ),\n", 1183 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1184 | " )\n", 1185 | "\n", 1186 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1187 | " config.stride[0] + 1\n", 1188 | " \n", 1189 | " self.gru = nn.GRU(\n", 1190 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1191 | " hidden_size=config.hidden_size,\n", 1192 | " num_layers=config.gru_num_layers,\n", 1193 | " dropout=0.1,\n", 1194 | " bidirectional=config.bidirectional,\n", 1195 | " batch_first=True\n", 1196 | " )\n", 1197 | "\n", 1198 | " self.attention = Attention(config.hidden_size)\n", 1199 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1200 | " \n", 1201 | " def forward(self, input):\n", 1202 | " input = self.mel_spec(input)\n", 1203 | " \n", 1204 | " input = input.unsqueeze(dim=1)\n", 1205 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1206 | " gru_output, _ = self.gru(conv_output)\n", 1207 | " contex_vector = self.attention(gru_output)\n", 1208 | " output = self.classifier(contex_vector)\n", 1209 | " return output\n", 1210 | "\n", 1211 | "\n", 1212 | "class CRNNV3(nn.Module):\n", 1213 | "\n", 1214 | " def __init__(self, config: TaskConfig):\n", 1215 | " super().__init__()\n", 1216 | " self.config = config\n", 1217 | "\n", 1218 | " self.mel_spec = LogMelspec(config)\n", 1219 | "\n", 1220 | " self.conv = nn.Sequential(\n", 1221 | " nn.Conv2d(\n", 1222 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1223 | " kernel_size=config.kernel_size, stride=config.stride\n", 1224 | " ),\n", 1225 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1226 | " )\n", 1227 | "\n", 1228 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1229 | " config.stride[0] + 1\n", 1230 | " \n", 1231 | " self.gru = nn.GRU(\n", 1232 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1233 | " hidden_size=config.hidden_size,\n", 1234 | " num_layers=config.gru_num_layers,\n", 1235 | " dropout=0.1,\n", 1236 | " bidirectional=config.bidirectional,\n", 1237 | " batch_first=True\n", 1238 | " )\n", 1239 | "\n", 1240 | " self.attention = Attention(config.hidden_size)\n", 1241 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1242 | " \n", 1243 | " def forward(self, input):\n", 1244 | " input = self.mel_spec(input)\n", 1245 | " \n", 1246 | " input = input.unsqueeze(dim=1)\n", 1247 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1248 | " gru_output, _ = self.gru(conv_output)\n", 1249 | " contex_vector = self.attention(gru_output)\n", 1250 | " output = self.classifier(contex_vector)\n", 1251 | " return output\n", 1252 | "\n", 1253 | "\n", 1254 | "class CRNNV3(nn.Module):\n", 1255 | "\n", 1256 | " def __init__(self, config: TaskConfig):\n", 1257 | " super().__init__()\n", 1258 | " self.config = config\n", 1259 | "\n", 1260 | " self.mel_spec = LogMelspec(config)\n", 1261 | "\n", 1262 | " self.conv = nn.Sequential(\n", 1263 | " nn.Conv2d(\n", 1264 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1265 | " 
kernel_size=config.kernel_size, stride=config.stride\n", 1266 | " ),\n", 1267 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1268 | " )\n", 1269 | "\n", 1270 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1271 | " config.stride[0] + 1\n", 1272 | " \n", 1273 | " self.gru = nn.GRU(\n", 1274 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1275 | " hidden_size=config.hidden_size,\n", 1276 | " num_layers=config.gru_num_layers,\n", 1277 | " dropout=0.1,\n", 1278 | " bidirectional=config.bidirectional,\n", 1279 | " batch_first=True\n", 1280 | " )\n", 1281 | "\n", 1282 | " self.attention = Attention(config.hidden_size)\n", 1283 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1284 | " \n", 1285 | " def forward(self, input):\n", 1286 | " input = self.mel_spec(input)\n", 1287 | " \n", 1288 | " input = input.unsqueeze(dim=1)\n", 1289 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1290 | " gru_output, _ = self.gru(conv_output)\n", 1291 | " contex_vector = self.attention(gru_output)\n", 1292 | " output = self.classifier(contex_vector)\n", 1293 | " return output\n", 1294 | "\n", 1295 | "\n", 1296 | "class CRNNV3(nn.Module):\n", 1297 | "\n", 1298 | " def __init__(self, config: TaskConfig):\n", 1299 | " super().__init__()\n", 1300 | " self.config = config\n", 1301 | "\n", 1302 | " self.mel_spec = LogMelspec(config)\n", 1303 | "\n", 1304 | " self.conv = nn.Sequential(\n", 1305 | " nn.Conv2d(\n", 1306 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1307 | " kernel_size=config.kernel_size, stride=config.stride\n", 1308 | " ),\n", 1309 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1310 | " )\n", 1311 | "\n", 1312 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1313 | " config.stride[0] + 1\n", 1314 | " \n", 1315 | " self.gru = nn.GRU(\n", 1316 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1317 | " hidden_size=config.hidden_size,\n", 1318 | " num_layers=config.gru_num_layers,\n", 1319 | " dropout=0.1,\n", 1320 | " bidirectional=config.bidirectional,\n", 1321 | " batch_first=True\n", 1322 | " )\n", 1323 | "\n", 1324 | " self.attention = Attention(config.hidden_size)\n", 1325 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1326 | " \n", 1327 | " def forward(self, input):\n", 1328 | " input = self.mel_spec(input)\n", 1329 | " \n", 1330 | " input = input.unsqueeze(dim=1)\n", 1331 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1332 | " gru_output, _ = self.gru(conv_output)\n", 1333 | " contex_vector = self.attention(gru_output)\n", 1334 | " output = self.classifier(contex_vector)\n", 1335 | " return output\n", 1336 | "\n", 1337 | "\n", 1338 | "class CRNNV3(nn.Module):\n", 1339 | "\n", 1340 | " def __init__(self, config: TaskConfig):\n", 1341 | " super().__init__()\n", 1342 | " self.config = config\n", 1343 | "\n", 1344 | " self.mel_spec = LogMelspec(config)\n", 1345 | "\n", 1346 | " self.conv = nn.Sequential(\n", 1347 | " nn.Conv2d(\n", 1348 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1349 | " kernel_size=config.kernel_size, stride=config.stride\n", 1350 | " ),\n", 1351 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1352 | " )\n", 1353 | "\n", 1354 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1355 | " config.stride[0] + 1\n", 1356 | " \n", 1357 | " self.gru = nn.GRU(\n", 1358 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1359 | " 
hidden_size=config.hidden_size,\n", 1360 | " num_layers=config.gru_num_layers,\n", 1361 | " dropout=0.1,\n", 1362 | " bidirectional=config.bidirectional,\n", 1363 | " batch_first=True\n", 1364 | " )\n", 1365 | "\n", 1366 | " self.attention = Attention(config.hidden_size)\n", 1367 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1368 | " \n", 1369 | " def forward(self, input):\n", 1370 | " input = self.mel_spec(input)\n", 1371 | " \n", 1372 | " input = input.unsqueeze(dim=1)\n", 1373 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1374 | " gru_output, _ = self.gru(conv_output)\n", 1375 | " contex_vector = self.attention(gru_output)\n", 1376 | " output = self.classifier(contex_vector)\n", 1377 | " return output\n", 1378 | "\n", 1379 | "\n", 1380 | "class CRNNV3(nn.Module):\n", 1381 | "\n", 1382 | " def __init__(self, config: TaskConfig):\n", 1383 | " super().__init__()\n", 1384 | " self.config = config\n", 1385 | "\n", 1386 | " self.mel_spec = LogMelspec(config)\n", 1387 | "\n", 1388 | " self.conv = nn.Sequential(\n", 1389 | " nn.Conv2d(\n", 1390 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1391 | " kernel_size=config.kernel_size, stride=config.stride\n", 1392 | " ),\n", 1393 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1394 | " )\n", 1395 | "\n", 1396 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1397 | " config.stride[0] + 1\n", 1398 | " \n", 1399 | " self.gru = nn.GRU(\n", 1400 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1401 | " hidden_size=config.hidden_size,\n", 1402 | " num_layers=config.gru_num_layers,\n", 1403 | " dropout=0.1,\n", 1404 | " bidirectional=config.bidirectional,\n", 1405 | " batch_first=True\n", 1406 | " )\n", 1407 | "\n", 1408 | " self.attention = Attention(config.hidden_size)\n", 1409 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1410 | " \n", 1411 | " def forward(self, input):\n", 1412 | " input = self.mel_spec(input)\n", 1413 | " \n", 1414 | " input = input.unsqueeze(dim=1)\n", 1415 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1416 | " gru_output, _ = self.gru(conv_output)\n", 1417 | " contex_vector = self.attention(gru_output)\n", 1418 | " output = self.classifier(contex_vector)\n", 1419 | " return output" 1420 | ] 1421 | }, 1422 | { 1423 | "cell_type": "code", 1424 | "execution_count": null, 1425 | "id": "b10279b6-7547-47e1-a8ff-44d9a9feef7d", 1426 | "metadata": {}, 1427 | "outputs": [], 1428 | "source": [ 1429 | "config = TaskConfig()\n", 1430 | "model = CRNN(config)\n", 1431 | "model" 1432 | ] 1433 | }, 1434 | { 1435 | "cell_type": "code", 1436 | "execution_count": null, 1437 | "id": "275de3c2-9dfd-477b-98ba-e42e4ceef70d", 1438 | "metadata": {}, 1439 | "outputs": [], 1440 | "source": [ 1441 | "model(train_dataset[0][\"wav\"].unsqueeze(0))" 1442 | ] 1443 | }, 1444 | { 1445 | "cell_type": "code", 1446 | "execution_count": null, 1447 | "id": "18e98516-e71b-4afe-9c2c-59ca6cbd7ee0", 1448 | "metadata": {}, 1449 | "outputs": [], 1450 | "source": [ 1451 | "from tqdm.auto import tqdm\n", 1452 | "import wandb\n", 1453 | "\n", 1454 | "wandb.login()" 1455 | ] 1456 | }, 1457 | { 1458 | "cell_type": "code", 1459 | "execution_count": null, 1460 | "id": "f20e5fe8-f64e-43bd-a308-0c504cea4785", 1461 | "metadata": {}, 1462 | "outputs": [], 1463 | "source": [ 1464 | "criterion = nn.CrossEntropyLoss()" 1465 | ] 1466 | }, 1467 | { 1468 | "cell_type": "code", 1469 | "execution_count": null, 1470 | "id": 
"6dc333e1-65e5-4f2f-8b3b-825801d0c5d8", 1471 | "metadata": {}, 1472 | "outputs": [], 1473 | "source": [ 1474 | "# FA - true: 0, model: 1\n", 1475 | "# FR - true: 1, model: 0\n", 1476 | "\n", 1477 | "def count_fa(preds, labels):\n", 1478 | " preds = torch.argmax(preds, dim=-1)\n", 1479 | "\n", 1480 | " FA = torch.sum(preds[labels == 0])\n", 1481 | " \n", 1482 | " # torch.numel - returns total number of elements in tensor\n", 1483 | " return FA.item() / torch.numel(preds)\n", 1484 | "\n", 1485 | "def count_fr(preds, labels):\n", 1486 | " preds = torch.argmax(preds, dim=-1)\n", 1487 | "\n", 1488 | " FR = torch.sum(labels[preds == 0])\n", 1489 | " \n", 1490 | " # torch.numel - returns total number of elements in tensor\n", 1491 | " return FR.item() / torch.numel(preds)\n", 1492 | "\n", 1493 | "def count_acc(preds, labels):\n", 1494 | " preds = torch.argmax(preds, dim=-1)\n", 1495 | "\n", 1496 | " acc = torch.sum(preds == labels)\n", 1497 | " \n", 1498 | " # torch.numel - returns total number of elements in tensor\n", 1499 | " return acc.item() / torch.numel(preds)" 1500 | ] 1501 | }, 1502 | { 1503 | "cell_type": "code", 1504 | "execution_count": null, 1505 | "id": "2d87ba59-936c-45b6-8468-2dd88c23a032", 1506 | "metadata": {}, 1507 | "outputs": [], 1508 | "source": [ 1509 | "def train_one_epoch(model, dataloader, criterion, optimizer, scheduler, device, epoch):\n", 1510 | " model.train()\n", 1511 | "\n", 1512 | " avg_loss = 0\n", 1513 | " step = epoch * len(dataloader)\n", 1514 | " for batch_idx, (wav, label) in tqdm(enumerate(dataloader), total=len(dataloader)):\n", 1515 | " wav, label = wav.to(device), label.to(device)\n", 1516 | "\n", 1517 | " preds = model(wav)\n", 1518 | " loss = criterion(preds, label)\n", 1519 | "\n", 1520 | " loss.backward()\n", 1521 | " optimizer.step()\n", 1522 | " optimizer.zero_grad()\n", 1523 | " scheduler.step()\n", 1524 | "\n", 1525 | " avg_loss += loss.item()\n", 1526 | " \n", 1527 | " wandb.log({\n", 1528 | " \"train_step_loss\": loss.item(),\n", 1529 | " \"lr\": scheduler.get_last_lr()[0], # get current lr for the 0th param group\n", 1530 | " \"acc_step\": count_acc(preds, label),\n", 1531 | " \"fa_step\": count_fa(preds, label),\n", 1532 | " \"fr_step\": count_fr(preds, label),\n", 1533 | " }, step=step + batch_idx)\n", 1534 | "\n", 1535 | " if batch_idx == 0: \n", 1536 | " wandb.log({\"train_image\": wandb.Audio(wav[0].detach().cpu().numpy(), sample_rate=16000,\n", 1537 | " caption=f\"Label: {label[0]}, Pred: {preds[0].argmax(-1)}\")},\n", 1538 | " step=step+batch_idx)\n", 1539 | "\n", 1540 | " avg_loss = avg_loss / (batch_idx + 1)\n", 1541 | " return avg_loss\n", 1542 | "\n", 1543 | "\n", 1544 | "def evaluate(model, dataloader, criterion, device):\n", 1545 | " model.eval()\n", 1546 | "\n", 1547 | " avg_loss = 0\n", 1548 | " accuracy = 0\n", 1549 | " fa = 0\n", 1550 | " fr = 0\n", 1551 | " total_elements = 0\n", 1552 | " for batch_idx, (wav, label) in enumerate(dataloader):\n", 1553 | " wav, label = wav.to(device), label.to(device)\n", 1554 | "\n", 1555 | " preds = model(wav)\n", 1556 | " loss = criterion(preds, label)\n", 1557 | "\n", 1558 | " accuracy += count_acc(preds, label)\n", 1559 | " fa += count_fa(preds, label)\n", 1560 | " fr += count_fr(preds, label)\n", 1561 | " avg_loss += loss.item()\n", 1562 | " \n", 1563 | "\n", 1564 | " avg_loss = avg_loss / (batch_idx + 1)\n", 1565 | " accuracy = accuracy / (batch_idx + 1)\n", 1566 | " fa = fa / (batch_idx + 1)\n", 1567 | " fr = fr / (batch_idx + 1)\n", 1568 | "\n", 1569 | " return avg_loss, accuracy, fa, 
fr\n", 1570 | "\n", 1571 | "\n", 1572 | "def train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, device, n_epochs):\n", 1573 | "\n", 1574 | " train_avg_losses = []\n", 1575 | " val_avg_losses = []\n", 1576 | " val_accuracy_list = []\n", 1577 | "\n", 1578 | " for epoch in range(n_epochs): \n", 1579 | " train_avg_loss = train_one_epoch(model, train_dataloader, criterion, optimizer, scheduler, device, epoch)\n", 1580 | " val_avg_loss, val_accuracy, val_fa, val_fr = evaluate(model, val_dataloader, criterion, device)\n", 1581 | "\n", 1582 | " wandb.log({\n", 1583 | " \"train_avg_loss\": train_avg_loss, \n", 1584 | " \"val_avg_loss\": val_avg_loss,\n", 1585 | " \"val_accuracy\": val_accuracy,\n", 1586 | " \"val_fa\": val_fa,\n", 1587 | " \"val_fr\": val_fr,\n", 1588 | " }, step=(epoch + 1) * len(train_dataloader))" 1589 | ] 1590 | }, 1591 | { 1592 | "cell_type": "code", 1593 | "execution_count": null, 1594 | "id": "2d1fb606-b1f6-4f7e-b656-2523402950a8", 1595 | "metadata": {}, 1596 | "outputs": [], 1597 | "source": [ 1598 | "model.to(config.device)\n", 1599 | "\n", 1600 | "NUM_EPOCHS = 2\n", 1601 | "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)\n", 1602 | "scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS * len(train_dataloader), eta_min=1e-4)" 1603 | ] 1604 | }, 1605 | { 1606 | "cell_type": "code", 1607 | "execution_count": null, 1608 | "id": "deffa1c5-bee8-441b-aee7-8d5eba4f9a38", 1609 | "metadata": {}, 1610 | "outputs": [], 1611 | "source": [ 1612 | "with wandb.init(\n", 1613 | " project=\"seminar_wandb_kws\", # project name\n", 1614 | " name=\"crnn\" # run name within the project\n", 1615 | " ) as run:\n", 1616 | " train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, config.device, NUM_EPOCHS)" 1617 | ] 1618 | }, 1619 | { 1620 | "cell_type": "code", 1621 | "execution_count": null, 1622 | "id": "da9bdc8f-075f-4171-a3bf-3362159c8a6b", 1623 | "metadata": {}, 1624 | "outputs": [], 1625 | "source": [] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": null, 1630 | "id": "dfbf78d5-80dc-417f-b444-5f30a031397f", 1631 | "metadata": {}, 1632 | "outputs": [], 1633 | "source": [ 1634 | "model = CRNNV2(...)\n", 1635 | "\n", 1636 | "model.to(config.device)\n", 1637 | "\n", 1638 | "NUM_EPOCHS = 2\n", 1639 | "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)\n", 1640 | "scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS * len(train_dataloader), eta_min=1e-4)\n", 1641 | "\n", 1642 | "with wandb.init(\n", 1643 | " project=\"seminar_wandb_kws\", # project name\n", 1644 | " name=\"crnnv2\" # run name within the project\n", 1645 | " ) as run:\n", 1646 | " train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, config.device, NUM_EPOCHS)" 1647 | ] 1648 | }, 1649 | { 1650 | "cell_type": "code", 1651 | "execution_count": null, 1652 | "id": "0a31500c-c304-450d-8862-bde7a679f809", 1653 | "metadata": {}, 1654 | "outputs": [], 1655 | "source": [ 1656 | "model = CRNNV3(...)\n", 1657 | "\n", 1658 | "model.to(config.device)\n", 1659 | "\n", 1660 | "NUM_EPOCHS = 2\n", 1661 | "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)\n", 1662 | "scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS * len(train_dataloader), eta_min=1e-4)\n", 1663 | "\n", 1664 | 
"with wandb.init(\n", 1665 | " project=\"seminar_wandb_kws\", # project name\n", 1666 | " name=\"crnnv3\" # run name within the project\n", 1667 | " ) as run:\n", 1668 | " train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, config.device, NUM_EPOCHS)" 1669 | ] 1670 | } 1671 | ], 1672 | "metadata": { 1673 | "kernelspec": { 1674 | "display_name": "Python 3 (ipykernel)", 1675 | "language": "python", 1676 | "name": "python3" 1677 | }, 1678 | "language_info": { 1679 | "codemirror_mode": { 1680 | "name": "ipython", 1681 | "version": 3 1682 | }, 1683 | "file_extension": ".py", 1684 | "mimetype": "text/x-python", 1685 | "name": "python", 1686 | "nbconvert_exporter": "python", 1687 | "pygments_lexer": "ipython3", 1688 | "version": "3.9.7" 1689 | } 1690 | }, 1691 | "nbformat": 4, 1692 | "nbformat_minor": 5 1693 | } 1694 | -------------------------------------------------------------------------------- /day03/notebook_problems_examples/Example_Unstructured_Unordered.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ad755e7f-097a-47ef-af96-db14f4c617e5", 6 | "metadata": {}, 7 | "source": [ 8 | "This notebook is based on [DLA Seminar](https://github.com/markovka17/dla/blob/2022/week06/seminar.ipynb)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "aac684a2-d865-446f-84df-212ccb25604e", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "def train_one_epoch(model, dataloader, criterion, optimizer, scheduler, device, epoch):\n", 19 | " model.train()\n", 20 | "\n", 21 | " avg_loss = 0\n", 22 | " step = epoch * len(dataloader)\n", 23 | " for batch_idx, (wav, label) in tqdm(enumerate(dataloader), total=len(dataloader)):\n", 24 | " wav, label = wav.to(device), label.to(device)\n", 25 | "\n", 26 | " preds = model(wav)\n", 27 | " loss = criterion(preds, label)\n", 28 | "\n", 29 | " loss.backward()\n", 30 | " optimizer.step()\n", 31 | " optimizer.zero_grad()\n", 32 | " scheduler.step()\n", 33 | "\n", 34 | " avg_loss += loss.item()\n", 35 | " \n", 36 | " wandb.log({\n", 37 | " \"train_step_loss\": loss.item(),\n", 38 | " \"lr\": scheduler.get_last_lr()[0], # get current lr for the 0th param group\n", 39 | " \"acc_step\": count_acc(preds, label),\n", 40 | " \"fa_step\": count_fa(preds, label),\n", 41 | " \"fr_step\": count_fr(preds, label),\n", 42 | " }, step=step + batch_idx)\n", 43 | "\n", 44 | " if batch_idx == 0: \n", 45 | " wandb.log({\"train_image\": wandb.Audio(wav[0].detach().cpu().numpy(), sample_rate=16000,\n", 46 | " caption=f\"Label: {label[0]}, Pred: {preds[0].argmax(-1)}\")},\n", 47 | " step=step+batch_idx)\n", 48 | "\n", 49 | " avg_loss = avg_loss / (batch_idx + 1)\n", 50 | " return avg_loss\n", 51 | "\n", 52 | "\n", 53 | "def evaluate(model, dataloader, criterion, device):\n", 54 | " model.eval()\n", 55 | "\n", 56 | " avg_loss = 0\n", 57 | " accuracy = 0\n", 58 | " fa = 0\n", 59 | " fr = 0\n", 60 | " total_elements = 0\n", 61 | " for batch_idx, (wav, label) in enumerate(dataloader):\n", 62 | " wav, label = wav.to(device), label.to(device)\n", 63 | "\n", 64 | " preds = model(wav)\n", 65 | " loss = criterion(preds, label)\n", 66 | "\n", 67 | " accuracy += count_acc(preds, label)\n", 68 | " fa += count_fa(preds, label)\n", 69 | " fr += count_fr(preds, label)\n", 70 | " avg_loss += loss.item()\n", 71 | " \n", 72 | "\n", 73 | " avg_loss = avg_loss / (batch_idx + 1)\n", 74 | " accuracy = accuracy / (batch_idx + 1)\n", 75 
| " fa = fa / (batch_idx + 1)\n", 76 | " fr = fr / (batch_idx + 1)\n", 77 | "\n", 78 | " return avg_loss, accuracy, fa, fr\n", 79 | "\n", 80 | "\n", 81 | "def train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, device, n_epochs):\n", 82 | "\n", 83 | " train_avg_losses = []\n", 84 | " val_avg_losses = []\n", 85 | " val_accuracy_list = []\n", 86 | "\n", 87 | " for epoch in range(n_epochs): \n", 88 | " train_avg_loss = train_one_epoch(model, train_dataloader, criterion, optimizer, scheduler, device, epoch)\n", 89 | " val_avg_loss, val_accuracy, val_fa, val_fr = evaluate(model, val_dataloader, criterion, device)\n", 90 | "\n", 91 | " wandb.log({\n", 92 | " \"train_avg_loss\": train_avg_loss, \n", 93 | " \"val_avg_loss\": val_avg_loss,\n", 94 | " \"val_accuracy\": val_accuracy,\n", 95 | " \"val_fa\": val_fa,\n", 96 | " \"val_fr\": val_fr,\n", 97 | " }, step=(epoch + 1) * len(train_dataloader))" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "1a09c197-776c-433f-9275-9561f2000d0e", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "import dataclasses\n", 108 | "import torch\n", 109 | "from typing import Tuple, Union, List, Callable, Optional\n", 110 | "\n", 111 | "@dataclasses.dataclass\n", 112 | "class TaskConfig:\n", 113 | " keyword: str = 'sheila' # We will use 1 key word -- 'sheila'\n", 114 | " batch_size: int = 128\n", 115 | " learning_rate: float = 3e-4\n", 116 | " weight_decay: float = 1e-5\n", 117 | " num_epochs: int = 20\n", 118 | " n_mels: int = 40\n", 119 | " cnn_out_channels: int = 8\n", 120 | " kernel_size: Tuple[int, int] = (5, 20)\n", 121 | " stride: Tuple[int, int] = (2, 8)\n", 122 | " hidden_size: int = 64\n", 123 | " gru_num_layers: int = 2\n", 124 | " bidirectional: bool = False\n", 125 | " num_classes: int = 2\n", 126 | " sample_rate: int = 16000\n", 127 | " device: torch.device = torch.device(\n", 128 | " 'cuda:0' if torch.cuda.is_available() else 'cpu')" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "2bc4b4bf-3687-4885-a826-0beaae4b0146", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# !wget http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz -O speech_commands_v0.01.tar.gz\n", 139 | "# !mkdir speech_commands && tar -C speech_commands -xvzf speech_commands_v0.01.tar.gz 1> log" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "39dee0e1-4b94-44e1-ab94-a471ec5d028d", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "from torch.utils.data import Dataset, DataLoader" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "a3125256-a635-4a7f-86dc-175bedcdd3dc", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "import pandas as pd\n", 160 | "from pathlib import Path\n", 161 | "import json\n", 162 | "from collections import OrderedDict\n", 163 | "\n", 164 | "def read_json(fname):\n", 165 | " fname = Path(fname)\n", 166 | " with fname.open(\"rt\") as handle:\n", 167 | " return json.load(handle, object_hook=OrderedDict)\n", 168 | "\n", 169 | "\n", 170 | "def write_json(content, fname):\n", 171 | " fname = Path(fname)\n", 172 | " with fname.open(\"wt\") as handle:\n", 173 | " json.dump(content, handle, indent=4, sort_keys=False)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "f46b0a0b-40b9-44e4-b784-df2a8443cb57", 180 | "metadata": {}, 181 | 
"outputs": [], 182 | "source": [ 183 | "class SpeechCommandDataset(Dataset):\n", 184 | "\n", 185 | " def __init__(\n", 186 | " self,\n", 187 | " transform: Optional[Callable] = None,\n", 188 | " path2dir: str = None,\n", 189 | " keywords: Union[str, List[str]] = None,\n", 190 | " csv: Optional[pd.DataFrame] = None,\n", 191 | " part: \"str\" = \"train\",\n", 192 | " ): \n", 193 | " self.transform = transform\n", 194 | "\n", 195 | " self.path2dir = path2dir\n", 196 | " self.keywords = keywords\n", 197 | " self.index = self.create_or_load_index(part)\n", 198 | "\n", 199 | " def create_or_load_index(self, part):\n", 200 | " index_path = Path(f\"{part}_index.json\")\n", 201 | " \n", 202 | " if not index_path.exists():\n", 203 | " self.create_index(part)\n", 204 | " \n", 205 | " return read_json(index_path)\n", 206 | "\n", 207 | " def create_index(self, part):\n", 208 | " path2dir = Path(self.path2dir)\n", 209 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 210 | " \n", 211 | " all_keywords = [\n", 212 | " p.stem for p in path2dir.glob('*')\n", 213 | " if p.is_dir() and not p.stem.startswith('_')\n", 214 | " ]\n", 215 | "\n", 216 | " index = []\n", 217 | " for keyword in all_keywords:\n", 218 | " paths = (path2dir / keyword).rglob('*.wav')\n", 219 | " if keyword in keywords:\n", 220 | " for path2wav in paths:\n", 221 | " index.append({\n", 222 | " \"path\": path2wav.as_posix(),\n", 223 | " \"keyword\": keyword,\n", 224 | " \"label\": 1\n", 225 | " })\n", 226 | " else:\n", 227 | " for path2wav in paths:\n", 228 | " index.append({\n", 229 | " \"path\": path2wav.as_posix(),\n", 230 | " \"keyword\": keyword,\n", 231 | " \"label\": 0\n", 232 | " })\n", 233 | "\n", 234 | " torch.manual_seed(0)\n", 235 | " indexes = torch.randperm(len(index))\n", 236 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 237 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 238 | "\n", 239 | " train_index = [index[i] for i in train_indexes]\n", 240 | " val_index = [index[i] for i in val_indexes]\n", 241 | "\n", 242 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 243 | " write_json(train_index, str(train_index_path))\n", 244 | " \n", 245 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 246 | " write_json(val_index, str(val_index_path))\n", 247 | "\n", 248 | " def __getitem__(self, ind: int):\n", 249 | " instance = self.index[ind]\n", 250 | "\n", 251 | " path2wav = instance['path']\n", 252 | " wav, sr = torchaudio.load(path2wav)\n", 253 | " wav = wav.sum(dim=0)\n", 254 | " \n", 255 | " if self.transform:\n", 256 | " wav = self.transform(wav)\n", 257 | "\n", 258 | " return {\n", 259 | " 'wav': wav,\n", 260 | " 'keywors': instance['keyword'],\n", 261 | " 'label': instance['label']\n", 262 | " }\n", 263 | "\n", 264 | " def __len__(self):\n", 265 | " return len(self.index)\n" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "id": "af4dfa68-7138-4f97-b482-b43e02f40f48", 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "class CRNNV3(nn.Module):\n", 276 | "\n", 277 | " def __init__(self, config: TaskConfig):\n", 278 | " super().__init__()\n", 279 | " self.config = config\n", 280 | "\n", 281 | " self.mel_spec = LogMelspec(config)\n", 282 | "\n", 283 | " self.conv = nn.Sequential(\n", 284 | " nn.Conv2d(\n", 285 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 286 | " kernel_size=config.kernel_size, stride=config.stride\n", 287 | " ),\n", 288 | " nn.Flatten(start_dim=1, end_dim=2),\n", 289 | " 
)\n", 290 | "\n", 291 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 292 | " config.stride[0] + 1\n", 293 | " \n", 294 | " self.gru = nn.GRU(\n", 295 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 296 | " hidden_size=config.hidden_size,\n", 297 | " num_layers=config.gru_num_layers,\n", 298 | " dropout=0.1,\n", 299 | " bidirectional=config.bidirectional,\n", 300 | " batch_first=True\n", 301 | " )\n", 302 | "\n", 303 | " self.attention = Attention(config.hidden_size)\n", 304 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 305 | " \n", 306 | " def forward(self, input):\n", 307 | " input = self.mel_spec(input)\n", 308 | " \n", 309 | " input = input.unsqueeze(dim=1)\n", 310 | " conv_output = self.conv(input).transpose(-1, -2)\n", 311 | " gru_output, _ = self.gru(conv_output)\n", 312 | " contex_vector = self.attention(gru_output)\n", 313 | " output = self.classifier(contex_vector)\n", 314 | " return output\n", 315 | "\n", 316 | "\n", 317 | "class CRNNV3(nn.Module):\n", 318 | "\n", 319 | " def __init__(self, config: TaskConfig):\n", 320 | " super().__init__()\n", 321 | " self.config = config\n", 322 | "\n", 323 | " self.mel_spec = LogMelspec(config)\n", 324 | "\n", 325 | " self.conv = nn.Sequential(\n", 326 | " nn.Conv2d(\n", 327 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 328 | " kernel_size=config.kernel_size, stride=config.stride\n", 329 | " ),\n", 330 | " nn.Flatten(start_dim=1, end_dim=2),\n", 331 | " )\n", 332 | "\n", 333 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 334 | " config.stride[0] + 1\n", 335 | " \n", 336 | " self.gru = nn.GRU(\n", 337 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 338 | " hidden_size=config.hidden_size,\n", 339 | " num_layers=config.gru_num_layers,\n", 340 | " dropout=0.1,\n", 341 | " bidirectional=config.bidirectional,\n", 342 | " batch_first=True\n", 343 | " )\n", 344 | "\n", 345 | " self.attention = Attention(config.hidden_size)\n", 346 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 347 | " \n", 348 | " def forward(self, input):\n", 349 | " input = self.mel_spec(input)\n", 350 | " \n", 351 | " input = input.unsqueeze(dim=1)\n", 352 | " conv_output = self.conv(input).transpose(-1, -2)\n", 353 | " gru_output, _ = self.gru(conv_output)\n", 354 | " contex_vector = self.attention(gru_output)\n", 355 | " output = self.classifier(contex_vector)\n", 356 | " return output\n", 357 | "\n", 358 | "\n", 359 | "class CRNNV3(nn.Module):\n", 360 | "\n", 361 | " def __init__(self, config: TaskConfig):\n", 362 | " super().__init__()\n", 363 | " self.config = config\n", 364 | "\n", 365 | " self.mel_spec = LogMelspec(config)\n", 366 | "\n", 367 | " self.conv = nn.Sequential(\n", 368 | " nn.Conv2d(\n", 369 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 370 | " kernel_size=config.kernel_size, stride=config.stride\n", 371 | " ),\n", 372 | " nn.Flatten(start_dim=1, end_dim=2),\n", 373 | " )\n", 374 | "\n", 375 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 376 | " config.stride[0] + 1\n", 377 | " \n", 378 | " self.gru = nn.GRU(\n", 379 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 380 | " hidden_size=config.hidden_size,\n", 381 | " num_layers=config.gru_num_layers,\n", 382 | " dropout=0.1,\n", 383 | " bidirectional=config.bidirectional,\n", 384 | " batch_first=True\n", 385 | " )\n", 386 | "\n", 387 | " self.attention = 
Attention(config.hidden_size)\n", 388 | "        self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 389 | "    \n", 390 | "    def forward(self, input):\n", 391 | "        input = self.mel_spec(input)\n", 392 | "        \n", 393 | "        input = input.unsqueeze(dim=1)\n", 394 | "        conv_output = self.conv(input).transpose(-1, -2)\n", 395 | "        gru_output, _ = self.gru(conv_output)\n", 396 | "        contex_vector = self.attention(gru_output)\n", 397 | "        output = self.classifier(contex_vector)\n", 398 | "        return output\n", 399 | "\n", 400 | "\n", 401 | "class CRNNV3(nn.Module):\n", 402 | "\n", 403 | "    def __init__(self, config: TaskConfig):\n", 404 | "        super().__init__()\n", 405 | "        self.config = config\n", 406 | "\n", 407 | "        self.mel_spec = LogMelspec(config)\n", 408 | "\n", 409 | "        self.conv = nn.Sequential(\n", 410 | "            nn.Conv2d(\n", 411 | "                in_channels=1, out_channels=config.cnn_out_channels,\n", 412 | "                kernel_size=config.kernel_size, stride=config.stride\n", 413 | "            ),\n", 414 | "            nn.Flatten(start_dim=1, end_dim=2),\n", 415 | "        )\n", 416 | "\n", 417 | "        self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 418 | "            config.stride[0] + 1\n", 419 | "        \n", 420 | "        self.gru = nn.GRU(\n", 421 | "            input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 422 | "            hidden_size=config.hidden_size,\n", 423 | "            num_layers=config.gru_num_layers,\n", 424 | "            dropout=0.1,\n", 425 | "            bidirectional=config.bidirectional,\n", 426 | "            batch_first=True\n", 427 | "        )\n", 428 | "\n", 429 | "        self.attention = Attention(config.hidden_size)\n", 430 | "        self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 431 | "    \n", 432 | "    def forward(self, input):\n", 433 | "        input = self.mel_spec(input)\n", 434 | "        \n", 435 | "        input = input.unsqueeze(dim=1)\n", 436 | "        conv_output = self.conv(input).transpose(-1, -2)\n", 437 | "        gru_output, _ = self.gru(conv_output)\n", 438 | "        contex_vector = self.attention(gru_output)\n", 439 | "        output = self.classifier(contex_vector)\n", 440 | "        return output\n", 441 | "\n", 442 | "\n", 443 | "class CRNNV3(nn.Module):\n", 444 | "\n", 445 | "    def __init__(self, config: TaskConfig):\n", 446 | "        super().__init__()\n", 447 | "        self.config = config\n", 448 | "\n", 449 | "        self.mel_spec = LogMelspec(config)\n", 450 | "\n", 451 | "        self.conv = nn.Sequential(\n", 452 | "            nn.Conv2d(\n", 453 | "                in_channels=1, out_channels=config.cnn_out_channels,\n", 454 | "                kernel_size=config.kernel_size, stride=config.stride\n", 455 | "            ),\n", 456 | "            nn.Flatten(start_dim=1, end_dim=2),\n", 457 | "        )\n", 458 | "\n", 459 | "        self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 460 | "            config.stride[0] + 1\n", 461 | "        \n", 462 | "        self.gru = nn.GRU(\n", 463 | "            input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 464 | "            hidden_size=config.hidden_size,\n", 465 | "            num_layers=config.gru_num_layers,\n", 466 | "            dropout=0.1,\n", 467 | "            bidirectional=config.bidirectional,\n", 468 | "            batch_first=True\n", 469 | "        )\n", 470 | "\n", 471 | "        self.attention = Attention(config.hidden_size)\n", 472 | "        self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 473 | "    \n", 474 | "    def forward(self, input):\n", 475 | "        input = self.mel_spec(input)\n", 476 | "        \n", 477 | "        input = input.unsqueeze(dim=1)\n", 478 | "        conv_output = self.conv(input).transpose(-1, -2)\n", 479 | "        gru_output, _ = self.gru(conv_output)\n", 480 | "        contex_vector = self.attention(gru_output)\n", 481 | "        output = self.classifier(contex_vector)\n", 482 | "        return output\n\n\nclass 
CRNNV3(nn.Module):\n", 483 | "\n", 484 | " def __init__(self, config: TaskConfig):\n", 485 | " super().__init__()\n", 486 | " self.config = config\n", 487 | "\n", 488 | " self.mel_spec = LogMelspec(config)\n", 489 | "\n", 490 | " self.conv = nn.Sequential(\n", 491 | " nn.Conv2d(\n", 492 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 493 | " kernel_size=config.kernel_size, stride=config.stride\n", 494 | " ),\n", 495 | " nn.Flatten(start_dim=1, end_dim=2),\n", 496 | " )\n", 497 | "\n", 498 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 499 | " config.stride[0] + 1\n", 500 | " \n", 501 | " self.gru = nn.GRU(\n", 502 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 503 | " hidden_size=config.hidden_size,\n", 504 | " num_layers=config.gru_num_layers,\n", 505 | " dropout=0.1,\n", 506 | " bidirectional=config.bidirectional,\n", 507 | " batch_first=True\n", 508 | " )\n", 509 | "\n", 510 | " self.attention = Attention(config.hidden_size)\n", 511 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 512 | " \n", 513 | " def forward(self, input):\n", 514 | " input = self.mel_spec(input)\n", 515 | " \n", 516 | " input = input.unsqueeze(dim=1)\n", 517 | " conv_output = self.conv(input).transpose(-1, -2)\n", 518 | " gru_output, _ = self.gru(conv_output)\n", 519 | " contex_vector = self.attention(gru_output)\n", 520 | " output = self.classifier(contex_vector)\n", 521 | " return output\n", 522 | "\n", 523 | "\n", 524 | "class CRNNV3(nn.Module):\n", 525 | "\n", 526 | " def __init__(self, config: TaskConfig):\n", 527 | " super().__init__()\n", 528 | " self.config = config\n", 529 | "\n", 530 | " self.mel_spec = LogMelspec(config)\n", 531 | "\n", 532 | " self.conv = nn.Sequential(\n", 533 | " nn.Conv2d(\n", 534 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 535 | " kernel_size=config.kernel_size, stride=config.stride\n", 536 | " ),\n", 537 | " nn.Flatten(start_dim=1, end_dim=2),\n", 538 | " )\n", 539 | "\n", 540 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 541 | " config.stride[0] + 1\n", 542 | " \n", 543 | " self.gru = nn.GRU(\n", 544 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 545 | " hidden_size=config.hidden_size,\n", 546 | " num_layers=config.gru_num_layers,\n", 547 | " dropout=0.1,\n", 548 | " bidirectional=config.bidirectional,\n", 549 | " batch_first=True\n", 550 | " )\n", 551 | "\n", 552 | " self.attention = Attention(config.hidden_size)\n", 553 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 554 | " \n", 555 | " def forward(self, input):\n", 556 | " input = self.mel_spec(input)\n", 557 | " \n", 558 | " input = input.unsqueeze(dim=1)\n", 559 | " conv_output = self.conv(input).transpose(-1, -2)\n", 560 | " gru_output, _ = self.gru(conv_output)\n", 561 | " contex_vector = self.attention(gru_output)\n", 562 | " output = self.classifier(contex_vector)\n", 563 | " return output\n", 564 | "\n", 565 | "\n", 566 | "class CRNNV3(nn.Module):\n", 567 | "\n", 568 | " def __init__(self, config: TaskConfig):\n", 569 | " super().__init__()\n", 570 | " self.config = config\n", 571 | "\n", 572 | " self.mel_spec = LogMelspec(config)\n", 573 | "\n", 574 | " self.conv = nn.Sequential(\n", 575 | " nn.Conv2d(\n", 576 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 577 | " kernel_size=config.kernel_size, stride=config.stride\n", 578 | " ),\n", 579 | " nn.Flatten(start_dim=1, end_dim=2),\n", 580 | " )\n", 581 | "\n", 582 
| " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 583 | " config.stride[0] + 1\n", 584 | " \n", 585 | " self.gru = nn.GRU(\n", 586 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 587 | " hidden_size=config.hidden_size,\n", 588 | " num_layers=config.gru_num_layers,\n", 589 | " dropout=0.1,\n", 590 | " bidirectional=config.bidirectional,\n", 591 | " batch_first=True\n", 592 | " )\n", 593 | "\n", 594 | " self.attention = Attention(config.hidden_size)\n", 595 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 596 | " \n", 597 | " def forward(self, input):\n", 598 | " input = self.mel_spec(input)\n", 599 | " \n", 600 | " input = input.unsqueeze(dim=1)\n", 601 | " conv_output = self.conv(input).transpose(-1, -2)\n", 602 | " gru_output, _ = self.gru(conv_output)\n", 603 | " contex_vector = self.attention(gru_output)\n", 604 | " output = self.classifier(contex_vector)\n", 605 | " return output\n", 606 | "\n", 607 | "\n", 608 | "class CRNNV3(nn.Module):\n", 609 | "\n", 610 | " def __init__(self, config: TaskConfig):\n", 611 | " super().__init__()\n", 612 | " self.config = config\n", 613 | "\n", 614 | " self.mel_spec = LogMelspec(config)\n", 615 | "\n", 616 | " self.conv = nn.Sequential(\n", 617 | " nn.Conv2d(\n", 618 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 619 | " kernel_size=config.kernel_size, stride=config.stride\n", 620 | " ),\n", 621 | " nn.Flatten(start_dim=1, end_dim=2),\n", 622 | " )\n", 623 | "\n", 624 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 625 | " config.stride[0] + 1\n", 626 | " \n", 627 | " self.gru = nn.GRU(\n", 628 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 629 | " hidden_size=config.hidden_size,\n", 630 | " num_layers=config.gru_num_layers,\n", 631 | " dropout=0.1,\n", 632 | " bidirectional=config.bidirectional,\n", 633 | " batch_first=True\n", 634 | " )\n", 635 | "\n", 636 | " self.attention = Attention(config.hidden_size)\n", 637 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 638 | " \n", 639 | " def forward(self, input):\n", 640 | " input = self.mel_spec(input)\n", 641 | " \n", 642 | " input = input.unsqueeze(dim=1)\n", 643 | " conv_output = self.conv(input).transpose(-1, -2)\n", 644 | " gru_output, _ = self.gru(conv_output)\n", 645 | " contex_vector = self.attention(gru_output)\n", 646 | " output = self.classifier(contex_vector)\n", 647 | " return output\n", 648 | "\n", 649 | "\n", 650 | "class CRNNV3(nn.Module):\n", 651 | "\n", 652 | " def __init__(self, config: TaskConfig):\n", 653 | " super().__init__()\n", 654 | " self.config = config\n", 655 | "\n", 656 | " self.mel_spec = LogMelspec(config)\n", 657 | "\n", 658 | " self.conv = nn.Sequential(\n", 659 | " nn.Conv2d(\n", 660 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 661 | " kernel_size=config.kernel_size, stride=config.stride\n", 662 | " ),\n", 663 | " nn.Flatten(start_dim=1, end_dim=2),\n", 664 | " )\n", 665 | "\n", 666 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 667 | " config.stride[0] + 1\n", 668 | " \n", 669 | " self.gru = nn.GRU(\n", 670 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 671 | " hidden_size=config.hidden_size,\n", 672 | " num_layers=config.gru_num_layers,\n", 673 | " dropout=0.1,\n", 674 | " bidirectional=config.bidirectional,\n", 675 | " batch_first=True\n", 676 | " )\n", 677 | "\n", 678 | " self.attention = 
Attention(config.hidden_size)\n", 679 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 680 | " \n", 681 | " def forward(self, input):\n", 682 | " input = self.mel_spec(input)\n", 683 | " \n", 684 | " input = input.unsqueeze(dim=1)\n", 685 | " conv_output = self.conv(input).transpose(-1, -2)\n", 686 | " gru_output, _ = self.gru(conv_output)\n", 687 | " contex_vector = self.attention(gru_output)\n", 688 | " output = self.classifier(contex_vector)\n", 689 | " return output\n", 690 | "\n", 691 | "\n", 692 | "class CRNNV3(nn.Module):\n", 693 | "\n", 694 | " def __init__(self, config: TaskConfig):\n", 695 | " super().__init__()\n", 696 | " self.config = config\n", 697 | "\n", 698 | " self.mel_spec = LogMelspec(config)\n", 699 | "\n", 700 | " self.conv = nn.Sequential(\n", 701 | " nn.Conv2d(\n", 702 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 703 | " kernel_size=config.kernel_size, stride=config.stride\n", 704 | " ),\n", 705 | " nn.Flatten(start_dim=1, end_dim=2),\n", 706 | " )\n", 707 | "\n", 708 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 709 | " config.stride[0] + 1\n", 710 | " \n", 711 | " self.gru = nn.GRU(\n", 712 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 713 | " hidden_size=config.hidden_size,\n", 714 | " num_layers=config.gru_num_layers,\n", 715 | " dropout=0.1,\n", 716 | " bidirectional=config.bidirectional,\n", 717 | " batch_first=True\n", 718 | " )\n", 719 | "\n", 720 | " self.attention = Attention(config.hidden_size)\n", 721 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 722 | " \n", 723 | " def forward(self, input):\n", 724 | " input = self.mel_spec(input)\n", 725 | " \n", 726 | " input = input.unsqueeze(dim=1)\n", 727 | " conv_output = self.conv(input).transpose(-1, -2)\n", 728 | " gru_output, _ = self.gru(conv_output)\n", 729 | " contex_vector = self.attention(gru_output)\n", 730 | " output = self.classifier(contex_vector)\n", 731 | " return output\n", 732 | "\n", 733 | "\n", 734 | "class CRNNV3(nn.Module):\n", 735 | "\n", 736 | " def __init__(self, config: TaskConfig):\n", 737 | " super().__init__()\n", 738 | " self.config = config\n", 739 | "\n", 740 | " self.mel_spec = LogMelspec(config)\n", 741 | "\n", 742 | " self.conv = nn.Sequential(\n", 743 | " nn.Conv2d(\n", 744 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 745 | " kernel_size=config.kernel_size, stride=config.stride\n", 746 | " ),\n", 747 | " nn.Flatten(start_dim=1, end_dim=2),\n", 748 | " )\n", 749 | "\n", 750 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 751 | " config.stride[0] + 1\n", 752 | " \n", 753 | " self.gru = nn.GRU(\n", 754 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 755 | " hidden_size=config.hidden_size,\n", 756 | " num_layers=config.gru_num_layers,\n", 757 | " dropout=0.1,\n", 758 | " bidirectional=config.bidirectional,\n", 759 | " batch_first=True\n", 760 | " )\n", 761 | "\n", 762 | " self.attention = Attention(config.hidden_size)\n", 763 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 764 | " \n", 765 | " def forward(self, input):\n", 766 | " input = self.mel_spec(input)\n", 767 | " \n", 768 | " input = input.unsqueeze(dim=1)\n", 769 | " conv_output = self.conv(input).transpose(-1, -2)\n", 770 | " gru_output, _ = self.gru(conv_output)\n", 771 | " contex_vector = self.attention(gru_output)\n", 772 | " output = self.classifier(contex_vector)\n", 773 | " return output\n", 
774 | "\n", 775 | "\n", 776 | "class CRNNV3(nn.Module):\n", 777 | "\n", 778 | " def __init__(self, config: TaskConfig):\n", 779 | " super().__init__()\n", 780 | " self.config = config\n", 781 | "\n", 782 | " self.mel_spec = LogMelspec(config)\n", 783 | "\n", 784 | " self.conv = nn.Sequential(\n", 785 | " nn.Conv2d(\n", 786 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 787 | " kernel_size=config.kernel_size, stride=config.stride\n", 788 | " ),\n", 789 | " nn.Flatten(start_dim=1, end_dim=2),\n", 790 | " )\n", 791 | "\n", 792 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 793 | " config.stride[0] + 1\n", 794 | " \n", 795 | " self.gru = nn.GRU(\n", 796 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 797 | " hidden_size=config.hidden_size,\n", 798 | " num_layers=config.gru_num_layers,\n", 799 | " dropout=0.1,\n", 800 | " bidirectional=config.bidirectional,\n", 801 | " batch_first=True\n", 802 | " )\n", 803 | "\n", 804 | " self.attention = Attention(config.hidden_size)\n", 805 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 806 | " \n", 807 | " def forward(self, input):\n", 808 | " input = self.mel_spec(input)\n", 809 | " \n", 810 | " input = input.unsqueeze(dim=1)\n", 811 | " conv_output = self.conv(input).transpose(-1, -2)\n", 812 | " gru_output, _ = self.gru(conv_output)\n", 813 | " contex_vector = self.attention(gru_output)\n", 814 | " output = self.classifier(contex_vector)\n", 815 | " return output\n", 816 | "\n", 817 | "\n", 818 | "class CRNNV3(nn.Module):\n", 819 | "\n", 820 | " def __init__(self, config: TaskConfig):\n", 821 | " super().__init__()\n", 822 | " self.config = config\n", 823 | "\n", 824 | " self.mel_spec = LogMelspec(config)\n", 825 | "\n", 826 | " self.conv = nn.Sequential(\n", 827 | " nn.Conv2d(\n", 828 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 829 | " kernel_size=config.kernel_size, stride=config.stride\n", 830 | " ),\n", 831 | " nn.Flatten(start_dim=1, end_dim=2),\n", 832 | " )\n", 833 | "\n", 834 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 835 | " config.stride[0] + 1\n", 836 | " \n", 837 | " self.gru = nn.GRU(\n", 838 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 839 | " hidden_size=config.hidden_size,\n", 840 | " num_layers=config.gru_num_layers,\n", 841 | " dropout=0.1,\n", 842 | " bidirectional=config.bidirectional,\n", 843 | " batch_first=True\n", 844 | " )\n", 845 | "\n", 846 | " self.attention = Attention(config.hidden_size)\n", 847 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 848 | " \n", 849 | " def forward(self, input):\n", 850 | " input = self.mel_spec(input)\n", 851 | " \n", 852 | " input = input.unsqueeze(dim=1)\n", 853 | " conv_output = self.conv(input).transpose(-1, -2)\n", 854 | " gru_output, _ = self.gru(conv_output)\n", 855 | " contex_vector = self.attention(gru_output)\n", 856 | " output = self.classifier(contex_vector)\n", 857 | " return output" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "id": "2e704499-9c1f-40a0-a3ad-5cde8cdd25f8", 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "class SpeechCommandDatasetV3(Dataset):\n", 868 | "\n", 869 | " def __init__(\n", 870 | " self,\n", 871 | " transform: Optional[Callable] = None,\n", 872 | " path2dir: str = None,\n", 873 | " keywords: Union[str, List[str]] = None,\n", 874 | " csv: Optional[pd.DataFrame] = None,\n", 875 | " part: \"str\" = 
\"train\",\n", 876 | " ): \n", 877 | " self.transform = transform\n", 878 | "\n", 879 | " self.path2dir = path2dir\n", 880 | " self.keywords = keywords\n", 881 | " self.index = self.create_or_load_index(part)\n", 882 | "\n", 883 | " def create_or_load_index(self, part):\n", 884 | " index_path = Path(f\"{part}_index.json\")\n", 885 | " \n", 886 | " if not index_path.exists():\n", 887 | " self.create_index(part)\n", 888 | " \n", 889 | " return read_json(index_path)\n", 890 | "\n", 891 | " def create_index(self, part):\n", 892 | " path2dir = Path(self.path2dir)\n", 893 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 894 | " \n", 895 | " all_keywords = [\n", 896 | " p.stem for p in path2dir.glob('*')\n", 897 | " if p.is_dir() and not p.stem.startswith('_')\n", 898 | " ]\n", 899 | "\n", 900 | " index = []\n", 901 | " for keyword in all_keywords:\n", 902 | " paths = (path2dir / keyword).rglob('*.wav')\n", 903 | " if keyword in keywords:\n", 904 | " for path2wav in paths:\n", 905 | " index.append({\n", 906 | " \"path\": path2wav.as_posix(),\n", 907 | " \"keyword\": keyword,\n", 908 | " \"label\": 1\n", 909 | " })\n", 910 | " else:\n", 911 | " for path2wav in paths:\n", 912 | " index.append({\n", 913 | " \"path\": path2wav.as_posix(),\n", 914 | " \"keyword\": keyword,\n", 915 | " \"label\": 0\n", 916 | " })\n", 917 | "\n", 918 | " torch.manual_seed(0)\n", 919 | " indexes = torch.randperm(len(index))\n", 920 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 921 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 922 | "\n", 923 | " train_index = [index[i] for i in train_indexes]\n", 924 | " val_index = [index[i] for i in val_indexes]\n", 925 | "\n", 926 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 927 | " write_json(train_index, str(train_index_path))\n", 928 | " \n", 929 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 930 | " write_json(val_index, str(val_index_path))\n", 931 | "\n", 932 | " def __getitem__(self, ind: int):\n", 933 | " instance = self.index[ind]\n", 934 | "\n", 935 | " path2wav = instance['path']\n", 936 | " wav, sr = torchaudio.load(path2wav)\n", 937 | " wav = wav.sum(dim=0)\n", 938 | " \n", 939 | " if self.transform:\n", 940 | " wav = self.transform(wav)\n", 941 | "\n", 942 | " return {\n", 943 | " 'wav': wav,\n", 944 | " 'keywors': instance['keyword'],\n", 945 | " 'label': instance['label']\n", 946 | " }\n", 947 | "\n", 948 | " def __len__(self):\n", 949 | " return len(self.index)\n" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": null, 955 | "id": "7a5caeba-a61e-4c9e-b263-b0eb81c0bafa", 956 | "metadata": {}, 957 | "outputs": [], 958 | "source": [ 959 | "class SpeechCommandDatasetV4(Dataset):\n", 960 | "\n", 961 | " def __init__(\n", 962 | " self,\n", 963 | " transform: Optional[Callable] = None,\n", 964 | " path2dir: str = None,\n", 965 | " keywords: Union[str, List[str]] = None,\n", 966 | " csv: Optional[pd.DataFrame] = None,\n", 967 | " part: \"str\" = \"train\",\n", 968 | " ): \n", 969 | " self.transform = transform\n", 970 | "\n", 971 | " self.path2dir = path2dir\n", 972 | " self.keywords = keywords\n", 973 | " self.index = self.create_or_load_index(part)\n", 974 | "\n", 975 | " def create_or_load_index(self, part):\n", 976 | " index_path = Path(f\"{part}_index.json\")\n", 977 | " \n", 978 | " if not index_path.exists():\n", 979 | " self.create_index(part)\n", 980 | " \n", 981 | " return read_json(index_path)\n", 982 | "\n", 983 | " def create_index(self, part):\n", 984 
| " path2dir = Path(self.path2dir)\n", 985 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 986 | " \n", 987 | " all_keywords = [\n", 988 | " p.stem for p in path2dir.glob('*')\n", 989 | " if p.is_dir() and not p.stem.startswith('_')\n", 990 | " ]\n", 991 | "\n", 992 | " index = []\n", 993 | " for keyword in all_keywords:\n", 994 | " paths = (path2dir / keyword).rglob('*.wav')\n", 995 | " if keyword in keywords:\n", 996 | " for path2wav in paths:\n", 997 | " index.append({\n", 998 | " \"path\": path2wav.as_posix(),\n", 999 | " \"keyword\": keyword,\n", 1000 | " \"label\": 1\n", 1001 | " })\n", 1002 | " else:\n", 1003 | " for path2wav in paths:\n", 1004 | " index.append({\n", 1005 | " \"path\": path2wav.as_posix(),\n", 1006 | " \"keyword\": keyword,\n", 1007 | " \"label\": 0\n", 1008 | " })\n", 1009 | "\n", 1010 | " torch.manual_seed(0)\n", 1011 | " indexes = torch.randperm(len(index))\n", 1012 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 1013 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 1014 | "\n", 1015 | " train_index = [index[i] for i in train_indexes]\n", 1016 | " val_index = [index[i] for i in val_indexes]\n", 1017 | "\n", 1018 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 1019 | " write_json(train_index, str(train_index_path))\n", 1020 | " \n", 1021 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 1022 | " write_json(val_index, str(val_index_path))\n", 1023 | "\n", 1024 | " def __getitem__(self, ind: int):\n", 1025 | " instance = self.index[ind]\n", 1026 | "\n", 1027 | " path2wav = instance['path']\n", 1028 | " wav, sr = torchaudio.load(path2wav)\n", 1029 | " wav = wav.sum(dim=0)\n", 1030 | " \n", 1031 | " if self.transform:\n", 1032 | " wav = self.transform(wav)\n", 1033 | "\n", 1034 | " return {\n", 1035 | " 'wav': wav,\n", 1036 | " 'keywors': instance['keyword'],\n", 1037 | " 'label': instance['label']\n", 1038 | " }\n", 1039 | "\n", 1040 | " def __len__(self):\n", 1041 | " return len(self.index)\n" 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "execution_count": null, 1047 | "id": "c3861ca2-70de-4ed3-9cdd-82d37ad4cc13", 1048 | "metadata": {}, 1049 | "outputs": [], 1050 | "source": [] 1051 | }, 1052 | { 1053 | "cell_type": "code", 1054 | "execution_count": null, 1055 | "id": "07b777e5-5a57-4251-90e5-f291a540a935", 1056 | "metadata": {}, 1057 | "outputs": [], 1058 | "source": [ 1059 | "import torchaudio\n", 1060 | "\n", 1061 | "class AugsCreation:\n", 1062 | "\n", 1063 | " def __init__(self):\n", 1064 | " self.background_noises = [\n", 1065 | " 'speech_commands/_background_noise_/white_noise.wav',\n", 1066 | " 'speech_commands/_background_noise_/dude_miaowing.wav',\n", 1067 | " 'speech_commands/_background_noise_/doing_the_dishes.wav',\n", 1068 | " 'speech_commands/_background_noise_/exercise_bike.wav',\n", 1069 | " 'speech_commands/_background_noise_/pink_noise.wav',\n", 1070 | " 'speech_commands/_background_noise_/running_tap.wav'\n", 1071 | " ]\n", 1072 | "\n", 1073 | " self.noises = [\n", 1074 | " torchaudio.load(p)[0].squeeze()\n", 1075 | " for p in self.background_noises\n", 1076 | " ]\n", 1077 | "\n", 1078 | " def add_rand_noise(self, audio):\n", 1079 | "\n", 1080 | " # randomly choose noise\n", 1081 | " noise_num = torch.randint(low=0, high=len(\n", 1082 | " self.background_noises), size=(1,)).item()\n", 1083 | " noise = self.noises[noise_num]\n", 1084 | "\n", 1085 | " noise_level = torch.Tensor([1]) # [0, 40]\n", 1086 | "\n", 1087 | " noise_energy = torch.norm(noise)\n", 
1088 | " audio_energy = torch.norm(audio)\n", 1089 | " alpha = (audio_energy / noise_energy) * \\\n", 1090 | " torch.pow(10, -noise_level / 20)\n", 1091 | "\n", 1092 | " start = torch.randint(\n", 1093 | " low=0,\n", 1094 | " high=max(int(noise.size(0) - audio.size(0) - 1), 1),\n", 1095 | " size=(1,)\n", 1096 | " ).item()\n", 1097 | " noise_sample = noise[start: start + audio.size(0)]\n", 1098 | "\n", 1099 | " audio_new = audio + alpha * noise_sample\n", 1100 | " audio_new.clamp_(-1, 1)\n", 1101 | " return audio_new\n", 1102 | "\n", 1103 | " def __call__(self, wav):\n", 1104 | " aug_num = torch.randint(low=0, high=4, size=(1,)).item() # choose 1 random aug from augs\n", 1105 | " augs = [\n", 1106 | " lambda x: x,\n", 1107 | " lambda x: (x + torch.distributions.Normal(0, 0.01).sample(x.size())).clamp_(-1, 1),\n", 1108 | " lambda x: torchaudio.transforms.Vol(.25)(x),\n", 1109 | " lambda x: self.add_rand_noise(x)\n", 1110 | " ]\n", 1111 | "\n", 1112 | " return augs[aug_num](wav)" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": null, 1118 | "id": "3f45ccf7-aafa-46b0-9473-e27fbefae2e0", 1119 | "metadata": {}, 1120 | "outputs": [], 1121 | "source": [ 1122 | "train_dataset = SpeechCommandDataset(\n", 1123 | " path2dir='speech_commands', keywords=TaskConfig.keyword, part=\"train\", transform=AugsCreation()\n", 1124 | ")\n", 1125 | "val_dataset = SpeechCommandDataset(\n", 1126 | " path2dir='speech_commands', keywords=TaskConfig.keyword, part=\"val\"\n", 1127 | ")" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "code", 1132 | "execution_count": null, 1133 | "id": "3b336ca8-c552-43d6-8d76-e7398d921f06", 1134 | "metadata": {}, 1135 | "outputs": [], 1136 | "source": [ 1137 | "train_dataset.index[:2]" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": null, 1143 | "id": "dbb86465-6cf1-40cb-9e45-102843000e70", 1144 | "metadata": {}, 1145 | "outputs": [], 1146 | "source": [ 1147 | "from torch.nn.utils.rnn import pad_sequence\n", 1148 | "\n", 1149 | "def collate_fn(data):\n", 1150 | " wavs = []\n", 1151 | " labels = [] \n", 1152 | "\n", 1153 | " for el in data:\n", 1154 | " wavs.append(el['wav'])\n", 1155 | " labels.append(el['label'])\n", 1156 | "\n", 1157 | " # torch.nn.utils.rnn.pad_sequence takes list(Tensors) and returns padded (with 0.0) Tensor\n", 1158 | " wavs = pad_sequence(wavs, batch_first=True) \n", 1159 | " labels = torch.Tensor(labels).long()\n", 1160 | " return wavs, labels" 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "code", 1165 | "execution_count": null, 1166 | "id": "2dc73f86-847c-46cc-bb1c-df557e141748", 1167 | "metadata": {}, 1168 | "outputs": [], 1169 | "source": [ 1170 | "train_dataloader = DataLoader(train_dataset, batch_size=TaskConfig.batch_size,\n", 1171 | " shuffle=False, collate_fn=collate_fn,\n", 1172 | " num_workers=2, pin_memory=True)\n", 1173 | "\n", 1174 | "val_dataloader = DataLoader(val_dataset, batch_size=TaskConfig.batch_size,\n", 1175 | " shuffle=False, collate_fn=collate_fn,\n", 1176 | " num_workers=2, pin_memory=True)" 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "execution_count": null, 1182 | "id": "3797e444-a0ee-44a9-9d1a-d3c83e29f6f3", 1183 | "metadata": {}, 1184 | "outputs": [], 1185 | "source": [ 1186 | "from torch import nn" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": null, 1192 | "id": "130538c5-e66e-411a-9b7d-3ff978948365", 1193 | "metadata": {}, 1194 | "outputs": [], 1195 | "source": [ 1196 | "class LogMelspec(nn.Module):\n", 1197 | 
"\n", 1198 | " def __init__(self, config):\n", 1199 | " super().__init__()\n", 1200 | " self.melspec = torchaudio.transforms.MelSpectrogram(\n", 1201 | " sample_rate=config.sample_rate,\n", 1202 | " n_fft=400,\n", 1203 | " win_length=400,\n", 1204 | " hop_length=160,\n", 1205 | " n_mels=config.n_mels\n", 1206 | " )\n", 1207 | "\n", 1208 | " self.spec_augs = nn.Sequential(\n", 1209 | " torchaudio.transforms.FrequencyMasking(freq_mask_param=15),\n", 1210 | " torchaudio.transforms.TimeMasking(time_mask_param=35),\n", 1211 | " )\n", 1212 | "\n", 1213 | "\n", 1214 | " def __call__(self, batch):\n", 1215 | " x = torch.log(self.melspec(batch).clamp_(min=1e-9, max=1e9))\n", 1216 | " if self.training:\n", 1217 | " x = self.spec_augs(x)\n", 1218 | " return x" 1219 | ] 1220 | }, 1221 | { 1222 | "cell_type": "code", 1223 | "execution_count": null, 1224 | "id": "3474f4ea-910d-40b7-82ca-56650bff4ae4", 1225 | "metadata": {}, 1226 | "outputs": [], 1227 | "source": [ 1228 | "class SpeechCommandDatasetV2(Dataset):\n", 1229 | "\n", 1230 | " def __init__(\n", 1231 | " self,\n", 1232 | " transform: Optional[Callable] = None,\n", 1233 | " path2dir: str = None,\n", 1234 | " keywords: Union[str, List[str]] = None,\n", 1235 | " csv: Optional[pd.DataFrame] = None,\n", 1236 | " part: \"str\" = \"train\",\n", 1237 | " ): \n", 1238 | " self.transform = transform\n", 1239 | "\n", 1240 | " self.path2dir = path2dir\n", 1241 | " self.keywords = keywords\n", 1242 | " self.index = self.create_or_load_index(part)\n", 1243 | "\n", 1244 | " def create_or_load_index(self, part):\n", 1245 | " index_path = Path(f\"{part}_index.json\")\n", 1246 | " \n", 1247 | " if not index_path.exists():\n", 1248 | " self.create_index(part)\n", 1249 | " \n", 1250 | " return read_json(index_path)\n", 1251 | "\n", 1252 | " def create_index(self, part):\n", 1253 | " path2dir = Path(self.path2dir)\n", 1254 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 1255 | " \n", 1256 | " all_keywords = [\n", 1257 | " p.stem for p in path2dir.glob('*')\n", 1258 | " if p.is_dir() and not p.stem.startswith('_')\n", 1259 | " ]\n", 1260 | "\n", 1261 | " index = []\n", 1262 | " for keyword in all_keywords:\n", 1263 | " paths = (path2dir / keyword).rglob('*.wav')\n", 1264 | " if keyword in keywords:\n", 1265 | " for path2wav in paths:\n", 1266 | " index.append({\n", 1267 | " \"path\": path2wav.as_posix(),\n", 1268 | " \"keyword\": keyword,\n", 1269 | " \"label\": 1\n", 1270 | " })\n", 1271 | " else:\n", 1272 | " for path2wav in paths:\n", 1273 | " index.append({\n", 1274 | " \"path\": path2wav.as_posix(),\n", 1275 | " \"keyword\": keyword,\n", 1276 | " \"label\": 0\n", 1277 | " })\n", 1278 | "\n", 1279 | " torch.manual_seed(0)\n", 1280 | " indexes = torch.randperm(len(index))\n", 1281 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 1282 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 1283 | "\n", 1284 | " train_index = [index[i] for i in train_indexes]\n", 1285 | " val_index = [index[i] for i in val_indexes]\n", 1286 | "\n", 1287 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 1288 | " write_json(train_index, str(train_index_path))\n", 1289 | " \n", 1290 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 1291 | " write_json(val_index, str(val_index_path))\n", 1292 | "\n", 1293 | " def __getitem__(self, ind: int):\n", 1294 | " instance = self.index[ind]\n", 1295 | "\n", 1296 | " path2wav = instance['path']\n", 1297 | " wav, sr = torchaudio.load(path2wav)\n", 1298 | " wav = wav.sum(dim=0)\n", 
1299 | " \n", 1300 | " if self.transform:\n", 1301 | " wav = self.transform(wav)\n", 1302 | "\n", 1303 | " return {\n", 1304 | " 'wav': wav,\n", 1305 | " 'keywors': instance['keyword'],\n", 1306 | " 'label': instance['label']\n", 1307 | " }\n", 1308 | "\n", 1309 | " def __len__(self):\n", 1310 | " return len(self.index)\n" 1311 | ] 1312 | }, 1313 | { 1314 | "cell_type": "code", 1315 | "execution_count": null, 1316 | "id": "adf8be91-a439-42ad-bd77-77b5be595190", 1317 | "metadata": {}, 1318 | "outputs": [], 1319 | "source": [ 1320 | "class Attention(nn.Module):\n", 1321 | "\n", 1322 | " def __init__(self, hidden_size: int):\n", 1323 | " super().__init__()\n", 1324 | "\n", 1325 | " self.energy = nn.Sequential(\n", 1326 | " nn.Linear(hidden_size, hidden_size),\n", 1327 | " nn.Tanh(),\n", 1328 | " nn.Linear(hidden_size, 1)\n", 1329 | " )\n", 1330 | " \n", 1331 | " def forward(self, input):\n", 1332 | " energy = self.energy(input)\n", 1333 | " alpha = torch.softmax(energy, dim=-2)\n", 1334 | " return (input * alpha).sum(dim=-2)\n", 1335 | "\n", 1336 | "class SpeechCommandDatasetV5(Dataset):\n", 1337 | "\n", 1338 | " def __init__(\n", 1339 | " self,\n", 1340 | " transform: Optional[Callable] = None,\n", 1341 | " path2dir: str = None,\n", 1342 | " keywords: Union[str, List[str]] = None,\n", 1343 | " csv: Optional[pd.DataFrame] = None,\n", 1344 | " part: \"str\" = \"train\",\n", 1345 | " ): \n", 1346 | " self.transform = transform\n", 1347 | "\n", 1348 | " self.path2dir = path2dir\n", 1349 | " self.keywords = keywords\n", 1350 | " self.index = self.create_or_load_index(part)\n", 1351 | "\n", 1352 | " def create_or_load_index(self, part):\n", 1353 | " index_path = Path(f\"{part}_index.json\")\n", 1354 | " \n", 1355 | " if not index_path.exists():\n", 1356 | " self.create_index(part)\n", 1357 | " \n", 1358 | " return read_json(index_path)\n", 1359 | "\n", 1360 | " def create_index(self, part):\n", 1361 | " path2dir = Path(self.path2dir)\n", 1362 | " keywords = self.keywords if isinstance(self.keywords, list) else [self.keywords]\n", 1363 | " \n", 1364 | " all_keywords = [\n", 1365 | " p.stem for p in path2dir.glob('*')\n", 1366 | " if p.is_dir() and not p.stem.startswith('_')\n", 1367 | " ]\n", 1368 | "\n", 1369 | " index = []\n", 1370 | " for keyword in all_keywords:\n", 1371 | " paths = (path2dir / keyword).rglob('*.wav')\n", 1372 | " if keyword in keywords:\n", 1373 | " for path2wav in paths:\n", 1374 | " index.append({\n", 1375 | " \"path\": path2wav.as_posix(),\n", 1376 | " \"keyword\": keyword,\n", 1377 | " \"label\": 1\n", 1378 | " })\n", 1379 | " else:\n", 1380 | " for path2wav in paths:\n", 1381 | " index.append({\n", 1382 | " \"path\": path2wav.as_posix(),\n", 1383 | " \"keyword\": keyword,\n", 1384 | " \"label\": 0\n", 1385 | " })\n", 1386 | "\n", 1387 | " torch.manual_seed(0)\n", 1388 | " indexes = torch.randperm(len(index))\n", 1389 | " train_indexes = indexes[:int(len(index) * 0.8)]\n", 1390 | " val_indexes = indexes[int(len(index) * 0.8):]\n", 1391 | "\n", 1392 | " train_index = [index[i] for i in train_indexes]\n", 1393 | " val_index = [index[i] for i in val_indexes]\n", 1394 | "\n", 1395 | " train_index_path = pathlib.Path(\"train_index.json\")\n", 1396 | " write_json(train_index, str(train_index_path))\n", 1397 | " \n", 1398 | " val_index_path = pathlib.Path(\"val_index.json\")\n", 1399 | " write_json(val_index, str(val_index_path))\n", 1400 | "\n", 1401 | " def __getitem__(self, ind: int):\n", 1402 | " instance = self.index[ind]\n", 1403 | "\n", 1404 | " path2wav = 
instance['path']\n", 1405 | " wav, sr = torchaudio.load(path2wav)\n", 1406 | " wav = wav.sum(dim=0)\n", 1407 | " \n", 1408 | " if self.transform:\n", 1409 | " wav = self.transform(wav)\n", 1410 | "\n", 1411 | " return {\n", 1412 | " 'wav': wav,\n", 1413 | " 'keywors': instance['keyword'],\n", 1414 | " 'label': instance['label']\n", 1415 | " }\n", 1416 | "\n", 1417 | " def __len__(self):\n", 1418 | " return len(self.index)\n", 1419 | "\n", 1420 | "\n", 1421 | "class CRNN(nn.Module):\n", 1422 | "\n", 1423 | " def __init__(self, config: TaskConfig):\n", 1424 | " super().__init__()\n", 1425 | " self.config = config\n", 1426 | "\n", 1427 | " self.mel_spec = LogMelspec(config)\n", 1428 | "\n", 1429 | " self.conv = nn.Sequential(\n", 1430 | " nn.Conv2d(\n", 1431 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1432 | " kernel_size=config.kernel_size, stride=config.stride\n", 1433 | " ),\n", 1434 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1435 | " )\n", 1436 | "\n", 1437 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1438 | " config.stride[0] + 1\n", 1439 | " \n", 1440 | " self.gru = nn.GRU(\n", 1441 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1442 | " hidden_size=config.hidden_size,\n", 1443 | " num_layers=config.gru_num_layers,\n", 1444 | " dropout=0.1,\n", 1445 | " bidirectional=config.bidirectional,\n", 1446 | " batch_first=True\n", 1447 | " )\n", 1448 | "\n", 1449 | " self.attention = Attention(config.hidden_size)\n", 1450 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1451 | " \n", 1452 | " def forward(self, input):\n", 1453 | " input = self.mel_spec(input)\n", 1454 | " \n", 1455 | " input = input.unsqueeze(dim=1)\n", 1456 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1457 | " gru_output, _ = self.gru(conv_output)\n", 1458 | " contex_vector = self.attention(gru_output)\n", 1459 | " output = self.classifier(contex_vector)\n", 1460 | " return output" 1461 | ] 1462 | }, 1463 | { 1464 | "cell_type": "code", 1465 | "execution_count": null, 1466 | "id": "9732e122-7988-4544-8815-c92de443a72e", 1467 | "metadata": {}, 1468 | "outputs": [], 1469 | "source": [ 1470 | "class CRNNV2(nn.Module):\n", 1471 | "\n", 1472 | " def __init__(self, config: TaskConfig):\n", 1473 | " super().__init__()\n", 1474 | " self.config = config\n", 1475 | "\n", 1476 | " self.mel_spec = LogMelspec(config)\n", 1477 | "\n", 1478 | " self.conv = nn.Sequential(\n", 1479 | " nn.Conv2d(\n", 1480 | " in_channels=1, out_channels=config.cnn_out_channels,\n", 1481 | " kernel_size=config.kernel_size, stride=config.stride\n", 1482 | " ),\n", 1483 | " nn.Flatten(start_dim=1, end_dim=2),\n", 1484 | " )\n", 1485 | "\n", 1486 | " self.conv_out_frequency = (config.n_mels - config.kernel_size[0]) // \\\n", 1487 | " config.stride[0] + 1\n", 1488 | " \n", 1489 | " self.gru = nn.GRU(\n", 1490 | " input_size=self.conv_out_frequency * config.cnn_out_channels,\n", 1491 | " hidden_size=config.hidden_size,\n", 1492 | " num_layers=config.gru_num_layers,\n", 1493 | " dropout=0.1,\n", 1494 | " bidirectional=config.bidirectional,\n", 1495 | " batch_first=True\n", 1496 | " )\n", 1497 | "\n", 1498 | " self.attention = Attention(config.hidden_size)\n", 1499 | " self.classifier = nn.Linear(config.hidden_size, config.num_classes)\n", 1500 | " \n", 1501 | " def forward(self, input):\n", 1502 | " input = self.mel_spec(input)\n", 1503 | " \n", 1504 | " input = input.unsqueeze(dim=1)\n", 1505 | " conv_output = self.conv(input).transpose(-1, -2)\n", 1506 
| " gru_output, _ = self.gru(conv_output)\n", 1507 | " contex_vector = self.attention(gru_output)\n", 1508 | " output = self.classifier(contex_vector)\n", 1509 | " return output" 1510 | ] 1511 | }, 1512 | { 1513 | "cell_type": "code", 1514 | "execution_count": null, 1515 | "id": "4eab3ad0-c01c-4e2f-be1b-5a3c8c9ae227", 1516 | "metadata": {}, 1517 | "outputs": [], 1518 | "source": [] 1519 | }, 1520 | { 1521 | "cell_type": "code", 1522 | "execution_count": null, 1523 | "id": "b10279b6-7547-47e1-a8ff-44d9a9feef7d", 1524 | "metadata": {}, 1525 | "outputs": [], 1526 | "source": [ 1527 | "config = TaskConfig()\n", 1528 | "model = CRNN(config)\n", 1529 | "model" 1530 | ] 1531 | }, 1532 | { 1533 | "cell_type": "code", 1534 | "execution_count": null, 1535 | "id": "275de3c2-9dfd-477b-98ba-e42e4ceef70d", 1536 | "metadata": {}, 1537 | "outputs": [], 1538 | "source": [ 1539 | "model(train_dataset[0][\"wav\"].unsqueeze(0))" 1540 | ] 1541 | }, 1542 | { 1543 | "cell_type": "code", 1544 | "execution_count": null, 1545 | "id": "18e98516-e71b-4afe-9c2c-59ca6cbd7ee0", 1546 | "metadata": {}, 1547 | "outputs": [], 1548 | "source": [ 1549 | "from tqdm.auto import tqdm\n", 1550 | "import wandb\n", 1551 | "\n", 1552 | "wandb.login()" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "code", 1557 | "execution_count": null, 1558 | "id": "f20e5fe8-f64e-43bd-a308-0c504cea4785", 1559 | "metadata": {}, 1560 | "outputs": [], 1561 | "source": [ 1562 | "criterion = nn.CrossEntropyLoss()" 1563 | ] 1564 | }, 1565 | { 1566 | "cell_type": "code", 1567 | "execution_count": null, 1568 | "id": "59918ad1-3a74-4588-b26e-827d0e337fc0", 1569 | "metadata": {}, 1570 | "outputs": [], 1571 | "source": [ 1572 | "model = CRNNV2(...)\n", 1573 | "\n", 1574 | "model.to(config.device)\n", 1575 | "\n", 1576 | "NUM_EPOCHS = 2\n", 1577 | "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)\n", 1578 | "scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS * len(train_dataloader), eta_min=1e-4)\n", 1579 | "\n", 1580 | "with wandb.init(\n", 1581 | " project=\"seminar_wandb_kws\", # project name\n", 1582 | " name=\"crnnv2\" # run name within the project\n", 1583 | " ) as run:\n", 1584 | " train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, config.device, NUM_EPOCHS)" 1585 | ] 1586 | }, 1587 | { 1588 | "cell_type": "code", 1589 | "execution_count": null, 1590 | "id": "6dc333e1-65e5-4f2f-8b3b-825801d0c5d8", 1591 | "metadata": {}, 1592 | "outputs": [], 1593 | "source": [ 1594 | "# FA - true: 0, model: 1\n", 1595 | "# FR - true: 1, model: 0\n", 1596 | "\n", 1597 | "def count_fa(preds, labels):\n", 1598 | " preds = torch.argmax(preds, dim=-1)\n", 1599 | "\n", 1600 | " FA = torch.sum(preds[labels == 0])\n", 1601 | " \n", 1602 | " # torch.numel - returns total number of elements in tensor\n", 1603 | " return FA.item() / torch.numel(preds)\n", 1604 | "\n", 1605 | "def count_fr(preds, labels):\n", 1606 | " preds = torch.argmax(preds, dim=-1)\n", 1607 | "\n", 1608 | " FR = torch.sum(labels[preds == 0])\n", 1609 | " \n", 1610 | " # torch.numel - returns total number of elements in tensor\n", 1611 | " return FR.item() / torch.numel(preds)\n", 1612 | "\n", 1613 | "def count_acc(preds, labels):\n", 1614 | " preds = torch.argmax(preds, dim=-1)\n", 1615 | "\n", 1616 | " acc = torch.sum(preds == labels)\n", 1617 | " \n", 1618 | " # torch.numel - returns total number of elements in tensor\n", 1619 | " return acc.item() / torch.numel(preds)" 1620 | ] 1621 | }, 
1622 | { 1623 | "cell_type": "code", 1624 | "execution_count": null, 1625 | "id": "2d87ba59-936c-45b6-8468-2dd88c23a032", 1626 | "metadata": {}, 1627 | "outputs": [], 1628 | "source": [] 1629 | }, 1630 | { 1631 | "cell_type": "code", 1632 | "execution_count": null, 1633 | "id": "2d1fb606-b1f6-4f7e-b656-2523402950a8", 1634 | "metadata": {}, 1635 | "outputs": [], 1636 | "source": [ 1637 | "model.to(config.device)\n", 1638 | "\n", 1639 | "NUM_EPOCHS = 2\n", 1640 | "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)\n", 1641 | "scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS * len(train_dataloader), eta_min=1e-4)" 1642 | ] 1643 | }, 1644 | { 1645 | "cell_type": "code", 1646 | "execution_count": null, 1647 | "id": "deffa1c5-bee8-441b-aee7-8d5eba4f9a38", 1648 | "metadata": {}, 1649 | "outputs": [], 1650 | "source": [ 1651 | "with wandb.init(\n", 1652 | " project=\"seminar_wandb_kws\", # project name\n", 1653 | " name=\"crnn\" # run name within the project\n", 1654 | " ) as run:\n", 1655 | " train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, config.device, NUM_EPOCHS)" 1656 | ] 1657 | }, 1658 | { 1659 | "cell_type": "code", 1660 | "execution_count": null, 1661 | "id": "da9bdc8f-075f-4171-a3bf-3362159c8a6b", 1662 | "metadata": {}, 1663 | "outputs": [], 1664 | "source": [] 1665 | }, 1666 | { 1667 | "cell_type": "code", 1668 | "execution_count": null, 1669 | "id": "dfbf78d5-80dc-417f-b444-5f30a031397f", 1670 | "metadata": {}, 1671 | "outputs": [], 1672 | "source": [] 1673 | }, 1674 | { 1675 | "cell_type": "code", 1676 | "execution_count": null, 1677 | "id": "0a31500c-c304-450d-8862-bde7a679f809", 1678 | "metadata": {}, 1679 | "outputs": [], 1680 | "source": [ 1681 | "model = CRNNV3(...)\n", 1682 | "\n", 1683 | "model.to(config.device)\n", 1684 | "\n", 1685 | "NUM_EPOCHS = 2\n", 1686 | "optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)\n", 1687 | "scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS * len(train_dataloader), eta_min=1e-4)\n", 1688 | "\n", 1689 | "with wandb.init(\n", 1690 | " project=\"seminar_wandb_kws\", # project name\n", 1691 | " name=\"crnnv3\" # run name within the project\n", 1692 | " ) as run:\n", 1693 | " train(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, config.device, NUM_EPOCHS)" 1694 | ] 1695 | } 1696 | ], 1697 | "metadata": { 1698 | "kernelspec": { 1699 | "display_name": "Python 3 (ipykernel)", 1700 | "language": "python", 1701 | "name": "python3" 1702 | }, 1703 | "language_info": { 1704 | "codemirror_mode": { 1705 | "name": "ipython", 1706 | "version": 3 1707 | }, 1708 | "file_extension": ".py", 1709 | "mimetype": "text/x-python", 1710 | "name": "python", 1711 | "nbconvert_exporter": "python", 1712 | "pygments_lexer": "ipython3", 1713 | "version": "3.9.7" 1714 | } 1715 | }, 1716 | "nbformat": 4, 1717 | "nbformat_minor": 5 1718 | } 1719 | -------------------------------------------------------------------------------- /day03/pics/git_four.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LauzHack/deep-learning-bootcamp/40fc389c0b1e6639b7525dc33a54e5fcb0563c30/day03/pics/git_four.png -------------------------------------------------------------------------------- /day03/pics/git_one.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LauzHack/deep-learning-bootcamp/40fc389c0b1e6639b7525dc33a54e5fcb0563c30/day03/pics/git_one.png -------------------------------------------------------------------------------- /day03/pics/git_three.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LauzHack/deep-learning-bootcamp/40fc389c0b1e6639b7525dc33a54e5fcb0563c30/day03/pics/git_three.png -------------------------------------------------------------------------------- /day03/pics/git_two.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LauzHack/deep-learning-bootcamp/40fc389c0b1e6639b7525dc33a54e5fcb0563c30/day03/pics/git_two.png -------------------------------------------------------------------------------- /day06/README.md: -------------------------------------------------------------------------------- 1 | # Day 06 2 | 3 | Deep Learning for Audio, Eric Bezzam and Petr Grinberg 4 | 5 | - [Lecture slides](https://docs.google.com/presentation/d/1cneKqSzHmTN_O4a4q1jVSstFbf-6e7rNGOhRD8UYno4/edit?usp=sharing) 6 | 7 | -------------------------------------------------------------------------------- /day07/GNN_lecture_Ali_Hariri.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LauzHack/deep-learning-bootcamp/40fc389c0b1e6639b7525dc33a54e5fcb0563c30/day07/GNN_lecture_Ali_Hariri.pdf -------------------------------------------------------------------------------- /day07/README.md: -------------------------------------------------------------------------------- 1 | # Day 07 2 | 3 | Graph Neural Networks, Ali Hariri 4 | 5 | - [Lecture slides](GNN_lecture_Ali_Hariri.pdf) 6 | - [Colab Notebook](https://colab.research.google.com/drive/1QFjYjQrUwh8CAlIgzZp6oDAH-23F4DJ1?usp=sharing) ([Solutions](https://colab.research.google.com/drive/1SwvCzCyo-O-v-0E4s94FKP32Ukbzb4Kn?usp=sharing)) -------------------------------------------------------------------------------- /day08/Computer_Vision_13_02_2025.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LauzHack/deep-learning-bootcamp/40fc389c0b1e6639b7525dc33a54e5fcb0563c30/day08/Computer_Vision_13_02_2025.pdf -------------------------------------------------------------------------------- /day08/README.md: -------------------------------------------------------------------------------- 1 | # Day 08 2 | 3 | Computer Vision, Nikita Durasov 4 | 5 | - [Lecture slides](Computer_Vision_13_02_2025.pdf) 6 | - [(Notebook) MNIST Diffusion](https://colab.research.google.com/drive/1oTt38IIkbDaQouFrsqNjdQR7wwo6ST6w#scrollTo=_odH2MHpZSAY) 7 | - [(Notebook) MNIST Test-Time Training](https://colab.research.google.com/drive/1XFqG3s3gSmcByP88HWmyO-0Uz7KFQw3r#scrollTo=4fa619b8-a8cc-4e08-b607-89bc387d631a) -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LauzHack/deep-learning-bootcamp/40fc389c0b1e6639b7525dc33a54e5fcb0563c30/docs/logo.png --------------------------------------------------------------------------------