├── .gitignore ├── Hw1 └── 「「ML2022Spring_HW1(MODEL5).ipynb ├── Hw10 └── ML2022_hw10-0.09.ipynb ├── Hw11 ├── hw11_domain_adaptation_(en) 0.81400.ipynb ├── hw_11_report.docx └── hw_11_report.pdf ├── Hw2 └── ensemble(v2).ipynb ├── Hw3 └── p10922001_hw3.ipynb ├── Hw4 ├── hw04(0.84625_best_model4).ipynb ├── hw04(0.87050try ensemble).ipynb ├── hw04(model3).ipynb └── report.pdf ├── Hw5 ├── HW05(29.18 try backtrans 2).ipynb └── report.pdf ├── Hw6 ├── ML_HW6 try style.ipynb └── ml_hw6_report.pdf ├── Hw7 ├── ML2022Spring_HW7 ensemble.ipynb ├── ML2022Spring_HW7 m1 0.83057.ipynb ├── ML2022Spring_HW7 m6 last try.ipynb ├── ML2022Spring_HW7 m6 last try2.ipynb └── hw7_report.pdf ├── Hw8 ├── ML2022Spring_HW8 0.82379 fcn6.ipynb └── report.pdf └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Hw11/hw_11_report.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macTracyHuang/NTU-ML2022-Spring/0c14ae8d9a1448ee2da03f93836e3dd5d3a62b16/Hw11/hw_11_report.docx -------------------------------------------------------------------------------- /Hw11/hw_11_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macTracyHuang/NTU-ML2022-Spring/0c14ae8d9a1448ee2da03f93836e3dd5d3a62b16/Hw11/hw_11_report.pdf -------------------------------------------------------------------------------- /Hw4/hw04(0.84625_best_model4).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "C_jdZ5vHJ4A9" 7 | }, 8 | "source": [ 9 | "# Task description\n", 10 | "- Classify the speakers of given features.\n", 11 | "- Main goal: Learn how to use transformer.\n", 12 | "- Baselines:\n", 13 | " - Easy: Run sample code and know how to use transformer.\n", 14 | " - Medium: Know how to adjust parameters of transformer.\n", 15 | " - Strong: Construct [conformer](https://arxiv.org/abs/2005.08100) which is a variety of transformer. 
\n", 16 | " - Boss: Implement [Self-Attention Pooling](https://arxiv.org/pdf/2008.01077v1.pdf) & [Additive Margin Softmax](https://arxiv.org/pdf/1801.05599.pdf) to further boost the performance.\n", 17 | "\n", 18 | "- Other links\n", 19 | " - Kaggle: [link](https://www.kaggle.com/t/ac77388c90204a4c8daebeddd40ff916)\n", 20 | " - Slide: [link](https://docs.google.com/presentation/d/1HLAj7UUIjZOycDe7DaVLSwJfXVd3bXPOyzSb6Zk3hYU/edit?usp=sharing)\n", 21 | " - Data: [link](https://github.com/MachineLearningHW/ML_HW4_Dataset)\n", 22 | "\n", 23 | "# Download dataset\n", 24 | "- Data is [here](https://github.com/MachineLearningHW/ML_HW4_Dataset)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": { 31 | "id": "LhLNWB-AK2Z5" 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "# \"\"\"\n", 36 | "# If the links below become inaccessible, please connect TAs.\n", 37 | "# \"\"\"\n", 38 | "\n", 39 | "# !wget https://github.com/MachineLearningHW/ML_HW4_Dataset/raw/0.0.1/Dataset.tar.gz.partaa\n", 40 | "# !wget https://github.com/MachineLearningHW/ML_HW4_Dataset/raw/0.0.1/Dataset.tar.gz.partab\n", 41 | "# !wget https://github.com/MachineLearningHW/ML_HW4_Dataset/raw/0.0.1/Dataset.tar.gz.partac\n", 42 | "# !wget https://github.com/MachineLearningHW/ML_HW4_Dataset/raw/0.0.1/Dataset.tar.gz.partad\n", 43 | "\n", 44 | "# !cat Dataset.tar.gz.parta* > Dataset.tar.gz\n", 45 | "\n", 46 | "# !tar zxvf Dataset.tar.gz" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "id": "ENWVAUDVJtVY" 53 | }, 54 | "source": [ 55 | "## Fix Random Seed" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": { 62 | "id": "E6burzCXIyuA" 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stderr", 67 | "output_type": "stream", 68 | "text": [ 69 | "/home/tracy/miniconda3/envs/torch11/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 70 | " from .autonotebook import tqdm as notebook_tqdm\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "import numpy as np\n", 76 | "import torch\n", 77 | "import random\n", 78 | "import torchaudio\n", 79 | "\n", 80 | "def set_seed(seed):\n", 81 | " np.random.seed(seed)\n", 82 | " random.seed(seed)\n", 83 | " torch.manual_seed(seed)\n", 84 | " if torch.cuda.is_available():\n", 85 | " torch.cuda.manual_seed(seed)\n", 86 | " torch.cuda.manual_seed_all(seed)\n", 87 | " torch.backends.cudnn.benchmark = False\n", 88 | " torch.backends.cudnn.deterministic = True\n", 89 | "\n", 90 | "set_seed(87)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "id": "k7dVbxW2LASN" 97 | }, 98 | "source": [ 99 | "# Data\n", 100 | "\n", 101 | "## Dataset\n", 102 | "- Original dataset is [Voxceleb2](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html).\n", 103 | "- The [license](https://creativecommons.org/licenses/by/4.0/) and [complete version](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/files/license.txt) of Voxceleb2.\n", 104 | "- We randomly select 600 speakers from Voxceleb2.\n", 105 | "- Then preprocess the raw waveforms into mel-spectrograms.\n", 106 | "\n", 107 | "- Args:\n", 108 | " - data_dir: The path to the data directory.\n", 109 | " - metadata_path: The path to the metadata.\n", 110 | " - segment_len: The length of audio segment for training. 
\n", 111 | "- The architecture of data directory \\\\\n", 112 | " - data directory \\\\\n", 113 | " |---- metadata.json \\\\\n", 114 | " |---- testdata.json \\\\\n", 115 | " |---- mapping.json \\\\\n", 116 | " |---- uttr-{random string}.pt \\\\\n", 117 | "\n", 118 | "- The information in metadata\n", 119 | " - \"n_mels\": The dimention of mel-spectrogram.\n", 120 | " - \"speakers\": A dictionary. \n", 121 | " - Key: speaker ids.\n", 122 | " - value: \"feature_path\" and \"mel_len\"\n", 123 | "\n", 124 | "\n", 125 | "For efficiency, we segment the mel-spectrograms into segments in the traing step." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 3, 131 | "metadata": { 132 | "id": "KpuGxl4CI2pr" 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "import os\n", 137 | "import json\n", 138 | "import torch\n", 139 | "import random\n", 140 | "from pathlib import Path\n", 141 | "from torch.utils.data import Dataset\n", 142 | "from torch.nn.utils.rnn import pad_sequence\n", 143 | " \n", 144 | " \n", 145 | "class myDataset(Dataset):\n", 146 | "\tdef __init__(self, data_dir, segment_len=128):\n", 147 | "\t\tself.data_dir = data_dir\n", 148 | "\t\tself.segment_len = segment_len\n", 149 | "\t\n", 150 | "\t\t# Load the mapping from speaker neme to their corresponding id. \n", 151 | "\t\tmapping_path = Path(data_dir) / \"mapping.json\"\n", 152 | "\t\tmapping = json.load(mapping_path.open())\n", 153 | "\t\tself.speaker2id = mapping[\"speaker2id\"]\n", 154 | "\t\n", 155 | "\t\t# Load metadata of training data.\n", 156 | "\t\tmetadata_path = Path(data_dir) / \"metadata.json\"\n", 157 | "\t\tmetadata = json.load(open(metadata_path))[\"speakers\"]\n", 158 | "\t\n", 159 | "\t\t# Get the total number of speaker.\n", 160 | "\t\tself.speaker_num = len(metadata.keys())\n", 161 | "\t\tself.data = []\n", 162 | "\t\tfor speaker in metadata.keys():\n", 163 | "\t\t\tfor utterances in metadata[speaker]:\n", 164 | "\t\t\t\tself.data.append([utterances[\"feature_path\"], self.speaker2id[speaker]])\n", 165 | " \n", 166 | "\tdef __len__(self):\n", 167 | "\t\t\treturn len(self.data)\n", 168 | " \n", 169 | "\tdef __getitem__(self, index):\n", 170 | "\t\tfeat_path, speaker = self.data[index]\n", 171 | "\t\t# Load preprocessed mel-spectrogram.\n", 172 | "\t\tmel = torch.load(os.path.join(self.data_dir, feat_path))\n", 173 | "\n", 174 | "\t\t# Segmemt mel-spectrogram into \"segment_len\" frames.\n", 175 | "\t\tif len(mel) > self.segment_len:\n", 176 | "\t\t\t# Randomly get the starting point of the segment.\n", 177 | "\t\t\tstart = random.randint(0, len(mel) - self.segment_len)\n", 178 | "\t\t\t# Get a segment with \"segment_len\" frames.\n", 179 | "\t\t\tmel = torch.FloatTensor(mel[start:start+self.segment_len])\n", 180 | "\t\telse:\n", 181 | "\t\t\tmel = torch.FloatTensor(mel)\n", 182 | "\t\t# Turn the speaker id into long for computing loss later.\n", 183 | "\t\tspeaker = torch.FloatTensor([speaker]).long()\n", 184 | "\t\treturn mel, speaker\n", 185 | " \n", 186 | "\tdef get_speaker_number(self):\n", 187 | "\t\treturn self.speaker_num" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "id": "668hverTMlGN" 194 | }, 195 | "source": [ 196 | "## Dataloader\n", 197 | "- Split dataset into training dataset(90%) and validation dataset(10%).\n", 198 | "- Create dataloader to iterate the data." 
199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 4, 204 | "metadata": { 205 | "id": "B7c2gZYoJDRS" 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "import torch\n", 210 | "from torch.utils.data import DataLoader, random_split\n", 211 | "from torch.nn.utils.rnn import pad_sequence\n", 212 | "\n", 213 | "\n", 214 | "def collate_batch(batch):\n", 215 | "\t# Process features within a batch.\n", 216 | "\t\"\"\"Collate a batch of data.\"\"\"\n", 217 | "\tmel, speaker = zip(*batch)\n", 218 | "\t# Because we train the model batch by batch, we need to pad the features in the same batch to make their lengths the same.\n", 219 | "\tmel = pad_sequence(mel, batch_first=True, padding_value=-20) # pad log 10^(-20) which is very small value.\n", 220 | "\t# mel: (batch size, length, 40)\n", 221 | "\treturn mel, torch.FloatTensor(speaker).long()\n", 222 | "\n", 223 | "\n", 224 | "def get_dataloader(data_dir, batch_size, n_workers):\n", 225 | "\t\"\"\"Generate dataloader\"\"\"\n", 226 | "\tdataset = myDataset(data_dir)\n", 227 | "\tspeaker_num = dataset.get_speaker_number()\n", 228 | "\t# Split dataset into training dataset and validation dataset\n", 229 | "\ttrainlen = int(0.9 * len(dataset))\n", 230 | "\tlengths = [trainlen, len(dataset) - trainlen]\n", 231 | "\ttrainset, validset = random_split(dataset, lengths)\n", 232 | "\n", 233 | "\ttrain_loader = DataLoader(\n", 234 | "\t\ttrainset,\n", 235 | "\t\tbatch_size=batch_size,\n", 236 | "\t\tshuffle=True,\n", 237 | "\t\tdrop_last=True,\n", 238 | "\t\tnum_workers=n_workers,\n", 239 | "\t\tpin_memory=True,\n", 240 | "\t\tcollate_fn=collate_batch,\n", 241 | "\t)\n", 242 | "\tvalid_loader = DataLoader(\n", 243 | "\t\tvalidset,\n", 244 | "\t\tbatch_size=batch_size,\n", 245 | "\t\tnum_workers=n_workers,\n", 246 | "\t\tdrop_last=True,\n", 247 | "\t\tpin_memory=True,\n", 248 | "\t\tcollate_fn=collate_batch,\n", 249 | "\t)\n", 250 | "\n", 251 | "\treturn train_loader, valid_loader, speaker_num" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 5, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "from torch import nn\n", 261 | "#ref: https://gist.github.com/pohanchi/c77f6dbfbcbc21c5215acde4f62e4362\n", 262 | "class SelfAttentionPooling(nn.Module):\n", 263 | " \"\"\"\n", 264 | " Implementation of SelfAttentionPooling \n", 265 | " Original Paper: Self-Attention Encoding and Pooling for Speaker Recognition\n", 266 | " https://arxiv.org/pdf/2008.01077v1.pdf\n", 267 | " \"\"\"\n", 268 | " def __init__(self, input_dim):\n", 269 | " super(SelfAttentionPooling, self).__init__()\n", 270 | " self.W = nn.Linear(input_dim, 1)\n", 271 | " \n", 272 | " def forward(self, batch_rep):\n", 273 | " \"\"\"\n", 274 | " input:\n", 275 | " batch_rep : size (N, T, H), N: batch size, T: sequence length, H: Hidden dimension\n", 276 | " \n", 277 | " attention_weight:\n", 278 | " att_w : size (N, T, 1)\n", 279 | " \n", 280 | " return:\n", 281 | " utter_rep: size (N, H)\n", 282 | " \"\"\"\n", 283 | " softmax = nn.functional.softmax\n", 284 | " att_w = softmax(self.W(batch_rep).squeeze(-1),dim=1).unsqueeze(-1)\n", 285 | " utter_rep = torch.sum(batch_rep * att_w, dim=1)\n", 286 | "\n", 287 | " return utter_rep" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": { 293 | "id": "5FOSZYxrMqhc" 294 | }, 295 | "source": [ 296 | "# Model\n", 297 | "- TransformerEncoderLayer:\n", 298 | " - Base transformer encoder layer in [Attention Is All You Need](https://arxiv.org/abs/1706.03762)\n", 299 | " - 
Parameters:\n", 300 | " - d_model: the number of expected features of the input (required).\n", 301 | "\n", 302 | " - nhead: the number of heads of the multiheadattention models (required).\n", 303 | "\n", 304 | " - dim_feedforward: the dimension of the feedforward network model (default=2048).\n", 305 | "\n", 306 | " - dropout: the dropout value (default=0.1).\n", 307 | "\n", 308 | " - activation: the activation function of intermediate layer, relu or gelu (default=relu).\n", 309 | "\n", 310 | "- TransformerEncoder:\n", 311 | " - TransformerEncoder is a stack of N transformer encoder layers\n", 312 | " - Parameters:\n", 313 | " - encoder_layer: an instance of the TransformerEncoderLayer() class (required).\n", 314 | "\n", 315 | " - num_layers: the number of sub-encoder-layers in the encoder (required).\n", 316 | "\n", 317 | " - norm: the layer normalization component (optional)." 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 6, 323 | "metadata": { 324 | "id": "iXZ5B0EKJGs8" 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "import torch\n", 329 | "import torch.nn as nn\n", 330 | "import torch.nn.functional as F\n", 331 | "\n", 332 | "\n", 333 | "class Classifier(nn.Module):\n", 334 | "\tdef __init__(self, d_model=160, n_spks=600, dropout=0.1):\n", 335 | "\t\tsuper().__init__()\n", 336 | "\t\t# Project the dimension of features from that of input into d_model.\n", 337 | "\t\tself.prenet = nn.Linear(40, d_model)\n", 338 | "\t\t# TODO:\n", 339 | "\t\t# Change Transformer to Conformer.\n", 340 | "\t\t# https://arxiv.org/abs/2005.08100\n", 341 | "\t\t# self.encoder_layer = nn.TransformerEncoderLayer(\n", 342 | "\t\t# \td_model=d_model, dim_feedforward=256, nhead=1\n", 343 | "\t\t# )\n", 344 | "\t\t# self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)\n", 345 | "\t\tself.encoder = torchaudio.models.Conformer(input_dim=d_model, num_heads=1, ffn_dim=128, num_layers = 6,dropout=0.1,depthwise_conv_kernel_size=21)\n", 346 | "\t\t# Project the the dimension of features from d_model into speaker nums.\n", 347 | "\t\tself.pred_layer = nn.Sequential(\n", 348 | "\t\t\tnn.Linear(d_model, n_spks),\n", 349 | "\t\t)\n", 350 | "\t\tself.pooling = SelfAttentionPooling(d_model)\n", 351 | "\n", 352 | "\tdef forward(self, mels):\n", 353 | "\t\t\"\"\"\n", 354 | "\t\targs:\n", 355 | "\t\t\tmels: (batch size, length, 40)\n", 356 | "\t\treturn:\n", 357 | "\t\t\tout: (batch size, n_spks)\n", 358 | "\t\t\"\"\"\n", 359 | "\t\t# out: (batch size, length, d_model) 32 128 40\n", 360 | "\t\tout = self.prenet(mels)\n", 361 | "\t\t# out: (length, batch size, d_model)\n", 362 | "\t\t# out = out.permute(1, 0, 2)\n", 363 | "\t\t# The encoder layer expect features in the shape of (length, batch size, d_model).\n", 364 | "\t\t# out = self.encoder_layer(out)\n", 365 | "\t\tlengths = torch.full((out.shape[0],),out.shape[1]).to(\"cuda\")\n", 366 | "\t\tout = self.encoder(out, lengths)\n", 367 | "\t\t# out: (batch size, length, d_model)\n", 368 | "\t\t# out = out[0].transpose(0, 1)\n", 369 | "\t\t# mean pooling\n", 370 | "\t\t# stats = out[0].mean(dim=1)\n", 371 | "\t\t\n", 372 | "\t\tstats = self.pooling(out[0])\n", 373 | "\n", 374 | "\t\t# out: (batch, n_spks)\n", 375 | "\t\tout = self.pred_layer(stats)\n", 376 | "\t\treturn out" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "id": "W7yX8JinM5Ly" 383 | }, 384 | "source": [ 385 | "# Learning rate schedule\n", 386 | "- For transformer architecture, the design of learning rate schedule is different 
from that of CNN.\n", 387 | "- Previous works show that the warmup of learning rate is useful for training models with transformer architectures.\n", 388 | "- The warmup schedule\n", 389 | " - Set learning rate to 0 in the beginning.\n", 390 | " - The learning rate increases linearly from 0 to initial learning rate during warmup period." 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 7, 396 | "metadata": { 397 | "id": "ykt0N1nVJJi2" 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "import math\n", 402 | "\n", 403 | "import torch\n", 404 | "from torch.optim import Optimizer\n", 405 | "from torch.optim.lr_scheduler import LambdaLR\n", 406 | "\n", 407 | "\n", 408 | "def get_cosine_schedule_with_warmup(\n", 409 | "\toptimizer: Optimizer,\n", 410 | "\tnum_warmup_steps: int,\n", 411 | "\tnum_training_steps: int,\n", 412 | "\tnum_cycles: float = 0.5,\n", 413 | "\tlast_epoch: int = -1,\n", 414 | "):\n", 415 | "\t\"\"\"\n", 416 | "\tCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n", 417 | "\tinitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\n", 418 | "\tinitial lr set in the optimizer.\n", 419 | "\n", 420 | "\tArgs:\n", 421 | "\t\toptimizer (:class:`~torch.optim.Optimizer`):\n", 422 | "\t\tThe optimizer for which to schedule the learning rate.\n", 423 | "\t\tnum_warmup_steps (:obj:`int`):\n", 424 | "\t\tThe number of steps for the warmup phase.\n", 425 | "\t\tnum_training_steps (:obj:`int`):\n", 426 | "\t\tThe total number of training steps.\n", 427 | "\t\tnum_cycles (:obj:`float`, `optional`, defaults to 0.5):\n", 428 | "\t\tThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0\n", 429 | "\t\tfollowing a half-cosine).\n", 430 | "\t\tlast_epoch (:obj:`int`, `optional`, defaults to -1):\n", 431 | "\t\tThe index of the last epoch when resuming training.\n", 432 | "\n", 433 | "\tReturn:\n", 434 | "\t\t:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.\n", 435 | "\t\"\"\"\n", 436 | "\tdef lr_lambda(current_step):\n", 437 | "\t\t# Warmup\n", 438 | "\t\tif current_step < num_warmup_steps:\n", 439 | "\t\t\treturn float(current_step) / float(max(1, num_warmup_steps))\n", 440 | "\t\t# decadence\n", 441 | "\t\tprogress = float(current_step - num_warmup_steps) / float(\n", 442 | "\t\t\tmax(1, num_training_steps - num_warmup_steps)\n", 443 | "\t\t)\n", 444 | "\t\treturn max(\n", 445 | "\t\t\t0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))\n", 446 | "\t\t)\n", 447 | "\n", 448 | "\treturn LambdaLR(optimizer, lr_lambda, last_epoch)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": { 454 | "id": "-LN2XkteM_uH" 455 | }, 456 | "source": [ 457 | "# Model Function\n", 458 | "- Model forward function." 
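[Added sketch, not in the original notebook] The shape of the warmup schedule defined in the cell above is easy to sanity-check with a throwaway optimizer before moving on: the learning rate should climb linearly to the base value during warmup, then follow a half-cosine down to 0. Only get_cosine_schedule_with_warmup from that cell is assumed:

import torch
from torch.optim import AdamW

opt = AdamW(torch.nn.Linear(1, 1).parameters(), lr=1e-3)  # dummy model, for inspection only
sched = get_cosine_schedule_with_warmup(opt, num_warmup_steps=2, num_training_steps=10)
for step in range(10):
    opt.step()    # optimizer first, then scheduler, to avoid the PyTorch ordering warning
    sched.step()
    print(step, sched.get_last_lr())  # rises 0 -> 1e-3 over two steps, then cosine-decays to 0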
459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 8, 464 | "metadata": { 465 | "id": "N-rr8529JMz0" 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "import torch\n", 470 | "\n", 471 | "\n", 472 | "def model_fn(batch, model, criterion, device):\n", 473 | "\t\"\"\"Forward a batch through the model.\"\"\"\n", 474 | "\n", 475 | "\tmels, labels = batch\n", 476 | "\tmels = mels.to(device)\n", 477 | "\tlabels = labels.to(device)\n", 478 | "\n", 479 | "\touts = model(mels)\n", 480 | "\n", 481 | "\tloss = criterion(outs, labels)\n", 482 | "\n", 483 | "\t# Get the speaker id with highest probability.\n", 484 | "\tpreds = outs.argmax(1)\n", 485 | "\t# Compute accuracy.\n", 486 | "\taccuracy = torch.mean((preds == labels).float())\n", 487 | "\n", 488 | "\treturn loss, accuracy" 489 | ] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": { 494 | "id": "cwM_xyOtNCI2" 495 | }, 496 | "source": [ 497 | "# Validate\n", 498 | "- Calculate accuracy of the validation set." 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 9, 504 | "metadata": { 505 | "id": "YAiv6kpdJRTJ" 506 | }, 507 | "outputs": [], 508 | "source": [ 509 | "from tqdm import tqdm\n", 510 | "import torch\n", 511 | "\n", 512 | "\n", 513 | "def valid(dataloader, model, criterion, device): \n", 514 | "\t\"\"\"Validate on validation set.\"\"\"\n", 515 | "\n", 516 | "\tmodel.eval()\n", 517 | "\trunning_loss = 0.0\n", 518 | "\trunning_accuracy = 0.0\n", 519 | "\t# pbar = tqdm(total=len(dataloader.dataset), ncols=0, desc=\"Valid\", unit=\" uttr\")\n", 520 | "\n", 521 | "\tfor i, batch in enumerate(dataloader):\n", 522 | "\t\twith torch.no_grad():\n", 523 | "\t\t\tloss, accuracy = model_fn(batch, model, criterion, device)\n", 524 | "\t\t\trunning_loss += loss.item()\n", 525 | "\t\t\trunning_accuracy += accuracy.item()\n", 526 | "\n", 527 | "\t# \tpbar.update(dataloader.batch_size)\n", 528 | "\t# \tpbar.set_postfix(\n", 529 | "\t# \t\tloss=f\"{running_loss / (i+1):.2f}\",\n", 530 | "\t# \t\taccuracy=f\"{running_accuracy / (i+1):.2f}\",\n", 531 | "\t# \t)\n", 532 | "\n", 533 | "\t# pbar.close()\n", 534 | "\tmodel.train()\n", 535 | "\n", 536 | "\treturn running_accuracy / len(dataloader)" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": { 542 | "id": "g6ne9G-eNEdG" 543 | }, 544 | "source": [ 545 | "# Main function" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 10, 551 | "metadata": { 552 | "id": "Usv9s-CuJSG7" 553 | }, 554 | "outputs": [ 555 | { 556 | "name": "stdout", 557 | "output_type": "stream", 558 | "text": [ 559 | "[Info]: Use cuda now!\n", 560 | "[Info]: Finish loading data!\n", 561 | "[Info]: Finish creating model!\n" 562 | ] 563 | }, 564 | { 565 | "name": "stderr", 566 | "output_type": "stream", 567 | "text": [ 568 | "/tmp/ipykernel_448168/573601384.py:25: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n", 569 | " att_w = softmax(self.W(batch_rep).squeeze(-1)).unsqueeze(-1)\n" 570 | ] 571 | }, 572 | { 573 | "name": "stdout", 574 | "output_type": "stream", 575 | "text": [ 576 | "Step 10000, best model saved. (accuracy=0.9440)\n", 577 | "Step 20000, best model saved. (accuracy=0.9440)\n", 578 | "Step 30000, best model saved. (accuracy=0.9456)\n", 579 | "Step 40000, best model saved. (accuracy=0.9488)\n", 580 | "Step 50000, best model saved. (accuracy=0.9488)\n", 581 | "Step 60000, best model saved. 
(accuracy=0.9488)\n", 582 | "Step 70000, best model saved. (accuracy=0.9500)\n", 583 | "Step 80000, best model saved. (accuracy=0.9500)\n", 584 | "Step 90000, best model saved. (accuracy=0.9500)\n", 585 | "Step 100000, best model saved. (accuracy=0.9500)\n", 586 | "Step 110000, best model saved. (accuracy=0.9500)\n" 587 | ] 588 | }, 589 | { 590 | "ename": "KeyboardInterrupt", 591 | "evalue": "", 592 | "output_type": "error", 593 | "traceback": [ 594 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 595 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 596 | "\u001b[1;32m/home/tracy/Projects/ML2022/hw4/hw04(try pooling2).ipynb Cell 19'\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 98\u001b[0m \t\t\t\u001b[39m# pbar.write(f\"Step {step + 1}, best model saved. (accuracy={best_accuracy:.4f})\")\u001b[39;00m\n\u001b[1;32m 99\u001b[0m \n\u001b[1;32m 100\u001b[0m \t\u001b[39m# pbar.close()\u001b[39;00m\n\u001b[1;32m 103\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m__name__\u001b[39m \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m__main__\u001b[39m\u001b[39m\"\u001b[39m:\n\u001b[0;32m--> 104\u001b[0m \tmain(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mparse_args())\n", 597 | "\u001b[1;32m/home/tracy/Projects/ML2022/hw4/hw04(try pooling2).ipynb Cell 19'\u001b[0m in \u001b[0;36mmain\u001b[0;34m(data_dir, save_path, batch_size, n_workers, valid_steps, warmup_steps, total_steps, save_steps)\u001b[0m\n\u001b[1;32m 67\u001b[0m \u001b[39m# Updata model\u001b[39;00m\n\u001b[1;32m 68\u001b[0m loss\u001b[39m.\u001b[39mbackward()\n\u001b[0;32m---> 69\u001b[0m optimizer\u001b[39m.\u001b[39;49mstep()\n\u001b[1;32m 70\u001b[0m scheduler\u001b[39m.\u001b[39mstep()\n\u001b[1;32m 71\u001b[0m optimizer\u001b[39m.\u001b[39mzero_grad()\n", 598 | "File \u001b[0;32m~/miniconda3/envs/torch11/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:65\u001b[0m, in \u001b[0;36m_LRScheduler.__init__..with_counter..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 63\u001b[0m instance\u001b[39m.\u001b[39m_step_count \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[1;32m 64\u001b[0m wrapped \u001b[39m=\u001b[39m func\u001b[39m.\u001b[39m\u001b[39m__get__\u001b[39m(instance, \u001b[39mcls\u001b[39m)\n\u001b[0;32m---> 65\u001b[0m \u001b[39mreturn\u001b[39;00m wrapped(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", 599 | "File \u001b[0;32m~/miniconda3/envs/torch11/lib/python3.10/site-packages/torch/optim/optimizer.py:88\u001b[0m, in \u001b[0;36mOptimizer._hook_for_profile..profile_hook_step..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 86\u001b[0m profile_name \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mOptimizer.step#\u001b[39m\u001b[39m{}\u001b[39;00m\u001b[39m.step\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m.\u001b[39mformat(obj\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m)\n\u001b[1;32m 87\u001b[0m \u001b[39mwith\u001b[39;00m torch\u001b[39m.\u001b[39mautograd\u001b[39m.\u001b[39mprofiler\u001b[39m.\u001b[39mrecord_function(profile_name):\n\u001b[0;32m---> 88\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", 600 | "File \u001b[0;32m~/miniconda3/envs/torch11/lib/python3.10/site-packages/torch/autograd/grad_mode.py:27\u001b[0m, in 
\u001b[0;36m_DecoratorContextManager.__call__..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[39m@functools\u001b[39m\u001b[39m.\u001b[39mwraps(func)\n\u001b[1;32m 25\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdecorate_context\u001b[39m(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 26\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclone():\n\u001b[0;32m---> 27\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", 601 | "File \u001b[0;32m~/miniconda3/envs/torch11/lib/python3.10/site-packages/torch/optim/adamw.py:145\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[39m# record the step after step update\u001b[39;00m\n\u001b[1;32m 143\u001b[0m state_steps\u001b[39m.\u001b[39mappend(state[\u001b[39m'\u001b[39m\u001b[39mstep\u001b[39m\u001b[39m'\u001b[39m])\n\u001b[0;32m--> 145\u001b[0m F\u001b[39m.\u001b[39;49madamw(params_with_grad,\n\u001b[1;32m 146\u001b[0m grads,\n\u001b[1;32m 147\u001b[0m exp_avgs,\n\u001b[1;32m 148\u001b[0m exp_avg_sqs,\n\u001b[1;32m 149\u001b[0m max_exp_avg_sqs,\n\u001b[1;32m 150\u001b[0m state_steps,\n\u001b[1;32m 151\u001b[0m amsgrad\u001b[39m=\u001b[39;49mamsgrad,\n\u001b[1;32m 152\u001b[0m beta1\u001b[39m=\u001b[39;49mbeta1,\n\u001b[1;32m 153\u001b[0m beta2\u001b[39m=\u001b[39;49mbeta2,\n\u001b[1;32m 154\u001b[0m lr\u001b[39m=\u001b[39;49mgroup[\u001b[39m'\u001b[39;49m\u001b[39mlr\u001b[39;49m\u001b[39m'\u001b[39;49m],\n\u001b[1;32m 155\u001b[0m weight_decay\u001b[39m=\u001b[39;49mgroup[\u001b[39m'\u001b[39;49m\u001b[39mweight_decay\u001b[39;49m\u001b[39m'\u001b[39;49m],\n\u001b[1;32m 156\u001b[0m eps\u001b[39m=\u001b[39;49mgroup[\u001b[39m'\u001b[39;49m\u001b[39meps\u001b[39;49m\u001b[39m'\u001b[39;49m],\n\u001b[1;32m 157\u001b[0m maximize\u001b[39m=\u001b[39;49mgroup[\u001b[39m'\u001b[39;49m\u001b[39mmaximize\u001b[39;49m\u001b[39m'\u001b[39;49m])\n\u001b[1;32m 159\u001b[0m \u001b[39mreturn\u001b[39;00m loss\n", 602 | "File \u001b[0;32m~/miniconda3/envs/torch11/lib/python3.10/site-packages/torch/optim/_functional.py:151\u001b[0m, in \u001b[0;36madamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)\u001b[0m\n\u001b[1;32m 149\u001b[0m denom \u001b[39m=\u001b[39m (max_exp_avg_sqs[i]\u001b[39m.\u001b[39msqrt() \u001b[39m/\u001b[39m math\u001b[39m.\u001b[39msqrt(bias_correction2))\u001b[39m.\u001b[39madd_(eps)\n\u001b[1;32m 150\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 151\u001b[0m denom \u001b[39m=\u001b[39m (exp_avg_sq\u001b[39m.\u001b[39;49msqrt() \u001b[39m/\u001b[39;49m math\u001b[39m.\u001b[39;49msqrt(bias_correction2))\u001b[39m.\u001b[39;49madd_(eps)\n\u001b[1;32m 153\u001b[0m step_size \u001b[39m=\u001b[39m lr \u001b[39m/\u001b[39m bias_correction1\n\u001b[1;32m 155\u001b[0m param\u001b[39m.\u001b[39maddcdiv_(exp_avg, denom, value\u001b[39m=\u001b[39m\u001b[39m-\u001b[39mstep_size)\n", 603 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 604 | ] 605 | } 606 | ], 607 | "source": [ 608 | "from tqdm import tqdm\n", 609 | "\n", 610 | "import torch\n", 611 | "import torch.nn as nn\n", 612 | "from torch.optim import AdamW\n", 613 | "from torch.utils.data import DataLoader, random_split\n", 614 | "\n", 615 | "\n", 616 | "def parse_args():\n", 617 | "\t\"\"\"arguments\"\"\"\n", 618 | "\tconfig = {\n", 619 | 
"\t\t\"data_dir\": \"./Dataset\",\n", 620 | "\t\t\"save_path\": \"model4.ckpt\",\n", 621 | "\t\t\"batch_size\": 32,\n", 622 | "\t\t\"n_workers\": 8,\n", 623 | "\t\t\"valid_steps\": 2000,\n", 624 | "\t\t\"warmup_steps\": 1000,\n", 625 | "\t\t\"save_steps\": 10000,\n", 626 | "\t\t\"total_steps\": 500000,\n", 627 | "\t}\n", 628 | "\n", 629 | "\treturn config\n", 630 | "\n", 631 | "\n", 632 | "def main(\n", 633 | "\tdata_dir,\n", 634 | "\tsave_path,\n", 635 | "\tbatch_size,\n", 636 | "\tn_workers,\n", 637 | "\tvalid_steps,\n", 638 | "\twarmup_steps,\n", 639 | "\ttotal_steps,\n", 640 | "\tsave_steps,\n", 641 | "):\n", 642 | "\t\"\"\"Main function.\"\"\"\n", 643 | "\tdevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 644 | "\tprint(f\"[Info]: Use {device} now!\")\n", 645 | "\n", 646 | "\ttrain_loader, valid_loader, speaker_num = get_dataloader(data_dir, batch_size, n_workers)\n", 647 | "\ttrain_iterator = iter(train_loader)\n", 648 | "\tprint(f\"[Info]: Finish loading data!\",flush = True)\n", 649 | "\n", 650 | "\tmodel = Classifier(n_spks=speaker_num).to(device)\n", 651 | "\tmodel.load_state_dict(torch.load(\"model4.ckpt\"))\n", 652 | "\tcriterion = nn.CrossEntropyLoss()\n", 653 | "\toptimizer = AdamW(model.parameters(), lr=1e-5)\n", 654 | "\tscheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)\n", 655 | "\tprint(f\"[Info]: Finish creating model!\",flush = True)\n", 656 | "\n", 657 | "\tbest_accuracy = -1.0\n", 658 | "\tbest_state_dict = None\n", 659 | "\n", 660 | "\t# pbar = tqdm(total=valid_steps, ncols=0, desc=\"Train\", unit=\" step\")\n", 661 | "\n", 662 | "\tfor step in range(total_steps):\n", 663 | "\t\t# Get data\n", 664 | "\t\ttry:\n", 665 | "\t\t\tbatch = next(train_iterator)\n", 666 | "\t\texcept StopIteration:\n", 667 | "\t\t\ttrain_iterator = iter(train_loader)\n", 668 | "\t\t\tbatch = next(train_iterator)\n", 669 | "\n", 670 | "\t\tloss, accuracy = model_fn(batch, model, criterion, device)\n", 671 | "\t\tbatch_loss = loss.item()\n", 672 | "\t\tbatch_accuracy = accuracy.item()\n", 673 | "\n", 674 | "\t\t# Updata model\n", 675 | "\t\tloss.backward()\n", 676 | "\t\toptimizer.step()\n", 677 | "\t\tscheduler.step()\n", 678 | "\t\toptimizer.zero_grad()\n", 679 | "\n", 680 | "\t\t# Log\n", 681 | "\t\t# pbar.update()\n", 682 | "\t\t# pbar.set_postfix(\n", 683 | "\t\t# \tloss=f\"{batch_loss:.2f}\",\n", 684 | "\t\t# \taccuracy=f\"{batch_accuracy:.2f}\",\n", 685 | "\t\t# \tstep=step + 1,\n", 686 | "\t\t# )\n", 687 | "\t\tstep = step + 1\n", 688 | "\t\t# Do validation\n", 689 | "\t\tif (step + 1) % valid_steps == 0:\n", 690 | "\t\t\t# pbar.close()\n", 691 | "\n", 692 | "\t\t\tvalid_accuracy = valid(valid_loader, model, criterion, device)\n", 693 | "\n", 694 | "\t\t\t# keep the best model\n", 695 | "\t\t\tif valid_accuracy > best_accuracy:\n", 696 | "\t\t\t\tbest_accuracy = valid_accuracy\n", 697 | "\t\t\t\tbest_state_dict = model.state_dict()\n", 698 | "\n", 699 | "\t\t\t# pbar = tqdm(total=valid_steps, ncols=0, desc=\"Train\", unit=\" step\")\n", 700 | "\n", 701 | "\t\t# Save the best model so far.\n", 702 | "\t\tif (step + 1) % save_steps == 0 and best_state_dict is not None:\n", 703 | "\t\t\ttorch.save(best_state_dict, save_path)\n", 704 | "\t\t\tprint(f\"Step {step + 1}, best model saved. (accuracy={best_accuracy:.4f})\")\n", 705 | "\t\t\t# pbar.write(f\"Step {step + 1}, best model saved. 
(accuracy={best_accuracy:.4f})\")\n", 706 | "\n", 707 | "\t# pbar.close()\n", 708 | "\n", 709 | "\n", 710 | "if __name__ == \"__main__\":\n", 711 | "\tmain(**parse_args())" 712 | ] 713 | }, 714 | { 715 | "cell_type": "markdown", 716 | "metadata": { 717 | "id": "NLatBYAhNNMx" 718 | }, 719 | "source": [ 720 | "# Inference\n", 721 | "\n", 722 | "## Dataset of inference" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": 11, 728 | "metadata": { 729 | "colab": { 730 | "background_save": true 731 | }, 732 | "id": "efS4pCmAJXJH" 733 | }, 734 | "outputs": [], 735 | "source": [ 736 | "import os\n", 737 | "import json\n", 738 | "import torch\n", 739 | "from pathlib import Path\n", 740 | "from torch.utils.data import Dataset\n", 741 | "\n", 742 | "\n", 743 | "class InferenceDataset(Dataset):\n", 744 | "\tdef __init__(self, data_dir):\n", 745 | "\t\ttestdata_path = Path(data_dir) / \"testdata.json\"\n", 746 | "\t\tmetadata = json.load(testdata_path.open())\n", 747 | "\t\tself.data_dir = data_dir\n", 748 | "\t\tself.data = metadata[\"utterances\"]\n", 749 | "\n", 750 | "\tdef __len__(self):\n", 751 | "\t\treturn len(self.data)\n", 752 | "\n", 753 | "\tdef __getitem__(self, index):\n", 754 | "\t\tutterance = self.data[index]\n", 755 | "\t\tfeat_path = utterance[\"feature_path\"]\n", 756 | "\t\tmel = torch.load(os.path.join(self.data_dir, feat_path))\n", 757 | "\n", 758 | "\t\treturn feat_path, mel\n", 759 | "\n", 760 | "\n", 761 | "def inference_collate_batch(batch):\n", 762 | "\t\"\"\"Collate a batch of data.\"\"\"\n", 763 | "\tfeat_paths, mels = zip(*batch)\n", 764 | "\n", 765 | "\treturn feat_paths, torch.stack(mels)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "markdown", 770 | "metadata": { 771 | "id": "tl0WnYwxNK_S" 772 | }, 773 | "source": [ 774 | "## Main function of Inference" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 12, 780 | "metadata": { 781 | "colab": { 782 | "background_save": true 783 | }, 784 | "id": "i8SAbuXEJb2A" 785 | }, 786 | "outputs": [ 787 | { 788 | "name": "stdout", 789 | "output_type": "stream", 790 | "text": [ 791 | "[Info]: Use cuda now!\n", 792 | "[Info]: Finish loading data!\n", 793 | "[Info]: Finish creating model!\n" 794 | ] 795 | }, 796 | { 797 | "name": "stderr", 798 | "output_type": "stream", 799 | "text": [ 800 | "/tmp/ipykernel_448168/573601384.py:25: UserWarning: Implicit dimension choice for softmax has been deprecated. 
Change the call to include dim=X as an argument.\n", 801 | " att_w = softmax(self.W(batch_rep).squeeze(-1)).unsqueeze(-1)\n" 802 | ] 803 | } 804 | ], 805 | "source": [ 806 | "import json\n", 807 | "import csv\n", 808 | "from pathlib import Path\n", 809 | "from tqdm.notebook import tqdm\n", 810 | "\n", 811 | "import torch\n", 812 | "from torch.utils.data import DataLoader\n", 813 | "\n", 814 | "def parse_args():\n", 815 | "\t\"\"\"arguments\"\"\"\n", 816 | "\tconfig = {\n", 817 | "\t\t\"data_dir\": \"./Dataset\",\n", 818 | "\t\t\"model_path\": \"./model4.ckpt\",\n", 819 | "\t\t\"output_path\": \"./output_nextsubmit.csv\",\n", 820 | "\t}\n", 821 | "\n", 822 | "\treturn config\n", 823 | "\n", 824 | "\n", 825 | "def main(\n", 826 | "\tdata_dir,\n", 827 | "\tmodel_path,\n", 828 | "\toutput_path,\n", 829 | "):\n", 830 | "\t\"\"\"Main function.\"\"\"\n", 831 | "\tdevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 832 | "\tprint(f\"[Info]: Use {device} now!\")\n", 833 | "\n", 834 | "\tmapping_path = Path(data_dir) / \"mapping.json\"\n", 835 | "\tmapping = json.load(mapping_path.open())\n", 836 | "\n", 837 | "\tdataset = InferenceDataset(data_dir)\n", 838 | "\tdataloader = DataLoader(\n", 839 | "\t\tdataset,\n", 840 | "\t\tbatch_size=1,\n", 841 | "\t\tshuffle=False,\n", 842 | "\t\tdrop_last=False,\n", 843 | "\t\tnum_workers=8,\n", 844 | "\t\tcollate_fn=inference_collate_batch,\n", 845 | "\t)\n", 846 | "\tprint(f\"[Info]: Finish loading data!\",flush = True)\n", 847 | "\n", 848 | "\tspeaker_num = len(mapping[\"id2speaker\"])\n", 849 | "\tmodel = Classifier(n_spks=speaker_num).to(device)\n", 850 | "\tmodel.load_state_dict(torch.load(model_path))\n", 851 | "\tmodel.eval()\n", 852 | "\tprint(f\"[Info]: Finish creating model!\",flush = True)\n", 853 | "\n", 854 | "\tresults = [[\"Id\", \"Category\"]]\n", 855 | "\tfor feat_paths, mels in dataloader:\n", 856 | "\t\twith torch.no_grad():\n", 857 | "\t\t\tmels = mels.to(device)\n", 858 | "\t\t\touts = model(mels)\n", 859 | "\t\t\tpreds = outs.argmax(1).cpu().numpy()\n", 860 | "\t\t\tfor feat_path, pred in zip(feat_paths, preds):\n", 861 | "\t\t\t\tresults.append([feat_path, mapping[\"id2speaker\"][str(pred)]])\n", 862 | "\n", 863 | "\twith open(output_path, 'w', newline='') as csvfile:\n", 864 | "\t\twriter = csv.writer(csvfile)\n", 865 | "\t\twriter.writerows(results)\n", 866 | "\n", 867 | "\n", 868 | "if __name__ == \"__main__\":\n", 869 | "\tmain(**parse_args())" 870 | ] 871 | } 872 | ], 873 | "metadata": { 874 | "accelerator": "GPU", 875 | "colab": { 876 | "collapsed_sections": [], 877 | "name": "hw04.ipynb", 878 | "provenance": [] 879 | }, 880 | "kernelspec": { 881 | "display_name": "Python 3", 882 | "name": "python3" 883 | }, 884 | "language_info": { 885 | "codemirror_mode": { 886 | "name": "ipython", 887 | "version": 3 888 | }, 889 | "file_extension": ".py", 890 | "mimetype": "text/x-python", 891 | "name": "python", 892 | "nbconvert_exporter": "python", 893 | "pygments_lexer": "ipython3", 894 | "version": "3.10.2" 895 | } 896 | }, 897 | "nbformat": 4, 898 | "nbformat_minor": 0 899 | } 900 | -------------------------------------------------------------------------------- /Hw4/hw04(0.87050try ensemble).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "C_jdZ5vHJ4A9" 7 | }, 8 | "source": [ 9 | "# Task description\n", 10 | "- Classify the speakers of given features.\n", 11 | "- Main goal: Learn how to use 
transformer.\n", 12 | "- Baselines:\n", 13 | " - Easy: Run sample code and know how to use transformer.\n", 14 | " - Medium: Know how to adjust parameters of transformer.\n", 15 | " - Strong: Construct [conformer](https://arxiv.org/abs/2005.08100) which is a variety of transformer. \n", 16 | " - Boss: Implement [Self-Attention Pooling](https://arxiv.org/pdf/2008.01077v1.pdf) & [Additive Margin Softmax](https://arxiv.org/pdf/1801.05599.pdf) to further boost the performance.\n", 17 | "\n", 18 | "- Other links\n", 19 | " - Kaggle: [link](https://www.kaggle.com/t/ac77388c90204a4c8daebeddd40ff916)\n", 20 | " - Slide: [link](https://docs.google.com/presentation/d/1HLAj7UUIjZOycDe7DaVLSwJfXVd3bXPOyzSb6Zk3hYU/edit?usp=sharing)\n", 21 | " - Data: [link](https://github.com/MachineLearningHW/ML_HW4_Dataset)\n", 22 | "\n", 23 | "# Download dataset\n", 24 | "- Data is [here](https://github.com/MachineLearningHW/ML_HW4_Dataset)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": { 31 | "id": "LhLNWB-AK2Z5" 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "# \"\"\"\n", 36 | "# If the links below become inaccessible, please connect TAs.\n", 37 | "# \"\"\"\n", 38 | "\n", 39 | "# !wget https://github.com/MachineLearningHW/ML_HW4_Dataset/raw/0.0.1/Dataset.tar.gz.partaa\n", 40 | "# !wget https://github.com/MachineLearningHW/ML_HW4_Dataset/raw/0.0.1/Dataset.tar.gz.partab\n", 41 | "# !wget https://github.com/MachineLearningHW/ML_HW4_Dataset/raw/0.0.1/Dataset.tar.gz.partac\n", 42 | "# !wget https://github.com/MachineLearningHW/ML_HW4_Dataset/raw/0.0.1/Dataset.tar.gz.partad\n", 43 | "\n", 44 | "# !cat Dataset.tar.gz.parta* > Dataset.tar.gz\n", 45 | "\n", 46 | "# !tar zxvf Dataset.tar.gz" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "id": "ENWVAUDVJtVY" 53 | }, 54 | "source": [ 55 | "## Fix Random Seed" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 1, 61 | "metadata": { 62 | "id": "E6burzCXIyuA" 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stderr", 67 | "output_type": "stream", 68 | "text": [ 69 | "/home/tracy/miniconda3/envs/torch11/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 70 | "  from .autonotebook import tqdm as notebook_tqdm\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "import numpy as np\n", 76 | "import torch\n", 77 | "import random\n", 78 | "import torchaudio\n", 79 | "from torchensemble import VotingClassifier\n", 80 | "\n", 81 | "def set_seed(seed):\n", 82 | "    np.random.seed(seed)\n", 83 | "    random.seed(seed)\n", 84 | "    torch.manual_seed(seed)\n", 85 | "    if torch.cuda.is_available():\n", 86 | "        torch.cuda.manual_seed(seed)\n", 87 | "        torch.cuda.manual_seed_all(seed)\n", 88 | "    torch.backends.cudnn.benchmark = False\n", 89 | "    torch.backends.cudnn.deterministic = True\n", 90 | "\n", 91 | "set_seed(87)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "id": "k7dVbxW2LASN" 98 | }, 99 | "source": [ 100 | "# Data\n", 101 | "\n", 102 | "## Dataset\n", 103 | "- Original dataset is [Voxceleb2](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html).\n", 104 | "- The [license](https://creativecommons.org/licenses/by/4.0/) and [complete version](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/files/license.txt) of Voxceleb2.\n", 105 | "- We randomly select 600 speakers from Voxceleb2.\n", 106 | "- Then preprocess the raw waveforms into mel-spectrograms.\n", 107 | "\n", 108 | "- Args:\n", 109 | "  - data_dir: The path to the data directory.\n", 110 | "  - metadata_path: The path to the metadata.\n", 111 | "  - segment_len: The length of audio segment for training. \n", 112 | "- The architecture of data directory \\\\\n", 113 | "  - data directory \\\\\n", 114 | "  |---- metadata.json \\\\\n", 115 | "  |---- testdata.json \\\\\n", 116 | "  |---- mapping.json \\\\\n", 117 | "  |---- uttr-{random string}.pt \\\\\n", 118 | "\n", 119 | "- The information in metadata\n", 120 | "  - \"n_mels\": The dimension of mel-spectrogram.\n", 121 | "  - \"speakers\": A dictionary. \n", 122 | "    - Key: speaker ids.\n", 123 | "    - value: \"feature_path\" and \"mel_len\"\n", 124 | "\n", 125 | "\n", 126 | "For efficiency, we segment the mel-spectrograms into segments in the training step." 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 2, 132 | "metadata": { 133 | "id": "KpuGxl4CI2pr" 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "import os\n", 138 | "import json\n", 139 | "import torch\n", 140 | "import random\n", 141 | "from pathlib import Path\n", 142 | "from torch.utils.data import Dataset\n", 143 | "from torch.nn.utils.rnn import pad_sequence\n", 144 | " \n", 145 | " \n", 146 | "class myDataset(Dataset):\n", 147 | "\tdef __init__(self, data_dir, segment_len=128):\n", 148 | "\t\tself.data_dir = data_dir\n", 149 | "\t\tself.segment_len = segment_len\n", 150 | "\t\n", 151 | "\t\t# Load the mapping from speaker name to their corresponding id. 
\n", 152 | "\t\tmapping_path = Path(data_dir) / \"mapping.json\"\n", 153 | "\t\tmapping = json.load(mapping_path.open())\n", 154 | "\t\tself.speaker2id = mapping[\"speaker2id\"]\n", 155 | "\t\n", 156 | "\t\t# Load metadata of training data.\n", 157 | "\t\tmetadata_path = Path(data_dir) / \"metadata.json\"\n", 158 | "\t\tmetadata = json.load(open(metadata_path))[\"speakers\"]\n", 159 | "\t\n", 160 | "\t\t# Get the total number of speaker.\n", 161 | "\t\tself.speaker_num = len(metadata.keys())\n", 162 | "\t\tself.data = []\n", 163 | "\t\tfor speaker in metadata.keys():\n", 164 | "\t\t\tfor utterances in metadata[speaker]:\n", 165 | "\t\t\t\tself.data.append([utterances[\"feature_path\"], self.speaker2id[speaker]])\n", 166 | " \n", 167 | "\tdef __len__(self):\n", 168 | "\t\t\treturn len(self.data)\n", 169 | " \n", 170 | "\tdef __getitem__(self, index):\n", 171 | "\t\tfeat_path, speaker = self.data[index]\n", 172 | "\t\t# Load preprocessed mel-spectrogram.\n", 173 | "\t\tmel = torch.load(os.path.join(self.data_dir, feat_path))\n", 174 | "\n", 175 | "\t\t# Segmemt mel-spectrogram into \"segment_len\" frames.\n", 176 | "\t\tif len(mel) > self.segment_len:\n", 177 | "\t\t\t# Randomly get the starting point of the segment.\n", 178 | "\t\t\tstart = random.randint(0, len(mel) - self.segment_len)\n", 179 | "\t\t\t# Get a segment with \"segment_len\" frames.\n", 180 | "\t\t\tmel = torch.FloatTensor(mel[start:start+self.segment_len])\n", 181 | "\t\telse:\n", 182 | "\t\t\tmel = torch.FloatTensor(mel)\n", 183 | "\t\t# Turn the speaker id into long for computing loss later.\n", 184 | "\t\tspeaker = torch.FloatTensor([speaker]).long()\n", 185 | "\t\treturn mel, speaker\n", 186 | " \n", 187 | "\tdef get_speaker_number(self):\n", 188 | "\t\treturn self.speaker_num" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": { 194 | "id": "668hverTMlGN" 195 | }, 196 | "source": [ 197 | "## Dataloader\n", 198 | "- Split dataset into training dataset(90%) and validation dataset(10%).\n", 199 | "- Create dataloader to iterate the data." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 3, 205 | "metadata": { 206 | "id": "B7c2gZYoJDRS" 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "import torch\n", 211 | "from torch.utils.data import DataLoader, random_split\n", 212 | "from torch.nn.utils.rnn import pad_sequence\n", 213 | "\n", 214 | "\n", 215 | "def collate_batch(batch):\n", 216 | "\t# Process features within a batch.\n", 217 | "\t\"\"\"Collate a batch of data.\"\"\"\n", 218 | "\tmel, speaker = zip(*batch)\n", 219 | "\t# Because we train the model batch by batch, we need to pad the features in the same batch to make their lengths the same.\n", 220 | "\tmel = pad_sequence(mel, batch_first=True, padding_value=-20) # pad log 10^(-20) which is very small value.\n", 221 | "\t# mel: (batch size, length, 40)\n", 222 | "\treturn mel, torch.FloatTensor(speaker).long()\n", 223 | "\n", 224 | "\n", 225 | "def get_dataloader(data_dir, batch_size, n_workers):\n", 226 | "\t\"\"\"Generate dataloader\"\"\"\n", 227 | "\tdataset = myDataset(data_dir)\n", 228 | "\tspeaker_num = dataset.get_speaker_number()\n", 229 | "\t# Split dataset into training dataset and validation dataset\n", 230 | "\ttrainlen = int(0.9 * len(dataset))\n", 231 | "\tlengths = [trainlen, len(dataset) - trainlen]\n", 232 | "\ttrainset, validset = random_split(dataset, lengths)\n", 233 | "\n", 234 | "\ttrain_loader = DataLoader(\n", 235 | "\t\ttrainset,\n", 236 | "\t\tbatch_size=batch_size,\n", 237 | "\t\tshuffle=True,\n", 238 | "\t\tdrop_last=True,\n", 239 | "\t\tnum_workers=n_workers,\n", 240 | "\t\tpin_memory=True,\n", 241 | "\t\tcollate_fn=collate_batch,\n", 242 | "\t)\n", 243 | "\tvalid_loader = DataLoader(\n", 244 | "\t\tvalidset,\n", 245 | "\t\tbatch_size=batch_size,\n", 246 | "\t\tnum_workers=n_workers,\n", 247 | "\t\tdrop_last=True,\n", 248 | "\t\tpin_memory=True,\n", 249 | "\t\tcollate_fn=collate_batch,\n", 250 | "\t)\n", 251 | "\n", 252 | "\treturn train_loader, valid_loader, speaker_num" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 4, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "from torch import nn\n", 262 | "#ref: https://gist.github.com/pohanchi/c77f6dbfbcbc21c5215acde4f62e4362\n", 263 | "class SelfAttentionPooling(nn.Module):\n", 264 | " \"\"\"\n", 265 | " Implementation of SelfAttentionPooling \n", 266 | " Original Paper: Self-Attention Encoding and Pooling for Speaker Recognition\n", 267 | " https://arxiv.org/pdf/2008.01077v1.pdf\n", 268 | " \"\"\"\n", 269 | " def __init__(self, input_dim):\n", 270 | " super(SelfAttentionPooling, self).__init__()\n", 271 | " self.W = nn.Linear(input_dim, 1)\n", 272 | " \n", 273 | " def forward(self, batch_rep):\n", 274 | " \"\"\"\n", 275 | " input:\n", 276 | " batch_rep : size (N, T, H), N: batch size, T: sequence length, H: Hidden dimension\n", 277 | " \n", 278 | " attention_weight:\n", 279 | " att_w : size (N, T, 1)\n", 280 | " \n", 281 | " return:\n", 282 | " utter_rep: size (N, H)\n", 283 | " \"\"\"\n", 284 | " softmax = nn.functional.softmax\n", 285 | " att_w = softmax(self.W(batch_rep).squeeze(-1),dim=1).unsqueeze(-1)\n", 286 | " utter_rep = torch.sum(batch_rep * att_w, dim=1)\n", 287 | "\n", 288 | " return utter_rep" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "id": "5FOSZYxrMqhc" 295 | }, 296 | "source": [ 297 | "# Model\n", 298 | "- TransformerEncoderLayer:\n", 299 | " - Base transformer encoder layer in [Attention Is All You Need](https://arxiv.org/abs/1706.03762)\n", 300 | " - 
Parameters:\n", 301 | " - d_model: the number of expected features of the input (required).\n", 302 | "\n", 303 | " - nhead: the number of heads of the multiheadattention models (required).\n", 304 | "\n", 305 | " - dim_feedforward: the dimension of the feedforward network model (default=2048).\n", 306 | "\n", 307 | " - dropout: the dropout value (default=0.1).\n", 308 | "\n", 309 | " - activation: the activation function of intermediate layer, relu or gelu (default=relu).\n", 310 | "\n", 311 | "- TransformerEncoder:\n", 312 | " - TransformerEncoder is a stack of N transformer encoder layers\n", 313 | " - Parameters:\n", 314 | " - encoder_layer: an instance of the TransformerEncoderLayer() class (required).\n", 315 | "\n", 316 | " - num_layers: the number of sub-encoder-layers in the encoder (required).\n", 317 | "\n", 318 | " - norm: the layer normalization component (optional)." 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 15, 324 | "metadata": { 325 | "id": "iXZ5B0EKJGs8" 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "import torch\n", 330 | "import torch.nn as nn\n", 331 | "import torch.nn.functional as F\n", 332 | "\n", 333 | "\n", 334 | "class Classifier_cur(nn.Module):\n", 335 | "\tdef __init__(self, d_model=160, n_spks=600, dropout=0.1):\n", 336 | "\t\tsuper().__init__()\n", 337 | "\t\t# Project the dimension of features from that of input into d_model.\n", 338 | "\t\tself.prenet = nn.Linear(40, d_model)\n", 339 | "\t\t# TODO:\n", 340 | "\t\t# Change Transformer to Conformer.\n", 341 | "\t\t# https://arxiv.org/abs/2005.08100\n", 342 | "\t\t# self.encoder_layer = nn.TransformerEncoderLayer(\n", 343 | "\t\t# \td_model=d_model, dim_feedforward=256, nhead=1\n", 344 | "\t\t# )\n", 345 | "\t\t# self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)\n", 346 | "\t\tself.encoder = torchaudio.models.Conformer(input_dim=d_model, num_heads=1, ffn_dim=128, num_layers = 4,dropout=0.1,depthwise_conv_kernel_size=21)\n", 347 | "\t\t# Project the the dimension of features from d_model into speaker nums.\n", 348 | "\t\tself.pred_layer = nn.Sequential(\n", 349 | "\t\t\tnn.Linear(d_model, n_spks),\n", 350 | "\t\t)\n", 351 | "\t\tself.pooling = SelfAttentionPooling(d_model)\n", 352 | "\n", 353 | "\tdef forward(self, mels):\n", 354 | "\t\t\"\"\"\n", 355 | "\t\targs:\n", 356 | "\t\t\tmels: (batch size, length, 40)\n", 357 | "\t\treturn:\n", 358 | "\t\t\tout: (batch size, n_spks)\n", 359 | "\t\t\"\"\"\n", 360 | "\t\t# out: (batch size, length, d_model) 32 128 40\n", 361 | "\t\tout = self.prenet(mels)\n", 362 | "\t\t# out: (length, batch size, d_model)\n", 363 | "\t\t# out = out.permute(1, 0, 2)\n", 364 | "\t\t# The encoder layer expect features in the shape of (length, batch size, d_model).\n", 365 | "\t\t# out = self.encoder_layer(out)\n", 366 | "\t\tlengths = torch.full((out.shape[0],),out.shape[1]).to(\"cuda\")\n", 367 | "\t\tout = self.encoder(out, lengths)\n", 368 | "\t\t# out: (batch size, length, d_model)\n", 369 | "\t\t# out = out[0].transpose(0, 1)\n", 370 | "\t\t# mean pooling\n", 371 | "\t\t# stats = out[0].mean(dim=1)\n", 372 | "\t\t\n", 373 | "\t\tstats = self.pooling(out[0])\n", 374 | "\n", 375 | "\t\t# out: (batch, n_spks)\n", 376 | "\t\tout = self.pred_layer(stats)\n", 377 | "\t\treturn out" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "id": "W7yX8JinM5Ly" 384 | }, 385 | "source": [ 386 | "# Learning rate schedule\n", 387 | "- For transformer architecture, the design of learning rate schedule is 
different from that of CNN.\n", 388 | "- Previous works show that the warmup of learning rate is useful for training models with transformer architectures.\n", 389 | "- The warmup schedule\n", 390 | " - Set learning rate to 0 in the beginning.\n", 391 | " - The learning rate increases linearly from 0 to initial learning rate during warmup period." 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 6, 397 | "metadata": { 398 | "id": "ykt0N1nVJJi2" 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "import math\n", 403 | "\n", 404 | "import torch\n", 405 | "from torch.optim import Optimizer\n", 406 | "from torch.optim.lr_scheduler import LambdaLR\n", 407 | "\n", 408 | "\n", 409 | "def get_cosine_schedule_with_warmup(\n", 410 | "\toptimizer: Optimizer,\n", 411 | "\tnum_warmup_steps: int,\n", 412 | "\tnum_training_steps: int,\n", 413 | "\tnum_cycles: float = 0.5,\n", 414 | "\tlast_epoch: int = -1,\n", 415 | "):\n", 416 | "\t\"\"\"\n", 417 | "\tCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n", 418 | "\tinitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\n", 419 | "\tinitial lr set in the optimizer.\n", 420 | "\n", 421 | "\tArgs:\n", 422 | "\t\toptimizer (:class:`~torch.optim.Optimizer`):\n", 423 | "\t\tThe optimizer for which to schedule the learning rate.\n", 424 | "\t\tnum_warmup_steps (:obj:`int`):\n", 425 | "\t\tThe number of steps for the warmup phase.\n", 426 | "\t\tnum_training_steps (:obj:`int`):\n", 427 | "\t\tThe total number of training steps.\n", 428 | "\t\tnum_cycles (:obj:`float`, `optional`, defaults to 0.5):\n", 429 | "\t\tThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0\n", 430 | "\t\tfollowing a half-cosine).\n", 431 | "\t\tlast_epoch (:obj:`int`, `optional`, defaults to -1):\n", 432 | "\t\tThe index of the last epoch when resuming training.\n", 433 | "\n", 434 | "\tReturn:\n", 435 | "\t\t:obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.\n", 436 | "\t\"\"\"\n", 437 | "\tdef lr_lambda(current_step):\n", 438 | "\t\t# Warmup\n", 439 | "\t\tif current_step < num_warmup_steps:\n", 440 | "\t\t\treturn float(current_step) / float(max(1, num_warmup_steps))\n", 441 | "\t\t# decadence\n", 442 | "\t\tprogress = float(current_step - num_warmup_steps) / float(\n", 443 | "\t\t\tmax(1, num_training_steps - num_warmup_steps)\n", 444 | "\t\t)\n", 445 | "\t\treturn max(\n", 446 | "\t\t\t0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))\n", 447 | "\t\t)\n", 448 | "\n", 449 | "\treturn LambdaLR(optimizer, lr_lambda, last_epoch)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": { 455 | "id": "-LN2XkteM_uH" 456 | }, 457 | "source": [ 458 | "# Model Function\n", 459 | "- Model forward function." 
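[Added sketch, not implemented in either notebook] The Boss baseline also calls for Additive Margin Softmax (https://arxiv.org/pdf/1801.05599.pdf), while the model function below keeps plain CrossEntropyLoss. A hedged sketch of such a head, which would replace pred_layer plus the criterion (the pooled stats and labels would feed it directly); s=30 and m=0.35 are common values from the paper, not tuned ones:

import torch
import torch.nn as nn
import torch.nn.functional as F

class AMSoftmaxLoss(nn.Module):
    def __init__(self, in_dim, n_classes, s=30.0, m=0.35):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_dim, n_classes))
        self.s, self.m = s, m

    def forward(self, embeddings, labels):
        # Cosine similarity between L2-normalized embeddings and class weights.
        cos = F.normalize(embeddings, dim=1) @ F.normalize(self.weight, dim=0)
        # Subtract the margin from the target-class cosine only, then rescale by s.
        margin = self.m * F.one_hot(labels, cos.size(1)).float()
        return F.cross_entropy(self.s * (cos - margin), labels)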
460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 7, 465 | "metadata": { 466 | "id": "N-rr8529JMz0" 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "import torch\n", 471 | "\n", 472 | "\n", 473 | "def model_fn(batch, model, criterion, device):\n", 474 | "\t\"\"\"Forward a batch through the model.\"\"\"\n", 475 | "\n", 476 | "\tmels, labels = batch\n", 477 | "\tmels = mels.to(device)\n", 478 | "\tlabels = labels.to(device)\n", 479 | "\n", 480 | "\touts = model(mels)\n", 481 | "\n", 482 | "\tloss = criterion(outs, labels)\n", 483 | "\n", 484 | "\t# Get the speaker id with highest probability.\n", 485 | "\tpreds = outs.argmax(1)\n", 486 | "\t# Compute accuracy.\n", 487 | "\taccuracy = torch.mean((preds == labels).float())\n", 488 | "\n", 489 | "\treturn loss, accuracy" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "id": "cwM_xyOtNCI2" 496 | }, 497 | "source": [ 498 | "# Validate\n", 499 | "- Calculate accuracy of the validation set." 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 8, 505 | "metadata": { 506 | "id": "YAiv6kpdJRTJ" 507 | }, 508 | "outputs": [], 509 | "source": [ 510 | "from tqdm import tqdm\n", 511 | "import torch\n", 512 | "\n", 513 | "\n", 514 | "def valid(dataloader, model, criterion, device): \n", 515 | "\t\"\"\"Validate on validation set.\"\"\"\n", 516 | "\n", 517 | "\tmodel.eval()\n", 518 | "\trunning_loss = 0.0\n", 519 | "\trunning_accuracy = 0.0\n", 520 | "\t# pbar = tqdm(total=len(dataloader.dataset), ncols=0, desc=\"Valid\", unit=\" uttr\")\n", 521 | "\n", 522 | "\tfor i, batch in enumerate(dataloader):\n", 523 | "\t\twith torch.no_grad():\n", 524 | "\t\t\tloss, accuracy = model_fn(batch, model, criterion, device)\n", 525 | "\t\t\trunning_loss += loss.item()\n", 526 | "\t\t\trunning_accuracy += accuracy.item()\n", 527 | "\n", 528 | "\t# \tpbar.update(dataloader.batch_size)\n", 529 | "\t# \tpbar.set_postfix(\n", 530 | "\t# \t\tloss=f\"{running_loss / (i+1):.2f}\",\n", 531 | "\t# \t\taccuracy=f\"{running_accuracy / (i+1):.2f}\",\n", 532 | "\t# \t)\n", 533 | "\n", 534 | "\t# pbar.close()\n", 535 | "\tmodel.train()\n", 536 | "\n", 537 | "\treturn running_accuracy / len(dataloader)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "id": "g6ne9G-eNEdG" 544 | }, 545 | "source": [ 546 | "# Main function" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 9, 552 | "metadata": { 553 | "id": "Usv9s-CuJSG7" 554 | }, 555 | "outputs": [ 556 | { 557 | "ename": "NameError", 558 | "evalue": "name 'main' is not defined", 559 | "output_type": "error", 560 | "traceback": [ 561 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 562 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 563 | "\u001b[1;32m/home/tracy/Projects/ML2022/hw4/hw04(try ensemble).ipynb Cell 19'\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[39m# def parse_args():\u001b[39;00m\n\u001b[1;32m 10\u001b[0m \u001b[39m# \t\"\"\"arguments\"\"\"\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[39m# \tconfig = {\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 131\u001b[0m \n\u001b[1;32m 132\u001b[0m \t\u001b[39m# # pbar.close()\u001b[39;00m\n\u001b[1;32m 135\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m__name__\u001b[39m \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m__main__\u001b[39m\u001b[39m\"\u001b[39m:\n\u001b[0;32m--> 136\u001b[0m 
\tmain(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mparse_args())\n", 564 | "\u001b[0;31mNameError\u001b[0m: name 'main' is not defined" 565 | ] 566 | } 567 | ], 568 | "source": [ 569 | "from tqdm import tqdm\n", 570 | "\n", 571 | "import torch\n", 572 | "import torch.nn as nn\n", 573 | "from torch.optim import AdamW\n", 574 | "from torch.utils.data import DataLoader, random_split\n", 575 | "\n", 576 | "\n", 577 | "def parse_args():\n", 578 | "\t\"\"\"arguments\"\"\"\n", 579 | "\tconfig = {\n", 580 | "\t\t\"data_dir\": \"./Dataset\",\n", 581 | "\t\t\"save_path\": \"model4.ckpt\",\n", 582 | "\t\t\"batch_size\": 32,\n", 583 | "\t\t\"n_workers\": 8,\n", 584 | "\t\t\"valid_steps\": 2000,\n", 585 | "\t\t\"warmup_steps\": 1000,\n", 586 | "\t\t\"save_steps\": 10000,\n", 587 | "\t\t\"total_steps\": 500000,\n", 588 | "\t}\n", 589 | "\n", 590 | "\treturn config\n", 591 | "\n", 592 | "\n", 593 | "def main(\n", 594 | "\tdata_dir,\n", 595 | "\tsave_path,\n", 596 | "\tbatch_size,\n", 597 | "\tn_workers,\n", 598 | "\tvalid_steps,\n", 599 | "\twarmup_steps,\n", 600 | "\ttotal_steps,\n", 601 | "\tsave_steps,\n", 602 | "):\n", 603 | "\t\"\"\"Main function.\"\"\"\n", 604 | "\tdevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 605 | "\tprint(f\"[Info]: Use {device} now!\")\n", 606 | "\n", 607 | "\ttrain_loader, valid_loader, speaker_num = get_dataloader(data_dir, batch_size, n_workers)\n", 608 | "\ttrain_iterator = iter(train_loader)\n", 609 | "\tprint(f\"[Info]: Finish loading data!\",flush = True)\n", 610 | "\n", 611 | "\t# Define the ensemble\n", 612 | "\tensemble = VotingClassifier(\n", 613 | "\t\testimator=Classifier(n_spks=speaker_num), # here is your deep learning model\n", 614 | "\t\tn_estimators=4,\n", 615 | "\t\tcuda=True, # number of base estimators\n", 616 | "\t)\n", 617 | "\tio.load(ensemble, \"./model\") # reload\n", 618 | "\t# Set the criterion\n", 619 | "\tcriterion = nn.CrossEntropyLoss() # training objective\n", 620 | "\tensemble.set_criterion(criterion)\n", 621 | "\n", 622 | "\t# Set the optimizer\n", 623 | "\tensemble.set_optimizer(\n", 624 | "\t\t\"Adam\", # type of parameter optimizer\n", 625 | "\t\tlr=1e-3, # learning rate of parameter optimizer\n", 626 | "\t\t# weight_decay=1e-5, # weight decay of parameter optimizer\n", 627 | "\t)\n", 628 | "\n", 629 | "\t# Set the learning rate scheduler\n", 630 | "\tensemble.set_scheduler(\n", 631 | "\t\t\"CosineAnnealingLR\", # type of learning rate scheduler\n", 632 | "\t\tT_max=30, # additional arguments on the scheduler\n", 633 | "\t)\n", 634 | "\n", 635 | "\t# Train the ensemble\n", 636 | "\tensemble.fit(\n", 637 | "\t\ttrain_loader,\n", 638 | "\t\tepochs=100, \n", 639 | "\t\ttest_loader=valid_loader,\n", 640 | "\t\tsave_model=True,\n", 641 | "\t\tsave_dir = \"./model\",\n", 642 | "\t)\n", 643 | "\n", 644 | "\n", 645 | "if __name__ == \"__main__\":\n", 646 | "\tmain(**parse_args())" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": { 652 | "id": "NLatBYAhNNMx" 653 | }, 654 | "source": [ 655 | "# Inference\n", 656 | "\n", 657 | "## Dataset of inference" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 10, 663 | "metadata": { 664 | "colab": { 665 | "background_save": true 666 | }, 667 | "id": "efS4pCmAJXJH" 668 | }, 669 | "outputs": [], 670 | "source": [ 671 | "import os\n", 672 | "import json\n", 673 | "import torch\n", 674 | "from pathlib import Path\n", 675 | "from torch.utils.data import Dataset\n", 676 | "\n", 677 | "\n", 678 | "class InferenceDataset(Dataset):\n", 
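"\t# Reads testdata.json for the list of test utterances; each item yields\n",
"\t# (feature_path, mel) so predictions can be matched back to utterances.\n",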
679 | "\tdef __init__(self, data_dir):\n", 680 | "\t\ttestdata_path = Path(data_dir) / \"testdata.json\"\n", 681 | "\t\tmetadata = json.load(testdata_path.open())\n", 682 | "\t\tself.data_dir = data_dir\n", 683 | "\t\tself.data = metadata[\"utterances\"]\n", 684 | "\n", 685 | "\tdef __len__(self):\n", 686 | "\t\treturn len(self.data)\n", 687 | "\n", 688 | "\tdef __getitem__(self, index):\n", 689 | "\t\tutterance = self.data[index]\n", 690 | "\t\tfeat_path = utterance[\"feature_path\"]\n", 691 | "\t\tmel = torch.load(os.path.join(self.data_dir, feat_path))\n", 692 | "\n", 693 | "\t\treturn feat_path, mel\n", 694 | "\n", 695 | "\n", 696 | "def inference_collate_batch(batch):\n", 697 | "\t\"\"\"Collate a batch of data.\"\"\"\n", 698 | "\tfeat_paths, mels = zip(*batch)\n", 699 | "\n", 700 | "\treturn feat_paths, torch.stack(mels)" 701 | ] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "metadata": { 706 | "id": "tl0WnYwxNK_S" 707 | }, 708 | "source": [ 709 | "## Main funcrion of Inference" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 16, 715 | "metadata": {}, 716 | "outputs": [], 717 | "source": [ 718 | "class Classifier(nn.Module):\n", 719 | "\tdef __init__(self, d_model=128, n_spks=600, dropout=0.1):\n", 720 | "\t\tsuper().__init__()\n", 721 | "\t\t# Project the dimension of features from that of input into d_model.\n", 722 | "\t\tself.prenet = nn.Linear(40, d_model)\n", 723 | "\t\t# TODO:\n", 724 | "\t\t# Change Transformer to Conformer.\n", 725 | "\t\t# https://arxiv.org/abs/2005.08100\n", 726 | "\t\t# self.encoder_layer = nn.TransformerEncoderLayer(\n", 727 | "\t\t# \td_model=d_model, dim_feedforward=256, nhead=1\n", 728 | "\t\t# )\n", 729 | "\t\t# self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)\n", 730 | "\t\tself.encoder = torchaudio.models.Conformer(input_dim=d_model, num_heads=4, ffn_dim=128, num_layers = 4,dropout=0.2,depthwise_conv_kernel_size=17)\n", 731 | "\t\t# Project the the dimension of features from d_model into speaker nums.\n", 732 | "\t\tself.pred_layer = nn.Sequential(\n", 733 | "\t\t\tnn.BatchNorm1d(d_model),\n", 734 | "\t\t\tnn.Linear(d_model, d_model),\n", 735 | "\t\t\tnn.ReLU(),\n", 736 | "\t\t\tnn.BatchNorm1d(d_model),\n", 737 | "\t\t\tnn.Linear(d_model, n_spks),\n", 738 | "\t\t)\n", 739 | "\t\tself.pooling = SelfAttentionPooling(d_model)\n", 740 | "\n", 741 | "\tdef forward(self, mels):\n", 742 | "\t\t\"\"\"\n", 743 | "\t\targs:\n", 744 | "\t\t\tmels: (batch size, length, 40)\n", 745 | "\t\treturn:\n", 746 | "\t\t\tout: (batch size, n_spks)\n", 747 | "\t\t\"\"\"\n", 748 | "\t\t# out: (batch size, length, d_model) 32 128 40\n", 749 | "\t\tout = self.prenet(mels)\n", 750 | "\t\t# out: (length, batch size, d_model)\n", 751 | "\t\t# out = out.permute(1, 0, 2)\n", 752 | "\t\t# The encoder layer expect features in the shape of (length, batch size, d_model).\n", 753 | "\t\t# out = self.encoder_layer(out)\n", 754 | "\t\tlengths = torch.full((out.shape[0],),out.shape[1]).to(\"cuda\")\n", 755 | "\t\tout = self.encoder(out, lengths)\n", 756 | "\t\t# out: (batch size, length, d_model)\n", 757 | "\t\t# out = out[0].transpose(0, 1)\n", 758 | "\t\t# mean pooling\n", 759 | "\t\t# stats = out[0].mean(dim=1)\n", 760 | "\t\t\n", 761 | "\t\tstats = self.pooling(out[0])\n", 762 | "\n", 763 | "\t\t# out: (batch, n_spks)\n", 764 | "\t\tout = self.pred_layer(stats)\n", 765 | "\t\treturn out\n", 766 | "\n", 767 | "class Classifier4(nn.Module):\n", 768 | "\tdef __init__(self, d_model=160, n_spks=600, dropout=0.1):\n", 769 | 
"\t\tsuper().__init__()\n", 770 | "\t\t# Project the dimension of features from that of input into d_model.\n", 771 | "\t\tself.prenet = nn.Linear(40, d_model)\n", 772 | "\t\t# TODO:\n", 773 | "\t\t# Change Transformer to Conformer.\n", 774 | "\t\t# https://arxiv.org/abs/2005.08100\n", 775 | "\t\t# self.encoder_layer = nn.TransformerEncoderLayer(\n", 776 | "\t\t# \td_model=d_model, dim_feedforward=256, nhead=1\n", 777 | "\t\t# )\n", 778 | "\t\t# self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)\n", 779 | "\t\tself.encoder = torchaudio.models.Conformer(input_dim=d_model, num_heads=1, ffn_dim=128, num_layers = 6,dropout=0.1,depthwise_conv_kernel_size=21)\n", 780 | "\t\t# Project the the dimension of features from d_model into speaker nums.\n", 781 | "\t\tself.pred_layer = nn.Sequential(\n", 782 | "\t\t\tnn.Linear(d_model, n_spks),\n", 783 | "\t\t)\n", 784 | "\t\tself.pooling = SelfAttentionPooling(d_model)\n", 785 | "\n", 786 | "\tdef forward(self, mels):\n", 787 | "\t\t\"\"\"\n", 788 | "\t\targs:\n", 789 | "\t\t\tmels: (batch size, length, 40)\n", 790 | "\t\treturn:\n", 791 | "\t\t\tout: (batch size, n_spks)\n", 792 | "\t\t\"\"\"\n", 793 | "\t\t# out: (batch size, length, d_model) 32 128 40\n", 794 | "\t\tout = self.prenet(mels)\n", 795 | "\t\t# out: (length, batch size, d_model)\n", 796 | "\t\t# out = out.permute(1, 0, 2)\n", 797 | "\t\t# The encoder layer expect features in the shape of (length, batch size, d_model).\n", 798 | "\t\t# out = self.encoder_layer(out)\n", 799 | "\t\tlengths = torch.full((out.shape[0],),out.shape[1]).to(\"cuda\")\n", 800 | "\t\tout = self.encoder(out, lengths)\n", 801 | "\t\t# out: (batch size, length, d_model)\n", 802 | "\t\t# out = out[0].transpose(0, 1)\n", 803 | "\t\t# mean pooling\n", 804 | "\t\t# stats = out[0].mean(dim=1)\n", 805 | "\t\t\n", 806 | "\t\tstats = self.pooling(out[0])\n", 807 | "\n", 808 | "\t\t# out: (batch, n_spks)\n", 809 | "\t\tout = self.pred_layer(stats)\n", 810 | "\t\treturn out" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": 18, 816 | "metadata": { 817 | "colab": { 818 | "background_save": true 819 | }, 820 | "id": "i8SAbuXEJb2A" 821 | }, 822 | "outputs": [ 823 | { 824 | "name": "stdout", 825 | "output_type": "stream", 826 | "text": [ 827 | "[Info]: Use cuda now!\n", 828 | "[Info]: Finish loading data!\n", 829 | "[Info]: Finish creating model!\n", 830 | "[Info]: Finish creating model!\n" 831 | ] 832 | } 833 | ], 834 | "source": [ 835 | "import json\n", 836 | "import csv\n", 837 | "from pathlib import Path\n", 838 | "from tqdm.notebook import tqdm\n", 839 | "from torchensemble.utils import io\n", 840 | "\n", 841 | "import torch\n", 842 | "from torch.utils.data import DataLoader\n", 843 | "\n", 844 | "def parse_args():\n", 845 | "\t\"\"\"arguments\"\"\"\n", 846 | "\tconfig = {\n", 847 | "\t\t\"data_dir\": \"./Dataset\",\n", 848 | "\t\t\"model_path\": \"./model_ensemble.ckpt\",\n", 849 | "\t\t\"output_path\": \"./output_ensemble2.csv\",\n", 850 | "\t}\n", 851 | "\n", 852 | "\treturn config\n", 853 | "\n", 854 | "\n", 855 | "def main(\n", 856 | "\tdata_dir,\n", 857 | "\tmodel_path,\n", 858 | "\toutput_path,\n", 859 | "):\n", 860 | "\t\"\"\"Main function.\"\"\"\n", 861 | "\tdevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 862 | "\tprint(f\"[Info]: Use {device} now!\")\n", 863 | "\n", 864 | "\tmapping_path = Path(data_dir) / \"mapping.json\"\n", 865 | "\tmapping = json.load(mapping_path.open())\n", 866 | "\n", 867 | "\tdataset = InferenceDataset(data_dir)\n", 
868 | "\tdataloader = DataLoader(\n", 869 | "\t\tdataset,\n", 870 | "\t\tbatch_size=1,\n", 871 | "\t\tshuffle=False,\n", 872 | "\t\tdrop_last=False,\n", 873 | "\t\tnum_workers=8,\n", 874 | "\t\tcollate_fn=inference_collate_batch,\n", 875 | "\t)\n", 876 | "\tprint(f\"[Info]: Finish loading data!\",flush = True)\n", 877 | "\n", 878 | "\tspeaker_num = len(mapping[\"id2speaker\"])\n", 879 | "\t# model = Classifier(n_spks=speaker_num).to(device)\n", 880 | "\tmodel = VotingClassifier(\n", 881 | "\t\testimator=Classifier_cur(n_spks=speaker_num), # here is your deep learning model\n", 882 | "\t\tn_estimators=4,\n", 883 | "\t\tcuda=True, # number of base estimators\n", 884 | "\t)\n", 885 | "\tio.load(model, \"./model\") # reload\n", 886 | "\t# model.load_state_dict(torch.load(model_path))\n", 887 | "\tmodel.eval()\n", 888 | "\tprint(f\"[Info]: Finish creating model!\",flush = True)\n", 889 | "\n", 890 | "\t# import model3 and model4\n", 891 | "\tmodel3 = Classifier(n_spks=speaker_num).to(device)\n", 892 | "\tmodel3.load_state_dict(torch.load(\"model3.ckpt\"))\n", 893 | "\tmodel3.eval()\n", 894 | "\tprint(f\"[Info]: Finish creating model!\",flush = True)\n", 895 | "\tmodel4 = Classifier4(n_spks=speaker_num).to(device)\n", 896 | "\tmodel4.load_state_dict(torch.load(\"model4.ckpt\"))\n", 897 | "\tmodel4.eval()\n", 898 | "\n", 899 | "\n", 900 | "\tresults = [[\"Id\", \"Category\"]]\n", 901 | "\tfor feat_paths, mels in dataloader:\n", 902 | "\t\twith torch.no_grad():\n", 903 | "\t\t\tmels = mels.to(device)\n", 904 | "\t\t\touts3 = model3(mels)\n", 905 | "\t\t\touts4 = model4(mels)\n", 906 | "\t\t\touts_cur = model(mels)\n", 907 | "\t\t\touts = (outs_cur + outs3 + outs4) / 3\n", 908 | "\t\t\t# outs = model(mels)\n", 909 | "\t\t\tpreds = outs.argmax(1).cpu().numpy()\n", 910 | "\t\t\tfor feat_path, pred in zip(feat_paths, preds):\n", 911 | "\t\t\t\tresults.append([feat_path, mapping[\"id2speaker\"][str(pred)]])\n", 912 | "\n", 913 | "\twith open(output_path, 'w', newline='') as csvfile:\n", 914 | "\t\twriter = csv.writer(csvfile)\n", 915 | "\t\twriter.writerows(results)\n", 916 | "\n", 917 | "\n", 918 | "if __name__ == \"__main__\":\n", 919 | "\tmain(**parse_args())" 920 | ] 921 | } 922 | ], 923 | "metadata": { 924 | "accelerator": "GPU", 925 | "colab": { 926 | "collapsed_sections": [], 927 | "name": "hw04.ipynb", 928 | "provenance": [] 929 | }, 930 | "kernelspec": { 931 | "display_name": "Python 3 (ipykernel)", 932 | "language": "python", 933 | "name": "python3" 934 | }, 935 | "language_info": { 936 | "codemirror_mode": { 937 | "name": "ipython", 938 | "version": 3 939 | }, 940 | "file_extension": ".py", 941 | "mimetype": "text/x-python", 942 | "name": "python", 943 | "nbconvert_exporter": "python", 944 | "pygments_lexer": "ipython3", 945 | "version": "3.10.2" 946 | } 947 | }, 948 | "nbformat": 4, 949 | "nbformat_minor": 1 950 | } 951 | -------------------------------------------------------------------------------- /Hw4/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macTracyHuang/NTU-ML2022-Spring/0c14ae8d9a1448ee2da03f93836e3dd5d3a62b16/Hw4/report.pdf -------------------------------------------------------------------------------- /Hw5/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macTracyHuang/NTU-ML2022-Spring/0c14ae8d9a1448ee2da03f93836e3dd5d3a62b16/Hw5/report.pdf 
-------------------------------------------------------------------------------- /Hw6/ml_hw6_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macTracyHuang/NTU-ML2022-Spring/0c14ae8d9a1448ee2da03f93836e3dd5d3a62b16/Hw6/ml_hw6_report.pdf -------------------------------------------------------------------------------- /Hw7/ML2022Spring_HW7 m1 0.83057.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "xvSGDbExff_I" 7 | }, 8 | "source": [ 9 | "# **Homework 7 - Bert (Question Answering)**\n", 10 | "\n", 11 | "If you have any questions, feel free to email us at mlta-2022-spring@googlegroups.com\n", 12 | "\n", 13 | "\n", 14 | "\n", 15 | "Slide: [Link](https://docs.google.com/presentation/d/1H5ZONrb2LMOCixLY7D5_5-7LkIaXO6AGEaV2mRdTOMY/edit?usp=sharing) Kaggle: [Link](https://www.kaggle.com/c/ml2022spring-hw7) Data: [Link](https://drive.google.com/uc?id=1AVgZvy3VFeg0fX-6WQJMHPVrx3A-M1kb)\n", 16 | "\n", 17 | "\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "WGOr_eS3wJJf" 24 | }, 25 | "source": [ 26 | "## Task description\n", 27 | "- Chinese Extractive Question Answering\n", 28 | " - Input: Paragraph + Question\n", 29 | " - Output: Answer\n", 30 | "\n", 31 | "- Objective: Learn how to fine tune a pretrained model on downstream task using transformers\n", 32 | "\n", 33 | "- Todo\n", 34 | " - Fine tune a pretrained chinese BERT model\n", 35 | " - Change hyperparameters (e.g. doc_stride)\n", 36 | " - Apply linear learning rate decay\n", 37 | " - Try other pretrained models\n", 38 | " - Improve preprocessing\n", 39 | " - Improve postprocessing\n", 40 | "- Training tips\n", 41 | " - Automatic mixed precision\n", 42 | " - Gradient accumulation\n", 43 | " - Ensemble\n", 44 | "\n", 45 | "- Estimated training time (tesla t4 with automatic mixed precision enabled)\n", 46 | " - Simple: 8mins\n", 47 | " - Medium: 8mins\n", 48 | " - Strong: 25mins\n", 49 | " - Boss: 2.5hrs\n", 50 | " " 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "TJ1fSAJE2oaC" 57 | }, 58 | "source": [ 59 | "## Download Dataset" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 1, 65 | "metadata": { 66 | "id": "YPrc4Eie9Yo5" 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "# import gdown\n", 71 | "# # Download link 1\n", 72 | "# !gdown --id '1AVgZvy3VFeg0fX-6WQJMHPVrx3A-M1kb' --output hw7_data.zip\n", 73 | "\n", 74 | "# # Download Link 2 (if the above link fails) \n", 75 | "# # !gdown --id '1qwjbRjq481lHsnTrrF4OjKQnxzgoLEFR' --output hw7_data.zip\n", 76 | "\n", 77 | "# # Download Link 3 (if the above link fails) \n", 78 | "# # !gdown --id '1QXuWjNRZH6DscSd6QcRER0cnxmpZvijn' --output hw7_data.zip\n", 79 | "\n", 80 | "# !unzip -o hw7_data.zip\n", 81 | "\n", 82 | "# # For this HW, K80 < P4 < T4 < P100 <= T4(fp16) < V100\n", 83 | "# !nvidia-smi" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "id": "TevOvhC03m0h" 90 | }, 91 | "source": [ 92 | "## Install transformers\n", 93 | "\n", 94 | "Documentation for the toolkit: https://huggingface.co/transformers/" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 2, 100 | "metadata": { 101 | "id": "tbxWFX_jpDom" 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# # You are allowed to change version of transformers or use other toolkits\n", 106 | "# !pip install 
transformers==4.18.0" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "id": "8dKM4yCh4LI_" 113 | }, 114 | "source": [ 115 | "## Import Packages" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 3, 121 | "metadata": { 122 | "id": "WOTHHtWJoahe" 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "import json\n", 127 | "import numpy as np\n", 128 | "import random\n", 129 | "import torch\n", 130 | "from torch.utils.data import DataLoader, Dataset \n", 131 | "from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast, get_linear_schedule_with_warmup\n", 132 | "import os\n", 133 | "\n", 134 | "from tqdm.auto import tqdm\n", 135 | "\n", 136 | "os.environ['CUDA_VISIBLE_DEVICES'] = \"0\"\n", 137 | "\n", 138 | "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", 139 | "\n", 140 | "# Fix random seed for reproducibility\n", 141 | "def same_seeds(seed):\n", 142 | "\t torch.manual_seed(seed)\n", 143 | "\t if torch.cuda.is_available():\n", 144 | "\t\t torch.cuda.manual_seed(seed)\n", 145 | "\t\t torch.cuda.manual_seed_all(seed)\n", 146 | "\t np.random.seed(seed)\n", 147 | "\t random.seed(seed)\n", 148 | "\t torch.backends.cudnn.benchmark = False\n", 149 | "\t torch.backends.cudnn.deterministic = True\n", 150 | "same_seeds(0)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 4, 156 | "metadata": { 157 | "id": "7pBtSZP1SKQO" 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# Change \"fp16_training\" to True to support automatic mixed precision training (fp16)\t\n", 162 | "fp16_training = True\n", 163 | "\n", 164 | "if fp16_training:\n", 165 | " # !pip install accelerate==0.2.0\n", 166 | " from accelerate import Accelerator\n", 167 | " accelerator = Accelerator(fp16=True)\n", 168 | " device = accelerator.device\n", 169 | "\n", 170 | "# Documentation for the toolkit: https://huggingface.co/docs/accelerate/" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "id": "2YgXHuVLp_6j" 177 | }, 178 | "source": [ 179 | "## Load Model and Tokenizer\n", 180 | "\n", 181 | "\n", 182 | "\n", 183 | "\n", 184 | " " 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 5, 190 | "metadata": { 191 | "id": "xyBCYGjAp3ym" 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stderr", 196 | "output_type": "stream", 197 | "text": [ 198 | "Some weights of the model checkpoint at hfl/chinese-macbert-large were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']\n", 199 | "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 200 | "- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", 201 | "Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at hfl/chinese-macbert-large and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']\n", 202 | "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "# model = BertForQuestionAnswering.from_pretrained(\"bert-base-chinese\").to(device)\n", 208 | "# tokenizer = BertTokenizerFast.from_pretrained(\"bert-base-chinese\")\n", 209 | "pre_model = \"hfl/chinese-macbert-large\"\n", 210 | "tokenizer = BertTokenizerFast.from_pretrained(pre_model)\n", 211 | "model = BertForQuestionAnswering.from_pretrained(pre_model).to(device)\n", 212 | "\n", 213 | "# tokenizer = RobertaTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')\n", 214 | "# model = RobertaModel.from_pretrained('hfl/chinese-roberta-wwm-ext')\n", 215 | "\n", 216 | "# You can safely ignore the warning message (it pops up because new prediction heads for QA are initialized randomly)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": { 222 | "id": "3Td-GTmk5OW4" 223 | }, 224 | "source": [ 225 | "## Read Data\n", 226 | "\n", 227 | "- Training set: 31690 QA pairs\n", 228 | "- Dev set: 4131 QA pairs\n", 229 | "- Test set: 4957 QA pairs\n", 230 | "\n", 231 | "- {train/dev/test}_questions:\t\n", 232 | " - List of dicts with the following keys:\n", 233 | " - id (int)\n", 234 | " - paragraph_id (int)\n", 235 | " - question_text (string)\n", 236 | " - answer_text (string)\n", 237 | " - answer_start (int)\n", 238 | " - answer_end (int)\n", 239 | "- {train/dev/test}_paragraphs: \n", 240 | " - List of strings\n", 241 | " - paragraph_ids in questions correspond to indexs in paragraphs\n", 242 | " - A paragraph may be used by several questions " 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 6, 248 | "metadata": { 249 | "id": "NvX7hlepogvu" 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "def read_data(file):\n", 254 | " with open(file, 'r', encoding=\"utf-8\") as reader:\n", 255 | " data = json.load(reader)\n", 256 | " return data[\"questions\"], data[\"paragraphs\"]\n", 257 | "\n", 258 | "train_questions, train_paragraphs = read_data(\"hw7_train.json\")\n", 259 | "dev_questions, dev_paragraphs = read_data(\"hw7_dev.json\")\n", 260 | "test_questions, test_paragraphs = read_data(\"hw7_test.json\")" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "id": "Fm0rpTHq0e4N" 267 | }, 268 | "source": [ 269 | "## Tokenize Data" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 7, 275 | "metadata": { 276 | "id": "rTZ6B70Hoxie" 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "# Tokenize questions and paragraphs separately\n", 281 | "# 「add_special_tokens」 is set to False since special tokens will be added when tokenized questions and paragraphs are combined in datset __getitem__ \n", 282 | "\n", 283 | "train_questions_tokenized = tokenizer([train_question[\"question_text\"] for train_question in train_questions], add_special_tokens=False)\n", 284 | "dev_questions_tokenized = 
tokenizer([dev_question[\"question_text\"] for dev_question in dev_questions], add_special_tokens=False)\n", 285 | "test_questions_tokenized = tokenizer([test_question[\"question_text\"] for test_question in test_questions], add_special_tokens=False) \n", 286 | "\n", 287 | "train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)\n", 288 | "dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)\n", 289 | "test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)\n", 290 | "\n", 291 | "# You can safely ignore the warning message as tokenized sequences will be futher processed in datset __getitem__ before passing to model" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "id": "Ws8c8_4d5UCI" 298 | }, 299 | "source": [ 300 | "## Dataset and Dataloader" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 8, 306 | "metadata": { 307 | "id": "Xjooag-Swnuh" 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "class QA_Dataset(Dataset):\n", 312 | " def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):\n", 313 | " self.split = split\n", 314 | " self.questions = questions\n", 315 | " self.tokenized_questions = tokenized_questions\n", 316 | " self.tokenized_paragraphs = tokenized_paragraphs\n", 317 | " self.max_question_len = 40\n", 318 | " self.max_paragraph_len = 466\n", 319 | " \n", 320 | " ##### TODO: Change value of doc_stride #####\n", 321 | " self.doc_stride = self.max_paragraph_len // 2\n", 322 | "\n", 323 | " # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]\n", 324 | " self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1\n", 325 | "\n", 326 | " def __len__(self):\n", 327 | " return len(self.questions)\n", 328 | "\n", 329 | " def __getitem__(self, idx):\n", 330 | " question = self.questions[idx]\n", 331 | " tokenized_question = self.tokenized_questions[idx]\n", 332 | " tokenized_paragraph = self.tokenized_paragraphs[question[\"paragraph_id\"]]\n", 333 | "\n", 334 | " ##### TODO: Preprocessing #####\n", 335 | " # Hint: How to prevent model from learning something it should not learn\n", 336 | " exceed = True if len(tokenized_paragraph) > self.max_paragraph_len else False\n", 337 | "\n", 338 | " if self.split == \"train\":\n", 339 | " # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph \n", 340 | " answer_start_token = tokenized_paragraph.char_to_token(question[\"answer_start\"])\n", 341 | " answer_end_token = tokenized_paragraph.char_to_token(question[\"answer_end\"])\n", 342 | "\n", 343 | " # A single window is obtained by slicing the portion of paragraph containing the answer\n", 344 | " if exceed:\n", 345 | " mid = (answer_start_token + answer_end_token) // 2\n", 346 | " paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))\n", 347 | " else:\n", 348 | " rand_start = random.randint(0, answer_start_token)\n", 349 | " paragraph_start = rand_start\n", 350 | " paragraph_end = paragraph_start + self.max_paragraph_len\n", 351 | "\n", 352 | " # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)\n", 353 | " input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102] \n", 354 | " input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]\t\t\n", 355 | " \n", 356 | " # Convert answer's start/end positions in 
tokenized_paragraph to start/end positions in the window\n", 357 | " answer_start_token += len(input_ids_question) - paragraph_start\n", 358 | " answer_end_token += len(input_ids_question) - paragraph_start\n", 359 | " \n", 360 | " # Pad sequence and obtain inputs to model \n", 361 | " input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)\n", 362 | " return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token\n", 363 | "\n", 364 | " # Validation/Testing\n", 365 | " else:\n", 366 | " input_ids_list, token_type_ids_list, attention_mask_list = [], [], []\n", 367 | " \n", 368 | " # Paragraph is split into several windows, each with start positions separated by step \"doc_stride\"\n", 369 | " for i in range(0, len(tokenized_paragraph), self.doc_stride):\n", 370 | " \n", 371 | " # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)\n", 372 | " input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]\n", 373 | " input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]\n", 374 | " \n", 375 | " # Pad sequence and obtain inputs to model\n", 376 | " input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)\n", 377 | " \n", 378 | " input_ids_list.append(input_ids)\n", 379 | " token_type_ids_list.append(token_type_ids)\n", 380 | " attention_mask_list.append(attention_mask)\n", 381 | " \n", 382 | " return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)\n", 383 | "\n", 384 | " def padding(self, input_ids_question, input_ids_paragraph):\n", 385 | " # Pad zeros if sequence length is shorter than max_seq_len\n", 386 | " padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)\n", 387 | " # Indices of input sequence tokens in the vocabulary\n", 388 | " input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len\n", 389 | " # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]\n", 390 | " token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len\n", 391 | " # Mask to avoid performing attention on padding token indices. 
Mask values selected in [0, 1]\n", 392 | " attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len\n", 393 | " \n", 394 | " return input_ids, token_type_ids, attention_mask\n", 395 | "\n", 396 | "train_set = QA_Dataset(\"train\", train_questions, train_questions_tokenized, train_paragraphs_tokenized)\n", 397 | "dev_set = QA_Dataset(\"dev\", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized)\n", 398 | "test_set = QA_Dataset(\"test\", test_questions, test_questions_tokenized, test_paragraphs_tokenized)\n", 399 | "\n", 400 | "train_batch_size = 2\n", 401 | "\n", 402 | "# Note: Do NOT change batch size of dev_loader / test_loader !\n", 403 | "# Although batch size=1, it is actually a batch consisting of several windows from the same QA pair\n", 404 | "train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, pin_memory=True)\n", 405 | "dev_loader = DataLoader(dev_set, batch_size=1, shuffle=False, pin_memory=True)\n", 406 | "test_loader = DataLoader(test_set, batch_size=1, shuffle=False, pin_memory=True)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": { 412 | "id": "5_H1kqhR8CdM" 413 | }, 414 | "source": [ 415 | "## Function for Evaluation" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 9, 421 | "metadata": { 422 | "id": "SqeA3PLPxOHu" 423 | }, 424 | "outputs": [], 425 | "source": [ 426 | "# def evaluate(data, output):\n", 427 | "# ##### TODO: Postprocessing #####\n", 428 | "# # There is a bug and room for improvement in postprocessing \n", 429 | "# # Hint: Open your prediction file to see what is wrong \n", 430 | " \n", 431 | "# answer = ''\n", 432 | "# max_prob = float('-inf')\n", 433 | "# num_of_windows = data[0].shape[1]\n", 434 | " \n", 435 | "# for k in range(num_of_windows):\n", 436 | "# # Obtain answer by choosing the most probable start position / end position\n", 437 | "# start_prob, start_index = torch.max(output.start_logits[k], dim=0)\n", 438 | "# end_prob, end_index = torch.max(output.end_logits[k], dim=0)\n", 439 | " \n", 440 | "# # Probability of answer is calculated as sum of start_prob and end_prob\n", 441 | "# prob = start_prob + end_prob\n", 442 | "# if start_index > end_index:\n", 443 | "# prob = 0\n", 444 | "# # Replace answer if calculated probability is larger than previous windows\n", 445 | "# if prob > max_prob:\n", 446 | "# max_prob = prob\n", 447 | "# # Convert tokens to chars (e.g. [1920, 7032] --> \"大 金\")\n", 448 | "# answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])\n", 449 | " \n", 450 | "# # Remove spaces in answer (e.g. 
\"大 金\" --> \"大金\")\n", 451 | "# return answer.replace(' ','')\n", 452 | "\n", 453 | "max_answer_length = 40\n", 454 | "\n", 455 | "def evaluate(data, output):\n", 456 | " ##### TODO: Postprocessing #####\n", 457 | " # There is a bug and room for improvement in postprocessing \n", 458 | " # Hint: Open your prediction file to see what is wrong \n", 459 | " \n", 460 | " answer = ''\n", 461 | " max_prob = float('-inf')\n", 462 | " num_of_windows = data[0].shape[1]\n", 463 | " n_best = 60\n", 464 | "\n", 465 | " for k in range(num_of_windows):\n", 466 | " start_indexes = np.argsort(output.start_logits[k].cpu().numpy())[-1 : -n_best - 1 : -1].tolist()\n", 467 | " end_indexes = np.argsort(output.end_logits[k].cpu().numpy())[-1 : -n_best - 1 : -1].tolist()\n", 468 | " \n", 469 | " for start_index in start_indexes:\n", 470 | " for end_index in end_indexes:\n", 471 | " if start_index > end_index or end_index - start_index + 1 > max_answer_length:\n", 472 | " continue\n", 473 | "\n", 474 | " start_prob, end_prob = output.start_logits[k][start_index], output.end_logits[k][end_index]\n", 475 | " \n", 476 | " prob = start_prob + end_prob\n", 477 | " if prob > max_prob:\n", 478 | " max_prob = prob\n", 479 | " # Convert tokens to chars (e.g. [1920, 7032] --> \"大 金\")\n", 480 | " answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])\n", 481 | " return answer.replace(' ','')" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": { 487 | "id": "rzHQit6eMnKG" 488 | }, 489 | "source": [ 490 | "## Training" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 10, 496 | "metadata": { 497 | "id": "3Q-B6ka7xoCM" 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "# num_epoch = 4\n", 502 | "# validation = True\n", 503 | "# logging_step = 100\n", 504 | "# learning_rate = 3e-5\n", 505 | "# optimizer = AdamW(model.parameters(), lr=learning_rate)\n", 506 | "\n", 507 | "# # batch accumulation parameter\n", 508 | "# accum_iter = 32 \n", 509 | "\n", 510 | "# #set up scheduler\n", 511 | "# len_dataset = len(train_set)\n", 512 | "# print(f\"length of train_set: {len_dataset}\")\n", 513 | "# total_steps = (len_dataset // accum_iter) * num_epoch\n", 514 | "# warm_up_ratio = 0\n", 515 | "# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warm_up_ratio * total_steps, num_training_steps = total_steps)\n", 516 | "\n", 517 | "\n", 518 | "\n", 519 | "# if fp16_training:\n", 520 | "# model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) \n", 521 | "\n", 522 | "# model.train()\n", 523 | "\n", 524 | "# print(\"Start Training ...\")\n", 525 | "\n", 526 | "# best_acc = 0\n", 527 | "\n", 528 | "# for epoch in range(num_epoch):\n", 529 | "# step = 1\n", 530 | "# train_loss = train_acc = 0\n", 531 | " \n", 532 | "# for data in tqdm(train_loader):\t\n", 533 | "# # Load all data into GPU\n", 534 | "# data = [i.to(device) for i in data]\n", 535 | " \n", 536 | "# # Model inputs: input_ids, token_type_ids, attention_mask, start_positions, end_positions (Note: only \"input_ids\" is mandatory)\n", 537 | "# # Model outputs: start_logits, end_logits, loss (return when start_positions/end_positions are provided) \n", 538 | "# output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])\n", 539 | "\n", 540 | "# # Choose the most probable start position / end position\n", 541 | "# start_index = torch.argmax(output.start_logits, dim=1)\n", 542 | "# end_index = 
torch.argmax(output.end_logits, dim=1)\n", 543 | " \n", 544 | "# # Prediction is correct only if both start_index and end_index are correct\n", 545 | "# train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()\n", 546 | "\n", 547 | "# # normalize loss to account for batch accumulation\n", 548 | "# train_loss += output.loss / accum_iter \n", 549 | " \n", 550 | "# if fp16_training:\n", 551 | "# accelerator.backward(output.loss)\n", 552 | "# else:\n", 553 | "# output.loss.backward()\n", 554 | " \n", 555 | "# # weights update\n", 556 | "# if (step % accum_iter == 0) or (step == len(train_loader)):\n", 557 | "# optimizer.step()\n", 558 | "# scheduler.step()\n", 559 | "# optimizer.zero_grad()\n", 560 | " \n", 561 | "# # optimizer.step()\n", 562 | "# # optimizer.zero_grad()\n", 563 | "# step += 1\n", 564 | "\n", 565 | "# # # ##### TODO: Apply linear learning rate decay #####\n", 566 | "# # scheduler.step()\n", 567 | " \n", 568 | "# # Print training loss and accuracy over past logging step\n", 569 | "# if step % logging_step == 0:\n", 570 | "# print(f\"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step:.3f}, acc = {train_acc / logging_step:.3f}\")\n", 571 | "# train_loss = train_acc = 0\n", 572 | "\n", 573 | "\n", 574 | "\n", 575 | "# if validation:\n", 576 | "# print(\"Evaluating Dev Set ...\")\n", 577 | "# model.eval()\n", 578 | "# with torch.no_grad():\n", 579 | "# dev_acc = 0\n", 580 | "# for i, data in enumerate(tqdm(dev_loader)):\n", 581 | "# output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),\n", 582 | "# attention_mask=data[2].squeeze(dim=0).to(device))\n", 583 | "# # prediction is correct only if answer text exactly matches\n", 584 | "# dev_acc += evaluate(data, output) == dev_questions[i][\"answer_text\"]\n", 585 | "# print(f\"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}\")\n", 586 | "\n", 587 | "# if dev_acc > best_acc:\n", 588 | "# best_acc = dev_acc\n", 589 | "# print(\"Saving Model ...\")\n", 590 | "# model_save_dir = \"saved_model_1\" \n", 591 | "# model.save_pretrained(model_save_dir)\n", 592 | "# model.train()\n", 593 | "\n", 594 | "# # Save a model and its configuration file to the directory 「saved_model」 \n", 595 | "# # i.e. 
there are two files under the direcory 「saved_model」: 「pytorch_model.bin」 and 「config.json」\n", 596 | "# # Saved model can be re-loaded using 「model = BertForQuestionAnswering.from_pretrained(\"saved_model\")」\n", 597 | "\n", 598 | "# # print(\"Saving Model ...\")\n", 599 | "# # model_save_dir = \"saved_model\" \n", 600 | "# # model.save_pretrained(model_save_dir)" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 24, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "max_answer_length = 20\n", 610 | "\n", 611 | "def ensemble(data, output1, output2, output5):\n", 612 | " ##### TODO: Postprocessing #####\n", 613 | " # There is a bug and room for improvement in postprocessing \n", 614 | " # Hint: Open your prediction file to see what is wrong \n", 615 | " \n", 616 | " answer = ''\n", 617 | " max_prob = float('-inf')\n", 618 | " num_of_windows = data[0].shape[1]\n", 619 | " n_best = 60\n", 620 | "\n", 621 | " for k in range(num_of_windows):\n", 622 | " output_start = output1.start_logits[k].cpu().numpy() + output2.start_logits[k].cpu().numpy() + output5.start_logits[k].cpu().numpy()\n", 623 | " output_end = output1.end_logits[k].cpu().numpy() + output2.end_logits[k].cpu().numpy() + output5.end_logits[k].cpu().numpy()\n", 624 | "\n", 625 | " start_indexes = np.argsort(output_start)[-1 : -n_best - 1 : -1].tolist()\n", 626 | " end_indexes = np.argsort(output_end)[-1 : -n_best - 1 : -1].tolist()\n", 627 | "\n", 628 | " # start_indexes2 = np.argsort(output2.start_logits[k].cpu().numpy())[-1 : -n_best - 1 : -1]\n", 629 | " # end_indexes2 = np.argsort(output2.end_logits[k].cpu().numpy())[-1 : -n_best - 1 : -1]\n", 630 | " \n", 631 | " # start_indexes = (start_indexes1 + start_indexes2).tolist()\n", 632 | " # end_indexes = (end_indexes1 + end_indexes2).tolist()\n", 633 | "\n", 634 | "\n", 635 | " for i, start_index in enumerate(start_indexes):\n", 636 | " for j, end_index in enumerate(end_indexes):\n", 637 | " if start_index > end_index or end_index - start_index + 1 > max_answer_length:\n", 638 | " continue\n", 639 | " \n", 640 | " start_prob= output1.start_logits[k][start_index] + output2.start_logits[k][start_index] + output5.start_logits[k][start_index]\n", 641 | " end_prob = output1.end_logits[k][end_index]+ output2.end_logits[k][end_index] + output5.end_logits[k][end_index]\n", 642 | " \n", 643 | " prob = start_prob + end_prob\n", 644 | " if prob > max_prob:\n", 645 | " max_prob = prob\n", 646 | " # Convert tokens to chars (e.g. 
[1920, 7032] --> \"大 金\")\n", 647 | " answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])\n", 648 | " res = answer.replace(' ','')\n", 649 | " print(res)\n", 650 | " return res" 651 | ] 652 | }, 653 | { 654 | "cell_type": "markdown", 655 | "metadata": { 656 | "id": "kMmdLOKBMsdE" 657 | }, 658 | "source": [ 659 | "## Testing" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": { 666 | "id": "U5scNKC9xz0C" 667 | }, 668 | "outputs": [], 669 | "source": [ 670 | "\n", 671 | "# print(\"Evaluating Test Set ...\")\n", 672 | "\n", 673 | "# result = []\n", 674 | "\n", 675 | "# model.eval()\n", 676 | "# with torch.no_grad():\n", 677 | "# for i, data in enumerate(tqdm(test_loader)):\n", 678 | "# output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),\n", 679 | "# attention_mask=data[2].squeeze(dim=0).to(device))\n", 680 | "# result.append(evaluate(data, output))\n", 681 | "\n", 682 | "# result_file = \"result.csv\"\n", 683 | "# with open(result_file, 'w') as f:\t\n", 684 | "# \t f.write(\"ID,Answer\\n\")\n", 685 | "# \t for i, test_question in enumerate(test_questions):\n", 686 | "# # Replace commas in answers with empty strings (since csv is separated by comma)\n", 687 | "# # Answers in kaggle are processed in the same way\n", 688 | "# \t\t f.write(f\"{test_question['id']},{result[i].replace(',','')}\\n\")\n", 689 | "\n", 690 | "# print(f\"Completed! Result is in {result_file}\")" 691 | ] 692 | } 693 | ], 694 | "metadata": { 695 | "accelerator": "GPU", 696 | "colab": { 697 | "collapsed_sections": [], 698 | "name": "ML2022Spring - HW7.ipynb", 699 | "provenance": [] 700 | }, 701 | "interpreter": { 702 | "hash": "ecbd9286bd544f7fe4ef1add3c640987467b2b9ee7c82bb6d3e9831005f6ced4" 703 | }, 704 | "kernelspec": { 705 | "display_name": "torch11", 706 | "language": "python", 707 | "name": "python3" 708 | }, 709 | "language_info": { 710 | "codemirror_mode": { 711 | "name": "ipython", 712 | "version": 3 713 | }, 714 | "file_extension": ".py", 715 | "mimetype": "text/x-python", 716 | "name": "python", 717 | "nbconvert_exporter": "python", 718 | "pygments_lexer": "ipython3", 719 | "version": "3.10.2" 720 | } 721 | }, 722 | "nbformat": 4, 723 | "nbformat_minor": 0 724 | } 725 | -------------------------------------------------------------------------------- /Hw7/ML2022Spring_HW7 m6 last try2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "xvSGDbExff_I" 7 | }, 8 | "source": [ 9 | "# **Homework 7 - Bert (Question Answering)**\n", 10 | "\n", 11 | "If you have any questions, feel free to email us at mlta-2022-spring@googlegroups.com\n", 12 | "\n", 13 | "\n", 14 | "\n", 15 | "Slide: [Link](https://docs.google.com/presentation/d/1H5ZONrb2LMOCixLY7D5_5-7LkIaXO6AGEaV2mRdTOMY/edit?usp=sharing) Kaggle: [Link](https://www.kaggle.com/c/ml2022spring-hw7) Data: [Link](https://drive.google.com/uc?id=1AVgZvy3VFeg0fX-6WQJMHPVrx3A-M1kb)\n", 16 | "\n", 17 | "\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "WGOr_eS3wJJf" 24 | }, 25 | "source": [ 26 | "## Task description\n", 27 | "- Chinese Extractive Question Answering\n", 28 | " - Input: Paragraph + Question\n", 29 | " - Output: Answer\n", 30 | "\n", 31 | "- Objective: Learn how to fine tune a pretrained model on downstream task using transformers\n", 32 | "\n", 33 | "- Todo\n", 34 | " - Fine tune a 
pretrained chinese BERT model\n", 35 | " - Change hyperparameters (e.g. doc_stride)\n", 36 | " - Apply linear learning rate decay\n", 37 | " - Try other pretrained models\n", 38 | " - Improve preprocessing\n", 39 | " - Improve postprocessing\n", 40 | "- Training tips\n", 41 | " - Automatic mixed precision\n", 42 | " - Gradient accumulation\n", 43 | " - Ensemble\n", 44 | "\n", 45 | "- Estimated training time (tesla t4 with automatic mixed precision enabled)\n", 46 | " - Simple: 8mins\n", 47 | " - Medium: 8mins\n", 48 | " - Strong: 25mins\n", 49 | " - Boss: 2.5hrs\n", 50 | " " 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "TJ1fSAJE2oaC" 57 | }, 58 | "source": [ 59 | "## Download Dataset" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 1, 65 | "metadata": { 66 | "id": "YPrc4Eie9Yo5" 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "# import gdown\n", 71 | "# # Download link 1\n", 72 | "# !gdown --id '1AVgZvy3VFeg0fX-6WQJMHPVrx3A-M1kb' --output hw7_data.zip\n", 73 | "\n", 74 | "# # Download Link 2 (if the above link fails) \n", 75 | "# # !gdown --id '1qwjbRjq481lHsnTrrF4OjKQnxzgoLEFR' --output hw7_data.zip\n", 76 | "\n", 77 | "# # Download Link 3 (if the above link fails) \n", 78 | "# # !gdown --id '1QXuWjNRZH6DscSd6QcRER0cnxmpZvijn' --output hw7_data.zip\n", 79 | "\n", 80 | "# !unzip -o hw7_data.zip\n", 81 | "\n", 82 | "# # For this HW, K80 < P4 < T4 < P100 <= T4(fp16) < V100\n", 83 | "# !nvidia-smi" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "id": "TevOvhC03m0h" 90 | }, 91 | "source": [ 92 | "## Install transformers\n", 93 | "\n", 94 | "Documentation for the toolkit: https://huggingface.co/transformers/" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 2, 100 | "metadata": { 101 | "id": "tbxWFX_jpDom" 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# # You are allowed to change version of transformers or use other toolkits\n", 106 | "# !pip install transformers==4.18.0" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "id": "8dKM4yCh4LI_" 113 | }, 114 | "source": [ 115 | "## Import Packages" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 3, 121 | "metadata": { 122 | "id": "WOTHHtWJoahe" 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "import json\n", 127 | "import numpy as np\n", 128 | "import random\n", 129 | "import torch\n", 130 | "from torch.utils.data import DataLoader, Dataset \n", 131 | "from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast, get_linear_schedule_with_warmup\n", 132 | "import os\n", 133 | "\n", 134 | "from tqdm.auto import tqdm\n", 135 | "\n", 136 | "os.environ['CUDA_VISIBLE_DEVICES'] = \"0\"\n", 137 | "\n", 138 | "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", 139 | "\n", 140 | "# Fix random seed for reproducibility\n", 141 | "def same_seeds(seed):\n", 142 | "\t torch.manual_seed(seed)\n", 143 | "\t if torch.cuda.is_available():\n", 144 | "\t\t torch.cuda.manual_seed(seed)\n", 145 | "\t\t torch.cuda.manual_seed_all(seed)\n", 146 | "\t np.random.seed(seed)\n", 147 | "\t random.seed(seed)\n", 148 | "\t torch.backends.cudnn.benchmark = False\n", 149 | "\t torch.backends.cudnn.deterministic = True\n", 150 | "same_seeds(50936)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 4, 156 | "metadata": { 157 | "id": "7pBtSZP1SKQO" 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# Change \"fp16_training\" to True to 
support automatic mixed precision training (fp16)\t\n", 162 | "fp16_training = True\n", 163 | "\n", 164 | "if fp16_training:\n", 165 | " # !pip install accelerate==0.2.0\n", 166 | " from accelerate import Accelerator\n", 167 | " accelerator = Accelerator(fp16=True)\n", 168 | " device = accelerator.device\n", 169 | "\n", 170 | "# Documentation for the toolkit: https://huggingface.co/docs/accelerate/" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "id": "2YgXHuVLp_6j" 177 | }, 178 | "source": [ 179 | "## Load Model and Tokenizer\n", 180 | "\n", 181 | "\n", 182 | "\n", 183 | "\n", 184 | " " 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 5, 190 | "metadata": { 191 | "id": "xyBCYGjAp3ym" 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "# model = BertForQuestionAnswering.from_pretrained(\"bert-base-chinese\").to(device)\n", 196 | "# tokenizer = BertTokenizerFast.from_pretrained(\"bert-base-chinese\")\n", 197 | "pre_model = \"luhua/chinese_pretrain_mrc_macbert_large\"\n", 198 | "tokenizer = BertTokenizerFast.from_pretrained(pre_model)\n", 199 | "model = BertForQuestionAnswering.from_pretrained(pre_model).to(device)\n", 200 | "# model = BertForQuestionAnswering.from_pretrained(\"saved_model_last\").to(device)\n", 201 | "# tokenizer = RobertaTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')\n", 202 | "# model = RobertaModel.from_pretrained('hfl/chinese-roberta-wwm-ext')\n", 203 | "\n", 204 | "# You can safely ignore the warning message (it pops up because new prediction heads for QA are initialized randomly)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "id": "3Td-GTmk5OW4" 211 | }, 212 | "source": [ 213 | "## Read Data\n", 214 | "\n", 215 | "- Training set: 31690 QA pairs\n", 216 | "- Dev set: 4131 QA pairs\n", 217 | "- Test set: 4957 QA pairs\n", 218 | "\n", 219 | "- {train/dev/test}_questions:\t\n", 220 | " - List of dicts with the following keys:\n", 221 | " - id (int)\n", 222 | " - paragraph_id (int)\n", 223 | " - question_text (string)\n", 224 | " - answer_text (string)\n", 225 | " - answer_start (int)\n", 226 | " - answer_end (int)\n", 227 | "- {train/dev/test}_paragraphs: \n", 228 | " - List of strings\n", 229 | " - paragraph_ids in questions correspond to indexs in paragraphs\n", 230 | " - A paragraph may be used by several questions " 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 6, 236 | "metadata": { 237 | "id": "NvX7hlepogvu" 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "def read_data(file):\n", 242 | " with open(file, 'r', encoding=\"utf-8\") as reader:\n", 243 | " data = json.load(reader)\n", 244 | " return data[\"questions\"], data[\"paragraphs\"]\n", 245 | "\n", 246 | "train_questions, train_paragraphs = read_data(\"hw7_train.json\")\n", 247 | "dev_questions, dev_paragraphs = read_data(\"hw7_dev.json\")\n", 248 | "test_questions, test_paragraphs = read_data(\"hw7_test.json\")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "id": "Fm0rpTHq0e4N" 255 | }, 256 | "source": [ 257 | "## Tokenize Data" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 7, 263 | "metadata": { 264 | "id": "rTZ6B70Hoxie" 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "# Tokenize questions and paragraphs separately\n", 269 | "# 「add_special_tokens」 is set to False since special tokens will be added when tokenized questions and paragraphs are combined in datset __getitem__ \n", 270 | "\n", 
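"# Sketch of what QA_Dataset.__getitem__ assembles later from these bare ids\n",
"# (101 = [CLS], 102 = [SEP]):\n",
"#   input_ids = [101] + question_ids[:max_question_len] + [102] + window_ids + [102]\n",
"# which is why no special tokens are requested here.\n",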
271 | "train_questions_tokenized = tokenizer([train_question[\"question_text\"] for train_question in train_questions], add_special_tokens=False)\n", 272 | "dev_questions_tokenized = tokenizer([dev_question[\"question_text\"] for dev_question in dev_questions], add_special_tokens=False)\n", 273 | "test_questions_tokenized = tokenizer([test_question[\"question_text\"] for test_question in test_questions], add_special_tokens=False) \n", 274 | "\n", 275 | "train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)\n", 276 | "dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)\n", 277 | "test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)\n", 278 | "\n", 279 | "# You can safely ignore the warning message as tokenized sequences will be futher processed in datset __getitem__ before passing to model" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": { 285 | "id": "Ws8c8_4d5UCI" 286 | }, 287 | "source": [ 288 | "## Dataset and Dataloader" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 8, 294 | "metadata": { 295 | "id": "Xjooag-Swnuh" 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "class QA_Dataset(Dataset):\n", 300 | " def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):\n", 301 | " self.split = split\n", 302 | " self.questions = questions\n", 303 | " self.tokenized_questions = tokenized_questions\n", 304 | " self.tokenized_paragraphs = tokenized_paragraphs\n", 305 | " self.max_question_len = 50\n", 306 | " self.max_paragraph_len = 384\n", 307 | " \n", 308 | " ##### TODO: Change value of doc_stride #####\n", 309 | " self.doc_stride = 320\n", 310 | "\n", 311 | " # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]\n", 312 | " self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1\n", 313 | "\n", 314 | " def __len__(self):\n", 315 | " return len(self.questions)\n", 316 | "\n", 317 | " def __getitem__(self, idx):\n", 318 | " question = self.questions[idx]\n", 319 | " tokenized_question = self.tokenized_questions[idx]\n", 320 | " tokenized_paragraph = self.tokenized_paragraphs[question[\"paragraph_id\"]]\n", 321 | "\n", 322 | " ##### TODO: Preprocessing #####\n", 323 | " # Hint: How to prevent model from learning something it should not learn\n", 324 | " exceed = True if len(tokenized_paragraph) > self.max_paragraph_len else False\n", 325 | "\n", 326 | " if self.split == \"train\":\n", 327 | " # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph \n", 328 | " answer_start_token = tokenized_paragraph.char_to_token(question[\"answer_start\"])\n", 329 | " answer_end_token = tokenized_paragraph.char_to_token(question[\"answer_end\"])\n", 330 | "\n", 331 | " # A single window is obtained by slicing the portion of paragraph containing the answer\n", 332 | " if exceed:\n", 333 | " mid = (answer_start_token + answer_end_token) // 2\n", 334 | " paragraph_start = max(0, min(mid - self.max_paragraph_len // 2, len(tokenized_paragraph) - self.max_paragraph_len))\n", 335 | " else:\n", 336 | " rand_start = random.randint(0, answer_start_token)\n", 337 | " paragraph_start = rand_start\n", 338 | " paragraph_end = paragraph_start + self.max_paragraph_len\n", 339 | "\n", 340 | " # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)\n", 341 | " input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102] \n", 342 | " 
342 | " input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]\n", 343 | " \n", 344 | " # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window\n", 345 | " answer_start_token += len(input_ids_question) - paragraph_start\n", 346 | " answer_end_token += len(input_ids_question) - paragraph_start\n", 347 | " \n", 348 | " # Pad sequence and obtain inputs to model \n", 349 | " input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)\n", 350 | " return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token\n", 351 | "\n", 352 | " # Validation/Testing\n", 353 | " else:\n", 354 | " input_ids_list, token_type_ids_list, attention_mask_list = [], [], []\n", 355 | " qa_offset_list = []\n", 356 | " p_offset_list = []\n", 357 | " # Paragraph is split into several windows, each with start positions separated by step \"doc_stride\"\n", 358 | " for i in range(0, len(tokenized_paragraph), self.doc_stride):\n", 359 | " \n", 360 | " # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)\n", 361 | " input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]\n", 362 | " input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]\n", 363 | " \n", 364 | " # calculate qa offset\n", 365 | " qa_offset = len(input_ids_question)\n", 366 | " qa_offset_list.append(qa_offset)\n", 367 | " p_offset = tokenized_paragraph.offsets[i : i + self.max_paragraph_len]\n", 368 | " p_offset_list.append(p_offset)\n", 369 | " # print(f\"qa: {len(input_ids_question)})\")\n", 370 | " # print(f\"para: {len(input_ids_paragraph)})\")\n", 371 | " # Pad sequence and obtain inputs to model\n", 372 | " input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)\n", 373 | " \n", 374 | " input_ids_list.append(input_ids)\n", 375 | " token_type_ids_list.append(token_type_ids)\n", 376 | " attention_mask_list.append(attention_mask)\n", 377 | " \n", 378 | " return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list), torch.tensor(qa_offset_list), p_offset_list\n", 379 | "\n", 380 | " def padding(self, input_ids_question, input_ids_paragraph):\n", 381 | " # Pad zeros if sequence length is shorter than max_seq_len\n", 382 | " padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)\n", 383 | " # Indices of input sequence tokens in the vocabulary\n", 384 | " input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len\n", 385 | " # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]\n", 386 | " token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len\n", 387 | " # Mask to avoid performing attention on padding token indices. 
Mask values selected in [0, 1]\n", 388 | " attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len\n", 389 | " \n", 390 | " return input_ids, token_type_ids, attention_mask\n", 391 | "\n", 392 | "train_set = QA_Dataset(\"train\", train_questions, train_questions_tokenized, train_paragraphs_tokenized)\n", 393 | "dev_set = QA_Dataset(\"dev\", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized)\n", 394 | "test_set = QA_Dataset(\"test\", test_questions, test_questions_tokenized, test_paragraphs_tokenized)\n", 395 | "\n", 396 | "train_batch_size = 4\n", 397 | "\n", 398 | "# Note: Do NOT change batch size of dev_loader / test_loader !\n", 399 | "# Although batch size=1, it is actually a batch consisting of several windows from the same QA pair\n", 400 | "train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, pin_memory=True)\n", 401 | "dev_loader = DataLoader(dev_set, batch_size=1, shuffle=False, pin_memory=True)\n", 402 | "test_loader = DataLoader(test_set, batch_size=1, shuffle=False, pin_memory=True)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "id": "rzHQit6eMnKG" 409 | }, 410 | "source": [ 411 | "## Training" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 9, 417 | "metadata": { 418 | "id": "3Q-B6ka7xoCM" 419 | }, 420 | "outputs": [ 421 | { 422 | "name": "stderr", 423 | "output_type": "stream", 424 | "text": [ 425 | "/home/tracy/miniconda3/envs/torch11/lib/python3.10/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", 426 | " warnings.warn(\n" 427 | ] 428 | }, 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "length of train_set: 31690\n", 434 | "Start Training ...\n" 435 | ] 436 | }, 437 | { 438 | "name": "stderr", 439 | "output_type": "stream", 440 | "text": [ 441 | "/home/tracy/miniconda3/envs/torch11/lib/python3.10/site-packages/torch/optim/lr_scheduler.py:131: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate\n", 442 | " warnings.warn(\"Detected call of `lr_scheduler.step()` before `optimizer.step()`. 
\"\n" 443 | ] 444 | }, 445 | { 446 | "name": "stdout", 447 | "output_type": "stream", 448 | "text": [ 449 | "Epoch 1 | Step 100 | loss = 0.405, acc = 0.647\n", 450 | "Epoch 1 | Step 200 | loss = 0.324, acc = 0.748\n", 451 | "Epoch 1 | Step 300 | loss = 0.329, acc = 0.732\n", 452 | "Epoch 1 | Step 400 | loss = 0.261, acc = 0.755\n", 453 | "Epoch 1 | Step 500 | loss = 0.281, acc = 0.785\n", 454 | "Epoch 1 | Step 600 | loss = 0.247, acc = 0.787\n", 455 | "Epoch 1 | Step 700 | loss = 0.271, acc = 0.792\n", 456 | "Epoch 1 | Step 800 | loss = 0.267, acc = 0.780\n", 457 | "Epoch 1 | Step 900 | loss = 0.240, acc = 0.815\n", 458 | "Epoch 1 | Step 1000 | loss = 0.249, acc = 0.800\n", 459 | "Epoch 1 | Step 1100 | loss = 0.299, acc = 0.762\n", 460 | "Epoch 1 | Step 1200 | loss = 0.257, acc = 0.772\n", 461 | "Epoch 1 | Step 1300 | loss = 0.220, acc = 0.805\n", 462 | "Epoch 1 | Step 1400 | loss = 0.261, acc = 0.792\n", 463 | "Epoch 1 | Step 1500 | loss = 0.237, acc = 0.790\n", 464 | "Epoch 1 | Step 1600 | loss = 0.224, acc = 0.810\n", 465 | "Epoch 1 | Step 1700 | loss = 0.243, acc = 0.772\n", 466 | "Epoch 1 | Step 1800 | loss = 0.224, acc = 0.792\n", 467 | "Epoch 1 | Step 1900 | loss = 0.259, acc = 0.780\n", 468 | "Epoch 1 | Step 2000 | loss = 0.252, acc = 0.792\n", 469 | "Epoch 1 | Step 2100 | loss = 0.256, acc = 0.815\n", 470 | "Epoch 1 | Step 2200 | loss = 0.254, acc = 0.795\n", 471 | "Epoch 1 | Step 2300 | loss = 0.240, acc = 0.800\n", 472 | "Epoch 1 | Step 2400 | loss = 0.233, acc = 0.772\n", 473 | "Epoch 1 | Step 2500 | loss = 0.208, acc = 0.812\n", 474 | "Epoch 1 | Step 2600 | loss = 0.256, acc = 0.810\n", 475 | "Epoch 1 | Step 2700 | loss = 0.272, acc = 0.780\n", 476 | "Epoch 1 | Step 2800 | loss = 0.248, acc = 0.827\n", 477 | "Epoch 1 | Step 2900 | loss = 0.244, acc = 0.780\n", 478 | "Epoch 1 | Step 3000 | loss = 0.220, acc = 0.817\n", 479 | "Epoch 1 | Step 3100 | loss = 0.280, acc = 0.792\n", 480 | "Epoch 1 | Step 3200 | loss = 0.258, acc = 0.782\n", 481 | "Epoch 1 | Step 3300 | loss = 0.212, acc = 0.822\n", 482 | "Epoch 1 | Step 3400 | loss = 0.218, acc = 0.812\n", 483 | "Epoch 1 | Step 3500 | loss = 0.227, acc = 0.808\n", 484 | "Epoch 1 | Step 3600 | loss = 0.244, acc = 0.803\n", 485 | "Epoch 1 | Step 3700 | loss = 0.228, acc = 0.832\n", 486 | "Epoch 1 | Step 3800 | loss = 0.230, acc = 0.812\n", 487 | "Epoch 1 | Step 3900 | loss = 0.203, acc = 0.852\n", 488 | "Epoch 1 | Step 4000 | loss = 0.189, acc = 0.835\n", 489 | "Epoch 1 | Step 4100 | loss = 0.246, acc = 0.812\n", 490 | "Epoch 1 | Step 4200 | loss = 0.187, acc = 0.840\n", 491 | "Epoch 1 | Step 4300 | loss = 0.208, acc = 0.815\n", 492 | "Epoch 1 | Step 4400 | loss = 0.233, acc = 0.792\n", 493 | "Epoch 1 | Step 4500 | loss = 0.209, acc = 0.847\n", 494 | "Epoch 1 | Step 4600 | loss = 0.194, acc = 0.835\n", 495 | "Epoch 1 | Step 4700 | loss = 0.222, acc = 0.815\n", 496 | "Epoch 1 | Step 4800 | loss = 0.260, acc = 0.812\n", 497 | "Epoch 1 | Step 4900 | loss = 0.204, acc = 0.815\n", 498 | "Epoch 1 | Step 5000 | loss = 0.258, acc = 0.817\n", 499 | "Epoch 1 | Step 5100 | loss = 0.219, acc = 0.810\n", 500 | "Epoch 1 | Step 5200 | loss = 0.256, acc = 0.810\n", 501 | "Epoch 1 | Step 5300 | loss = 0.231, acc = 0.827\n", 502 | "Epoch 1 | Step 5400 | loss = 0.205, acc = 0.817\n", 503 | "Epoch 1 | Step 5500 | loss = 0.196, acc = 0.847\n", 504 | "Epoch 1 | Step 5600 | loss = 0.237, acc = 0.825\n", 505 | "Epoch 1 | Step 5700 | loss = 0.216, acc = 0.825\n", 506 | "Epoch 1 | Step 5800 | loss = 0.179, acc = 0.852\n", 507 | "Epoch 1 | Step 5900 | loss = 
0.232, acc = 0.825\n", 508 | "Epoch 1 | Step 6000 | loss = 0.199, acc = 0.830\n", 509 | "Epoch 1 | Step 6100 | loss = 0.214, acc = 0.825\n", 510 | "Epoch 1 | Step 6200 | loss = 0.193, acc = 0.837\n", 511 | "Epoch 1 | Step 6300 | loss = 0.205, acc = 0.825\n", 512 | "Epoch 1 | Step 6400 | loss = 0.231, acc = 0.812\n", 513 | "Epoch 1 | Step 6500 | loss = 0.197, acc = 0.842\n", 514 | "Epoch 1 | Step 6600 | loss = 0.203, acc = 0.797\n", 515 | "Epoch 1 | Step 6700 | loss = 0.229, acc = 0.825\n", 516 | "Epoch 1 | Step 6800 | loss = 0.255, acc = 0.800\n", 517 | "Epoch 1 | Step 6900 | loss = 0.244, acc = 0.787\n", 518 | "Epoch 1 | Step 7000 | loss = 0.173, acc = 0.835\n", 519 | "Epoch 1 | Step 7100 | loss = 0.211, acc = 0.803\n", 520 | "Epoch 1 | Step 7200 | loss = 0.200, acc = 0.837\n", 521 | "Epoch 1 | Step 7300 | loss = 0.182, acc = 0.837\n", 522 | "Epoch 1 | Step 7400 | loss = 0.189, acc = 0.832\n", 523 | "Epoch 1 | Step 7500 | loss = 0.148, acc = 0.862\n", 524 | "Epoch 1 | Step 7600 | loss = 0.168, acc = 0.845\n", 525 | "Epoch 1 | Step 7700 | loss = 0.181, acc = 0.830\n", 526 | "Epoch 1 | Step 7800 | loss = 0.196, acc = 0.845\n", 527 | "Epoch 1 | Step 7900 | loss = 0.166, acc = 0.850\n", 528 | "Epoch 2 | Step 100 | loss = 0.104, acc = 0.892\n", 529 | "Epoch 2 | Step 200 | loss = 0.095, acc = 0.910\n", 530 | "Epoch 2 | Step 300 | loss = 0.079, acc = 0.925\n", 531 | "Epoch 2 | Step 400 | loss = 0.103, acc = 0.907\n", 532 | "Epoch 2 | Step 500 | loss = 0.118, acc = 0.877\n", 533 | "Epoch 2 | Step 600 | loss = 0.092, acc = 0.920\n", 534 | "Epoch 2 | Step 700 | loss = 0.108, acc = 0.885\n", 535 | "Epoch 2 | Step 800 | loss = 0.088, acc = 0.910\n", 536 | "Epoch 2 | Step 900 | loss = 0.110, acc = 0.890\n", 537 | "Epoch 2 | Step 1000 | loss = 0.113, acc = 0.887\n", 538 | "Epoch 2 | Step 1100 | loss = 0.082, acc = 0.917\n", 539 | "Epoch 2 | Step 1200 | loss = 0.103, acc = 0.912\n", 540 | "Epoch 2 | Step 1300 | loss = 0.106, acc = 0.895\n", 541 | "Epoch 2 | Step 1400 | loss = 0.099, acc = 0.900\n", 542 | "Epoch 2 | Step 1500 | loss = 0.088, acc = 0.917\n", 543 | "Epoch 2 | Step 1600 | loss = 0.121, acc = 0.890\n", 544 | "Epoch 2 | Step 1700 | loss = 0.115, acc = 0.890\n", 545 | "Epoch 2 | Step 1800 | loss = 0.123, acc = 0.887\n", 546 | "Epoch 2 | Step 1900 | loss = 0.097, acc = 0.907\n", 547 | "Epoch 2 | Step 2000 | loss = 0.093, acc = 0.905\n", 548 | "Epoch 2 | Step 2100 | loss = 0.088, acc = 0.910\n", 549 | "Epoch 2 | Step 2200 | loss = 0.092, acc = 0.910\n", 550 | "Epoch 2 | Step 2300 | loss = 0.122, acc = 0.882\n", 551 | "Epoch 2 | Step 2400 | loss = 0.093, acc = 0.915\n", 552 | "Epoch 2 | Step 2500 | loss = 0.101, acc = 0.892\n", 553 | "Epoch 2 | Step 2600 | loss = 0.108, acc = 0.905\n", 554 | "Epoch 2 | Step 2700 | loss = 0.134, acc = 0.887\n", 555 | "Epoch 2 | Step 2800 | loss = 0.092, acc = 0.920\n", 556 | "Epoch 2 | Step 2900 | loss = 0.117, acc = 0.920\n", 557 | "Epoch 2 | Step 3000 | loss = 0.123, acc = 0.882\n", 558 | "Epoch 2 | Step 3100 | loss = 0.122, acc = 0.870\n", 559 | "Epoch 2 | Step 3200 | loss = 0.169, acc = 0.860\n", 560 | "Epoch 2 | Step 3300 | loss = 0.101, acc = 0.892\n", 561 | "Epoch 2 | Step 3400 | loss = 0.095, acc = 0.905\n", 562 | "Epoch 2 | Step 3500 | loss = 0.094, acc = 0.930\n", 563 | "Epoch 2 | Step 3600 | loss = 0.095, acc = 0.892\n", 564 | "Epoch 2 | Step 3700 | loss = 0.106, acc = 0.892\n", 565 | "Epoch 2 | Step 3800 | loss = 0.102, acc = 0.877\n", 566 | "Epoch 2 | Step 3900 | loss = 0.082, acc = 0.922\n", 567 | "Epoch 2 | Step 4000 | loss = 0.127, acc = 0.877\n", 
568 | "Epoch 2 | Step 4100 | loss = 0.115, acc = 0.890\n", 569 | "Epoch 2 | Step 4200 | loss = 0.130, acc = 0.877\n", 570 | "Epoch 2 | Step 4300 | loss = 0.096, acc = 0.890\n", 571 | "Epoch 2 | Step 4400 | loss = 0.145, acc = 0.857\n", 572 | "Epoch 2 | Step 4500 | loss = 0.100, acc = 0.900\n", 573 | "Epoch 2 | Step 4600 | loss = 0.063, acc = 0.938\n", 574 | "Epoch 2 | Step 4700 | loss = 0.102, acc = 0.897\n", 575 | "Epoch 2 | Step 4800 | loss = 0.104, acc = 0.910\n", 576 | "Epoch 2 | Step 4900 | loss = 0.091, acc = 0.925\n", 577 | "Epoch 2 | Step 5000 | loss = 0.110, acc = 0.900\n", 578 | "Epoch 2 | Step 5100 | loss = 0.107, acc = 0.895\n", 579 | "Epoch 2 | Step 5200 | loss = 0.098, acc = 0.920\n", 580 | "Epoch 2 | Step 5300 | loss = 0.120, acc = 0.882\n", 581 | "Epoch 2 | Step 5400 | loss = 0.118, acc = 0.897\n", 582 | "Epoch 2 | Step 5500 | loss = 0.085, acc = 0.915\n", 583 | "Epoch 2 | Step 5600 | loss = 0.097, acc = 0.917\n", 584 | "Epoch 2 | Step 5700 | loss = 0.096, acc = 0.900\n", 585 | "Epoch 2 | Step 5800 | loss = 0.068, acc = 0.922\n", 586 | "Epoch 2 | Step 5900 | loss = 0.087, acc = 0.900\n", 587 | "Epoch 2 | Step 6000 | loss = 0.107, acc = 0.885\n", 588 | "Epoch 2 | Step 6100 | loss = 0.073, acc = 0.922\n", 589 | "Epoch 2 | Step 6200 | loss = 0.077, acc = 0.930\n", 590 | "Epoch 2 | Step 6300 | loss = 0.133, acc = 0.887\n", 591 | "Epoch 2 | Step 6400 | loss = 0.068, acc = 0.920\n", 592 | "Epoch 2 | Step 6500 | loss = 0.135, acc = 0.868\n", 593 | "Epoch 2 | Step 6600 | loss = 0.109, acc = 0.895\n", 594 | "Epoch 2 | Step 6700 | loss = 0.124, acc = 0.885\n", 595 | "Epoch 2 | Step 6800 | loss = 0.099, acc = 0.907\n", 596 | "Epoch 2 | Step 6900 | loss = 0.103, acc = 0.897\n", 597 | "Epoch 2 | Step 7000 | loss = 0.088, acc = 0.912\n", 598 | "Epoch 2 | Step 7100 | loss = 0.109, acc = 0.892\n", 599 | "Epoch 2 | Step 7200 | loss = 0.119, acc = 0.880\n", 600 | "Epoch 2 | Step 7300 | loss = 0.098, acc = 0.897\n", 601 | "Epoch 2 | Step 7400 | loss = 0.101, acc = 0.907\n", 602 | "Epoch 2 | Step 7500 | loss = 0.080, acc = 0.917\n", 603 | "Epoch 2 | Step 7600 | loss = 0.110, acc = 0.915\n", 604 | "Epoch 2 | Step 7700 | loss = 0.105, acc = 0.905\n", 605 | "Epoch 2 | Step 7800 | loss = 0.093, acc = 0.917\n", 606 | "Epoch 2 | Step 7900 | loss = 0.104, acc = 0.892\n", 607 | "Epoch 3 | Step 100 | loss = 0.063, acc = 0.935\n", 608 | "Epoch 3 | Step 200 | loss = 0.049, acc = 0.955\n", 609 | "Epoch 3 | Step 300 | loss = 0.038, acc = 0.957\n", 610 | "Epoch 3 | Step 400 | loss = 0.036, acc = 0.955\n", 611 | "Epoch 3 | Step 500 | loss = 0.037, acc = 0.950\n", 612 | "Epoch 3 | Step 600 | loss = 0.046, acc = 0.962\n", 613 | "Epoch 3 | Step 700 | loss = 0.052, acc = 0.942\n", 614 | "Epoch 3 | Step 800 | loss = 0.036, acc = 0.957\n", 615 | "Epoch 3 | Step 900 | loss = 0.059, acc = 0.925\n", 616 | "Epoch 3 | Step 1000 | loss = 0.066, acc = 0.935\n", 617 | "Epoch 3 | Step 1100 | loss = 0.037, acc = 0.950\n", 618 | "Epoch 3 | Step 1200 | loss = 0.042, acc = 0.952\n", 619 | "Epoch 3 | Step 1300 | loss = 0.045, acc = 0.945\n", 620 | "Epoch 3 | Step 1400 | loss = 0.062, acc = 0.945\n", 621 | "Epoch 3 | Step 1500 | loss = 0.047, acc = 0.960\n", 622 | "Epoch 3 | Step 1600 | loss = 0.061, acc = 0.940\n", 623 | "Epoch 3 | Step 1700 | loss = 0.036, acc = 0.962\n", 624 | "Epoch 3 | Step 1800 | loss = 0.045, acc = 0.957\n", 625 | "Epoch 3 | Step 1900 | loss = 0.050, acc = 0.945\n", 626 | "Epoch 3 | Step 2000 | loss = 0.044, acc = 0.955\n", 627 | "Epoch 3 | Step 2100 | loss = 0.044, acc = 0.952\n", 628 | "Epoch 3 | Step 
2200 | loss = 0.038, acc = 0.957\n", 629 | "Epoch 3 | Step 2300 | loss = 0.059, acc = 0.942\n", 630 | "Epoch 3 | Step 2400 | loss = 0.084, acc = 0.920\n", 631 | "Epoch 3 | Step 2500 | loss = 0.057, acc = 0.933\n", 632 | "Epoch 3 | Step 2600 | loss = 0.064, acc = 0.940\n", 633 | "Epoch 3 | Step 2700 | loss = 0.044, acc = 0.962\n", 634 | "Epoch 3 | Step 2800 | loss = 0.057, acc = 0.938\n", 635 | "Epoch 3 | Step 2900 | loss = 0.036, acc = 0.957\n", 636 | "Epoch 3 | Step 3000 | loss = 0.049, acc = 0.942\n", 637 | "Epoch 3 | Step 3100 | loss = 0.062, acc = 0.942\n", 638 | "Epoch 3 | Step 3200 | loss = 0.054, acc = 0.938\n", 639 | "Epoch 3 | Step 3300 | loss = 0.059, acc = 0.933\n", 640 | "Epoch 3 | Step 3400 | loss = 0.044, acc = 0.960\n", 641 | "Epoch 3 | Step 3500 | loss = 0.057, acc = 0.935\n", 642 | "Epoch 3 | Step 3600 | loss = 0.057, acc = 0.950\n", 643 | "Epoch 3 | Step 3700 | loss = 0.044, acc = 0.945\n", 644 | "Epoch 3 | Step 3800 | loss = 0.062, acc = 0.950\n", 645 | "Epoch 3 | Step 3900 | loss = 0.059, acc = 0.945\n", 646 | "Epoch 3 | Step 4000 | loss = 0.052, acc = 0.940\n", 647 | "Epoch 3 | Step 4100 | loss = 0.046, acc = 0.947\n", 648 | "Epoch 3 | Step 4200 | loss = 0.039, acc = 0.952\n", 649 | "Epoch 3 | Step 4300 | loss = 0.051, acc = 0.952\n", 650 | "Epoch 3 | Step 4400 | loss = 0.051, acc = 0.945\n", 651 | "Epoch 3 | Step 4500 | loss = 0.036, acc = 0.957\n", 652 | "Epoch 3 | Step 4600 | loss = 0.048, acc = 0.950\n", 653 | "Epoch 3 | Step 4700 | loss = 0.041, acc = 0.952\n", 654 | "Epoch 3 | Step 4800 | loss = 0.069, acc = 0.957\n", 655 | "Epoch 3 | Step 4900 | loss = 0.054, acc = 0.938\n", 656 | "Epoch 3 | Step 5000 | loss = 0.056, acc = 0.957\n", 657 | "Epoch 3 | Step 5100 | loss = 0.056, acc = 0.950\n", 658 | "Epoch 3 | Step 5200 | loss = 0.050, acc = 0.933\n", 659 | "Epoch 3 | Step 5300 | loss = 0.045, acc = 0.940\n", 660 | "Epoch 3 | Step 5400 | loss = 0.062, acc = 0.930\n", 661 | "Epoch 3 | Step 5500 | loss = 0.033, acc = 0.970\n", 662 | "Epoch 3 | Step 5600 | loss = 0.060, acc = 0.942\n", 663 | "Epoch 3 | Step 5700 | loss = 0.055, acc = 0.950\n", 664 | "Epoch 3 | Step 5800 | loss = 0.035, acc = 0.957\n", 665 | "Epoch 3 | Step 5900 | loss = 0.065, acc = 0.942\n", 666 | "Epoch 3 | Step 6000 | loss = 0.047, acc = 0.952\n", 667 | "Epoch 3 | Step 6100 | loss = 0.073, acc = 0.920\n", 668 | "Epoch 3 | Step 6200 | loss = 0.050, acc = 0.947\n", 669 | "Epoch 3 | Step 6300 | loss = 0.075, acc = 0.920\n", 670 | "Epoch 3 | Step 6400 | loss = 0.044, acc = 0.942\n", 671 | "Epoch 3 | Step 6500 | loss = 0.034, acc = 0.962\n", 672 | "Epoch 3 | Step 6600 | loss = 0.059, acc = 0.952\n", 673 | "Epoch 3 | Step 6700 | loss = 0.057, acc = 0.960\n", 674 | "Epoch 3 | Step 6800 | loss = 0.047, acc = 0.955\n", 675 | "Epoch 3 | Step 6900 | loss = 0.051, acc = 0.952\n", 676 | "Epoch 3 | Step 7000 | loss = 0.068, acc = 0.930\n", 677 | "Epoch 3 | Step 7100 | loss = 0.062, acc = 0.930\n", 678 | "Epoch 3 | Step 7200 | loss = 0.057, acc = 0.945\n", 679 | "Epoch 3 | Step 7300 | loss = 0.048, acc = 0.940\n", 680 | "Epoch 3 | Step 7400 | loss = 0.035, acc = 0.957\n", 681 | "Epoch 3 | Step 7500 | loss = 0.044, acc = 0.947\n", 682 | "Epoch 3 | Step 7600 | loss = 0.045, acc = 0.950\n", 683 | "Epoch 3 | Step 7700 | loss = 0.044, acc = 0.957\n", 684 | "Epoch 3 | Step 7800 | loss = 0.078, acc = 0.933\n", 685 | "Epoch 3 | Step 7900 | loss = 0.044, acc = 0.947\n", 686 | "Saving Model ...\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "num_epoch = 3\n", 692 | "validation = False\n", 693 | "logging_step = 100\n", 
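"# Added note (bookkeeping sketch, not original code): with train_batch_size = 4 and accum_iter = 2 below, the effective batch size is 8.\n", "# Also note that total_steps below is derived from the dataset size (31690 // 2 * 3 = 47535), while scheduler.step() fires once per\n", "# optimizer update (about 7923 / 2 * 3 = 11886 times over 3 epochs), so the linear decay schedule is only partially traversed by the end of training.\n",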
694 | "learning_rate = 1e-5\n", 695 | "optimizer = AdamW(model.parameters(), lr=learning_rate)\n", 696 | "\n", 697 | "# batch accumulation parameter\n", 698 | "accum_iter = 2\n", 699 | "\n", 700 | "#set up scheduler\n", 701 | "len_dataset = len(train_set)\n", 702 | "print(f\"length of train_set: {len_dataset}\")\n", 703 | "total_steps = (len_dataset // accum_iter) * num_epoch\n", 704 | "warm_up_ratio = 0\n", 705 | "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warm_up_ratio * total_steps, num_training_steps = total_steps)\n", 706 | "\n", 707 | "\n", 708 | "\n", 709 | "if fp16_training:\n", 710 | " model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) \n", 711 | "\n", 712 | "model.train()\n", 713 | "\n", 714 | "print(\"Start Training ...\")\n", 715 | "\n", 716 | "best_acc = 0\n", 717 | "\n", 718 | "for epoch in range(num_epoch):\n", 719 | " step = 1\n", 720 | " train_loss = train_acc = 0\n", 721 | " \n", 722 | " for data in train_loader:\t\n", 723 | " # Load all data into GPU\n", 724 | " data = [i.to(device) for i in data]\n", 725 | " \n", 726 | " # Model inputs: input_ids, token_type_ids, attention_mask, start_positions, end_positions (Note: only \"input_ids\" is mandatory)\n", 727 | " # Model outputs: start_logits, end_logits, loss (return when start_positions/end_positions are provided) \n", 728 | " output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])\n", 729 | "\n", 730 | " # Choose the most probable start position / end position\n", 731 | " start_index = torch.argmax(output.start_logits, dim=1)\n", 732 | " end_index = torch.argmax(output.end_logits, dim=1)\n", 733 | " \n", 734 | " # Prediction is correct only if both start_index and end_index are correct\n", 735 | " train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()\n", 736 | "\n", 737 | " # normalize loss to account for batch accumulation\n", 738 | " train_loss += output.loss / accum_iter \n", 739 | " \n", 740 | " if fp16_training:\n", 741 | " accelerator.backward(output.loss)\n", 742 | " else:\n", 743 | " output.loss.backward()\n", 744 | " \n", 745 | " # weights update\n", 746 | " if (step % accum_iter == 0) or (step == len(train_loader)):\n", 747 | " optimizer.step()\n", 748 | " scheduler.step()\n", 749 | " optimizer.zero_grad()\n", 750 | " \n", 751 | " # optimizer.step()\n", 752 | " # optimizer.zero_grad()\n", 753 | " step += 1\n", 754 | "\n", 755 | " # # ##### TODO: Apply linear learning rate decay #####\n", 756 | " # scheduler.step()\n", 757 | " \n", 758 | " # Print training loss and accuracy over past logging step\n", 759 | " if step % logging_step == 0:\n", 760 | " print(f\"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step:.3f}, acc = {train_acc / logging_step:.3f}\")\n", 761 | " train_loss = train_acc = 0\n", 762 | "\n", 763 | "\n", 764 | "\n", 765 | " if validation:\n", 766 | " print(\"Evaluating Dev Set ...\")\n", 767 | " model.eval()\n", 768 | " with torch.no_grad():\n", 769 | " dev_acc = 0\n", 770 | " for i, data in enumerate(tqdm(dev_loader)):\n", 771 | " output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),\n", 772 | " attention_mask=data[2].squeeze(dim=0).to(device))\n", 773 | " # prediction is correct only if answer text exactly matches\n", 774 | " dev_acc += evaluate(data, output) == dev_questions[i][\"answer_text\"]\n", 775 | " print(f\"Validation | Epoch {epoch + 1} | acc = 
{dev_acc / len(dev_loader):.3f}\")\n", 776 | "\n", 777 | " if dev_acc > best_acc:\n", 778 | " best_acc = dev_acc\n", 779 | " print(\"Saving Model ...\")\n", 780 | " model_save_dir = \"saved_model_last_2\" \n", 781 | " model.save_pretrained(model_save_dir)\n", 782 | " model.train()\n", 783 | "\n", 784 | "# Save a model and its configuration file to the directory 「saved_model」 \n", 785 | "# i.e. there are two files under the direcory 「saved_model」: 「pytorch_model.bin」 and 「config.json」\n", 786 | "# Saved model can be re-loaded using 「model = BertForQuestionAnswering.from_pretrained(\"saved_model\")」\n", 787 | "\n", 788 | "print(\"Saving Model ...\")\n", 789 | "model_save_dir = \"saved_model_last_2\" \n", 790 | "model.save_pretrained(model_save_dir)" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 10, 796 | "metadata": {}, 797 | "outputs": [], 798 | "source": [ 799 | "max_answer_length = 30\n", 800 | "\n", 801 | "def evaluate(data, output1, paragraph):\n", 802 | "\n", 803 | " p_offsets = data[4]\n", 804 | " question_offset = data[3].squeeze(dim=0)\n", 805 | " ##### TODO: Postprocessing #####\n", 806 | " # There is a bug and room for improvement in postprocessing \n", 807 | " # Hint: Open your prediction file to see what is wrong \n", 808 | " answer = ''\n", 809 | " max_prob = float('-inf')\n", 810 | " num_of_windows = data[0].shape[1]\n", 811 | " n_best = 100\n", 812 | "\n", 813 | " \n", 814 | " ans_start, ans_end, ans_k = 0,0,0\n", 815 | " for k in range(num_of_windows):\n", 816 | " output_start = output1.start_logits[k].cpu().numpy()\n", 817 | " output_end = output1.end_logits[k].cpu().numpy()\n", 818 | "\n", 819 | " start_indexes = np.argsort(output_start)[-1 : -n_best - 1 : -1].tolist()\n", 820 | " end_indexes = np.argsort(output_end)[-1 : -n_best - 1 : -1].tolist()\n", 821 | "\n", 822 | " for start_index in start_indexes:\n", 823 | " for end_index in end_indexes:\n", 824 | " if start_index > end_index or end_index - start_index + 1 > max_answer_length:\n", 825 | " continue\n", 826 | " \n", 827 | " if start_index < question_offset[k].item():\n", 828 | " # print(f\"answer is in question: {start_index},{end_index}\")\n", 829 | " continue\n", 830 | " \n", 831 | " start_prob= output1.start_logits[k][start_index]\n", 832 | " end_prob = output1.end_logits[k][end_index]\n", 833 | " \n", 834 | " prob = start_prob + end_prob\n", 835 | " if prob > max_prob:\n", 836 | " max_prob = prob\n", 837 | " # Convert tokens to chars (e.g. 
[1920, 7032] --> \"大 金\")\n", 838 | " answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])\n", 839 | " ans_start, ans_end, ans_k = start_index, end_index, k\n", 840 | "\n", 841 | "\n", 842 | " # print(ans_start, ans_end, ans_k)\n", 843 | " # print(paragraph)\n", 844 | " \n", 845 | " res = ''\n", 846 | " prefix = question_offset[ans_k].item() # number of tokens taken by [CLS] + question + [SEP]\n", 847 | "\n", 848 | " ans_p_start = ans_start - prefix # ans_start relative to the original paragraph tokens\n", 849 | " ans_p_end = ans_end - prefix\n", 850 | "\n", 851 | "\n", 852 | " # tks = ids[ans_k][ans_start:ans_end+1].tolist()\n", 853 | " # print(tks)\n", 854 | " st, ed = 0, 0\n", 855 | " p_offset = [(a.item(), b.item()) for a, b in p_offsets[ans_k]]\n", 856 | " # print(len(p_offset), ans_p_start, ans_p_end, ans_k)\n", 857 | " for i in range(ans_p_start, ans_p_end+1):\n", 858 | " st, ed = p_offset[i]\n", 859 | " res += paragraph[st : ed]\n", 860 | " # print(f\"k: {ans_k}, res: {res}\")\n", 861 | " # raise Exception\n", 862 | " # if '[UNK]' in answer:\n", 863 | " # print('found [UNK] in prediction, using original text')\n", 864 | " # print('original prediction', answer)\n", 865 | " # print('final prediction',res)\n", 866 | " \n", 867 | " # res = res.replace(' ','')\n", 868 | " # print(res)\n", 869 | " return res\n", 870 | "\n", 871 | "def ensemble(data, output1, output2, output5, paragraph):\n", 872 | "\n", 873 | " p_offsets = data[4]\n", 874 | " question_offset = data[3].squeeze(dim=0)\n", 875 | " ##### TODO: Postprocessing #####\n", 876 | " # There is a bug and room for improvement in postprocessing \n", 877 | " # Hint: Open your prediction file to see what is wrong \n", 878 | " answer = ''\n", 879 | " max_prob = float('-inf')\n", 880 | " num_of_windows = data[0].shape[1]\n", 881 | " n_best = 100\n", 882 | "\n", 883 | " \n", 884 | " ans_start, ans_end, ans_k = 0, 0, 0\n", 885 | " for k in range(num_of_windows):\n", 886 | " # output_start = output1.start_logits[k].cpu().numpy() + output2.start_logits[k].cpu().numpy() + output5.start_logits[k].cpu().numpy()\n", 887 | " # output_end = output1.end_logits[k].cpu().numpy() + output2.end_logits[k].cpu().numpy() + output5.end_logits[k].cpu().numpy()\n", 888 | " output_start = output1.start_logits[k].cpu().numpy() + output2.start_logits[k].cpu().numpy()\n", 889 | " output_end = output1.end_logits[k].cpu().numpy() + output2.end_logits[k].cpu().numpy()\n", 890 | "\n", 891 | " start_indexes = np.argsort(output_start)[-1 : -n_best - 1 : -1].tolist()\n", 892 | " end_indexes = np.argsort(output_end)[-1 : -n_best - 1 : -1].tolist()\n", 893 | "\n", 894 | " for start_index in start_indexes:\n", 895 | " for end_index in end_indexes:\n", 896 | " if start_index > end_index or end_index - start_index + 1 > max_answer_length:\n", 897 | " continue\n", 898 | "\n", 899 | " if start_index < question_offset[k].item():\n", 900 | " # print(f\"answer is in question: {start_index},{end_index}\")\n", 901 | " continue\n", 902 | " \n", 903 | " # start_prob= output1.start_logits[k][start_index] + output2.start_logits[k][start_index] + output5.start_logits[k][start_index]\n", 904 | " # end_prob = output1.end_logits[k][end_index]+ output2.end_logits[k][end_index] + output5.end_logits[k][end_index]\n", 905 | " start_prob = output1.start_logits[k][start_index] + output2.start_logits[k][start_index]\n", 906 | " end_prob = output1.end_logits[k][end_index] + output2.end_logits[k][end_index]\n", 907 | " \n", 908 | " prob = start_prob + end_prob\n", 909 | " if prob > max_prob:\n",
" max_prob = prob\n", 911 | " # Convert tokens to chars (e.g. [1920, 7032] --> \"大 金\")\n", 912 | " answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])\n", 913 | " ans_start, ans_end, ans_k = start_index , end_index, k\n", 914 | "\n", 915 | "\n", 916 | " # print(ans_start, ans_end, ans_k)\n", 917 | " # print(paragraph)\n", 918 | " \n", 919 | " res = ''\n", 920 | " prefix = question_offset[ans_k].item() # char occupied by question\n", 921 | "\n", 922 | " ans_p_start = ans_start - prefix #ans_start in origin paragraph token\n", 923 | " ans_p_end = ans_end - prefix\n", 924 | "\n", 925 | "\n", 926 | " # tks = ids[ans_k][ans_start:ans_end+1].tolist()\n", 927 | " # print(tks)\n", 928 | " st, ed = 0,0\n", 929 | " p_offset = [(a.item(),b.item()) for a,b in p_offsets[ans_k]]\n", 930 | " # print(p_offset)\n", 931 | " for i in range(ans_p_start, ans_p_end+1):\n", 932 | " st, ed = p_offset[i]\n", 933 | " res += paragraph[st : ed]\n", 934 | " # print(f\"k: {ans_k}, res: {res}\")\n", 935 | " # raise Exception\n", 936 | " # if '[UNK]' in answer:\n", 937 | " # print('found [UNK] in prediction, using original text')\n", 938 | " # print('original prediction', answer)\n", 939 | " # print('final prediction',res)\n", 940 | " \n", 941 | " # res = res.replace(' ','')\n", 942 | " # print(res)\n", 943 | " return res" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 11, 949 | "metadata": {}, 950 | "outputs": [], 951 | "source": [ 952 | "# model2 = BertForQuestionAnswering.from_pretrained(\"saved_model_cml5_2\").to(device)" 953 | ] 954 | }, 955 | { 956 | "cell_type": "code", 957 | "execution_count": 12, 958 | "metadata": {}, 959 | "outputs": [ 960 | { 961 | "name": "stdout", 962 | "output_type": "stream", 963 | "text": [ 964 | "Evaluating Dev Set ...\n", 965 | "Validation | acc = 0.823\n" 966 | ] 967 | } 968 | ], 969 | "source": [ 970 | "print(\"Evaluating Dev Set ...\")\n", 971 | "model.eval()\n", 972 | "with torch.no_grad():\n", 973 | " dev_acc = 0\n", 974 | " for i, data in enumerate(dev_loader):\n", 975 | " output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),\n", 976 | " attention_mask=data[2].squeeze(dim=0).to(device))\n", 977 | " # output2 = model2(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),\n", 978 | " # attention_mask=data[2].squeeze(dim=0).to(device))\n", 979 | " # prediction is correct only if answer text exactly matches\n", 980 | " dev_acc += evaluate(data, output, dev_paragraphs[dev_questions[i]['paragraph_id']]) == dev_questions[i][\"answer_text\"]\n", 981 | " # dev_acc += ensemble(data, output,output2, None, dev_paragraphs[dev_questions[i]['paragraph_id']]) == dev_questions[i][\"answer_text\"]\n", 982 | " print(f\"Validation | acc = {dev_acc / len(dev_loader):.3f}\")" 983 | ] 984 | } 985 | ], 986 | "metadata": { 987 | "accelerator": "GPU", 988 | "colab": { 989 | "collapsed_sections": [], 990 | "name": "ML2022Spring - HW7.ipynb", 991 | "provenance": [] 992 | }, 993 | "interpreter": { 994 | "hash": "ecbd9286bd544f7fe4ef1add3c640987467b2b9ee7c82bb6d3e9831005f6ced4" 995 | }, 996 | "kernelspec": { 997 | "display_name": "torch11", 998 | "language": "python", 999 | "name": "python3" 1000 | }, 1001 | "language_info": { 1002 | "codemirror_mode": { 1003 | "name": "ipython", 1004 | "version": 3 1005 | }, 1006 | "file_extension": ".py", 1007 | "mimetype": "text/x-python", 1008 | "name": "python", 1009 | "nbconvert_exporter": "python", 1010 | "pygments_lexer": 
"ipython3", 1011 | "version": "3.10.2" 1012 | } 1013 | }, 1014 | "nbformat": 4, 1015 | "nbformat_minor": 0 1016 | } 1017 | -------------------------------------------------------------------------------- /Hw7/hw7_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macTracyHuang/NTU-ML2022-Spring/0c14ae8d9a1448ee2da03f93836e3dd5d3a62b16/Hw7/hw7_report.pdf -------------------------------------------------------------------------------- /Hw8/report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macTracyHuang/NTU-ML2022-Spring/0c14ae8d9a1448ee2da03f93836e3dd5d3a62b16/Hw8/report.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NTU-ML2022-Spring 2 | Machine Learning (Grade: A+)\ 3 | Instructor: Hung-yi Lee\ 4 | It's my Homeworks implementation of https://github.com/virginiakm1988/ML2022-Spring 5 | 6 | 7 | ## Kaggle 8 | HW | Homework Topic| public|private | baseline | 9 | -----|---------------:|------:|-------:|-------:| 10 | 1 |Regression |102/948| 35/948 | strong | 11 | 2 |Classification |11/619 | 10/619 | boss | 12 | 3 |CNN |33/553 | 44/553 | boss | 13 | 4 |Self-attention |10/521 | 9/521 | boss | 14 | 5 |Transformer |47/396 | 48/396 | boss | 15 | 6 |GAN |92/438 | NA | boss | 16 | 7 |BERT |4/495 | 24/495 | boss | 17 | 8 |Autoencoder |19/499 | 20/499| boss | 18 | 11 |Adaptation |65 /499| 49/499| boss | --------------------------------------------------------------------------------