├── requirements.txt
├── README.md
├── finetune_with_params.sh
├── finetune.sh
├── Dockerfile
└── run_common_voice.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers >= 4.5.0
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # wav2vec2-indonesian
2 | 
3 | Code based on https://github.com/patil-suraj/wav2vec2-sprint, with modifications.
4 | 
5 | [HuggingFace model](https://huggingface.co/Galuh/wav2vec2-large-xlsr-indonesian)
6 | [Wandb run](https://wandb.ai/wandb/xlsr-indonesian/runs/va6jec4n)


--------------------------------------------------------------------------------
/finetune_with_params.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | python /workspace/wav2vec/run_common_voice.py \
 3 |     --model_name_or_path=$model_name_or_path \
 4 |     --dataset_config_name=$dataset_config_name \
 5 |     --output_dir=$output_dir \
 6 |     --cache_dir=$cache_dir \
 7 |     --overwrite_output_dir \
 8 |     --num_train_epochs=$num_train_epochs \
 9 |     --per_device_train_batch_size=$per_device_train_batch_size \
10 |     --per_device_eval_batch_size=$per_device_eval_batch_size \
11 |     --evaluation_strategy=$evaluation_strategy \
12 |     --learning_rate=$learning_rate \
13 |     --warmup_steps=$warmup_steps \
14 |     --fp16 \
15 |     --freeze_feature_extractor \
16 |     --save_steps=$save_steps \
17 |     --eval_steps=$eval_steps \
18 |     --save_total_limit=$save_total_limit \
19 |     --logging_steps=$logging_steps \
20 |     --group_by_length \
21 |     --feat_proj_dropout=$feat_proj_dropout \
22 |     --layerdrop=$layerdrop \
23 |     --gradient_checkpointing \
24 |     --do_train \
25 |     --do_eval \
26 |     --max_train_samples $max_train_samples \
27 |     --max_val_samples $max_val_samples
28 | 
29 | 


--------------------------------------------------------------------------------
/finetune.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | python run_common_voice.py \
 3 |     --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
 4 |     --dataset_config_name="id" \
 5 |     --output_dir=/workspace/output_models/wav2vec2-large-xlsr-indonesian \
 6 |     --cache_dir=/workspace/output_models \
 7 |     --overwrite_output_dir \
 8 |     --num_train_epochs="60" \
 9 |     --per_device_eval_batch_size="8" \
10 |     --per_device_train_batch_size="16" \
11 |     --evaluation_strategy="steps" \
12 |     --learning_rate="1e-4" \
13 |     --warmup_steps="300" \
14 |     --fp16 \
15 |     --freeze_feature_extractor \
16 |     --save_steps="100" \
17 |     --eval_steps="100" \
18 |     --save_total_limit="1" \
19 |     --logging_steps="100" \
20 |     --group_by_length \
21 |     --feat_proj_dropout="0.04" \
22 |     --layerdrop="0.041" \
23 |     --attention_dropout="0.094" \
24 |     --activation_dropout="0.055" \
25 |     --hidden_dropout="0.047" \
26 |     --mask_time_prob="0.4" \
27 |     --gradient_checkpointing \
28 |     --do_train --do_eval \
29 |     --gradient_accumulation_steps="2" \
30 |     --cache_dir=/workspace/data \
31 |     --dataloader_num_workers="8"
32 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ovhcom/ai-training-one-for-all
 2 | 
 3 | RUN apt-get update && \
 4 |     apt install -y bash \
 5 |     build-essential \
 6 |     libsndfile1-dev \
 7 |     git-lfs \
 8 |     sox
 9 | 
10 | RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
11 |     apt-get install git-lfs && \
12 |     git lfs install
13 | 
14 | RUN python3 -m pip install --no-cache-dir --upgrade pip && \
15 |     python3 -m pip install --no-cache-dir \
16 |     datasets \
17 |     jiwer==2.2.0 \
18 |     soundfile \
19 |     torchaudio \
20 |     lang-trans==0.6.0 \
21 |     librosa==0.8.0
22 | 
23 | RUN pip3 uninstall -y typing allennlp
24 | 
25 | RUN pip3 install git+https://github.com/huggingface/transformers.git
26 | 
27 | RUN mkdir -p /workspace/wav2vec/
28 | 
29 | COPY finetune.sh run_common_voice.py  finetune_with_params.sh /workspace/wav2vec/
30 | 
31 | COPY home-server.html run_all.sh /usr/bin/
32 | 
33 | RUN chown -R 42420:42420 /workspace
34 | 
35 | RUN chown -R 42420:42420 /usr/bin/run_all.sh
36 | 
37 | #Default training env variables
38 | ENV model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
39 |     dataset_config_name="fr" \
40 |     output_dir="/workspace/output_models/wav2vec2-large-xlsr-indonesian" \
41 |     cache_dir="/workspace/data" \
42 |     num_train_epochs="1" \
43 |     per_device_train_batch_size="32" \
44 |     evaluation_strategy="steps" \
45 |     learning_rate="3e-4" \
46 |     warmup_steps="500" \
47 |     save_steps="10" \
48 |     eval_steps="10" \
49 |     save_total_limit="1" \
50 |     logging_steps="10" \
51 |     feat_proj_dropout="0.0" \
52 |     layerdrop="0.1" \
53 |     max_train_samples=100 \
54 |     max_val_samples=100
55 | 
56 | WORKDIR /workspace
57 | ENTRYPOINT []
58 | #CMD ["sh", "/usr/bin/run_all.sh"]
59 | CMD ["supervisord", "-n", "-u", "42420", "-c", "/etc/supervisor/supervisor.conf"]


--------------------------------------------------------------------------------
/run_common_voice.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import json
  3 | import logging
  4 | import os
  5 | import re
  6 | import sys
  7 | from dataclasses import dataclass, field
  8 | from typing import Any, Dict, List, Optional, Union
  9 | 
 10 | import datasets
 11 | import numpy as np
 12 | import torch
 13 | import torchaudio
 14 | from packaging import version
 15 | from torch import nn
 16 | from torch.optim.lr_scheduler import LambdaLR
 17 | 
 18 | import transformers
 19 | from transformers import (
 20 |     HfArgumentParser,
 21 |     Trainer,
 22 |     TrainingArguments,
 23 |     Wav2Vec2CTCTokenizer,
 24 |     Wav2Vec2FeatureExtractor,
 25 |     Wav2Vec2ForCTC,
 26 |     Wav2Vec2Processor,
 27 |     is_apex_available,
 28 |     set_seed,
 29 | )
 30 | from transformers.trainer_utils import get_last_checkpoint, is_main_process
 31 | 
 32 | 
 33 | if is_apex_available():
 34 |     from apex import amp
 35 | 
 36 | 
 37 | if version.parse(torch.__version__) >= version.parse("1.6"):
 38 |     _is_native_amp_available = True
 39 |     from torch.cuda.amp import autocast
 40 | 
 41 | logger = logging.getLogger(__name__)
 42 | 
 43 | 
 44 | def list_field(default=None, metadata=None):
 45 |     return field(default_factory=lambda: default, metadata=metadata)
 46 | 
 47 | 
 48 | @dataclass
 49 | class ModelArguments:
 50 |     """
 51 |     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
 52 |     """
 53 | 
 54 |     model_name_or_path: str = field(
 55 |         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
 56 |     )
 57 |     cache_dir: Optional[str] = field(
 58 |         default=None,
 59 |         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
 60 |     )
 61 |     freeze_feature_extractor: Optional[bool] = field(
 62 |         default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
 63 |     )
 64 |     attention_dropout: Optional[float] = field(
 65 |         default=0.1, metadata={"help": "The dropout ratio for the attention probabilities."}
 66 |     )
 67 |     activation_dropout: Optional[float] = field(
 68 |         default=0.1, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
 69 |     )
 70 |     hidden_dropout: Optional[float] = field(
 71 |         default=0.1,
 72 |         metadata={
 73 |             "help": "The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler."
 74 |         },
 75 |     )
 76 |     feat_proj_dropout: Optional[float] = field(
 77 |         default=0.1,
 78 |         metadata={"help": "The dropout probabilitiy for all 1D convolutional layers in feature extractor."},
 79 |     )
 80 |     mask_time_prob: Optional[float] = field(
 81 |         default=0.05,
 82 |         metadata={
 83 |             "help": "Propability of each feature vector along the time axis to be chosen as the start of the vector"
 84 |             "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
 85 |             "vectors will be masked along the time axis. This is only relevant if ``apply_spec_augment is True``."
 86 |         },
 87 |     )
 88 |     gradient_checkpointing: Optional[bool] = field(
 89 |         default=True,
 90 |         metadata={
 91 |             "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
 92 |         },
 93 |     )
 94 |     layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."})
 95 | 
 96 | 
 97 | @dataclass
 98 | class DataTrainingArguments:
 99 |     """
100 |     Arguments pertaining to what data we are going to input our model for training and eval.
101 | 
102 |     Using `HfArgumentParser` we can turn this class
103 |     into argparse arguments to be able to specify them on
104 |     the command line.
105 |     """
106 | 
107 |     dataset_config_name: Optional[str] = field(
108 |         default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
109 |     )
110 |     train_split_name: Optional[str] = field(
111 |         default="train+validation",
112 |         metadata={
113 |             "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
114 |         },
115 |     )
116 |     overwrite_cache: bool = field(
117 |         default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
118 |     )
119 |     preprocessing_num_workers: Optional[int] = field(
120 |         default=None,
121 |         metadata={"help": "The number of processes to use for the preprocessing."},
122 |     )
123 |     max_train_samples: Optional[int] = field(
124 |         default=None,
125 |         metadata={
126 |             "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
127 |             "value if set."
128 |         },
129 |     )
130 |     max_val_samples: Optional[int] = field(
131 |         default=None,
132 |         metadata={
133 |             "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
134 |             "value if set."
135 |         },
136 |     )
137 |     chars_to_ignore: List[str] = list_field(
138 |         default=[",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�"],
139 |         metadata={"help": "A list of characters to remove from the transcripts."},
140 |     )
141 | 
142 | 
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate a list of examples into one padded batch for CTC training.

    Audio inputs and label ids are padded separately through the processor, and
    the padded label positions are replaced by -100.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            Processor whose ``pad`` method is used for both inputs and labels.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded unchanged to ``processor.pad`` for inputs and labels.
        max_length (:obj:`int`, `optional`):
            ``max_length`` forwarded to ``processor.pad`` for the ``input_values``.
        max_length_labels (:obj:`int`, `optional`):
            ``max_length`` forwarded to ``processor.pad`` for the labels.
        pad_to_multiple_of (:obj:`int`, `optional`):
            ``pad_to_multiple_of`` forwarded to ``processor.pad`` for the ``input_values``.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            ``pad_to_multiple_of`` forwarded to ``processor.pad`` for the labels.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``labels`` entry added.

        Each feature must provide ``"input_values"`` and ``"labels"``.
        """
        # Inputs and labels differ in length, so they are padded in two separate calls.
        audio_items = [{"input_values": item["input_values"]} for item in features]
        label_items = [{"input_ids": item["labels"]} for item in features]

        padded_batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Inside this context the processor pads with its target (label) side.
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_items,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Positions whose attention mask is not 1 are padding; -100 marks them to be ignored by the loss.
        label_ids = padded_labels["input_ids"]
        is_padding = padded_labels.attention_mask.ne(1)
        padded_batch["labels"] = label_ids.masked_fill(is_padding, -100)

        return padded_batch
204 | 
205 | 
class CTCTrainer(Trainer):
    """Trainer whose training step re-reduces the CTC loss when several GPUs are used."""

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model; must contain ``"labels"`` when
                ``n_gpu > 1`` and the loss reduction is ``"sum"``.

        Return:
            :obj:`torch.Tensor`: The detached training loss on this batch.

        Raises:
            ValueError: If ``n_gpu > 1`` and ``ctc_loss_reduction`` is neither ``"mean"`` nor ``"sum"``.
        """

        model.train()
        inputs = self._prepare_inputs(inputs)

        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            # In this branch the config is reached through `model.module`. The error
            # message used to read `model.config`, which fails with AttributeError on
            # the wrapper instead of raising the intended ValueError.
            reduction = model.module.config.ctc_loss_reduction
            if reduction == "mean":
                loss = loss.mean()
            elif reduction == "sum":
                # Normalise by the number of label positions that are not masked (-100).
                loss = loss.sum() / (inputs["labels"] >= 0).sum()
            else:
                raise ValueError(f"{reduction} is not valid. Choose one of ['mean', 'sum']")

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # Backward pass through whichever mixed-precision / engine backend is active.
        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()
257 | 
258 | 
def get_flat_linear_schedule_with_warmup(optimizer, num_warmup_steps,
                                         num_training_steps, last_epoch=-1):
    """Create a warmup / plateau / linear-decay learning-rate schedule.

    The multiplier rises linearly from 0 to 1 over the first 10% of
    ``num_training_steps``, stays at 1 for the next 40%, then decays linearly
    to 0 at ``num_training_steps`` (and is clamped at 0 afterwards).

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Accepted for signature compatibility but NOT used;
            the warmup length is always 10% of ``num_training_steps``.
        num_training_steps: Total number of training steps.
        last_epoch: Forwarded to ``LambdaLR``.

    Returns:
        A ``LambdaLR`` scheduler wrapping the multiplier function.
    """

    def lr_lambda(current_step):
        ramp_steps = int(num_training_steps * 0.1)
        plateau_end = ramp_steps + int(num_training_steps * 0.4)

        # Phase 1: linear warmup.
        if current_step < ramp_steps:
            return float(current_step) / float(max(1, ramp_steps))

        # Phase 2: constant learning rate.
        if current_step < plateau_end:
            return 1

        # Phase 3: linear decay, never below zero.
        steps_left = float(num_training_steps - current_step)
        decay_span = float(max(1, num_training_steps - plateau_end))
        return max(0.0, steps_left / decay_span)

    return LambdaLR(optimizer, lr_lambda, last_epoch)
274 | 
def get_flat_scheduler(
    name=None,
    optimizer=None,
    num_warmup_steps=None,
    num_training_steps=None,
):
    """Keyword-argument wrapper around ``get_flat_linear_schedule_with_warmup``.

    ``name`` is accepted but not used. The remaining arguments are forwarded
    unchanged and the resulting scheduler is returned.
    """
    scheduler = get_flat_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )
    return scheduler
283 | 
284 | 
class FlatTrainer(Trainer):
    """Trainer that pairs the stock optimizer with the flat (warmup/plateau/decay) scheduler."""

    # Construction is inherited unchanged from ``Trainer``.

    def create_flat_scheduler(self, num_training_steps: int):
        """Build the flat scheduler for ``self.optimizer`` and store it on ``self.lr_scheduler``."""
        flat_scheduler = get_flat_scheduler(
            optimizer=self.optimizer,
            num_training_steps=num_training_steps,
        )
        self.lr_scheduler = flat_scheduler

    def create_optimizer_and_scheduler(self, num_training_steps):
        """Create the optimizer first, then the flat scheduler that depends on it."""
        self.create_optimizer()
        self.create_flat_scheduler(num_training_steps)
296 | 
297 | 
def main():
    """Fine-tune a Wav2Vec2 CTC model on Common Voice and optionally evaluate it.

    Steps: parse arguments, detect a resumable checkpoint, load the train and
    test splits, clean the transcripts, build a character vocabulary and the
    processor, load the pretrained model, convert audio to 16 kHz arrays,
    tokenize, then train and/or evaluate with ``FlatTrainer``.

    Fixes relative to the previous revision:
      * the test-split vocabulary is built from ``eval_dataset`` (it was built
        from ``train_dataset`` by mistake);
      * ignored characters are regex-escaped, so ``-`` no longer forms the range
        ``!-;`` that silently removed digits and other symbols;
      * the vocabulary is sorted, so token ids are identical across processes
        and across resumed runs;
      * audio that is not 48 kHz is resampled from its real sampling rate;
      * ``max_*_samples`` larger than the dataset no longer raise;
      * W&B environment variables are only set when not already defined;
      * the evaluation metrics are returned instead of an always-empty dict.

    Returns:
        dict: Evaluation metrics when ``--do_eval`` is set, otherwise an empty dict.

    Raises:
        ValueError: If the output directory exists, is not empty, contains no
            checkpoint and ``--overwrite_output_dir`` was not given.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    # Default Weights & Biases destination; values already present in the
    # environment take precedence.
    os.environ.setdefault("WANDB_ENTITY", "wandb")
    os.environ.setdefault("WANDB_PROJECT", "xlsr-indonesian")
    os.environ.setdefault("WANDB_LOG_MODEL", "true")

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets:
    train_dataset = datasets.load_dataset(
        "common_voice", data_args.dataset_config_name, split=data_args.train_split_name, cache_dir=model_args.cache_dir
    )
    eval_dataset = datasets.load_dataset(
        "common_voice", data_args.dataset_config_name, split="test", cache_dir=model_args.cache_dir
    )

    # Create and save tokenizer
    # Each entry is escaped so that characters such as "-", "]" or "^" are taken
    # literally inside the character class instead of building ranges/negations.
    escaped_chars = "".join(re.escape(char) for char in data_args.chars_to_ignore)
    # An empty list would give the invalid pattern "[]"; skip the substitution then.
    chars_to_ignore_regex = f"[{escaped_chars}]" if escaped_chars else None

    def remove_special_characters(batch):
        sentence = batch["sentence"]
        if chars_to_ignore_regex is not None:
            sentence = re.sub(chars_to_ignore_regex, "", sentence)
        # The trailing space guarantees that " " is part of the vocabulary.
        batch["text"] = sentence.lower() + " "
        return batch

    train_dataset = train_dataset.map(remove_special_characters, remove_columns=["sentence"])
    eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"])

    def extract_all_chars(batch):
        all_text = " ".join(batch["text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    vocab_train = train_dataset.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=train_dataset.column_names,
    )
    # Built from the evaluation split so its characters are part of the vocabulary too.
    vocab_test = eval_dataset.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=eval_dataset.column_names,
    )

    # Sorted for a deterministic character -> id mapping (set order is not stable
    # between processes or runs).
    vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
    vocab_dict = {v: k for k, v in enumerate(vocab_list)}
    # The tokenizer uses "|" as word delimiter, so it takes over the id of " ".
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    with open("vocab.json", "w") as vocab_file:
        json.dump(vocab_dict, vocab_file)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = Wav2Vec2CTCTokenizer(
        "vocab.json",
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token="|",
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1, sampling_rate=16_000, padding_value=0.0, do_normalize=True, return_attention_mask=True
    )
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        activation_dropout=model_args.activation_dropout,
        attention_dropout=model_args.attention_dropout,
        hidden_dropout=model_args.hidden_dropout,
        feat_proj_dropout=model_args.feat_proj_dropout,
        mask_time_prob=model_args.mask_time_prob,
        gradient_checkpointing=model_args.gradient_checkpointing,
        layerdrop=model_args.layerdrop,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
        ctc_zero_infinity=True,
    )

    # Optional truncation; clamped so a limit above the dataset size keeps everything.
    if data_args.max_train_samples is not None:
        train_dataset = train_dataset.select(range(min(data_args.max_train_samples, len(train_dataset))))

    if data_args.max_val_samples is not None:
        eval_dataset = eval_dataset.select(range(min(data_args.max_val_samples, len(eval_dataset))))

    target_sampling_rate = 16_000
    # The pipeline was written for 48 kHz source audio, so that resampler is built once and reused.
    resampler = torchaudio.transforms.Resample(48_000, target_sampling_rate)

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        if sampling_rate == 48_000:
            speech_array = resampler(speech_array)
        elif sampling_rate != target_sampling_rate:
            # Any other rate gets a resampler with the correct ratio instead of the 48 kHz one.
            speech_array = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)(speech_array)
        batch["speech"] = speech_array.squeeze().numpy()
        batch["sampling_rate"] = target_sampling_rate
        batch["target_text"] = batch["text"]
        return batch

    train_dataset = train_dataset.map(
        speech_file_to_array_fn,
        remove_columns=train_dataset.column_names,
        num_proc=data_args.preprocessing_num_workers,
    )
    eval_dataset = eval_dataset.map(
        speech_file_to_array_fn,
        remove_columns=eval_dataset.column_names,
        num_proc=data_args.preprocessing_num_workers,
    )

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
        # Setup the processor for targets
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    train_dataset = train_dataset.map(
        prepare_dataset,
        remove_columns=train_dataset.column_names,
        batch_size=training_args.per_device_train_batch_size,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
    )
    eval_dataset = eval_dataset.map(
        prepare_dataset,
        remove_columns=eval_dataset.column_names,
        batch_size=training_args.per_device_train_batch_size,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
    )

    # Metric
    wer_metric = datasets.load_metric("wer")

    def compute_metrics(pred):
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)

        # -100 marks ignored label positions; map them back to the pad id before decoding.
        pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

        pred_str = processor.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

        wer = wer_metric.compute(predictions=pred_str, references=label_str)

        return {"wer": wer}

    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()

    # Data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

    # Initialize our Trainer
    trainer = FlatTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=processor.feature_extractor,
    )

    # save the feature_extractor and the tokenizer
    if is_main_process(training_args.local_rank):
        processor.save_pretrained(training_args.output_dir)

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()

        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
        results = metrics

    return results
553 | 
554 | 
# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
557 | 


--------------------------------------------------------------------------------