├── .gitignore ├── .vscode ├── launch.json └── settings.json ├── CHANGELOG.md ├── Datasets └── README.md ├── LICENSE ├── MANIFEST.in ├── Models └── README.md ├── README.md ├── Tests ├── README.md ├── test_tensorflow_metrics.py └── test_text_utils.py ├── Tutorials ├── 01_image_to_word │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ ├── requiremenets.txt │ └── train.py ├── 02_captcha_to_text │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ └── train.py ├── 03_handwriting_recognition │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ └── train.py ├── 04_sentence_recognition │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ └── train.py ├── 05_sound_to_text │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ ├── train.py │ └── train_no_limit.py ├── 06_pytorch_introduction │ ├── README.md │ ├── model.py │ ├── requirements.txt │ ├── test.py │ └── train.py ├── 07_pytorch_wrapper │ ├── README.md │ ├── model.py │ ├── requirements.txt │ ├── test.py │ └── train.py ├── 08_handwriting_recognition_torch │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ ├── requirements.txt │ └── train_torch.py ├── 09_translation_transformer │ ├── README.md │ ├── configs.py │ ├── download.py │ ├── model.py │ ├── requirements.txt │ ├── test.py │ └── train.py ├── 10_wav2vec2_torch │ ├── configs.py │ ├── requirements.txt │ ├── test.py │ ├── train.py │ └── train_tf.py ├── 11_Yolov8 │ ├── README.md │ ├── convert2onnx.py │ ├── requirements.txt │ ├── run_pretrained.py │ ├── test_yolov8.py │ └── train_yolov8.py └── README.md ├── bin ├── read_parquet.py └── setup.sh ├── mltu ├── __init__.py ├── annotations │ ├── __init__.py │ ├── audio.py │ ├── detections.py │ └── images.py ├── augmentors.py ├── configs.py ├── dataProvider.py ├── inferenceModel.py ├── preprocessors.py ├── tensorflow │ ├── README.md │ ├── __init__.py │ ├── callbacks.py │ ├── dataProvider.py │ ├── layers.py │ ├── losses.py │ ├── metrics.py │ ├── model_utils.py │ ├── models │ │ └── u2net.py │ ├── requirements.txt │ └── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── callbacks.py │ │ ├── layers.py │ │ └── utils.py ├── tokenizers.py ├── torch │ ├── README.md │ ├── __init__.py │ ├── callbacks.py │ ├── dataProvider.py │ ├── handlers.py │ ├── losses.py │ ├── metrics.py │ ├── model.py │ ├── requirements.txt │ └── yolo │ │ ├── README.md │ │ ├── __init__.py │ │ ├── annotation.py │ │ ├── detectors │ │ ├── __init__.py │ │ ├── detector.py │ │ ├── onnx_detector.py │ │ └── torch_detector.py │ │ ├── loss.py │ │ ├── metrics.py │ │ ├── optimizer.py │ │ ├── preprocessors.py │ │ ├── pruning_utils.py │ │ └── requirements.txt ├── transformers.py └── utils │ ├── __init__.py │ └── text_utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.egg-info 3 | *.pyc 4 | venv 5 | 6 | Datasets/* 7 | Models/* 8 | dist 9 | 10 | !*.md 11 | 12 | .idea 13 | .python-version 14 | 15 | test 16 | build 17 | yolov8* 18 | pyrightconfig.json -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "justMyCode": false, 14 | "subProcess": true, 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.analysis.typeCheckingMode": "off", 3 | "python.testing.unittestArgs": [ 4 | "-v", 5 | "-s", 6 | "./Tests", 7 | "-p", 8 | "*test*.py" 9 | ], 10 | "python.testing.pytestEnabled": false, 11 | "python.testing.unittestEnabled": true 12 | } -------------------------------------------------------------------------------- /Datasets/README.md: -------------------------------------------------------------------------------- 1 | # Empty repository to hold the datasets when running Tutorials -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Rokas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /Models/README.md: -------------------------------------------------------------------------------- 1 | # Empty repository to hold the Models when running Tutorials -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLTU - Machine Learning Training Utilities 2 | Machine Learning Training Utilities for TensorFlow 2.* and PyTorch with Python 3 3 |

4 | 5 |

6 | 7 | # Installation: 8 | To use MLTU in your own project, you can install it from PyPI: 9 | ```bash 10 | pip install mltu 11 | ``` 12 | When running the tutorials, install the mltu version that the specific tutorial was written for, for example: 13 | ```bash 14 | pip install mltu==0.1.3 15 | ``` 16 | Each tutorial has its own requirements.txt file pinning the mltu version it was written against. As this project progresses, newer versions may introduce breaking changes, so it's recommended to use the same version as in the tutorial. 17 | 18 | # Tutorials and Examples can be found on [PyLessons.com](https://pylessons.com/mltu) 19 | 1. [Text Recognition With TensorFlow and CTC network](https://pylessons.com/ctc-text-recognition), code in ```Tutorials\01_image_to_word``` folder; 20 | 2. [TensorFlow OCR model for reading Captchas](https://pylessons.com/tensorflow-ocr-captcha), code in ```Tutorials\02_captcha_to_text``` folder; 21 | 3. [Handwriting words recognition with TensorFlow](https://pylessons.com/handwriting-recognition), code in ```Tutorials\03_handwriting_recognition``` folder; 22 | 4. [Handwritten sentence recognition with TensorFlow](https://pylessons.com/handwritten-sentence-recognition), code in ```Tutorials\04_sentence_recognition``` folder; 23 | 5. [Introduction to speech recognition with TensorFlow](https://pylessons.com/speech-recognition), code in ```Tutorials\05_sound_to_text``` folder; 24 | 6. [Introduction to PyTorch in a practical way](https://pylessons.com/pytorch-introduction), code in ```Tutorials\06_pytorch_introduction``` folder; 25 | 7. [Using custom wrapper to simplify PyTorch models training pipeline](https://pylessons.com/pytorch-introduction), code in ```Tutorials\07_pytorch_wrapper``` folder; 26 | 8. [Handwriting words recognition with PyTorch](https://pylessons.com/handwriting-recognition-pytorch), code in ```Tutorials\08_handwriting_recognition_torch``` folder; 27 | 9. [Transformer training with TensorFlow for Translation task](https://pylessons.com/transformers-training), code in ```Tutorials\09_translation_transformer``` folder; 28 | 10. [Speech Recognition in Python | finetune wav2vec2 model for a custom ASR model](https://youtu.be/h6ooEGzjkj0), code in ```Tutorials\10_wav2vec2_torch``` folder; 29 | 11. [YOLOv8: Real-Time Object Detection Simplified](https://youtu.be/vegL__weCxY), code in ```Tutorials\11_Yolov8``` folder; 30 | 12. 
[YOLOv8: Customizing Object Detector training](https://youtu.be/ysYiV1CbCyY), code in ```Tutorials\11_Yolov8\train_yolov8.py``` folder; -------------------------------------------------------------------------------- /Tests/README.md: -------------------------------------------------------------------------------- 1 | # Repository for unit tests -------------------------------------------------------------------------------- /Tests/test_tensorflow_metrics.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from mltu.tensorflow.metrics import CERMetric, WERMetric 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | class TestMetrics(unittest.TestCase): 9 | 10 | def to_embeddings(self, sentences, vocab): 11 | embeddings, max_len = [], 0 12 | 13 | for sentence in sentences: 14 | embedding = [] 15 | for character in sentence: 16 | embedding.append(vocab.index(character)) 17 | embeddings.append(embedding) 18 | max_len = max(max_len, len(embedding)) 19 | return embeddings, max_len 20 | 21 | def setUp(self) -> None: 22 | true_words = ["Who are you", "I am a student", "I am a teacher", "Just different sentence length"] 23 | pred_words = ["Who are you", "I am a ztudent", "I am A reacher", "Just different length"] 24 | 25 | vocab = set() 26 | for sen in true_words + pred_words: 27 | for character in sen: 28 | vocab.add(character) 29 | self.vocab = "".join(vocab) 30 | 31 | sentence_true, max_len_true = self.to_embeddings(true_words, self.vocab) 32 | sentence_pred, max_len_pred = self.to_embeddings(pred_words, self.vocab) 33 | 34 | max_len = max(max_len_true, max_len_pred) 35 | padding_length = 64 36 | 37 | self.sen_true = [np.pad(sen, (0, max_len - len(sen)), "constant", constant_values=len(self.vocab)) for sen in sentence_true] 38 | self.sen_pred = [np.pad(sen, (0, padding_length - len(sen)), "constant", constant_values=-1) for sen in sentence_pred] 39 | 40 | def test_CERMetric(self): 41 | vocabulary = tf.constant(list(self.vocab)) 42 | cer = CERMetric.get_cer(self.sen_true, self.sen_pred, vocabulary).numpy() 43 | 44 | self.assertTrue(np.array_equal(cer, np.array([0.0, 0.071428575, 0.14285715, 0.42857143], dtype=np.float32))) 45 | 46 | def test_WERMetric(self): 47 | vocabulary = tf.constant(list(self.vocab)) 48 | wer = WERMetric.get_wer(self.sen_true, self.sen_pred, vocabulary).numpy() 49 | 50 | self.assertTrue(np.array_equal(wer, np.array([0., 0.25, 0.5, 0.33333334], dtype=np.float32))) 51 | 52 | if __name__ == "__main__": 53 | unittest.main() -------------------------------------------------------------------------------- /Tests/test_text_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from mltu.utils.text_utils import edit_distance, get_cer, get_wer 4 | 5 | class TestTextUtils(unittest.TestCase): 6 | 7 | def test_edit_distance(self): 8 | """ This unit test includes several test cases to cover different scenarios, including no errors, 9 | substitution errors, insertion errors, deletion errors, and a more complex case with multiple 10 | errors. It also includes a test case for empty input. 
11 | """ 12 | # Test simple case with no errors 13 | prediction_tokens = ["A", "B", "C"] 14 | reference_tokens = ["A", "B", "C"] 15 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 0) 16 | 17 | # Test simple case with one substitution error 18 | prediction_tokens = ["A", "B", "D"] 19 | reference_tokens = ["A", "B", "C"] 20 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 1) 21 | 22 | # Test simple case with one insertion error 23 | prediction_tokens = ["A", "B", "C"] 24 | reference_tokens = ["A", "B", "C", "D"] 25 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 1) 26 | 27 | # Test simple case with one deletion error 28 | prediction_tokens = ["A", "B"] 29 | reference_tokens = ["A", "B", "C"] 30 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 1) 31 | 32 | # Test more complex case with multiple errors 33 | prediction_tokens = ["A", "B", "C", "D", "E"] 34 | reference_tokens = ["A", "C", "B", "F", "E"] 35 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 3) 36 | 37 | # Test empty input 38 | prediction_tokens = [] 39 | reference_tokens = [] 40 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 0) 41 | 42 | def test_get_cer(self): 43 | # Test simple case with no errors 44 | preds = ["A B C"] 45 | target = ["A B C"] 46 | self.assertEqual(get_cer(preds, target), 0) 47 | 48 | # Test simple case with one character error 49 | preds = ["A B C"] 50 | target = ["A B D"] 51 | self.assertEqual(get_cer(preds, target), 1/5) 52 | 53 | # Test simple case with multiple character errors 54 | preds = ["A B C"] 55 | target = ["D E F"] 56 | self.assertEqual(get_cer(preds, target), 3/5) 57 | 58 | # Test empty input 59 | preds = [] 60 | target = [] 61 | self.assertEqual(get_cer(preds, target), 0) 62 | 63 | # Test simple case with different word lengths 64 | preds = ["ABC"] 65 | target = ["ABCDEFG"] 66 | self.assertEqual(get_cer(preds, target), 4/7) 67 | 68 | def test_get_wer(self): 69 | # Test simple case with no errors 70 | preds = "A B C" 71 | target = "A B C" 72 | self.assertEqual(get_wer(preds, target), 0) 73 | 74 | # Test simple case with one word error 75 | preds = "A B C" 76 | target = "A B D" 77 | self.assertEqual(get_wer(preds, target), 1/3) 78 | 79 | # Test simple case with multiple word errors 80 | preds = "A B C" 81 | target = "D E F" 82 | self.assertEqual(get_wer(preds, target), 1) 83 | 84 | # Test empty input 85 | preds = "" 86 | target = "" 87 | self.assertEqual(get_wer(preds, target), 0) 88 | 89 | # Test simple case with different sentence lengths 90 | preds = ["ABC"] 91 | target = ["ABC DEF"] 92 | self.assertEqual(get_wer(preds, target), 1) 93 | 94 | 95 | if __name__ == "__main__": 96 | unittest.main() 97 | -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join("Models/1_image_to_word", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 11 | self.vocab = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 12 | self.height = 32 13 | self.width = 128 14 | self.max_text_length = 23 15 | self.batch_size = 1024 16 | self.learning_rate = 1e-4 17 | self.train_epochs = 100 18 | self.train_workers = 
20 -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer 7 | 8 | class ImageToWordModel(OnnxInferenceModel): 9 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | self.char_list = char_list 12 | 13 | def predict(self, image: np.ndarray): 14 | image = cv2.resize(image, self.input_shapes[0][1:3][::-1]) 15 | 16 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 17 | 18 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 19 | 20 | text = ctc_decoder(preds, self.char_list)[0] 21 | 22 | return text 23 | 24 | 25 | if __name__ == "__main__": 26 | import pandas as pd 27 | from tqdm import tqdm 28 | from mltu.configs import BaseModelConfigs 29 | 30 | configs = BaseModelConfigs.load("Models/1_image_to_word/202211270035/configs.yaml") 31 | 32 | model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab) 33 | 34 | df = pd.read_csv("Models/1_image_to_word/202211270035/val.csv").dropna().values.tolist() 35 | 36 | accum_cer = [] 37 | for image_path, label in tqdm(df[:20]): 38 | image = cv2.imread(image_path.replace("\\", "/")) 39 | 40 | try: 41 | prediction_text = model.predict(image) 42 | 43 | cer = get_cer(prediction_text, label) 44 | print(f"Image: {image_path}, Label: {label}, Prediction: {prediction_text}, CER: {cer}") 45 | 46 | # resize image by 3 times for visualization 47 | # image = cv2.resize(image, (image.shape[1] * 3, image.shape[0] * 3)) 48 | # cv2.imshow(prediction_text, image) 49 | # cv2.waitKey(0) 50 | # cv2.destroyAllWindows() 51 | except: 52 | continue 53 | 54 | accum_cer.append(cer) 55 | 56 | print(f"Average CER: {np.average(accum_cer)}") -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/model.py: -------------------------------------------------------------------------------- 1 | from keras import layers 2 | from keras.models import Model 3 | 4 | from mltu.tensorflow.model_utils import residual_block 5 | 6 | 7 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 8 | 9 | inputs = layers.Input(shape=input_dim, name="input") 10 | 11 | input = layers.Lambda(lambda x: x / 255)(inputs) 12 | 13 | x1 = residual_block(input, 16, activation=activation, skip_conv=True, strides=1, dropout=dropout) 14 | 15 | x2 = residual_block(x1, 16, activation=activation, skip_conv=True, strides=2, dropout=dropout) 16 | x3 = residual_block(x2, 16, activation=activation, skip_conv=False, strides=1, dropout=dropout) 17 | 18 | x4 = residual_block(x3, 32, activation=activation, skip_conv=True, strides=2, dropout=dropout) 19 | x5 = residual_block(x4, 32, activation=activation, skip_conv=False, strides=1, dropout=dropout) 20 | 21 | x6 = residual_block(x5, 64, activation=activation, skip_conv=True, strides=1, dropout=dropout) 22 | x7 = residual_block(x6, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 23 | 24 | squeezed = layers.Reshape((x7.shape[-3] * x7.shape[-2], x7.shape[-1]))(x7) 25 | 26 | blstm = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(squeezed) 27 | 28 | output = layers.Dense(output_dim + 1, activation="softmax", 
name="output")(blstm) 29 | 30 | model = Model(inputs=inputs, outputs=output) 31 | return model -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/requiremenets.txt: -------------------------------------------------------------------------------- 1 | mltu==0.1.3 -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import tensorflow as tf 4 | 5 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 6 | except: pass 7 | 8 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 9 | 10 | from mltu.preprocessors import ImageReader 11 | from mltu.annotations.images import CVImage 12 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding 13 | from mltu.tensorflow.dataProvider import DataProvider 14 | from mltu.tensorflow.losses import CTCloss 15 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 16 | from mltu.tensorflow.metrics import CWERMetric 17 | 18 | 19 | from model import train_model 20 | from configs import ModelConfigs 21 | 22 | configs = ModelConfigs() 23 | 24 | data_path = "Datasets/90kDICT32px" 25 | val_annotation_path = data_path + "/annotation_val.txt" 26 | train_annotation_path = data_path + "/annotation_train.txt" 27 | 28 | # Read metadata file and parse it 29 | def read_annotation_file(annotation_path): 30 | dataset, vocab, max_len = [], set(), 0 31 | with open(annotation_path, "r") as f: 32 | for line in tqdm(f.readlines()): 33 | line = line.split() 34 | image_path = data_path + line[0][1:] 35 | label = line[0].split("_")[1] 36 | dataset.append([image_path, label]) 37 | vocab.update(list(label)) 38 | max_len = max(max_len, len(label)) 39 | return dataset, sorted(vocab), max_len 40 | 41 | train_dataset, train_vocab, max_train_len = read_annotation_file(train_annotation_path) 42 | val_dataset, val_vocab, max_val_len = read_annotation_file(val_annotation_path) 43 | 44 | # Save vocab and maximum text length to configs 45 | configs.vocab = "".join(train_vocab) 46 | configs.max_text_length = max(max_train_len, max_val_len) 47 | configs.save() 48 | 49 | # Create training data provider 50 | train_data_provider = DataProvider( 51 | dataset=train_dataset, 52 | skip_validation=True, 53 | batch_size=configs.batch_size, 54 | data_preprocessors=[ImageReader(CVImage)], 55 | transformers=[ 56 | ImageResizer(configs.width, configs.height), 57 | LabelIndexer(configs.vocab), 58 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)) 59 | ], 60 | ) 61 | 62 | # Create validation data provider 63 | val_data_provider = DataProvider( 64 | dataset=val_dataset, 65 | skip_validation=True, 66 | batch_size=configs.batch_size, 67 | data_preprocessors=[ImageReader(CVImage)], 68 | transformers=[ 69 | ImageResizer(configs.width, configs.height), 70 | LabelIndexer(configs.vocab), 71 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)) 72 | ], 73 | ) 74 | 75 | model = train_model( 76 | input_dim = (configs.height, configs.width, 3), 77 | output_dim = len(configs.vocab), 78 | ) 79 | # Compile the model and print summary 80 | model.compile( 81 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 82 | loss=CTCloss(), 83 | metrics=[CWERMetric()], 
84 | run_eagerly=False 85 | ) 86 | model.summary(line_length=110) 87 | 88 | # Define path to save the model 89 | os.makedirs(configs.model_path, exist_ok=True) 90 | 91 | # Define callbacks 92 | earlystopper = EarlyStopping(monitor="val_CER", patience=10, verbose=1) 93 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 94 | trainLogger = TrainLogger(configs.model_path) 95 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 96 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="auto") 97 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 98 | 99 | # Train the model 100 | model.fit( 101 | train_data_provider, 102 | validation_data=val_data_provider, 103 | epochs=configs.train_epochs, 104 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 105 | workers=configs.train_workers 106 | ) 107 | 108 | # Save training and validation datasets as csv files 109 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 110 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/02_captcha_to_text/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join("Models/02_captcha_to_text", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 11 | self.vocab = "" 12 | self.height = 50 13 | self.width = 200 14 | self.max_text_length = 0 15 | self.batch_size = 64 16 | self.learning_rate = 1e-3 17 | self.train_epochs = 1000 18 | self.train_workers = 20 -------------------------------------------------------------------------------- /Tutorials/02_captcha_to_text/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer 7 | 8 | class ImageToWordModel(OnnxInferenceModel): 9 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | self.char_list = char_list 12 | 13 | def predict(self, image: np.ndarray): 14 | image = cv2.resize(image, self.input_shapes[0][1:3][::-1]) 15 | 16 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 17 | 18 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 19 | 20 | text = ctc_decoder(preds, self.char_list)[0] 21 | 22 | return text 23 | 24 | if __name__ == "__main__": 25 | import pandas as pd 26 | from tqdm import tqdm 27 | from mltu.configs import BaseModelConfigs 28 | 29 | configs = BaseModelConfigs.load("Models/02_captcha_to_text/202212211205/configs.yaml") 30 | 31 | model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab) 32 | 33 | df = pd.read_csv("Models/02_captcha_to_text/202212211205/val.csv").values.tolist() 34 | 35 | accum_cer = [] 36 | for image_path, label in tqdm(df): 37 | image = cv2.imread(image_path.replace("\\", "/")) 38 | 39 | prediction_text = model.predict(image) 40 | 41 | cer = get_cer(prediction_text, label) 42 | print(f"Image: {image_path}, 
Label: {label}, Prediction: {prediction_text}, CER: {cer}") 43 | 44 | accum_cer.append(cer) 45 | 46 | print(f"Average CER: {np.average(accum_cer)}") -------------------------------------------------------------------------------- /Tutorials/02_captcha_to_text/model.py: -------------------------------------------------------------------------------- 1 | from keras import layers 2 | from keras.models import Model 3 | 4 | from mltu.tensorflow.model_utils import residual_block 5 | 6 | 7 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 8 | 9 | inputs = layers.Input(shape=input_dim, name="input") 10 | 11 | # normalize images here instead in preprocessing step 12 | input = layers.Lambda(lambda x: x / 255)(inputs) 13 | 14 | x1 = residual_block(input, 16, activation=activation, skip_conv=True, strides=1, dropout=dropout) 15 | 16 | x2 = residual_block(x1, 16, activation=activation, skip_conv=True, strides=2, dropout=dropout) 17 | x3 = residual_block(x2, 16, activation=activation, skip_conv=False, strides=1, dropout=dropout) 18 | 19 | x4 = residual_block(x3, 32, activation=activation, skip_conv=True, strides=2, dropout=dropout) 20 | x5 = residual_block(x4, 32, activation=activation, skip_conv=False, strides=1, dropout=dropout) 21 | 22 | x6 = residual_block(x5, 64, activation=activation, skip_conv=True, strides=2, dropout=dropout) 23 | x7 = residual_block(x6, 32, activation=activation, skip_conv=True, strides=1, dropout=dropout) 24 | 25 | x8 = residual_block(x7, 64, activation=activation, skip_conv=True, strides=2, dropout=dropout) 26 | x9 = residual_block(x8, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 27 | 28 | squeezed = layers.Reshape((x9.shape[-3] * x9.shape[-2], x9.shape[-1]))(x9) 29 | 30 | blstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(squeezed) 31 | blstm = layers.Dropout(dropout)(blstm) 32 | 33 | output = layers.Dense(output_dim + 1, activation="softmax", name="output")(blstm) 34 | 35 | model = Model(inputs=inputs, outputs=output) 36 | return model 37 | -------------------------------------------------------------------------------- /Tutorials/02_captcha_to_text/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | 5 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 6 | 7 | from mltu.tensorflow.dataProvider import DataProvider 8 | from mltu.tensorflow.losses import CTCloss 9 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 10 | from mltu.tensorflow.metrics import CWERMetric 11 | 12 | from mltu.preprocessors import ImageReader 13 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding 14 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate 15 | from mltu.annotations.images import CVImage 16 | 17 | from model import train_model 18 | from configs import ModelConfigs 19 | 20 | import os 21 | from urllib.request import urlopen 22 | from io import BytesIO 23 | from zipfile import ZipFile 24 | 25 | 26 | def download_and_unzip(url, extract_to="Datasets"): 27 | http_response = urlopen(url) 28 | zipfile = ZipFile(BytesIO(http_response.read())) 29 | zipfile.extractall(path=extract_to) 30 | 31 | 32 | if not os.path.exists(os.path.join("Datasets", "captcha_images_v2")): 33 | 
download_and_unzip("https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip", 34 | extract_to="Datasets") 35 | 36 | # Create a list of all the images and labels in the dataset 37 | dataset, vocab, max_len = [], set(), 0 38 | captcha_path = os.path.join("Datasets", "captcha_images_v2") 39 | for file in os.listdir(captcha_path): 40 | file_path = os.path.join(captcha_path, file) 41 | label = os.path.splitext(file)[0] # Get the file name without the extension 42 | dataset.append([file_path, label]) 43 | vocab.update(list(label)) 44 | max_len = max(max_len, len(label)) 45 | 46 | configs = ModelConfigs() 47 | 48 | # Save vocab and maximum text length to configs 49 | configs.vocab = "".join(vocab) 50 | configs.max_text_length = max_len 51 | configs.save() 52 | 53 | # Create a data provider for the dataset 54 | data_provider = DataProvider( 55 | dataset=dataset, 56 | skip_validation=True, 57 | batch_size=configs.batch_size, 58 | data_preprocessors=[ImageReader(CVImage)], 59 | transformers=[ 60 | ImageResizer(configs.width, configs.height), 61 | LabelIndexer(configs.vocab), 62 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)) 63 | ], 64 | ) 65 | # Split the dataset into training and validation sets 66 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 67 | 68 | # Augment training data with random brightness, rotation and erode/dilate 69 | train_data_provider.augmentors = [RandomBrightness(), RandomRotate(), RandomErodeDilate()] 70 | 71 | # Creating TensorFlow model architecture 72 | model = train_model( 73 | input_dim = (configs.height, configs.width, 3), 74 | output_dim = len(configs.vocab), 75 | ) 76 | 77 | # Compile the model and print summary 78 | model.compile( 79 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 80 | loss=CTCloss(), 81 | metrics=[CWERMetric(padding_token=len(configs.vocab))], 82 | run_eagerly=False 83 | ) 84 | model.summary(line_length=110) 85 | # Define path to save the model 86 | os.makedirs(configs.model_path, exist_ok=True) 87 | 88 | # Define callbacks 89 | earlystopper = EarlyStopping(monitor="val_CER", patience=50, verbose=1, mode="min") 90 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 91 | trainLogger = TrainLogger(configs.model_path) 92 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 93 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=20, verbose=1, mode="min") 94 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 95 | 96 | # Train the model 97 | model.fit( 98 | train_data_provider, 99 | validation_data=val_data_provider, 100 | epochs=configs.train_epochs, 101 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 102 | workers=configs.train_workers 103 | ) 104 | 105 | # Save training and validation datasets as csv files 106 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 107 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/03_handwriting_recognition/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | class ModelConfigs(BaseModelConfigs): 7 | def __init__(self): 8 | 
super().__init__() 9 | self.model_path = os.path.join("Models/03_handwriting_recognition", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 10 | self.vocab = "" 11 | self.height = 32 12 | self.width = 128 13 | self.max_text_length = 0 14 | self.batch_size = 16 15 | self.learning_rate = 0.0005 16 | self.train_epochs = 1000 17 | self.train_workers = 20 -------------------------------------------------------------------------------- /Tutorials/03_handwriting_recognition/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer 7 | 8 | class ImageToWordModel(OnnxInferenceModel): 9 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | self.char_list = char_list 12 | 13 | def predict(self, image: np.ndarray): 14 | image = cv2.resize(image, self.input_shapes[0][1:3][::-1]) 15 | 16 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 17 | 18 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 19 | 20 | text = ctc_decoder(preds, self.char_list)[0] 21 | 22 | return text 23 | 24 | if __name__ == "__main__": 25 | import pandas as pd 26 | from tqdm import tqdm 27 | from mltu.configs import BaseModelConfigs 28 | 29 | configs = BaseModelConfigs.load("Models/03_handwriting_recognition/202301111911/configs.yaml") 30 | 31 | model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab) 32 | 33 | df = pd.read_csv("Models/03_handwriting_recognition/202301111911/val.csv").values.tolist() 34 | 35 | accum_cer = [] 36 | for image_path, label in tqdm(df): 37 | image = cv2.imread(image_path.replace("\\", "/")) 38 | 39 | prediction_text = model.predict(image) 40 | 41 | cer = get_cer(prediction_text, label) 42 | print(f"Image: {image_path}, Label: {label}, Prediction: {prediction_text}, CER: {cer}") 43 | 44 | accum_cer.append(cer) 45 | 46 | # resize by 4x 47 | image = cv2.resize(image, (image.shape[1] * 4, image.shape[0] * 4)) 48 | cv2.imshow("Image", image) 49 | cv2.waitKey(0) 50 | cv2.destroyAllWindows() 51 | 52 | print(f"Average CER: {np.average(accum_cer)}") -------------------------------------------------------------------------------- /Tutorials/03_handwriting_recognition/model.py: -------------------------------------------------------------------------------- 1 | from keras import layers 2 | from keras.models import Model 3 | 4 | from mltu.tensorflow.model_utils import residual_block 5 | 6 | 7 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 8 | 9 | inputs = layers.Input(shape=input_dim, name="input") 10 | 11 | # normalize images here instead in preprocessing step 12 | input = layers.Lambda(lambda x: x / 255)(inputs) 13 | 14 | x1 = residual_block(input, 16, activation=activation, skip_conv=True, strides=1, dropout=dropout) 15 | 16 | x2 = residual_block(x1, 16, activation=activation, skip_conv=True, strides=2, dropout=dropout) 17 | x3 = residual_block(x2, 16, activation=activation, skip_conv=False, strides=1, dropout=dropout) 18 | 19 | x4 = residual_block(x3, 32, activation=activation, skip_conv=True, strides=2, dropout=dropout) 20 | x5 = residual_block(x4, 32, activation=activation, skip_conv=False, strides=1, dropout=dropout) 21 | 22 | x6 = residual_block(x5, 64, activation=activation, skip_conv=True, strides=2, dropout=dropout) 23 | x7 = 
residual_block(x6, 64, activation=activation, skip_conv=True, strides=1, dropout=dropout) 24 | 25 | x8 = residual_block(x7, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 26 | x9 = residual_block(x8, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 27 | 28 | squeezed = layers.Reshape((x9.shape[-3] * x9.shape[-2], x9.shape[-1]))(x9) 29 | 30 | blstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(squeezed) 31 | blstm = layers.Dropout(dropout)(blstm) 32 | 33 | output = layers.Dense(output_dim + 1, activation="softmax", name="output")(blstm) 34 | 35 | model = Model(inputs=inputs, outputs=output) 36 | return model 37 | -------------------------------------------------------------------------------- /Tutorials/03_handwriting_recognition/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | 5 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 6 | 7 | from mltu.preprocessors import ImageReader 8 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2 9 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen 10 | from mltu.annotations.images import CVImage 11 | 12 | from mltu.tensorflow.dataProvider import DataProvider 13 | from mltu.tensorflow.losses import CTCloss 14 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 15 | from mltu.tensorflow.metrics import CWERMetric 16 | 17 | from model import train_model 18 | from configs import ModelConfigs 19 | 20 | import os 21 | import tarfile 22 | from tqdm import tqdm 23 | from urllib.request import urlopen 24 | from io import BytesIO 25 | from zipfile import ZipFile 26 | 27 | 28 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 29 | http_response = urlopen(url) 30 | 31 | data = b"" 32 | iterations = http_response.length // chunk_size + 1 33 | for _ in tqdm(range(iterations)): 34 | data += http_response.read(chunk_size) 35 | 36 | zipfile = ZipFile(BytesIO(data)) 37 | zipfile.extractall(path=extract_to) 38 | 39 | dataset_path = os.path.join("Datasets", "IAM_Words") 40 | if not os.path.exists(dataset_path): 41 | download_and_unzip("https://git.io/J0fjL", extract_to="Datasets") 42 | 43 | file = tarfile.open(os.path.join(dataset_path, "words.tgz")) 44 | file.extractall(os.path.join(dataset_path, "words")) 45 | 46 | dataset, vocab, max_len = [], set(), 0 47 | 48 | # Preprocess the dataset by the specific IAM_Words dataset file structure 49 | words = open(os.path.join(dataset_path, "words.txt"), "r").readlines() 50 | for line in tqdm(words): 51 | if line.startswith("#"): 52 | continue 53 | 54 | line_split = line.split(" ") 55 | if line_split[1] == "err": 56 | continue 57 | 58 | folder1 = line_split[0][:3] 59 | folder2 = "-".join(line_split[0].split("-")[:2]) 60 | file_name = line_split[0] + ".png" 61 | label = line_split[-1].rstrip("\n") 62 | 63 | rel_path = os.path.join(dataset_path, "words", folder1, folder2, file_name) 64 | if not os.path.exists(rel_path): 65 | print(f"File not found: {rel_path}") 66 | continue 67 | 68 | dataset.append([rel_path, label]) 69 | vocab.update(list(label)) 70 | max_len = max(max_len, len(label)) 71 | 72 | # Create a ModelConfigs object to store model configurations 73 | configs = ModelConfigs() 74 | 75 | # 
Save vocab and maximum text length to configs 76 | configs.vocab = "".join(vocab) 77 | configs.max_text_length = max_len 78 | configs.save() 79 | 80 | # Create a data provider for the dataset 81 | data_provider = DataProvider( 82 | dataset=dataset, 83 | skip_validation=True, 84 | batch_size=configs.batch_size, 85 | data_preprocessors=[ImageReader(CVImage)], 86 | transformers=[ 87 | ImageResizer(configs.width, configs.height, keep_aspect_ratio=False), 88 | LabelIndexer(configs.vocab), 89 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)), 90 | ], 91 | ) 92 | 93 | # Split the dataset into training and validation sets 94 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 95 | 96 | # Augment training data with random brightness, rotation and erode/dilate 97 | train_data_provider.augmentors = [ 98 | RandomBrightness(), 99 | RandomErodeDilate(), 100 | RandomSharpen(), 101 | RandomRotate(angle=10), 102 | ] 103 | 104 | # Creating TensorFlow model architecture 105 | model = train_model( 106 | input_dim = (configs.height, configs.width, 3), 107 | output_dim = len(configs.vocab), 108 | ) 109 | 110 | # Compile the model and print summary 111 | model.compile( 112 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 113 | loss=CTCloss(), 114 | metrics=[CWERMetric(padding_token=len(configs.vocab))], 115 | ) 116 | model.summary(line_length=110) 117 | 118 | # Define callbacks 119 | earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1) 120 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 121 | trainLogger = TrainLogger(configs.model_path) 122 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 123 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=10, verbose=1, mode="auto") 124 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 125 | 126 | # Train the model 127 | model.fit( 128 | train_data_provider, 129 | validation_data=val_data_provider, 130 | epochs=configs.train_epochs, 131 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 132 | workers=configs.train_workers 133 | ) 134 | 135 | # Save training and validation datasets as csv files 136 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 137 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/README.md: -------------------------------------------------------------------------------- 1 | # Handwritten sentence recognition with TensorFlow 2 | ## Unlock the power of handwritten sentence recognition with TensorFlow and CTC loss. From digitizing notes to transcribing historical documents and automating exam grading 3 | 4 | 5 | ## **Detailed tutorial**: 6 | ## [Handwritten sentence recognition with TensorFlow](https://pylessons.com/handwritten-sentence-recognition) 7 | 8 |
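The model in this tutorial is trained with CTC loss (the `train.py` below compiles the model with `CTCloss` from `mltu.tensorflow.losses`). As a minimal illustrative sketch only — not mltu's exact implementation — a Keras-compatible CTC loss built on `tf.keras.backend.ctc_batch_cost` can look like this, assuming labels are padded to a fixed length:

```python
import tensorflow as tf

class CTCLoss(tf.keras.losses.Loss):
    """CTC loss for a (batch, time, vocab+1) softmax output and padded integer labels."""

    def __init__(self, name: str = "CTCLoss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        # Sequence lengths: full output time dimension and full (padded) label length
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # Broadcast the (identical) lengths to one entry per batch element
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
```

A loss of this shape is passed to `model.compile(loss=..., ...)` together with the CER/WER metrics, the same way the training script below passes mltu's `CTCloss()`.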

9 | 10 |

-------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | class ModelConfigs(BaseModelConfigs): 7 | def __init__(self): 8 | super().__init__() 9 | self.model_path = os.path.join("Models/04_sentence_recognition", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 10 | self.vocab = "" 11 | self.height = 96 12 | self.width = 1408 13 | self.max_text_length = 0 14 | self.batch_size = 32 15 | self.learning_rate = 0.0005 16 | self.train_epochs = 1000 17 | self.train_workers = 20 -------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer 7 | from mltu.transformers import ImageResizer 8 | 9 | class ImageToWordModel(OnnxInferenceModel): 10 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | self.char_list = char_list 13 | 14 | def predict(self, image: np.ndarray): 15 | image = ImageResizer.resize_maintaining_aspect_ratio(image, *self.input_shapes[0][1:3][::-1]) 16 | 17 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 18 | 19 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 20 | 21 | text = ctc_decoder(preds, self.char_list)[0] 22 | 23 | return text 24 | 25 | if __name__ == "__main__": 26 | import pandas as pd 27 | from tqdm import tqdm 28 | from mltu.configs import BaseModelConfigs 29 | 30 | configs = BaseModelConfigs.load("Models/04_sentence_recognition/202301131202/configs.yaml") 31 | 32 | model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab) 33 | 34 | df = pd.read_csv("Models/04_sentence_recognition/202301131202/val.csv").values.tolist() 35 | 36 | accum_cer, accum_wer = [], [] 37 | for image_path, label in tqdm(df): 38 | image = cv2.imread(image_path.replace("\\", "/")) 39 | 40 | prediction_text = model.predict(image) 41 | 42 | cer = get_cer(prediction_text, label) 43 | wer = get_wer(prediction_text, label) 44 | print("Image: ", image_path) 45 | print("Label:", label) 46 | print("Prediction: ", prediction_text) 47 | print(f"CER: {cer}; WER: {wer}") 48 | 49 | accum_cer.append(cer) 50 | accum_wer.append(wer) 51 | 52 | cv2.imshow(prediction_text, image) 53 | cv2.waitKey(0) 54 | cv2.destroyAllWindows() 55 | 56 | print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}") -------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/model.py: -------------------------------------------------------------------------------- 1 | from keras import layers 2 | from keras.models import Model 3 | 4 | from mltu.tensorflow.model_utils import residual_block 5 | 6 | 7 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 8 | 9 | inputs = layers.Input(shape=input_dim, name="input") 10 | 11 | # normalize images here instead in preprocessing step 12 | input = layers.Lambda(lambda x: x / 255)(inputs) 13 | 14 | x1 = residual_block(input, 32, activation=activation, skip_conv=True, strides=1, 
dropout=dropout) 15 | 16 | x2 = residual_block(x1, 32, activation=activation, skip_conv=True, strides=2, dropout=dropout) 17 | x3 = residual_block(x2, 32, activation=activation, skip_conv=False, strides=1, dropout=dropout) 18 | 19 | x4 = residual_block(x3, 64, activation=activation, skip_conv=True, strides=2, dropout=dropout) 20 | x5 = residual_block(x4, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 21 | 22 | x6 = residual_block(x5, 128, activation=activation, skip_conv=True, strides=2, dropout=dropout) 23 | x7 = residual_block(x6, 128, activation=activation, skip_conv=True, strides=1, dropout=dropout) 24 | 25 | x8 = residual_block(x7, 128, activation=activation, skip_conv=True, strides=2, dropout=dropout) 26 | x9 = residual_block(x8, 128, activation=activation, skip_conv=False, strides=1, dropout=dropout) 27 | 28 | squeezed = layers.Reshape((x9.shape[-3] * x9.shape[-2], x9.shape[-1]))(x9) 29 | 30 | blstm = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(squeezed) 31 | blstm = layers.Dropout(dropout)(blstm) 32 | 33 | blstm = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(blstm) 34 | blstm = layers.Dropout(dropout)(blstm) 35 | 36 | output = layers.Dense(output_dim + 1, activation="softmax", name="output")(blstm) 37 | 38 | model = Model(inputs=inputs, outputs=output) 39 | return model 40 | -------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | 5 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 6 | 7 | from mltu.preprocessors import ImageReader 8 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2 9 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen 10 | from mltu.annotations.images import CVImage 11 | 12 | from mltu.tensorflow.dataProvider import DataProvider 13 | from mltu.tensorflow.losses import CTCloss 14 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 15 | from mltu.tensorflow.metrics import CERMetric, WERMetric 16 | 17 | from model import train_model 18 | from configs import ModelConfigs 19 | 20 | import os 21 | from tqdm import tqdm 22 | 23 | # Must download and extract datasets manually from https://fki.tic.heia-fr.ch/databases/download-the-iam-handwriting-database to Datasets\IAM_Sentences 24 | sentences_txt_path = os.path.join("Datasets", "IAM_Sentences", "ascii", "sentences.txt") 25 | sentences_folder_path = os.path.join("Datasets", "IAM_Sentences", "sentences") 26 | 27 | dataset, vocab, max_len = [], set(), 0 28 | words = open(sentences_txt_path, "r").readlines() 29 | for line in tqdm(words): 30 | if line.startswith("#"): 31 | continue 32 | 33 | line_split = line.split(" ") 34 | if line_split[2] == "err": 35 | continue 36 | 37 | folder1 = line_split[0][:3] 38 | folder2 = "-".join(line_split[0].split("-")[:2]) 39 | file_name = line_split[0] + ".png" 40 | label = line_split[-1].rstrip("\n") 41 | 42 | # replace "|" with " " in label 43 | label = label.replace("|", " ") 44 | 45 | rel_path = os.path.join(sentences_folder_path, folder1, folder2, file_name) 46 | if not os.path.exists(rel_path): 47 | print(f"File not found: {rel_path}") 48 | continue 49 | 50 | 
dataset.append([rel_path, label]) 51 | vocab.update(list(label)) 52 | max_len = max(max_len, len(label)) 53 | 54 | # Create a ModelConfigs object to store model configurations 55 | configs = ModelConfigs() 56 | 57 | # Save vocab and maximum text length to configs 58 | configs.vocab = "".join(vocab) 59 | configs.max_text_length = max_len 60 | configs.save() 61 | 62 | # Create a data provider for the dataset 63 | data_provider = DataProvider( 64 | dataset=dataset, 65 | skip_validation=True, 66 | batch_size=configs.batch_size, 67 | data_preprocessors=[ImageReader(CVImage)], 68 | transformers=[ 69 | ImageResizer(configs.width, configs.height, keep_aspect_ratio=True), 70 | LabelIndexer(configs.vocab), 71 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)), 72 | ], 73 | ) 74 | 75 | # Split the dataset into training and validation sets 76 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 77 | 78 | # Augment training data with random brightness, rotation and erode/dilate 79 | train_data_provider.augmentors = [ 80 | RandomBrightness(), 81 | RandomErodeDilate(), 82 | RandomSharpen(), 83 | ] 84 | 85 | # Creating TensorFlow model architecture 86 | model = train_model( 87 | input_dim = (configs.height, configs.width, 3), 88 | output_dim = len(configs.vocab), 89 | ) 90 | 91 | # Compile the model and print summary 92 | model.compile( 93 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 94 | loss=CTCloss(), 95 | metrics=[ 96 | CERMetric(vocabulary=configs.vocab), 97 | WERMetric(vocabulary=configs.vocab) 98 | ], 99 | run_eagerly=False 100 | ) 101 | model.summary(line_length=110) 102 | 103 | # Define callbacks 104 | earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min") 105 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 106 | trainLogger = TrainLogger(configs.model_path) 107 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 108 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="auto") 109 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 110 | 111 | # Train the model 112 | model.fit( 113 | train_data_provider, 114 | validation_data=val_data_provider, 115 | epochs=configs.train_epochs, 116 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 117 | workers=configs.train_workers 118 | ) 119 | 120 | # Save training and validation datasets as csv files 121 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 122 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to speech recognition with TensorFlow 2 | ## Master the basics of speech recognition with TensorFlow: Learn how to build and train models, implement real-time audio recognition, and develop practical applications 3 | 4 | 5 | ## **Detailed tutorial**: 6 | ## [Introduction to speech recognition with TensorFlow](https://pylessons.com/speech-recognition) 7 | 8 |
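This tutorial feeds the network spectrograms rather than raw waveforms: `WavReader.get_spectrogram` (used in `train.py` and `inferenceModel.py` below) is parameterized by the `frame_length`, `frame_step` and `fft_length` values from `configs.py`. As a rough sketch of that preprocessing step — an illustrative `tf.signal` version written for this README, not mltu's actual implementation — the transformation looks roughly like this:

```python
import tensorflow as tf

def wav_to_spectrogram(wav_path: str, frame_length: int = 256, frame_step: int = 160, fft_length: int = 384):
    """Illustrative sketch: mono 16-bit PCM WAV -> normalized magnitude spectrogram (frames, fft_length // 2 + 1)."""
    # Load the WAV file as float32 samples in [-1, 1] and drop the channel axis (mono assumed)
    audio, _sample_rate = tf.audio.decode_wav(tf.io.read_file(wav_path))
    audio = tf.squeeze(audio, axis=-1)

    # Short-time Fourier transform -> magnitude spectrogram
    stft = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
    spectrogram = tf.math.pow(tf.abs(stft), 0.5)  # square-root compression of the dynamic range

    # Normalize each frame to zero mean and unit variance, as is common for CTC speech models
    means = tf.math.reduce_mean(spectrogram, axis=1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, axis=1, keepdims=True)
    return ((spectrogram - means) / (stddevs + 1e-10)).numpy()
```

The longest spectrogram found while scanning the dataset becomes `configs.max_spectrogram_length`, which the inference script below uses to zero-pad each spectrogram before running the exported ONNX model.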

9 | 10 |

-------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join("Models/05_sound_to_text", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 11 | self.frame_length = 256 12 | self.frame_step = 160 13 | self.fft_length = 384 14 | 15 | self.vocab = "abcdefghijklmnopqrstuvwxyz'?! " 16 | self.input_shape = None 17 | self.max_text_length = None 18 | self.max_spectrogram_length = None 19 | 20 | self.batch_size = 8 21 | self.learning_rate = 0.0005 22 | self.train_epochs = 1000 23 | self.train_workers = 20 -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import numpy as np 3 | 4 | from mltu.inferenceModel import OnnxInferenceModel 5 | from mltu.preprocessors import WavReader 6 | from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer 7 | 8 | class WavToTextModel(OnnxInferenceModel): 9 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | self.char_list = char_list 12 | 13 | def predict(self, data: np.ndarray): 14 | data_pred = np.expand_dims(data, axis=0) 15 | 16 | preds = self.model.run(self.output_names, {self.input_names[0]: data_pred})[0] 17 | 18 | text = ctc_decoder(preds, self.char_list)[0] 19 | 20 | return text 21 | 22 | if __name__ == "__main__": 23 | import pandas as pd 24 | from tqdm import tqdm 25 | from mltu.configs import BaseModelConfigs 26 | 27 | configs = BaseModelConfigs.load("Models/05_sound_to_text/202302051936/configs.yaml") 28 | 29 | model = WavToTextModel(model_path=configs.model_path, char_list=configs.vocab, force_cpu=False) 30 | 31 | df = pd.read_csv("Models/05_sound_to_text/202302051936/val.csv").values.tolist() 32 | 33 | accum_cer, accum_wer = [], [] 34 | for wav_path, label in tqdm(df): 35 | wav_path = wav_path.replace("\\", "/") 36 | spectrogram = WavReader.get_spectrogram(wav_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length) 37 | WavReader.plot_raw_audio(wav_path, label) 38 | 39 | padded_spectrogram = np.pad(spectrogram, ((0, configs.max_spectrogram_length - spectrogram.shape[0]),(0,0)), mode="constant", constant_values=0) 40 | 41 | WavReader.plot_spectrogram(spectrogram, label) 42 | 43 | text = model.predict(padded_spectrogram) 44 | 45 | true_label = "".join([l for l in label.lower() if l in configs.vocab]) 46 | 47 | cer = get_cer(text, true_label) 48 | wer = get_wer(text, true_label) 49 | 50 | accum_cer.append(cer) 51 | accum_wer.append(wer) 52 | 53 | print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}") -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from keras import layers 3 | from keras.models import Model 4 | 5 | from mltu.tensorflow.model_utils import residual_block, activation_layer 6 | 7 | 8 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 9 | 10 | 
inputs = layers.Input(shape=input_dim, name="input", dtype=tf.float32) 11 | 12 | # expand dims to add channel dimension 13 | input = layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(inputs) 14 | 15 | # Convolution layer 1 16 | x = layers.Conv2D(filters=32, kernel_size=[11, 41], strides=[2, 2], padding="same", use_bias=False)(input) 17 | x = layers.BatchNormalization()(x) 18 | x = activation_layer(x, activation="leaky_relu") 19 | 20 | # Convolution layer 2 21 | x = layers.Conv2D(filters=32, kernel_size=[11, 21], strides=[1, 2], padding="same", use_bias=False)(x) 22 | x = layers.BatchNormalization()(x) 23 | x = activation_layer(x, activation="leaky_relu") 24 | 25 | # Reshape the resulted volume to feed the RNNs layers 26 | x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x) 27 | 28 | # RNN layers 29 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 30 | x = layers.Dropout(dropout)(x) 31 | 32 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 33 | x = layers.Dropout(dropout)(x) 34 | 35 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 36 | x = layers.Dropout(dropout)(x) 37 | 38 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 39 | x = layers.Dropout(dropout)(x) 40 | 41 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 42 | 43 | # Dense layer 44 | x = layers.Dense(256)(x) 45 | x = activation_layer(x, activation="leaky_relu") 46 | x = layers.Dropout(dropout)(x) 47 | 48 | # Classification layer 49 | output = layers.Dense(output_dim + 1, activation="softmax", dtype=tf.float32)(x) 50 | 51 | model = Model(inputs=inputs, outputs=output) 52 | return model -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | 5 | import os 6 | import tarfile 7 | import pandas as pd 8 | from tqdm import tqdm 9 | from urllib.request import urlopen 10 | from io import BytesIO 11 | 12 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 13 | from mltu.preprocessors import WavReader 14 | 15 | from mltu.tensorflow.dataProvider import DataProvider 16 | from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding 17 | from mltu.tensorflow.losses import CTCloss 18 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 19 | from mltu.tensorflow.metrics import CERMetric, WERMetric 20 | 21 | from model import train_model 22 | from configs import ModelConfigs 23 | 24 | 25 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 26 | http_response = urlopen(url) 27 | 28 | data = b"" 29 | iterations = http_response.length // chunk_size + 1 30 | for _ in tqdm(range(iterations)): 31 | data += http_response.read(chunk_size) 32 | 33 | tarFile = tarfile.open(fileobj=BytesIO(data), mode="r|bz2") 34 | tarFile.extractall(path=extract_to) 35 | tarFile.close() 36 | 37 | 38 | dataset_path = os.path.join("Datasets", "LJSpeech-1.1") 39 | if not os.path.exists(dataset_path): 40 | download_and_unzip("https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", extract_to="Datasets") 41 | 42 | dataset_path = "Datasets/LJSpeech-1.1" 43 | metadata_path = dataset_path + "/metadata.csv" 44 | wavs_path = dataset_path + "/wavs/" 45 | 46 | # Read 
metadata file and parse it 47 | metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3) 48 | metadata_df.columns = ["file_name", "transcription", "normalized_transcription"] 49 | metadata_df = metadata_df[["file_name", "normalized_transcription"]] 50 | 51 | # structure the dataset where each row is a list of [wav_file_path, sound transcription] 52 | dataset = [[f"Datasets/LJSpeech-1.1/wavs/{file}.wav", label.lower()] for file, label in metadata_df.values.tolist()] 53 | 54 | # Create a ModelConfigs object to store model configurations 55 | configs = ModelConfigs() 56 | 57 | max_text_length, max_spectrogram_length = 0, 0 58 | for file_path, label in tqdm(dataset): 59 | spectrogram = WavReader.get_spectrogram(file_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length) 60 | valid_label = [c for c in label if c in configs.vocab] 61 | max_text_length = max(max_text_length, len(valid_label)) 62 | max_spectrogram_length = max(max_spectrogram_length, spectrogram.shape[0]) 63 | configs.input_shape = [max_spectrogram_length, spectrogram.shape[1]] 64 | 65 | configs.max_spectrogram_length = max_spectrogram_length 66 | configs.max_text_length = max_text_length 67 | configs.save() 68 | 69 | # Create a data provider for the dataset 70 | data_provider = DataProvider( 71 | dataset=dataset, 72 | skip_validation=True, 73 | batch_size=configs.batch_size, 74 | data_preprocessors=[ 75 | WavReader(frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length), 76 | ], 77 | transformers=[ 78 | SpectrogramPadding(max_spectrogram_length=configs.max_spectrogram_length, padding_value=0), 79 | LabelIndexer(configs.vocab), 80 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)), 81 | ], 82 | ) 83 | 84 | # Split the dataset into training and validation sets 85 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 86 | 87 | # Creating TensorFlow model architecture 88 | model = train_model( 89 | input_dim = configs.input_shape, 90 | output_dim = len(configs.vocab), 91 | dropout=0.5 92 | ) 93 | 94 | # Compile the model and print summary 95 | model.compile( 96 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 97 | loss=CTCloss(), 98 | metrics=[ 99 | CERMetric(vocabulary=configs.vocab), 100 | WERMetric(vocabulary=configs.vocab) 101 | ], 102 | run_eagerly=False 103 | ) 104 | model.summary(line_length=110) 105 | 106 | # Define callbacks 107 | earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min") 108 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 109 | trainLogger = TrainLogger(configs.model_path) 110 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 111 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.8, min_delta=1e-10, patience=5, verbose=1, mode="auto") 112 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 113 | 114 | # Train the model 115 | model.fit( 116 | train_data_provider, 117 | validation_data=val_data_provider, 118 | epochs=configs.train_epochs, 119 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 120 | workers=configs.train_workers 121 | ) 122 | 123 | # Save training and validation datasets as csv files 124 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 125 | 
val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) 126 | -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/train_no_limit.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | tf.keras.mixed_precision.set_global_policy('mixed_float16') # mixed precission training for faster training time 5 | 6 | import os 7 | import tarfile 8 | import pandas as pd 9 | from tqdm import tqdm 10 | from urllib.request import urlopen 11 | from io import BytesIO 12 | 13 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 14 | from mltu.preprocessors import WavReader 15 | 16 | from mltu.tensorflow.dataProvider import DataProvider 17 | from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding 18 | from mltu.tensorflow.losses import CTCloss 19 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 20 | from mltu.tensorflow.metrics import CERMetric, WERMetric 21 | 22 | from model import train_model 23 | from configs import ModelConfigs 24 | 25 | 26 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 27 | http_response = urlopen(url) 28 | 29 | data = b"" 30 | iterations = http_response.length // chunk_size + 1 31 | for _ in tqdm(range(iterations)): 32 | data += http_response.read(chunk_size) 33 | 34 | tarFile = tarfile.open(fileobj=BytesIO(data), mode="r|bz2") 35 | tarFile.extractall(path=extract_to) 36 | tarFile.close() 37 | 38 | 39 | dataset_path = os.path.join("Datasets", "LJSpeech-1.1") 40 | if not os.path.exists(dataset_path): 41 | download_and_unzip("https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", extract_to="Datasets") 42 | 43 | dataset_path = "Datasets/LJSpeech-1.1" 44 | metadata_path = dataset_path + "/metadata.csv" 45 | wavs_path = dataset_path + "/wavs/" 46 | 47 | # Read metadata file and parse it 48 | metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3) 49 | metadata_df.columns = ["file_name", "transcription", "normalized_transcription"] 50 | metadata_df = metadata_df[["file_name", "normalized_transcription"]] 51 | 52 | # structure the dataset where each row is a list of [wav_file_path, sound transcription] 53 | dataset = [[f"Datasets/LJSpeech-1.1/wavs/{file}.wav", label.lower()] for file, label in metadata_df.values.tolist()] 54 | 55 | # Create a ModelConfigs object to store model configurations 56 | configs = ModelConfigs() 57 | configs.save() 58 | 59 | # Create a data provider for the dataset 60 | data_provider = DataProvider( 61 | dataset=dataset, 62 | skip_validation=True, 63 | batch_size=configs.batch_size, 64 | data_preprocessors=[ 65 | WavReader(frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length), 66 | ], 67 | transformers=[ 68 | LabelIndexer(configs.vocab), 69 | ], 70 | batch_postprocessors=[ 71 | SpectrogramPadding(padding_value=0, use_on_batch=True), 72 | LabelPadding(padding_value=len(configs.vocab), use_on_batch=True), 73 | ], 74 | ) 75 | 76 | # Split the dataset into training and validation sets 77 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 78 | 79 | # Creating TensorFlow model architecture 80 | model = train_model( 81 | input_dim = (None, 193), 82 | output_dim = len(configs.vocab), 83 | dropout=0.5 84 | ) 85 | 
86 | # Compile the model and print summary 87 | model.compile( 88 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 89 | loss=CTCloss(), 90 | metrics=[ 91 | CERMetric(vocabulary=configs.vocab), 92 | WERMetric(vocabulary=configs.vocab) 93 | ], 94 | run_eagerly=False 95 | ) 96 | model.summary(line_length=110) 97 | 98 | # Define callbacks 99 | earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min") 100 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 101 | trainLogger = TrainLogger(configs.model_path) 102 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 103 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.8, min_delta=1e-10, patience=5, verbose=1, mode="auto") 104 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 105 | 106 | # Train the model 107 | model.fit( 108 | train_data_provider, 109 | validation_data=val_data_provider, 110 | epochs=configs.train_epochs, 111 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 112 | workers=configs.train_workers, 113 | ) 114 | 115 | # Save training and validation datasets as csv files 116 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 117 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to PyTorch in a practical way 2 | ## In this tutorial, I'll cover the basics of PyTorch, how to prepare a dataset, construct the network, define training and validation loops, save the model and finally test the saved model

3 | 4 | # **Detailed tutorial**: 5 | ## [Introduction to PyTorch in a practical way](https://pylessons.com/pytorch-introduction) 6 | 7 |

8 | 9 |
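The train.py in this tutorial batches the MNIST arrays by hand with NumPy slicing, which keeps the mechanics visible. For comparison, here is a minimal sketch of the same batching done with torch.utils.data instead; this is an alternative illustration using placeholder arrays in the shapes train.py produces, not the approach the tutorial itself takes.

```python
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader

# Placeholder arrays standing in for the downloaded MNIST data:
# images of shape (N, 1, 28, 28) scaled to [0, 1] and integer labels of shape (N,)
images = np.random.rand(512, 1, 28, 28).astype(np.float32)
labels = np.random.randint(0, 10, size=(512,))

dataset = TensorDataset(torch.from_numpy(images), torch.from_numpy(labels).long())
loader = DataLoader(dataset, batch_size=64, shuffle=True)

for data, target in loader:
    # Each iteration yields tensors of shape (64, 1, 28, 28) and (64,), ready for Net() from model.py
    break
```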

-------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # Define the model architecture 5 | class Net(nn.Module): 6 | def __init__(self): 7 | super(Net, self).__init__() 8 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 9 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 10 | self.conv2_drop = nn.Dropout2d() 11 | self.fc1 = nn.Linear(320, 50) 12 | self.fc2 = nn.Linear(50, 10) 13 | 14 | def forward(self, x): 15 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 16 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 17 | x = x.view(-1, 320) 18 | x = F.relu(self.fc1(x)) 19 | x = F.dropout(x, training=self.training) 20 | x = self.fc2(x) 21 | x = F.log_softmax(x, dim=1) 22 | return x -------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | opencv-python 3 | tqdm 4 | torch 5 | torchsummary -------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import torch 4 | import numpy as np 5 | import requests, gzip, os, hashlib 6 | 7 | from model import Net 8 | 9 | path = "Datasets/mnist" # Path where to save the downloaded mnist dataset 10 | 11 | def fetch(url): 12 | if os.path.exists(path) is False: 13 | os.makedirs(path) 14 | 15 | fp = os.path.join(path, hashlib.md5(url.encode("utf-8")).hexdigest()) 16 | if os.path.isfile(fp): 17 | with open(fp, "rb") as f: 18 | data = f.read() 19 | else: 20 | with open(fp, "wb") as f: 21 | data = requests.get(url).content 22 | f.write(data) 23 | return np.frombuffer(gzip.decompress(data), dtype=np.uint8).copy() 24 | 25 | test_data = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 26 | test_targets = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:] 27 | 28 | # output path 29 | model_path = "Model/06_pytorch_introduction" 30 | 31 | # construct network and load weights 32 | network = Net() 33 | network.load_state_dict(torch.load("Models/06_pytorch_introduction/model.pt")) 34 | network.eval() # set to evaluation mode 35 | 36 | # loop over test images 37 | for test_image, test_target in zip(test_data, test_targets): 38 | 39 | # normalize image and convert to tensor 40 | inference_image = torch.from_numpy(test_image).float() / 255.0 41 | inference_image = inference_image.unsqueeze(0).unsqueeze(0) 42 | 43 | # predict 44 | output = network(inference_image) 45 | pred = output.argmax(dim=1, keepdim=True) 46 | prediction = str(pred.item()) 47 | 48 | test_image = cv2.resize(test_image, (400, 400)) 49 | cv2.imshow(prediction, test_image) 50 | key = cv2.waitKey(0) 51 | if key == ord("q"): # break on q key 52 | break 53 | 54 | cv2.destroyAllWindows() 55 | -------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from tqdm import tqdm 5 | import requests, gzip, os, hashlib 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as 
optim 10 | from torchsummary import summary 11 | 12 | from model import Net 13 | 14 | # define path to store dataset 15 | path = "Datasets/mnist" 16 | 17 | def fetch(url): 18 | if os.path.exists(path) is False: 19 | os.makedirs(path) 20 | 21 | fp = os.path.join(path, hashlib.md5(url.encode("utf-8")).hexdigest()) 22 | if os.path.isfile(fp): 23 | with open(fp, "rb") as f: 24 | data = f.read() 25 | else: 26 | with open(fp, "wb") as f: 27 | data = requests.get(url).content 28 | f.write(data) 29 | return np.frombuffer(gzip.decompress(data), dtype=np.uint8).copy() 30 | 31 | # load mnist dataset from yann.lecun.com, train data is of shape (60000, 28, 28) and targets are of shape (60000) 32 | train_data = fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 33 | train_targets = fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")[8:] 34 | test_data = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 35 | test_targets = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:] 36 | 37 | # uncomment to show images from dataset using OpenCV 38 | # for train_image, train_target in zip(train_data, train_targets): 39 | # train_image = cv2.resize(train_image, (400, 400)) 40 | # cv2.imshow("Image", train_image) 41 | # # if Q button break this loop 42 | # if cv2.waitKey(0) & 0xFF == ord("q"): 43 | # break 44 | # cv2.destroyAllWindows() 45 | 46 | # define training hyperparameters 47 | n_epochs = 5 48 | batch_size_train = 64 49 | batch_size_test = 64 50 | learning_rate = 0.001 51 | 52 | # reshape data to (items, channels, height, width) and normalize to [0, 1] 53 | train_data = np.expand_dims(train_data, axis=1) / 255.0 54 | test_data = np.expand_dims(test_data, axis=1) / 255.0 55 | 56 | # split data into batches of size [(batch_size, 1, 28, 28) ...] 57 | train_batches = [np.array(train_data[i:i+batch_size_train]) for i in range(0, len(train_data), batch_size_train)] 58 | # split targets into batches of size [(batch_size) ...] 
59 | train_target_batches = [np.array(train_targets[i:i+batch_size_train]) for i in range(0, len(train_targets), batch_size_train)] 60 | 61 | test_batches = [np.array(test_data[i:i+batch_size_test]) for i in range(0, len(test_data), batch_size_test)] 62 | test_target_batches = [np.array(test_targets[i:i+batch_size_test]) for i in range(0, len(test_targets), batch_size_test)] 63 | 64 | # create network 65 | network = Net() 66 | 67 | # uncomment to print network summary 68 | summary(network, (1, 28, 28), device="cpu") 69 | 70 | # define loss function and optimizer 71 | optimizer = optim.Adam(network.parameters(), lr=learning_rate) 72 | loss_function = nn.CrossEntropyLoss() 73 | 74 | # create training loop 75 | def train(epoch): 76 | # set network to training mode 77 | network.train() 78 | 79 | loss_sum = 0 80 | # create a progress bar 81 | train_pbar = tqdm(zip(train_batches, train_target_batches), total=len(train_batches)) 82 | for index, (data, target) in enumerate(train_pbar, start=1): 83 | 84 | # convert data to torch.FloatTensor 85 | data = torch.from_numpy(data).float() 86 | target = torch.from_numpy(target).long() 87 | 88 | # zero the parameter gradients 89 | optimizer.zero_grad() 90 | 91 | # forward + backward + optimize 92 | output = network(data) 93 | loss = loss_function(output, target) 94 | loss.backward() 95 | optimizer.step() 96 | 97 | # update progress bar with loss value 98 | loss_sum += loss.item() 99 | train_pbar.set_description(f"Epoch {epoch}, loss: {loss_sum / index:.4f}") 100 | 101 | # create testing loop 102 | def test(epoch): 103 | # set network to evaluation mode 104 | network.eval() 105 | 106 | correct, loss_sum = 0, 0 107 | # create progress bar 108 | val_pbar = tqdm(zip(test_batches, test_target_batches), total=len(test_batches)) 109 | with torch.no_grad(): 110 | for index, (data, target) in enumerate(val_pbar, start=1): 111 | 112 | # convert data to torch.FloatTensor 113 | data = torch.from_numpy(data).float() 114 | target = torch.from_numpy(target).long() 115 | 116 | # forward pass 117 | output = network(data) 118 | 119 | # update progress bar with loss and accuracy values 120 | loss_sum += loss_function(output, target).item() / target.size(0) 121 | pred = output.data.max(1, keepdim=True)[1] 122 | correct += pred.eq(target.data.view_as(pred)).sum() / target.size(0) 123 | 124 | val_pbar.set_description(f"val_loss: {loss_sum / index:.4f}, val_accuracy: {correct / index:.4f}") 125 | 126 | 127 | # train and test the model 128 | for epoch in range(1, n_epochs + 1): 129 | train(epoch) 130 | test(epoch) 131 | 132 | # define output path and create folder if not exists 133 | output_path = "Models/06_pytorch_introduction" 134 | if not os.path.exists(output_path): 135 | os.makedirs(output_path) 136 | 137 | # save model.pt to defined output path 138 | torch.save(network.state_dict(), os.path.join(output_path, "model.pt")) -------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/README.md: -------------------------------------------------------------------------------- 1 | # Using custom wrapper to simplify PyTorch models training pipeline 2 | ## I will introduce the PyTorch Wrapper in this tutorial, saving us time when developing the PyTorch models training pipeline. We'll be able to do this in blocks!

3 | 4 | # **Detailed tutorial**: 5 | ## [PyTorch Wrapper to Build and Train Networks](https://pylessons.com/pytorch-introduction) 6 | 7 |

8 | 9 |
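Concretely, the "blocks" are a DataProvider (batching plus per-sample preprocessing), a Model wrapper around the network, optimizer, loss and metrics, and a list of callbacks. The sketch below condenses this tutorial's train.py onto a tiny placeholder dataset to show how the blocks snap together; the real script feeds in MNIST and also passes verbose=1 to the callbacks.

```python
import numpy as np
import torch
import torch.optim as optim

from mltu.torch.dataProvider import DataProvider
from mltu.torch.model import Model
from mltu.torch.metrics import Accuracy
from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint

from model import Net  # the small MNIST CNN defined in model.py

# Tiny placeholder dataset of [image, label] pairs, just to show the wiring
data = [[np.random.randint(0, 255, (28, 28)).astype(np.uint8), int(np.random.randint(0, 10))] for _ in range(256)]

def preprocessor(image, label):
    # (28, 28) -> (1, 28, 28), scaled to [0, 1], exactly as train.py does
    return np.expand_dims(image, axis=0) / 255.0, label

train_provider = DataProvider(data[:192], data_preprocessors=[preprocessor], batch_size=64)
val_provider = DataProvider(data[192:], data_preprocessors=[preprocessor], batch_size=64)

network = Net()
model = Model(network, optim.Adam(network.parameters(), lr=0.001), torch.nn.CrossEntropyLoss(), metrics=[Accuracy()])

model.fit(
    train_provider,
    val_provider,
    epochs=10,
    callbacks=[
        EarlyStopping(monitor="val_accuracy", mode="max", patience=3),
        ModelCheckpoint("Models/07_pytorch_wrapper/model.pt", monitor="val_accuracy", mode="max", save_best_only=True),
    ],
)
```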

-------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # Define the model architecture 5 | class Net(nn.Module): 6 | def __init__(self): 7 | super(Net, self).__init__() 8 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 9 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 10 | self.conv2_drop = nn.Dropout2d() 11 | self.fc1 = nn.Linear(320, 50) 12 | self.fc2 = nn.Linear(50, 10) 13 | 14 | def forward(self, x): 15 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 16 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 17 | x = x.view(-1, 320) 18 | x = F.relu(self.fc1(x)) 19 | x = F.dropout(x, training=self.training) 20 | x = self.fc2(x) 21 | x = F.log_softmax(x, dim=1) 22 | return x -------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchsummary 3 | mltu==1.0.1 -------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import torch 4 | import numpy as np 5 | import requests, gzip, os, hashlib 6 | 7 | from model import Net 8 | 9 | path = "Datasets/mnist" # Path where to save the downloaded mnist dataset 10 | 11 | def fetch(url): 12 | if os.path.exists(path) is False: 13 | os.makedirs(path) 14 | 15 | fp = os.path.join(path, hashlib.md5(url.encode("utf-8")).hexdigest()) 16 | if os.path.isfile(fp): 17 | with open(fp, "rb") as f: 18 | data = f.read() 19 | else: 20 | with open(fp, "wb") as f: 21 | data = requests.get(url).content 22 | f.write(data) 23 | return np.frombuffer(gzip.decompress(data), dtype=np.uint8).copy() 24 | 25 | test_data = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 26 | test_targets = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:] 27 | 28 | # output path 29 | model_path = "Model/07_pytorch_wrapper" 30 | 31 | # construct network and load weights 32 | network = Net() 33 | network.load_state_dict(torch.load("Models/07_pytorch_wrapper/model.pt")) 34 | network.eval() # set to evaluation mode 35 | 36 | # loop over test images 37 | for test_image, test_target in zip(test_data, test_targets): 38 | 39 | # normalize image and convert to tensor 40 | inference_image = torch.from_numpy(test_image).float() / 255.0 41 | inference_image = inference_image.unsqueeze(0).unsqueeze(0) 42 | 43 | # predict 44 | output = network(inference_image) 45 | pred = output.argmax(dim=1, keepdim=True) 46 | prediction = str(pred.item()) 47 | 48 | test_image = cv2.resize(test_image, (400, 400)) 49 | cv2.imshow(prediction, test_image) 50 | key = cv2.waitKey(0) 51 | if key == ord("q"): # break on q key 52 | break 53 | 54 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import requests, gzip, os, hashlib 4 | 5 | import torch 6 | import torch.optim as optim 7 | 8 | from model import Net 9 | 10 | from mltu.torch.dataProvider import DataProvider 11 | from mltu.torch.model 
import Model 12 | from mltu.torch.metrics import Accuracy 13 | from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint 14 | 15 | # define path to store dataset 16 | path = "Datasets/data" 17 | 18 | def fetch(url): 19 | if os.path.exists(path) is False: 20 | os.makedirs(path) 21 | 22 | fp = os.path.join(path, hashlib.md5(url.encode("utf-8")).hexdigest()) 23 | if os.path.isfile(fp): 24 | with open(fp, "rb") as f: 25 | data = f.read() 26 | else: 27 | with open(fp, "wb") as f: 28 | data = requests.get(url).content 29 | f.write(data) 30 | return np.frombuffer(gzip.decompress(data), dtype=np.uint8).copy() 31 | 32 | # load mnist dataset from yann.lecun.com, train data is of shape (60000, 28, 28) and targets are of shape (60000) 33 | train_data = fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 34 | train_targets = fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")[8:] 35 | test_data = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 36 | test_targets = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:] 37 | 38 | train_dataset = [[data, target] for data, target in zip(train_data, train_targets)] 39 | test_dataset = [[data, target] for data, target in zip(test_data, test_targets)] 40 | 41 | def preprocessor(data, target): 42 | # original data is shape of (28, 28), expand to (1, 28, 28) and normalize to [0, 1] 43 | data = np.expand_dims(data, axis=0) / 255.0 44 | return data, target 45 | 46 | train_dataProvider = DataProvider( 47 | train_dataset, 48 | data_preprocessors=[preprocessor], 49 | batch_size=64, 50 | ) 51 | 52 | test_dataProvider = DataProvider( 53 | test_dataset, 54 | data_preprocessors=[preprocessor], 55 | batch_size=64 56 | ) 57 | 58 | # create network, optimizer and define loss function 59 | network = Net() 60 | optimizer = optim.Adam(network.parameters(), lr=0.001) 61 | loss = torch.nn.CrossEntropyLoss() 62 | 63 | # put on cuda device if available 64 | if torch.cuda.is_available(): 65 | network = network.cuda() 66 | 67 | # create callbacks 68 | earlyStopping = EarlyStopping( 69 | monitor="val_accuracy", 70 | patience=3, 71 | mode="max", 72 | verbose=1 73 | ) 74 | modelCheckpoint = ModelCheckpoint( 75 | "Models/07_pytorch_wrapper/model.pt", 76 | monitor="val_accuracy", 77 | mode="max", 78 | save_best_only=True, 79 | verbose=1 80 | ) 81 | 82 | # create model object that will handle training and testing of the network 83 | model = Model(network, optimizer, loss, metrics=[Accuracy()]) 84 | model.fit( 85 | train_dataProvider, 86 | test_dataProvider, 87 | epochs=100, 88 | callbacks=[earlyStopping, modelCheckpoint] 89 | ) -------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/README.md: -------------------------------------------------------------------------------- 1 | # Using custom wrapper to simplify PyTorch models training pipeline 2 | ### Construct an accurate handwriting recognition model with PyTorch! Understand how to use MLTU package, to simplify the PyTorch models training pipeline, and discover methods to enhance your model's accuracy!

3 | 4 | # **Detailed tutorial**: 5 | ### [Handwriting words recognition with PyTorch](https://pylessons.com/handwriting-recognition-pytorch) 6 | 7 |

8 | 9 |
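A note on how accuracy is measured in this tutorial: the scripts below score predictions with character and word error rates (CER/WER), i.e. the fraction of characters or words that have to be edited to turn the prediction into the label, so lower is better. A toy check with made-up strings, using the same mltu.utils.text_utils helpers the inference script imports from:

```python
from mltu.utils.text_utils import get_cer, get_wer

# One wrong character out of five characters, one wrong word out of two words
print(get_cer("helko", "hello"))              # ~0.2
print(get_wer("helko world", "hello world"))  # ~0.5
```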

-------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join("Models/08_handwriting_recognition_torch", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 11 | self.vocab = "" 12 | self.height = 32 13 | self.width = 128 14 | self.max_text_length = 0 15 | self.batch_size = 64 16 | self.learning_rate = 0.002 17 | self.train_epochs = 1000 18 | -------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer 7 | 8 | class ImageToWordModel(OnnxInferenceModel): 9 | def __init__(self, *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | 12 | def predict(self, image: np.ndarray): 13 | image = cv2.resize(image, self.input_shapes[0][1:3][::-1]) 14 | 15 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 16 | 17 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 18 | 19 | text = ctc_decoder(preds, self.metadata["vocab"])[0] 20 | 21 | return text 22 | 23 | if __name__ == "__main__": 24 | import pandas as pd 25 | from tqdm import tqdm 26 | 27 | model = ImageToWordModel(model_path="Models/08_handwriting_recognition_torch/202303142139/model.onnx") 28 | 29 | df = pd.read_csv("Models/08_handwriting_recognition_torch/202303142139/val.csv").values.tolist() 30 | 31 | accum_cer = [] 32 | for image_path, label in tqdm(df): 33 | image = cv2.imread(image_path.replace("\\", "/")) 34 | 35 | prediction_text = model.predict(image) 36 | 37 | cer = get_cer(prediction_text, label) 38 | print(f"Image: {image_path}, Label: {label}, Prediction: {prediction_text}, CER: {cer}") 39 | 40 | accum_cer.append(cer) 41 | 42 | print(f"Average CER: {np.average(accum_cer)}") -------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def activation_layer(activation: str="relu", alpha: float=0.1, inplace: bool=True): 7 | """ Activation layer wrapper for LeakyReLU and ReLU activation functions 8 | 9 | Args: 10 | activation: str, activation function name (default: 'relu') 11 | alpha: float (LeakyReLU activation function parameter) 12 | 13 | Returns: 14 | torch.Tensor: activation layer 15 | """ 16 | if activation == "relu": 17 | return nn.ReLU(inplace=inplace) 18 | 19 | elif activation == "leaky_relu": 20 | return nn.LeakyReLU(negative_slope=alpha, inplace=inplace) 21 | 22 | 23 | class ConvBlock(nn.Module): 24 | """ Convolutional block with batch normalization 25 | """ 26 | def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, padding: int): 27 | super(ConvBlock, self).__init__() 28 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) 29 | self.bn = 
nn.BatchNorm2d(out_channels) 30 | 31 | def forward(self, x: torch.Tensor): 32 | return self.bn(self.conv(x)) 33 | 34 | 35 | class ResidualBlock(nn.Module): 36 | def __init__(self, in_channels, out_channels, skip_conv=True, stride=1, dropout=0.2, activation="leaky_relu"): 37 | super(ResidualBlock, self).__init__() 38 | self.convb1 = ConvBlock(in_channels, out_channels, kernel_size=3, stride=stride, padding=1) 39 | self.act1 = activation_layer(activation) 40 | 41 | self.convb2 = ConvBlock(out_channels, out_channels, kernel_size=3, stride=1, padding=1) 42 | 43 | self.dropout = nn.Dropout(p=dropout) 44 | 45 | self.shortcut = None 46 | if skip_conv: 47 | if stride != 1 or in_channels != out_channels: 48 | self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride) 49 | 50 | self.act2 = activation_layer(activation) 51 | 52 | def forward(self, x): 53 | skip = x 54 | 55 | out = self.act1(self.convb1(x)) 56 | out = self.convb2(out) 57 | 58 | if self.shortcut is not None: 59 | out += self.shortcut(skip) 60 | 61 | out = self.act2(out) 62 | out = self.dropout(out) 63 | 64 | return out 65 | 66 | class Network(nn.Module): 67 | """ Handwriting recognition network for CTC loss""" 68 | def __init__(self, num_chars: int, activation: str="leaky_relu", dropout: float=0.2): 69 | super(Network, self).__init__() 70 | 71 | self.rb1 = ResidualBlock(3, 16, skip_conv = True, stride=1, activation=activation, dropout=dropout) 72 | self.rb2 = ResidualBlock(16, 16, skip_conv = True, stride=2, activation=activation, dropout=dropout) 73 | self.rb3 = ResidualBlock(16, 16, skip_conv = False, stride=1, activation=activation, dropout=dropout) 74 | 75 | self.rb4 = ResidualBlock(16, 32, skip_conv = True, stride=2, activation=activation, dropout=dropout) 76 | self.rb5 = ResidualBlock(32, 32, skip_conv = False, stride=1, activation=activation, dropout=dropout) 77 | 78 | self.rb6 = ResidualBlock(32, 64, skip_conv = True, stride=2, activation=activation, dropout=dropout) 79 | self.rb7 = ResidualBlock(64, 64, skip_conv = True, stride=1, activation=activation, dropout=dropout) 80 | 81 | self.rb8 = ResidualBlock(64, 64, skip_conv = False, stride=1, activation=activation, dropout=dropout) 82 | self.rb9 = ResidualBlock(64, 64, skip_conv = False, stride=1, activation=activation, dropout=dropout) 83 | 84 | self.lstm = nn.LSTM(64, 128, bidirectional=True, num_layers=1, batch_first=True) 85 | self.lstm_dropout = nn.Dropout(p=dropout) 86 | 87 | self.output = nn.Linear(256, num_chars + 1) 88 | 89 | def forward(self, images: torch.Tensor) -> torch.Tensor: 90 | # normalize images between 0 and 1 91 | images_float = images / 255.0 92 | 93 | # transpose image to channel first 94 | images_float = images_float.permute(0, 3, 1, 2) 95 | 96 | # apply convolutions 97 | x = self.rb1(images_float) 98 | x = self.rb2(x) 99 | x = self.rb3(x) 100 | x = self.rb4(x) 101 | x = self.rb5(x) 102 | x = self.rb6(x) 103 | x = self.rb7(x) 104 | x = self.rb8(x) 105 | x = self.rb9(x) 106 | 107 | x = x.reshape(x.size(0), -1, x.size(1)) 108 | 109 | x, _ = self.lstm(x) 110 | x = self.lstm_dropout(x) 111 | 112 | x = self.output(x) 113 | x = F.log_softmax(x, 2) 114 | 115 | return x -------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/requirements.txt: 1 | torch==1.13.1 2 | tensorboard==2.10.1 3 | onnx==1.12.0 4 | torchsummaryX
-------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/train_torch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tarfile 3 | from tqdm import tqdm 4 | from io import BytesIO 5 | from zipfile import ZipFile 6 | from urllib.request import urlopen 7 | 8 | import torch 9 | import torch.optim as optim 10 | from torchsummaryX import summary 11 | 12 | from mltu.torch.model import Model 13 | from mltu.torch.losses import CTCLoss 14 | from mltu.torch.dataProvider import DataProvider 15 | from mltu.torch.metrics import CERMetric, WERMetric 16 | from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, Model2onnx, ReduceLROnPlateau 17 | 18 | from mltu.preprocessors import ImageReader 19 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2 20 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen 21 | from mltu.annotations.images import CVImage 22 | 23 | from model import Network 24 | from configs import ModelConfigs 25 | 26 | 27 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 28 | http_response = urlopen(url) 29 | 30 | data = b"" 31 | iterations = http_response.length // chunk_size + 1 32 | for _ in tqdm(range(iterations)): 33 | data += http_response.read(chunk_size) 34 | 35 | zipfile = ZipFile(BytesIO(data)) 36 | zipfile.extractall(path=extract_to) 37 | 38 | dataset_path = os.path.join("Datasets", "IAM_Words") 39 | if not os.path.exists(dataset_path): 40 | download_and_unzip("https://git.io/J0fjL", extract_to="Datasets") 41 | 42 | file = tarfile.open(os.path.join(dataset_path, "words.tgz")) 43 | file.extractall(os.path.join(dataset_path, "words")) 44 | 45 | dataset, vocab, max_len = [], set(), 0 46 | 47 | # Preprocess the dataset by the specific IAM_Words dataset file structure 48 | words = open(os.path.join(dataset_path, "words.txt"), "r").readlines() 49 | for line in tqdm(words): 50 | if line.startswith("#"): 51 | continue 52 | 53 | line_split = line.split(" ") 54 | if line_split[1] == "err": 55 | continue 56 | 57 | folder1 = line_split[0][:3] 58 | folder2 = "-".join(line_split[0].split("-")[:2]) 59 | file_name = line_split[0] + ".png" 60 | label = line_split[-1].rstrip("\n") 61 | 62 | rel_path = os.path.join(dataset_path, "words", folder1, folder2, file_name) 63 | if not os.path.exists(rel_path): 64 | print(f"File not found: {rel_path}") 65 | continue 66 | 67 | dataset.append([rel_path, label]) 68 | vocab.update(list(label)) 69 | max_len = max(max_len, len(label)) 70 | 71 | configs = ModelConfigs() 72 | 73 | # Save vocab and maximum text length to configs 74 | configs.vocab = "".join(sorted(vocab)) 75 | configs.max_text_length = max_len 76 | configs.save() 77 | 78 | # Create a data provider for the dataset 79 | data_provider = DataProvider( 80 | dataset=dataset, 81 | skip_validation=True, 82 | batch_size=configs.batch_size, 83 | data_preprocessors=[ImageReader(CVImage)], 84 | transformers=[ 85 | # ImageShowCV2(), # uncomment to show images when iterating over the data provider 86 | ImageResizer(configs.width, configs.height, keep_aspect_ratio=False), 87 | LabelIndexer(configs.vocab), 88 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)) 89 | ], 90 | use_cache=True, 91 | ) 92 | 93 | # Split the dataset into training and validation sets 94 | train_dataProvider, test_dataProvider = data_provider.split(split = 
0.9) 95 | 96 | # Augment training data with random brightness, rotation and erode/dilate 97 | train_dataProvider.augmentors = [ 98 | RandomBrightness(), 99 | RandomErodeDilate(), 100 | RandomSharpen(), 101 | RandomRotate(angle=10), 102 | ] 103 | 104 | network = Network(len(configs.vocab), activation="leaky_relu", dropout=0.3) 105 | loss = CTCLoss(blank=len(configs.vocab)) 106 | optimizer = optim.Adam(network.parameters(), lr=configs.learning_rate) 107 | 108 | # uncomment to print network summary, torchsummaryX package is required 109 | summary(network, torch.zeros((1, configs.height, configs.width, 3))) 110 | 111 | # put on cuda device if available 112 | if torch.cuda.is_available(): 113 | network = network.cuda() 114 | 115 | # create callbacks 116 | earlyStopping = EarlyStopping(monitor="val_CER", patience=20, mode="min", verbose=1) 117 | modelCheckpoint = ModelCheckpoint(configs.model_path + "/model.pt", monitor="val_CER", mode="min", save_best_only=True, verbose=1) 118 | tb_callback = TensorBoard(configs.model_path + "/logs") 119 | reduce_lr = ReduceLROnPlateau(monitor="val_CER", factor=0.9, patience=10, verbose=1, mode="min", min_lr=1e-6) 120 | model2onnx = Model2onnx( 121 | saved_model_path=configs.model_path + "/model.pt", 122 | input_shape=(1, configs.height, configs.width, 3), 123 | verbose=1, 124 | metadata={"vocab": configs.vocab} 125 | ) 126 | 127 | # create model object that will handle training and testing of the network 128 | model = Model(network, optimizer, loss, metrics=[CERMetric(configs.vocab), WERMetric(configs.vocab)]) 129 | model.fit( 130 | train_dataProvider, 131 | test_dataProvider, 132 | epochs=1000, 133 | callbacks=[earlyStopping, modelCheckpoint, tb_callback, reduce_lr, model2onnx] 134 | ) 135 | 136 | # Save training and validation datasets as csv files 137 | train_dataProvider.to_csv(os.path.join(configs.model_path, "train.csv")) 138 | test_dataProvider.to_csv(os.path.join(configs.model_path, "val.csv")) 139 | -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/README.md: -------------------------------------------------------------------------------- 1 | # Training TensorFlow Transformer model for Spanish to English translation task 2 | ### In this tutorial, I'll walk through a practical example of Transformer Training for Language Translation tasks from Spanish to the English language 3 | 4 |

5 | # **Detailed tutorial**: 6 | ### [Transformer training with TensorFlow for Translation task](https://pylessons.com/transformers-training) 7 | 8 |

9 | 10 |
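One detail worth seeing in isolation: the transformer is trained with teacher forcing, so for every sentence pair the decoder input and the decoder target are the same tokenized sentence shifted by one position (preprocess_inputs in train.py below drops the [END] token from the input and the [START] token from the target). A tiny illustration with made-up token ids:

```python
import numpy as np

# Hypothetical character tokens for one English label: [START]=1, 'h'=8, 'i'=9, [END]=2, padding=0
label_tokens = [1, 8, 9, 2]
max_length = 6

decoder_input = np.zeros(max_length, dtype=np.int64)
decoder_output = np.zeros(max_length, dtype=np.int64)

decoder_input[:len(label_tokens) - 1] = label_tokens[:-1]   # drop [END]   -> [1, 8, 9, 0, 0, 0]
decoder_output[:len(label_tokens) - 1] = label_tokens[1:]   # drop [START] -> [8, 9, 2, 0, 0, 0]

print(decoder_input)   # what the decoder sees
print(decoder_output)  # what the loss compares against, one step ahead
```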

-------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join( 11 | "Models/09_translation_transformer", 12 | datetime.strftime(datetime.now(), "%Y%m%d%H%M"), 13 | ) 14 | self.num_layers = 4 15 | self.d_model = 128 16 | self.num_heads = 8 17 | self.dff = 512 18 | self.dropout_rate = 0.1 19 | self.batch_size = 16 20 | self.train_epochs = 50 21 | # CustomSchedule parameters 22 | self.init_lr = 0.00001 23 | self.lr_after_warmup = 0.0005 24 | self.final_lr = 0.0001 25 | self.warmup_epochs = 2 26 | self.decay_epochs = 18 -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/download.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import requests 4 | from tqdm import tqdm 5 | from bs4 import BeautifulSoup 6 | 7 | # URL to the directory containing the files to be downloaded 8 | language = "en-es" 9 | url = f"https://data.statmt.org/opus-100-corpus/v1.0/supervised/{language}/" 10 | save_directory = f"./Datasets/{language}" 11 | 12 | # Create the save directory if it doesn't exist 13 | os.makedirs(save_directory, exist_ok=True) 14 | 15 | # Send a GET request to the URL 16 | response = requests.get(url) 17 | 18 | # Parse the HTML response 19 | soup = BeautifulSoup(response.content, 'html.parser') 20 | 21 | # Find all the anchor tags in the HTML 22 | links = soup.find_all('a') 23 | 24 | # Extract the href attribute from each anchor tag 25 | file_links = [link['href'] for link in links if '.' in link['href']] 26 | 27 | # Download each file 28 | for file_link in tqdm(file_links): 29 | file_url = url + file_link 30 | save_path = os.path.join(save_directory, file_link) 31 | 32 | print(f"Downloading {file_url}") 33 | 34 | # Send a GET request for the file 35 | file_response = requests.get(file_url) 36 | if file_response.status_code == 404: 37 | print(f"Could not download {file_url}") 38 | continue 39 | 40 | # Save the file to the specified directory 41 | with open(save_path, 'wb') as file: 42 | file.write(file_response.content) 43 | 44 | print(f"Saved {file_link}") 45 | 46 | print("All files have been downloaded.") -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from mltu.tensorflow.transformer.layers import Encoder, Decoder 4 | 5 | def Transformer( 6 | input_vocab_size: int, 7 | target_vocab_size: int, 8 | encoder_input_size: int = None, 9 | decoder_input_size: int = None, 10 | num_layers: int=6, 11 | d_model: int=512, 12 | num_heads: int=8, 13 | dff: int=2048, 14 | dropout_rate: float=0.1, 15 | ) -> tf.keras.Model: 16 | """ 17 | A custom TensorFlow model that implements the Transformer architecture. 18 | 19 | Args: 20 | input_vocab_size (int): The size of the input vocabulary. 21 | target_vocab_size (int): The size of the target vocabulary. 22 | encoder_input_size (int): The size of the encoder input sequence. 23 | decoder_input_size (int): The size of the decoder input sequence. 
24 | num_layers (int): The number of layers in the encoder and decoder. 25 | d_model (int): The dimensionality of the model. 26 | num_heads (int): The number of heads in the multi-head attention layer. 27 | dff (int): The dimensionality of the feed-forward layer. 28 | dropout_rate (float): The dropout rate. 29 | 30 | Returns: 31 | A TensorFlow Keras model. 32 | """ 33 | inputs = [ 34 | tf.keras.layers.Input(shape=(encoder_input_size,), dtype=tf.int64), 35 | tf.keras.layers.Input(shape=(decoder_input_size,), dtype=tf.int64) 36 | ] 37 | 38 | encoder_input, decoder_input = inputs 39 | 40 | encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=input_vocab_size, dropout_rate=dropout_rate)(encoder_input) 41 | decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=target_vocab_size, dropout_rate=dropout_rate)(decoder_input, encoder) 42 | 43 | output = tf.keras.layers.Dense(target_vocab_size)(decoder) 44 | 45 | return tf.keras.Model(inputs=inputs, outputs=output) -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | tf2onnx==1.14.0 3 | onnx==1.12.0 -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | from mltu.tokenizers import CustomTokenizer 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | 7 | class PtEnTranslator(OnnxInferenceModel): 8 | def __init__(self, *args, **kwargs): 9 | super().__init__(*args, **kwargs) 10 | 11 | self.new_inputs = self.model.get_inputs() 12 | self.tokenizer = CustomTokenizer.load(self.metadata["tokenizer"]) 13 | self.detokenizer = CustomTokenizer.load(self.metadata["detokenizer"]) 14 | 15 | def predict(self, sentence): 16 | start = time.time() 17 | tokenized_sentence = self.tokenizer.texts_to_sequences([sentence])[0] 18 | encoder_input = np.pad(tokenized_sentence, (0, self.tokenizer.max_length - len(tokenized_sentence)), constant_values=0).astype(np.int64) 19 | 20 | tokenized_results = [self.detokenizer.start_token_index] 21 | for index in range(self.detokenizer.max_length - 1): 22 | decoder_input = np.pad(tokenized_results, (0, self.detokenizer.max_length - len(tokenized_results)), constant_values=0).astype(np.int64) 23 | input_dict = { 24 | self.model._inputs_meta[0].name: np.expand_dims(encoder_input, axis=0), 25 | self.model._inputs_meta[1].name: np.expand_dims(decoder_input, axis=0), 26 | } 27 | preds = self.model.run(None, input_dict)[0] # preds shape (1, 206, 29110) 28 | pred_results = np.argmax(preds, axis=2) 29 | tokenized_results.append(pred_results[0][index]) 30 | 31 | if tokenized_results[-1] == self.detokenizer.end_token_index: 32 | break 33 | 34 | results = self.detokenizer.detokenize([tokenized_results]) 35 | return results[0], time.time() - start 36 | 37 | def read_files(path): 38 | with open(path, "r", encoding="utf-8") as f: 39 | en_train_dataset = f.read().split("\n")[:-1] 40 | return en_train_dataset 41 | 42 | # Path to dataset 43 | en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en" 44 | es_validation_data_path = "Datasets/en-es/opus.en-es-dev.es" 45 | 46 | en_validation_data = read_files(en_validation_data_path) 47 | es_validation_data = 
read_files(es_validation_data_path) 48 | 49 | # Consider only sentences with length <= 500 50 | max_length = 500 51 | val_examples = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length] 52 | 53 | translator = PtEnTranslator("Models/09_translation_transformer/202308241514/model.onnx") 54 | 55 | val_dataset = [] 56 | for es, en in val_examples: 57 | results, duration = translator.predict(es) 58 | print("Spanish: ", es.lower()) 59 | print("English: ", en.lower()) 60 | print("English pred:", results) 61 | print(duration) 62 | print() -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/train.py: 1 | import numpy as np 2 | 3 | import tensorflow as tf 4 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 5 | except: pass 6 | 7 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 8 | from mltu.tensorflow.callbacks import Model2onnx, WarmupCosineDecay 9 | 10 | from mltu.tensorflow.dataProvider import DataProvider 11 | from mltu.tokenizers import CustomTokenizer 12 | 13 | from mltu.tensorflow.transformer.utils import MaskedAccuracy, MaskedLoss 14 | from mltu.tensorflow.transformer.callbacks import EncDecSplitCallback 15 | 16 | from model import Transformer 17 | from configs import ModelConfigs 18 | 19 | configs = ModelConfigs() 20 | 21 | # Path to dataset 22 | en_training_data_path = "Datasets/en-es/opus.en-es-train.en" 23 | en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en" 24 | es_training_data_path = "Datasets/en-es/opus.en-es-train.es" 25 | es_validation_data_path = "Datasets/en-es/opus.en-es-dev.es" 26 | 27 | def read_files(path): 28 | with open(path, "r", encoding="utf-8") as f: 29 | en_train_dataset = f.read().split("\n")[:-1] 30 | return en_train_dataset 31 | 32 | en_training_data = read_files(en_training_data_path) 33 | en_validation_data = read_files(en_validation_data_path) 34 | es_training_data = read_files(es_training_data_path) 35 | es_validation_data = read_files(es_validation_data_path) 36 | 37 | # Consider only sentences with length <= 500 38 | max_length = 500 39 | train_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_training_data, en_training_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length] 40 | val_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length] 41 | es_training_data, en_training_data = zip(*train_dataset) 42 | es_validation_data, en_validation_data = zip(*val_dataset) 43 | 44 | # prepare spanish tokenizer, this is the input language 45 | tokenizer = CustomTokenizer(char_level=True) 46 | tokenizer.fit_on_texts(es_training_data) 47 | tokenizer.save(configs.model_path + "/tokenizer.json") 48 | 49 | # prepare english tokenizer, this is the output language 50 | detokenizer = CustomTokenizer(char_level=True) 51 | detokenizer.fit_on_texts(en_training_data) 52 | detokenizer.save(configs.model_path + "/detokenizer.json") 53 | 54 | 55 | def preprocess_inputs(data_batch, label_batch): 56 | encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64) 57 | decoder_input = 
np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64) 58 | decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64) 59 | 60 | data_batch_tokens = tokenizer.texts_to_sequences(data_batch) 61 | label_batch_tokens = detokenizer.texts_to_sequences(label_batch) 62 | 63 | for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)): 64 | encoder_input[index][:len(data)] = data 65 | decoder_input[index][:len(label)-1] = label[:-1] # Drop the [END] tokens 66 | decoder_output[index][:len(label)-1] = label[1:] # Drop the [START] tokens 67 | 68 | return (encoder_input, decoder_input), decoder_output 69 | 70 | # Create Training Data Provider 71 | train_dataProvider = DataProvider( 72 | train_dataset, 73 | batch_size=configs.batch_size, 74 | batch_postprocessors=[preprocess_inputs], 75 | use_cache=True, 76 | ) 77 | 78 | # Create Validation Data Provider 79 | val_dataProvider = DataProvider( 80 | val_dataset, 81 | batch_size=configs.batch_size, 82 | batch_postprocessors=[preprocess_inputs], 83 | use_cache=True, 84 | ) 85 | 86 | # Create TensorFlow Transformer Model 87 | transformer = Transformer( 88 | num_layers=configs.num_layers, 89 | d_model=configs.d_model, 90 | num_heads=configs.num_heads, 91 | dff=configs.dff, 92 | input_vocab_size=len(tokenizer)+1, 93 | target_vocab_size=len(detokenizer)+1, 94 | dropout_rate=configs.dropout_rate, 95 | encoder_input_size=tokenizer.max_length, 96 | decoder_input_size=detokenizer.max_length 97 | ) 98 | 99 | transformer.summary() 100 | 101 | optimizer = tf.keras.optimizers.Adam(learning_rate=configs.init_lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9) 102 | 103 | # Compile the model 104 | transformer.compile( 105 | loss=MaskedLoss(), 106 | optimizer=optimizer, 107 | metrics=[MaskedAccuracy()], 108 | run_eagerly=False 109 | ) 110 | 111 | # Define callbacks 112 | warmupCosineDecay = WarmupCosineDecay( 113 | lr_after_warmup=configs.lr_after_warmup, 114 | final_lr=configs.final_lr, 115 | warmup_epochs=configs.warmup_epochs, 116 | decay_epochs=configs.decay_epochs, 117 | initial_lr=configs.init_lr, 118 | ) 119 | earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=5, verbose=1, mode="max") 120 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_masked_accuracy", verbose=1, save_best_only=True, mode="max", save_weights_only=False) 121 | tb_callback = TensorBoard(f"{configs.model_path}/logs") 122 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=2, verbose=1, mode="max") 123 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=False) 124 | encDecSplitCallback = EncDecSplitCallback(configs.model_path, encoder_metadata={"tokenizer": tokenizer.dict()}, decoder_metadata={"detokenizer": detokenizer.dict()}) 125 | 126 | configs.save() 127 | 128 | # Train the model 129 | transformer.fit( 130 | train_dataProvider, 131 | validation_data=val_dataProvider, 132 | epochs=configs.train_epochs, 133 | callbacks=[ 134 | earlystopper, 135 | warmupCosineDecay, 136 | checkpoint, 137 | tb_callback, 138 | reduceLROnPlat, 139 | model2onnx, 140 | encDecSplitCallback 141 | ] 142 | ) -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from 
mltu.configs import BaseModelConfigs 5 | 6 | class ModelConfigs(BaseModelConfigs): 7 | def __init__(self): 8 | super().__init__() 9 | self.model_path = os.path.join( 10 | "Models/10_wav2vec2_torch", 11 | datetime.strftime(datetime.now(), "%Y%m%d%H%M"), 12 | ) 13 | self.batch_size = 8 14 | self.train_epochs = 60 15 | self.train_workers = 20 16 | 17 | self.init_lr = 1.0e-8 18 | self.lr_after_warmup = 1e-05 19 | self.final_lr = 5e-06 20 | self.warmup_epochs = 10 21 | self.decay_epochs = 40 22 | self.weight_decay = 0.005 23 | self.mixed_precision = True 24 | 25 | self.max_audio_length = 246000 26 | self.max_label_length = 256 27 | 28 | self.vocab = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.13.1+cu117 2 | transformers==4.33.1 3 | mltu==1.1.4 4 | onnx 5 | onnxruntime -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mltu.inferenceModel import OnnxInferenceModel 4 | from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer 5 | 6 | class Wav2vec2(OnnxInferenceModel): 7 | def __init__(self, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | 10 | def predict(self, audio: np.ndarray): 11 | 12 | audio = np.expand_dims(audio, axis=0).astype(np.float32) 13 | 14 | preds = self.model.run(None, {self.input_name: audio})[0] 15 | 16 | text = ctc_decoder(preds, self.metadata["vocab"])[0] 17 | 18 | return text 19 | 20 | if __name__ == "__main__": 21 | import librosa 22 | import pandas as pd 23 | from tqdm import tqdm 24 | 25 | model = Wav2vec2(model_path="Models/10_wav2vec2_torch/202309171434/model.onnx") 26 | 27 | # The list of multiple [audio_path, label] for validation 28 | val_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/val.csv").values.tolist() 29 | 30 | accum_cer, accum_wer = [], [] 31 | pbar = tqdm(val_dataset) 32 | for vaw_path, label in pbar: 33 | audio, sr = librosa.load(vaw_path, sr=16000) 34 | 35 | prediction_text = model.predict(audio) 36 | 37 | cer = get_cer(prediction_text, label) 38 | wer = get_wer(prediction_text, label) 39 | 40 | accum_cer.append(cer) 41 | accum_wer.append(wer) 42 | print(label) 43 | 44 | pbar.set_description(f"Average CER: {np.average(accum_cer):.4f}, Average WER: {np.average(accum_wer):.4f}") -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tarfile 3 | import pandas as pd 4 | from tqdm import tqdm 5 | from io import BytesIO 6 | from urllib.request import urlopen 7 | 8 | import torch 9 | from torch import nn 10 | from transformers import Wav2Vec2ForCTC 11 | import torch.nn.functional as F 12 | 13 | from mltu.torch.model import Model 14 | from mltu.torch.losses import CTCLoss 15 | from mltu.torch.dataProvider import DataProvider 16 | from mltu.torch.metrics import CERMetric, WERMetric 17 | from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, Model2onnx, WarmupCosineDecay 18 | from mltu.augmentors import RandomAudioNoise, RandomAudioPitchShift, 
RandomAudioTimeStretch 19 | 20 | from mltu.preprocessors import AudioReader 21 | from mltu.transformers import LabelIndexer, LabelPadding, AudioPadding 22 | 23 | from configs import ModelConfigs 24 | 25 | configs = ModelConfigs() 26 | 27 | 28 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 29 | http_response = urlopen(url) 30 | 31 | data = b"" 32 | iterations = http_response.length // chunk_size + 1 33 | for _ in tqdm(range(iterations)): 34 | data += http_response.read(chunk_size) 35 | 36 | tarFile = tarfile.open(fileobj=BytesIO(data), mode="r|bz2") 37 | tarFile.extractall(path=extract_to) 38 | tarFile.close() 39 | 40 | 41 | dataset_path = os.path.join("Datasets", "LJSpeech-1.1") 42 | if not os.path.exists(dataset_path): 43 | download_and_unzip("https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", extract_to="Datasets") 44 | 45 | dataset_path = "Datasets/LJSpeech-1.1" 46 | metadata_path = dataset_path + "/metadata.csv" 47 | wavs_path = dataset_path + "/wavs/" 48 | 49 | # Read metadata file and parse it 50 | metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3) 51 | dataset = [] 52 | vocab = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 53 | for file_name, transcription, normalized_transcription in metadata_df.values.tolist(): 54 | path = f"Datasets/LJSpeech-1.1/wavs/{file_name}.wav" 55 | new_label = "".join([l for l in normalized_transcription.lower() if l in vocab]) 56 | dataset.append([path, new_label]) 57 | 58 | # Create a data provider for the dataset 59 | data_provider = DataProvider( 60 | dataset=dataset, 61 | skip_validation=True, 62 | batch_size=configs.batch_size, 63 | data_preprocessors=[ 64 | AudioReader(sample_rate=16000), 65 | ], 66 | transformers=[ 67 | LabelIndexer(vocab), 68 | ], 69 | use_cache=False, 70 | batch_postprocessors=[ 71 | AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True), 72 | LabelPadding(padding_value=len(vocab), use_on_batch=True), 73 | ], 74 | use_multiprocessing=True, 75 | max_queue_size=10, 76 | workers=configs.train_workers, 77 | ) 78 | train_dataProvider, test_dataProvider = data_provider.split(split=0.9) 79 | 80 | # train_dataProvider.augmentors = [ 81 | # RandomAudioNoise(), 82 | # RandomAudioPitchShift(), 83 | # RandomAudioTimeStretch() 84 | # ] 85 | 86 | vocab = sorted(vocab) 87 | configs.vocab = vocab 88 | configs.save() 89 | 90 | 91 | class CustomWav2Vec2Model(nn.Module): 92 | def __init__(self, hidden_states, dropout_rate=0.2, **kwargs): 93 | super(CustomWav2Vec2Model, self).__init__( **kwargs) 94 | pretrained_name = "facebook/wav2vec2-base-960h" 95 | self.model = Wav2Vec2ForCTC.from_pretrained(pretrained_name, vocab_size=hidden_states, ignore_mismatched_sizes=True) 96 | self.model.freeze_feature_encoder() # this part does not need to be fine-tuned 97 | 98 | def forward(self, inputs): 99 | output = self.model(inputs, attention_mask=None).logits 100 | # Apply softmax 101 | output = F.log_softmax(output, -1) 102 | return output 103 | 104 | custom_model = CustomWav2Vec2Model(hidden_states = len(vocab)+1) 105 | 106 | # put on cuda device if available 107 | if torch.cuda.is_available(): 108 | custom_model = custom_model.cuda() 109 | 110 | # create callbacks 111 | warmupCosineDecay = WarmupCosineDecay( 112 | lr_after_warmup=configs.lr_after_warmup, 113 | warmup_epochs=configs.warmup_epochs, 114 | decay_epochs=configs.decay_epochs, 115 | final_lr=configs.final_lr, 
116 | initial_lr=configs.init_lr, 117 | verbose=True, 118 | ) 119 | tb_callback = TensorBoard(configs.model_path + "/logs") 120 | earlyStopping = EarlyStopping(monitor="val_CER", patience=16, mode="min", verbose=1) 121 | modelCheckpoint = ModelCheckpoint(configs.model_path + "/model.pt", monitor="val_CER", mode="min", save_best_only=True, verbose=1) 122 | model2onnx = Model2onnx( 123 | saved_model_path=configs.model_path + "/model.pt", 124 | input_shape=(1, configs.max_audio_length), 125 | verbose=1, 126 | metadata={"vocab": configs.vocab}, 127 | dynamic_axes={"input": {0: "batch_size", 1: "sequence_length"}, "output": {0: "batch_size", 1: "sequence_length"}} 128 | ) 129 | 130 | # create model object that will handle training and testing of the network 131 | model = Model( 132 | custom_model, 133 | loss = CTCLoss(blank=len(configs.vocab), zero_infinity=True), 134 | optimizer = torch.optim.AdamW(custom_model.parameters(), lr=configs.init_lr, weight_decay=configs.weight_decay), 135 | metrics=[ 136 | CERMetric(configs.vocab), 137 | WERMetric(configs.vocab) 138 | ], 139 | mixed_precision=configs.mixed_precision, 140 | ) 141 | 142 | # Save training and validation datasets as csv files 143 | train_dataProvider.to_csv(os.path.join(configs.model_path, "train.csv")) 144 | test_dataProvider.to_csv(os.path.join(configs.model_path, "val.csv")) 145 | 146 | model.fit( 147 | train_dataProvider, 148 | test_dataProvider, 149 | epochs=configs.train_epochs, 150 | callbacks=[ 151 | warmupCosineDecay, 152 | tb_callback, 153 | earlyStopping, 154 | modelCheckpoint, 155 | model2onnx 156 | ] 157 | ) -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/train_tf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: 3 | [ 4 | tf.config.experimental.set_memory_growth(gpu, True) 5 | for gpu in tf.config.experimental.list_physical_devices("GPU") 6 | ] 7 | except: 8 | pass 9 | 10 | from keras import layers 11 | from mltu.tensorflow.dataProvider import DataProvider 12 | from mltu.transformers import LabelIndexer, LabelPadding, AudioPadding 13 | 14 | from mltu.tensorflow.losses import CTCloss 15 | from mltu.tensorflow.metrics import CERMetric, WERMetric 16 | from mltu.tensorflow.callbacks import Model2onnx, WarmupCosineDecay 17 | 18 | from mltu.augmentors import RandomAudioNoise, RandomAudioPitchShift, RandomAudioTimeStretch 19 | 20 | from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard 21 | 22 | import pandas as pd 23 | 24 | from configs import ModelConfigs 25 | 26 | configs = ModelConfigs() 27 | from transformers import TFWav2Vec2ForCTC 28 | from mltu.preprocessors import AudioReader 29 | 30 | 31 | train_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/train.csv").values.tolist() 32 | validation_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/val.csv").values.tolist() 33 | 34 | # Create a data provider for the dataset 35 | train_dataProvider = DataProvider( 36 | dataset=train_dataset, 37 | skip_validation=True, 38 | batch_size=configs.batch_size, 39 | data_preprocessors=[ 40 | AudioReader(sample_rate=16000), 41 | ], 42 | transformers=[ 43 | LabelIndexer(configs.vocab), 44 | LabelPadding(max_word_length=configs.max_label_length, padding_value=len(configs.vocab)), 45 | ], 46 | batch_postprocessors=[ 47 | AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True) 48 | ], 49 | augmentors=[ 50 | RandomAudioNoise(), 51 | 
RandomAudioPitchShift(), 52 | RandomAudioTimeStretch() 53 | ], 54 | use_cache=True, 55 | ) 56 | 57 | test_dataProvider = DataProvider( 58 | dataset=validation_dataset, 59 | skip_validation=True, 60 | batch_size=configs.batch_size, 61 | data_preprocessors=[ 62 | AudioReader(sample_rate=16000), 63 | ], 64 | transformers=[ 65 | LabelIndexer(configs.vocab), 66 | LabelPadding(max_word_length=configs.max_label_length, padding_value=len(configs.vocab)), 67 | ], 68 | batch_postprocessors=[ 69 | AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True) 70 | ], 71 | use_cache=True, 72 | ) 73 | 74 | class CustomWav2Vec2Model(layers.Layer): 75 | def __init__(self, output_dim, **kwargs): 76 | super().__init__(**kwargs) 77 | 78 | pretrained_name = "facebook/wav2vec2-base-960h" 79 | self.model = TFWav2Vec2ForCTC.from_pretrained(pretrained_name, vocab_size=output_dim, ignore_mismatched_sizes=True) 80 | self.model.freeze_feature_encoder() # https://huggingface.co/blog/fine-tune-wav2vec2-english 81 | 82 | def __call__(self, inputs): 83 | outputs = self.model(inputs) 84 | 85 | final_state = tf.nn.softmax(outputs.logits, axis=-1) 86 | 87 | return final_state 88 | 89 | custom_model = tf.keras.Sequential([ 90 | layers.Input(shape=(None,), name="input", dtype=tf.float32), 91 | CustomWav2Vec2Model(len(configs.vocab)+1) 92 | ]) 93 | 94 | for data in train_dataProvider: 95 | results = custom_model(data[0]) 96 | break 97 | 98 | custom_model.summary() 99 | # configs.save() 100 | 101 | 102 | # Compile the model and print summary 103 | custom_model.compile( 104 | optimizer=tf.keras.optimizers.AdamW(learning_rate=configs.init_lr, weight_decay=configs.weight_decay), 105 | loss=CTCloss(), 106 | metrics=[ 107 | CERMetric(vocabulary=configs.vocab), 108 | WERMetric(vocabulary=configs.vocab) 109 | ], 110 | ) 111 | 112 | # Define callbacks 113 | warmupCosineDecay = WarmupCosineDecay( 114 | lr_after_warmup=configs.lr_after_warmup, 115 | final_lr=configs.final_lr, 116 | warmup_epochs=configs.warmup_epochs, 117 | decay_epochs=configs.decay_epochs, 118 | initial_lr=configs.init_lr, 119 | ) 120 | earlystopper = EarlyStopping( 121 | monitor="val_CER", patience=16, verbose=1, mode="min" 122 | ) 123 | checkpoint = ModelCheckpoint( 124 | f"{configs.model_path}/model.h5", 125 | monitor="val_CER", 126 | verbose=1, 127 | save_best_only=True, 128 | mode="min", 129 | save_weights_only=False, 130 | ) 131 | tb_callback = TensorBoard(f"{configs.model_path}/logs") 132 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"vocab": configs.vocab}) 133 | 134 | # Train the model 135 | custom_model.fit( 136 | train_dataProvider, 137 | validation_data=test_dataProvider, 138 | epochs=configs.train_epochs, 139 | callbacks=[warmupCosineDecay, earlystopper, checkpoint, tb_callback, model2onnx], 140 | max_queue_size=configs.train_workers, 141 | workers=configs.train_workers, 142 | use_multiprocessing=True, 143 | ) -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/convert2onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ultralytics.engine.model import Model as BaseModel 3 | 4 | base_model = BaseModel("yolov8m.pt") 5 | 6 | classes = base_model.names 7 | input_width, input_height = 640, 640 8 | input_shape = (1, 3, input_width, input_height) 9 | model = base_model.model 10 | 11 | # place model on cpu 12 | model.to("cpu") 13 | 14 | # set the model to inference mode 15 | model.eval() 16 | 17 | 
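# Note: the export below runs with the model in eval() mode so no training-only behaviour
# (dropout, batch-norm statistics updates) is traced into the graph, and it runs on CPU so
# no CUDA device is required for the conversion. The dynamic_axes mapping keeps the batch
# size and the input height/width symbolic, so the resulting ONNX file accepts different
# image sizes at inference time.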
# convert the model to ONNX format 18 | dummy_input = torch.randn(input_shape).to("cpu") 19 | 20 | # Export the model 21 | torch.onnx.export( 22 | model, 23 | dummy_input, 24 | "yolov8m.onnx", 25 | export_params=True, 26 | input_names = ["input"], 27 | output_names = ["output"], 28 | dynamic_axes = { 29 | "input": {0: "batch_size", 2: "height", 3: "width"}, 30 | "output": {0: "batch_size", 2: "anchors"} 31 | } 32 | ) 33 | 34 | # Add the class names to the model as metadata 35 | import onnx 36 | 37 | metadata = {"classes": classes} 38 | 39 | # Load the ONNX model 40 | onnx_model = onnx.load("yolov8m.onnx") 41 | 42 | # Add the metadata dictionary to the onnx model's metadata_props attribute 43 | for key, value in metadata.items(): 44 | meta = onnx_model.metadata_props.add() 45 | meta.key = key 46 | meta.value = str(value) 47 | 48 | # Save the modified ONNX model 49 | onnx.save(onnx_model, "yolov8m.onnx") -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/requirements.txt: -------------------------------------------------------------------------------- 1 | mltu==1.2.5 2 | ultralytics==8.1.28 3 | torch==2.0.0 4 | torchvision==0.15.1 5 | onnxruntime==1.15.1 6 | onnx==1.12.0 -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/run_pretrained.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from ultralytics.engine.model import Model as BaseModel 3 | from mltu.annotations.detections import Detections 4 | from mltu.torch.yolo.detectors.torch_detector import Detector as TorchDetector 5 | from mltu.torch.yolo.detectors.onnx_detector import Detector as OnnxDetector 6 | 7 | input_width, input_height = 640, 640 8 | confidence_threshold = 0.5 9 | iou_threshold = 0.5 10 | 11 | # base_model = BaseModel("yolov8m.pt") 12 | # detector = TorchDetector(base_model.model, input_width, input_height, base_model.names, confidence_threshold, iou_threshold) 13 | detector = OnnxDetector("yolov8m.onnx", input_width, input_height, confidence_threshold, iou_threshold) 14 | 15 | cap = cv2.VideoCapture(0) 16 | while True: 17 | ret, frame = cap.read() 18 | if not ret: 19 | break 20 | 21 | # Perform Yolo object detection 22 | detections: Detections = detector(frame) 23 | 24 | # Apply the detections to the frame 25 | frame = detections.applyToFrame(frame) 26 | 27 | # Print the FPS 28 | print(detector.fps) 29 | 30 | # Display the output image 31 | cv2.imshow("Object Detection", frame) 32 | if cv2.waitKey(1) & 0xFF == ord('q'): 33 | break 34 | 35 | cap.release() 36 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/test_yolov8.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | from mltu.annotations.detections import Detections 4 | from mltu.torch.yolo.detectors.onnx_detector import Detector as OnnxDetector 5 | 6 | # https://www.kaggle.com/datasets/andrewmvd/car-plate-detection 7 | images_path = "Datasets/car-plate-detection/images" 8 | 9 | input_width, input_height = 416, 416 10 | confidence_threshold = 0.5 11 | iou_threshold = 0.5 12 | 13 | detector = OnnxDetector("Models/11_Yolov8/1714135287/model.onnx", input_width, input_height, confidence_threshold, iou_threshold, force_cpu=False) 14 | 15 | for image_path in os.listdir(images_path): 16 | 17 | frame = cv2.imread(os.path.join(images_path, image_path)) 18 | 19 | # Perform Yolo object 
detection 20 | detections: Detections = detector(frame) 21 | 22 | # Apply the detections to the frame 23 | frame = detections.applyToFrame(frame) 24 | 25 | # Print the FPS 26 | print(detector.fps) 27 | 28 | # Display the output image 29 | cv2.imshow("Object Detection", frame) 30 | if cv2.waitKey(0) & 0xFF == ord('q'): 31 | break 32 | 33 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/train_yolov8.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from mltu.preprocessors import ImageReader 5 | from mltu.annotations.images import CVImage 6 | from mltu.transformers import ImageResizer, ImageShowCV2, ImageNormalizer 7 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen, \ 8 | RandomMirror, RandomFlip, RandomGaussianBlur, RandomSaltAndPepper, RandomDropBlock, RandomMosaic, RandomElasticTransform 9 | from mltu.torch.model import Model 10 | from mltu.torch.dataProvider import DataProvider 11 | from mltu.torch.yolo.annotation import VOCAnnotationReader 12 | from mltu.torch.yolo.preprocessors import YoloPreprocessor 13 | from mltu.torch.yolo.loss import v8DetectionLoss 14 | from mltu.torch.yolo.metrics import YoloMetrics 15 | from mltu.torch.yolo.optimizer import build_optimizer, AccumulativeOptimizer 16 | from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, Model2onnx, WarmupCosineDecay 17 | 18 | from ultralytics.nn.tasks import DetectionModel 19 | from ultralytics.engine.model import Model as BaseModel 20 | 21 | # https://www.kaggle.com/datasets/andrewmvd/car-plate-detection 22 | annotations_path = "Datasets/car-plate-detection/annotations" 23 | 24 | # Create a dataset from the annotations, the dataset is a list of lists where each list contains the [image path, annotation path] 25 | dataset = [[None, os.path.join(annotations_path, f)] for f in os.listdir(annotations_path)] 26 | 27 | # Make sure torch can see GPU device, it is not recommended to train with CPU 28 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | 30 | img_size = 416 31 | labels = {0: "licence"} 32 | 33 | # Create a data provider for the dataset 34 | data_provider = DataProvider( 35 | dataset=dataset, 36 | skip_validation=True, 37 | batch_size=16, 38 | data_preprocessors=[ 39 | VOCAnnotationReader(labels=labels), 40 | ImageReader(CVImage), 41 | ], 42 | transformers=[ 43 | # ImageShowCV2(), 44 | ImageResizer(img_size, img_size), 45 | ImageNormalizer(transpose_axis=True), 46 | ], 47 | batch_postprocessors=[ 48 | YoloPreprocessor(device, img_size) 49 | ], 50 | numpy=False, 51 | ) 52 | 53 | # split the dataset into train and test 54 | train_data_provider, val_data_provider = data_provider.split(0.9, shuffle=False) 55 | 56 | # Attaach augmentation to the train data provider 57 | train_data_provider.augmentors = [ 58 | RandomBrightness(), 59 | RandomErodeDilate(), 60 | RandomSharpen(), 61 | RandomMirror(), 62 | RandomFlip(), 63 | RandomElasticTransform(), 64 | RandomGaussianBlur(), 65 | RandomSaltAndPepper(), 66 | RandomRotate(angle=10), 67 | RandomDropBlock(), 68 | RandomMosaic(), 69 | ] 70 | 71 | base_model = BaseModel("yolov8n.pt") 72 | # Create a YOLO model 73 | model = DetectionModel('yolov8n.yaml', nc=len(labels)) 74 | 75 | # Load the weight from base model 76 | try: model.load_state_dict(base_model.model.state_dict(), strict=False) 77 | except: pass 78 | 79 | model.to(device) 
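# The parameter loop below freezes the detection head's DFL (distribution focal loss)
# projection weights, which YOLOv8 keeps fixed and never trains, while making sure every
# other parameter stays trainable. The AccumulativeOptimizer wrapper used afterwards
# accumulates gradients over several steps before applying an update; its 16 and 64
# arguments suggest per-step batches of 16 combined into an effective batch of roughly 64.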
80 | 81 | for k, v in model.named_parameters(): 82 | if any(x in k for x in [".dfl"]): 83 | print("freezing", k) 84 | v.requires_grad = False 85 | elif not v.requires_grad: 86 | v.requires_grad = True 87 | 88 | lr = 1e-3 89 | optimizer = build_optimizer(model.model, name="AdamW", lr=lr, weight_decay=0.0, momentum=0.937, decay=0.0005) 90 | optimizer = AccumulativeOptimizer(optimizer, 16, 64) 91 | 92 | # create model object that will handle training and testing of the network 93 | model = Model( 94 | model, 95 | optimizer, 96 | v8DetectionLoss(model), 97 | metrics=[YoloMetrics(nc=len(labels))], 98 | log_errors=False, 99 | output_path=f"Models/11_Yolov8/{int(time.time())}", 100 | clip_grad_norm=10.0, 101 | ema=True, 102 | ) 103 | 104 | modelCheckpoint = ModelCheckpoint(monitor="val_fitness", mode="max", save_best_only=True, verbose=True) 105 | tensorBoard = TensorBoard() 106 | earlyStopping = EarlyStopping(monitor="val_fitness", mode="max", patience=31, verbose=True) 107 | model2onnx = Model2onnx(input_shape=(1, 3, img_size, img_size), verbose=True, opset_version=14, 108 | dynamic_axes = {"input": {0: "batch_size", 2: "height", 3: "width"}, 109 | "output": {0: "batch_size", 2: "anchors"}}, 110 | metadata={"classes": labels}) 111 | warmupCosineDecayBias = WarmupCosineDecay(lr_after_warmup=lr, final_lr=lr, initial_lr=0.1, 112 | warmup_steps=len(train_data_provider), warmup_epochs=10, ignore_param_groups=[1, 2]) # lr0 113 | warmupCosineDecay = WarmupCosineDecay(lr_after_warmup=lr, final_lr=lr/10, initial_lr=1e-7, 114 | warmup_steps=len(train_data_provider), warmup_epochs=10, decay_epochs=190, ignore_param_groups=[0]) # lr1 and lr2 115 | 116 | # Train the model 117 | history = model.fit( 118 | train_data_provider, 119 | test_dataProvider=val_data_provider, 120 | epochs=200, 121 | callbacks=[ 122 | modelCheckpoint, 123 | tensorBoard, 124 | earlyStopping, 125 | model2onnx, 126 | warmupCosineDecayBias, 127 | warmupCosineDecay 128 | ] 129 | ) -------------------------------------------------------------------------------- /Tutorials/README.md: -------------------------------------------------------------------------------- 1 | # Tutorials and Examples made with MLTU library: 2 | 1. [Text Recognition With TensorFlow and CTC network](https://pylessons.com/ctc-text-recognition), code in ```Tutorials\01_image_to_word``` folder; 3 | 2. [TensorFlow OCR model for reading Captchas](https://pylessons.com/tensorflow-ocr-captcha), code in ```Tutorials\02_captcha_to_text``` folder; 4 | 3. [Handwriting words recognition with TensorFlow](https://pylessons.com/handwriting-recognition), code in ```Tutorials\03_handwriting_recognition``` folder; 5 | 4. [Handwritten sentence recognition with TensorFlow](https://pylessons.com/handwritten-sentence-recognition), code in ```Tutorials\04_sentence_recognition``` folder; 6 | 5. [Introduction to speech recognition with TensorFlow](https://pylessons.com/speech-recognition), code in ```Tutorials\05_speech_recognition``` folder; 7 | 6. [Introduction to PyTorch in a practical way](https://pylessons.com/pytorch-introduction), code in ```Tutorials\06_pytorch_introduction``` folder; 8 | 7. [Using custom wrapper to simplify PyTorch models training pipeline](https://pylessons.com/pytorch-introduction), code in ```Tutorials\07_pytorch_wrapper``` folder; 9 | 8. 
[Handwriting words recognition with PyTorch](https://pylessons.com/handwriting-recognition-pytorch), code in ```Tutorials\08_handwriting_recognition_torch``` folder; -------------------------------------------------------------------------------- /bin/read_parquet.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | file_path = "/home/rokbal/Downloads/train-00000-of-00001-bfc7b63751c36ab0 (1).parquet" 4 | 5 | df = pd.read_parquet(file_path) 6 | 7 | print(df.head()) -------------------------------------------------------------------------------- /bin/setup.sh: -------------------------------------------------------------------------------- 1 | python3 -m venv venv 2 | activate() { 3 | . venv/bin/activate 4 | echo "installing requirements to virtual environment" 5 | pip install -r requirements.txt 6 | } 7 | activate -------------------------------------------------------------------------------- /mltu/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.2.5" 2 | 3 | from .annotations.images import Image 4 | from .annotations.images import CVImage 5 | from .annotations.images import PillowImage -------------------------------------------------------------------------------- /mltu/annotations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/annotations/__init__.py -------------------------------------------------------------------------------- /mltu/annotations/audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | class Audio: 5 | """ Audio object 6 | 7 | Attributes: 8 | audio (np.ndarray): Audio array 9 | sample_rate (int): Sample rate 10 | init_successful (bool): True if audio was successfully read 11 | library (object): Library used to read audio, tested only with librosa 12 | """ 13 | init_successful = False 14 | augmented=False 15 | 16 | def __init__( 17 | self, 18 | audioPath: str, 19 | sample_rate: int=22050, 20 | library=None 21 | ) -> None: 22 | if library is None: 23 | raise ValueError("library must be provided. (e.g. 
librosa object)") 24 | 25 | if isinstance(audioPath, str): 26 | if not os.path.exists(audioPath): 27 | raise FileNotFoundError(f"Image {audioPath} not found.") 28 | 29 | self._audio, self.sample_rate = library.load(audioPath, sr=sample_rate) 30 | self.path = audioPath 31 | self.init_successful = True 32 | 33 | else: 34 | raise TypeError(f"audioPath must be path to audio file, not {type(audioPath)}") 35 | 36 | @property 37 | def audio(self) -> np.ndarray: 38 | return self._audio 39 | 40 | @audio.setter 41 | def audio(self, value: np.ndarray): 42 | self.augmented = True 43 | self._audio = value 44 | 45 | @property 46 | def shape(self) -> tuple: 47 | return self._audio.shape 48 | 49 | def numpy(self) -> np.ndarray: 50 | return self._audio 51 | 52 | def __add__(self, other: np.ndarray) -> np.ndarray: 53 | self._audio = self._audio + other 54 | self.augmented = True 55 | return self 56 | 57 | def __len__(self) -> int: 58 | return len(self._audio) 59 | 60 | def __call__(self) -> np.ndarray: 61 | return self._audio 62 | 63 | def __repr__(self): 64 | return repr(self._audio) 65 | 66 | def __array__(self): 67 | return self._audio -------------------------------------------------------------------------------- /mltu/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | 5 | class BaseModelConfigs: 6 | def __init__(self): 7 | self.model_path = None 8 | 9 | def serialize(self): 10 | class_attributes = {key: value 11 | for (key, value) 12 | in type(self).__dict__.items() 13 | if key not in ['__module__', '__init__', '__doc__', '__annotations__']} 14 | instance_attributes = self.__dict__ 15 | 16 | # first init with class attributes then apply instance attributes overwriting any existing duplicate attributes 17 | all_attributes = class_attributes.copy() 18 | all_attributes.update(instance_attributes) 19 | 20 | return all_attributes 21 | 22 | def save(self, name: str = "configs.yaml"): 23 | if self.model_path is None: 24 | raise Exception("Model path is not specified") 25 | 26 | # create directory if not exist 27 | if not os.path.exists(self.model_path): 28 | os.makedirs(self.model_path) 29 | 30 | with open(os.path.join(self.model_path, name), "w") as f: 31 | yaml.dump(self.serialize(), f) 32 | 33 | @staticmethod 34 | def load(configs_path: str): 35 | with open(configs_path, "r") as f: 36 | configs = yaml.load(f, Loader=yaml.FullLoader) 37 | 38 | config = BaseModelConfigs() 39 | for key, value in configs.items(): 40 | setattr(config, key, value) 41 | 42 | return config 43 | -------------------------------------------------------------------------------- /mltu/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import typing 4 | import numpy as np 5 | import onnxruntime as ort 6 | from collections import deque 7 | 8 | class FpsWrapper: 9 | """ Decorator to calculate the frames per second of a function 10 | """ 11 | def __init__(self, func: typing.Callable): 12 | self.func = func 13 | self.fps_list = deque([], maxlen=100) 14 | 15 | def __call__(self, *args, **kwargs): 16 | start = time.time() 17 | results = self.func(self.instance, *args, **kwargs) 18 | self.fps_list.append(1 / (time.time() - start)) 19 | self.instance.fps = np.mean(self.fps_list) 20 | return results 21 | 22 | def __get__(self, instance, owner): 23 | self.instance = instance 24 | return self.__call__.__get__(instance, owner) 25 | 26 | 27 | class OnnxInferenceModel: 28 | """ Base class 
for all inference models that use onnxruntime 29 | 30 | Attributes: 31 | model_path (str, optional): Path to the model folder. Defaults to "". 32 | force_cpu (bool, optional): Force the model to run on CPU or GPU. Defaults to GPU. 33 | default_model_name (str, optional): Default model name. Defaults to "model.onnx". 34 | """ 35 | def __init__( 36 | self, 37 | model_path: str = "", 38 | force_cpu: bool = False, 39 | default_model_name: str = "model.onnx", 40 | *args, **kwargs 41 | ): 42 | self.model_path = model_path.replace("\\", "/") 43 | self.force_cpu = force_cpu 44 | self.default_model_name = default_model_name 45 | 46 | # check if model path is a directory with os path 47 | if os.path.isdir(self.model_path): 48 | self.model_path = os.path.join(self.model_path, self.default_model_name) 49 | 50 | if not os.path.exists(self.model_path): 51 | raise Exception(f"Model path ({self.model_path}) does not exist") 52 | 53 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if ort.get_device() == "GPU" and not force_cpu else ["CPUExecutionProvider"] 54 | 55 | self.model = ort.InferenceSession(self.model_path, providers=providers) 56 | 57 | self.metadata = {} 58 | if self.model.get_modelmeta().custom_metadata_map: 59 | # add metadata to self object 60 | for key, value in self.model.get_modelmeta().custom_metadata_map.items(): 61 | try: 62 | new_value = eval(value) # in case the value is a list or dict 63 | except: 64 | new_value = value 65 | self.metadata[key] = new_value 66 | 67 | # Update providers priority to only CPUExecutionProvider 68 | if self.force_cpu: 69 | self.model.set_providers(["CPUExecutionProvider"]) 70 | 71 | self.input_shapes = [meta.shape for meta in self.model.get_inputs()] 72 | self.input_names = [meta.name for meta in self.model._inputs_meta] 73 | self.output_names = [meta.name for meta in self.model._outputs_meta] 74 | 75 | def predict(self, data: np.ndarray, *args, **kwargs): 76 | raise NotImplementedError 77 | 78 | @FpsWrapper 79 | def __call__(self, data: np.ndarray): 80 | results = self.predict(data) 81 | return results -------------------------------------------------------------------------------- /mltu/tensorflow/README.md: -------------------------------------------------------------------------------- 1 | # Functions and objects specific for TensorFlow 2.* and Python 3 -------------------------------------------------------------------------------- /mltu/tensorflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/tensorflow/__init__.py -------------------------------------------------------------------------------- /mltu/tensorflow/callbacks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | from keras.callbacks import Callback 4 | 5 | import logging 6 | 7 | class Model2onnx(Callback): 8 | """ Converts the model to onnx format after training is finished. """ 9 | def __init__( 10 | self, 11 | saved_model_path: str, 12 | metadata: dict=None, 13 | save_on_epoch_end: bool=False, 14 | ) -> None: 15 | """ Converts the model to onnx format after training is finished. 16 | Args: 17 | saved_model_path (str): Path to the saved .h5 model. 18 | metadata (dict, optional): Dictionary containing metadata to be added to the onnx model. Defaults to None. 19 | save_on_epoch_end (bool, optional): Save the onnx model on every epoch end. 
Defaults to False. 20 | """ 21 | super().__init__() 22 | self.saved_model_path = saved_model_path 23 | self.metadata = metadata 24 | self.save_on_epoch_end = save_on_epoch_end 25 | 26 | try: 27 | import tf2onnx 28 | except: 29 | raise ImportError("tf2onnx is not installed. Please install it using 'pip install tf2onnx'") 30 | 31 | try: 32 | import onnx 33 | except: 34 | raise ImportError("onnx is not installed. Please install it using 'pip install onnx'") 35 | 36 | @staticmethod 37 | def model2onnx(model: tf.keras.Model, onnx_model_path: str): 38 | try: 39 | import tf2onnx 40 | 41 | # convert the model to onnx format 42 | tf2onnx.convert.from_keras(model, output_path=onnx_model_path) 43 | 44 | except Exception as e: 45 | print(e) 46 | 47 | @staticmethod 48 | def include_metadata(onnx_model_path: str, metadata: dict=None): 49 | try: 50 | if metadata and isinstance(metadata, dict): 51 | 52 | import onnx 53 | # Load the ONNX model 54 | onnx_model = onnx.load(onnx_model_path) 55 | 56 | # Add the metadata dictionary to the model's metadata_props attribute 57 | for key, value in metadata.items(): 58 | meta = onnx_model.metadata_props.add() 59 | meta.key = key 60 | meta.value = str(value) 61 | 62 | # Save the modified ONNX model 63 | onnx.save(onnx_model, onnx_model_path) 64 | 65 | except Exception as e: 66 | print(e) 67 | 68 | def on_epoch_end(self, epoch: int, logs: dict=None): 69 | """ Converts the model to onnx format on every epoch end. """ 70 | if self.save_on_epoch_end: 71 | self.on_train_end(logs=logs) 72 | 73 | def on_train_end(self, logs=None): 74 | """ Converts the model to onnx format after training is finished. """ 75 | self.model.load_weights(self.saved_model_path) 76 | onnx_model_path = self.saved_model_path.replace(".h5", ".onnx") 77 | self.model2onnx(self.model, onnx_model_path) 78 | self.include_metadata(onnx_model_path, self.metadata) 79 | 80 | 81 | class TrainLogger(Callback): 82 | """Logs training metrics to a file. 83 | 84 | Args: 85 | log_path (str): Path to the directory where the log file will be saved. 86 | log_file (str, optional): Name of the log file. Defaults to 'logs.log'. 87 | logLevel (int, optional): Logging level. Defaults to logging.INFO. 
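        console_output (bool, optional): Whether to keep logging to the console as well. When False, previously attached handlers are removed so records go only to the log file. Defaults to False.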
88 | """ 89 | def __init__(self, log_path: str, log_file: str="logs.log", logLevel=logging.INFO, console_output=False) -> None: 90 | super().__init__() 91 | self.log_path = log_path 92 | self.log_file = log_file 93 | 94 | if not os.path.exists(log_path): 95 | os.mkdir(log_path) 96 | 97 | self.logger = logging.getLogger() 98 | self.logger.setLevel(logLevel) 99 | 100 | self.formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 101 | 102 | self.file_handler = logging.FileHandler(os.path.join(self.log_path, self.log_file)) 103 | self.file_handler.setLevel(logLevel) 104 | self.file_handler.setFormatter(self.formatter) 105 | 106 | if not console_output: 107 | self.logger.handlers[:] = [] 108 | 109 | self.logger.addHandler(self.file_handler) 110 | 111 | def on_epoch_end(self, epoch: int, logs: dict=None): 112 | epoch_message = f"Epoch {epoch}; " 113 | logs_message = "; ".join([f"{key}: {value}" for key, value in logs.items()]) 114 | self.logger.info(epoch_message + logs_message) 115 | 116 | 117 | class WarmupCosineDecay(Callback): 118 | """ Cosine decay learning rate scheduler with warmup 119 | 120 | Args: 121 | lr_after_warmup (float): Learning rate after warmup 122 | final_lr (float): Final learning rate 123 | warmup_epochs (int): Number of warmup epochs 124 | decay_epochs (int): Number of decay epochs 125 | initial_lr (float, optional): Initial learning rate. Defaults to 0.0. 126 | verbose (bool, optional): Whether to print learning rate. Defaults to False. 127 | """ 128 | def __init__( 129 | self, 130 | lr_after_warmup: float, 131 | final_lr: float, 132 | warmup_epochs: int, 133 | decay_epochs: int, 134 | initial_lr: float=0.0, 135 | verbose=False 136 | ) -> None: 137 | super(WarmupCosineDecay, self).__init__() 138 | self.lr_after_warmup = lr_after_warmup 139 | self.final_lr = final_lr 140 | self.warmup_epochs = warmup_epochs 141 | self.decay_epochs = decay_epochs 142 | self.initial_lr = initial_lr 143 | self.verbose = verbose 144 | 145 | def on_epoch_begin(self, epoch: int, logs: dict=None): 146 | """ Adjust learning rate at the beginning of each epoch """ 147 | 148 | if epoch >= self.warmup_epochs + self.decay_epochs: 149 | return logs 150 | 151 | if epoch < self.warmup_epochs: 152 | lr = self.initial_lr + (self.lr_after_warmup - self.initial_lr) * (epoch + 1) / self.warmup_epochs 153 | else: 154 | progress = (epoch - self.warmup_epochs) / self.decay_epochs 155 | lr = self.final_lr + 0.5 * (self.lr_after_warmup - self.final_lr) * (1 + tf.cos(tf.constant(progress) * 3.14159)) 156 | 157 | tf.keras.backend.set_value(self.model.optimizer.lr, lr) 158 | 159 | if self.verbose: 160 | print(f"Epoch {epoch + 1} - Learning Rate: {lr}") 161 | 162 | def on_epoch_end(self, epoch: int, logs: dict=None): 163 | logs = logs or {} 164 | 165 | # Log the learning rate value 166 | logs["lr"] = self.model.optimizer.lr 167 | 168 | return logs -------------------------------------------------------------------------------- /mltu/tensorflow/dataProvider.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from ..dataProvider import DataProvider as dataProvider 4 | 5 | class DataProvider(dataProvider, tf.keras.utils.Sequence): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | -------------------------------------------------------------------------------- /mltu/tensorflow/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as 
tf 2 | from keras import layers 3 | from keras import backend as K 4 | 5 | class SelfAttention(layers.Layer): 6 | """ A self-attention layer for convolutional neural networks. 7 | 8 | This layer takes as input a tensor of shape (batch_size, height, width, channels) 9 | and applies self-attention to the channels dimension. 10 | 11 | Args: 12 | num_heads (int): The number of attention heads to use. Defaults to 8. 13 | wrapper (tf.keras.layers.Wrapper): A wrapper layer to apply to the convolutional layers. 14 | 15 | Raises: 16 | TypeError: If `wrapper` is provided and is not a subclass of `tf.keras.layers.Wrapper`. 17 | """ 18 | def __init__(self, num_heads: int = 8, wrapper: tf.keras.layers.Wrapper = None): 19 | super(SelfAttention, self).__init__() 20 | self.num_heads = num_heads 21 | self.wrapper = wrapper 22 | 23 | if wrapper and not issubclass(wrapper, tf.keras.layers.Wrapper): 24 | raise TypeError("wrapper must be a class derived from tf.keras.layers.Wrapper") 25 | 26 | def get_config(self) -> dict: 27 | config = super().get_config() 28 | config.update({ 29 | "num_heads": self.num_heads, 30 | }) 31 | return config 32 | 33 | def build(self, input_shape): 34 | _, h, w, c = input_shape 35 | self.query_conv = self._conv(filters=c // self.num_heads) 36 | self.key_conv = self._conv(filters=c // self.num_heads) 37 | self.value_conv = self._conv(filters=c) 38 | self.gamma = self.add_weight("gamma", shape=[1], initializer=tf.zeros_initializer(), trainable=True) 39 | 40 | def _conv(self, filters: int) -> tf.keras.layers.Layer: 41 | """ Helper function to create a convolutional layer with the given number of filters. 42 | 43 | Args: 44 | filters (int): The number of filters to use. 45 | 46 | Returns: 47 | tf.keras.layers.Layer: The created convolutional layer. 48 | """ 49 | conv = layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding="same") 50 | if self.wrapper: 51 | conv = self.wrapper(conv) 52 | 53 | return conv 54 | 55 | def call(self, inputs: tf.Tensor) -> tf.Tensor: 56 | """ Apply the self-attention mechanism to the input tensor. 57 | 58 | Args: 59 | inputs (tf.Tensor): The input tensor of shape (batch_size, height, width, channels). 60 | 61 | Returns: 62 | tf.Tensor: The output tensor after the self-attention mechanism is applied. 63 | """ 64 | _, h, w, c = inputs.shape 65 | q = self.query_conv(inputs) 66 | k = self.key_conv(inputs) 67 | v = self.value_conv(inputs) 68 | 69 | q_reshaped = tf.reshape(q, [-1, h * w, c // self.num_heads]) 70 | k_reshaped = tf.reshape(k, [-1, h * w, c // self.num_heads]) 71 | v_reshaped = tf.reshape(v, [-1, h * w, c]) 72 | 73 | # Compute the attention scores by taking the dot product of the query and key tensors. 74 | attention_scores = tf.matmul(q_reshaped, k_reshaped, transpose_b=True) 75 | 76 | # Scale the attention scores by the square root of the number of channels. 77 | attention_scores = attention_scores / tf.sqrt(tf.cast(c // self.num_heads, dtype=tf.float32)) 78 | 79 | # Apply a softmax function to the attention scores to obtain the attention weights. 80 | attention_weights = tf.nn.softmax(attention_scores, axis=-1) 81 | 82 | # Apply the attention weights to the value tensor to obtain the attention output. 83 | attention_output = tf.matmul(attention_weights, v_reshaped) 84 | 85 | # Reshape the attended value tensor to the original input tensor shape. 86 | attention_output = tf.reshape(attention_output, [-1, h, w, c]) 87 | 88 | # Apply the gamma parameter to the attended value tensor and add it to the output tensor. 
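        # Because gamma is created with a zeros initializer, the layer starts out as an identity
        # mapping (output equals inputs) and the attention contribution is blended in gradually as
        # gamma is learned, the residual gating trick used in SAGAN-style self-attention.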
89 | attention_output = self.gamma * attention_output + inputs 90 | 91 | return attention_output 92 | 93 | 94 | class SpectralNormalization(tf.keras.layers.Wrapper): 95 | """Spectral Normalization Wrapper. !!! This is not working yet !!!""" 96 | def __init__(self, layer, power_iterations=1, eps=1e-12, **kwargs): 97 | super(SpectralNormalization, self).__init__(layer, **kwargs) 98 | 99 | if power_iterations <= 0: 100 | raise ValueError( 101 | "`power_iterations` should be greater than zero, got " 102 | "`power_iterations={}`".format(power_iterations) 103 | ) 104 | self.power_iterations = power_iterations 105 | self.eps = eps 106 | if not isinstance(layer, tf.keras.layers.Layer): 107 | raise ValueError( 108 | "Please initialize `TimeDistributed` layer with a " 109 | "`Layer` instance. You passed: {input}".format(input=layer)) 110 | 111 | def build(self, input_shape): 112 | if not self.layer.built: 113 | self.layer.build(input_shape) 114 | 115 | self.w = self.layer.kernel 116 | self.w_shape = self.w.shape.as_list() 117 | 118 | # self.v = self.add_weight(shape=(1, self.w_shape[0] * self.w_shape[1] * self.w_shape[2]), 119 | # initializer=tf.initializers.TruncatedNormal(stddev=0.02), 120 | # trainable=False, 121 | # name="sn_v", 122 | # dtype=tf.float32) 123 | 124 | self.u = self.add_weight(shape=(1, self.w_shape[-1]), 125 | initializer=tf.initializers.TruncatedNormal(stddev=0.02), 126 | trainable=False, 127 | name="sn_u", 128 | dtype=tf.float32) 129 | 130 | super(SpectralNormalization, self).build() 131 | 132 | def l2normalize(self, v, eps=1e-12): 133 | return v / (tf.reduce_sum(v ** 2) ** 0.5 + eps) 134 | 135 | def power_iteration(self, W, u, rounds=1): 136 | _u = u 137 | 138 | for _ in range(rounds): 139 | # v_ = tf.matmul(_u, tf.transpose(W)) 140 | # v_hat = self.l2normalize(v_) 141 | _v = self.l2normalize(K.dot(_u, K.transpose(W)), eps=self.eps) 142 | 143 | # u_ = tf.matmul(v_hat, W) 144 | # u_hat = self.l2normalize(u_) 145 | _u = self.l2normalize(K.dot(_v, W), eps=self.eps) 146 | 147 | return _u, _v 148 | 149 | def call(self, inputs, training=None): 150 | if training is None: 151 | training = tf.keras.backend.learning_phase() 152 | 153 | if training: 154 | self.update_weights() 155 | output = self.layer(inputs) 156 | self.restore_weights() # Restore weights because of this formula "W = W - alpha * W_SN`" 157 | return output 158 | 159 | return self.layer(inputs) 160 | 161 | def update_weights(self): 162 | w_reshaped = tf.reshape(self.w, [-1, self.w_shape[-1]]) 163 | 164 | # u_hat = self.u 165 | # v_hat = self.v # init v vector 166 | 167 | u_hat, v_hat = self.power_iteration(w_reshaped, self.u, self.power_iterations) 168 | # v_ = tf.matmul(u_hat, tf.transpose(w_reshaped)) 169 | # # v_hat = v_ / (tf.reduce_sum(v_**2)**0.5 + self.eps) 170 | # v_hat = self.l2normalize(v_, self.eps) 171 | 172 | # u_ = tf.matmul(v_hat, w_reshaped) 173 | # # u_hat = u_ / (tf.reduce_sum(u_**2)**0.5 + self.eps) 174 | # u_hat = self.l2normalize(u_, self.eps) 175 | 176 | # sigma = tf.matmul(tf.matmul(v_hat, w_reshaped), tf.transpose(u_hat)) 177 | sigma=K.dot(K.dot(v_hat, w_reshaped), K.transpose(u_hat)) 178 | self.u.assign(u_hat) 179 | # self.v.assign(v_hat) 180 | 181 | self.layer.kernel.assign(self.w / sigma) 182 | 183 | def restore_weights(self): 184 | self.layer.kernel.assign(self.w) -------------------------------------------------------------------------------- /mltu/tensorflow/losses.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class 
CTCloss(tf.keras.losses.Loss): 5 | """ CTCLoss objec for training the model""" 6 | def __init__(self, name: str = "CTCloss") -> None: 7 | super(CTCloss, self).__init__() 8 | self.name = name 9 | self.loss_fn = tf.keras.backend.ctc_batch_cost 10 | 11 | def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None) -> tf.Tensor: 12 | """ Compute the training batch CTC loss value""" 13 | batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64") 14 | input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") 15 | label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") 16 | 17 | input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64") 18 | label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64") 19 | 20 | loss = self.loss_fn(y_true, y_pred, input_length, label_length) 21 | 22 | return loss -------------------------------------------------------------------------------- /mltu/tensorflow/model_utils.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | from keras import layers 5 | from keras.models import Model 6 | 7 | class CustomModel(Model): 8 | """ Custom TensorFlow model for debugging training process purposes 9 | """ 10 | def train_step(self, train_data): 11 | # Unpack the data. Its structure depends on your model and 12 | # on what you pass to `fit()`. 13 | inputs, targets = train_data 14 | with tf.GradientTape() as tape: 15 | results = self(inputs, training=True) 16 | loss = self.compiled_loss(targets, results, regularization_losses=self.losses) 17 | gradients = tape.gradient(loss, self.trainable_weights) 18 | 19 | # Applying the gradients on the model using the specified optimizer 20 | self.optimizer.apply_gradients(zip(gradients, self.trainable_weights)) 21 | 22 | # Update the metrics. 23 | # Metrics are configured in `compile()`. 24 | self.compiled_metrics.update_state(targets, results) 25 | 26 | return {m.name: m.result() for m in self.metrics} 27 | 28 | def test_step(self, test_data): 29 | inputs, targets = test_data 30 | # Get prediction from model 31 | results = self(inputs, training=False) 32 | 33 | # Update the loss 34 | self.compiled_loss(targets, results, regularization_losses=self.losses) 35 | 36 | # Update the metrics 37 | self.compiled_metrics.update_state(targets, results) 38 | 39 | # Return a dict mapping metric names to current value. 40 | # Note that it will include the loss (tracked in self.metrics). 
41 | return {m.name: m.result() for m in self.metrics} 42 | 43 | 44 | def activation_layer(layer, activation: str="relu", alpha: float=0.1) -> tf.Tensor: 45 | """ Activation layer wrapper for LeakyReLU and ReLU activation functions 46 | Args: 47 | layer: tf.Tensor 48 | activation: str, activation function name (default: 'relu') 49 | alpha: float (LeakyReLU activation function parameter) 50 | Returns: 51 | tf.Tensor 52 | """ 53 | if activation == "relu": 54 | layer = layers.ReLU()(layer) 55 | elif activation == "leaky_relu": 56 | layer = layers.LeakyReLU(alpha=alpha)(layer) 57 | 58 | return layer 59 | 60 | 61 | def residual_block( 62 | x: tf.Tensor, 63 | filter_num: int, 64 | strides: typing.Union[int, list] = 2, 65 | kernel_size: typing.Union[int, list] = 3, 66 | skip_conv: bool = True, 67 | padding: str = "same", 68 | kernel_initializer: str = "he_uniform", 69 | activation: str = "relu", 70 | dropout: float = 0.2): 71 | # Create skip connection tensor 72 | x_skip = x 73 | 74 | # Perform 1-st convolution 75 | x = layers.Conv2D(filter_num, kernel_size, padding = padding, strides = strides, kernel_initializer=kernel_initializer)(x) 76 | x = layers.BatchNormalization()(x) 77 | x = activation_layer(x, activation=activation) 78 | 79 | # Perform 2-nd convoluti 80 | x = layers.Conv2D(filter_num, kernel_size, padding = padding, kernel_initializer=kernel_initializer)(x) 81 | x = layers.BatchNormalization()(x) 82 | 83 | # Perform 3-rd convolution if skip_conv is True, matchin the number of filters and the shape of the skip connection tensor 84 | if skip_conv: 85 | x_skip = layers.Conv2D(filter_num, 1, padding = padding, strides = strides, kernel_initializer=kernel_initializer)(x_skip) 86 | 87 | # Add x and skip connection and apply activation function 88 | x = layers.Add()([x, x_skip]) 89 | x = activation_layer(x, activation=activation) 90 | 91 | # Apply dropout 92 | if dropout: 93 | x = layers.Dropout(dropout)(x) 94 | 95 | return x -------------------------------------------------------------------------------- /mltu/tensorflow/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.10.1 2 | tf2onnx 3 | onnx -------------------------------------------------------------------------------- /mltu/tensorflow/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/tensorflow/transformer/__init__.py -------------------------------------------------------------------------------- /mltu/tensorflow/transformer/attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class BaseAttention(tf.keras.layers.Layer): 4 | """ 5 | Base class for all attention layers. It contains the common functionality of all attention layers. 6 | This layer contains a MultiHeadAttention layer, a LayerNormalization layer and an Add layer. 7 | It is used as a base class for the GlobalSelfAttention, CausalSelfAttention and CrossAttention layers. 8 | And it is not intended to be used directly. 9 | 10 | Methods: 11 | call: Performs the forward pass of the layer. 12 | 13 | Attributes: 14 | mha (tf.keras.layers.MultiHeadAttention): The MultiHeadAttention layer. 15 | layernorm (tf.keras.layers.LayerNormalization): The LayerNormalization layer. 16 | add (tf.keras.layers.Add): The Add layer. 
17 | """ 18 | def __init__(self, **kwargs: dict): 19 | """ Constructor of the BaseAttention layer. 20 | 21 | Args: 22 | **kwargs: Additional keyword arguments that are passed to the MultiHeadAttention layer, e. g. 23 | num_heads (number of heads), key_dim (dimensionality of the key space), etc. 24 | """ 25 | super().__init__() 26 | self.mha = tf.keras.layers.MultiHeadAttention(**kwargs) 27 | self.layernorm = tf.keras.layers.LayerNormalization() 28 | self.add = tf.keras.layers.Add() 29 | 30 | 31 | class CrossAttention(BaseAttention): 32 | """ 33 | A class that implements the cross-attention layer by inheriting from the BaseAttention class. 34 | This layer is used to process two different sequences and attends to the context sequence while processing the query sequence. 35 | 36 | Methods: 37 | call: Performs the forward pass of the layer. 38 | 39 | Attributes: 40 | mha (tf.keras.layers.MultiHeadAttention): The MultiHeadAttention layer. 41 | layernorm (tf.keras.layers.LayerNormalization): The LayerNormalization layer. 42 | add (tf.keras.layers.Add): The Add layer. 43 | """ 44 | def call(self, x: tf.Tensor, context: tf.Tensor) -> tf.Tensor: 45 | """ 46 | The call function that performs the cross-attention operation. 47 | 48 | Args: 49 | x (tf.Tensor): The query (expected Transformer results) sequence of shape (batch_size, seq_length, d_model). 50 | context (tf.Tensor): The context (inputs to the Transformer) sequence of shape (batch_size, seq_length, d_model). 51 | 52 | Returns: 53 | tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model). 54 | """ 55 | attn_output, attn_scores = self.mha(query=x, key=context, value=context, return_attention_scores=True) 56 | 57 | # Cache the attention scores for plotting later. 58 | self.last_attn_scores = attn_scores 59 | 60 | x = self.add([x, attn_output]) 61 | x = self.layernorm(x) 62 | 63 | return x 64 | 65 | 66 | class GlobalSelfAttention(BaseAttention): 67 | """ 68 | A class that implements the global self-attention layer by inheriting from the BaseAttention class. 69 | This layer is used to process a single sequence and attends to all the tokens in the sequence. 70 | 71 | Methods: 72 | call: Performs the forward pass of the layer. 73 | 74 | Attributes: 75 | mha (tf.keras.layers.MultiHeadAttention): The MultiHeadAttention layer. 76 | layernorm (tf.keras.layers.LayerNormalization): The LayerNormalization layer. 77 | add (tf.keras.layers.Add): The Add layer. 78 | """ 79 | def call(self, x: tf.Tensor) -> tf.Tensor: 80 | """ 81 | The call function that performs the global self-attention operation. 82 | 83 | Args: 84 | x (tf.Tensor): The input sequence of shape (batch_size, seq_length, d_model). 85 | 86 | Returns: 87 | tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model). 88 | """ 89 | attn_output = self.mha(query=x, value=x, key=x) 90 | x = self.add([x, attn_output]) 91 | x = self.layernorm(x) 92 | return x 93 | 94 | 95 | class CausalSelfAttention(BaseAttention): 96 | """ 97 | Call self attention on the input sequence, ensuring that each position in the 98 | output depends only on previous positions (i.e. a causal model). 99 | 100 | Methods: 101 | call: Performs the forward pass of the layer. 102 | 103 | Attributes: 104 | mha (tf.keras.layers.MultiHeadAttention): The MultiHeadAttention layer. 105 | layernorm (tf.keras.layers.LayerNormalization): The LayerNormalization layer. 106 | add (tf.keras.layers.Add): The Add layer. 
107 | """ 108 | def call(self, x: tf.Tensor) -> tf.Tensor: 109 | """ 110 | The call function that performs the causal self-attention operation. 111 | 112 | Args: 113 | x (tf.Tensor): The input sequence of shape (batch_size, seq_length, d_model). 114 | 115 | Returns: 116 | tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model). 117 | """ 118 | attn_output = self.mha(query=x, value=x, key=x, use_causal_mask = True) 119 | x = self.add([x, attn_output]) 120 | x = self.layernorm(x) 121 | return x -------------------------------------------------------------------------------- /mltu/tensorflow/transformer/callbacks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from keras.callbacks import Callback 3 | from mltu.tensorflow.callbacks import Model2onnx 4 | 5 | 6 | class EncDecSplitCallback(Callback): 7 | """Callback to extract the encoder and decoder models from Transformer model and save them separately 8 | Also, this callback incorporates Model2onnx callback to convert the encoder and decoder models to ONNX format 9 | 10 | Args: 11 | model_path (str): Path to save the encoder and decoder models 12 | encoder_metadata (dict, optional): Metadata to save with the encoder model. Defaults to None. 13 | decoder_metadata (dict, optional): Metadata to save with the decoder model. Defaults to None. 14 | """ 15 | 16 | def __init__( 17 | self, 18 | model_path: str, 19 | encoder_metadata: dict = None, 20 | decoder_metadata: dict = None, 21 | model_name = "model.h5" 22 | ): 23 | """Callback to extract the encoder and decoder models from Transformer model and save them separately""" 24 | super(EncDecSplitCallback, self).__init__() 25 | self.model_path = model_path 26 | self.encoder_metadata = encoder_metadata 27 | self.decoder_metadata = decoder_metadata 28 | self.model_name = model_name 29 | 30 | def on_train_end(self, epoch: int, logs: dict = None): 31 | try: 32 | # load best model weights 33 | self.model.load_weights(self.model_path + "/" + self.model_name) 34 | 35 | # extract encoder and decoder models 36 | encoder_model = tf.keras.Model( 37 | inputs=self.model.inputs[0], outputs=self.model.get_layer("encoder").output 38 | ) 39 | decoder_model = tf.keras.Model( 40 | inputs=[self.model.inputs[1], self.model.get_layer("encoder").output], 41 | outputs=self.model.layers[-1].output, 42 | ) 43 | 44 | # save encoder and decoder models 45 | encoder_model.save(self.model_path + "/encoder.h5") 46 | decoder_model.save(self.model_path + "/decoder.h5") 47 | 48 | # convert encoder and decoder models to onnx 49 | Model2onnx.model2onnx(encoder_model, self.model_path + "/encoder.onnx") 50 | Model2onnx.model2onnx(decoder_model, self.model_path + "/decoder.onnx") 51 | 52 | # save encoder and decoder metadata 53 | if self.encoder_metadata: 54 | Model2onnx.include_metadata(self.model_path + "/encoder.onnx", self.encoder_metadata) 55 | if self.decoder_metadata: 56 | Model2onnx.include_metadata(self.model_path + "/decoder.onnx", self.decoder_metadata) 57 | except Exception as e: 58 | print(e) 59 | pass 60 | -------------------------------------------------------------------------------- /mltu/tensorflow/transformer/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class MaskedLoss(tf.keras.losses.Loss): 5 | """ Masked loss function for Transformer. 6 | 7 | Args: 8 | mask_value (int, optional): Mask value. Defaults to 0. 
9 | reduction (str, optional): Reduction method. Defaults to 'none'. 10 | """ 11 | def __init__(self, mask_value: int=0, reduction: str='none') -> None: 12 | super(MaskedLoss, self).__init__() 13 | self.mask_value = mask_value 14 | self.reduction = reduction 15 | self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=reduction) 16 | 17 | def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None) -> tf.Tensor: 18 | """ Calculate masked loss. 19 | 20 | Args: 21 | y_true (tf.Tensor): True labels. 22 | y_pred (tf.Tensor): Predicted labels. 23 | 24 | Returns: 25 | tf.Tensor: Masked loss. 26 | """ 27 | mask = y_true != self.mask_value 28 | loss = self.loss_object(y_true, y_pred) 29 | 30 | mask = tf.cast(mask, dtype=loss.dtype) 31 | loss *= mask 32 | 33 | loss = tf.reduce_sum(loss) / tf.reduce_sum(mask) 34 | return loss 35 | 36 | 37 | class MaskedAccuracy(tf.keras.metrics.Metric): 38 | """ Masked accuracy metric for Transformer. 39 | 40 | Args: 41 | mask_value (int, optional): Mask value. Defaults to 0. 42 | name (str, optional): Name of the metric. Defaults to 'masked_accuracy'. 43 | """ 44 | def __init__(self, mask_value: int=0, name: str='masked_accuracy') -> None: 45 | super(MaskedAccuracy, self).__init__(name=name) 46 | self.mask_value = mask_value 47 | self.total = self.add_weight(name='total', initializer='zeros') 48 | self.count = self.add_weight(name='count', initializer='zeros') 49 | 50 | @tf.function 51 | def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None): 52 | """ Update state of the metric. 53 | 54 | Args: 55 | y_true (tf.Tensor): True labels. 56 | y_pred (tf.Tensor): Predicted labels. 57 | """ 58 | pred = tf.argmax(y_pred, axis=2) 59 | label = tf.cast(y_true, pred.dtype) 60 | match = label == pred 61 | 62 | mask = label != self.mask_value 63 | 64 | match = match & mask 65 | 66 | match = tf.cast(match, dtype=tf.float32) 67 | mask = tf.cast(mask, dtype=tf.float32) 68 | match = tf.reduce_sum(match) 69 | mask = tf.reduce_sum(mask) 70 | 71 | self.total.assign_add(match) 72 | self.count.assign_add(mask) 73 | 74 | def result(self) -> tf.Tensor: 75 | """ Calculate masked accuracy. 76 | 77 | Returns: 78 | tf.Tensor: Masked accuracy. 79 | """ 80 | return self.total / self.count 81 | 82 | 83 | class CERMetric(tf.keras.metrics.Metric): 84 | """A custom TensorFlow metric to compute the Character Error Rate (CER). 85 | 86 | Args: 87 | vocabulary: A string of the vocabulary used to encode the labels. 88 | name: (Optional) string name of the metric instance. 89 | **kwargs: Additional keyword arguments. 90 | """ 91 | def __init__(self, end_token, padding_token: int=0, name="CER", **kwargs): 92 | # Initialize the base Metric class 93 | super(CERMetric, self).__init__(name=name, **kwargs) 94 | 95 | # Initialize variables to keep track of the cumulative character/word error rates and counter 96 | self.cer_accumulator = tf.Variable(0.0, name="cer_accumulator", dtype=tf.float32) 97 | self.batch_counter = tf.Variable(0, name="batch_counter", dtype=tf.int32) 98 | 99 | self.padding_token = padding_token 100 | self.end_token = end_token 101 | 102 | def get_cer(self, pred, y_true, padding=-1): 103 | """ Calculates the character error rate (CER) between the predicted labels and true labels for a batch of input data. 
104 | 105 | Args: 106 | pred(tf.Tensor): The predicted labels, with dtype=tf.int32, usually output from tf.keras.backend.ctc_decode 107 | y_true (tf.Tensor): The true labels, with dtype=tf.int32 108 | padding (int, optional): The padding token when converting to sparse tensor. Defaults to -1. 109 | 110 | Returns: 111 | tf.Tensor: The CER between the predicted labels and true labels 112 | """ 113 | # find index where end token is 114 | equal = tf.equal(pred, self.end_token) 115 | equal_int = tf.cast(equal, tf.int64) 116 | end_token_index = tf.argmax(equal_int, axis=1) 117 | 118 | # mask out everything after end token 119 | new_range = tf.range(tf.shape(pred)[1], dtype=tf.int64) 120 | range_matrix = tf.tile(new_range[None, :], [tf.shape(pred)[0], 1]) 121 | 122 | mask = range_matrix <= tf.expand_dims(end_token_index, axis=1) 123 | masked_pred = tf.where(mask, pred, padding) 124 | 125 | # Convert the valid predicted labels tensor to a sparse tensor 126 | sparse_pred = tf.RaggedTensor.from_tensor(masked_pred, padding=padding).to_sparse() 127 | 128 | # Convert the valid true labels tensor to a sparse tensor 129 | sparse_true = tf.RaggedTensor.from_tensor(y_true, padding=padding).to_sparse() 130 | 131 | # Calculate the normalized edit distance between the sparse predicted labels tensor and sparse true labels tensor 132 | distance = tf.edit_distance(sparse_pred, sparse_true, normalize=True) 133 | 134 | return distance 135 | 136 | # @tf.function 137 | def update_state(self, y_true, y_pred, sample_weight=None): 138 | """Updates the state variables of the metric. 139 | 140 | Args: 141 | y_true: A tensor of true labels with shape (batch_size, sequence_length). 142 | y_pred: A tensor of predicted labels with shape (batch_size, sequence_length, num_classes). 143 | sample_weight: (Optional) a tensor of weights with shape (batch_size, sequence_length). 144 | """ 145 | pred = tf.argmax(y_pred, axis=2) 146 | 147 | # Calculate the normalized edit distance between the predicted labels and true labels tensors 148 | distance = self.get_cer(pred, y_true, self.padding_token) 149 | 150 | # Add the sum of the distance tensor to the cer_accumulator variable 151 | self.cer_accumulator.assign_add(tf.reduce_sum(distance)) 152 | 153 | # Increment the batch_counter by the batch size 154 | self.batch_counter.assign_add(len(y_true)) 155 | 156 | def result(self): 157 | """ Computes and returns the metric result. 158 | 159 | Returns: 160 | A TensorFlow float representing the CER (character error rate). 
161 | """ 162 | return tf.math.divide_no_nan(self.cer_accumulator, tf.cast(self.batch_counter, tf.float32)) -------------------------------------------------------------------------------- /mltu/torch/README.md: -------------------------------------------------------------------------------- 1 | # Functions and objects specific for PyTorch and Python 3 -------------------------------------------------------------------------------- /mltu/torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/torch/__init__.py -------------------------------------------------------------------------------- /mltu/torch/handlers.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from .metrics import Metric 4 | from .callbacks import Callback 5 | 6 | class MetricsHandler: 7 | """ Metrics handler class for training and testing loops""" 8 | def __init__(self, metrics: typing.List[Metric]): 9 | self.metrics = metrics 10 | 11 | # Validate metrics 12 | if not all(isinstance(m, Metric) for m in self.metrics): 13 | raise TypeError("all items in the metrics argument must be of type Metric (Check mltu.metrics.metrics.py for more information)") 14 | 15 | self.train_results_dict = {"loss": None} 16 | self.train_results_dict.update({metric.name: None for metric in self.metrics}) 17 | 18 | self.val_results_dict = {"val_loss": None} 19 | self.val_results_dict.update({"val_" + metric.name: None for metric in self.metrics}) 20 | 21 | def update(self, target, output, **kwargs): 22 | for metric in self.metrics: 23 | metric.update(output, target, **kwargs) 24 | 25 | def reset(self): 26 | for metric in self.metrics: 27 | metric.reset() 28 | 29 | def results(self, loss, train: bool=True): 30 | suffix = "val_" if not train else "" 31 | results_dict = self.val_results_dict if not train else self.train_results_dict 32 | results_dict[suffix + "loss"] = loss 33 | for metric in self.metrics: 34 | result = metric.result() 35 | if result: 36 | if isinstance(result, dict): 37 | for k, v in result.items(): 38 | results_dict[suffix + k] = v 39 | else: 40 | results_dict[suffix + metric.name] = result 41 | 42 | logs = {k: round(v, 4) for k, v in results_dict.items() if v is not None} 43 | return logs 44 | 45 | def description(self, epoch: int=None, train: bool=True): 46 | epoch_desc = f"Epoch {epoch} - " if epoch is not None else " " 47 | dict = self.train_results_dict if train else self.val_results_dict 48 | return epoch_desc + " - ".join([f"{k}: {v:.4f}" for k, v in dict.items() if v]) 49 | 50 | 51 | class CallbacksHandler: 52 | """ Callbacks handler class for training and testing loops""" 53 | def __init__(self, model, callbacks: typing.List[Callback]): 54 | self.callbacks = callbacks 55 | 56 | # Validate callbacks 57 | if not all(isinstance(c, Callback) for c in self.callbacks): 58 | raise TypeError("all items in the callbacks argument must be of type Callback (Check mltu.torch.callbacks.py for more information)") 59 | 60 | for callback in self.callbacks: 61 | callback.model = model 62 | 63 | def on_train_begin(self, logs=None): 64 | for callback in self.callbacks: 65 | callback.on_train_begin(logs) 66 | 67 | def on_train_end(self, logs=None): 68 | for callback in self.callbacks: 69 | callback.on_train_end(logs) 70 | 71 | def on_epoch_begin(self, epoch, logs=None): 72 | for callback in self.callbacks: 73 | callback.on_epoch_begin(epoch, 
logs) 74 | 75 | def on_epoch_end(self, epoch, logs=None): 76 | for callback in self.callbacks: 77 | callback.on_epoch_end(epoch, logs) 78 | 79 | def on_test_begin(self, logs=None): 80 | for callback in self.callbacks: 81 | callback.on_test_begin(logs) 82 | 83 | def on_test_end(self, logs=None): 84 | for callback in self.callbacks: 85 | callback.on_test_end(logs) 86 | 87 | def on_batch_begin(self, batch: int, logs=None, train: bool=True): 88 | for callback in self.callbacks: 89 | callback.on_batch_begin(batch, logs) 90 | 91 | if train: 92 | callback.on_train_batch_begin(batch, logs) 93 | else: 94 | callback.on_test_batch_begin(batch, logs) 95 | 96 | def on_batch_end(self, batch: int, logs=None, train: bool=True): 97 | for callback in self.callbacks: 98 | callback.on_batch_end(batch, logs) 99 | 100 | if train: 101 | callback.on_train_batch_end(batch, logs) 102 | else: 103 | callback.on_test_batch_end(batch, logs) -------------------------------------------------------------------------------- /mltu/torch/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class CTCLoss(nn.Module): 5 | """ CTC loss for PyTorch 6 | """ 7 | def __init__(self, blank: int, reduction: str="mean", zero_infinity: bool=False): 8 | """ CTC loss for PyTorch 9 | 10 | Args: 11 | blank: Index of the blank label 12 | """ 13 | super(CTCLoss, self).__init__() 14 | self.ctc_loss = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=zero_infinity) 15 | self.blank = blank 16 | 17 | def forward(self, output, target): 18 | """ 19 | Args: 20 | output: Tensor of shape (batch_size, num_classes, sequence_length) 21 | target: Tensor of shape (batch_size, sequence_length) 22 | 23 | Returns: 24 | loss: Scalar 25 | """ 26 | # Remove padding and blank tokens from target 27 | target_lengths = torch.sum(target != self.blank, dim=1) 28 | using_dtype = torch.int32 if max(target_lengths) <= 256 else torch.int64 29 | device = output.device 30 | 31 | target_unpadded = target[target != self.blank].view(-1).to(using_dtype) 32 | 33 | output = output.permute(1, 0, 2) # (sequence_length, batch_size, num_classes) 34 | output_lengths = torch.full(size=(output.size(1),), fill_value=output.size(0), dtype=using_dtype).to(device) 35 | 36 | loss = self.ctc_loss(output, target_unpadded, output_lengths, target_lengths.to(using_dtype)) 37 | 38 | return loss -------------------------------------------------------------------------------- /mltu/torch/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import typing 3 | import numpy as np 4 | from itertools import groupby 5 | 6 | from mltu.utils.text_utils import get_cer, get_wer 7 | 8 | 9 | class Metric: 10 | """ Base class for all metrics""" 11 | def __init__(self, name: str) -> None: 12 | """ Initialize metric with name 13 | 14 | Args: 15 | name (str): name of metric 16 | """ 17 | self.name = name 18 | 19 | def reset(self): 20 | """ Reset metric state to initial values and return metric value""" 21 | self.__init__() 22 | 23 | def update(self, output: torch.Tensor, target: torch.Tensor, **kwargs): 24 | """ Update metric state with new data 25 | 26 | Args: 27 | output (torch.Tensor): output of model 28 | target (torch.Tensor): target of data 29 | """ 30 | pass 31 | 32 | def result(self): 33 | """ Return metric value""" 34 | pass 35 | 36 | 37 | class Accuracy(Metric): 38 | """ Accuracy metric class 39 | 40 | Args: 41 | name (str, optional): name of metric. 
Defaults to 'accuracy'. 42 | """ 43 | def __init__(self, name="accuracy") -> None: 44 | super(Accuracy, self).__init__(name=name) 45 | self.correct = 0 46 | self.total = 0 47 | 48 | def update(self, output: torch.Tensor, target: torch.Tensor, **kwargs): 49 | """ Update metric state with new data 50 | 51 | Args: 52 | output (torch.Tensor): output of model 53 | target (torch.Tensor): target of data 54 | """ 55 | _, predicted = torch.max(output.data, 1) 56 | self.total += target.size(0) 57 | self.correct += (predicted == target).sum().item() 58 | 59 | def result(self): 60 | """ Return metric value""" 61 | return self.correct / self.total 62 | 63 | 64 | class CERMetric(Metric): 65 | """A custom PyTorch metric to compute the Character Error Rate (CER). 66 | 67 | Args: 68 | vocabulary: A string of the vocabulary used to encode the labels. 69 | name: (Optional) string name of the metric instance. 70 | 71 | # TODO: implement everything in Torch to avoid converting to numpy 72 | """ 73 | def __init__( 74 | self, 75 | vocabulary: typing.Union[str, list], 76 | name: str = "CER" 77 | ) -> None: 78 | super(CERMetric, self).__init__(name=name) 79 | self.vocabulary = vocabulary 80 | self.reset() 81 | 82 | def reset(self): 83 | """ Reset metric state to initial values""" 84 | self.cer = 0 85 | self.counter = 0 86 | 87 | def update(self, output: torch.Tensor, target: torch.Tensor, **kwargs) -> None: 88 | """ Update metric state with new data 89 | 90 | Args: 91 | output (torch.Tensor): output of model 92 | target (torch.Tensor): target of data 93 | """ 94 | # convert to numpy 95 | output = output.detach().cpu().numpy() 96 | target = target.detach().cpu().numpy() 97 | # use argmax to find the index of the highest probability 98 | argmax_preds = np.argmax(output, axis=-1) 99 | 100 | # use groupby to find continuous same indexes 101 | grouped_preds = [[k for k,_ in groupby(preds)] for preds in argmax_preds] 102 | 103 | # convert indexes to strings 104 | output_texts = ["".join([self.vocabulary[k] for k in group if k < len(self.vocabulary)]) for group in grouped_preds] 105 | target_texts = ["".join([self.vocabulary[k] for k in group if k < len(self.vocabulary)]) for group in target] 106 | 107 | cer = get_cer(output_texts, target_texts) 108 | 109 | self.cer += cer 110 | self.counter += 1 111 | 112 | def result(self) -> float: 113 | """ Return metric value""" 114 | return self.cer / self.counter 115 | 116 | 117 | class WERMetric(Metric): 118 | """A custom PyTorch metric to compute the Word Error Rate (WER). 119 | 120 | Args: 121 | vocabulary: A string of the vocabulary used to encode the labels. 122 | name: (Optional) string name of the metric instance. 
123 | 124 | # TODO: implement everything in Torch to avoid converting to numpy 125 | """ 126 | def __init__( 127 | self, 128 | vocabulary: typing.Union[str, list], 129 | name: str = "WER" 130 | ) -> None: 131 | super(WERMetric, self).__init__(name=name) 132 | self.vocabulary = vocabulary 133 | self.reset() 134 | 135 | def reset(self): 136 | """ Reset metric state to initial values""" 137 | self.wer = 0 138 | self.counter = 0 139 | 140 | def update(self, output: torch.Tensor, target: torch.Tensor, **kwargs) -> None: 141 | """ Update metric state with new data 142 | 143 | Args: 144 | output (torch.Tensor): output of model 145 | target (torch.Tensor): target of data 146 | """ 147 | # convert to numpy 148 | output = output.detach().cpu().numpy() 149 | target = target.detach().cpu().numpy() 150 | # use argmax to find the index of the highest probability 151 | argmax_preds = np.argmax(output, axis=-1) 152 | 153 | # use groupby to find continuous same indexes 154 | grouped_preds = [[k for k,_ in groupby(preds)] for preds in argmax_preds] 155 | 156 | # convert indexes to strings 157 | output_texts = ["".join([self.vocabulary[k] for k in group if k < len(self.vocabulary)]) for group in grouped_preds] 158 | target_texts = ["".join([self.vocabulary[k] for k in group if k < len(self.vocabulary)]) for group in target] 159 | 160 | wer = get_wer(output_texts, target_texts) 161 | 162 | self.wer += wer 163 | self.counter += 1 164 | 165 | def result(self) -> float: 166 | """ Return metric value""" 167 | return self.wer / self.counter -------------------------------------------------------------------------------- /mltu/torch/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.13.1 2 | tensorboard==2.10.1 3 | onnx==1.12.0 4 | torchsummaryX -------------------------------------------------------------------------------- /mltu/torch/yolo/README.md: -------------------------------------------------------------------------------- 1 | ## Update Readme -------------------------------------------------------------------------------- /mltu/torch/yolo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/torch/yolo/__init__.py -------------------------------------------------------------------------------- /mltu/torch/yolo/annotation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing 3 | from pathlib import Path 4 | import xml.etree.ElementTree as ET 5 | from mltu.annotations.detections import Detections, Detection, BboxType 6 | 7 | class VOCAnnotationReader: 8 | """Reads annotations from VOC format 9 | """ 10 | def __init__(self, labels: dict, images_path: str=None): 11 | self.labels = labels 12 | self.images_path = images_path 13 | self.dataset_found_labels = {} 14 | 15 | @staticmethod 16 | def readFromVOC(voc_annotation_path: str, labels: dict={}, images_path: str=None) -> Detections: 17 | annotation_path = Path(voc_annotation_path) 18 | tree = ET.parse(voc_annotation_path) 19 | root = tree.getroot() 20 | 21 | annotation_dict = {} 22 | 23 | # Iterate through child elements 24 | for child in root: 25 | if child.tag == 'object': 26 | obj_dict = {} 27 | for obj_child in child: 28 | if obj_child.tag == 'bndbox': 29 | bbox_dict = {} 30 | for bbox_child in obj_child: 31 | bbox_dict[bbox_child.tag] = int(bbox_child.text) 32 | obj_dict[obj_child.tag] = bbox_dict 
33 | else: 34 | obj_dict[obj_child.tag] = obj_child.text 35 | if 'objects' not in annotation_dict: 36 | annotation_dict['objects'] = [] 37 | annotation_dict['objects'].append(obj_dict) 38 | elif child.tag == 'size': 39 | size_dict = {} 40 | for size_child in child: 41 | size_dict[size_child.tag] = int(size_child.text) 42 | annotation_dict['size'] = size_dict 43 | else: 44 | annotation_dict[child.tag] = child.text 45 | 46 | # Get the image path if not provided 47 | if images_path is None: 48 | images_path = annotation_path.parent.parent / annotation_dict["folder"] 49 | 50 | image_path = os.path.join(images_path, annotation_dict['filename']) 51 | dets = [] 52 | for obj in annotation_dict['objects']: 53 | if labels and obj['name'] not in labels.values(): 54 | print(f"Label {obj['name']} not found in labels") 55 | continue 56 | 57 | dets.append(Detection( 58 | bbox=[obj['bndbox']['xmin'], obj['bndbox']['ymin'], obj['bndbox']['xmax'], obj['bndbox']['ymax']], 59 | label=obj['name'], 60 | bbox_type=BboxType.XYXY, 61 | confidence=1, 62 | image_path=image_path, 63 | width=annotation_dict['size']['width'], 64 | height=annotation_dict['size']['height'], 65 | relative=False 66 | )) 67 | 68 | detections = Detections( 69 | labels=labels, 70 | width=annotation_dict['size']['width'], 71 | height=annotation_dict['size']['height'], 72 | image_path=image_path, 73 | detections=dets 74 | ) 75 | 76 | return detections 77 | 78 | def __call__(self, image: typing.Any, annotation: str) -> typing.Tuple[typing.Any, Detections]: 79 | detections = self.readFromVOC(annotation, self.labels, self.images_path) 80 | if image is None: 81 | image = detections.image_path 82 | return image, detections -------------------------------------------------------------------------------- /mltu/torch/yolo/detectors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/torch/yolo/detectors/__init__.py -------------------------------------------------------------------------------- /mltu/torch/yolo/detectors/detector.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from mltu.inferenceModel import FpsWrapper 4 | 5 | class BaseDetector: 6 | """Base class for the detectors in the YOLO family""" 7 | @staticmethod 8 | def preprocess(image: np.ndarray, height: int, width: int): 9 | # Convert the image color space from BGR to RGB 10 | img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 11 | 12 | # Resize the image to match the input shape 13 | img = cv2.resize(img, (width, height)) 14 | 15 | # Normalize the image data by dividing it by 255.0 16 | image_data = np.array(img) / 255.0 17 | 18 | # Transpose the image to have the channel dimension as the first dimension 19 | image_data = np.transpose(image_data, (2, 0, 1)) # Channel first 20 | 21 | # Expand the dimensions of the image data to match the expected input shape 22 | image_data = np.expand_dims(image_data, axis=0).astype(np.float32) 23 | 24 | return image_data 25 | 26 | @staticmethod 27 | def postprocess(outputs: np.ndarray, x_factor: float, y_factor: float, confidence_threshold: float=0.5, iou_threshold: float=0.5): 28 | # Transpose and squeeze the output to match the expected shape 29 | outputs = np.transpose(np.squeeze(outputs)) 30 | 31 | # Extract all classes confidence scores 32 | conf_scores = np.amax(outputs[:, 4:], axis=1) 33 | 34 | # Get the data index of the detections with 
scores above the confidence threshold 35 | indexes = np.where(conf_scores >= confidence_threshold)[0] 36 | 37 | # Extract the confidence scores of the detections 38 | scores = conf_scores[indexes] 39 | 40 | # Extract the class IDs of the detections 41 | class_ids = np.argmax(outputs[indexes, 4:], axis=1) 42 | 43 | # Extract the bounding box coordinates from the outputs and transform them to the original image space 44 | boxes = outputs[indexes, :4] * np.array([x_factor, y_factor, x_factor, y_factor]) 45 | 46 | # Apply non-maximum suppression to filter out overlapping bounding boxes 47 | indices = cv2.dnn.NMSBoxes(boxes, scores, confidence_threshold, iou_threshold) 48 | 49 | # Iterate over the selected indices after non-maximum suppression 50 | return boxes[indices], scores[indices], class_ids[indices] 51 | 52 | def predict(self, image: np.ndarray, **kwargs) -> np.ndarray: 53 | ... 54 | 55 | @FpsWrapper 56 | def __call__(self, image: np.ndarray): 57 | results = self.predict(image) 58 | return results -------------------------------------------------------------------------------- /mltu/torch/yolo/detectors/onnx_detector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mltu.inferenceModel import OnnxInferenceModel 3 | from mltu.torch.yolo.detectors.detector import BaseDetector 4 | from mltu.annotations.detections import BboxType, Detection, Detections 5 | 6 | class Detector(OnnxInferenceModel, BaseDetector): 7 | """ YOLOv8 detector using onnxruntime""" 8 | def __init__( 9 | self, 10 | model_path: str, 11 | input_width: int, 12 | input_height: int, 13 | confidence_threshold: float=0.5, 14 | iou_threshold: float=0.5, 15 | classes: dict = None, 16 | return_raw_output: bool=False, 17 | *args, **kwargs 18 | ): 19 | """ 20 | Args: 21 | model_path (str): Path to the model file 22 | input_width (int): Input width to use for the model 23 | input_height (int): Input height to use for the model 24 | confidence_threshold (float, optional): Confidence threshold for filtering the predictions. Defaults to 0.5. 25 | iou_threshold (float, optional): Intersection over union threshold for filtering the predictions. Defaults to 0.5. 26 | classes (dict, optional): Dictionary of class names. Defaults to None. 27 | return_raw_output (bool, optional): Return raw output of the model (return bounding boxes, scores, and class ids). Defaults to False. 
28 | """ 29 | super().__init__(model_path, *args, **kwargs) 30 | self.input_width = input_width 31 | self.input_height = input_height 32 | self.confidence_threshold = confidence_threshold 33 | self.iou_threshold = iou_threshold 34 | self.return_raw_output = return_raw_output 35 | 36 | self.classes = classes or self.metadata.get("classes", None) 37 | if self.classes is None: 38 | raise ValueError("The classes must be provided") 39 | 40 | # Generate a color palette for the classes 41 | self.color_palette = np.random.uniform(0, 255, size=(len(self.classes), 3)) 42 | 43 | def predict(self, image: np.ndarray, **kwargs) -> Detections: 44 | img_height, img_width, _ = image.shape 45 | 46 | # Preprocess the image 47 | preprocessed_image = self.preprocess(image, self.input_height, self.input_width) 48 | 49 | # Perform inference on the preprocessed image 50 | preds = self.model.run(self.output_names, {self.input_names[0]: preprocessed_image}) 51 | 52 | # Extract the results from the predictions 53 | results = preds[0][0] 54 | 55 | # Calculate the scaling factors for the bounding box coordinates 56 | x_factor, y_factor = img_width / self.input_width, img_height / self.input_height 57 | 58 | # Perform postprocessing on the predictions 59 | boxes, scores, class_ids = self.postprocess(results, x_factor, y_factor, self.confidence_threshold, self.iou_threshold) 60 | 61 | if self.return_raw_output: 62 | return boxes, scores, class_ids 63 | 64 | detections = [] 65 | for bbox, conf, class_id in zip(boxes, scores, class_ids): 66 | detection = Detection( 67 | bbox = bbox, 68 | label = self.classes[class_id], 69 | labels = self.classes, 70 | bbox_type=BboxType.XYWH, 71 | confidence=conf, 72 | relative=False, 73 | width=img_width, 74 | height=img_height 75 | ) 76 | detections.append(detection) 77 | 78 | return Detections( 79 | labels=self.classes, 80 | width=img_width, 81 | height=img_height, 82 | detections=detections, 83 | color_palette=self.color_palette, 84 | ) -------------------------------------------------------------------------------- /mltu/torch/yolo/detectors/torch_detector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from mltu.torch.yolo.detectors.detector import BaseDetector 4 | from mltu.annotations.detections import BboxType, Detection, Detections 5 | 6 | class Detector(BaseDetector): 7 | def __init__( 8 | self, 9 | model, 10 | input_width: int, 11 | input_height: int, 12 | classes: dict, 13 | confidence_threshold: float=0.5, 14 | iou_threshold: float=0.5, 15 | device: str="cuda" 16 | ): 17 | super().__init__() 18 | self.model = model 19 | self.input_width = input_width 20 | self.input_height = input_height 21 | self.classes = classes 22 | self.confidence_threshold = confidence_threshold 23 | self.iou_threshold = iou_threshold 24 | self.device = torch.device(device if torch.cuda.is_available() else "cpu") 25 | self.model.to(self.device) 26 | self.model.eval() 27 | 28 | # Generate a color palette for the classes 29 | self.color_palette = np.random.uniform(0, 255, size=(len(self.classes), 3)) 30 | 31 | def predict(self, image: np.ndarray, **kwargs) -> Detections: 32 | img_height, img_width, _ = image.shape 33 | 34 | # Preprocess the image 35 | preprocessed_image = self.preprocess(image, self.input_height, self.input_width) 36 | 37 | # Perform inference on the preprocessed image 38 | preds = self.model(torch.tensor(preprocessed_image).to(self.device)) 39 | 40 | # Convert torch tensor to numpy array 41 | results = 
preds[0].cpu().detach().numpy() 42 | 43 | # Calculate the scaling factors for the bounding box coordinates 44 | x_factor, y_factor = img_width / self.input_width, img_height / self.input_height 45 | 46 | # Perform postprocessing on the predictions 47 | boxes, scores, class_ids = self.postprocess(results, x_factor, y_factor, self.confidence_threshold, self.iou_threshold) 48 | 49 | detections = [] 50 | for bbox, conf, class_id in zip(boxes, scores, class_ids): 51 | detection = Detection( 52 | bbox = bbox, 53 | label = self.classes[class_id], 54 | labels = self.classes, 55 | bbox_type=BboxType.XYWH, 56 | confidence=conf, 57 | relative=False, 58 | width=img_width, 59 | height=img_height 60 | ) 61 | detections.append(detection) 62 | 63 | return Detections( 64 | labels=self.classes, 65 | width=img_width, 66 | height=img_height, 67 | detections=detections, 68 | color_palette=self.color_palette, 69 | ) -------------------------------------------------------------------------------- /mltu/torch/yolo/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from ultralytics.utils.loss import BboxLoss, xywh2xyxy 5 | from ultralytics.utils.tal import TaskAlignedAssigner, dist2bbox, make_anchors 6 | 7 | class v8DetectionLoss: 8 | """Criterion class for computing training losses.""" 9 | 10 | def __init__(self, model, box: float=7.5, cls: float=0.5, dfl: float=1.5): # model must be de-paralleled 11 | """Initializes v8DetectionLoss with the model, defining model-related properties and BCE loss function.""" 12 | self.model = model 13 | device = next(model.parameters()).device # get model device 14 | 15 | self.head = model.model[-1] # Detect() module 16 | self.bce = nn.BCEWithLogitsLoss(reduction="none") 17 | self.stride = self.head.stride # model strides 18 | self.nc = self.head.nc # number of classes 19 | self.no = self.head.no 20 | self.reg_max = self.head.reg_max # max number of regression targets 21 | self.device = device 22 | 23 | self.use_dfl = self.head.reg_max > 1 24 | 25 | self.assigner = TaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0) 26 | self.bbox_loss = BboxLoss(self.head.reg_max - 1, use_dfl=self.use_dfl).to(device) 27 | self.proj = torch.arange(self.head.reg_max, dtype=torch.float, device=device).to(device) 28 | 29 | self.box = box # box gain 30 | self.cls = cls # cls gain 31 | self.dfl = dfl # dfl gain 32 | 33 | def preprocess(self, targets, batch_size, scale_tensor): 34 | """Preprocesses the target counts and matches with the input batch size to output a tensor.""" 35 | if targets.shape[0] == 0: 36 | out = torch.zeros(batch_size, 0, 5, device=self.device) 37 | else: 38 | i = targets[:, 0] # image index 39 | _, counts = i.unique(return_counts=True) 40 | counts = counts.to(dtype=torch.int32) 41 | out = torch.zeros(batch_size, counts.max(), 5, device=self.device) 42 | for j in range(batch_size): 43 | matches = i == j 44 | n = matches.sum() 45 | if n: 46 | out[j, :n] = targets[matches, 1:] 47 | out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor)) 48 | return out 49 | 50 | def bbox_decode(self, anchor_points, pred_dist): 51 | """Decode predicted object bounding box coordinates from anchor points and distribution.""" 52 | if self.use_dfl: 53 | b, a, c = pred_dist.shape # batch, anchors, channels 54 | self.proj = self.proj.to(pred_dist.device) 55 | pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype)) 56 | 57 | return dist2bbox(pred_dist, 
anchor_points, xywh=False) 58 | 59 | def __call__(self, preds, batch): 60 | """Calculate the sum of the loss for box, cls and dfl multiplied by batch size.""" 61 | loss = torch.zeros(3, device=self.device) # box, cls, dfl 62 | feats = preds[1] if isinstance(preds, tuple) else preds 63 | pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split( 64 | (self.reg_max * 4, self.nc), 1 65 | ) 66 | 67 | pred_scores = pred_scores.permute(0, 2, 1).contiguous() 68 | pred_distri = pred_distri.permute(0, 2, 1).contiguous() 69 | 70 | dtype = pred_scores.dtype 71 | batch_size = pred_scores.shape[0] 72 | imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w) 73 | anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5) 74 | 75 | # Targets 76 | targets = torch.cat((batch["batch_idx"].view(-1, 1), batch["cls"].view(-1, 1), batch["bboxes"]), 1) 77 | targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]]) 78 | gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy 79 | mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0) 80 | 81 | # Pboxes 82 | pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4) 83 | 84 | _, target_bboxes, target_scores, fg_mask, _ = self.assigner( 85 | pred_scores.detach().sigmoid(), 86 | (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype), 87 | anchor_points * stride_tensor, 88 | gt_labels, 89 | gt_bboxes, 90 | mask_gt, 91 | ) 92 | 93 | target_scores_sum = max(target_scores.sum(), 1) 94 | 95 | # Cls loss 96 | loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE 97 | 98 | # Bbox loss 99 | if fg_mask.sum(): 100 | target_bboxes /= stride_tensor 101 | loss[0], loss[2] = self.bbox_loss( 102 | pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask 103 | ) 104 | 105 | loss[0] *= self.box # box gain 106 | loss[1] *= self.cls # cls gain 107 | loss[2] *= self.dfl # dfl gain 108 | 109 | detailed_loss = {"box_loss": loss[0].detach(), "cls_loss": loss[1].detach(), "dfl_loss": loss[2].detach()} 110 | 111 | return loss.sum() * batch_size, detailed_loss # loss(box, cls, dfl) -------------------------------------------------------------------------------- /mltu/torch/yolo/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class AccumulativeOptimizer(torch.optim.Optimizer): 5 | def __init__(self, optimizer, batch_size, nbs=64): 6 | super(AccumulativeOptimizer, self).__init__(optimizer.param_groups, optimizer.defaults) 7 | self.optimizer = optimizer 8 | self.accumulation_steps = int(nbs / batch_size) 9 | self.current_step = 0 10 | 11 | def zero_grad(self): 12 | if self.current_step == 0: 13 | self.optimizer.zero_grad() 14 | 15 | def step(self): 16 | self.current_step += 1 17 | if self.current_step >= self.accumulation_steps: 18 | self.optimizer.step() 19 | self.current_step = 0 20 | self.optimizer.zero_grad() 21 | 22 | 23 | def build_optimizer(model, name: str="AdamW", lr: float=1e-3, weight_decay: float=0.0, momentum: float=0.937, decay=0.0005): 24 | 25 | pg0, pg1, pg2 = [], [], [] # optimizer parameter groups 26 | bn = tuple(v for k, v in nn.__dict__.items() if "Norm" in k) # normalization layers, i.e. 
BatchNorm2d() 27 | for module_name, module in model.named_modules(): 28 | for param_name, param in module.named_parameters(recurse=False): 29 | fullname = f"{module_name}.{param_name}" if module_name else param_name 30 | if "bias" in fullname: # bias (no decay) 31 | pg2.append(param) 32 | elif isinstance(module, bn): # weight (no decay) 33 | pg1.append(param) 34 | else: # weight (with decay) 35 | pg0.append(param) 36 | 37 | if name == "AdamW": 38 | optimizer = torch.optim.AdamW(pg2, lr=lr, weight_decay=weight_decay, betas=(momentum, 0.999)) 39 | elif name == "Adam": 40 | optimizer = torch.optim.Adam(pg2, lr=lr, weight_decay=weight_decay, betas=(momentum, 0.999)) 41 | elif name == "SGD": 42 | optimizer = torch.optim.SGD(pg2, lr=lr, weight_decay=weight_decay, momentum=momentum) 43 | else: 44 | raise ValueError(f"Optimizer {name} not supported!") 45 | 46 | optimizer.add_param_group({'params': pg0, 'weight_decay': decay}) # add pg0 (weights) with weight_decay 47 | optimizer.add_param_group({'params': pg1, 'weight_decay': 0.0}) # add pg1 (normalization weights) without weight_decay 48 | 49 | del pg0, pg1, pg2 50 | 51 | return optimizer -------------------------------------------------------------------------------- /mltu/torch/yolo/preprocessors.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import typing 3 | import numpy as np 4 | 5 | class YoloPreprocessor: 6 | def __init__(self, device: torch.device, imgsz: int=640): 7 | self.device = device 8 | self.imgsz = imgsz 9 | 10 | def __call__(self, images, annotations) -> typing.Tuple[np.ndarray, dict]: 11 | batch = { 12 | "ori_shape": [], 13 | "resized_shape": [], 14 | "cls": [], 15 | "bboxes": [], 16 | "batch_idx": [], 17 | } 18 | 19 | for i, (image, detections) in enumerate(zip(images, annotations)): 20 | batch["ori_shape"].append([detections.height, detections.width]) 21 | batch["resized_shape"].append([self.imgsz, self.imgsz]) 22 | for detection in detections: 23 | batch["cls"].append([detection.labelId]) 24 | batch["bboxes"].append(detection.xywh) 25 | batch["batch_idx"].append(i) 26 | 27 | batch["cls"] = torch.tensor(np.array(batch["cls"])).to(self.device) 28 | batch["bboxes"] = torch.tensor(np.array(batch["bboxes"])).to(self.device) 29 | batch["batch_idx"] = torch.tensor(np.array(batch["batch_idx"])).to(self.device) 30 | 31 | return np.array(images), batch -------------------------------------------------------------------------------- /mltu/torch/yolo/pruning_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from ultralytics.nn.modules import C2f, Conv, Bottleneck 4 | 5 | def infer_shortcut(bottleneck): 6 | c1 = bottleneck.cv1.conv.in_channels 7 | c2 = bottleneck.cv2.conv.out_channels 8 | return c1 == c2 and hasattr(bottleneck, 'add') and bottleneck.add 9 | 10 | class C2f_v2(nn.Module): 11 | # CSP Bottleneck with 2 convolutions 12 | def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 13 | super().__init__() 14 | self.c = int(c2 * e) # hidden channels 15 | self.cv0 = Conv(c1, self.c, 1, 1) 16 | self.cv1 = Conv(c1, self.c, 1, 1) 17 | self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) 18 | self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) 19 | 20 | def forward(self, x): 21 | # y = list(self.cv1(x).chunk(2, 1)) 22 | y = [self.cv0(x), self.cv1(x)] 23 | y.extend(m(y[-1]) for m in self.m) 24 | return
self.cv2(torch.cat(y, 1)) 25 | 26 | def transfer_weights(c2f, c2f_v2): 27 | c2f_v2.cv2 = c2f.cv2 28 | c2f_v2.m = c2f.m 29 | 30 | state_dict = c2f.state_dict() 31 | state_dict_v2 = c2f_v2.state_dict() 32 | 33 | # Transfer cv1 weights from C2f to cv0 and cv1 in C2f_v2 34 | old_weight = state_dict['cv1.conv.weight'] 35 | half_channels = old_weight.shape[0] // 2 36 | state_dict_v2['cv0.conv.weight'] = old_weight[:half_channels] 37 | state_dict_v2['cv1.conv.weight'] = old_weight[half_channels:] 38 | 39 | # Transfer cv1 batchnorm weights and buffers from C2f to cv0 and cv1 in C2f_v2 40 | for bn_key in ['weight', 'bias', 'running_mean', 'running_var']: 41 | old_bn = state_dict[f'cv1.bn.{bn_key}'] 42 | state_dict_v2[f'cv0.bn.{bn_key}'] = old_bn[:half_channels] 43 | state_dict_v2[f'cv1.bn.{bn_key}'] = old_bn[half_channels:] 44 | 45 | # Transfer remaining weights and buffers 46 | for key in state_dict: 47 | if not key.startswith('cv1.'): 48 | state_dict_v2[key] = state_dict[key] 49 | 50 | # Transfer all non-method attributes 51 | for attr_name in dir(c2f): 52 | attr_value = getattr(c2f, attr_name) 53 | if not callable(attr_value) and '_' not in attr_name: 54 | setattr(c2f_v2, attr_name, attr_value) 55 | 56 | c2f_v2.load_state_dict(state_dict_v2) 57 | 58 | def replace_c2f_with_c2f_v2(module): 59 | for name, child_module in module.named_children(): 60 | if isinstance(child_module, C2f): 61 | # Replace C2f with C2f_v2 while preserving its parameters 62 | shortcut = infer_shortcut(child_module.m[0]) 63 | c2f_v2 = C2f_v2(child_module.cv1.conv.in_channels, child_module.cv2.conv.out_channels, 64 | n=len(child_module.m), shortcut=shortcut, 65 | g=child_module.m[0].cv2.conv.groups, 66 | e=child_module.c / child_module.cv2.conv.out_channels) 67 | transfer_weights(child_module, c2f_v2) 68 | setattr(module, name, c2f_v2) 69 | else: 70 | replace_c2f_with_c2f_v2(child_module) -------------------------------------------------------------------------------- /mltu/torch/yolo/requirements.txt: -------------------------------------------------------------------------------- 1 | ultralytics==8.1.9 2 | torch==2.0.0 3 | torchvision==0.15.1 4 | torch_pruning==1.3.6 -------------------------------------------------------------------------------- /mltu/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/utils/__init__.py -------------------------------------------------------------------------------- /mltu/utils/text_utils.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import numpy as np 3 | from itertools import groupby 4 | 5 | 6 | def ctc_decoder(predictions: np.ndarray, chars: typing.Union[str, list]) -> typing.List[str]: 7 | """ CTC greedy decoder for predictions 8 | 9 | Args: 10 | predictions (np.ndarray): predictions from model 11 | chars (typing.Union[str, list]): list of characters 12 | 13 | Returns: 14 | typing.List[str]: list of words 15 | """ 16 | # use argmax to find the index of the highest probability 17 | argmax_preds = np.argmax(predictions, axis=-1) 18 | 19 | # use groupby to find continuous same indexes 20 | grouped_preds = [[k for k,_ in groupby(preds)] for preds in argmax_preds] 21 | 22 | # convert indexes to chars 23 | texts = ["".join([chars[k] for k in group if k < len(chars)]) for group in grouped_preds] 24 | 25 | return texts 26 | 27 | 28 | def edit_distance(prediction_tokens: typing.List[str], 
reference_tokens: typing.List[str]) -> int: 29 | """ Standard dynamic programming algorithm to compute the Levenshtein Edit Distance Algorithm 30 | 31 | Args: 32 | prediction_tokens: A tokenized predicted sentence 33 | reference_tokens: A tokenized reference sentence 34 | Returns: 35 | Edit distance between the predicted sentence and the reference sentence 36 | """ 37 | # Initialize a matrix to store the edit distances 38 | dp = [[0] * (len(reference_tokens) + 1) for _ in range(len(prediction_tokens) + 1)] 39 | 40 | # Fill the first row and column with the number of insertions needed 41 | for i in range(len(prediction_tokens) + 1): 42 | dp[i][0] = i 43 | 44 | for j in range(len(reference_tokens) + 1): 45 | dp[0][j] = j 46 | 47 | # Iterate through the prediction and reference tokens 48 | for i, p_tok in enumerate(prediction_tokens): 49 | for j, r_tok in enumerate(reference_tokens): 50 | # If the tokens are the same, the edit distance is the same as the previous entry 51 | if p_tok == r_tok: 52 | dp[i+1][j+1] = dp[i][j] 53 | # If the tokens are different, the edit distance is the minimum of the previous entries plus 1 54 | else: 55 | dp[i+1][j+1] = min(dp[i][j+1], dp[i+1][j], dp[i][j]) + 1 56 | 57 | # Return the final entry in the matrix as the edit distance 58 | return dp[-1][-1] 59 | 60 | def get_cer( 61 | preds: typing.Union[str, typing.List[str]], 62 | target: typing.Union[str, typing.List[str]], 63 | ) -> float: 64 | """ Update the cer score with the current set of references and predictions. 65 | 66 | Args: 67 | preds (typing.Union[str, typing.List[str]]): list of predicted sentences 68 | target (typing.Union[str, typing.List[str]]): list of target words 69 | 70 | Returns: 71 | Character error rate score 72 | """ 73 | if isinstance(preds, str): 74 | preds = [preds] 75 | if isinstance(target, str): 76 | target = [target] 77 | 78 | total, errors = 0, 0 79 | for pred_tokens, tgt_tokens in zip(preds, target): 80 | errors += edit_distance(list(pred_tokens), list(tgt_tokens)) 81 | total += len(tgt_tokens) 82 | 83 | if total == 0: 84 | return 0.0 85 | 86 | cer = errors / total 87 | 88 | return cer 89 | 90 | def get_wer( 91 | preds: typing.Union[str, typing.List[str]], 92 | target: typing.Union[str, typing.List[str]], 93 | ) -> float: 94 | """ Update the wer score with the current set of references and predictions. 
95 | 96 | Args: 97 | target (typing.Union[str, typing.List[str]]): string of target sentence or list of target words 98 | preds (typing.Union[str, typing.List[str]]): string of predicted sentence or list of predicted words 99 | 100 | Returns: 101 | Word error rate score 102 | """ 103 | if isinstance(preds, str) and isinstance(target, str): 104 | preds = [preds] 105 | target = [target] 106 | 107 | if isinstance(preds, list) and isinstance(target, list): 108 | errors, total_words = 0, 0 109 | for _pred, _target in zip(preds, target): 110 | if isinstance(_pred, str) and isinstance(_target, str): 111 | errors += edit_distance(_pred.split(), _target.split()) 112 | total_words += len(_target.split()) 113 | else: 114 | print("Error: preds and target must be either both strings or both lists of strings.") 115 | return np.inf 116 | 117 | else: 118 | print("Error: preds and target must be either both strings or both lists of strings.") 119 | return np.inf 120 | 121 | wer = errors / total_words 122 | 123 | return wer -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML>=6.0 2 | tqdm 3 | qqdm==0.0.7 4 | pandas 5 | numpy 6 | opencv-python 7 | Pillow>=9.4.0 8 | onnxruntime>=1.15.0 # onnxruntime-gpu for GPU support 9 | matplotlib -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | 5 | DIR = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | with open(os.path.join(DIR, "README.md")) as fh: 8 | long_description = fh.read() 9 | 10 | with open(os.path.join(DIR, "requirements.txt")) as fh: 11 | requirements = fh.read().splitlines() 12 | 13 | 14 | def get_version(initpath: str) -> str: 15 | """ Get from the init of the source code the version string 16 | 17 | Params: 18 | initpath (str): path to the init file of the python package relative to the setup file 19 | 20 | Returns: 21 | str: The version string in the form 0.0.1 22 | """ 23 | 24 | path = os.path.join(os.path.dirname(__file__), initpath) 25 | 26 | with open(path, "r") as handle: 27 | for line in handle.read().splitlines(): 28 | if line.startswith("__version__"): 29 | return line.split("=")[1].strip().strip("\"'") 30 | else: 31 | raise RuntimeError("Unable to find version string.") 32 | 33 | 34 | setup( 35 | name="mltu", 36 | version=get_version("mltu/__init__.py"), 37 | long_description=long_description, 38 | long_description_content_type="text/markdown", 39 | url="https://pylessons.com/", 40 | author="PyLessons", 41 | author_email="pythonlessons0@gmail.com", 42 | install_requires=requirements, 43 | extras_require={ 44 | "gpu": ["onnxruntime-gpu"], 45 | }, 46 | python_requires=">=3", 47 | packages=find_packages(exclude=("*_test.py",)), 48 | include_package_data=True, 49 | project_urls={ 50 | "Source": "https://github.com/pythonlessons/mltu/", 51 | "Tracker": "https://github.com/pythonlessons/mltu/issues", 52 | }, 53 | description="Machine Learning Training Utilities (MLTU) for TensorFlow and PyTorch", 54 | ) 55 | --------------------------------------------------------------------------------
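Illustrative usage sketch (not part of the repository files above): a minimal example of how the text utilities defined in mltu/utils/text_utils.py behave, assuming the package is installed as `mltu`. The toy vocabulary, probability array, and sample sentences below are made up purely for demonstration; the expected outputs follow from the greedy CTC collapse and edit-distance definitions shown above.

```python
import numpy as np
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer

# Toy vocabulary: index 0='a', 1='b', 2='c'; any index >= len(chars)
# (here 3) is treated as the CTC blank and dropped by ctc_decoder.
chars = "abc"

# One sample with 4 time steps; the per-step argmax is [0, 0, 3, 1],
# which collapses the repeated 'a' and removes the blank -> "ab".
predictions = np.array([[
    [0.9, 0.05, 0.02, 0.03],
    [0.8, 0.10, 0.05, 0.05],
    [0.1, 0.10, 0.10, 0.70],
    [0.1, 0.80, 0.05, 0.05],
]])
print(ctc_decoder(predictions, chars))          # ['ab']

# Character error rate: 1 edit ("hallo" -> "hello") over 5 reference characters.
print(get_cer("hallo", "hello"))                # 0.2

# Word error rate: 1 substituted word out of 3 reference words.
print(get_wer("the cat sat", "the cat sits"))   # 0.333...
```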