├── .gitignore ├── .vscode ├── launch.json └── settings.json ├── CHANGELOG.md ├── Datasets └── README.md ├── LICENSE ├── MANIFEST.in ├── Models └── README.md ├── README.md ├── Tests ├── README.md ├── test_tensorflow_metrics.py └── test_text_utils.py ├── Tutorials ├── 01_image_to_word │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ ├── requiremenets.txt │ └── train.py ├── 02_captcha_to_text │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ └── train.py ├── 03_handwriting_recognition │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ └── train.py ├── 04_sentence_recognition │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ └── train.py ├── 05_sound_to_text │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ ├── train.py │ └── train_no_limit.py ├── 06_pytorch_introduction │ ├── README.md │ ├── model.py │ ├── requirements.txt │ ├── test.py │ └── train.py ├── 07_pytorch_wrapper │ ├── README.md │ ├── model.py │ ├── requirements.txt │ ├── test.py │ └── train.py ├── 08_handwriting_recognition_torch │ ├── README.md │ ├── configs.py │ ├── inferenceModel.py │ ├── model.py │ ├── requirements.txt │ └── train_torch.py ├── 09_translation_transformer │ ├── README.md │ ├── configs.py │ ├── download.py │ ├── model.py │ ├── requirements.txt │ ├── test.py │ └── train.py ├── 10_wav2vec2_torch │ ├── configs.py │ ├── requirements.txt │ ├── test.py │ ├── train.py │ └── train_tf.py ├── 11_Yolov8 │ ├── README.md │ ├── convert2onnx.py │ ├── requirements.txt │ ├── run_pretrained.py │ ├── test_yolov8.py │ └── train_yolov8.py └── README.md ├── bin ├── read_parquet.py └── setup.sh ├── mltu ├── __init__.py ├── annotations │ ├── __init__.py │ ├── audio.py │ ├── detections.py │ └── images.py ├── augmentors.py ├── configs.py ├── dataProvider.py ├── inferenceModel.py ├── preprocessors.py ├── tensorflow │ ├── README.md │ ├── __init__.py │ ├── callbacks.py │ ├── dataProvider.py │ ├── layers.py │ ├── losses.py │ ├── metrics.py │ ├── model_utils.py │ ├── models │ │ └── u2net.py │ ├── requirements.txt │ └── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── callbacks.py │ │ ├── layers.py │ │ └── utils.py ├── tokenizers.py ├── torch │ ├── README.md │ ├── __init__.py │ ├── callbacks.py │ ├── dataProvider.py │ ├── handlers.py │ ├── losses.py │ ├── metrics.py │ ├── model.py │ ├── requirements.txt │ └── yolo │ │ ├── README.md │ │ ├── __init__.py │ │ ├── annotation.py │ │ ├── detectors │ │ ├── __init__.py │ │ ├── detector.py │ │ ├── onnx_detector.py │ │ └── torch_detector.py │ │ ├── loss.py │ │ ├── metrics.py │ │ ├── optimizer.py │ │ ├── preprocessors.py │ │ ├── pruning_utils.py │ │ └── requirements.txt ├── transformers.py └── utils │ ├── __init__.py │ └── text_utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.egg-info 3 | *.pyc 4 | venv 5 | 6 | Datasets/* 7 | Models/* 8 | dist 9 | 10 | !*.md 11 | 12 | .idea 13 | .python-version 14 | 15 | test 16 | build 17 | yolov8* 18 | pyrightconfig.json -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "justMyCode": false, 14 | "subProcess": true, 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.analysis.typeCheckingMode": "off", 3 | "python.testing.unittestArgs": [ 4 | "-v", 5 | "-s", 6 | "./Tests", 7 | "-p", 8 | "*test*.py" 9 | ], 10 | "python.testing.pytestEnabled": false, 11 | "python.testing.unittestEnabled": true 12 | } -------------------------------------------------------------------------------- /Datasets/README.md: -------------------------------------------------------------------------------- 1 | # Empty repository to hold the datasets when running Tutorials -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Rokas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /Models/README.md: -------------------------------------------------------------------------------- 1 | # Empty repository to hold the Models when running Tutorials -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLTU - Machine Learning Training Utilities 2 | Machine Learning Training Utilities for TensorFlow 2.* and PyTorch with Python 3 3 |

4 | 5 |

6 | 7 | # Installation: 8 | To use MLTU in your own project, you can install it from PyPI: 9 | ```bash 10 | pip install mltu 11 | ``` 12 | When running the tutorials, install the mltu version that the specific tutorial was written for, for example: 13 | ```bash 14 | pip install mltu==0.1.3 15 | ``` 16 | Each tutorial has its own requirements.txt file pinning the mltu version it was written against. As this project progresses, newer versions may introduce breaking changes, so it's recommended to use the same version as in the tutorial. 17 | 18 | # Tutorials and Examples can be found on [PyLessons.com](https://pylessons.com/mltu) 19 | 1. [Text Recognition With TensorFlow and CTC network](https://pylessons.com/ctc-text-recognition), code in ```Tutorials\01_image_to_word``` folder; 20 | 2. [TensorFlow OCR model for reading Captchas](https://pylessons.com/tensorflow-ocr-captcha), code in ```Tutorials\02_captcha_to_text``` folder; 21 | 3. [Handwriting words recognition with TensorFlow](https://pylessons.com/handwriting-recognition), code in ```Tutorials\03_handwriting_recognition``` folder; 22 | 4. [Handwritten sentence recognition with TensorFlow](https://pylessons.com/handwritten-sentence-recognition), code in ```Tutorials\04_sentence_recognition``` folder; 23 | 5. [Introduction to speech recognition with TensorFlow](https://pylessons.com/speech-recognition), code in ```Tutorials\05_sound_to_text``` folder; 24 | 6. [Introduction to PyTorch in a practical way](https://pylessons.com/pytorch-introduction), code in ```Tutorials\06_pytorch_introduction``` folder; 25 | 7. [Using custom wrapper to simplify PyTorch models training pipeline](https://pylessons.com/pytorch-introduction), code in ```Tutorials\07_pytorch_wrapper``` folder; 26 | 8. [Handwriting words recognition with PyTorch](https://pylessons.com/handwriting-recognition-pytorch), code in ```Tutorials\08_handwriting_recognition_torch``` folder; 27 | 9. [Transformer training with TensorFlow for Translation task](https://pylessons.com/transformers-training), code in ```Tutorials\09_translation_transformer``` folder; 28 | 10. [Speech Recognition in Python | finetune wav2vec2 model for a custom ASR model](https://youtu.be/h6ooEGzjkj0), code in ```Tutorials\10_wav2vec2_torch``` folder; 29 | 11. [YOLOv8: Real-Time Object Detection Simplified](https://youtu.be/vegL__weCxY), code in ```Tutorials\11_Yolov8``` folder; 30 | 12. 
[YOLOv8: Customizing Object Detector training](https://youtu.be/ysYiV1CbCyY), code in ```Tutorials\11_Yolov8\train_yolov8.py``` folder; -------------------------------------------------------------------------------- /Tests/README.md: -------------------------------------------------------------------------------- 1 | # Repository for unit tests -------------------------------------------------------------------------------- /Tests/test_tensorflow_metrics.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from mltu.tensorflow.metrics import CERMetric, WERMetric 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | class TestMetrics(unittest.TestCase): 9 | 10 | def to_embeddings(self, sentences, vocab): 11 | embeddings, max_len = [], 0 12 | 13 | for sentence in sentences: 14 | embedding = [] 15 | for character in sentence: 16 | embedding.append(vocab.index(character)) 17 | embeddings.append(embedding) 18 | max_len = max(max_len, len(embedding)) 19 | return embeddings, max_len 20 | 21 | def setUp(self) -> None: 22 | true_words = ["Who are you", "I am a student", "I am a teacher", "Just different sentence length"] 23 | pred_words = ["Who are you", "I am a ztudent", "I am A reacher", "Just different length"] 24 | 25 | vocab = set() 26 | for sen in true_words + pred_words: 27 | for character in sen: 28 | vocab.add(character) 29 | self.vocab = "".join(vocab) 30 | 31 | sentence_true, max_len_true = self.to_embeddings(true_words, self.vocab) 32 | sentence_pred, max_len_pred = self.to_embeddings(pred_words, self.vocab) 33 | 34 | max_len = max(max_len_true, max_len_pred) 35 | padding_length = 64 36 | 37 | self.sen_true = [np.pad(sen, (0, max_len - len(sen)), "constant", constant_values=len(self.vocab)) for sen in sentence_true] 38 | self.sen_pred = [np.pad(sen, (0, padding_length - len(sen)), "constant", constant_values=-1) for sen in sentence_pred] 39 | 40 | def test_CERMetric(self): 41 | vocabulary = tf.constant(list(self.vocab)) 42 | cer = CERMetric.get_cer(self.sen_true, self.sen_pred, vocabulary).numpy() 43 | 44 | self.assertTrue(np.array_equal(cer, np.array([0.0, 0.071428575, 0.14285715, 0.42857143], dtype=np.float32))) 45 | 46 | def test_WERMetric(self): 47 | vocabulary = tf.constant(list(self.vocab)) 48 | wer = WERMetric.get_wer(self.sen_true, self.sen_pred, vocabulary).numpy() 49 | 50 | self.assertTrue(np.array_equal(wer, np.array([0., 0.25, 0.5, 0.33333334], dtype=np.float32))) 51 | 52 | if __name__ == "__main__": 53 | unittest.main() -------------------------------------------------------------------------------- /Tests/test_text_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from mltu.utils.text_utils import edit_distance, get_cer, get_wer 4 | 5 | class TestTextUtils(unittest.TestCase): 6 | 7 | def test_edit_distance(self): 8 | """ This unit test includes several test cases to cover different scenarios, including no errors, 9 | substitution errors, insertion errors, deletion errors, and a more complex case with multiple 10 | errors. It also includes a test case for empty input. 
11 | """ 12 | # Test simple case with no errors 13 | prediction_tokens = ["A", "B", "C"] 14 | reference_tokens = ["A", "B", "C"] 15 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 0) 16 | 17 | # Test simple case with one substitution error 18 | prediction_tokens = ["A", "B", "D"] 19 | reference_tokens = ["A", "B", "C"] 20 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 1) 21 | 22 | # Test simple case with one insertion error 23 | prediction_tokens = ["A", "B", "C"] 24 | reference_tokens = ["A", "B", "C", "D"] 25 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 1) 26 | 27 | # Test simple case with one deletion error 28 | prediction_tokens = ["A", "B"] 29 | reference_tokens = ["A", "B", "C"] 30 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 1) 31 | 32 | # Test more complex case with multiple errors 33 | prediction_tokens = ["A", "B", "C", "D", "E"] 34 | reference_tokens = ["A", "C", "B", "F", "E"] 35 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 3) 36 | 37 | # Test empty input 38 | prediction_tokens = [] 39 | reference_tokens = [] 40 | self.assertEqual(edit_distance(prediction_tokens, reference_tokens), 0) 41 | 42 | def test_get_cer(self): 43 | # Test simple case with no errors 44 | preds = ["A B C"] 45 | target = ["A B C"] 46 | self.assertEqual(get_cer(preds, target), 0) 47 | 48 | # Test simple case with one character error 49 | preds = ["A B C"] 50 | target = ["A B D"] 51 | self.assertEqual(get_cer(preds, target), 1/5) 52 | 53 | # Test simple case with multiple character errors 54 | preds = ["A B C"] 55 | target = ["D E F"] 56 | self.assertEqual(get_cer(preds, target), 3/5) 57 | 58 | # Test empty input 59 | preds = [] 60 | target = [] 61 | self.assertEqual(get_cer(preds, target), 0) 62 | 63 | # Test simple case with different word lengths 64 | preds = ["ABC"] 65 | target = ["ABCDEFG"] 66 | self.assertEqual(get_cer(preds, target), 4/7) 67 | 68 | def test_get_wer(self): 69 | # Test simple case with no errors 70 | preds = "A B C" 71 | target = "A B C" 72 | self.assertEqual(get_wer(preds, target), 0) 73 | 74 | # Test simple case with one word error 75 | preds = "A B C" 76 | target = "A B D" 77 | self.assertEqual(get_wer(preds, target), 1/3) 78 | 79 | # Test simple case with multiple word errors 80 | preds = "A B C" 81 | target = "D E F" 82 | self.assertEqual(get_wer(preds, target), 1) 83 | 84 | # Test empty input 85 | preds = "" 86 | target = "" 87 | self.assertEqual(get_wer(preds, target), 0) 88 | 89 | # Test simple case with different sentence lengths 90 | preds = ["ABC"] 91 | target = ["ABC DEF"] 92 | self.assertEqual(get_wer(preds, target), 1) 93 | 94 | 95 | if __name__ == "__main__": 96 | unittest.main() 97 | -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join("Models/1_image_to_word", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 11 | self.vocab = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 12 | self.height = 32 13 | self.width = 128 14 | self.max_text_length = 23 15 | self.batch_size = 1024 16 | self.learning_rate = 1e-4 17 | self.train_epochs = 100 18 | self.train_workers = 
20 -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer 7 | 8 | class ImageToWordModel(OnnxInferenceModel): 9 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | self.char_list = char_list 12 | 13 | def predict(self, image: np.ndarray): 14 | image = cv2.resize(image, self.input_shapes[0][1:3][::-1]) 15 | 16 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 17 | 18 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 19 | 20 | text = ctc_decoder(preds, self.char_list)[0] 21 | 22 | return text 23 | 24 | 25 | if __name__ == "__main__": 26 | import pandas as pd 27 | from tqdm import tqdm 28 | from mltu.configs import BaseModelConfigs 29 | 30 | configs = BaseModelConfigs.load("Models/1_image_to_word/202211270035/configs.yaml") 31 | 32 | model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab) 33 | 34 | df = pd.read_csv("Models/1_image_to_word/202211270035/val.csv").dropna().values.tolist() 35 | 36 | accum_cer = [] 37 | for image_path, label in tqdm(df[:20]): 38 | image = cv2.imread(image_path.replace("\\", "/")) 39 | 40 | try: 41 | prediction_text = model.predict(image) 42 | 43 | cer = get_cer(prediction_text, label) 44 | print(f"Image: {image_path}, Label: {label}, Prediction: {prediction_text}, CER: {cer}") 45 | 46 | # resize image by 3 times for visualization 47 | # image = cv2.resize(image, (image.shape[1] * 3, image.shape[0] * 3)) 48 | # cv2.imshow(prediction_text, image) 49 | # cv2.waitKey(0) 50 | # cv2.destroyAllWindows() 51 | except: 52 | continue 53 | 54 | accum_cer.append(cer) 55 | 56 | print(f"Average CER: {np.average(accum_cer)}") -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/model.py: -------------------------------------------------------------------------------- 1 | from keras import layers 2 | from keras.models import Model 3 | 4 | from mltu.tensorflow.model_utils import residual_block 5 | 6 | 7 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 8 | 9 | inputs = layers.Input(shape=input_dim, name="input") 10 | 11 | input = layers.Lambda(lambda x: x / 255)(inputs) 12 | 13 | x1 = residual_block(input, 16, activation=activation, skip_conv=True, strides=1, dropout=dropout) 14 | 15 | x2 = residual_block(x1, 16, activation=activation, skip_conv=True, strides=2, dropout=dropout) 16 | x3 = residual_block(x2, 16, activation=activation, skip_conv=False, strides=1, dropout=dropout) 17 | 18 | x4 = residual_block(x3, 32, activation=activation, skip_conv=True, strides=2, dropout=dropout) 19 | x5 = residual_block(x4, 32, activation=activation, skip_conv=False, strides=1, dropout=dropout) 20 | 21 | x6 = residual_block(x5, 64, activation=activation, skip_conv=True, strides=1, dropout=dropout) 22 | x7 = residual_block(x6, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 23 | 24 | squeezed = layers.Reshape((x7.shape[-3] * x7.shape[-2], x7.shape[-1]))(x7) 25 | 26 | blstm = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(squeezed) 27 | 28 | output = layers.Dense(output_dim + 1, activation="softmax", 
name="output")(blstm) 29 | 30 | model = Model(inputs=inputs, outputs=output) 31 | return model -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/requiremenets.txt: -------------------------------------------------------------------------------- 1 | mltu==0.1.3 -------------------------------------------------------------------------------- /Tutorials/01_image_to_word/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | import tensorflow as tf 4 | 5 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 6 | except: pass 7 | 8 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 9 | 10 | from mltu.preprocessors import ImageReader 11 | from mltu.annotations.images import CVImage 12 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding 13 | from mltu.tensorflow.dataProvider import DataProvider 14 | from mltu.tensorflow.losses import CTCloss 15 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 16 | from mltu.tensorflow.metrics import CWERMetric 17 | 18 | 19 | from model import train_model 20 | from configs import ModelConfigs 21 | 22 | configs = ModelConfigs() 23 | 24 | data_path = "Datasets/90kDICT32px" 25 | val_annotation_path = data_path + "/annotation_val.txt" 26 | train_annotation_path = data_path + "/annotation_train.txt" 27 | 28 | # Read metadata file and parse it 29 | def read_annotation_file(annotation_path): 30 | dataset, vocab, max_len = [], set(), 0 31 | with open(annotation_path, "r") as f: 32 | for line in tqdm(f.readlines()): 33 | line = line.split() 34 | image_path = data_path + line[0][1:] 35 | label = line[0].split("_")[1] 36 | dataset.append([image_path, label]) 37 | vocab.update(list(label)) 38 | max_len = max(max_len, len(label)) 39 | return dataset, sorted(vocab), max_len 40 | 41 | train_dataset, train_vocab, max_train_len = read_annotation_file(train_annotation_path) 42 | val_dataset, val_vocab, max_val_len = read_annotation_file(val_annotation_path) 43 | 44 | # Save vocab and maximum text length to configs 45 | configs.vocab = "".join(train_vocab) 46 | configs.max_text_length = max(max_train_len, max_val_len) 47 | configs.save() 48 | 49 | # Create training data provider 50 | train_data_provider = DataProvider( 51 | dataset=train_dataset, 52 | skip_validation=True, 53 | batch_size=configs.batch_size, 54 | data_preprocessors=[ImageReader(CVImage)], 55 | transformers=[ 56 | ImageResizer(configs.width, configs.height), 57 | LabelIndexer(configs.vocab), 58 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)) 59 | ], 60 | ) 61 | 62 | # Create validation data provider 63 | val_data_provider = DataProvider( 64 | dataset=val_dataset, 65 | skip_validation=True, 66 | batch_size=configs.batch_size, 67 | data_preprocessors=[ImageReader(CVImage)], 68 | transformers=[ 69 | ImageResizer(configs.width, configs.height), 70 | LabelIndexer(configs.vocab), 71 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)) 72 | ], 73 | ) 74 | 75 | model = train_model( 76 | input_dim = (configs.height, configs.width, 3), 77 | output_dim = len(configs.vocab), 78 | ) 79 | # Compile the model and print summary 80 | model.compile( 81 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 82 | loss=CTCloss(), 83 | metrics=[CWERMetric()], 
84 | run_eagerly=False 85 | ) 86 | model.summary(line_length=110) 87 | 88 | # Define path to save the model 89 | os.makedirs(configs.model_path, exist_ok=True) 90 | 91 | # Define callbacks 92 | earlystopper = EarlyStopping(monitor="val_CER", patience=10, verbose=1) 93 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 94 | trainLogger = TrainLogger(configs.model_path) 95 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 96 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="auto") 97 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 98 | 99 | # Train the model 100 | model.fit( 101 | train_data_provider, 102 | validation_data=val_data_provider, 103 | epochs=configs.train_epochs, 104 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 105 | workers=configs.train_workers 106 | ) 107 | 108 | # Save training and validation datasets as csv files 109 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 110 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/02_captcha_to_text/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join("Models/02_captcha_to_text", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 11 | self.vocab = "" 12 | self.height = 50 13 | self.width = 200 14 | self.max_text_length = 0 15 | self.batch_size = 64 16 | self.learning_rate = 1e-3 17 | self.train_epochs = 1000 18 | self.train_workers = 20 -------------------------------------------------------------------------------- /Tutorials/02_captcha_to_text/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer 7 | 8 | class ImageToWordModel(OnnxInferenceModel): 9 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | self.char_list = char_list 12 | 13 | def predict(self, image: np.ndarray): 14 | image = cv2.resize(image, self.input_shapes[0][1:3][::-1]) 15 | 16 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 17 | 18 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 19 | 20 | text = ctc_decoder(preds, self.char_list)[0] 21 | 22 | return text 23 | 24 | if __name__ == "__main__": 25 | import pandas as pd 26 | from tqdm import tqdm 27 | from mltu.configs import BaseModelConfigs 28 | 29 | configs = BaseModelConfigs.load("Models/02_captcha_to_text/202212211205/configs.yaml") 30 | 31 | model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab) 32 | 33 | df = pd.read_csv("Models/02_captcha_to_text/202212211205/val.csv").values.tolist() 34 | 35 | accum_cer = [] 36 | for image_path, label in tqdm(df): 37 | image = cv2.imread(image_path.replace("\\", "/")) 38 | 39 | prediction_text = model.predict(image) 40 | 41 | cer = get_cer(prediction_text, label) 42 | print(f"Image: {image_path}, 
Label: {label}, Prediction: {prediction_text}, CER: {cer}") 43 | 44 | accum_cer.append(cer) 45 | 46 | print(f"Average CER: {np.average(accum_cer)}") -------------------------------------------------------------------------------- /Tutorials/02_captcha_to_text/model.py: -------------------------------------------------------------------------------- 1 | from keras import layers 2 | from keras.models import Model 3 | 4 | from mltu.tensorflow.model_utils import residual_block 5 | 6 | 7 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 8 | 9 | inputs = layers.Input(shape=input_dim, name="input") 10 | 11 | # normalize images here instead in preprocessing step 12 | input = layers.Lambda(lambda x: x / 255)(inputs) 13 | 14 | x1 = residual_block(input, 16, activation=activation, skip_conv=True, strides=1, dropout=dropout) 15 | 16 | x2 = residual_block(x1, 16, activation=activation, skip_conv=True, strides=2, dropout=dropout) 17 | x3 = residual_block(x2, 16, activation=activation, skip_conv=False, strides=1, dropout=dropout) 18 | 19 | x4 = residual_block(x3, 32, activation=activation, skip_conv=True, strides=2, dropout=dropout) 20 | x5 = residual_block(x4, 32, activation=activation, skip_conv=False, strides=1, dropout=dropout) 21 | 22 | x6 = residual_block(x5, 64, activation=activation, skip_conv=True, strides=2, dropout=dropout) 23 | x7 = residual_block(x6, 32, activation=activation, skip_conv=True, strides=1, dropout=dropout) 24 | 25 | x8 = residual_block(x7, 64, activation=activation, skip_conv=True, strides=2, dropout=dropout) 26 | x9 = residual_block(x8, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 27 | 28 | squeezed = layers.Reshape((x9.shape[-3] * x9.shape[-2], x9.shape[-1]))(x9) 29 | 30 | blstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(squeezed) 31 | blstm = layers.Dropout(dropout)(blstm) 32 | 33 | output = layers.Dense(output_dim + 1, activation="softmax", name="output")(blstm) 34 | 35 | model = Model(inputs=inputs, outputs=output) 36 | return model 37 | -------------------------------------------------------------------------------- /Tutorials/02_captcha_to_text/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | 5 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 6 | 7 | from mltu.tensorflow.dataProvider import DataProvider 8 | from mltu.tensorflow.losses import CTCloss 9 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 10 | from mltu.tensorflow.metrics import CWERMetric 11 | 12 | from mltu.preprocessors import ImageReader 13 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding 14 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate 15 | from mltu.annotations.images import CVImage 16 | 17 | from model import train_model 18 | from configs import ModelConfigs 19 | 20 | import os 21 | from urllib.request import urlopen 22 | from io import BytesIO 23 | from zipfile import ZipFile 24 | 25 | 26 | def download_and_unzip(url, extract_to="Datasets"): 27 | http_response = urlopen(url) 28 | zipfile = ZipFile(BytesIO(http_response.read())) 29 | zipfile.extractall(path=extract_to) 30 | 31 | 32 | if not os.path.exists(os.path.join("Datasets", "captcha_images_v2")): 33 | 
download_and_unzip("https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip", 34 | extract_to="Datasets") 35 | 36 | # Create a list of all the images and labels in the dataset 37 | dataset, vocab, max_len = [], set(), 0 38 | captcha_path = os.path.join("Datasets", "captcha_images_v2") 39 | for file in os.listdir(captcha_path): 40 | file_path = os.path.join(captcha_path, file) 41 | label = os.path.splitext(file)[0] # Get the file name without the extension 42 | dataset.append([file_path, label]) 43 | vocab.update(list(label)) 44 | max_len = max(max_len, len(label)) 45 | 46 | configs = ModelConfigs() 47 | 48 | # Save vocab and maximum text length to configs 49 | configs.vocab = "".join(vocab) 50 | configs.max_text_length = max_len 51 | configs.save() 52 | 53 | # Create a data provider for the dataset 54 | data_provider = DataProvider( 55 | dataset=dataset, 56 | skip_validation=True, 57 | batch_size=configs.batch_size, 58 | data_preprocessors=[ImageReader(CVImage)], 59 | transformers=[ 60 | ImageResizer(configs.width, configs.height), 61 | LabelIndexer(configs.vocab), 62 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)) 63 | ], 64 | ) 65 | # Split the dataset into training and validation sets 66 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 67 | 68 | # Augment training data with random brightness, rotation and erode/dilate 69 | train_data_provider.augmentors = [RandomBrightness(), RandomRotate(), RandomErodeDilate()] 70 | 71 | # Creating TensorFlow model architecture 72 | model = train_model( 73 | input_dim = (configs.height, configs.width, 3), 74 | output_dim = len(configs.vocab), 75 | ) 76 | 77 | # Compile the model and print summary 78 | model.compile( 79 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 80 | loss=CTCloss(), 81 | metrics=[CWERMetric(padding_token=len(configs.vocab))], 82 | run_eagerly=False 83 | ) 84 | model.summary(line_length=110) 85 | # Define path to save the model 86 | os.makedirs(configs.model_path, exist_ok=True) 87 | 88 | # Define callbacks 89 | earlystopper = EarlyStopping(monitor="val_CER", patience=50, verbose=1, mode="min") 90 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 91 | trainLogger = TrainLogger(configs.model_path) 92 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 93 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=20, verbose=1, mode="min") 94 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 95 | 96 | # Train the model 97 | model.fit( 98 | train_data_provider, 99 | validation_data=val_data_provider, 100 | epochs=configs.train_epochs, 101 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 102 | workers=configs.train_workers 103 | ) 104 | 105 | # Save training and validation datasets as csv files 106 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 107 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/03_handwriting_recognition/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | class ModelConfigs(BaseModelConfigs): 7 | def __init__(self): 8 | 
super().__init__() 9 | self.model_path = os.path.join("Models/03_handwriting_recognition", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 10 | self.vocab = "" 11 | self.height = 32 12 | self.width = 128 13 | self.max_text_length = 0 14 | self.batch_size = 16 15 | self.learning_rate = 0.0005 16 | self.train_epochs = 1000 17 | self.train_workers = 20 -------------------------------------------------------------------------------- /Tutorials/03_handwriting_recognition/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer 7 | 8 | class ImageToWordModel(OnnxInferenceModel): 9 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | self.char_list = char_list 12 | 13 | def predict(self, image: np.ndarray): 14 | image = cv2.resize(image, self.input_shapes[0][1:3][::-1]) 15 | 16 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 17 | 18 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 19 | 20 | text = ctc_decoder(preds, self.char_list)[0] 21 | 22 | return text 23 | 24 | if __name__ == "__main__": 25 | import pandas as pd 26 | from tqdm import tqdm 27 | from mltu.configs import BaseModelConfigs 28 | 29 | configs = BaseModelConfigs.load("Models/03_handwriting_recognition/202301111911/configs.yaml") 30 | 31 | model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab) 32 | 33 | df = pd.read_csv("Models/03_handwriting_recognition/202301111911/val.csv").values.tolist() 34 | 35 | accum_cer = [] 36 | for image_path, label in tqdm(df): 37 | image = cv2.imread(image_path.replace("\\", "/")) 38 | 39 | prediction_text = model.predict(image) 40 | 41 | cer = get_cer(prediction_text, label) 42 | print(f"Image: {image_path}, Label: {label}, Prediction: {prediction_text}, CER: {cer}") 43 | 44 | accum_cer.append(cer) 45 | 46 | # resize by 4x 47 | image = cv2.resize(image, (image.shape[1] * 4, image.shape[0] * 4)) 48 | cv2.imshow("Image", image) 49 | cv2.waitKey(0) 50 | cv2.destroyAllWindows() 51 | 52 | print(f"Average CER: {np.average(accum_cer)}") -------------------------------------------------------------------------------- /Tutorials/03_handwriting_recognition/model.py: -------------------------------------------------------------------------------- 1 | from keras import layers 2 | from keras.models import Model 3 | 4 | from mltu.tensorflow.model_utils import residual_block 5 | 6 | 7 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 8 | 9 | inputs = layers.Input(shape=input_dim, name="input") 10 | 11 | # normalize images here instead in preprocessing step 12 | input = layers.Lambda(lambda x: x / 255)(inputs) 13 | 14 | x1 = residual_block(input, 16, activation=activation, skip_conv=True, strides=1, dropout=dropout) 15 | 16 | x2 = residual_block(x1, 16, activation=activation, skip_conv=True, strides=2, dropout=dropout) 17 | x3 = residual_block(x2, 16, activation=activation, skip_conv=False, strides=1, dropout=dropout) 18 | 19 | x4 = residual_block(x3, 32, activation=activation, skip_conv=True, strides=2, dropout=dropout) 20 | x5 = residual_block(x4, 32, activation=activation, skip_conv=False, strides=1, dropout=dropout) 21 | 22 | x6 = residual_block(x5, 64, activation=activation, skip_conv=True, strides=2, dropout=dropout) 23 | x7 = 
residual_block(x6, 64, activation=activation, skip_conv=True, strides=1, dropout=dropout) 24 | 25 | x8 = residual_block(x7, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 26 | x9 = residual_block(x8, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 27 | 28 | squeezed = layers.Reshape((x9.shape[-3] * x9.shape[-2], x9.shape[-1]))(x9) 29 | 30 | blstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(squeezed) 31 | blstm = layers.Dropout(dropout)(blstm) 32 | 33 | output = layers.Dense(output_dim + 1, activation="softmax", name="output")(blstm) 34 | 35 | model = Model(inputs=inputs, outputs=output) 36 | return model 37 | -------------------------------------------------------------------------------- /Tutorials/03_handwriting_recognition/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | 5 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 6 | 7 | from mltu.preprocessors import ImageReader 8 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2 9 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen 10 | from mltu.annotations.images import CVImage 11 | 12 | from mltu.tensorflow.dataProvider import DataProvider 13 | from mltu.tensorflow.losses import CTCloss 14 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 15 | from mltu.tensorflow.metrics import CWERMetric 16 | 17 | from model import train_model 18 | from configs import ModelConfigs 19 | 20 | import os 21 | import tarfile 22 | from tqdm import tqdm 23 | from urllib.request import urlopen 24 | from io import BytesIO 25 | from zipfile import ZipFile 26 | 27 | 28 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 29 | http_response = urlopen(url) 30 | 31 | data = b"" 32 | iterations = http_response.length // chunk_size + 1 33 | for _ in tqdm(range(iterations)): 34 | data += http_response.read(chunk_size) 35 | 36 | zipfile = ZipFile(BytesIO(data)) 37 | zipfile.extractall(path=extract_to) 38 | 39 | dataset_path = os.path.join("Datasets", "IAM_Words") 40 | if not os.path.exists(dataset_path): 41 | download_and_unzip("https://git.io/J0fjL", extract_to="Datasets") 42 | 43 | file = tarfile.open(os.path.join(dataset_path, "words.tgz")) 44 | file.extractall(os.path.join(dataset_path, "words")) 45 | 46 | dataset, vocab, max_len = [], set(), 0 47 | 48 | # Preprocess the dataset by the specific IAM_Words dataset file structure 49 | words = open(os.path.join(dataset_path, "words.txt"), "r").readlines() 50 | for line in tqdm(words): 51 | if line.startswith("#"): 52 | continue 53 | 54 | line_split = line.split(" ") 55 | if line_split[1] == "err": 56 | continue 57 | 58 | folder1 = line_split[0][:3] 59 | folder2 = "-".join(line_split[0].split("-")[:2]) 60 | file_name = line_split[0] + ".png" 61 | label = line_split[-1].rstrip("\n") 62 | 63 | rel_path = os.path.join(dataset_path, "words", folder1, folder2, file_name) 64 | if not os.path.exists(rel_path): 65 | print(f"File not found: {rel_path}") 66 | continue 67 | 68 | dataset.append([rel_path, label]) 69 | vocab.update(list(label)) 70 | max_len = max(max_len, len(label)) 71 | 72 | # Create a ModelConfigs object to store model configurations 73 | configs = ModelConfigs() 74 | 75 | # 
Save vocab and maximum text length to configs 76 | configs.vocab = "".join(vocab) 77 | configs.max_text_length = max_len 78 | configs.save() 79 | 80 | # Create a data provider for the dataset 81 | data_provider = DataProvider( 82 | dataset=dataset, 83 | skip_validation=True, 84 | batch_size=configs.batch_size, 85 | data_preprocessors=[ImageReader(CVImage)], 86 | transformers=[ 87 | ImageResizer(configs.width, configs.height, keep_aspect_ratio=False), 88 | LabelIndexer(configs.vocab), 89 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)), 90 | ], 91 | ) 92 | 93 | # Split the dataset into training and validation sets 94 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 95 | 96 | # Augment training data with random brightness, rotation and erode/dilate 97 | train_data_provider.augmentors = [ 98 | RandomBrightness(), 99 | RandomErodeDilate(), 100 | RandomSharpen(), 101 | RandomRotate(angle=10), 102 | ] 103 | 104 | # Creating TensorFlow model architecture 105 | model = train_model( 106 | input_dim = (configs.height, configs.width, 3), 107 | output_dim = len(configs.vocab), 108 | ) 109 | 110 | # Compile the model and print summary 111 | model.compile( 112 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 113 | loss=CTCloss(), 114 | metrics=[CWERMetric(padding_token=len(configs.vocab))], 115 | ) 116 | model.summary(line_length=110) 117 | 118 | # Define callbacks 119 | earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1) 120 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 121 | trainLogger = TrainLogger(configs.model_path) 122 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 123 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=10, verbose=1, mode="auto") 124 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 125 | 126 | # Train the model 127 | model.fit( 128 | train_data_provider, 129 | validation_data=val_data_provider, 130 | epochs=configs.train_epochs, 131 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 132 | workers=configs.train_workers 133 | ) 134 | 135 | # Save training and validation datasets as csv files 136 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 137 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/README.md: -------------------------------------------------------------------------------- 1 | # Handwritten sentence recognition with TensorFlow 2 | ## Unlock the power of handwritten sentence recognition with TensorFlow and CTC loss. From digitizing notes to transcribing historical documents and automating exam grading 3 | 4 | 5 | ## **Detailed tutorial**: 6 | ## [Handwritten sentence recognition with TensorFlow](https://pylessons.com/handwritten-sentence-recognition) 7 | 8 |
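The model in this tutorial is trained with CTC loss (the `train.py` below compiles the model with `CTCloss` from `mltu.tensorflow.losses`). As a minimal illustrative sketch only — not mltu's exact implementation — a Keras-compatible CTC loss built on `tf.keras.backend.ctc_batch_cost` can look like this, assuming labels are padded to a fixed length:

```python
import tensorflow as tf

class CTCLoss(tf.keras.losses.Loss):
    """CTC loss for a (batch, time, vocab+1) softmax output and padded integer labels."""

    def __init__(self, name: str = "CTCLoss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        # Sequence lengths: full output time dimension and full (padded) label length
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # Broadcast the (identical) lengths to one entry per batch element
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
```

A loss of this shape is passed to `model.compile(loss=..., ...)` together with the CER/WER metrics, the same way the training script below passes mltu's `CTCloss()`.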

9 | 10 |

-------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | class ModelConfigs(BaseModelConfigs): 7 | def __init__(self): 8 | super().__init__() 9 | self.model_path = os.path.join("Models/04_sentence_recognition", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 10 | self.vocab = "" 11 | self.height = 96 12 | self.width = 1408 13 | self.max_text_length = 0 14 | self.batch_size = 32 15 | self.learning_rate = 0.0005 16 | self.train_epochs = 1000 17 | self.train_workers = 20 -------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer 7 | from mltu.transformers import ImageResizer 8 | 9 | class ImageToWordModel(OnnxInferenceModel): 10 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | self.char_list = char_list 13 | 14 | def predict(self, image: np.ndarray): 15 | image = ImageResizer.resize_maintaining_aspect_ratio(image, *self.input_shapes[0][1:3][::-1]) 16 | 17 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 18 | 19 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 20 | 21 | text = ctc_decoder(preds, self.char_list)[0] 22 | 23 | return text 24 | 25 | if __name__ == "__main__": 26 | import pandas as pd 27 | from tqdm import tqdm 28 | from mltu.configs import BaseModelConfigs 29 | 30 | configs = BaseModelConfigs.load("Models/04_sentence_recognition/202301131202/configs.yaml") 31 | 32 | model = ImageToWordModel(model_path=configs.model_path, char_list=configs.vocab) 33 | 34 | df = pd.read_csv("Models/04_sentence_recognition/202301131202/val.csv").values.tolist() 35 | 36 | accum_cer, accum_wer = [], [] 37 | for image_path, label in tqdm(df): 38 | image = cv2.imread(image_path.replace("\\", "/")) 39 | 40 | prediction_text = model.predict(image) 41 | 42 | cer = get_cer(prediction_text, label) 43 | wer = get_wer(prediction_text, label) 44 | print("Image: ", image_path) 45 | print("Label:", label) 46 | print("Prediction: ", prediction_text) 47 | print(f"CER: {cer}; WER: {wer}") 48 | 49 | accum_cer.append(cer) 50 | accum_wer.append(wer) 51 | 52 | cv2.imshow(prediction_text, image) 53 | cv2.waitKey(0) 54 | cv2.destroyAllWindows() 55 | 56 | print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}") -------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/model.py: -------------------------------------------------------------------------------- 1 | from keras import layers 2 | from keras.models import Model 3 | 4 | from mltu.tensorflow.model_utils import residual_block 5 | 6 | 7 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 8 | 9 | inputs = layers.Input(shape=input_dim, name="input") 10 | 11 | # normalize images here instead in preprocessing step 12 | input = layers.Lambda(lambda x: x / 255)(inputs) 13 | 14 | x1 = residual_block(input, 32, activation=activation, skip_conv=True, strides=1, 
dropout=dropout) 15 | 16 | x2 = residual_block(x1, 32, activation=activation, skip_conv=True, strides=2, dropout=dropout) 17 | x3 = residual_block(x2, 32, activation=activation, skip_conv=False, strides=1, dropout=dropout) 18 | 19 | x4 = residual_block(x3, 64, activation=activation, skip_conv=True, strides=2, dropout=dropout) 20 | x5 = residual_block(x4, 64, activation=activation, skip_conv=False, strides=1, dropout=dropout) 21 | 22 | x6 = residual_block(x5, 128, activation=activation, skip_conv=True, strides=2, dropout=dropout) 23 | x7 = residual_block(x6, 128, activation=activation, skip_conv=True, strides=1, dropout=dropout) 24 | 25 | x8 = residual_block(x7, 128, activation=activation, skip_conv=True, strides=2, dropout=dropout) 26 | x9 = residual_block(x8, 128, activation=activation, skip_conv=False, strides=1, dropout=dropout) 27 | 28 | squeezed = layers.Reshape((x9.shape[-3] * x9.shape[-2], x9.shape[-1]))(x9) 29 | 30 | blstm = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(squeezed) 31 | blstm = layers.Dropout(dropout)(blstm) 32 | 33 | blstm = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(blstm) 34 | blstm = layers.Dropout(dropout)(blstm) 35 | 36 | output = layers.Dense(output_dim + 1, activation="softmax", name="output")(blstm) 37 | 38 | model = Model(inputs=inputs, outputs=output) 39 | return model 40 | -------------------------------------------------------------------------------- /Tutorials/04_sentence_recognition/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | 5 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 6 | 7 | from mltu.preprocessors import ImageReader 8 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2 9 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen 10 | from mltu.annotations.images import CVImage 11 | 12 | from mltu.tensorflow.dataProvider import DataProvider 13 | from mltu.tensorflow.losses import CTCloss 14 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 15 | from mltu.tensorflow.metrics import CERMetric, WERMetric 16 | 17 | from model import train_model 18 | from configs import ModelConfigs 19 | 20 | import os 21 | from tqdm import tqdm 22 | 23 | # Must download and extract datasets manually from https://fki.tic.heia-fr.ch/databases/download-the-iam-handwriting-database to Datasets\IAM_Sentences 24 | sentences_txt_path = os.path.join("Datasets", "IAM_Sentences", "ascii", "sentences.txt") 25 | sentences_folder_path = os.path.join("Datasets", "IAM_Sentences", "sentences") 26 | 27 | dataset, vocab, max_len = [], set(), 0 28 | words = open(sentences_txt_path, "r").readlines() 29 | for line in tqdm(words): 30 | if line.startswith("#"): 31 | continue 32 | 33 | line_split = line.split(" ") 34 | if line_split[2] == "err": 35 | continue 36 | 37 | folder1 = line_split[0][:3] 38 | folder2 = "-".join(line_split[0].split("-")[:2]) 39 | file_name = line_split[0] + ".png" 40 | label = line_split[-1].rstrip("\n") 41 | 42 | # replace "|" with " " in label 43 | label = label.replace("|", " ") 44 | 45 | rel_path = os.path.join(sentences_folder_path, folder1, folder2, file_name) 46 | if not os.path.exists(rel_path): 47 | print(f"File not found: {rel_path}") 48 | continue 49 | 50 | 
dataset.append([rel_path, label]) 51 | vocab.update(list(label)) 52 | max_len = max(max_len, len(label)) 53 | 54 | # Create a ModelConfigs object to store model configurations 55 | configs = ModelConfigs() 56 | 57 | # Save vocab and maximum text length to configs 58 | configs.vocab = "".join(vocab) 59 | configs.max_text_length = max_len 60 | configs.save() 61 | 62 | # Create a data provider for the dataset 63 | data_provider = DataProvider( 64 | dataset=dataset, 65 | skip_validation=True, 66 | batch_size=configs.batch_size, 67 | data_preprocessors=[ImageReader(CVImage)], 68 | transformers=[ 69 | ImageResizer(configs.width, configs.height, keep_aspect_ratio=True), 70 | LabelIndexer(configs.vocab), 71 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)), 72 | ], 73 | ) 74 | 75 | # Split the dataset into training and validation sets 76 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 77 | 78 | # Augment training data with random brightness, rotation and erode/dilate 79 | train_data_provider.augmentors = [ 80 | RandomBrightness(), 81 | RandomErodeDilate(), 82 | RandomSharpen(), 83 | ] 84 | 85 | # Creating TensorFlow model architecture 86 | model = train_model( 87 | input_dim = (configs.height, configs.width, 3), 88 | output_dim = len(configs.vocab), 89 | ) 90 | 91 | # Compile the model and print summary 92 | model.compile( 93 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 94 | loss=CTCloss(), 95 | metrics=[ 96 | CERMetric(vocabulary=configs.vocab), 97 | WERMetric(vocabulary=configs.vocab) 98 | ], 99 | run_eagerly=False 100 | ) 101 | model.summary(line_length=110) 102 | 103 | # Define callbacks 104 | earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min") 105 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 106 | trainLogger = TrainLogger(configs.model_path) 107 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 108 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=5, verbose=1, mode="auto") 109 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 110 | 111 | # Train the model 112 | model.fit( 113 | train_data_provider, 114 | validation_data=val_data_provider, 115 | epochs=configs.train_epochs, 116 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 117 | workers=configs.train_workers 118 | ) 119 | 120 | # Save training and validation datasets as csv files 121 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 122 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to speech recognition with TensorFlow 2 | ## Master the basics of speech recognition with TensorFlow: Learn how to build and train models, implement real-time audio recognition, and develop practical applications 3 | 4 | 5 | ## **Detailed tutorial**: 6 | ## [Introduction to speech recognition with TensorFlow](https://pylessons.com/speech-recognition) 7 | 8 |
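This tutorial feeds the network spectrograms rather than raw waveforms: `WavReader.get_spectrogram` (used in `train.py` and `inferenceModel.py` below) is parameterized by the `frame_length`, `frame_step` and `fft_length` values from `configs.py`. As a rough sketch of that preprocessing step — an illustrative `tf.signal` version written for this README, not mltu's actual implementation — the transformation looks roughly like this:

```python
import tensorflow as tf

def wav_to_spectrogram(wav_path: str, frame_length: int = 256, frame_step: int = 160, fft_length: int = 384):
    """Illustrative sketch: mono 16-bit PCM WAV -> normalized magnitude spectrogram (frames, fft_length // 2 + 1)."""
    # Load the WAV file as float32 samples in [-1, 1] and drop the channel axis (mono assumed)
    audio, _sample_rate = tf.audio.decode_wav(tf.io.read_file(wav_path))
    audio = tf.squeeze(audio, axis=-1)

    # Short-time Fourier transform -> magnitude spectrogram
    stft = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
    spectrogram = tf.math.pow(tf.abs(stft), 0.5)  # square-root compression of the dynamic range

    # Normalize each frame to zero mean and unit variance, as is common for CTC speech models
    means = tf.math.reduce_mean(spectrogram, axis=1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, axis=1, keepdims=True)
    return ((spectrogram - means) / (stddevs + 1e-10)).numpy()
```

The longest spectrogram found while scanning the dataset becomes `configs.max_spectrogram_length`, which the inference script below uses to zero-pad each spectrogram before running the exported ONNX model.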

9 | 10 |

-------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join("Models/05_sound_to_text", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 11 | self.frame_length = 256 12 | self.frame_step = 160 13 | self.fft_length = 384 14 | 15 | self.vocab = "abcdefghijklmnopqrstuvwxyz'?! " 16 | self.input_shape = None 17 | self.max_text_length = None 18 | self.max_spectrogram_length = None 19 | 20 | self.batch_size = 8 21 | self.learning_rate = 0.0005 22 | self.train_epochs = 1000 23 | self.train_workers = 20 -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import numpy as np 3 | 4 | from mltu.inferenceModel import OnnxInferenceModel 5 | from mltu.preprocessors import WavReader 6 | from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer 7 | 8 | class WavToTextModel(OnnxInferenceModel): 9 | def __init__(self, char_list: typing.Union[str, list], *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | self.char_list = char_list 12 | 13 | def predict(self, data: np.ndarray): 14 | data_pred = np.expand_dims(data, axis=0) 15 | 16 | preds = self.model.run(self.output_names, {self.input_names[0]: data_pred})[0] 17 | 18 | text = ctc_decoder(preds, self.char_list)[0] 19 | 20 | return text 21 | 22 | if __name__ == "__main__": 23 | import pandas as pd 24 | from tqdm import tqdm 25 | from mltu.configs import BaseModelConfigs 26 | 27 | configs = BaseModelConfigs.load("Models/05_sound_to_text/202302051936/configs.yaml") 28 | 29 | model = WavToTextModel(model_path=configs.model_path, char_list=configs.vocab, force_cpu=False) 30 | 31 | df = pd.read_csv("Models/05_sound_to_text/202302051936/val.csv").values.tolist() 32 | 33 | accum_cer, accum_wer = [], [] 34 | for wav_path, label in tqdm(df): 35 | wav_path = wav_path.replace("\\", "/") 36 | spectrogram = WavReader.get_spectrogram(wav_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length) 37 | WavReader.plot_raw_audio(wav_path, label) 38 | 39 | padded_spectrogram = np.pad(spectrogram, ((0, configs.max_spectrogram_length - spectrogram.shape[0]),(0,0)), mode="constant", constant_values=0) 40 | 41 | WavReader.plot_spectrogram(spectrogram, label) 42 | 43 | text = model.predict(padded_spectrogram) 44 | 45 | true_label = "".join([l for l in label.lower() if l in configs.vocab]) 46 | 47 | cer = get_cer(text, true_label) 48 | wer = get_wer(text, true_label) 49 | 50 | accum_cer.append(cer) 51 | accum_wer.append(wer) 52 | 53 | print(f"Average CER: {np.average(accum_cer)}, Average WER: {np.average(accum_wer)}") -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from keras import layers 3 | from keras.models import Model 4 | 5 | from mltu.tensorflow.model_utils import residual_block, activation_layer 6 | 7 | 8 | def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2): 9 | 10 | 
inputs = layers.Input(shape=input_dim, name="input", dtype=tf.float32) 11 | 12 | # expand dims to add channel dimension 13 | input = layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))(inputs) 14 | 15 | # Convolution layer 1 16 | x = layers.Conv2D(filters=32, kernel_size=[11, 41], strides=[2, 2], padding="same", use_bias=False)(input) 17 | x = layers.BatchNormalization()(x) 18 | x = activation_layer(x, activation="leaky_relu") 19 | 20 | # Convolution layer 2 21 | x = layers.Conv2D(filters=32, kernel_size=[11, 21], strides=[1, 2], padding="same", use_bias=False)(x) 22 | x = layers.BatchNormalization()(x) 23 | x = activation_layer(x, activation="leaky_relu") 24 | 25 | # Reshape the resulted volume to feed the RNNs layers 26 | x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x) 27 | 28 | # RNN layers 29 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 30 | x = layers.Dropout(dropout)(x) 31 | 32 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 33 | x = layers.Dropout(dropout)(x) 34 | 35 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 36 | x = layers.Dropout(dropout)(x) 37 | 38 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 39 | x = layers.Dropout(dropout)(x) 40 | 41 | x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x) 42 | 43 | # Dense layer 44 | x = layers.Dense(256)(x) 45 | x = activation_layer(x, activation="leaky_relu") 46 | x = layers.Dropout(dropout)(x) 47 | 48 | # Classification layer 49 | output = layers.Dense(output_dim + 1, activation="softmax", dtype=tf.float32)(x) 50 | 51 | model = Model(inputs=inputs, outputs=output) 52 | return model -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | 5 | import os 6 | import tarfile 7 | import pandas as pd 8 | from tqdm import tqdm 9 | from urllib.request import urlopen 10 | from io import BytesIO 11 | 12 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 13 | from mltu.preprocessors import WavReader 14 | 15 | from mltu.tensorflow.dataProvider import DataProvider 16 | from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding 17 | from mltu.tensorflow.losses import CTCloss 18 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 19 | from mltu.tensorflow.metrics import CERMetric, WERMetric 20 | 21 | from model import train_model 22 | from configs import ModelConfigs 23 | 24 | 25 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 26 | http_response = urlopen(url) 27 | 28 | data = b"" 29 | iterations = http_response.length // chunk_size + 1 30 | for _ in tqdm(range(iterations)): 31 | data += http_response.read(chunk_size) 32 | 33 | tarFile = tarfile.open(fileobj=BytesIO(data), mode="r|bz2") 34 | tarFile.extractall(path=extract_to) 35 | tarFile.close() 36 | 37 | 38 | dataset_path = os.path.join("Datasets", "LJSpeech-1.1") 39 | if not os.path.exists(dataset_path): 40 | download_and_unzip("https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", extract_to="Datasets") 41 | 42 | dataset_path = "Datasets/LJSpeech-1.1" 43 | metadata_path = dataset_path + "/metadata.csv" 44 | wavs_path = dataset_path + "/wavs/" 45 | 46 | # Read 
metadata file and parse it 47 | metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3) 48 | metadata_df.columns = ["file_name", "transcription", "normalized_transcription"] 49 | metadata_df = metadata_df[["file_name", "normalized_transcription"]] 50 | 51 | # structure the dataset where each row is a list of [wav_file_path, sound transcription] 52 | dataset = [[f"Datasets/LJSpeech-1.1/wavs/{file}.wav", label.lower()] for file, label in metadata_df.values.tolist()] 53 | 54 | # Create a ModelConfigs object to store model configurations 55 | configs = ModelConfigs() 56 | 57 | max_text_length, max_spectrogram_length = 0, 0 58 | for file_path, label in tqdm(dataset): 59 | spectrogram = WavReader.get_spectrogram(file_path, frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length) 60 | valid_label = [c for c in label if c in configs.vocab] 61 | max_text_length = max(max_text_length, len(valid_label)) 62 | max_spectrogram_length = max(max_spectrogram_length, spectrogram.shape[0]) 63 | configs.input_shape = [max_spectrogram_length, spectrogram.shape[1]] 64 | 65 | configs.max_spectrogram_length = max_spectrogram_length 66 | configs.max_text_length = max_text_length 67 | configs.save() 68 | 69 | # Create a data provider for the dataset 70 | data_provider = DataProvider( 71 | dataset=dataset, 72 | skip_validation=True, 73 | batch_size=configs.batch_size, 74 | data_preprocessors=[ 75 | WavReader(frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length), 76 | ], 77 | transformers=[ 78 | SpectrogramPadding(max_spectrogram_length=configs.max_spectrogram_length, padding_value=0), 79 | LabelIndexer(configs.vocab), 80 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)), 81 | ], 82 | ) 83 | 84 | # Split the dataset into training and validation sets 85 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 86 | 87 | # Creating TensorFlow model architecture 88 | model = train_model( 89 | input_dim = configs.input_shape, 90 | output_dim = len(configs.vocab), 91 | dropout=0.5 92 | ) 93 | 94 | # Compile the model and print summary 95 | model.compile( 96 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 97 | loss=CTCloss(), 98 | metrics=[ 99 | CERMetric(vocabulary=configs.vocab), 100 | WERMetric(vocabulary=configs.vocab) 101 | ], 102 | run_eagerly=False 103 | ) 104 | model.summary(line_length=110) 105 | 106 | # Define callbacks 107 | earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min") 108 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 109 | trainLogger = TrainLogger(configs.model_path) 110 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 111 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.8, min_delta=1e-10, patience=5, verbose=1, mode="auto") 112 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 113 | 114 | # Train the model 115 | model.fit( 116 | train_data_provider, 117 | validation_data=val_data_provider, 118 | epochs=configs.train_epochs, 119 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 120 | workers=configs.train_workers 121 | ) 122 | 123 | # Save training and validation datasets as csv files 124 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 125 | 
val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) 126 | -------------------------------------------------------------------------------- /Tutorials/05_sound_to_text/train_no_limit.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 3 | except: pass 4 | tf.keras.mixed_precision.set_global_policy('mixed_float16') # mixed precission training for faster training time 5 | 6 | import os 7 | import tarfile 8 | import pandas as pd 9 | from tqdm import tqdm 10 | from urllib.request import urlopen 11 | from io import BytesIO 12 | 13 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 14 | from mltu.preprocessors import WavReader 15 | 16 | from mltu.tensorflow.dataProvider import DataProvider 17 | from mltu.transformers import LabelIndexer, LabelPadding, SpectrogramPadding 18 | from mltu.tensorflow.losses import CTCloss 19 | from mltu.tensorflow.callbacks import Model2onnx, TrainLogger 20 | from mltu.tensorflow.metrics import CERMetric, WERMetric 21 | 22 | from model import train_model 23 | from configs import ModelConfigs 24 | 25 | 26 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 27 | http_response = urlopen(url) 28 | 29 | data = b"" 30 | iterations = http_response.length // chunk_size + 1 31 | for _ in tqdm(range(iterations)): 32 | data += http_response.read(chunk_size) 33 | 34 | tarFile = tarfile.open(fileobj=BytesIO(data), mode="r|bz2") 35 | tarFile.extractall(path=extract_to) 36 | tarFile.close() 37 | 38 | 39 | dataset_path = os.path.join("Datasets", "LJSpeech-1.1") 40 | if not os.path.exists(dataset_path): 41 | download_and_unzip("https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", extract_to="Datasets") 42 | 43 | dataset_path = "Datasets/LJSpeech-1.1" 44 | metadata_path = dataset_path + "/metadata.csv" 45 | wavs_path = dataset_path + "/wavs/" 46 | 47 | # Read metadata file and parse it 48 | metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3) 49 | metadata_df.columns = ["file_name", "transcription", "normalized_transcription"] 50 | metadata_df = metadata_df[["file_name", "normalized_transcription"]] 51 | 52 | # structure the dataset where each row is a list of [wav_file_path, sound transcription] 53 | dataset = [[f"Datasets/LJSpeech-1.1/wavs/{file}.wav", label.lower()] for file, label in metadata_df.values.tolist()] 54 | 55 | # Create a ModelConfigs object to store model configurations 56 | configs = ModelConfigs() 57 | configs.save() 58 | 59 | # Create a data provider for the dataset 60 | data_provider = DataProvider( 61 | dataset=dataset, 62 | skip_validation=True, 63 | batch_size=configs.batch_size, 64 | data_preprocessors=[ 65 | WavReader(frame_length=configs.frame_length, frame_step=configs.frame_step, fft_length=configs.fft_length), 66 | ], 67 | transformers=[ 68 | LabelIndexer(configs.vocab), 69 | ], 70 | batch_postprocessors=[ 71 | SpectrogramPadding(padding_value=0, use_on_batch=True), 72 | LabelPadding(padding_value=len(configs.vocab), use_on_batch=True), 73 | ], 74 | ) 75 | 76 | # Split the dataset into training and validation sets 77 | train_data_provider, val_data_provider = data_provider.split(split = 0.9) 78 | 79 | # Creating TensorFlow model architecture 80 | model = train_model( 81 | input_dim = (None, 193), 82 | output_dim = len(configs.vocab), 83 | dropout=0.5 84 | ) 85 | 
86 | # Compile the model and print summary 87 | model.compile( 88 | optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate), 89 | loss=CTCloss(), 90 | metrics=[ 91 | CERMetric(vocabulary=configs.vocab), 92 | WERMetric(vocabulary=configs.vocab) 93 | ], 94 | run_eagerly=False 95 | ) 96 | model.summary(line_length=110) 97 | 98 | # Define callbacks 99 | earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode="min") 100 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_CER", verbose=1, save_best_only=True, mode="min") 101 | trainLogger = TrainLogger(configs.model_path) 102 | tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1) 103 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.8, min_delta=1e-10, patience=5, verbose=1, mode="auto") 104 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5") 105 | 106 | # Train the model 107 | model.fit( 108 | train_data_provider, 109 | validation_data=val_data_provider, 110 | epochs=configs.train_epochs, 111 | callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx], 112 | workers=configs.train_workers, 113 | ) 114 | 115 | # Save training and validation datasets as csv files 116 | train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv")) 117 | val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv")) -------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to PyTorch in a practical way 2 | ## In this tutorial, I'll cover the basics of PyTorch, how to prepare a dataset, construct the network, define training and validation loops, save the model and finally test the saved model

3 | 4 | # **Detailed tutorial**: 5 | ## [Introduction to PyTorch in a practical way](https://pylessons.com/pytorch-introduction) 6 | 7 |

8 | 9 |
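The train.py in this tutorial batches the MNIST arrays by hand with NumPy slicing, which keeps the mechanics visible. For comparison, here is a minimal sketch of the same batching done with torch.utils.data instead; this is an alternative illustration using placeholder arrays in the shapes train.py produces, not the approach the tutorial itself takes.

```python
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader

# Placeholder arrays standing in for the downloaded MNIST data:
# images of shape (N, 1, 28, 28) scaled to [0, 1] and integer labels of shape (N,)
images = np.random.rand(512, 1, 28, 28).astype(np.float32)
labels = np.random.randint(0, 10, size=(512,))

dataset = TensorDataset(torch.from_numpy(images), torch.from_numpy(labels).long())
loader = DataLoader(dataset, batch_size=64, shuffle=True)

for data, target in loader:
    # Each iteration yields tensors of shape (64, 1, 28, 28) and (64,), ready for Net() from model.py
    break
```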

-------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # Define the model architecture 5 | class Net(nn.Module): 6 | def __init__(self): 7 | super(Net, self).__init__() 8 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 9 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 10 | self.conv2_drop = nn.Dropout2d() 11 | self.fc1 = nn.Linear(320, 50) 12 | self.fc2 = nn.Linear(50, 10) 13 | 14 | def forward(self, x): 15 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 16 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 17 | x = x.view(-1, 320) 18 | x = F.relu(self.fc1(x)) 19 | x = F.dropout(x, training=self.training) 20 | x = self.fc2(x) 21 | x = F.log_softmax(x, dim=1) 22 | return x -------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | opencv-python 3 | tqdm 4 | torch 5 | torchsummary -------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import torch 4 | import numpy as np 5 | import requests, gzip, os, hashlib 6 | 7 | from model import Net 8 | 9 | path = "Datasets/mnist" # Path where to save the downloaded mnist dataset 10 | 11 | def fetch(url): 12 | if os.path.exists(path) is False: 13 | os.makedirs(path) 14 | 15 | fp = os.path.join(path, hashlib.md5(url.encode("utf-8")).hexdigest()) 16 | if os.path.isfile(fp): 17 | with open(fp, "rb") as f: 18 | data = f.read() 19 | else: 20 | with open(fp, "wb") as f: 21 | data = requests.get(url).content 22 | f.write(data) 23 | return np.frombuffer(gzip.decompress(data), dtype=np.uint8).copy() 24 | 25 | test_data = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 26 | test_targets = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:] 27 | 28 | # output path 29 | model_path = "Model/06_pytorch_introduction" 30 | 31 | # construct network and load weights 32 | network = Net() 33 | network.load_state_dict(torch.load("Models/06_pytorch_introduction/model.pt")) 34 | network.eval() # set to evaluation mode 35 | 36 | # loop over test images 37 | for test_image, test_target in zip(test_data, test_targets): 38 | 39 | # normalize image and convert to tensor 40 | inference_image = torch.from_numpy(test_image).float() / 255.0 41 | inference_image = inference_image.unsqueeze(0).unsqueeze(0) 42 | 43 | # predict 44 | output = network(inference_image) 45 | pred = output.argmax(dim=1, keepdim=True) 46 | prediction = str(pred.item()) 47 | 48 | test_image = cv2.resize(test_image, (400, 400)) 49 | cv2.imshow(prediction, test_image) 50 | key = cv2.waitKey(0) 51 | if key == ord("q"): # break on q key 52 | break 53 | 54 | cv2.destroyAllWindows() 55 | -------------------------------------------------------------------------------- /Tutorials/06_pytorch_introduction/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import numpy as np 4 | from tqdm import tqdm 5 | import requests, gzip, os, hashlib 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.optim as 
optim 10 | from torchsummary import summary 11 | 12 | from model import Net 13 | 14 | # define path to store dataset 15 | path = "Datasets/mnist" 16 | 17 | def fetch(url): 18 | if os.path.exists(path) is False: 19 | os.makedirs(path) 20 | 21 | fp = os.path.join(path, hashlib.md5(url.encode("utf-8")).hexdigest()) 22 | if os.path.isfile(fp): 23 | with open(fp, "rb") as f: 24 | data = f.read() 25 | else: 26 | with open(fp, "wb") as f: 27 | data = requests.get(url).content 28 | f.write(data) 29 | return np.frombuffer(gzip.decompress(data), dtype=np.uint8).copy() 30 | 31 | # load mnist dataset from yann.lecun.com, train data is of shape (60000, 28, 28) and targets are of shape (60000) 32 | train_data = fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 33 | train_targets = fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")[8:] 34 | test_data = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 35 | test_targets = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:] 36 | 37 | # uncomment to show images from dataset using OpenCV 38 | # for train_image, train_target in zip(train_data, train_targets): 39 | # train_image = cv2.resize(train_image, (400, 400)) 40 | # cv2.imshow("Image", train_image) 41 | # # if Q button break this loop 42 | # if cv2.waitKey(0) & 0xFF == ord("q"): 43 | # break 44 | # cv2.destroyAllWindows() 45 | 46 | # define training hyperparameters 47 | n_epochs = 5 48 | batch_size_train = 64 49 | batch_size_test = 64 50 | learning_rate = 0.001 51 | 52 | # reshape data to (items, channels, height, width) and normalize to [0, 1] 53 | train_data = np.expand_dims(train_data, axis=1) / 255.0 54 | test_data = np.expand_dims(test_data, axis=1) / 255.0 55 | 56 | # split data into batches of size [(batch_size, 1, 28, 28) ...] 57 | train_batches = [np.array(train_data[i:i+batch_size_train]) for i in range(0, len(train_data), batch_size_train)] 58 | # split targets into batches of size [(batch_size) ...] 
59 | train_target_batches = [np.array(train_targets[i:i+batch_size_train]) for i in range(0, len(train_targets), batch_size_train)] 60 | 61 | test_batches = [np.array(test_data[i:i+batch_size_test]) for i in range(0, len(test_data), batch_size_test)] 62 | test_target_batches = [np.array(test_targets[i:i+batch_size_test]) for i in range(0, len(test_targets), batch_size_test)] 63 | 64 | # create network 65 | network = Net() 66 | 67 | # uncomment to print network summary 68 | summary(network, (1, 28, 28), device="cpu") 69 | 70 | # define loss function and optimizer 71 | optimizer = optim.Adam(network.parameters(), lr=learning_rate) 72 | loss_function = nn.CrossEntropyLoss() 73 | 74 | # create training loop 75 | def train(epoch): 76 | # set network to training mode 77 | network.train() 78 | 79 | loss_sum = 0 80 | # create a progress bar 81 | train_pbar = tqdm(zip(train_batches, train_target_batches), total=len(train_batches)) 82 | for index, (data, target) in enumerate(train_pbar, start=1): 83 | 84 | # convert data to torch.FloatTensor 85 | data = torch.from_numpy(data).float() 86 | target = torch.from_numpy(target).long() 87 | 88 | # zero the parameter gradients 89 | optimizer.zero_grad() 90 | 91 | # forward + backward + optimize 92 | output = network(data) 93 | loss = loss_function(output, target) 94 | loss.backward() 95 | optimizer.step() 96 | 97 | # update progress bar with loss value 98 | loss_sum += loss.item() 99 | train_pbar.set_description(f"Epoch {epoch}, loss: {loss_sum / index:.4f}") 100 | 101 | # create testing loop 102 | def test(epoch): 103 | # set network to evaluation mode 104 | network.eval() 105 | 106 | correct, loss_sum = 0, 0 107 | # create progress bar 108 | val_pbar = tqdm(zip(test_batches, test_target_batches), total=len(test_batches)) 109 | with torch.no_grad(): 110 | for index, (data, target) in enumerate(val_pbar, start=1): 111 | 112 | # convert data to torch.FloatTensor 113 | data = torch.from_numpy(data).float() 114 | target = torch.from_numpy(target).long() 115 | 116 | # forward pass 117 | output = network(data) 118 | 119 | # update progress bar with loss and accuracy values 120 | loss_sum += loss_function(output, target).item() / target.size(0) 121 | pred = output.data.max(1, keepdim=True)[1] 122 | correct += pred.eq(target.data.view_as(pred)).sum() / target.size(0) 123 | 124 | val_pbar.set_description(f"val_loss: {loss_sum / index:.4f}, val_accuracy: {correct / index:.4f}") 125 | 126 | 127 | # train and test the model 128 | for epoch in range(1, n_epochs + 1): 129 | train(epoch) 130 | test(epoch) 131 | 132 | # define output path and create folder if not exists 133 | output_path = "Models/06_pytorch_introduction" 134 | if not os.path.exists(output_path): 135 | os.makedirs(output_path) 136 | 137 | # save model.pt to defined output path 138 | torch.save(network.state_dict(), os.path.join(output_path, "model.pt")) -------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/README.md: -------------------------------------------------------------------------------- 1 | # Using custom wrapper to simplify PyTorch models training pipeline 2 | ## I will introduce the PyTorch Wrapper in this tutorial, saving us time when developing the PyTorch models training pipeline. We'll be able to do this in blocks!

3 | 4 | # **Detailed tutorial**: 5 | ## [PyTorch Wrapper to Build and Train Networks](https://pylessons.com/pytorch-introduction) 6 | 7 |

8 | 9 |
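Concretely, the "blocks" are a DataProvider (batching plus per-sample preprocessing), a Model wrapper around the network, optimizer, loss and metrics, and a list of callbacks. The sketch below condenses this tutorial's train.py onto a tiny placeholder dataset to show how the blocks snap together; the real script feeds in MNIST and also passes verbose=1 to the callbacks.

```python
import numpy as np
import torch
import torch.optim as optim

from mltu.torch.dataProvider import DataProvider
from mltu.torch.model import Model
from mltu.torch.metrics import Accuracy
from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint

from model import Net  # the small MNIST CNN defined in model.py

# Tiny placeholder dataset of [image, label] pairs, just to show the wiring
data = [[np.random.randint(0, 255, (28, 28)).astype(np.uint8), int(np.random.randint(0, 10))] for _ in range(256)]

def preprocessor(image, label):
    # (28, 28) -> (1, 28, 28), scaled to [0, 1], exactly as train.py does
    return np.expand_dims(image, axis=0) / 255.0, label

train_provider = DataProvider(data[:192], data_preprocessors=[preprocessor], batch_size=64)
val_provider = DataProvider(data[192:], data_preprocessors=[preprocessor], batch_size=64)

network = Net()
model = Model(network, optim.Adam(network.parameters(), lr=0.001), torch.nn.CrossEntropyLoss(), metrics=[Accuracy()])

model.fit(
    train_provider,
    val_provider,
    epochs=10,
    callbacks=[
        EarlyStopping(monitor="val_accuracy", mode="max", patience=3),
        ModelCheckpoint("Models/07_pytorch_wrapper/model.pt", monitor="val_accuracy", mode="max", save_best_only=True),
    ],
)
```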

-------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | # Define the model architecture 5 | class Net(nn.Module): 6 | def __init__(self): 7 | super(Net, self).__init__() 8 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 9 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 10 | self.conv2_drop = nn.Dropout2d() 11 | self.fc1 = nn.Linear(320, 50) 12 | self.fc2 = nn.Linear(50, 10) 13 | 14 | def forward(self, x): 15 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 16 | x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) 17 | x = x.view(-1, 320) 18 | x = F.relu(self.fc1(x)) 19 | x = F.dropout(x, training=self.training) 20 | x = self.fc2(x) 21 | x = F.log_softmax(x, dim=1) 22 | return x -------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchsummary 3 | mltu==1.0.1 -------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import torch 4 | import numpy as np 5 | import requests, gzip, os, hashlib 6 | 7 | from model import Net 8 | 9 | path = "Datasets/mnist" # Path where to save the downloaded mnist dataset 10 | 11 | def fetch(url): 12 | if os.path.exists(path) is False: 13 | os.makedirs(path) 14 | 15 | fp = os.path.join(path, hashlib.md5(url.encode("utf-8")).hexdigest()) 16 | if os.path.isfile(fp): 17 | with open(fp, "rb") as f: 18 | data = f.read() 19 | else: 20 | with open(fp, "wb") as f: 21 | data = requests.get(url).content 22 | f.write(data) 23 | return np.frombuffer(gzip.decompress(data), dtype=np.uint8).copy() 24 | 25 | test_data = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 26 | test_targets = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:] 27 | 28 | # output path 29 | model_path = "Model/07_pytorch_wrapper" 30 | 31 | # construct network and load weights 32 | network = Net() 33 | network.load_state_dict(torch.load("Models/07_pytorch_wrapper/model.pt")) 34 | network.eval() # set to evaluation mode 35 | 36 | # loop over test images 37 | for test_image, test_target in zip(test_data, test_targets): 38 | 39 | # normalize image and convert to tensor 40 | inference_image = torch.from_numpy(test_image).float() / 255.0 41 | inference_image = inference_image.unsqueeze(0).unsqueeze(0) 42 | 43 | # predict 44 | output = network(inference_image) 45 | pred = output.argmax(dim=1, keepdim=True) 46 | prediction = str(pred.item()) 47 | 48 | test_image = cv2.resize(test_image, (400, 400)) 49 | cv2.imshow(prediction, test_image) 50 | key = cv2.waitKey(0) 51 | if key == ord("q"): # break on q key 52 | break 53 | 54 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /Tutorials/07_pytorch_wrapper/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import requests, gzip, os, hashlib 4 | 5 | import torch 6 | import torch.optim as optim 7 | 8 | from model import Net 9 | 10 | from mltu.torch.dataProvider import DataProvider 11 | from mltu.torch.model 
import Model 12 | from mltu.torch.metrics import Accuracy 13 | from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint 14 | 15 | # define path to store dataset 16 | path = "Datasets/data" 17 | 18 | def fetch(url): 19 | if os.path.exists(path) is False: 20 | os.makedirs(path) 21 | 22 | fp = os.path.join(path, hashlib.md5(url.encode("utf-8")).hexdigest()) 23 | if os.path.isfile(fp): 24 | with open(fp, "rb") as f: 25 | data = f.read() 26 | else: 27 | with open(fp, "wb") as f: 28 | data = requests.get(url).content 29 | f.write(data) 30 | return np.frombuffer(gzip.decompress(data), dtype=np.uint8).copy() 31 | 32 | # load mnist dataset from yann.lecun.com, train data is of shape (60000, 28, 28) and targets are of shape (60000) 33 | train_data = fetch("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 34 | train_targets = fetch("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")[8:] 35 | test_data = fetch("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")[0x10:].reshape((-1, 28, 28)) 36 | test_targets = fetch("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")[8:] 37 | 38 | train_dataset = [[data, target] for data, target in zip(train_data, train_targets)] 39 | test_dataset = [[data, target] for data, target in zip(test_data, test_targets)] 40 | 41 | def preprocessor(data, target): 42 | # original data is shape of (28, 28), expand to (1, 28, 28) and normalize to [0, 1] 43 | data = np.expand_dims(data, axis=0) / 255.0 44 | return data, target 45 | 46 | train_dataProvider = DataProvider( 47 | train_dataset, 48 | data_preprocessors=[preprocessor], 49 | batch_size=64, 50 | ) 51 | 52 | test_dataProvider = DataProvider( 53 | test_dataset, 54 | data_preprocessors=[preprocessor], 55 | batch_size=64 56 | ) 57 | 58 | # create network, optimizer and define loss function 59 | network = Net() 60 | optimizer = optim.Adam(network.parameters(), lr=0.001) 61 | loss = torch.nn.CrossEntropyLoss() 62 | 63 | # put on cuda device if available 64 | if torch.cuda.is_available(): 65 | network = network.cuda() 66 | 67 | # create callbacks 68 | earlyStopping = EarlyStopping( 69 | monitor="val_accuracy", 70 | patience=3, 71 | mode="max", 72 | verbose=1 73 | ) 74 | modelCheckpoint = ModelCheckpoint( 75 | "Models/07_pytorch_wrapper/model.pt", 76 | monitor="val_accuracy", 77 | mode="max", 78 | save_best_only=True, 79 | verbose=1 80 | ) 81 | 82 | # create model object that will handle training and testing of the network 83 | model = Model(network, optimizer, loss, metrics=[Accuracy()]) 84 | model.fit( 85 | train_dataProvider, 86 | test_dataProvider, 87 | epochs=100, 88 | callbacks=[earlyStopping, modelCheckpoint] 89 | ) -------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/README.md: -------------------------------------------------------------------------------- 1 | # Using custom wrapper to simplify PyTorch models training pipeline 2 | ### Construct an accurate handwriting recognition model with PyTorch! Understand how to use MLTU package, to simplify the PyTorch models training pipeline, and discover methods to enhance your model's accuracy!

3 | 4 | # **Detailed tutorial**: 5 | ### [Handwriting words recognition with PyTorch](https://pylessons.com/handwriting-recognition-pytorch) 6 | 7 |

8 | 9 |
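A note on how accuracy is measured in this tutorial: the scripts below score predictions with character and word error rates (CER/WER), i.e. the fraction of characters or words that have to be edited to turn the prediction into the label, so lower is better. A toy check with made-up strings, using the same mltu.utils.text_utils helpers the inference script imports from:

```python
from mltu.utils.text_utils import get_cer, get_wer

# One wrong character out of five characters, one wrong word out of two words
print(get_cer("helko", "hello"))              # ~0.2
print(get_wer("helko world", "hello world"))  # ~0.5
```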

-------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join("Models/08_handwriting_recognition_torch", datetime.strftime(datetime.now(), "%Y%m%d%H%M")) 11 | self.vocab = "" 12 | self.height = 32 13 | self.width = 128 14 | self.max_text_length = 0 15 | self.batch_size = 64 16 | self.learning_rate = 0.002 17 | self.train_epochs = 1000 18 | -------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import typing 3 | import numpy as np 4 | 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | from mltu.utils.text_utils import ctc_decoder, get_cer 7 | 8 | class ImageToWordModel(OnnxInferenceModel): 9 | def __init__(self, *args, **kwargs): 10 | super().__init__(*args, **kwargs) 11 | 12 | def predict(self, image: np.ndarray): 13 | image = cv2.resize(image, self.input_shapes[0][1:3][::-1]) 14 | 15 | image_pred = np.expand_dims(image, axis=0).astype(np.float32) 16 | 17 | preds = self.model.run(self.output_names, {self.input_names[0]: image_pred})[0] 18 | 19 | text = ctc_decoder(preds, self.metadata["vocab"])[0] 20 | 21 | return text 22 | 23 | if __name__ == "__main__": 24 | import pandas as pd 25 | from tqdm import tqdm 26 | 27 | model = ImageToWordModel(model_path="Models/08_handwriting_recognition_torch/202303142139/model.onnx") 28 | 29 | df = pd.read_csv("Models/08_handwriting_recognition_torch/202303142139/val.csv").values.tolist() 30 | 31 | accum_cer = [] 32 | for image_path, label in tqdm(df): 33 | image = cv2.imread(image_path.replace("\\", "/")) 34 | 35 | prediction_text = model.predict(image) 36 | 37 | cer = get_cer(prediction_text, label) 38 | print(f"Image: {image_path}, Label: {label}, Prediction: {prediction_text}, CER: {cer}") 39 | 40 | accum_cer.append(cer) 41 | 42 | print(f"Average CER: {np.average(accum_cer)}") -------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def activation_layer(activation: str="relu", alpha: float=0.1, inplace: bool=True): 7 | """ Activation layer wrapper for LeakyReLU and ReLU activation functions 8 | 9 | Args: 10 | activation: str, activation function name (default: 'relu') 11 | alpha: float (LeakyReLU activation function parameter) 12 | 13 | Returns: 14 | torch.Tensor: activation layer 15 | """ 16 | if activation == "relu": 17 | return nn.ReLU(inplace=inplace) 18 | 19 | elif activation == "leaky_relu": 20 | return nn.LeakyReLU(negative_slope=alpha, inplace=inplace) 21 | 22 | 23 | class ConvBlock(nn.Module): 24 | """ Convolutional block with batch normalization 25 | """ 26 | def __init__(self, in_channels: int, out_channels: int, kernel_size: int, stride: int, padding: int): 27 | super(ConvBlock, self).__init__() 28 | self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) 29 | self.bn = 
nn.BatchNorm2d(out_channels) 30 | 31 | def forward(self, x: torch.Tensor): 32 | return self.bn(self.conv(x)) 33 | 34 | 35 | class ResidualBlock(nn.Module): 36 | def __init__(self, in_channels, out_channels, skip_conv=True, stride=1, dropout=0.2, activation="leaky_relu"): 37 | super(ResidualBlock, self).__init__() 38 | self.convb1 = ConvBlock(in_channels, out_channels, kernel_size=3, stride=stride, padding=1) 39 | self.act1 = activation_layer(activation) 40 | 41 | self.convb2 = ConvBlock(out_channels, out_channels, kernel_size=3, stride=1, padding=1) 42 | 43 | self.dropout = nn.Dropout(p=dropout) 44 | 45 | self.shortcut = None 46 | if skip_conv: 47 | if stride != 1 or in_channels != out_channels: 48 | self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride) 49 | 50 | self.act2 = activation_layer(activation) 51 | 52 | def forward(self, x): 53 | skip = x 54 | 55 | out = self.act1(self.convb1(x)) 56 | out = self.convb2(out) 57 | 58 | if self.shortcut is not None: 59 | out += self.shortcut(skip) 60 | 61 | out = self.act2(out) 62 | out = self.dropout(out) 63 | 64 | return out 65 | 66 | class Network(nn.Module): 67 | """ Handwriting recognition network for CTC loss""" 68 | def __init__(self, num_chars: int, activation: str="leaky_relu", dropout: float=0.2): 69 | super(Network, self).__init__() 70 | 71 | self.rb1 = ResidualBlock(3, 16, skip_conv = True, stride=1, activation=activation, dropout=dropout) 72 | self.rb2 = ResidualBlock(16, 16, skip_conv = True, stride=2, activation=activation, dropout=dropout) 73 | self.rb3 = ResidualBlock(16, 16, skip_conv = False, stride=1, activation=activation, dropout=dropout) 74 | 75 | self.rb4 = ResidualBlock(16, 32, skip_conv = True, stride=2, activation=activation, dropout=dropout) 76 | self.rb5 = ResidualBlock(32, 32, skip_conv = False, stride=1, activation=activation, dropout=dropout) 77 | 78 | self.rb6 = ResidualBlock(32, 64, skip_conv = True, stride=2, activation=activation, dropout=dropout) 79 | self.rb7 = ResidualBlock(64, 64, skip_conv = True, stride=1, activation=activation, dropout=dropout) 80 | 81 | self.rb8 = ResidualBlock(64, 64, skip_conv = False, stride=1, activation=activation, dropout=dropout) 82 | self.rb9 = ResidualBlock(64, 64, skip_conv = False, stride=1, activation=activation, dropout=dropout) 83 | 84 | self.lstm = nn.LSTM(64, 128, bidirectional=True, num_layers=1, batch_first=True) 85 | self.lstm_dropout = nn.Dropout(p=dropout) 86 | 87 | self.output = nn.Linear(256, num_chars + 1) 88 | 89 | def forward(self, images: torch.Tensor) -> torch.Tensor: 90 | # normalize images between 0 and 1 91 | images_float = images / 255.0 92 | 93 | # transpose image to channel first 94 | images_float = images_float.permute(0, 3, 1, 2) 95 | 96 | # apply convolutions 97 | x = self.rb1(images_float) 98 | x = self.rb2(x) 99 | x = self.rb3(x) 100 | x = self.rb4(x) 101 | x = self.rb5(x) 102 | x = self.rb6(x) 103 | x = self.rb7(x) 104 | x = self.rb8(x) 105 | x = self.rb9(x) 106 | 107 | x = x.reshape(x.size(0), -1, x.size(1)) 108 | 109 | x, _ = self.lstm(x) 110 | x = self.lstm_dropout(x) 111 | 112 | x = self.output(x) 113 | x = F.log_softmax(x, 2) 114 | 115 | return x -------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/requirements.txt: 1 | torch==1.13.1 2 | tensorboard==2.10.1 3 | onnx==1.12.0 4 | torchsummaryX
-------------------------------------------------------------------------------- /Tutorials/08_handwriting_recognition_torch/train_torch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tarfile 3 | from tqdm import tqdm 4 | from io import BytesIO 5 | from zipfile import ZipFile 6 | from urllib.request import urlopen 7 | 8 | import torch 9 | import torch.optim as optim 10 | from torchsummaryX import summary 11 | 12 | from mltu.torch.model import Model 13 | from mltu.torch.losses import CTCLoss 14 | from mltu.torch.dataProvider import DataProvider 15 | from mltu.torch.metrics import CERMetric, WERMetric 16 | from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, Model2onnx, ReduceLROnPlateau 17 | 18 | from mltu.preprocessors import ImageReader 19 | from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2 20 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen 21 | from mltu.annotations.images import CVImage 22 | 23 | from model import Network 24 | from configs import ModelConfigs 25 | 26 | 27 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 28 | http_response = urlopen(url) 29 | 30 | data = b"" 31 | iterations = http_response.length // chunk_size + 1 32 | for _ in tqdm(range(iterations)): 33 | data += http_response.read(chunk_size) 34 | 35 | zipfile = ZipFile(BytesIO(data)) 36 | zipfile.extractall(path=extract_to) 37 | 38 | dataset_path = os.path.join("Datasets", "IAM_Words") 39 | if not os.path.exists(dataset_path): 40 | download_and_unzip("https://git.io/J0fjL", extract_to="Datasets") 41 | 42 | file = tarfile.open(os.path.join(dataset_path, "words.tgz")) 43 | file.extractall(os.path.join(dataset_path, "words")) 44 | 45 | dataset, vocab, max_len = [], set(), 0 46 | 47 | # Preprocess the dataset by the specific IAM_Words dataset file structure 48 | words = open(os.path.join(dataset_path, "words.txt"), "r").readlines() 49 | for line in tqdm(words): 50 | if line.startswith("#"): 51 | continue 52 | 53 | line_split = line.split(" ") 54 | if line_split[1] == "err": 55 | continue 56 | 57 | folder1 = line_split[0][:3] 58 | folder2 = "-".join(line_split[0].split("-")[:2]) 59 | file_name = line_split[0] + ".png" 60 | label = line_split[-1].rstrip("\n") 61 | 62 | rel_path = os.path.join(dataset_path, "words", folder1, folder2, file_name) 63 | if not os.path.exists(rel_path): 64 | print(f"File not found: {rel_path}") 65 | continue 66 | 67 | dataset.append([rel_path, label]) 68 | vocab.update(list(label)) 69 | max_len = max(max_len, len(label)) 70 | 71 | configs = ModelConfigs() 72 | 73 | # Save vocab and maximum text length to configs 74 | configs.vocab = "".join(sorted(vocab)) 75 | configs.max_text_length = max_len 76 | configs.save() 77 | 78 | # Create a data provider for the dataset 79 | data_provider = DataProvider( 80 | dataset=dataset, 81 | skip_validation=True, 82 | batch_size=configs.batch_size, 83 | data_preprocessors=[ImageReader(CVImage)], 84 | transformers=[ 85 | # ImageShowCV2(), # uncomment to show images when iterating over the data provider 86 | ImageResizer(configs.width, configs.height, keep_aspect_ratio=False), 87 | LabelIndexer(configs.vocab), 88 | LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)) 89 | ], 90 | use_cache=True, 91 | ) 92 | 93 | # Split the dataset into training and validation sets 94 | train_dataProvider, test_dataProvider = data_provider.split(split = 
0.9) 95 | 96 | # Augment training data with random brightness, rotation and erode/dilate 97 | train_dataProvider.augmentors = [ 98 | RandomBrightness(), 99 | RandomErodeDilate(), 100 | RandomSharpen(), 101 | RandomRotate(angle=10), 102 | ] 103 | 104 | network = Network(len(configs.vocab), activation="leaky_relu", dropout=0.3) 105 | loss = CTCLoss(blank=len(configs.vocab)) 106 | optimizer = optim.Adam(network.parameters(), lr=configs.learning_rate) 107 | 108 | # uncomment to print network summary, torchsummaryX package is required 109 | summary(network, torch.zeros((1, configs.height, configs.width, 3))) 110 | 111 | # put on cuda device if available 112 | if torch.cuda.is_available(): 113 | network = network.cuda() 114 | 115 | # create callbacks 116 | earlyStopping = EarlyStopping(monitor="val_CER", patience=20, mode="min", verbose=1) 117 | modelCheckpoint = ModelCheckpoint(configs.model_path + "/model.pt", monitor="val_CER", mode="min", save_best_only=True, verbose=1) 118 | tb_callback = TensorBoard(configs.model_path + "/logs") 119 | reduce_lr = ReduceLROnPlateau(monitor="val_CER", factor=0.9, patience=10, verbose=1, mode="min", min_lr=1e-6) 120 | model2onnx = Model2onnx( 121 | saved_model_path=configs.model_path + "/model.pt", 122 | input_shape=(1, configs.height, configs.width, 3), 123 | verbose=1, 124 | metadata={"vocab": configs.vocab} 125 | ) 126 | 127 | # create model object that will handle training and testing of the network 128 | model = Model(network, optimizer, loss, metrics=[CERMetric(configs.vocab), WERMetric(configs.vocab)]) 129 | model.fit( 130 | train_dataProvider, 131 | test_dataProvider, 132 | epochs=1000, 133 | callbacks=[earlyStopping, modelCheckpoint, tb_callback, reduce_lr, model2onnx] 134 | ) 135 | 136 | # Save training and validation datasets as csv files 137 | train_dataProvider.to_csv(os.path.join(configs.model_path, "train.csv")) 138 | test_dataProvider.to_csv(os.path.join(configs.model_path, "val.csv")) 139 | -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/README.md: -------------------------------------------------------------------------------- 1 | # Training TensorFlow Transformer model for Spanish to English translation task 2 | ### In this tutorial, I'll walk through a practical example of Transformer Training for Language Translation tasks from Spanish to the English language 3 | 4 |

5 | # **Detailed tutorial**: 6 | ### [Transformer training with TensorFlow for Translation task](https://pylessons.com/transformers-training) 7 | 8 |

9 | 10 |
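One detail worth seeing in isolation: the transformer is trained with teacher forcing, so for every sentence pair the decoder input and the decoder target are the same tokenized sentence shifted by one position (preprocess_inputs in train.py below drops the [END] token from the input and the [START] token from the target). A tiny illustration with made-up token ids:

```python
import numpy as np

# Hypothetical character tokens for one English label: [START]=1, 'h'=8, 'i'=9, [END]=2, padding=0
label_tokens = [1, 8, 9, 2]
max_length = 6

decoder_input = np.zeros(max_length, dtype=np.int64)
decoder_output = np.zeros(max_length, dtype=np.int64)

decoder_input[:len(label_tokens) - 1] = label_tokens[:-1]   # drop [END]   -> [1, 8, 9, 0, 0, 0]
decoder_output[:len(label_tokens) - 1] = label_tokens[1:]   # drop [START] -> [8, 9, 2, 0, 0, 0]

print(decoder_input)   # what the decoder sees
print(decoder_output)  # what the loss compares against, one step ahead
```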

-------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from mltu.configs import BaseModelConfigs 5 | 6 | 7 | class ModelConfigs(BaseModelConfigs): 8 | def __init__(self): 9 | super().__init__() 10 | self.model_path = os.path.join( 11 | "Models/09_translation_transformer", 12 | datetime.strftime(datetime.now(), "%Y%m%d%H%M"), 13 | ) 14 | self.num_layers = 4 15 | self.d_model = 128 16 | self.num_heads = 8 17 | self.dff = 512 18 | self.dropout_rate = 0.1 19 | self.batch_size = 16 20 | self.train_epochs = 50 21 | # CustomSchedule parameters 22 | self.init_lr = 0.00001 23 | self.lr_after_warmup = 0.0005 24 | self.final_lr = 0.0001 25 | self.warmup_epochs = 2 26 | self.decay_epochs = 18 -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/download.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import requests 4 | from tqdm import tqdm 5 | from bs4 import BeautifulSoup 6 | 7 | # URL to the directory containing the files to be downloaded 8 | language = "en-es" 9 | url = f"https://data.statmt.org/opus-100-corpus/v1.0/supervised/{language}/" 10 | save_directory = f"./Datasets/{language}" 11 | 12 | # Create the save directory if it doesn't exist 13 | os.makedirs(save_directory, exist_ok=True) 14 | 15 | # Send a GET request to the URL 16 | response = requests.get(url) 17 | 18 | # Parse the HTML response 19 | soup = BeautifulSoup(response.content, 'html.parser') 20 | 21 | # Find all the anchor tags in the HTML 22 | links = soup.find_all('a') 23 | 24 | # Extract the href attribute from each anchor tag 25 | file_links = [link['href'] for link in links if '.' in link['href']] 26 | 27 | # Download each file 28 | for file_link in tqdm(file_links): 29 | file_url = url + file_link 30 | save_path = os.path.join(save_directory, file_link) 31 | 32 | print(f"Downloading {file_url}") 33 | 34 | # Send a GET request for the file 35 | file_response = requests.get(file_url) 36 | if file_response.status_code == 404: 37 | print(f"Could not download {file_url}") 38 | continue 39 | 40 | # Save the file to the specified directory 41 | with open(save_path, 'wb') as file: 42 | file.write(file_response.content) 43 | 44 | print(f"Saved {file_link}") 45 | 46 | print("All files have been downloaded.") -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from mltu.tensorflow.transformer.layers import Encoder, Decoder 4 | 5 | def Transformer( 6 | input_vocab_size: int, 7 | target_vocab_size: int, 8 | encoder_input_size: int = None, 9 | decoder_input_size: int = None, 10 | num_layers: int=6, 11 | d_model: int=512, 12 | num_heads: int=8, 13 | dff: int=2048, 14 | dropout_rate: float=0.1, 15 | ) -> tf.keras.Model: 16 | """ 17 | A custom TensorFlow model that implements the Transformer architecture. 18 | 19 | Args: 20 | input_vocab_size (int): The size of the input vocabulary. 21 | target_vocab_size (int): The size of the target vocabulary. 22 | encoder_input_size (int): The size of the encoder input sequence. 23 | decoder_input_size (int): The size of the decoder input sequence. 
24 | num_layers (int): The number of layers in the encoder and decoder. 25 | d_model (int): The dimensionality of the model. 26 | num_heads (int): The number of heads in the multi-head attention layer. 27 | dff (int): The dimensionality of the feed-forward layer. 28 | dropout_rate (float): The dropout rate. 29 | 30 | Returns: 31 | A TensorFlow Keras model. 32 | """ 33 | inputs = [ 34 | tf.keras.layers.Input(shape=(encoder_input_size,), dtype=tf.int64), 35 | tf.keras.layers.Input(shape=(decoder_input_size,), dtype=tf.int64) 36 | ] 37 | 38 | encoder_input, decoder_input = inputs 39 | 40 | encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=input_vocab_size, dropout_rate=dropout_rate)(encoder_input) 41 | decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=target_vocab_size, dropout_rate=dropout_rate)(decoder_input, encoder) 42 | 43 | output = tf.keras.layers.Dense(target_vocab_size)(decoder) 44 | 45 | return tf.keras.Model(inputs=inputs, outputs=output) -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | tf2onnx==1.14.0 3 | onnx==1.12.0 -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | from mltu.tokenizers import CustomTokenizer 5 | from mltu.inferenceModel import OnnxInferenceModel 6 | 7 | class PtEnTranslator(OnnxInferenceModel): 8 | def __init__(self, *args, **kwargs): 9 | super().__init__(*args, **kwargs) 10 | 11 | self.new_inputs = self.model.get_inputs() 12 | self.tokenizer = CustomTokenizer.load(self.metadata["tokenizer"]) 13 | self.detokenizer = CustomTokenizer.load(self.metadata["detokenizer"]) 14 | 15 | def predict(self, sentence): 16 | start = time.time() 17 | tokenized_sentence = self.tokenizer.texts_to_sequences([sentence])[0] 18 | encoder_input = np.pad(tokenized_sentence, (0, self.tokenizer.max_length - len(tokenized_sentence)), constant_values=0).astype(np.int64) 19 | 20 | tokenized_results = [self.detokenizer.start_token_index] 21 | for index in range(self.detokenizer.max_length - 1): 22 | decoder_input = np.pad(tokenized_results, (0, self.detokenizer.max_length - len(tokenized_results)), constant_values=0).astype(np.int64) 23 | input_dict = { 24 | self.model._inputs_meta[0].name: np.expand_dims(encoder_input, axis=0), 25 | self.model._inputs_meta[1].name: np.expand_dims(decoder_input, axis=0), 26 | } 27 | preds = self.model.run(None, input_dict)[0] # preds shape (1, 206, 29110) 28 | pred_results = np.argmax(preds, axis=2) 29 | tokenized_results.append(pred_results[0][index]) 30 | 31 | if tokenized_results[-1] == self.detokenizer.end_token_index: 32 | break 33 | 34 | results = self.detokenizer.detokenize([tokenized_results]) 35 | return results[0], time.time() - start 36 | 37 | def read_files(path): 38 | with open(path, "r", encoding="utf-8") as f: 39 | en_train_dataset = f.read().split("\n")[:-1] 40 | return en_train_dataset 41 | 42 | # Path to dataset 43 | en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en" 44 | es_validation_data_path = "Datasets/en-es/opus.en-es-dev.es" 45 | 46 | en_validation_data = read_files(en_validation_data_path) 47 | es_validation_data = 
read_files(es_validation_data_path) 48 | 49 | # Consider only sentences with length <= 500 50 | max_length = 500 51 | val_examples = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length] 52 | 53 | translator = PtEnTranslator("Models/09_translation_transformer/202308241514/model.onnx") 54 | 55 | val_dataset = [] 56 | for es, en in val_examples: 57 | results, duration = translator.predict(es) 58 | print("Spanish: ", es.lower()) 59 | print("English: ", en.lower()) 60 | print("English pred:", results) 61 | print(duration) 62 | print() -------------------------------------------------------------------------------- /Tutorials/09_translation_transformer/train.py: 1 | import numpy as np 2 | 3 | import tensorflow as tf 4 | try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")] 5 | except: pass 6 | 7 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard 8 | from mltu.tensorflow.callbacks import Model2onnx, WarmupCosineDecay 9 | 10 | from mltu.tensorflow.dataProvider import DataProvider 11 | from mltu.tokenizers import CustomTokenizer 12 | 13 | from mltu.tensorflow.transformer.utils import MaskedAccuracy, MaskedLoss 14 | from mltu.tensorflow.transformer.callbacks import EncDecSplitCallback 15 | 16 | from model import Transformer 17 | from configs import ModelConfigs 18 | 19 | configs = ModelConfigs() 20 | 21 | # Path to dataset 22 | en_training_data_path = "Datasets/en-es/opus.en-es-train.en" 23 | en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en" 24 | es_training_data_path = "Datasets/en-es/opus.en-es-train.es" 25 | es_validation_data_path = "Datasets/en-es/opus.en-es-dev.es" 26 | 27 | def read_files(path): 28 | with open(path, "r", encoding="utf-8") as f: 29 | en_train_dataset = f.read().split("\n")[:-1] 30 | return en_train_dataset 31 | 32 | en_training_data = read_files(en_training_data_path) 33 | en_validation_data = read_files(en_validation_data_path) 34 | es_training_data = read_files(es_training_data_path) 35 | es_validation_data = read_files(es_validation_data_path) 36 | 37 | # Consider only sentences with length <= 500 38 | max_length = 500 39 | train_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_training_data, en_training_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length] 40 | val_dataset = [[es_sentence, en_sentence] for es_sentence, en_sentence in zip(es_validation_data, en_validation_data) if len(es_sentence) <= max_length and len(en_sentence) <= max_length] 41 | es_training_data, en_training_data = zip(*train_dataset) 42 | es_validation_data, en_validation_data = zip(*val_dataset) 43 | 44 | # prepare spanish tokenizer, this is the input language 45 | tokenizer = CustomTokenizer(char_level=True) 46 | tokenizer.fit_on_texts(es_training_data) 47 | tokenizer.save(configs.model_path + "/tokenizer.json") 48 | 49 | # prepare english tokenizer, this is the output language 50 | detokenizer = CustomTokenizer(char_level=True) 51 | detokenizer.fit_on_texts(en_training_data) 52 | detokenizer.save(configs.model_path + "/detokenizer.json") 53 | 54 | 55 | def preprocess_inputs(data_batch, label_batch): 56 | encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64) 57 | decoder_input = 
np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64) 58 | decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64) 59 | 60 | data_batch_tokens = tokenizer.texts_to_sequences(data_batch) 61 | label_batch_tokens = detokenizer.texts_to_sequences(label_batch) 62 | 63 | for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)): 64 | encoder_input[index][:len(data)] = data 65 | decoder_input[index][:len(label)-1] = label[:-1] # Drop the [END] tokens 66 | decoder_output[index][:len(label)-1] = label[1:] # Drop the [START] tokens 67 | 68 | return (encoder_input, decoder_input), decoder_output 69 | 70 | # Create Training Data Provider 71 | train_dataProvider = DataProvider( 72 | train_dataset, 73 | batch_size=configs.batch_size, 74 | batch_postprocessors=[preprocess_inputs], 75 | use_cache=True, 76 | ) 77 | 78 | # Create Validation Data Provider 79 | val_dataProvider = DataProvider( 80 | val_dataset, 81 | batch_size=configs.batch_size, 82 | batch_postprocessors=[preprocess_inputs], 83 | use_cache=True, 84 | ) 85 | 86 | # Create TensorFlow Transformer Model 87 | transformer = Transformer( 88 | num_layers=configs.num_layers, 89 | d_model=configs.d_model, 90 | num_heads=configs.num_heads, 91 | dff=configs.dff, 92 | input_vocab_size=len(tokenizer)+1, 93 | target_vocab_size=len(detokenizer)+1, 94 | dropout_rate=configs.dropout_rate, 95 | encoder_input_size=tokenizer.max_length, 96 | decoder_input_size=detokenizer.max_length 97 | ) 98 | 99 | transformer.summary() 100 | 101 | optimizer = tf.keras.optimizers.Adam(learning_rate=configs.init_lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9) 102 | 103 | # Compile the model 104 | transformer.compile( 105 | loss=MaskedLoss(), 106 | optimizer=optimizer, 107 | metrics=[MaskedAccuracy()], 108 | run_eagerly=False 109 | ) 110 | 111 | # Define callbacks 112 | warmupCosineDecay = WarmupCosineDecay( 113 | lr_after_warmup=configs.lr_after_warmup, 114 | final_lr=configs.final_lr, 115 | warmup_epochs=configs.warmup_epochs, 116 | decay_epochs=configs.decay_epochs, 117 | initial_lr=configs.init_lr, 118 | ) 119 | earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=5, verbose=1, mode="max") 120 | checkpoint = ModelCheckpoint(f"{configs.model_path}/model.h5", monitor="val_masked_accuracy", verbose=1, save_best_only=True, mode="max", save_weights_only=False) 121 | tb_callback = TensorBoard(f"{configs.model_path}/logs") 122 | reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=2, verbose=1, mode="max") 123 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=False) 124 | encDecSplitCallback = EncDecSplitCallback(configs.model_path, encoder_metadata={"tokenizer": tokenizer.dict()}, decoder_metadata={"detokenizer": detokenizer.dict()}) 125 | 126 | configs.save() 127 | 128 | # Train the model 129 | transformer.fit( 130 | train_dataProvider, 131 | validation_data=val_dataProvider, 132 | epochs=configs.train_epochs, 133 | callbacks=[ 134 | earlystopper, 135 | warmupCosineDecay, 136 | checkpoint, 137 | tb_callback, 138 | reduceLROnPlat, 139 | model2onnx, 140 | encDecSplitCallback 141 | ] 142 | ) -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | 4 | from 
mltu.configs import BaseModelConfigs 5 | 6 | class ModelConfigs(BaseModelConfigs): 7 | def __init__(self): 8 | super().__init__() 9 | self.model_path = os.path.join( 10 | "Models/10_wav2vec2_torch", 11 | datetime.strftime(datetime.now(), "%Y%m%d%H%M"), 12 | ) 13 | self.batch_size = 8 14 | self.train_epochs = 60 15 | self.train_workers = 20 16 | 17 | self.init_lr = 1.0e-8 18 | self.lr_after_warmup = 1e-05 19 | self.final_lr = 5e-06 20 | self.warmup_epochs = 10 21 | self.decay_epochs = 40 22 | self.weight_decay = 0.005 23 | self.mixed_precision = True 24 | 25 | self.max_audio_length = 246000 26 | self.max_label_length = 256 27 | 28 | self.vocab = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.13.1+cu117 2 | transformers==4.33.1 3 | mltu==1.1.4 4 | onnx 5 | onnxruntime -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mltu.inferenceModel import OnnxInferenceModel 4 | from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer 5 | 6 | class Wav2vec2(OnnxInferenceModel): 7 | def __init__(self, *args, **kwargs): 8 | super().__init__(*args, **kwargs) 9 | 10 | def predict(self, audio: np.ndarray): 11 | 12 | audio = np.expand_dims(audio, axis=0).astype(np.float32) 13 | 14 | preds = self.model.run(None, {self.input_name: audio})[0] 15 | 16 | text = ctc_decoder(preds, self.metadata["vocab"])[0] 17 | 18 | return text 19 | 20 | if __name__ == "__main__": 21 | import librosa 22 | import pandas as pd 23 | from tqdm import tqdm 24 | 25 | model = Wav2vec2(model_path="Models/10_wav2vec2_torch/202309171434/model.onnx") 26 | 27 | # The list of multiple [audio_path, label] for validation 28 | val_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/val.csv").values.tolist() 29 | 30 | accum_cer, accum_wer = [], [] 31 | pbar = tqdm(val_dataset) 32 | for vaw_path, label in pbar: 33 | audio, sr = librosa.load(vaw_path, sr=16000) 34 | 35 | prediction_text = model.predict(audio) 36 | 37 | cer = get_cer(prediction_text, label) 38 | wer = get_wer(prediction_text, label) 39 | 40 | accum_cer.append(cer) 41 | accum_wer.append(wer) 42 | print(label) 43 | 44 | pbar.set_description(f"Average CER: {np.average(accum_cer):.4f}, Average WER: {np.average(accum_wer):.4f}") -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tarfile 3 | import pandas as pd 4 | from tqdm import tqdm 5 | from io import BytesIO 6 | from urllib.request import urlopen 7 | 8 | import torch 9 | from torch import nn 10 | from transformers import Wav2Vec2ForCTC 11 | import torch.nn.functional as F 12 | 13 | from mltu.torch.model import Model 14 | from mltu.torch.losses import CTCLoss 15 | from mltu.torch.dataProvider import DataProvider 16 | from mltu.torch.metrics import CERMetric, WERMetric 17 | from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, Model2onnx, WarmupCosineDecay 18 | from mltu.augmentors import RandomAudioNoise, RandomAudioPitchShift, 
RandomAudioTimeStretch 19 | 20 | from mltu.preprocessors import AudioReader 21 | from mltu.transformers import LabelIndexer, LabelPadding, AudioPadding 22 | 23 | from configs import ModelConfigs 24 | 25 | configs = ModelConfigs() 26 | 27 | 28 | def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024): 29 | http_response = urlopen(url) 30 | 31 | data = b"" 32 | iterations = http_response.length // chunk_size + 1 33 | for _ in tqdm(range(iterations)): 34 | data += http_response.read(chunk_size) 35 | 36 | tarFile = tarfile.open(fileobj=BytesIO(data), mode="r|bz2") 37 | tarFile.extractall(path=extract_to) 38 | tarFile.close() 39 | 40 | 41 | dataset_path = os.path.join("Datasets", "LJSpeech-1.1") 42 | if not os.path.exists(dataset_path): 43 | download_and_unzip("https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", extract_to="Datasets") 44 | 45 | dataset_path = "Datasets/LJSpeech-1.1" 46 | metadata_path = dataset_path + "/metadata.csv" 47 | wavs_path = dataset_path + "/wavs/" 48 | 49 | # Read metadata file and parse it 50 | metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3) 51 | dataset = [] 52 | vocab = [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 53 | for file_name, transcription, normalized_transcription in metadata_df.values.tolist(): 54 | path = f"Datasets/LJSpeech-1.1/wavs/{file_name}.wav" 55 | new_label = "".join([l for l in normalized_transcription.lower() if l in vocab]) 56 | dataset.append([path, new_label]) 57 | 58 | # Create a data provider for the dataset 59 | data_provider = DataProvider( 60 | dataset=dataset, 61 | skip_validation=True, 62 | batch_size=configs.batch_size, 63 | data_preprocessors=[ 64 | AudioReader(sample_rate=16000), 65 | ], 66 | transformers=[ 67 | LabelIndexer(vocab), 68 | ], 69 | use_cache=False, 70 | batch_postprocessors=[ 71 | AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True), 72 | LabelPadding(padding_value=len(vocab), use_on_batch=True), 73 | ], 74 | use_multiprocessing=True, 75 | max_queue_size=10, 76 | workers=configs.train_workers, 77 | ) 78 | train_dataProvider, test_dataProvider = data_provider.split(split=0.9) 79 | 80 | # train_dataProvider.augmentors = [ 81 | # RandomAudioNoise(), 82 | # RandomAudioPitchShift(), 83 | # RandomAudioTimeStretch() 84 | # ] 85 | 86 | vocab = sorted(vocab) 87 | configs.vocab = vocab 88 | configs.save() 89 | 90 | 91 | class CustomWav2Vec2Model(nn.Module): 92 | def __init__(self, hidden_states, dropout_rate=0.2, **kwargs): 93 | super(CustomWav2Vec2Model, self).__init__( **kwargs) 94 | pretrained_name = "facebook/wav2vec2-base-960h" 95 | self.model = Wav2Vec2ForCTC.from_pretrained(pretrained_name, vocab_size=hidden_states, ignore_mismatched_sizes=True) 96 | self.model.freeze_feature_encoder() # this part does not need to be fine-tuned 97 | 98 | def forward(self, inputs): 99 | output = self.model(inputs, attention_mask=None).logits 100 | # Apply softmax 101 | output = F.log_softmax(output, -1) 102 | return output 103 | 104 | custom_model = CustomWav2Vec2Model(hidden_states = len(vocab)+1) 105 | 106 | # put on cuda device if available 107 | if torch.cuda.is_available(): 108 | custom_model = custom_model.cuda() 109 | 110 | # create callbacks 111 | warmupCosineDecay = WarmupCosineDecay( 112 | lr_after_warmup=configs.lr_after_warmup, 113 | warmup_epochs=configs.warmup_epochs, 114 | decay_epochs=configs.decay_epochs, 115 | final_lr=configs.final_lr, 
116 | initial_lr=configs.init_lr, 117 | verbose=True, 118 | ) 119 | tb_callback = TensorBoard(configs.model_path + "/logs") 120 | earlyStopping = EarlyStopping(monitor="val_CER", patience=16, mode="min", verbose=1) 121 | modelCheckpoint = ModelCheckpoint(configs.model_path + "/model.pt", monitor="val_CER", mode="min", save_best_only=True, verbose=1) 122 | model2onnx = Model2onnx( 123 | saved_model_path=configs.model_path + "/model.pt", 124 | input_shape=(1, configs.max_audio_length), 125 | verbose=1, 126 | metadata={"vocab": configs.vocab}, 127 | dynamic_axes={"input": {0: "batch_size", 1: "sequence_length"}, "output": {0: "batch_size", 1: "sequence_length"}} 128 | ) 129 | 130 | # create model object that will handle training and testing of the network 131 | model = Model( 132 | custom_model, 133 | loss = CTCLoss(blank=len(configs.vocab), zero_infinity=True), 134 | optimizer = torch.optim.AdamW(custom_model.parameters(), lr=configs.init_lr, weight_decay=configs.weight_decay), 135 | metrics=[ 136 | CERMetric(configs.vocab), 137 | WERMetric(configs.vocab) 138 | ], 139 | mixed_precision=configs.mixed_precision, 140 | ) 141 | 142 | # Save training and validation datasets as csv files 143 | train_dataProvider.to_csv(os.path.join(configs.model_path, "train.csv")) 144 | test_dataProvider.to_csv(os.path.join(configs.model_path, "val.csv")) 145 | 146 | model.fit( 147 | train_dataProvider, 148 | test_dataProvider, 149 | epochs=configs.train_epochs, 150 | callbacks=[ 151 | warmupCosineDecay, 152 | tb_callback, 153 | earlyStopping, 154 | modelCheckpoint, 155 | model2onnx 156 | ] 157 | ) -------------------------------------------------------------------------------- /Tutorials/10_wav2vec2_torch/train_tf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | try: 3 | [ 4 | tf.config.experimental.set_memory_growth(gpu, True) 5 | for gpu in tf.config.experimental.list_physical_devices("GPU") 6 | ] 7 | except: 8 | pass 9 | 10 | from keras import layers 11 | from mltu.tensorflow.dataProvider import DataProvider 12 | from mltu.transformers import LabelIndexer, LabelPadding, AudioPadding 13 | 14 | from mltu.tensorflow.losses import CTCloss 15 | from mltu.tensorflow.metrics import CERMetric, WERMetric 16 | from mltu.tensorflow.callbacks import Model2onnx, WarmupCosineDecay 17 | 18 | from mltu.augmentors import RandomAudioNoise, RandomAudioPitchShift, RandomAudioTimeStretch 19 | 20 | from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard 21 | 22 | import pandas as pd 23 | 24 | from configs import ModelConfigs 25 | 26 | configs = ModelConfigs() 27 | from transformers import TFWav2Vec2ForCTC 28 | from mltu.preprocessors import AudioReader 29 | 30 | 31 | train_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/train.csv").values.tolist() 32 | validation_dataset = pd.read_csv("Models/10_wav2vec2_torch/202309171434/val.csv").values.tolist() 33 | 34 | # Create a data provider for the dataset 35 | train_dataProvider = DataProvider( 36 | dataset=train_dataset, 37 | skip_validation=True, 38 | batch_size=configs.batch_size, 39 | data_preprocessors=[ 40 | AudioReader(sample_rate=16000), 41 | ], 42 | transformers=[ 43 | LabelIndexer(configs.vocab), 44 | LabelPadding(max_word_length=configs.max_label_length, padding_value=len(configs.vocab)), 45 | ], 46 | batch_postprocessors=[ 47 | AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True) 48 | ], 49 | augmentors=[ 50 | RandomAudioNoise(), 51 | 
RandomAudioPitchShift(), 52 | RandomAudioTimeStretch() 53 | ], 54 | use_cache=True, 55 | ) 56 | 57 | test_dataProvider = DataProvider( 58 | dataset=validation_dataset, 59 | skip_validation=True, 60 | batch_size=configs.batch_size, 61 | data_preprocessors=[ 62 | AudioReader(sample_rate=16000), 63 | ], 64 | transformers=[ 65 | LabelIndexer(configs.vocab), 66 | LabelPadding(max_word_length=configs.max_label_length, padding_value=len(configs.vocab)), 67 | ], 68 | batch_postprocessors=[ 69 | AudioPadding(max_audio_length=configs.max_audio_length, padding_value=0, use_on_batch=True) 70 | ], 71 | use_cache=True, 72 | ) 73 | 74 | class CustomWav2Vec2Model(layers.Layer): 75 | def __init__(self, output_dim, **kwargs): 76 | super().__init__(**kwargs) 77 | 78 | pretrained_name = "facebook/wav2vec2-base-960h" 79 | self.model = TFWav2Vec2ForCTC.from_pretrained(pretrained_name, vocab_size=output_dim, ignore_mismatched_sizes=True) 80 | self.model.freeze_feature_encoder() # https://huggingface.co/blog/fine-tune-wav2vec2-english 81 | 82 | def __call__(self, inputs): 83 | outputs = self.model(inputs) 84 | 85 | final_state = tf.nn.softmax(outputs.logits, axis=-1) 86 | 87 | return final_state 88 | 89 | custom_model = tf.keras.Sequential([ 90 | layers.Input(shape=(None,), name="input", dtype=tf.float32), 91 | CustomWav2Vec2Model(len(configs.vocab)+1) 92 | ]) 93 | 94 | for data in train_dataProvider: 95 | results = custom_model(data[0]) 96 | break 97 | 98 | custom_model.summary() 99 | # configs.save() 100 | 101 | 102 | # Compile the model and print summary 103 | custom_model.compile( 104 | optimizer=tf.keras.optimizers.AdamW(learning_rate=configs.init_lr, weight_decay=configs.weight_decay), 105 | loss=CTCloss(), 106 | metrics=[ 107 | CERMetric(vocabulary=configs.vocab), 108 | WERMetric(vocabulary=configs.vocab) 109 | ], 110 | ) 111 | 112 | # Define callbacks 113 | warmupCosineDecay = WarmupCosineDecay( 114 | lr_after_warmup=configs.lr_after_warmup, 115 | final_lr=configs.final_lr, 116 | warmup_epochs=configs.warmup_epochs, 117 | decay_epochs=configs.decay_epochs, 118 | initial_lr=configs.init_lr, 119 | ) 120 | earlystopper = EarlyStopping( 121 | monitor="val_CER", patience=16, verbose=1, mode="min" 122 | ) 123 | checkpoint = ModelCheckpoint( 124 | f"{configs.model_path}/model.h5", 125 | monitor="val_CER", 126 | verbose=1, 127 | save_best_only=True, 128 | mode="min", 129 | save_weights_only=False, 130 | ) 131 | tb_callback = TensorBoard(f"{configs.model_path}/logs") 132 | model2onnx = Model2onnx(f"{configs.model_path}/model.h5", metadata={"vocab": configs.vocab}) 133 | 134 | # Train the model 135 | custom_model.fit( 136 | train_dataProvider, 137 | validation_data=test_dataProvider, 138 | epochs=configs.train_epochs, 139 | callbacks=[warmupCosineDecay, earlystopper, checkpoint, tb_callback, model2onnx], 140 | max_queue_size=configs.train_workers, 141 | workers=configs.train_workers, 142 | use_multiprocessing=True, 143 | ) -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/convert2onnx.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ultralytics.engine.model import Model as BaseModel 3 | 4 | base_model = BaseModel("yolov8m.pt") 5 | 6 | classes = base_model.names 7 | input_width, input_height = 640, 640 8 | input_shape = (1, 3, input_width, input_height) 9 | model = base_model.model 10 | 11 | # place model on cpu 12 | model.to("cpu") 13 | 14 | # set the model to inference mode 15 | model.eval() 16 | 17 | 
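# Note: the export below runs with the model in eval() mode so no training-only behaviour
# (dropout, batch-norm statistics updates) is traced into the graph, and it runs on CPU so
# no CUDA device is required for the conversion. The dynamic_axes mapping keeps the batch
# size and the input height/width symbolic, so the resulting ONNX file accepts different
# image sizes at inference time.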
# convert the model to ONNX format 18 | dummy_input = torch.randn(input_shape).to("cpu") 19 | 20 | # Export the model 21 | torch.onnx.export( 22 | model, 23 | dummy_input, 24 | "yolov8m.onnx", 25 | export_params=True, 26 | input_names = ["input"], 27 | output_names = ["output"], 28 | dynamic_axes = { 29 | "input": {0: "batch_size", 2: "height", 3: "width"}, 30 | "output": {0: "batch_size", 2: "anchors"} 31 | } 32 | ) 33 | 34 | # Add the class names to the model as metadata 35 | import onnx 36 | 37 | metadata = {"classes": classes} 38 | 39 | # Load the ONNX model 40 | onnx_model = onnx.load("yolov8m.onnx") 41 | 42 | # Add the metadata dictionary to the onnx model's metadata_props attribute 43 | for key, value in metadata.items(): 44 | meta = onnx_model.metadata_props.add() 45 | meta.key = key 46 | meta.value = str(value) 47 | 48 | # Save the modified ONNX model 49 | onnx.save(onnx_model, "yolov8m.onnx") -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/requirements.txt: -------------------------------------------------------------------------------- 1 | mltu==1.2.5 2 | ultralytics==8.1.28 3 | torch==2.0.0 4 | torchvision==0.15.1 5 | onnxruntime==1.15.1 6 | onnx==1.12.0 -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/run_pretrained.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from ultralytics.engine.model import Model as BaseModel 3 | from mltu.annotations.detections import Detections 4 | from mltu.torch.yolo.detectors.torch_detector import Detector as TorchDetector 5 | from mltu.torch.yolo.detectors.onnx_detector import Detector as OnnxDetector 6 | 7 | input_width, input_height = 640, 640 8 | confidence_threshold = 0.5 9 | iou_threshold = 0.5 10 | 11 | # base_model = BaseModel("yolov8m.pt") 12 | # detector = TorchDetector(base_model.model, input_width, input_height, base_model.names, confidence_threshold, iou_threshold) 13 | detector = OnnxDetector("yolov8m.onnx", input_width, input_height, confidence_threshold, iou_threshold) 14 | 15 | cap = cv2.VideoCapture(0) 16 | while True: 17 | ret, frame = cap.read() 18 | if not ret: 19 | break 20 | 21 | # Perform Yolo object detection 22 | detections: Detections = detector(frame) 23 | 24 | # Apply the detections to the frame 25 | frame = detections.applyToFrame(frame) 26 | 27 | # Print the FPS 28 | print(detector.fps) 29 | 30 | # Display the output image 31 | cv2.imshow("Object Detection", frame) 32 | if cv2.waitKey(1) & 0xFF == ord('q'): 33 | break 34 | 35 | cap.release() 36 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/test_yolov8.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | from mltu.annotations.detections import Detections 4 | from mltu.torch.yolo.detectors.onnx_detector import Detector as OnnxDetector 5 | 6 | # https://www.kaggle.com/datasets/andrewmvd/car-plate-detection 7 | images_path = "Datasets/car-plate-detection/images" 8 | 9 | input_width, input_height = 416, 416 10 | confidence_threshold = 0.5 11 | iou_threshold = 0.5 12 | 13 | detector = OnnxDetector("Models/11_Yolov8/1714135287/model.onnx", input_width, input_height, confidence_threshold, iou_threshold, force_cpu=False) 14 | 15 | for image_path in os.listdir(images_path): 16 | 17 | frame = cv2.imread(os.path.join(images_path, image_path)) 18 | 19 | # Perform Yolo object 
detection 20 | detections: Detections = detector(frame) 21 | 22 | # Apply the detections to the frame 23 | frame = detections.applyToFrame(frame) 24 | 25 | # Print the FPS 26 | print(detector.fps) 27 | 28 | # Display the output image 29 | cv2.imshow("Object Detection", frame) 30 | if cv2.waitKey(0) & 0xFF == ord('q'): 31 | break 32 | 33 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /Tutorials/11_Yolov8/train_yolov8.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | from mltu.preprocessors import ImageReader 5 | from mltu.annotations.images import CVImage 6 | from mltu.transformers import ImageResizer, ImageShowCV2, ImageNormalizer 7 | from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen, \ 8 | RandomMirror, RandomFlip, RandomGaussianBlur, RandomSaltAndPepper, RandomDropBlock, RandomMosaic, RandomElasticTransform 9 | from mltu.torch.model import Model 10 | from mltu.torch.dataProvider import DataProvider 11 | from mltu.torch.yolo.annotation import VOCAnnotationReader 12 | from mltu.torch.yolo.preprocessors import YoloPreprocessor 13 | from mltu.torch.yolo.loss import v8DetectionLoss 14 | from mltu.torch.yolo.metrics import YoloMetrics 15 | from mltu.torch.yolo.optimizer import build_optimizer, AccumulativeOptimizer 16 | from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, Model2onnx, WarmupCosineDecay 17 | 18 | from ultralytics.nn.tasks import DetectionModel 19 | from ultralytics.engine.model import Model as BaseModel 20 | 21 | # https://www.kaggle.com/datasets/andrewmvd/car-plate-detection 22 | annotations_path = "Datasets/car-plate-detection/annotations" 23 | 24 | # Create a dataset from the annotations, the dataset is a list of lists where each list contains the [image path, annotation path] 25 | dataset = [[None, os.path.join(annotations_path, f)] for f in os.listdir(annotations_path)] 26 | 27 | # Make sure torch can see GPU device, it is not recommended to train with CPU 28 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | 30 | img_size = 416 31 | labels = {0: "licence"} 32 | 33 | # Create a data provider for the dataset 34 | data_provider = DataProvider( 35 | dataset=dataset, 36 | skip_validation=True, 37 | batch_size=16, 38 | data_preprocessors=[ 39 | VOCAnnotationReader(labels=labels), 40 | ImageReader(CVImage), 41 | ], 42 | transformers=[ 43 | # ImageShowCV2(), 44 | ImageResizer(img_size, img_size), 45 | ImageNormalizer(transpose_axis=True), 46 | ], 47 | batch_postprocessors=[ 48 | YoloPreprocessor(device, img_size) 49 | ], 50 | numpy=False, 51 | ) 52 | 53 | # split the dataset into train and test 54 | train_data_provider, val_data_provider = data_provider.split(0.9, shuffle=False) 55 | 56 | # Attaach augmentation to the train data provider 57 | train_data_provider.augmentors = [ 58 | RandomBrightness(), 59 | RandomErodeDilate(), 60 | RandomSharpen(), 61 | RandomMirror(), 62 | RandomFlip(), 63 | RandomElasticTransform(), 64 | RandomGaussianBlur(), 65 | RandomSaltAndPepper(), 66 | RandomRotate(angle=10), 67 | RandomDropBlock(), 68 | RandomMosaic(), 69 | ] 70 | 71 | base_model = BaseModel("yolov8n.pt") 72 | # Create a YOLO model 73 | model = DetectionModel('yolov8n.yaml', nc=len(labels)) 74 | 75 | # Load the weight from base model 76 | try: model.load_state_dict(base_model.model.state_dict(), strict=False) 77 | except: pass 78 | 79 | model.to(device) 
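# The parameter loop below freezes the detection head's DFL (distribution focal loss)
# projection weights, which YOLOv8 keeps fixed and never trains, while making sure every
# other parameter stays trainable. The AccumulativeOptimizer wrapper used afterwards
# accumulates gradients over several steps before applying an update; its 16 and 64
# arguments suggest per-step batches of 16 combined into an effective batch of roughly 64.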
80 | 81 | for k, v in model.named_parameters(): 82 | if any(x in k for x in [".dfl"]): 83 | print("freezing", k) 84 | v.requires_grad = False 85 | elif not v.requires_grad: 86 | v.requires_grad = True 87 | 88 | lr = 1e-3 89 | optimizer = build_optimizer(model.model, name="AdamW", lr=lr, weight_decay=0.0, momentum=0.937, decay=0.0005) 90 | optimizer = AccumulativeOptimizer(optimizer, 16, 64) 91 | 92 | # create model object that will handle training and testing of the network 93 | model = Model( 94 | model, 95 | optimizer, 96 | v8DetectionLoss(model), 97 | metrics=[YoloMetrics(nc=len(labels))], 98 | log_errors=False, 99 | output_path=f"Models/11_Yolov8/{int(time.time())}", 100 | clip_grad_norm=10.0, 101 | ema=True, 102 | ) 103 | 104 | modelCheckpoint = ModelCheckpoint(monitor="val_fitness", mode="max", save_best_only=True, verbose=True) 105 | tensorBoard = TensorBoard() 106 | earlyStopping = EarlyStopping(monitor="val_fitness", mode="max", patience=31, verbose=True) 107 | model2onnx = Model2onnx(input_shape=(1, 3, img_size, img_size), verbose=True, opset_version=14, 108 | dynamic_axes = {"input": {0: "batch_size", 2: "height", 3: "width"}, 109 | "output": {0: "batch_size", 2: "anchors"}}, 110 | metadata={"classes": labels}) 111 | warmupCosineDecayBias = WarmupCosineDecay(lr_after_warmup=lr, final_lr=lr, initial_lr=0.1, 112 | warmup_steps=len(train_data_provider), warmup_epochs=10, ignore_param_groups=[1, 2]) # lr0 113 | warmupCosineDecay = WarmupCosineDecay(lr_after_warmup=lr, final_lr=lr/10, initial_lr=1e-7, 114 | warmup_steps=len(train_data_provider), warmup_epochs=10, decay_epochs=190, ignore_param_groups=[0]) # lr1 and lr2 115 | 116 | # Train the model 117 | history = model.fit( 118 | train_data_provider, 119 | test_dataProvider=val_data_provider, 120 | epochs=200, 121 | callbacks=[ 122 | modelCheckpoint, 123 | tensorBoard, 124 | earlyStopping, 125 | model2onnx, 126 | warmupCosineDecayBias, 127 | warmupCosineDecay 128 | ] 129 | ) -------------------------------------------------------------------------------- /Tutorials/README.md: -------------------------------------------------------------------------------- 1 | # Tutorials and Examples made with MLTU library: 2 | 1. [Text Recognition With TensorFlow and CTC network](https://pylessons.com/ctc-text-recognition), code in ```Tutorials\01_image_to_word``` folder; 3 | 2. [TensorFlow OCR model for reading Captchas](https://pylessons.com/tensorflow-ocr-captcha), code in ```Tutorials\02_captcha_to_text``` folder; 4 | 3. [Handwriting words recognition with TensorFlow](https://pylessons.com/handwriting-recognition), code in ```Tutorials\03_handwriting_recognition``` folder; 5 | 4. [Handwritten sentence recognition with TensorFlow](https://pylessons.com/handwritten-sentence-recognition), code in ```Tutorials\04_sentence_recognition``` folder; 6 | 5. [Introduction to speech recognition with TensorFlow](https://pylessons.com/speech-recognition), code in ```Tutorials\05_speech_recognition``` folder; 7 | 6. [Introduction to PyTorch in a practical way](https://pylessons.com/pytorch-introduction), code in ```Tutorials\06_pytorch_introduction``` folder; 8 | 7. [Using custom wrapper to simplify PyTorch models training pipeline](https://pylessons.com/pytorch-introduction), code in ```Tutorials\07_pytorch_wrapper``` folder; 9 | 8. 
[Handwriting words recognition with PyTorch](https://pylessons.com/handwriting-recognition-pytorch), code in ```Tutorials\08_handwriting_recognition_torch``` folder; -------------------------------------------------------------------------------- /bin/read_parquet.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | file_path = "/home/rokbal/Downloads/train-00000-of-00001-bfc7b63751c36ab0 (1).parquet" 4 | 5 | df = pd.read_parquet(file_path) 6 | 7 | print(df.head()) -------------------------------------------------------------------------------- /bin/setup.sh: -------------------------------------------------------------------------------- 1 | python3 -m venv venv 2 | activate() { 3 | . venv/bin/activate 4 | echo "installing requirements to virtual environment" 5 | pip install -r requirements.txt 6 | } 7 | activate -------------------------------------------------------------------------------- /mltu/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.2.5" 2 | 3 | from .annotations.images import Image 4 | from .annotations.images import CVImage 5 | from .annotations.images import PillowImage -------------------------------------------------------------------------------- /mltu/annotations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/annotations/__init__.py -------------------------------------------------------------------------------- /mltu/annotations/audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | class Audio: 5 | """ Audio object 6 | 7 | Attributes: 8 | audio (np.ndarray): Audio array 9 | sample_rate (int): Sample rate 10 | init_successful (bool): True if audio was successfully read 11 | library (object): Library used to read audio, tested only with librosa 12 | """ 13 | init_successful = False 14 | augmented=False 15 | 16 | def __init__( 17 | self, 18 | audioPath: str, 19 | sample_rate: int=22050, 20 | library=None 21 | ) -> None: 22 | if library is None: 23 | raise ValueError("library must be provided. (e.g. 
librosa object)") 24 | 25 | if isinstance(audioPath, str): 26 | if not os.path.exists(audioPath): 27 | raise FileNotFoundError(f"Image {audioPath} not found.") 28 | 29 | self._audio, self.sample_rate = library.load(audioPath, sr=sample_rate) 30 | self.path = audioPath 31 | self.init_successful = True 32 | 33 | else: 34 | raise TypeError(f"audioPath must be path to audio file, not {type(audioPath)}") 35 | 36 | @property 37 | def audio(self) -> np.ndarray: 38 | return self._audio 39 | 40 | @audio.setter 41 | def audio(self, value: np.ndarray): 42 | self.augmented = True 43 | self._audio = value 44 | 45 | @property 46 | def shape(self) -> tuple: 47 | return self._audio.shape 48 | 49 | def numpy(self) -> np.ndarray: 50 | return self._audio 51 | 52 | def __add__(self, other: np.ndarray) -> np.ndarray: 53 | self._audio = self._audio + other 54 | self.augmented = True 55 | return self 56 | 57 | def __len__(self) -> int: 58 | return len(self._audio) 59 | 60 | def __call__(self) -> np.ndarray: 61 | return self._audio 62 | 63 | def __repr__(self): 64 | return repr(self._audio) 65 | 66 | def __array__(self): 67 | return self._audio -------------------------------------------------------------------------------- /mltu/configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | 5 | class BaseModelConfigs: 6 | def __init__(self): 7 | self.model_path = None 8 | 9 | def serialize(self): 10 | class_attributes = {key: value 11 | for (key, value) 12 | in type(self).__dict__.items() 13 | if key not in ['__module__', '__init__', '__doc__', '__annotations__']} 14 | instance_attributes = self.__dict__ 15 | 16 | # first init with class attributes then apply instance attributes overwriting any existing duplicate attributes 17 | all_attributes = class_attributes.copy() 18 | all_attributes.update(instance_attributes) 19 | 20 | return all_attributes 21 | 22 | def save(self, name: str = "configs.yaml"): 23 | if self.model_path is None: 24 | raise Exception("Model path is not specified") 25 | 26 | # create directory if not exist 27 | if not os.path.exists(self.model_path): 28 | os.makedirs(self.model_path) 29 | 30 | with open(os.path.join(self.model_path, name), "w") as f: 31 | yaml.dump(self.serialize(), f) 32 | 33 | @staticmethod 34 | def load(configs_path: str): 35 | with open(configs_path, "r") as f: 36 | configs = yaml.load(f, Loader=yaml.FullLoader) 37 | 38 | config = BaseModelConfigs() 39 | for key, value in configs.items(): 40 | setattr(config, key, value) 41 | 42 | return config 43 | -------------------------------------------------------------------------------- /mltu/inferenceModel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import typing 4 | import numpy as np 5 | import onnxruntime as ort 6 | from collections import deque 7 | 8 | class FpsWrapper: 9 | """ Decorator to calculate the frames per second of a function 10 | """ 11 | def __init__(self, func: typing.Callable): 12 | self.func = func 13 | self.fps_list = deque([], maxlen=100) 14 | 15 | def __call__(self, *args, **kwargs): 16 | start = time.time() 17 | results = self.func(self.instance, *args, **kwargs) 18 | self.fps_list.append(1 / (time.time() - start)) 19 | self.instance.fps = np.mean(self.fps_list) 20 | return results 21 | 22 | def __get__(self, instance, owner): 23 | self.instance = instance 24 | return self.__call__.__get__(instance, owner) 25 | 26 | 27 | class OnnxInferenceModel: 28 | """ Base class 
for all inference models that use onnxruntime 29 | 30 | Attributes: 31 | model_path (str, optional): Path to the model folder. Defaults to "". 32 | force_cpu (bool, optional): Force the model to run on CPU or GPU. Defaults to GPU. 33 | default_model_name (str, optional): Default model name. Defaults to "model.onnx". 34 | """ 35 | def __init__( 36 | self, 37 | model_path: str = "", 38 | force_cpu: bool = False, 39 | default_model_name: str = "model.onnx", 40 | *args, **kwargs 41 | ): 42 | self.model_path = model_path.replace("\\", "/") 43 | self.force_cpu = force_cpu 44 | self.default_model_name = default_model_name 45 | 46 | # check if model path is a directory with os path 47 | if os.path.isdir(self.model_path): 48 | self.model_path = os.path.join(self.model_path, self.default_model_name) 49 | 50 | if not os.path.exists(self.model_path): 51 | raise Exception(f"Model path ({self.model_path}) does not exist") 52 | 53 | providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if ort.get_device() == "GPU" and not force_cpu else ["CPUExecutionProvider"] 54 | 55 | self.model = ort.InferenceSession(self.model_path, providers=providers) 56 | 57 | self.metadata = {} 58 | if self.model.get_modelmeta().custom_metadata_map: 59 | # add metadata to self object 60 | for key, value in self.model.get_modelmeta().custom_metadata_map.items(): 61 | try: 62 | new_value = eval(value) # in case the value is a list or dict 63 | except: 64 | new_value = value 65 | self.metadata[key] = new_value 66 | 67 | # Update providers priority to only CPUExecutionProvider 68 | if self.force_cpu: 69 | self.model.set_providers(["CPUExecutionProvider"]) 70 | 71 | self.input_shapes = [meta.shape for meta in self.model.get_inputs()] 72 | self.input_names = [meta.name for meta in self.model._inputs_meta] 73 | self.output_names = [meta.name for meta in self.model._outputs_meta] 74 | 75 | def predict(self, data: np.ndarray, *args, **kwargs): 76 | raise NotImplementedError 77 | 78 | @FpsWrapper 79 | def __call__(self, data: np.ndarray): 80 | results = self.predict(data) 81 | return results -------------------------------------------------------------------------------- /mltu/tensorflow/README.md: -------------------------------------------------------------------------------- 1 | # Functions and objects specific for TensorFlow 2.* and Python 3 -------------------------------------------------------------------------------- /mltu/tensorflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/tensorflow/__init__.py -------------------------------------------------------------------------------- /mltu/tensorflow/callbacks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | from keras.callbacks import Callback 4 | 5 | import logging 6 | 7 | class Model2onnx(Callback): 8 | """ Converts the model to onnx format after training is finished. """ 9 | def __init__( 10 | self, 11 | saved_model_path: str, 12 | metadata: dict=None, 13 | save_on_epoch_end: bool=False, 14 | ) -> None: 15 | """ Converts the model to onnx format after training is finished. 16 | Args: 17 | saved_model_path (str): Path to the saved .h5 model. 18 | metadata (dict, optional): Dictionary containing metadata to be added to the onnx model. Defaults to None. 19 | save_on_epoch_end (bool, optional): Save the onnx model on every epoch end. 
Defaults to False. 20 | """ 21 | super().__init__() 22 | self.saved_model_path = saved_model_path 23 | self.metadata = metadata 24 | self.save_on_epoch_end = save_on_epoch_end 25 | 26 | try: 27 | import tf2onnx 28 | except: 29 | raise ImportError("tf2onnx is not installed. Please install it using 'pip install tf2onnx'") 30 | 31 | try: 32 | import onnx 33 | except: 34 | raise ImportError("onnx is not installed. Please install it using 'pip install onnx'") 35 | 36 | @staticmethod 37 | def model2onnx(model: tf.keras.Model, onnx_model_path: str): 38 | try: 39 | import tf2onnx 40 | 41 | # convert the model to onnx format 42 | tf2onnx.convert.from_keras(model, output_path=onnx_model_path) 43 | 44 | except Exception as e: 45 | print(e) 46 | 47 | @staticmethod 48 | def include_metadata(onnx_model_path: str, metadata: dict=None): 49 | try: 50 | if metadata and isinstance(metadata, dict): 51 | 52 | import onnx 53 | # Load the ONNX model 54 | onnx_model = onnx.load(onnx_model_path) 55 | 56 | # Add the metadata dictionary to the model's metadata_props attribute 57 | for key, value in metadata.items(): 58 | meta = onnx_model.metadata_props.add() 59 | meta.key = key 60 | meta.value = str(value) 61 | 62 | # Save the modified ONNX model 63 | onnx.save(onnx_model, onnx_model_path) 64 | 65 | except Exception as e: 66 | print(e) 67 | 68 | def on_epoch_end(self, epoch: int, logs: dict=None): 69 | """ Converts the model to onnx format on every epoch end. """ 70 | if self.save_on_epoch_end: 71 | self.on_train_end(logs=logs) 72 | 73 | def on_train_end(self, logs=None): 74 | """ Converts the model to onnx format after training is finished. """ 75 | self.model.load_weights(self.saved_model_path) 76 | onnx_model_path = self.saved_model_path.replace(".h5", ".onnx") 77 | self.model2onnx(self.model, onnx_model_path) 78 | self.include_metadata(onnx_model_path, self.metadata) 79 | 80 | 81 | class TrainLogger(Callback): 82 | """Logs training metrics to a file. 83 | 84 | Args: 85 | log_path (str): Path to the directory where the log file will be saved. 86 | log_file (str, optional): Name of the log file. Defaults to 'logs.log'. 87 | logLevel (int, optional): Logging level. Defaults to logging.INFO. 
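        console_output (bool, optional): Whether to keep logging to the console as well. When False, previously attached handlers are removed so records go only to the log file. Defaults to False.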
88 | """ 89 | def __init__(self, log_path: str, log_file: str="logs.log", logLevel=logging.INFO, console_output=False) -> None: 90 | super().__init__() 91 | self.log_path = log_path 92 | self.log_file = log_file 93 | 94 | if not os.path.exists(log_path): 95 | os.mkdir(log_path) 96 | 97 | self.logger = logging.getLogger() 98 | self.logger.setLevel(logLevel) 99 | 100 | self.formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 101 | 102 | self.file_handler = logging.FileHandler(os.path.join(self.log_path, self.log_file)) 103 | self.file_handler.setLevel(logLevel) 104 | self.file_handler.setFormatter(self.formatter) 105 | 106 | if not console_output: 107 | self.logger.handlers[:] = [] 108 | 109 | self.logger.addHandler(self.file_handler) 110 | 111 | def on_epoch_end(self, epoch: int, logs: dict=None): 112 | epoch_message = f"Epoch {epoch}; " 113 | logs_message = "; ".join([f"{key}: {value}" for key, value in logs.items()]) 114 | self.logger.info(epoch_message + logs_message) 115 | 116 | 117 | class WarmupCosineDecay(Callback): 118 | """ Cosine decay learning rate scheduler with warmup 119 | 120 | Args: 121 | lr_after_warmup (float): Learning rate after warmup 122 | final_lr (float): Final learning rate 123 | warmup_epochs (int): Number of warmup epochs 124 | decay_epochs (int): Number of decay epochs 125 | initial_lr (float, optional): Initial learning rate. Defaults to 0.0. 126 | verbose (bool, optional): Whether to print learning rate. Defaults to False. 127 | """ 128 | def __init__( 129 | self, 130 | lr_after_warmup: float, 131 | final_lr: float, 132 | warmup_epochs: int, 133 | decay_epochs: int, 134 | initial_lr: float=0.0, 135 | verbose=False 136 | ) -> None: 137 | super(WarmupCosineDecay, self).__init__() 138 | self.lr_after_warmup = lr_after_warmup 139 | self.final_lr = final_lr 140 | self.warmup_epochs = warmup_epochs 141 | self.decay_epochs = decay_epochs 142 | self.initial_lr = initial_lr 143 | self.verbose = verbose 144 | 145 | def on_epoch_begin(self, epoch: int, logs: dict=None): 146 | """ Adjust learning rate at the beginning of each epoch """ 147 | 148 | if epoch >= self.warmup_epochs + self.decay_epochs: 149 | return logs 150 | 151 | if epoch < self.warmup_epochs: 152 | lr = self.initial_lr + (self.lr_after_warmup - self.initial_lr) * (epoch + 1) / self.warmup_epochs 153 | else: 154 | progress = (epoch - self.warmup_epochs) / self.decay_epochs 155 | lr = self.final_lr + 0.5 * (self.lr_after_warmup - self.final_lr) * (1 + tf.cos(tf.constant(progress) * 3.14159)) 156 | 157 | tf.keras.backend.set_value(self.model.optimizer.lr, lr) 158 | 159 | if self.verbose: 160 | print(f"Epoch {epoch + 1} - Learning Rate: {lr}") 161 | 162 | def on_epoch_end(self, epoch: int, logs: dict=None): 163 | logs = logs or {} 164 | 165 | # Log the learning rate value 166 | logs["lr"] = self.model.optimizer.lr 167 | 168 | return logs -------------------------------------------------------------------------------- /mltu/tensorflow/dataProvider.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from ..dataProvider import DataProvider as dataProvider 4 | 5 | class DataProvider(dataProvider, tf.keras.utils.Sequence): 6 | def __init__(self, *args, **kwargs): 7 | super().__init__(*args, **kwargs) 8 | -------------------------------------------------------------------------------- /mltu/tensorflow/layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as 
tf 2 | from keras import layers 3 | from keras import backend as K 4 | 5 | class SelfAttention(layers.Layer): 6 | """ A self-attention layer for convolutional neural networks. 7 | 8 | This layer takes as input a tensor of shape (batch_size, height, width, channels) 9 | and applies self-attention to the channels dimension. 10 | 11 | Args: 12 | num_heads (int): The number of attention heads to use. Defaults to 8. 13 | wrapper (tf.keras.layers.Wrapper): A wrapper layer to apply to the convolutional layers. 14 | 15 | Raises: 16 | TypeError: If `wrapper` is provided and is not a subclass of `tf.keras.layers.Wrapper`. 17 | """ 18 | def __init__(self, num_heads: int = 8, wrapper: tf.keras.layers.Wrapper = None): 19 | super(SelfAttention, self).__init__() 20 | self.num_heads = num_heads 21 | self.wrapper = wrapper 22 | 23 | if wrapper and not issubclass(wrapper, tf.keras.layers.Wrapper): 24 | raise TypeError("wrapper must be a class derived from tf.keras.layers.Wrapper") 25 | 26 | def get_config(self) -> dict: 27 | config = super().get_config() 28 | config.update({ 29 | "num_heads": self.num_heads, 30 | }) 31 | return config 32 | 33 | def build(self, input_shape): 34 | _, h, w, c = input_shape 35 | self.query_conv = self._conv(filters=c // self.num_heads) 36 | self.key_conv = self._conv(filters=c // self.num_heads) 37 | self.value_conv = self._conv(filters=c) 38 | self.gamma = self.add_weight("gamma", shape=[1], initializer=tf.zeros_initializer(), trainable=True) 39 | 40 | def _conv(self, filters: int) -> tf.keras.layers.Layer: 41 | """ Helper function to create a convolutional layer with the given number of filters. 42 | 43 | Args: 44 | filters (int): The number of filters to use. 45 | 46 | Returns: 47 | tf.keras.layers.Layer: The created convolutional layer. 48 | """ 49 | conv = layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding="same") 50 | if self.wrapper: 51 | conv = self.wrapper(conv) 52 | 53 | return conv 54 | 55 | def call(self, inputs: tf.Tensor) -> tf.Tensor: 56 | """ Apply the self-attention mechanism to the input tensor. 57 | 58 | Args: 59 | inputs (tf.Tensor): The input tensor of shape (batch_size, height, width, channels). 60 | 61 | Returns: 62 | tf.Tensor: The output tensor after the self-attention mechanism is applied. 63 | """ 64 | _, h, w, c = inputs.shape 65 | q = self.query_conv(inputs) 66 | k = self.key_conv(inputs) 67 | v = self.value_conv(inputs) 68 | 69 | q_reshaped = tf.reshape(q, [-1, h * w, c // self.num_heads]) 70 | k_reshaped = tf.reshape(k, [-1, h * w, c // self.num_heads]) 71 | v_reshaped = tf.reshape(v, [-1, h * w, c]) 72 | 73 | # Compute the attention scores by taking the dot product of the query and key tensors. 74 | attention_scores = tf.matmul(q_reshaped, k_reshaped, transpose_b=True) 75 | 76 | # Scale the attention scores by the square root of the number of channels. 77 | attention_scores = attention_scores / tf.sqrt(tf.cast(c // self.num_heads, dtype=tf.float32)) 78 | 79 | # Apply a softmax function to the attention scores to obtain the attention weights. 80 | attention_weights = tf.nn.softmax(attention_scores, axis=-1) 81 | 82 | # Apply the attention weights to the value tensor to obtain the attention output. 83 | attention_output = tf.matmul(attention_weights, v_reshaped) 84 | 85 | # Reshape the attended value tensor to the original input tensor shape. 86 | attention_output = tf.reshape(attention_output, [-1, h, w, c]) 87 | 88 | # Apply the gamma parameter to the attended value tensor and add it to the output tensor. 
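        # Because gamma is created with a zeros initializer, the layer starts out as an identity
        # mapping (output equals inputs) and the attention contribution is blended in gradually as
        # gamma is learned, the residual gating trick used in SAGAN-style self-attention.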
89 | attention_output = self.gamma * attention_output + inputs 90 | 91 | return attention_output 92 | 93 | 94 | class SpectralNormalization(tf.keras.layers.Wrapper): 95 | """Spectral Normalization Wrapper. !!! This is not working yet !!!""" 96 | def __init__(self, layer, power_iterations=1, eps=1e-12, **kwargs): 97 | super(SpectralNormalization, self).__init__(layer, **kwargs) 98 | 99 | if power_iterations <= 0: 100 | raise ValueError( 101 | "`power_iterations` should be greater than zero, got " 102 | "`power_iterations={}`".format(power_iterations) 103 | ) 104 | self.power_iterations = power_iterations 105 | self.eps = eps 106 | if not isinstance(layer, tf.keras.layers.Layer): 107 | raise ValueError( 108 | "Please initialize `TimeDistributed` layer with a " 109 | "`Layer` instance. You passed: {input}".format(input=layer)) 110 | 111 | def build(self, input_shape): 112 | if not self.layer.built: 113 | self.layer.build(input_shape) 114 | 115 | self.w = self.layer.kernel 116 | self.w_shape = self.w.shape.as_list() 117 | 118 | # self.v = self.add_weight(shape=(1, self.w_shape[0] * self.w_shape[1] * self.w_shape[2]), 119 | # initializer=tf.initializers.TruncatedNormal(stddev=0.02), 120 | # trainable=False, 121 | # name="sn_v", 122 | # dtype=tf.float32) 123 | 124 | self.u = self.add_weight(shape=(1, self.w_shape[-1]), 125 | initializer=tf.initializers.TruncatedNormal(stddev=0.02), 126 | trainable=False, 127 | name="sn_u", 128 | dtype=tf.float32) 129 | 130 | super(SpectralNormalization, self).build() 131 | 132 | def l2normalize(self, v, eps=1e-12): 133 | return v / (tf.reduce_sum(v ** 2) ** 0.5 + eps) 134 | 135 | def power_iteration(self, W, u, rounds=1): 136 | _u = u 137 | 138 | for _ in range(rounds): 139 | # v_ = tf.matmul(_u, tf.transpose(W)) 140 | # v_hat = self.l2normalize(v_) 141 | _v = self.l2normalize(K.dot(_u, K.transpose(W)), eps=self.eps) 142 | 143 | # u_ = tf.matmul(v_hat, W) 144 | # u_hat = self.l2normalize(u_) 145 | _u = self.l2normalize(K.dot(_v, W), eps=self.eps) 146 | 147 | return _u, _v 148 | 149 | def call(self, inputs, training=None): 150 | if training is None: 151 | training = tf.keras.backend.learning_phase() 152 | 153 | if training: 154 | self.update_weights() 155 | output = self.layer(inputs) 156 | self.restore_weights() # Restore weights because of this formula "W = W - alpha * W_SN`" 157 | return output 158 | 159 | return self.layer(inputs) 160 | 161 | def update_weights(self): 162 | w_reshaped = tf.reshape(self.w, [-1, self.w_shape[-1]]) 163 | 164 | # u_hat = self.u 165 | # v_hat = self.v # init v vector 166 | 167 | u_hat, v_hat = self.power_iteration(w_reshaped, self.u, self.power_iterations) 168 | # v_ = tf.matmul(u_hat, tf.transpose(w_reshaped)) 169 | # # v_hat = v_ / (tf.reduce_sum(v_**2)**0.5 + self.eps) 170 | # v_hat = self.l2normalize(v_, self.eps) 171 | 172 | # u_ = tf.matmul(v_hat, w_reshaped) 173 | # # u_hat = u_ / (tf.reduce_sum(u_**2)**0.5 + self.eps) 174 | # u_hat = self.l2normalize(u_, self.eps) 175 | 176 | # sigma = tf.matmul(tf.matmul(v_hat, w_reshaped), tf.transpose(u_hat)) 177 | sigma=K.dot(K.dot(v_hat, w_reshaped), K.transpose(u_hat)) 178 | self.u.assign(u_hat) 179 | # self.v.assign(v_hat) 180 | 181 | self.layer.kernel.assign(self.w / sigma) 182 | 183 | def restore_weights(self): 184 | self.layer.kernel.assign(self.w) -------------------------------------------------------------------------------- /mltu/tensorflow/losses.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class 
CTCloss(tf.keras.losses.Loss): 5 | """ CTCLoss objec for training the model""" 6 | def __init__(self, name: str = "CTCloss") -> None: 7 | super(CTCloss, self).__init__() 8 | self.name = name 9 | self.loss_fn = tf.keras.backend.ctc_batch_cost 10 | 11 | def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None) -> tf.Tensor: 12 | """ Compute the training batch CTC loss value""" 13 | batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64") 14 | input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") 15 | label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") 16 | 17 | input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64") 18 | label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64") 19 | 20 | loss = self.loss_fn(y_true, y_pred, input_length, label_length) 21 | 22 | return loss -------------------------------------------------------------------------------- /mltu/tensorflow/model_utils.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import tensorflow as tf 3 | from tensorflow import keras 4 | from keras import layers 5 | from keras.models import Model 6 | 7 | class CustomModel(Model): 8 | """ Custom TensorFlow model for debugging training process purposes 9 | """ 10 | def train_step(self, train_data): 11 | # Unpack the data. Its structure depends on your model and 12 | # on what you pass to `fit()`. 13 | inputs, targets = train_data 14 | with tf.GradientTape() as tape: 15 | results = self(inputs, training=True) 16 | loss = self.compiled_loss(targets, results, regularization_losses=self.losses) 17 | gradients = tape.gradient(loss, self.trainable_weights) 18 | 19 | # Applying the gradients on the model using the specified optimizer 20 | self.optimizer.apply_gradients(zip(gradients, self.trainable_weights)) 21 | 22 | # Update the metrics. 23 | # Metrics are configured in `compile()`. 24 | self.compiled_metrics.update_state(targets, results) 25 | 26 | return {m.name: m.result() for m in self.metrics} 27 | 28 | def test_step(self, test_data): 29 | inputs, targets = test_data 30 | # Get prediction from model 31 | results = self(inputs, training=False) 32 | 33 | # Update the loss 34 | self.compiled_loss(targets, results, regularization_losses=self.losses) 35 | 36 | # Update the metrics 37 | self.compiled_metrics.update_state(targets, results) 38 | 39 | # Return a dict mapping metric names to current value. 40 | # Note that it will include the loss (tracked in self.metrics). 
41 | return {m.name: m.result() for m in self.metrics} 42 | 43 | 44 | def activation_layer(layer, activation: str="relu", alpha: float=0.1) -> tf.Tensor: 45 | """ Activation layer wrapper for LeakyReLU and ReLU activation functions 46 | Args: 47 | layer: tf.Tensor 48 | activation: str, activation function name (default: 'relu') 49 | alpha: float (LeakyReLU activation function parameter) 50 | Returns: 51 | tf.Tensor 52 | """ 53 | if activation == "relu": 54 | layer = layers.ReLU()(layer) 55 | elif activation == "leaky_relu": 56 | layer = layers.LeakyReLU(alpha=alpha)(layer) 57 | 58 | return layer 59 | 60 | 61 | def residual_block( 62 | x: tf.Tensor, 63 | filter_num: int, 64 | strides: typing.Union[int, list] = 2, 65 | kernel_size: typing.Union[int, list] = 3, 66 | skip_conv: bool = True, 67 | padding: str = "same", 68 | kernel_initializer: str = "he_uniform", 69 | activation: str = "relu", 70 | dropout: float = 0.2): 71 | # Create skip connection tensor 72 | x_skip = x 73 | 74 | # Perform 1-st convolution 75 | x = layers.Conv2D(filter_num, kernel_size, padding = padding, strides = strides, kernel_initializer=kernel_initializer)(x) 76 | x = layers.BatchNormalization()(x) 77 | x = activation_layer(x, activation=activation) 78 | 79 | # Perform 2-nd convoluti 80 | x = layers.Conv2D(filter_num, kernel_size, padding = padding, kernel_initializer=kernel_initializer)(x) 81 | x = layers.BatchNormalization()(x) 82 | 83 | # Perform 3-rd convolution if skip_conv is True, matchin the number of filters and the shape of the skip connection tensor 84 | if skip_conv: 85 | x_skip = layers.Conv2D(filter_num, 1, padding = padding, strides = strides, kernel_initializer=kernel_initializer)(x_skip) 86 | 87 | # Add x and skip connection and apply activation function 88 | x = layers.Add()([x, x_skip]) 89 | x = activation_layer(x, activation=activation) 90 | 91 | # Apply dropout 92 | if dropout: 93 | x = layers.Dropout(dropout)(x) 94 | 95 | return x -------------------------------------------------------------------------------- /mltu/tensorflow/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==2.10.1 2 | tf2onnx 3 | onnx -------------------------------------------------------------------------------- /mltu/tensorflow/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/tensorflow/transformer/__init__.py -------------------------------------------------------------------------------- /mltu/tensorflow/transformer/attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | class BaseAttention(tf.keras.layers.Layer): 4 | """ 5 | Base class for all attention layers. It contains the common functionality of all attention layers. 6 | This layer contains a MultiHeadAttention layer, a LayerNormalization layer and an Add layer. 7 | It is used as a base class for the GlobalSelfAttention, CausalSelfAttention and CrossAttention layers. 8 | And it is not intended to be used directly. 9 | 10 | Methods: 11 | call: Performs the forward pass of the layer. 12 | 13 | Attributes: 14 | mha (tf.keras.layers.MultiHeadAttention): The MultiHeadAttention layer. 15 | layernorm (tf.keras.layers.LayerNormalization): The LayerNormalization layer. 16 | add (tf.keras.layers.Add): The Add layer. 
17 | """ 18 | def __init__(self, **kwargs: dict): 19 | """ Constructor of the BaseAttention layer. 20 | 21 | Args: 22 | **kwargs: Additional keyword arguments that are passed to the MultiHeadAttention layer, e. g. 23 | num_heads (number of heads), key_dim (dimensionality of the key space), etc. 24 | """ 25 | super().__init__() 26 | self.mha = tf.keras.layers.MultiHeadAttention(**kwargs) 27 | self.layernorm = tf.keras.layers.LayerNormalization() 28 | self.add = tf.keras.layers.Add() 29 | 30 | 31 | class CrossAttention(BaseAttention): 32 | """ 33 | A class that implements the cross-attention layer by inheriting from the BaseAttention class. 34 | This layer is used to process two different sequences and attends to the context sequence while processing the query sequence. 35 | 36 | Methods: 37 | call: Performs the forward pass of the layer. 38 | 39 | Attributes: 40 | mha (tf.keras.layers.MultiHeadAttention): The MultiHeadAttention layer. 41 | layernorm (tf.keras.layers.LayerNormalization): The LayerNormalization layer. 42 | add (tf.keras.layers.Add): The Add layer. 43 | """ 44 | def call(self, x: tf.Tensor, context: tf.Tensor) -> tf.Tensor: 45 | """ 46 | The call function that performs the cross-attention operation. 47 | 48 | Args: 49 | x (tf.Tensor): The query (expected Transformer results) sequence of shape (batch_size, seq_length, d_model). 50 | context (tf.Tensor): The context (inputs to the Transformer) sequence of shape (batch_size, seq_length, d_model). 51 | 52 | Returns: 53 | tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model). 54 | """ 55 | attn_output, attn_scores = self.mha(query=x, key=context, value=context, return_attention_scores=True) 56 | 57 | # Cache the attention scores for plotting later. 58 | self.last_attn_scores = attn_scores 59 | 60 | x = self.add([x, attn_output]) 61 | x = self.layernorm(x) 62 | 63 | return x 64 | 65 | 66 | class GlobalSelfAttention(BaseAttention): 67 | """ 68 | A class that implements the global self-attention layer by inheriting from the BaseAttention class. 69 | This layer is used to process a single sequence and attends to all the tokens in the sequence. 70 | 71 | Methods: 72 | call: Performs the forward pass of the layer. 73 | 74 | Attributes: 75 | mha (tf.keras.layers.MultiHeadAttention): The MultiHeadAttention layer. 76 | layernorm (tf.keras.layers.LayerNormalization): The LayerNormalization layer. 77 | add (tf.keras.layers.Add): The Add layer. 78 | """ 79 | def call(self, x: tf.Tensor) -> tf.Tensor: 80 | """ 81 | The call function that performs the global self-attention operation. 82 | 83 | Args: 84 | x (tf.Tensor): The input sequence of shape (batch_size, seq_length, d_model). 85 | 86 | Returns: 87 | tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model). 88 | """ 89 | attn_output = self.mha(query=x, value=x, key=x) 90 | x = self.add([x, attn_output]) 91 | x = self.layernorm(x) 92 | return x 93 | 94 | 95 | class CausalSelfAttention(BaseAttention): 96 | """ 97 | Call self attention on the input sequence, ensuring that each position in the 98 | output depends only on previous positions (i.e. a causal model). 99 | 100 | Methods: 101 | call: Performs the forward pass of the layer. 102 | 103 | Attributes: 104 | mha (tf.keras.layers.MultiHeadAttention): The MultiHeadAttention layer. 105 | layernorm (tf.keras.layers.LayerNormalization): The LayerNormalization layer. 106 | add (tf.keras.layers.Add): The Add layer. 
107 | """ 108 | def call(self, x: tf.Tensor) -> tf.Tensor: 109 | """ 110 | The call function that performs the causal self-attention operation. 111 | 112 | Args: 113 | x (tf.Tensor): The input sequence of shape (batch_size, seq_length, d_model). 114 | 115 | Returns: 116 | tf.Tensor: The output sequence of shape (batch_size, seq_length, d_model). 117 | """ 118 | attn_output = self.mha(query=x, value=x, key=x, use_causal_mask = True) 119 | x = self.add([x, attn_output]) 120 | x = self.layernorm(x) 121 | return x -------------------------------------------------------------------------------- /mltu/tensorflow/transformer/callbacks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from keras.callbacks import Callback 3 | from mltu.tensorflow.callbacks import Model2onnx 4 | 5 | 6 | class EncDecSplitCallback(Callback): 7 | """Callback to extract the encoder and decoder models from Transformer model and save them separately 8 | Also, this callback incorporates Model2onnx callback to convert the encoder and decoder models to ONNX format 9 | 10 | Args: 11 | model_path (str): Path to save the encoder and decoder models 12 | encoder_metadata (dict, optional): Metadata to save with the encoder model. Defaults to None. 13 | decoder_metadata (dict, optional): Metadata to save with the decoder model. Defaults to None. 14 | """ 15 | 16 | def __init__( 17 | self, 18 | model_path: str, 19 | encoder_metadata: dict = None, 20 | decoder_metadata: dict = None, 21 | model_name = "model.h5" 22 | ): 23 | """Callback to extract the encoder and decoder models from Transformer model and save them separately""" 24 | super(EncDecSplitCallback, self).__init__() 25 | self.model_path = model_path 26 | self.encoder_metadata = encoder_metadata 27 | self.decoder_metadata = decoder_metadata 28 | self.model_name = model_name 29 | 30 | def on_train_end(self, epoch: int, logs: dict = None): 31 | try: 32 | # load best model weights 33 | self.model.load_weights(self.model_path + "/" + self.model_name) 34 | 35 | # extract encoder and decoder models 36 | encoder_model = tf.keras.Model( 37 | inputs=self.model.inputs[0], outputs=self.model.get_layer("encoder").output 38 | ) 39 | decoder_model = tf.keras.Model( 40 | inputs=[self.model.inputs[1], self.model.get_layer("encoder").output], 41 | outputs=self.model.layers[-1].output, 42 | ) 43 | 44 | # save encoder and decoder models 45 | encoder_model.save(self.model_path + "/encoder.h5") 46 | decoder_model.save(self.model_path + "/decoder.h5") 47 | 48 | # convert encoder and decoder models to onnx 49 | Model2onnx.model2onnx(encoder_model, self.model_path + "/encoder.onnx") 50 | Model2onnx.model2onnx(decoder_model, self.model_path + "/decoder.onnx") 51 | 52 | # save encoder and decoder metadata 53 | if self.encoder_metadata: 54 | Model2onnx.include_metadata(self.model_path + "/encoder.onnx", self.encoder_metadata) 55 | if self.decoder_metadata: 56 | Model2onnx.include_metadata(self.model_path + "/decoder.onnx", self.decoder_metadata) 57 | except Exception as e: 58 | print(e) 59 | pass 60 | -------------------------------------------------------------------------------- /mltu/tensorflow/transformer/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class MaskedLoss(tf.keras.losses.Loss): 5 | """ Masked loss function for Transformer. 6 | 7 | Args: 8 | mask_value (int, optional): Mask value. Defaults to 0. 
9 | reduction (str, optional): Reduction method. Defaults to 'none'. 10 | """ 11 | def __init__(self, mask_value: int=0, reduction: str='none') -> None: 12 | super(MaskedLoss, self).__init__() 13 | self.mask_value = mask_value 14 | self.reduction = reduction 15 | self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=reduction) 16 | 17 | def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None) -> tf.Tensor: 18 | """ Calculate masked loss. 19 | 20 | Args: 21 | y_true (tf.Tensor): True labels. 22 | y_pred (tf.Tensor): Predicted labels. 23 | 24 | Returns: 25 | tf.Tensor: Masked loss. 26 | """ 27 | mask = y_true != self.mask_value 28 | loss = self.loss_object(y_true, y_pred) 29 | 30 | mask = tf.cast(mask, dtype=loss.dtype) 31 | loss *= mask 32 | 33 | loss = tf.reduce_sum(loss) / tf.reduce_sum(mask) 34 | return loss 35 | 36 | 37 | class MaskedAccuracy(tf.keras.metrics.Metric): 38 | """ Masked accuracy metric for Transformer. 39 | 40 | Args: 41 | mask_value (int, optional): Mask value. Defaults to 0. 42 | name (str, optional): Name of the metric. Defaults to 'masked_accuracy'. 43 | """ 44 | def __init__(self, mask_value: int=0, name: str='masked_accuracy') -> None: 45 | super(MaskedAccuracy, self).__init__(name=name) 46 | self.mask_value = mask_value 47 | self.total = self.add_weight(name='total', initializer='zeros') 48 | self.count = self.add_weight(name='count', initializer='zeros') 49 | 50 | @tf.function 51 | def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None): 52 | """ Update state of the metric. 53 | 54 | Args: 55 | y_true (tf.Tensor): True labels. 56 | y_pred (tf.Tensor): Predicted labels. 57 | """ 58 | pred = tf.argmax(y_pred, axis=2) 59 | label = tf.cast(y_true, pred.dtype) 60 | match = label == pred 61 | 62 | mask = label != self.mask_value 63 | 64 | match = match & mask 65 | 66 | match = tf.cast(match, dtype=tf.float32) 67 | mask = tf.cast(mask, dtype=tf.float32) 68 | match = tf.reduce_sum(match) 69 | mask = tf.reduce_sum(mask) 70 | 71 | self.total.assign_add(match) 72 | self.count.assign_add(mask) 73 | 74 | def result(self) -> tf.Tensor: 75 | """ Calculate masked accuracy. 76 | 77 | Returns: 78 | tf.Tensor: Masked accuracy. 79 | """ 80 | return self.total / self.count 81 | 82 | 83 | class CERMetric(tf.keras.metrics.Metric): 84 | """A custom TensorFlow metric to compute the Character Error Rate (CER). 85 | 86 | Args: 87 | vocabulary: A string of the vocabulary used to encode the labels. 88 | name: (Optional) string name of the metric instance. 89 | **kwargs: Additional keyword arguments. 90 | """ 91 | def __init__(self, end_token, padding_token: int=0, name="CER", **kwargs): 92 | # Initialize the base Metric class 93 | super(CERMetric, self).__init__(name=name, **kwargs) 94 | 95 | # Initialize variables to keep track of the cumulative character/word error rates and counter 96 | self.cer_accumulator = tf.Variable(0.0, name="cer_accumulator", dtype=tf.float32) 97 | self.batch_counter = tf.Variable(0, name="batch_counter", dtype=tf.int32) 98 | 99 | self.padding_token = padding_token 100 | self.end_token = end_token 101 | 102 | def get_cer(self, pred, y_true, padding=-1): 103 | """ Calculates the character error rate (CER) between the predicted labels and true labels for a batch of input data. 
104 | 105 | Args: 106 | pred(tf.Tensor): The predicted labels, with dtype=tf.int32, usually output from tf.keras.backend.ctc_decode 107 | y_true (tf.Tensor): The true labels, with dtype=tf.int32 108 | padding (int, optional): The padding token when converting to sparse tensor. Defaults to -1. 109 | 110 | Returns: 111 | tf.Tensor: The CER between the predicted labels and true labels 112 | """ 113 | # find index where end token is 114 | equal = tf.equal(pred, self.end_token) 115 | equal_int = tf.cast(equal, tf.int64) 116 | end_token_index = tf.argmax(equal_int, axis=1) 117 | 118 | # mask out everything after end token 119 | new_range = tf.range(tf.shape(pred)[1], dtype=tf.int64) 120 | range_matrix = tf.tile(new_range[None, :], [tf.shape(pred)[0], 1]) 121 | 122 | mask = range_matrix <= tf.expand_dims(end_token_index, axis=1) 123 | masked_pred = tf.where(mask, pred, padding) 124 | 125 | # Convert the valid predicted labels tensor to a sparse tensor 126 | sparse_pred = tf.RaggedTensor.from_tensor(masked_pred, padding=padding).to_sparse() 127 | 128 | # Convert the valid true labels tensor to a sparse tensor 129 | sparse_true = tf.RaggedTensor.from_tensor(y_true, padding=padding).to_sparse() 130 | 131 | # Calculate the normalized edit distance between the sparse predicted labels tensor and sparse true labels tensor 132 | distance = tf.edit_distance(sparse_pred, sparse_true, normalize=True) 133 | 134 | return distance 135 | 136 | # @tf.function 137 | def update_state(self, y_true, y_pred, sample_weight=None): 138 | """Updates the state variables of the metric. 139 | 140 | Args: 141 | y_true: A tensor of true labels with shape (batch_size, sequence_length). 142 | y_pred: A tensor of predicted labels with shape (batch_size, sequence_length, num_classes). 143 | sample_weight: (Optional) a tensor of weights with shape (batch_size, sequence_length). 144 | """ 145 | pred = tf.argmax(y_pred, axis=2) 146 | 147 | # Calculate the normalized edit distance between the predicted labels and true labels tensors 148 | distance = self.get_cer(pred, y_true, self.padding_token) 149 | 150 | # Add the sum of the distance tensor to the cer_accumulator variable 151 | self.cer_accumulator.assign_add(tf.reduce_sum(distance)) 152 | 153 | # Increment the batch_counter by the batch size 154 | self.batch_counter.assign_add(len(y_true)) 155 | 156 | def result(self): 157 | """ Computes and returns the metric result. 158 | 159 | Returns: 160 | A TensorFlow float representing the CER (character error rate). 
161 | """ 162 | return tf.math.divide_no_nan(self.cer_accumulator, tf.cast(self.batch_counter, tf.float32)) -------------------------------------------------------------------------------- /mltu/torch/README.md: -------------------------------------------------------------------------------- 1 | # Functions and objects specific for PyTorch and Python 3 -------------------------------------------------------------------------------- /mltu/torch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/torch/__init__.py -------------------------------------------------------------------------------- /mltu/torch/handlers.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from .metrics import Metric 4 | from .callbacks import Callback 5 | 6 | class MetricsHandler: 7 | """ Metrics handler class for training and testing loops""" 8 | def __init__(self, metrics: typing.List[Metric]): 9 | self.metrics = metrics 10 | 11 | # Validate metrics 12 | if not all(isinstance(m, Metric) for m in self.metrics): 13 | raise TypeError("all items in the metrics argument must be of type Metric (Check mltu.metrics.metrics.py for more information)") 14 | 15 | self.train_results_dict = {"loss": None} 16 | self.train_results_dict.update({metric.name: None for metric in self.metrics}) 17 | 18 | self.val_results_dict = {"val_loss": None} 19 | self.val_results_dict.update({"val_" + metric.name: None for metric in self.metrics}) 20 | 21 | def update(self, target, output, **kwargs): 22 | for metric in self.metrics: 23 | metric.update(output, target, **kwargs) 24 | 25 | def reset(self): 26 | for metric in self.metrics: 27 | metric.reset() 28 | 29 | def results(self, loss, train: bool=True): 30 | suffix = "val_" if not train else "" 31 | results_dict = self.val_results_dict if not train else self.train_results_dict 32 | results_dict[suffix + "loss"] = loss 33 | for metric in self.metrics: 34 | result = metric.result() 35 | if result: 36 | if isinstance(result, dict): 37 | for k, v in result.items(): 38 | results_dict[suffix + k] = v 39 | else: 40 | results_dict[suffix + metric.name] = result 41 | 42 | logs = {k: round(v, 4) for k, v in results_dict.items() if v is not None} 43 | return logs 44 | 45 | def description(self, epoch: int=None, train: bool=True): 46 | epoch_desc = f"Epoch {epoch} - " if epoch is not None else " " 47 | dict = self.train_results_dict if train else self.val_results_dict 48 | return epoch_desc + " - ".join([f"{k}: {v:.4f}" for k, v in dict.items() if v]) 49 | 50 | 51 | class CallbacksHandler: 52 | """ Callbacks handler class for training and testing loops""" 53 | def __init__(self, model, callbacks: typing.List[Callback]): 54 | self.callbacks = callbacks 55 | 56 | # Validate callbacks 57 | if not all(isinstance(c, Callback) for c in self.callbacks): 58 | raise TypeError("all items in the callbacks argument must be of type Callback (Check mltu.torch.callbacks.py for more information)") 59 | 60 | for callback in self.callbacks: 61 | callback.model = model 62 | 63 | def on_train_begin(self, logs=None): 64 | for callback in self.callbacks: 65 | callback.on_train_begin(logs) 66 | 67 | def on_train_end(self, logs=None): 68 | for callback in self.callbacks: 69 | callback.on_train_end(logs) 70 | 71 | def on_epoch_begin(self, epoch, logs=None): 72 | for callback in self.callbacks: 73 | callback.on_epoch_begin(epoch, 
logs) 74 | 75 | def on_epoch_end(self, epoch, logs=None): 76 | for callback in self.callbacks: 77 | callback.on_epoch_end(epoch, logs) 78 | 79 | def on_test_begin(self, logs=None): 80 | for callback in self.callbacks: 81 | callback.on_test_begin(logs) 82 | 83 | def on_test_end(self, logs=None): 84 | for callback in self.callbacks: 85 | callback.on_test_end(logs) 86 | 87 | def on_batch_begin(self, batch: int, logs=None, train: bool=True): 88 | for callback in self.callbacks: 89 | callback.on_batch_begin(batch, logs) 90 | 91 | if train: 92 | callback.on_train_batch_begin(batch, logs) 93 | else: 94 | callback.on_test_batch_begin(batch, logs) 95 | 96 | def on_batch_end(self, batch: int, logs=None, train: bool=True): 97 | for callback in self.callbacks: 98 | callback.on_batch_end(batch, logs) 99 | 100 | if train: 101 | callback.on_train_batch_end(batch, logs) 102 | else: 103 | callback.on_test_batch_end(batch, logs) -------------------------------------------------------------------------------- /mltu/torch/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class CTCLoss(nn.Module): 5 | """ CTC loss for PyTorch 6 | """ 7 | def __init__(self, blank: int, reduction: str="mean", zero_infinity: bool=False): 8 | """ CTC loss for PyTorch 9 | 10 | Args: 11 | blank: Index of the blank label 12 | """ 13 | super(CTCLoss, self).__init__() 14 | self.ctc_loss = nn.CTCLoss(blank=blank, reduction=reduction, zero_infinity=zero_infinity) 15 | self.blank = blank 16 | 17 | def forward(self, output, target): 18 | """ 19 | Args: 20 | output: Tensor of shape (batch_size, num_classes, sequence_length) 21 | target: Tensor of shape (batch_size, sequence_length) 22 | 23 | Returns: 24 | loss: Scalar 25 | """ 26 | # Remove padding and blank tokens from target 27 | target_lengths = torch.sum(target != self.blank, dim=1) 28 | using_dtype = torch.int32 if max(target_lengths) <= 256 else torch.int64 29 | device = output.device 30 | 31 | target_unpadded = target[target != self.blank].view(-1).to(using_dtype) 32 | 33 | output = output.permute(1, 0, 2) # (sequence_length, batch_size, num_classes) 34 | output_lengths = torch.full(size=(output.size(1),), fill_value=output.size(0), dtype=using_dtype).to(device) 35 | 36 | loss = self.ctc_loss(output, target_unpadded, output_lengths, target_lengths.to(using_dtype)) 37 | 38 | return loss -------------------------------------------------------------------------------- /mltu/torch/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import typing 3 | import numpy as np 4 | from itertools import groupby 5 | 6 | from mltu.utils.text_utils import get_cer, get_wer 7 | 8 | 9 | class Metric: 10 | """ Base class for all metrics""" 11 | def __init__(self, name: str) -> None: 12 | """ Initialize metric with name 13 | 14 | Args: 15 | name (str): name of metric 16 | """ 17 | self.name = name 18 | 19 | def reset(self): 20 | """ Reset metric state to initial values and return metric value""" 21 | self.__init__() 22 | 23 | def update(self, output: torch.Tensor, target: torch.Tensor, **kwargs): 24 | """ Update metric state with new data 25 | 26 | Args: 27 | output (torch.Tensor): output of model 28 | target (torch.Tensor): target of data 29 | """ 30 | pass 31 | 32 | def result(self): 33 | """ Return metric value""" 34 | pass 35 | 36 | 37 | class Accuracy(Metric): 38 | """ Accuracy metric class 39 | 40 | Args: 41 | name (str, optional): name of metric. 
Defaults to 'accuracy'. 42 | """ 43 | def __init__(self, name="accuracy") -> None: 44 | super(Accuracy, self).__init__(name=name) 45 | self.correct = 0 46 | self.total = 0 47 | 48 | def update(self, output: torch.Tensor, target: torch.Tensor, **kwargs): 49 | """ Update metric state with new data 50 | 51 | Args: 52 | output (torch.Tensor): output of model 53 | target (torch.Tensor): target of data 54 | """ 55 | _, predicted = torch.max(output.data, 1) 56 | self.total += target.size(0) 57 | self.correct += (predicted == target).sum().item() 58 | 59 | def result(self): 60 | """ Return metric value""" 61 | return self.correct / self.total 62 | 63 | 64 | class CERMetric(Metric): 65 | """A custom PyTorch metric to compute the Character Error Rate (CER). 66 | 67 | Args: 68 | vocabulary: A string of the vocabulary used to encode the labels. 69 | name: (Optional) string name of the metric instance. 70 | 71 | # TODO: implement everything in Torch to avoid converting to numpy 72 | """ 73 | def __init__( 74 | self, 75 | vocabulary: typing.Union[str, list], 76 | name: str = "CER" 77 | ) -> None: 78 | super(CERMetric, self).__init__(name=name) 79 | self.vocabulary = vocabulary 80 | self.reset() 81 | 82 | def reset(self): 83 | """ Reset metric state to initial values""" 84 | self.cer = 0 85 | self.counter = 0 86 | 87 | def update(self, output: torch.Tensor, target: torch.Tensor, **kwargs) -> None: 88 | """ Update metric state with new data 89 | 90 | Args: 91 | output (torch.Tensor): output of model 92 | target (torch.Tensor): target of data 93 | """ 94 | # convert to numpy 95 | output = output.detach().cpu().numpy() 96 | target = target.detach().cpu().numpy() 97 | # use argmax to find the index of the highest probability 98 | argmax_preds = np.argmax(output, axis=-1) 99 | 100 | # use groupby to find continuous same indexes 101 | grouped_preds = [[k for k,_ in groupby(preds)] for preds in argmax_preds] 102 | 103 | # convert indexes to strings 104 | output_texts = ["".join([self.vocabulary[k] for k in group if k < len(self.vocabulary)]) for group in grouped_preds] 105 | target_texts = ["".join([self.vocabulary[k] for k in group if k < len(self.vocabulary)]) for group in target] 106 | 107 | cer = get_cer(output_texts, target_texts) 108 | 109 | self.cer += cer 110 | self.counter += 1 111 | 112 | def result(self) -> float: 113 | """ Return metric value""" 114 | return self.cer / self.counter 115 | 116 | 117 | class WERMetric(Metric): 118 | """A custom PyTorch metric to compute the Word Error Rate (WER). 119 | 120 | Args: 121 | vocabulary: A string of the vocabulary used to encode the labels. 122 | name: (Optional) string name of the metric instance. 
123 | 124 | # TODO: implement everything in Torch to avoid converting to numpy 125 | """ 126 | def __init__( 127 | self, 128 | vocabulary: typing.Union[str, list], 129 | name: str = "WER" 130 | ) -> None: 131 | super(WERMetric, self).__init__(name=name) 132 | self.vocabulary = vocabulary 133 | self.reset() 134 | 135 | def reset(self): 136 | """ Reset metric state to initial values""" 137 | self.wer = 0 138 | self.counter = 0 139 | 140 | def update(self, output: torch.Tensor, target: torch.Tensor, **kwargs) -> None: 141 | """ Update metric state with new data 142 | 143 | Args: 144 | output (torch.Tensor): output of model 145 | target (torch.Tensor): target of data 146 | """ 147 | # convert to numpy 148 | output = output.detach().cpu().numpy() 149 | target = target.detach().cpu().numpy() 150 | # use argmax to find the index of the highest probability 151 | argmax_preds = np.argmax(output, axis=-1) 152 | 153 | # use groupby to find continuous same indexes 154 | grouped_preds = [[k for k,_ in groupby(preds)] for preds in argmax_preds] 155 | 156 | # convert indexes to strings 157 | output_texts = ["".join([self.vocabulary[k] for k in group if k < len(self.vocabulary)]) for group in grouped_preds] 158 | target_texts = ["".join([self.vocabulary[k] for k in group if k < len(self.vocabulary)]) for group in target] 159 | 160 | wer = get_wer(output_texts, target_texts) 161 | 162 | self.wer += wer 163 | self.counter += 1 164 | 165 | def result(self) -> float: 166 | """ Return metric value""" 167 | return self.wer / self.counter -------------------------------------------------------------------------------- /mltu/torch/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.13.1 2 | tensorboard==2.10.1 3 | onnx==1.12.0 4 | torchsummaryX -------------------------------------------------------------------------------- /mltu/torch/yolo/README.md: -------------------------------------------------------------------------------- 1 | ## Update Readme -------------------------------------------------------------------------------- /mltu/torch/yolo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/torch/yolo/__init__.py -------------------------------------------------------------------------------- /mltu/torch/yolo/annotation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import typing 3 | from pathlib import Path 4 | import xml.etree.ElementTree as ET 5 | from mltu.annotations.detections import Detections, Detection, BboxType 6 | 7 | class VOCAnnotationReader: 8 | """Reads annotations from VOC format 9 | """ 10 | def __init__(self, labels: dict, images_path: str=None): 11 | self.labels = labels 12 | self.images_path = images_path 13 | self.dataset_found_labels = {} 14 | 15 | @staticmethod 16 | def readFromVOC(voc_annotation_path: str, labels: dict={}, images_path: str=None) -> Detections: 17 | annotation_path = Path(voc_annotation_path) 18 | tree = ET.parse(voc_annotation_path) 19 | root = tree.getroot() 20 | 21 | annotation_dict = {} 22 | 23 | # Iterate through child elements 24 | for child in root: 25 | if child.tag == 'object': 26 | obj_dict = {} 27 | for obj_child in child: 28 | if obj_child.tag == 'bndbox': 29 | bbox_dict = {} 30 | for bbox_child in obj_child: 31 | bbox_dict[bbox_child.tag] = int(bbox_child.text) 32 | obj_dict[obj_child.tag] = bbox_dict 
33 | else: 34 | obj_dict[obj_child.tag] = obj_child.text 35 | if 'objects' not in annotation_dict: 36 | annotation_dict['objects'] = [] 37 | annotation_dict['objects'].append(obj_dict) 38 | elif child.tag == 'size': 39 | size_dict = {} 40 | for size_child in child: 41 | size_dict[size_child.tag] = int(size_child.text) 42 | annotation_dict['size'] = size_dict 43 | else: 44 | annotation_dict[child.tag] = child.text 45 | 46 | # Get the image path if not provided 47 | if images_path is None: 48 | images_path = annotation_path.parent.parent / annotation_dict["folder"] 49 | 50 | image_path = os.path.join(images_path, annotation_dict['filename']) 51 | dets = [] 52 | for obj in annotation_dict['objects']: 53 | if labels and obj['name'] not in labels.values(): 54 | print(f"Label {obj['name']} not found in labels") 55 | continue 56 | 57 | dets.append(Detection( 58 | bbox=[obj['bndbox']['xmin'], obj['bndbox']['ymin'], obj['bndbox']['xmax'], obj['bndbox']['ymax']], 59 | label=obj['name'], 60 | bbox_type=BboxType.XYXY, 61 | confidence=1, 62 | image_path=image_path, 63 | width=annotation_dict['size']['width'], 64 | height=annotation_dict['size']['height'], 65 | relative=False 66 | )) 67 | 68 | detections = Detections( 69 | labels=labels, 70 | width=annotation_dict['size']['width'], 71 | height=annotation_dict['size']['height'], 72 | image_path=image_path, 73 | detections=dets 74 | ) 75 | 76 | return detections 77 | 78 | def __call__(self, image: typing.Any, annotation: str) -> typing.Tuple[typing.Any, Detections]: 79 | detections = self.readFromVOC(annotation, self.labels, self.images_path) 80 | if image is None: 81 | image = detections.image_path 82 | return image, detections -------------------------------------------------------------------------------- /mltu/torch/yolo/detectors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/torch/yolo/detectors/__init__.py -------------------------------------------------------------------------------- /mltu/torch/yolo/detectors/detector.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from mltu.inferenceModel import FpsWrapper 4 | 5 | class BaseDetector: 6 | """Base class for the detectors in the YOLO family""" 7 | @staticmethod 8 | def preprocess(image: np.ndarray, height: int, width: int): 9 | # Convert the image color space from BGR to RGB 10 | img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 11 | 12 | # Resize the image to match the input shape 13 | img = cv2.resize(img, (width, height)) 14 | 15 | # Normalize the image data by dividing it by 255.0 16 | image_data = np.array(img) / 255.0 17 | 18 | # Transpose the image to have the channel dimension as the first dimension 19 | image_data = np.transpose(image_data, (2, 0, 1)) # Channel first 20 | 21 | # Expand the dimensions of the image data to match the expected input shape 22 | image_data = np.expand_dims(image_data, axis=0).astype(np.float32) 23 | 24 | return image_data 25 | 26 | @staticmethod 27 | def postprocess(outputs: np.ndarray, x_factor: float, y_factor: float, confidence_threshold: float=0.5, iou_threshold: float=0.5): 28 | # Transpose and squeeze the output to match the expected shape 29 | outputs = np.transpose(np.squeeze(outputs)) 30 | 31 | # Extract all classes confidence scores 32 | conf_scores = np.amax(outputs[:, 4:], axis=1) 33 | 34 | # Get the data index of the detections with 
scores above the confidence threshold 35 | indexes = np.where(conf_scores >= confidence_threshold)[0] 36 | 37 | # Extract the confidence scores of the detections 38 | scores = conf_scores[indexes] 39 | 40 | # Extract the class IDs of the detections 41 | class_ids = np.argmax(outputs[indexes, 4:], axis=1) 42 | 43 | # Extract the bounding box coordinates from the outputs and transform them to the original image space 44 | boxes = outputs[indexes, :4] * np.array([x_factor, y_factor, x_factor, y_factor]) 45 | 46 | # Apply non-maximum suppression to filter out overlapping bounding boxes 47 | indices = cv2.dnn.NMSBoxes(boxes, scores, confidence_threshold, iou_threshold) 48 | 49 | # Iterate over the selected indices after non-maximum suppression 50 | return boxes[indices], scores[indices], class_ids[indices] 51 | 52 | def predict(self, image: np.ndarray, **kwargs) -> np.ndarray: 53 | ... 54 | 55 | @FpsWrapper 56 | def __call__(self, image: np.ndarray): 57 | results = self.predict(image) 58 | return results -------------------------------------------------------------------------------- /mltu/torch/yolo/detectors/onnx_detector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mltu.inferenceModel import OnnxInferenceModel 3 | from mltu.torch.yolo.detectors.detector import BaseDetector 4 | from mltu.annotations.detections import BboxType, Detection, Detections 5 | 6 | class Detector(OnnxInferenceModel, BaseDetector): 7 | """ YOLOv8 detector using onnxruntime""" 8 | def __init__( 9 | self, 10 | model_path: str, 11 | input_width: int, 12 | input_height: int, 13 | confidence_threshold: float=0.5, 14 | iou_threshold: float=0.5, 15 | classes: dict = None, 16 | return_raw_output: bool=False, 17 | *args, **kwargs 18 | ): 19 | """ 20 | Args: 21 | model_path (str): Path to the model file 22 | input_width (int): Input width to use for the model 23 | input_height (int): Input height to use for the model 24 | confidence_threshold (float, optional): Confidence threshold for filtering the predictions. Defaults to 0.5. 25 | iou_threshold (float, optional): Intersection over union threshold for filtering the predictions. Defaults to 0.5. 26 | classes (dict, optional): Dictionary of class names. Defaults to None. 27 | return_raw_output (bool, optional): Return raw output of the model (return bounding boxes, scores, and class ids). Defaults to False. 
28 | """ 29 | super().__init__(model_path, *args, **kwargs) 30 | self.input_width = input_width 31 | self.input_height = input_height 32 | self.confidence_threshold = confidence_threshold 33 | self.iou_threshold = iou_threshold 34 | self.return_raw_output = return_raw_output 35 | 36 | self.classes = classes or self.metadata.get("classes", None) 37 | if self.classes is None: 38 | raise ValueError("The classes must be provided") 39 | 40 | # Generate a color palette for the classes 41 | self.color_palette = np.random.uniform(0, 255, size=(len(self.classes), 3)) 42 | 43 | def predict(self, image: np.ndarray, **kwargs) -> Detections: 44 | img_height, img_width, _ = image.shape 45 | 46 | # Preprocess the image 47 | preprocessed_image = self.preprocess(image, self.input_height, self.input_width) 48 | 49 | # Perform inference on the preprocessed image 50 | preds = self.model.run(self.output_names, {self.input_names[0]: preprocessed_image}) 51 | 52 | # Extract the results from the predictions 53 | results = preds[0][0] 54 | 55 | # Calculate the scaling factors for the bounding box coordinates 56 | x_factor, y_factor = img_width / self.input_width, img_height / self.input_height 57 | 58 | # Perform postprocessing on the predictions 59 | boxes, scores, class_ids = self.postprocess(results, x_factor, y_factor, self.confidence_threshold, self.iou_threshold) 60 | 61 | if self.return_raw_output: 62 | return boxes, scores, class_ids 63 | 64 | detections = [] 65 | for bbox, conf, class_id in zip(boxes, scores, class_ids): 66 | detection = Detection( 67 | bbox = bbox, 68 | label = self.classes[class_id], 69 | labels = self.classes, 70 | bbox_type=BboxType.XYWH, 71 | confidence=conf, 72 | relative=False, 73 | width=img_width, 74 | height=img_height 75 | ) 76 | detections.append(detection) 77 | 78 | return Detections( 79 | labels=self.classes, 80 | width=img_width, 81 | height=img_height, 82 | detections=detections, 83 | color_palette=self.color_palette, 84 | ) -------------------------------------------------------------------------------- /mltu/torch/yolo/detectors/torch_detector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from mltu.torch.yolo.detectors.detector import BaseDetector 4 | from mltu.annotations.detections import BboxType, Detection, Detections 5 | 6 | class Detector(BaseDetector): 7 | def __init__( 8 | self, 9 | model, 10 | input_width: int, 11 | input_height: int, 12 | classes: dict, 13 | confidence_threshold: float=0.5, 14 | iou_threshold: float=0.5, 15 | device: str="cuda" 16 | ): 17 | super().__init__() 18 | self.model = model 19 | self.input_width = input_width 20 | self.input_height = input_height 21 | self.classes = classes 22 | self.confidence_threshold = confidence_threshold 23 | self.iou_threshold = iou_threshold 24 | self.device = torch.device(device if torch.cuda.is_available() else "cpu") 25 | self.model.to(self.device) 26 | self.model.eval() 27 | 28 | # Generate a color palette for the classes 29 | self.color_palette = np.random.uniform(0, 255, size=(len(self.classes), 3)) 30 | 31 | def predict(self, image: np.ndarray, **kwargs) -> Detections: 32 | img_height, img_width, _ = image.shape 33 | 34 | # Preprocess the image 35 | preprocessed_image = self.preprocess(image, self.input_height, self.input_width) 36 | 37 | # Perform inference on the preprocessed image 38 | preds = self.model(torch.tensor(preprocessed_image).to(self.device)) 39 | 40 | # Convert torch tensor to numpy array 41 | results = 
preds[0].cpu().detach().numpy() 42 | 43 | # Calculate the scaling factors for the bounding box coordinates 44 | x_factor, y_factor = img_width / self.input_width, img_height / self.input_height 45 | 46 | # Perform postprocessing on the predictions 47 | boxes, scores, class_ids = self.postprocess(results, x_factor, y_factor, self.confidence_threshold, self.iou_threshold) 48 | 49 | detections = [] 50 | for bbox, conf, class_id in zip(boxes, scores, class_ids): 51 | detection = Detection( 52 | bbox = bbox, 53 | label = self.classes[class_id], 54 | labels = self.classes, 55 | bbox_type=BboxType.XYWH, 56 | confidence=conf, 57 | relative=False, 58 | width=img_width, 59 | height=img_height 60 | ) 61 | detections.append(detection) 62 | 63 | return Detections( 64 | labels=self.classes, 65 | width=img_width, 66 | height=img_height, 67 | detections=detections, 68 | color_palette=self.color_palette, 69 | ) -------------------------------------------------------------------------------- /mltu/torch/yolo/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from ultralytics.utils.loss import BboxLoss, xywh2xyxy 5 | from ultralytics.utils.tal import TaskAlignedAssigner, dist2bbox, make_anchors 6 | 7 | class v8DetectionLoss: 8 | """Criterion class for computing training losses.""" 9 | 10 | def __init__(self, model, box: float=7.5, cls: float=0.5, dfl: float=1.5): # model must be de-paralleled 11 | """Initializes v8DetectionLoss with the model, defining model-related properties and BCE loss function.""" 12 | self.model = model 13 | device = next(model.parameters()).device # get model device 14 | 15 | self.head = model.model[-1] # Detect() module 16 | self.bce = nn.BCEWithLogitsLoss(reduction="none") 17 | self.stride = self.head.stride # model strides 18 | self.nc = self.head.nc # number of classes 19 | self.no = self.head.no 20 | self.reg_max = self.head.reg_max # max number of regression targets 21 | self.device = device 22 | 23 | self.use_dfl = self.head.reg_max > 1 24 | 25 | self.assigner = TaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0) 26 | self.bbox_loss = BboxLoss(self.head.reg_max - 1, use_dfl=self.use_dfl).to(device) 27 | self.proj = torch.arange(self.head.reg_max, dtype=torch.float, device=device).to(device) 28 | 29 | self.box = box # box gain 30 | self.cls = cls # cls gain 31 | self.dfl = dfl # dfl gain 32 | 33 | def preprocess(self, targets, batch_size, scale_tensor): 34 | """Preprocesses the target counts and matches with the input batch size to output a tensor.""" 35 | if targets.shape[0] == 0: 36 | out = torch.zeros(batch_size, 0, 5, device=self.device) 37 | else: 38 | i = targets[:, 0] # image index 39 | _, counts = i.unique(return_counts=True) 40 | counts = counts.to(dtype=torch.int32) 41 | out = torch.zeros(batch_size, counts.max(), 5, device=self.device) 42 | for j in range(batch_size): 43 | matches = i == j 44 | n = matches.sum() 45 | if n: 46 | out[j, :n] = targets[matches, 1:] 47 | out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor)) 48 | return out 49 | 50 | def bbox_decode(self, anchor_points, pred_dist): 51 | """Decode predicted object bounding box coordinates from anchor points and distribution.""" 52 | if self.use_dfl: 53 | b, a, c = pred_dist.shape # batch, anchors, channels 54 | self.proj = self.proj.to(pred_dist.device) 55 | pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype)) 56 | 57 | return dist2bbox(pred_dist, 
anchor_points, xywh=False) 58 | 59 | def __call__(self, preds, batch): 60 | """Calculate the sum of the loss for box, cls and dfl multiplied by batch size.""" 61 | loss = torch.zeros(3, device=self.device) # box, cls, dfl 62 | feats = preds[1] if isinstance(preds, tuple) else preds 63 | pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split( 64 | (self.reg_max * 4, self.nc), 1 65 | ) 66 | 67 | pred_scores = pred_scores.permute(0, 2, 1).contiguous() 68 | pred_distri = pred_distri.permute(0, 2, 1).contiguous() 69 | 70 | dtype = pred_scores.dtype 71 | batch_size = pred_scores.shape[0] 72 | imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0] # image size (h,w) 73 | anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5) 74 | 75 | # Targets 76 | targets = torch.cat((batch["batch_idx"].view(-1, 1), batch["cls"].view(-1, 1), batch["bboxes"]), 1) 77 | targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]]) 78 | gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy 79 | mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0) 80 | 81 | # Pboxes 82 | pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4) 83 | 84 | _, target_bboxes, target_scores, fg_mask, _ = self.assigner( 85 | pred_scores.detach().sigmoid(), 86 | (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype), 87 | anchor_points * stride_tensor, 88 | gt_labels, 89 | gt_bboxes, 90 | mask_gt, 91 | ) 92 | 93 | target_scores_sum = max(target_scores.sum(), 1) 94 | 95 | # Cls loss 96 | loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE 97 | 98 | # Bbox loss 99 | if fg_mask.sum(): 100 | target_bboxes /= stride_tensor 101 | loss[0], loss[2] = self.bbox_loss( 102 | pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask 103 | ) 104 | 105 | loss[0] *= self.box # box gain 106 | loss[1] *= self.cls # cls gain 107 | loss[2] *= self.dfl # dfl gain 108 | 109 | detailed_loss = {"box_loss": loss[0].detach(), "cls_loss": loss[1].detach(), "dfl_loss": loss[2].detach()} 110 | 111 | return loss.sum() * batch_size, detailed_loss # loss(box, cls, dfl) -------------------------------------------------------------------------------- /mltu/torch/yolo/optimizer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class AccumulativeOptimizer(torch.optim.Optimizer): 5 | def __init__(self, optimizer, batch_size, nbs=64): 6 | super(AccumulativeOptimizer, self).__init__(optimizer.param_groups, optimizer.defaults) 7 | self.optimizer = optimizer 8 | self.accumulation_steps = int(nbs / batch_size) 9 | self.current_step = 0 10 | 11 | def zero_grad(self): 12 | if self.current_step == 0: 13 | self.optimizer.zero_grad() 14 | 15 | def step(self): 16 | self.current_step += 1 17 | if self.current_step >= self.accumulation_steps: 18 | self.optimizer.step() 19 | self.current_step = 0 20 | self.optimizer.zero_grad() 21 | 22 | 23 | def build_optimizer(model, name: str="AdamW", lr: float=1e-3, weight_decay: float=0.0, momentum: float=0.937, decay=0.0005): 24 | 25 | pg0, pg1, pg2 = [], [], [] # optimizer parameter groups 26 | bn = tuple(v for k, v in nn.__dict__.items() if "Norm" in k) # normalization layers, i.e. 
BatchNorm2d() 27 | for module_name, module in model.named_modules(): 28 | for param_name, param in module.named_parameters(recurse=False): 29 | fullname = f"{module_name}.{param_name}" if module_name else param_name 30 | if "bias" in fullname: # bias (no decay) 31 | pg2.append(param) 32 | elif isinstance(module, bn): # weight (no decay) 33 | pg1.append(param) 34 | else: # weight (with decay) 35 | pg0.append(param) 36 | 37 | if name == "AdamW": 38 | optimizer = torch.optim.AdamW(pg2, lr=lr, weight_decay=weight_decay, betas=(momentum, 0.999)) 39 | elif name == "Adam": 40 | optimizer = torch.optim.Adam(pg2, lr=lr, weight_decay=weight_decay, betas=(momentum, 0.999)) 41 | elif name == "SGD": 42 | optimizer = torch.optim.SGD(pg2, lr=lr, weight_decay=weight_decay, momentum=momentum) 43 | else: 44 | raise ValueError(f"Optimizer {name} not supported!") 45 | 46 | optimizer.add_param_group({'params': pg0, 'weight_decay': decay}) # add pg0 (weights) with weight_decay 47 | optimizer.add_param_group({'params': pg1, 'weight_decay': 0.0}) # add pg1 (normalization weights) without weight_decay 48 | 49 | del pg0, pg1, pg2 50 | 51 | return optimizer -------------------------------------------------------------------------------- /mltu/torch/yolo/preprocessors.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import typing 3 | import numpy as np 4 | 5 | class YoloPreprocessor: 6 | def __init__(self, device: torch.device, imgsz: int=640): 7 | self.device = device 8 | self.imgsz = imgsz 9 | 10 | def __call__(self, images, annotations) -> typing.Tuple[np.ndarray, dict]: 11 | batch = { 12 | "ori_shape": [], 13 | "resized_shape": [], 14 | "cls": [], 15 | "bboxes": [], 16 | "batch_idx": [], 17 | } 18 | 19 | for i, (image, detections) in enumerate(zip(images, annotations)): 20 | batch["ori_shape"].append([detections.height, detections.width]) 21 | batch["resized_shape"].append([self.imgsz, self.imgsz]) 22 | for detection in detections: 23 | batch["cls"].append([detection.labelId]) 24 | batch["bboxes"].append(detection.xywh) 25 | batch["batch_idx"].append(i) 26 | 27 | batch["cls"] = torch.tensor(np.array(batch["cls"])).to(self.device) 28 | batch["bboxes"] = torch.tensor(np.array(batch["bboxes"])).to(self.device) 29 | batch["batch_idx"] = torch.tensor(np.array(batch["batch_idx"])).to(self.device) 30 | 31 | return np.array(images), batch -------------------------------------------------------------------------------- /mltu/torch/yolo/pruning_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from ultralytics.nn.modules import C2f, Conv, Bottleneck 4 | 5 | def infer_shortcut(bottleneck): 6 | c1 = bottleneck.cv1.conv.in_channels 7 | c2 = bottleneck.cv2.conv.out_channels 8 | return c1 == c2 and hasattr(bottleneck, 'add') and bottleneck.add 9 | 10 | class C2f_v2(nn.Module): 11 | # CSP Bottleneck with 2 convolutions 12 | def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion 13 | super().__init__() 14 | self.c = int(c2 * e) # hidden channels 15 | self.cv0 = Conv(c1, self.c, 1, 1) 16 | self.cv1 = Conv(c1, self.c, 1, 1) 17 | self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) 18 | self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) 19 | 20 | def forward(self, x): 21 | # y = list(self.cv1(x).chunk(2, 1)) 22 | y = [self.cv0(x), self.cv1(x)] 23 | y.extend(m(y[-1]) for m in self.m) 24 | return
self.cv2(torch.cat(y, 1)) 25 | 26 | def transfer_weights(c2f, c2f_v2): 27 | c2f_v2.cv2 = c2f.cv2 28 | c2f_v2.m = c2f.m 29 | 30 | state_dict = c2f.state_dict() 31 | state_dict_v2 = c2f_v2.state_dict() 32 | 33 | # Transfer cv1 weights from C2f to cv0 and cv1 in C2f_v2 34 | old_weight = state_dict['cv1.conv.weight'] 35 | half_channels = old_weight.shape[0] // 2 36 | state_dict_v2['cv0.conv.weight'] = old_weight[:half_channels] 37 | state_dict_v2['cv1.conv.weight'] = old_weight[half_channels:] 38 | 39 | # Transfer cv1 batchnorm weights and buffers from C2f to cv0 and cv1 in C2f_v2 40 | for bn_key in ['weight', 'bias', 'running_mean', 'running_var']: 41 | old_bn = state_dict[f'cv1.bn.{bn_key}'] 42 | state_dict_v2[f'cv0.bn.{bn_key}'] = old_bn[:half_channels] 43 | state_dict_v2[f'cv1.bn.{bn_key}'] = old_bn[half_channels:] 44 | 45 | # Transfer remaining weights and buffers 46 | for key in state_dict: 47 | if not key.startswith('cv1.'): 48 | state_dict_v2[key] = state_dict[key] 49 | 50 | # Transfer all non-method attributes 51 | for attr_name in dir(c2f): 52 | attr_value = getattr(c2f, attr_name) 53 | if not callable(attr_value) and '_' not in attr_name: 54 | setattr(c2f_v2, attr_name, attr_value) 55 | 56 | c2f_v2.load_state_dict(state_dict_v2) 57 | 58 | def replace_c2f_with_c2f_v2(module): 59 | for name, child_module in module.named_children(): 60 | if isinstance(child_module, C2f): 61 | # Replace C2f with C2f_v2 while preserving its parameters 62 | shortcut = infer_shortcut(child_module.m[0]) 63 | c2f_v2 = C2f_v2(child_module.cv1.conv.in_channels, child_module.cv2.conv.out_channels, 64 | n=len(child_module.m), shortcut=shortcut, 65 | g=child_module.m[0].cv2.conv.groups, 66 | e=child_module.c / child_module.cv2.conv.out_channels) 67 | transfer_weights(child_module, c2f_v2) 68 | setattr(module, name, c2f_v2) 69 | else: 70 | replace_c2f_with_c2f_v2(child_module) -------------------------------------------------------------------------------- /mltu/torch/yolo/requirements.txt: -------------------------------------------------------------------------------- 1 | ultralytics==8.1.9 2 | torch==2.0.0 3 | torchvision==0.15.1 4 | torch_pruning==1.3.6 -------------------------------------------------------------------------------- /mltu/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonlessons/mltu/f3033451f62c3fd2097b990c98b25f97773b640d/mltu/utils/__init__.py -------------------------------------------------------------------------------- /mltu/utils/text_utils.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import numpy as np 3 | from itertools import groupby 4 | 5 | 6 | def ctc_decoder(predictions: np.ndarray, chars: typing.Union[str, list]) -> typing.List[str]: 7 | """ CTC greedy decoder for predictions 8 | 9 | Args: 10 | predictions (np.ndarray): predictions from model 11 | chars (typing.Union[str, list]): list of characters 12 | 13 | Returns: 14 | typing.List[str]: list of words 15 | """ 16 | # use argmax to find the index of the highest probability 17 | argmax_preds = np.argmax(predictions, axis=-1) 18 | 19 | # use groupby to find continuous same indexes 20 | grouped_preds = [[k for k,_ in groupby(preds)] for preds in argmax_preds] 21 | 22 | # convert indexes to chars 23 | texts = ["".join([chars[k] for k in group if k < len(chars)]) for group in grouped_preds] 24 | 25 | return texts 26 | 27 | 28 | def edit_distance(prediction_tokens: typing.List[str], 
reference_tokens: typing.List[str]) -> int: 29 | """ Standard dynamic programming algorithm to compute the Levenshtein Edit Distance Algorithm 30 | 31 | Args: 32 | prediction_tokens: A tokenized predicted sentence 33 | reference_tokens: A tokenized reference sentence 34 | Returns: 35 | Edit distance between the predicted sentence and the reference sentence 36 | """ 37 | # Initialize a matrix to store the edit distances 38 | dp = [[0] * (len(reference_tokens) + 1) for _ in range(len(prediction_tokens) + 1)] 39 | 40 | # Fill the first row and column with the number of insertions needed 41 | for i in range(len(prediction_tokens) + 1): 42 | dp[i][0] = i 43 | 44 | for j in range(len(reference_tokens) + 1): 45 | dp[0][j] = j 46 | 47 | # Iterate through the prediction and reference tokens 48 | for i, p_tok in enumerate(prediction_tokens): 49 | for j, r_tok in enumerate(reference_tokens): 50 | # If the tokens are the same, the edit distance is the same as the previous entry 51 | if p_tok == r_tok: 52 | dp[i+1][j+1] = dp[i][j] 53 | # If the tokens are different, the edit distance is the minimum of the previous entries plus 1 54 | else: 55 | dp[i+1][j+1] = min(dp[i][j+1], dp[i+1][j], dp[i][j]) + 1 56 | 57 | # Return the final entry in the matrix as the edit distance 58 | return dp[-1][-1] 59 | 60 | def get_cer( 61 | preds: typing.Union[str, typing.List[str]], 62 | target: typing.Union[str, typing.List[str]], 63 | ) -> float: 64 | """ Update the cer score with the current set of references and predictions. 65 | 66 | Args: 67 | preds (typing.Union[str, typing.List[str]]): list of predicted sentences 68 | target (typing.Union[str, typing.List[str]]): list of target words 69 | 70 | Returns: 71 | Character error rate score 72 | """ 73 | if isinstance(preds, str): 74 | preds = [preds] 75 | if isinstance(target, str): 76 | target = [target] 77 | 78 | total, errors = 0, 0 79 | for pred_tokens, tgt_tokens in zip(preds, target): 80 | errors += edit_distance(list(pred_tokens), list(tgt_tokens)) 81 | total += len(tgt_tokens) 82 | 83 | if total == 0: 84 | return 0.0 85 | 86 | cer = errors / total 87 | 88 | return cer 89 | 90 | def get_wer( 91 | preds: typing.Union[str, typing.List[str]], 92 | target: typing.Union[str, typing.List[str]], 93 | ) -> float: 94 | """ Update the wer score with the current set of references and predictions. 
95 | 96 | Args: 97 | target (typing.Union[str, typing.List[str]]): string of target sentence or list of target words 98 | preds (typing.Union[str, typing.List[str]]): string of predicted sentence or list of predicted words 99 | 100 | Returns: 101 | Word error rate score 102 | """ 103 | if isinstance(preds, str) and isinstance(target, str): 104 | preds = [preds] 105 | target = [target] 106 | 107 | if isinstance(preds, list) and isinstance(target, list): 108 | errors, total_words = 0, 0 109 | for _pred, _target in zip(preds, target): 110 | if isinstance(_pred, str) and isinstance(_target, str): 111 | errors += edit_distance(_pred.split(), _target.split()) 112 | total_words += len(_target.split()) 113 | else: 114 | print("Error: preds and target must be either both strings or both lists of strings.") 115 | return np.inf 116 | 117 | else: 118 | print("Error: preds and target must be either both strings or both lists of strings.") 119 | return np.inf 120 | 121 | wer = errors / total_words 122 | 123 | return wer -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML>=6.0 2 | tqdm 3 | qqdm==0.0.7 4 | pandas 5 | numpy 6 | opencv-python 7 | Pillow>=9.4.0 8 | onnxruntime>=1.15.0 # onnxruntime-gpu for GPU support 9 | matplotlib -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | 5 | DIR = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | with open(os.path.join(DIR, "README.md")) as fh: 8 | long_description = fh.read() 9 | 10 | with open(os.path.join(DIR, "requirements.txt")) as fh: 11 | requirements = fh.read().splitlines() 12 | 13 | 14 | def get_version(initpath: str) -> str: 15 | """ Get from the init of the source code the version string 16 | 17 | Params: 18 | initpath (str): path to the init file of the python package relative to the setup file 19 | 20 | Returns: 21 | str: The version string in the form 0.0.1 22 | """ 23 | 24 | path = os.path.join(os.path.dirname(__file__), initpath) 25 | 26 | with open(path, "r") as handle: 27 | for line in handle.read().splitlines(): 28 | if line.startswith("__version__"): 29 | return line.split("=")[1].strip().strip("\"'") 30 | else: 31 | raise RuntimeError("Unable to find version string.") 32 | 33 | 34 | setup( 35 | name="mltu", 36 | version=get_version("mltu/__init__.py"), 37 | long_description=long_description, 38 | long_description_content_type="text/markdown", 39 | url="https://pylessons.com/", 40 | author="PyLessons", 41 | author_email="pythonlessons0@gmail.com", 42 | install_requires=requirements, 43 | extras_require={ 44 | "gpu": ["onnxruntime-gpu"], 45 | }, 46 | python_requires=">=3", 47 | packages=find_packages(exclude=("*_test.py",)), 48 | include_package_data=True, 49 | project_urls={ 50 | "Source": "https://github.com/pythonlessons/mltu/", 51 | "Tracker": "https://github.com/pythonlessons/mltu/issues", 52 | }, 53 | description="Machine Learning Training Utilities (MLTU) for TensorFlow and PyTorch", 54 | ) 55 | --------------------------------------------------------------------------------
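Illustrative usage sketch (not part of the repository files above): a minimal example of how the text utilities defined in mltu/utils/text_utils.py behave, assuming the package is installed as `mltu`. The toy vocabulary, probability array, and sample sentences below are made up purely for demonstration; the expected outputs follow from the greedy CTC collapse and edit-distance definitions shown above.

```python
import numpy as np
from mltu.utils.text_utils import ctc_decoder, get_cer, get_wer

# Toy vocabulary: index 0='a', 1='b', 2='c'; any index >= len(chars)
# (here 3) is treated as the CTC blank and dropped by ctc_decoder.
chars = "abc"

# One sample with 4 time steps; the per-step argmax is [0, 0, 3, 1],
# which collapses the repeated 'a' and removes the blank -> "ab".
predictions = np.array([[
    [0.9, 0.05, 0.02, 0.03],
    [0.8, 0.10, 0.05, 0.05],
    [0.1, 0.10, 0.10, 0.70],
    [0.1, 0.80, 0.05, 0.05],
]])
print(ctc_decoder(predictions, chars))          # ['ab']

# Character error rate: 1 edit ("hallo" -> "hello") over 5 reference characters.
print(get_cer("hallo", "hello"))                # 0.2

# Word error rate: 1 substituted word out of 3 reference words.
print(get_wer("the cat sat", "the cat sits"))   # 0.333...
```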