├── .gitignore ├── CycleGANColab └── CycleGANColab.ipynb ├── CycleGANSolution ├── README.md ├── a4-code-v2-updated │ ├── README.md │ ├── cycle_gan.py │ ├── data_loader.py │ ├── model_checker.py │ ├── models.py │ ├── utils.py │ └── vanilla_gan.py └── a4-handout.pdf ├── GAN ├── README.md ├── Wiley's │ ├── models.py │ └── vanilla_gan.py └── vanilla_gan.py ├── LICENSE ├── LoRA ├── lora_hello_world.ipynb ├── lora_hello_world2.ipynb └── lora_hello_world3.ipynb ├── MachineTranslation ├── README.md ├── ReferenceExample.ipynb ├── seq2seq_translation_tutorial.ipynb ├── torchtext_translation_tutorial.ipynb └── torchtext_translation_tutorial_with_transformers.ipynb ├── NeuralArchitectureSearch ├── Autokeras.ipynb └── NeuralArchitectureSearch.ipynb ├── ProphetCode └── main.py ├── Quantization └── Quantization.ipynb ├── README.md ├── RL └── RL.ipynb ├── RL_from_human_feedback └── RL_from_human_feedback.ipynb ├── ReinforcmentLearning └── simple_example.py ├── SiameseNetwork ├── siamese_network.ipynb ├── siamese_original_network.ipynb └── twin_network.ipynb ├── SinGAN ├── CatGAN.ipynb ├── DoubleGAN.ipynb ├── SinGAN.ipynb ├── SinGANOfficialImplementation.ipynb └── SinGAN_on_custom_image.ipynb ├── TabularXGBoost └── TabularDataXGBoostTutorial.ipynb ├── Transformers ├── README.md ├── Transformers.ipynb ├── requirements.txt └── translation │ └── train.py ├── TwinNetwork └── twin_network.ipynb ├── VisionTransformers ├── VisionTransformers.ipynb ├── VisionTransformers_cleaned_up_code_2021-08-24.ipynb ├── VisionTransformers_with_PyTorch_Transformers.ipynb └── VisionTransformers_with_PyTorch_Transformers_with_BatchNorm.ipynb ├── handwriting-synthesis ├── .gitignore ├── .travis.yml ├── checkpoints │ ├── checkpoint │ ├── model-17900.data-00000-of-00001 │ ├── model-17900.index │ └── model-17900.meta ├── data │ ├── blacklist.npy │ ├── processed │ │ ├── .gitattributes │ │ ├── c.npy │ │ ├── c_len.npy │ │ ├── w_id.npy │ │ ├── x.npy │ │ └── x_len.npy │ └── raw │ │ └── readme.md ├── data_frame.py ├── demo.py ├── drawing.py ├── img │ ├── all_star.svg │ ├── banner.svg │ ├── downtown.svg │ ├── give_up.svg │ └── usage_demo.svg ├── lyrics.py ├── prepare_data.py ├── readme.md ├── requirements.txt ├── rnn.py ├── rnn_cell.py ├── rnn_ops.py ├── styles │ ├── style-0-chars.npy │ ├── style-0-strokes.npy │ ├── style-1-chars.npy │ ├── style-1-strokes.npy │ ├── style-1.npy │ ├── style-10-chars.npy │ ├── style-10-strokes.npy │ ├── style-11-chars.npy │ ├── style-11-strokes.npy │ ├── style-12-chars.npy │ ├── style-12-strokes.npy │ ├── style-2-chars.npy │ ├── style-2-strokes.npy │ ├── style-2.npy │ ├── style-3-chars.npy │ ├── style-3-strokes.npy │ ├── style-4-chars.npy │ ├── style-4-strokes.npy │ ├── style-5-chars.npy │ ├── style-5-strokes.npy │ ├── style-6-chars.npy │ ├── style-6-strokes.npy │ ├── style-7-chars.npy │ ├── style-7-strokes.npy │ ├── style-8-chars.npy │ ├── style-8-strokes.npy │ ├── style-9-chars.npy │ └── style-9-strokes.npy ├── test_example.py ├── tf_base_model.py ├── tf_utils.py └── upgrade_tf2.sh ├── handwriting_generator ├── IBM.csv ├── IBM_Transformer+TimeEmbedding.ipynb ├── handwriting_generator.ipynb └── saved.tgz ├── minGPT ├── .gitignore ├── LICENSE ├── README.md ├── mingpt.jpg ├── mingpt │ ├── __init__.py │ ├── model.py │ ├── trainer.py │ └── utils.py ├── play_char.ipynb ├── play_image.ipynb └── play_math.ipynb └── sound ├── preprocess ├── README.md ├── mp3_to_wav.py ├── to_16000_wav.py └── trim.py ├── project-keyword-spotter ├── .DS_Store ├── CONTRIBUTING.md ├── Icon ├── LICENSE ├── README.md ├── audio_recorder.py 
├── config │ ├── Icon │ ├── commands_v2.txt │ ├── commands_v2_snake.txt │ ├── labels_gc2.raw.txt │ └── labels_simple_audio.txt ├── features.py ├── hearing_snake_metadata.json ├── install_requirements.sh ├── media │ ├── Icon │ └── startscreen.png ├── mel_features.py ├── model.py ├── model_yamnet.py ├── models │ ├── Icon │ ├── model-backup1.tflite │ ├── model.tflite │ ├── model_quantized_edgetpu.tflite │ ├── voice_commands_v0.7_edgetpu.tflite │ └── voice_commands_v0.8_edgetpu.tflite ├── params.py ├── pygame_images │ ├── Icon │ ├── apple.png │ ├── bg.jpg │ ├── snake_head_with_ears.png │ └── snake_tail.png ├── run_hearing_snake.py ├── run_model.py ├── run_model_yamnet.py ├── run_snake.sh ├── run_yt_voice_control.py ├── run_yt_voice_control.sh ├── yamnet.py └── yamnet_class_map.csv ├── simple_audio.ipynb ├── simple_audio_custom_cough_dataset_compiled.ipynb ├── simple_audio_load_vggish.ipynb ├── simple_audio_load_vggish_with_layer.ipynb ├── simple_audio_load_yamnet.ipynb ├── simple_audio_new_spectrogram.ipynb ├── simple_audio_new_spectrogram_custom_cough_dataset.ipynb ├── simple_audio_new_spectrogram_custom_cough_dataset_quantize.ipynb ├── simple_audio_new_spectrogram_custom_dataset.ipynb ├── simple_audio_new_spectrogram_numpy.ipynb ├── simple_audio_new_spectrogram_numpy_and_normalize.ipynb ├── simple_audio_new_spectrogram_numpy_and_normalize_only_left_right_working.ipynb ├── simple_audio_working_vggish.ipynb ├── simple_audio_working_vggish_clean.ipynb ├── simple_audio_working_vggish_clean_freeze_vggish_weights.ipynb ├── simple_audio_working_vggish_dataset.ipynb └── sound.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /CycleGANColab/CycleGANColab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "CycleGAN.ipynb", 7 | "provenance": [], 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "accelerator": "GPU" 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "fTGhQfaYH_QV", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# CycleGAN\n", 35 | "\n", 36 | "This notebook makes the CycleGAN homework assignment runnable on Google Colab (free GPU), so you don't need a physical GPU to run this assignment.\n", 37 | "\n", 38 | "Code available on https://github.com/wileyw/DeepLearningDemos.git\n", 39 | "\n", 40 | "Homework Assignment: https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/assignments/a4-handout.pdf" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "id": "g-vpRE5yJebK", 47 | "colab_type": "code", 48 | "colab": {} 49 | }, 50 | "source": [ 51 | "!git clone https://github.com/wileyw/DeepLearningDemos.git" 52 | ], 53 | "execution_count": 0, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "metadata": { 59 | "id": "haTvhcMrH8ke", 60 | "colab_type": "code", 61 | "colab": {} 62 | }, 63 | "source": [ 64 | "!wget http://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/assignments/a4-code.zip \n", 65 | "!unzip -q a4-code.zip" 66 | ], 67 | "execution_count": 0, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "metadata": { 73 | "id": "ks4AwPQYN-bo", 74 | "colab_type": "code", 75 | "colab": {} 76 | }, 77 | "source": [ 78 | "!ls\n", 79 | "!mv a4-code-v2-updated/emojis .\n", 80 | "!mv a4-code-v2-updated/checker_files ." 
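A quick aside on the `use_cycle_consistency_loss` option that a later cell in this notebook turns on: cycle consistency penalizes the reconstruction G_YtoX(G_XtoY(x)) for drifting away from the original image x (and likewise in the Y direction). A minimal sketch, assuming a generic CycleGAN setup — the names below are illustrative and the exact form used in `cycle_gan.py` may differ:

```python
import torch

def cycle_consistency_loss(real_X, reconstructed_X):
    # reconstructed_X = G_YtoX(G_XtoY(real_X)); CycleGAN commonly uses an
    # L1 (or squared) reconstruction penalty such as this one.
    return torch.mean(torch.abs(real_X - reconstructed_X))
```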
81 | ], 82 | "execution_count": 0, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "id": "5NyF3QrVKNqr", 89 | "colab_type": "code", 90 | "colab": {} 91 | }, 92 | "source": [ 93 | "!python3 DeepLearningDemos/CycleGANSolution/a4-code-v2-updated/model_checker.py" 94 | ], 95 | "execution_count": 0, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "9EHeytGb46j0", 102 | "colab_type": "code", 103 | "colab": {} 104 | }, 105 | "source": [ 106 | "import sys\n", 107 | "sys.path.append('DeepLearningDemos/CycleGANSolution/a4-code-v2-updated')\n", 108 | "import cycle_gan\n", 109 | "from cycle_gan import *\n", 110 | "\n", 111 | "sys.argv[:] = ['cycle_gan.py']\n", 112 | "parser = create_parser()\n", 113 | "opts = parser.parse_args()\n", 114 | "\n", 115 | "opts.use_cycle_consistency_loss = True\n", 116 | "\n", 117 | "batch_size = opts.batch_size\n", 118 | "cycle_gan.batch_size = batch_size\n", 119 | "\n", 120 | "print(opts)\n", 121 | "main(opts)" 122 | ], 123 | "execution_count": 0, 124 | "outputs": [] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "metadata": { 129 | "id": "ac_qDfPs5S_g", 130 | "colab_type": "code", 131 | "colab": {} 132 | }, 133 | "source": [ 134 | "from IPython.display import Image\n", 135 | "import matplotlib.pyplot as plt\n", 136 | "import glob\n", 137 | "images = sorted(glob.glob('./samples_cyclegan/*X-Y.png'))\n", 138 | "Image(images[-1])" 139 | ], 140 | "execution_count": 0, 141 | "outputs": [] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "metadata": { 146 | "id": "uVnVGQo66bYF", 147 | "colab_type": "code", 148 | "colab": {} 149 | }, 150 | "source": [ 151 | "from IPython.display import Image\n", 152 | "import matplotlib.pyplot as plt\n", 153 | "import glob\n", 154 | "images = sorted(glob.glob('./samples_cyclegan/*Y-X.png'))\n", 155 | "Image(images[-1])" 156 | ], 157 | "execution_count": 0, 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "metadata": { 163 | "id": "ZxN74FK-wdYV", 164 | "colab_type": "code", 165 | "colab": {} 166 | }, 167 | "source": [ 168 | "import sys\n", 169 | "sys.path.append('DeepLearningDemos/CycleGANSolution/a4-code-v2-updated')\n", 170 | "import vanilla_gan\n", 171 | "from vanilla_gan import *\n", 172 | "\n", 173 | "# Run Vanilla GAN\n", 174 | "sys.argv[:] = ['vanilla_gan.py']\n", 175 | "parser = create_parser()\n", 176 | "opts = parser.parse_args()\n", 177 | "\n", 178 | "batch_size = opts.batch_size\n", 179 | "vanilla_gan.batch_size = batch_size\n", 180 | "\n", 181 | "print(opts)\n", 182 | "main(opts)" 183 | ], 184 | "execution_count": 0, 185 | "outputs": [] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "metadata": { 190 | "id": "NTCwanSr1S_D", 191 | "colab_type": "code", 192 | "colab": {} 193 | }, 194 | "source": [ 195 | "# View images\n", 196 | "from IPython.display import Image\n", 197 | "import matplotlib.pyplot as plt\n", 198 | "import glob\n", 199 | "images = sorted(glob.glob('./samples_vanilla/*.png'))\n", 200 | "Image(images[-1])\n" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | } 205 | ] 206 | } -------------------------------------------------------------------------------- /CycleGANSolution/README.md: -------------------------------------------------------------------------------- 1 | # Cycle GAN and Vanilla GAN 2 | Homework Assignment: 3 | https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/assignments/a4-handout.pdf 4 | 5 | ## Download Dataset: 6 | This .zip file contains the starting code as well. 
When we unzip this file, we should not overwrite the existing .py files. 7 | ``` 8 | wget http://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/assignments/a4-code.zip 9 | 10 | # When prompted to overwrite, select [N]one 11 | unzip a4-code.zip 12 | ``` 13 | 14 | ## Original Course Website: 15 | http://www.cs.toronto.edu/~rgrosse/courses/csc421_2019/ 16 | -------------------------------------------------------------------------------- /CycleGANSolution/a4-code-v2-updated/README.md: -------------------------------------------------------------------------------- 1 | Vanilla GAN 2 | ## 1. Run the model checker 3 | ``` 4 | python3 model_checker.py 5 | ``` 6 | 7 | ## 2. Run vanilla GAN 8 | ``` 9 | python3 vanilla_gan.py 10 | ``` 11 | 12 | ## 3. Run Cycle GAN 13 | ``` 14 | python3 cycle_gan.py --use_cycle_consistency_loss 15 | ``` 16 | -------------------------------------------------------------------------------- /CycleGANSolution/a4-code-v2-updated/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from torch.utils.data import DataLoader 5 | 6 | from torchvision import datasets 7 | from torchvision import transforms 8 | 9 | 10 | def get_emoji_loader(emoji_type, opts): 11 | """Creates training and test data loaders. 12 | """ 13 | transform = transforms.Compose([ 14 | transforms.Scale(opts.image_size), 15 | transforms.ToTensor(), 16 | transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) 17 | ]) 18 | 19 | train_path = os.path.join('./emojis', emoji_type) 20 | test_path = os.path.join('./emojis', 'Test_{}'.format(emoji_type)) 21 | 22 | train_dataset = datasets.ImageFolder(train_path, transform) 23 | test_dataset = datasets.ImageFolder(test_path, transform) 24 | 25 | train_dloader = DataLoader(dataset=train_dataset, batch_size=opts.batch_size, shuffle=True, num_workers=opts.num_workers) 26 | test_dloader = DataLoader(dataset=test_dataset, batch_size=opts.batch_size, shuffle=False, num_workers=opts.num_workers) 27 | 28 | return train_dloader, test_dloader 29 | -------------------------------------------------------------------------------- /CycleGANSolution/a4-code-v2-updated/model_checker.py: -------------------------------------------------------------------------------- 1 | # CSC 321, Assignment 4 2 | # 3 | # This is a script to check whether the outputs of your CycleGenerator, DCDiscriminator, and 4 | # CycleGenerator models produce the expected outputs. 5 | # 6 | # NOTE THAT THIS MODEL CHECKER IS PROVIDED FOR CONVENIENCE ONLY, AND MAY PRODUCE FALSE NEGATIVES. 7 | # DO NOT USE THIS AS THE ONLY WAY TO CHECK THAT YOUR MODEL IS CORRECT. 8 | # 9 | # Usage: 10 | # ====== 11 | # 12 | # python model_checker.py 13 | # 14 | 15 | import warnings 16 | warnings.filterwarnings("ignore") 17 | 18 | # Torch imports 19 | import torch 20 | from torch.autograd import Variable 21 | 22 | # Numpy 23 | import numpy as np 24 | 25 | # Local imports 26 | from models import DCGenerator, DCDiscriminator, CycleGenerator 27 | 28 | 29 | def count_parameters(model): 30 | """Finds the total number of trainable parameters in a model. 31 | """ 32 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 33 | 34 | 35 | def sample_noise(dim): 36 | """ 37 | Generate a PyTorch Tensor of uniform random noise. 38 | 39 | Input: 40 | - batch_size: Integer giving the batch size of noise to generate. 41 | - dim: Integer giving the dimension of noise to generate. 
42 | 43 | Output: 44 | - A PyTorch Tensor of shape (1, dim, 1, 1) containing uniform 45 | random noise in the range (-1, 1). 46 | """ 47 | return Variable(torch.rand(1, dim) * 2 - 1).unsqueeze(2).unsqueeze(3) 48 | 49 | 50 | def check_dc_generator(): 51 | """Checks the output and number of parameters of the DCGenerator class. 52 | """ 53 | state = torch.load('checker_files/dc_generator.pt') 54 | 55 | G = DCGenerator(noise_size=100, conv_dim=32) 56 | G.load_state_dict(state['state_dict']) 57 | noise = state['input'] 58 | dc_generator_expected = state['output'] 59 | 60 | output = G(noise) 61 | output_np = output.data.cpu().numpy() 62 | 63 | if np.allclose(output_np, dc_generator_expected, atol=1e-06): 64 | print('DCGenerator output: EQUAL') 65 | else: 66 | print('DCGenerator output: NOT EQUAL') 67 | 68 | num_params = count_parameters(G) 69 | expected_params = 370624 70 | 71 | print('DCGenerator #params = {}, expected #params = {}, {}'.format( 72 | num_params, expected_params, 'EQUAL' if num_params == expected_params else 'NOT EQUAL')) 73 | 74 | print('-' * 80) 75 | 76 | 77 | def check_dc_discriminator(): 78 | """Checks the output and number of parameters of the DCDiscriminator class. 79 | """ 80 | state = torch.load('checker_files/dc_discriminator.pt') 81 | 82 | D = DCDiscriminator(conv_dim=32) 83 | D.load_state_dict(state['state_dict']) 84 | images = state['input'] 85 | dc_discriminator_expected = state['output'] 86 | 87 | output = D(images) 88 | output_np = output.data.cpu().numpy() 89 | 90 | if np.allclose(output_np, dc_discriminator_expected, atol=1e-06): 91 | print('DCDiscriminator output: EQUAL') 92 | else: 93 | print('DCDiscriminator output: NOT EQUAL') 94 | 95 | num_params = count_parameters(D) 96 | expected_params = 167872 97 | 98 | print('DCDiscriminator #params = {}, expected #params = {}, {}'.format( 99 | num_params, expected_params, 'EQUAL' if num_params == expected_params else 'NOT EQUAL')) 100 | 101 | print('-' * 80) 102 | 103 | 104 | def check_cycle_generator(): 105 | """Checks the output and number of parameters of the CycleGenerator class. 106 | """ 107 | state = torch.load('checker_files/cycle_generator.pt') 108 | 109 | G_XtoY = CycleGenerator(conv_dim=32, init_zero_weights=False) 110 | G_XtoY.load_state_dict(state['state_dict']) 111 | images = state['input'] 112 | cycle_generator_expected = state['output'] 113 | 114 | output = G_XtoY(images) 115 | output_np = output.data.cpu().numpy() 116 | 117 | if np.allclose(output_np, cycle_generator_expected, atol=1e-06): 118 | print('CycleGenerator output: EQUAL') 119 | else: 120 | print('CycleGenerator output: NOT EQUAL') 121 | 122 | num_params = count_parameters(G_XtoY) 123 | expected_params = 105856 124 | 125 | print('CycleGenerator #params = {}, expected #params = {}, {}'.format( 126 | num_params, expected_params, 'EQUAL' if num_params == expected_params else 'NOT EQUAL')) 127 | 128 | print('-' * 80) 129 | 130 | 131 | if __name__ == '__main__': 132 | 133 | try: 134 | check_dc_generator() 135 | except: 136 | print('Crashed while checking DCGenerator. Maybe not implemented yet?') 137 | 138 | try: 139 | check_dc_discriminator() 140 | except: 141 | print('Crashed while checking DCDiscriminator. Maybe not implemented yet?') 142 | 143 | #try: 144 | check_cycle_generator() 145 | #except: 146 | # print('Crashed while checking CycleGenerator. 
Maybe not implemented yet?') 147 | -------------------------------------------------------------------------------- /CycleGANSolution/a4-code-v2-updated/models.py: -------------------------------------------------------------------------------- 1 | # CSC 321, Assignment 4 2 | # 3 | # This file contains the models used for both parts of the assignment: 4 | # 5 | # - DCGenerator --> Used in the vanilla GAN in Part 1 6 | # - CycleGenerator --> Used in the CycleGAN in Part 2 7 | # - DCDiscriminator --> Used in both the vanilla GAN and CycleGAN (Parts 1 and 2) 8 | # 9 | # For the assignment, you are asked to create the architectures of these three networks by 10 | # filling in the __init__ methods in the DCGenerator, CycleGenerator, and DCDiscriminator classes. 11 | # Note that the forward passes of these models are provided for you, so the only part you need to 12 | # fill in is __init__. 13 | 14 | import pdb 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | 19 | 20 | def deconv(in_channels, out_channels, kernel_size, stride=2, padding=1, batch_norm=True): 21 | """Creates a transposed-convolutional layer, with optional batch normalization. 22 | """ 23 | layers = [] 24 | layers.append(nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride, padding, bias=False)) 25 | if batch_norm: 26 | layers.append(nn.BatchNorm2d(out_channels)) 27 | return nn.Sequential(*layers) 28 | 29 | 30 | def conv(in_channels, out_channels, kernel_size, stride=2, padding=1, batch_norm=True, init_zero_weights=False): 31 | """Creates a convolutional layer, with optional batch normalization. 32 | """ 33 | layers = [] 34 | conv_layer = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=False) 35 | if init_zero_weights: 36 | conv_layer.weight.data = torch.randn(out_channels, in_channels, kernel_size, kernel_size) * 0.001 37 | layers.append(conv_layer) 38 | 39 | if batch_norm: 40 | layers.append(nn.BatchNorm2d(out_channels)) 41 | return nn.Sequential(*layers) 42 | 43 | 44 | class DCGenerator(nn.Module): 45 | def __init__(self, noise_size, conv_dim): 46 | super(DCGenerator, self).__init__() 47 | 48 | ########################################### 49 | ## FILL THIS IN: CREATE ARCHITECTURE ## 50 | ########################################### 51 | 52 | self.deconv1 = deconv(noise_size, 128, 4, stride=1, padding=0, batch_norm=True) 53 | self.deconv2 = deconv(128, 64, 4, stride=2, padding=1, batch_norm=True) 54 | self.deconv3 = deconv(64, 32, 4, stride=2, padding=1, batch_norm=True) 55 | self.deconv4 = deconv(32, 3, 4, stride=2, padding=1, batch_norm=False) 56 | 57 | def forward(self, z): 58 | """Generates an image given a sample of random noise. 59 | 60 | Input 61 | ----- 62 | z: BS x noise_size x 1 x 1 --> 16x100x1x1 63 | 64 | Output 65 | ------ 66 | out: BS x channels x image_width x image_height --> 16x3x32x32 67 | """ 68 | 69 | out = F.relu(self.deconv1(z)) 70 | out = F.relu(self.deconv2(out)) 71 | out = F.relu(self.deconv3(out)) 72 | out = F.tanh(self.deconv4(out)) 73 | return out 74 | 75 | 76 | class ResnetBlock(nn.Module): 77 | def __init__(self, conv_dim): 78 | super(ResnetBlock, self).__init__() 79 | self.conv_layer = conv(in_channels=conv_dim, out_channels=conv_dim, kernel_size=3, stride=1, padding=1) 80 | 81 | def forward(self, x): 82 | out = x + self.conv_layer(x) 83 | return out 84 | 85 | 86 | class CycleGenerator(nn.Module): 87 | """Defines the architecture of the generator network. 
88 | Note: Both generators G_XtoY and G_YtoX have the same architecture in this assignment. 89 | """ 90 | def __init__(self, conv_dim=64, init_zero_weights=False): 91 | super(CycleGenerator, self).__init__() 92 | 93 | ########################################### 94 | ## FILL THIS IN: CREATE ARCHITECTURE ## 95 | ########################################### 96 | 97 | # 1. Define the encoder part of the generator (that extracts features from the input image) 98 | self.conv1 = conv(3, 32, 4, stride=2, padding=1, batch_norm=True, init_zero_weights=init_zero_weights) 99 | self.conv2 = conv(32, 64, 4, stride=2, padding=1, batch_norm=True, init_zero_weights=init_zero_weights) 100 | 101 | # 2. Define the transformation part of the generator 102 | self.resnet_block = ResnetBlock(64) 103 | 104 | # 3. Define the decoder part of the generator (that builds up the output image from features) 105 | self.deconv1 = deconv(64, 32, 4, stride=2, padding=1, batch_norm=True) 106 | self.deconv2 = deconv(32, 3, 4, stride=2, padding=1, batch_norm=False) 107 | 108 | def forward(self, x): 109 | """Generates an image conditioned on an input image. 110 | 111 | Input 112 | ----- 113 | x: BS x 3 x 32 x 32 114 | 115 | Output 116 | ------ 117 | out: BS x 3 x 32 x 32 118 | """ 119 | 120 | out = F.relu(self.conv1(x)) 121 | out = F.relu(self.conv2(out)) 122 | 123 | out = F.relu(self.resnet_block(out)) 124 | 125 | out = F.relu(self.deconv1(out)) 126 | out = F.tanh(self.deconv2(out)) 127 | 128 | return out 129 | 130 | 131 | class DCDiscriminator(nn.Module): 132 | """Defines the architecture of the discriminator network. 133 | Note: Both discriminators D_X and D_Y have the same architecture in this assignment. 134 | """ 135 | def __init__(self, conv_dim=64): 136 | super(DCDiscriminator, self).__init__() 137 | 138 | ########################################### 139 | ## FILL THIS IN: CREATE ARCHITECTURE ## 140 | ########################################### 141 | 142 | self.conv1 = conv(3, 32, 4, stride=2, padding=1, batch_norm=True, init_zero_weights=False) 143 | self.conv2 = conv(32, 64, 4, stride=2, padding=1, batch_norm=True, init_zero_weights=False) 144 | self.conv3 = conv(64, 128, 4, stride=2, padding=1, batch_norm=True, init_zero_weights=False) 145 | self.conv4 = conv(128, 1, 4, stride=1, padding=0, batch_norm=False, init_zero_weights=False) 146 | 147 | def forward(self, x): 148 | 149 | out = F.relu(self.conv1(x)) 150 | out = F.relu(self.conv2(out)) 151 | out = F.relu(self.conv3(out)) 152 | 153 | out = self.conv4(out).squeeze() 154 | out = F.sigmoid(out) 155 | return out 156 | -------------------------------------------------------------------------------- /CycleGANSolution/a4-code-v2-updated/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | def to_var(x): 8 | """Converts numpy to variable.""" 9 | if torch.cuda.is_available(): 10 | x = x.cuda() 11 | return Variable(x) 12 | 13 | 14 | def to_data(x): 15 | """Converts variable to numpy.""" 16 | if torch.cuda.is_available(): 17 | x = x.cpu() 18 | return x.data.numpy() 19 | 20 | 21 | def create_dir(directory): 22 | """Creates a directory if it does not already exist. 
23 | """ 24 | if not os.path.exists(directory): 25 | os.makedirs(directory) 26 | -------------------------------------------------------------------------------- /CycleGANSolution/a4-handout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/CycleGANSolution/a4-handout.pdf -------------------------------------------------------------------------------- /GAN/README.md: -------------------------------------------------------------------------------- 1 | # Vanilla GAN 2 | Homework Assignment: 3 | https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/assignments/a4-handout.pdf 4 | 5 | Download Code: 6 | ``` 7 | wget http://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/assignments/a4-code.zip 8 | unzip a4-code.zip 9 | ``` 10 | -------------------------------------------------------------------------------- /GAN/Wiley's/models.py: -------------------------------------------------------------------------------- 1 | # CSC 321, Assignment 4 2 | # 3 | # This file contains the models used for both parts of the assignment: 4 | # 5 | # - DCGenerator --> Used in the vanilla GAN in Part 1 6 | # - CycleGenerator --> Used in the CycleGAN in Part 2 7 | # - DCDiscriminator --> Used in both the vanilla GAN and CycleGAN (Parts 1 and 2) 8 | # 9 | # For the assignment, you are asked to create the architectures of these three networks by 10 | # filling in the __init__ methods in the DCGenerator, CycleGenerator, and DCDiscriminator classes. 11 | # Note that the forward passes of these models are provided for you, so the only part you need to 12 | # fill in is __init__. 13 | 14 | import pdb 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | 19 | 20 | def deconv( 21 | in_channels, out_channels, kernel_size, stride=2, padding=1, batch_norm=True 22 | ): 23 | """Creates a transposed-convolutional layer, with optional batch normalization. 24 | """ 25 | layers = [] 26 | layers.append( 27 | nn.ConvTranspose2d( 28 | in_channels, out_channels, kernel_size, stride, padding, bias=False 29 | ) 30 | ) 31 | if batch_norm: 32 | layers.append(nn.BatchNorm2d(out_channels)) 33 | return nn.Sequential(*layers) 34 | 35 | 36 | def conv( 37 | in_channels, 38 | out_channels, 39 | kernel_size, 40 | stride=2, 41 | padding=1, 42 | batch_norm=True, 43 | init_zero_weights=False, 44 | ): 45 | """Creates a convolutional layer, with optional batch normalization. 
46 | """ 47 | layers = [] 48 | conv_layer = nn.Conv2d( 49 | in_channels=in_channels, 50 | out_channels=out_channels, 51 | kernel_size=kernel_size, 52 | stride=stride, 53 | padding=padding, 54 | bias=False, 55 | ) 56 | if init_zero_weights: 57 | conv_layer.weight.data = ( 58 | torch.randn(out_channels, in_channels, kernel_size, kernel_size) * 0.001 59 | ) 60 | layers.append(conv_layer) 61 | 62 | if batch_norm: 63 | layers.append(nn.BatchNorm2d(out_channels)) 64 | return nn.Sequential(*layers) 65 | 66 | 67 | class DCGenerator(nn.Module): 68 | def __init__(self, noise_size, conv_dim): 69 | super(DCGenerator, self).__init__() 70 | 71 | ########################################### 72 | ## FILL THIS IN: CREATE ARCHITECTURE ## 73 | ########################################### 74 | kernel_size = 4 75 | 76 | self.deconv1 = deconv(100, conv_dim * 4, kernel_size, padding=0) 77 | self.deconv2 = deconv(conv_dim * 4, conv_dim * 2, kernel_size) 78 | self.deconv3 = deconv(conv_dim * 2, conv_dim, kernel_size) 79 | self.deconv4 = deconv(conv_dim, 3, kernel_size, 2, batch_norm=False) 80 | 81 | def forward(self, z): 82 | """Generates an image given a sample of random noise. 83 | 84 | Input 85 | ----- 86 | z: BS x noise_size x 1 x 1 --> 16x100x1x1 87 | 88 | Output 89 | ------ 90 | out: BS x channels x image_width x image_height --> 16x3x32x32 91 | """ 92 | 93 | out = F.relu(self.deconv1(z)) 94 | out = F.relu(self.deconv2(out)) 95 | out = F.relu(self.deconv3(out)) 96 | out = F.tanh(self.deconv4(out)) 97 | return out 98 | 99 | 100 | class ResnetBlock(nn.Module): 101 | def __init__(self, conv_dim): 102 | super(ResnetBlock, self).__init__() 103 | self.conv_layer = conv( 104 | in_channels=conv_dim, 105 | out_channels=conv_dim, 106 | kernel_size=3, 107 | stride=1, 108 | padding=1, 109 | ) 110 | 111 | def forward(self, x): 112 | out = x + self.conv_layer(x) 113 | return out 114 | 115 | 116 | class CycleGenerator(nn.Module): 117 | """Defines the architecture of the generator network. 118 | Note: Both generators G_XtoY and G_YtoX have the same architecture in this assignment. 119 | """ 120 | 121 | def __init__(self, conv_dim=64, init_zero_weights=False): 122 | super(CycleGenerator, self).__init__() 123 | 124 | ########################################### 125 | ## FILL THIS IN: CREATE ARCHITECTURE ## 126 | ########################################### 127 | 128 | kernel_size = 4 129 | self.conv1 = conv(3, conv_dim, kernel_size) 130 | self.conv2 = conv(conv_dim, conv_dim * 2, kernel_size) 131 | 132 | self.resnet_block = ResnetBlock(conv_dim * 2) 133 | 134 | self.deconv1 = deconv(conv_dim * 2, conv_dim, kernel_size) 135 | self.deconv2 = deconv(conv_dim, 3, kernel_size, 2, batch_norm=False) 136 | 137 | 138 | # 1. Define the encoder part of the generator (that extracts features from the input image) 139 | # self.conv1 = conv(...) 140 | # self.conv2 = conv(...) 141 | 142 | # 2. Define the transformation part of the generator 143 | # self.resnet_block = ... 144 | 145 | # 3. Define the decoder part of the generator (that builds up the output image from features) 146 | # self.deconv1 = deconv(...) 147 | # self.deconv2 = deconv(...) 148 | 149 | def forward(self, x): 150 | """Generates an image conditioned on an input image. 
151 | 152 | Input 153 | ----- 154 | x: BS x 3 x 32 x 32 155 | 156 | Output 157 | ------ 158 | out: BS x 3 x 32 x 32 159 | """ 160 | 161 | out = F.relu(self.conv1(x)) 162 | out = F.relu(self.conv2(out)) 163 | 164 | out = F.relu(self.resnet_block(out)) 165 | 166 | out = F.relu(self.deconv1(out)) 167 | out = F.tanh(self.deconv2(out)) 168 | 169 | return out 170 | 171 | 172 | class DCDiscriminator(nn.Module): 173 | """Defines the architecture of the discriminator network. 174 | Note: Both discriminators D_X and D_Y have the same architecture in this 175 | assignment. 176 | """ 177 | 178 | def __init__(self, conv_dim=64): 179 | super(DCDiscriminator, self).__init__() 180 | 181 | ########################################### 182 | ## FILL THIS IN: CREATE ARCHITECTURE ## 183 | ########################################### 184 | 185 | kernel_size = 4 186 | self.conv1 = conv(3, conv_dim, kernel_size) 187 | self.conv2 = conv(conv_dim, conv_dim * 2, kernel_size) 188 | self.conv3 = conv(conv_dim * 2, conv_dim * 4, kernel_size) 189 | self.conv4 = conv(conv_dim * 4, 1, kernel_size, 2, padding=0, batch_norm=False) 190 | 191 | def forward(self, x): 192 | 193 | out = F.relu(self.conv1(x)) 194 | out = F.relu(self.conv2(out)) 195 | out = F.relu(self.conv3(out)) 196 | 197 | out = self.conv4(out).squeeze() 198 | out = F.sigmoid(out) 199 | return out 200 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Wiley 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /LoRA/lora_hello_world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "private_outputs": true, 7 | "provenance": [], 8 | "gpuType": "T4", 9 | "authorship_tag": "ABX9TyPWmZoHOxQbf2DbGURay9eI", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU", 20 | "gpuClass": "standard" 21 | }, 22 | "cells": [ 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "id": "view-in-github", 27 | "colab_type": "text" 28 | }, 29 | "source": [ 30 | "\"Open" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "source": [ 36 | "!pip3 uninstall torch -y" 37 | ], 38 | "metadata": { 39 | "id": "EzfLQmy-c9fY" 40 | }, 41 | "execution_count": null, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "source": [ 47 | "!pip install torch==1.11.0" 48 | ], 49 | "metadata": { 50 | "id": "4-IPSPXGcXsP" 51 | }, 52 | "execution_count": null, 53 | "outputs": [] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "source": [ 58 | "import torch\n", 59 | "print(torch.__version__)" 60 | ], 61 | "metadata": { 62 | "id": "Et1rcuQMeDjf" 63 | }, 64 | "execution_count": null, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "id": "FK-QGuS3gsMZ" 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "print('test')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "source": [ 81 | "!sudo apt-get update" 82 | ], 83 | "metadata": { 84 | "id": "61kVAU1MirsS" 85 | }, 86 | "execution_count": null, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "source": [ 92 | "!sudo apt-get -y install git jq virtualenv" 93 | ], 94 | "metadata": { 95 | "id": "DOUW4eOIizbG" 96 | }, 97 | "execution_count": null, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "source": [ 103 | "!git clone https://github.com/microsoft/LoRA.git; cd LoRA" 104 | ], 105 | "metadata": { 106 | "id": "DEUOudE_i4Yv" 107 | }, 108 | "execution_count": null, 109 | "outputs": [] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "source": [ 114 | "!ls" 115 | ], 116 | "metadata": { 117 | "id": "bokzXL8ei8UP" 118 | }, 119 | "execution_count": null, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "source": [ 125 | "!ls LoRA/examples/NLG" 126 | ], 127 | "metadata": { 128 | "id": "WPU5_vxWjCvp" 129 | }, 130 | "execution_count": null, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "source": [ 136 | "!python3 -m pip install --upgrade pip" 137 | ], 138 | "metadata": { 139 | "id": "7C80M_rujyM5" 140 | }, 141 | "execution_count": null, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "source": [ 147 | "!python3 -m pip install -r LoRA/examples/NLG/requirement.txt" 148 | ], 149 | "metadata": { 150 | "id": "x8oqUQjHjmj4" 151 | }, 152 | "execution_count": null, 153 | "outputs": [] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "source": [ 158 | "!python3 -m pip install transformers" 159 | ], 160 | "metadata": { 161 | "id": "lS2ZmMO3klGw" 162 | }, 163 | "execution_count": null, 164 | "outputs": [] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "source": [ 169 | "!python3 -m pip install spacy tqdm tensorboard progress" 170 | ], 171 | "metadata": { 172 | "id": 
"l2uD_wpFk4gP" 173 | }, 174 | "execution_count": null, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "source": [ 180 | "import torch\n", 181 | "print(torch.__version__)" 182 | ], 183 | "metadata": { 184 | "id": "Q7dAMI4lkR2u" 185 | }, 186 | "execution_count": null, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "source": [ 192 | "%cd LoRA/examples/NLG" 193 | ], 194 | "metadata": { 195 | "id": "YbDu2w3FlC9l" 196 | }, 197 | "execution_count": null, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "source": [ 203 | "!ls" 204 | ], 205 | "metadata": { 206 | "id": "QVEol-7IlI1A" 207 | }, 208 | "execution_count": null, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "source": [ 214 | "!bash download_pretrained_checkpoints.sh" 215 | ], 216 | "metadata": { 217 | "id": "hTaJ7ZYzlMsf" 218 | }, 219 | "execution_count": null, 220 | "outputs": [] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "source": [ 225 | "!bash create_datasets.sh" 226 | ], 227 | "metadata": { 228 | "id": "e2HL_HV-lQlb" 229 | }, 230 | "execution_count": null, 231 | "outputs": [] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "source": [ 236 | "%cd ./eval" 237 | ], 238 | "metadata": { 239 | "id": "JDyJhqaUlSsf" 240 | }, 241 | "execution_count": null, 242 | "outputs": [] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "source": [ 247 | "!bash download_evalscript.sh" 248 | ], 249 | "metadata": { 250 | "id": "K5Cw5xVRl6wM" 251 | }, 252 | "execution_count": null, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "source": [ 258 | "%cd .." 259 | ], 260 | "metadata": { 261 | "id": "Smyk-DKyl8rP" 262 | }, 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "source": [ 269 | "!ls" 270 | ], 271 | "metadata": { 272 | "id": "n8rIjmXNmWj3" 273 | }, 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "source": [ 280 | "!python3 -m pip install loralib" 281 | ], 282 | "metadata": { 283 | "id": "gQX04dU0oGW1" 284 | }, 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "source": [ 291 | "!python3 -m torch.distributed.launch --nproc_per_node=1 src/gpt2_ft.py \\\n", 292 | " --train_data ./data/e2e/train.jsonl \\\n", 293 | " --valid_data ./data/e2e/valid.jsonl \\\n", 294 | " --train_batch_size 4 \\\n", 295 | " --grad_acc 1 \\\n", 296 | " --valid_batch_size 2 \\\n", 297 | " --seq_len 512 \\\n", 298 | " --model_card gpt2.md \\\n", 299 | " --init_checkpoint ./pretrained_checkpoints/gpt2-medium-pytorch_model.bin \\\n", 300 | " --platform local \\\n", 301 | " --clip 0.0 \\\n", 302 | " --lr 0.0002 \\\n", 303 | " --weight_decay 0.01 \\\n", 304 | " --correct_bias \\\n", 305 | " --adam_beta2 0.999 \\\n", 306 | " --scheduler linear \\\n", 307 | " --warmup_step 500 \\\n", 308 | " --max_epoch 5 \\\n", 309 | " --save_interval 1000 \\\n", 310 | " --lora_dim 4 \\\n", 311 | " --lora_alpha 32 \\\n", 312 | " --lora_dropout 0.1 \\\n", 313 | " --label_smooth 0.1 \\\n", 314 | " --work_dir ./trained_models/GPT2_M/e2e \\\n", 315 | " --random_seed 110" 316 | ], 317 | "metadata": { 318 | "id": "wWn2H2somOZ5" 319 | }, 320 | "execution_count": null, 321 | "outputs": [] 322 | } 323 | ] 324 | } -------------------------------------------------------------------------------- /LoRA/lora_hello_world2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | 
"nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "private_outputs": true, 7 | "provenance": [], 8 | "gpuType": "T4", 9 | "authorship_tag": "ABX9TyPfd2szk9I+NCou6SCoJGZw", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU", 20 | "gpuClass": "standard" 21 | }, 22 | "cells": [ 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "id": "view-in-github", 27 | "colab_type": "text" 28 | }, 29 | "source": [ 30 | "\"Open" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "source": [ 36 | "!pip3 uninstall torch -y" 37 | ], 38 | "metadata": { 39 | "id": "EzfLQmy-c9fY" 40 | }, 41 | "execution_count": null, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "source": [ 47 | "!pip install torch==1.11.0" 48 | ], 49 | "metadata": { 50 | "id": "4-IPSPXGcXsP" 51 | }, 52 | "execution_count": null, 53 | "outputs": [] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "source": [ 58 | "import torch\n", 59 | "print(torch.__version__)" 60 | ], 61 | "metadata": { 62 | "id": "Et1rcuQMeDjf" 63 | }, 64 | "execution_count": null, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "id": "FK-QGuS3gsMZ" 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "print('test')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "source": [ 81 | "!sudo apt-get update" 82 | ], 83 | "metadata": { 84 | "id": "61kVAU1MirsS" 85 | }, 86 | "execution_count": null, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "source": [ 92 | "!sudo apt-get -y install git jq virtualenv" 93 | ], 94 | "metadata": { 95 | "id": "DOUW4eOIizbG" 96 | }, 97 | "execution_count": null, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "source": [ 103 | "!git clone https://github.com/microsoft/LoRA.git; cd LoRA" 104 | ], 105 | "metadata": { 106 | "id": "DEUOudE_i4Yv" 107 | }, 108 | "execution_count": null, 109 | "outputs": [] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "source": [ 114 | "!ls" 115 | ], 116 | "metadata": { 117 | "id": "bokzXL8ei8UP" 118 | }, 119 | "execution_count": null, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "source": [ 125 | "!ls LoRA/examples/NLG" 126 | ], 127 | "metadata": { 128 | "id": "WPU5_vxWjCvp" 129 | }, 130 | "execution_count": null, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "source": [ 136 | "!python3 -m pip install --upgrade pip" 137 | ], 138 | "metadata": { 139 | "id": "7C80M_rujyM5" 140 | }, 141 | "execution_count": null, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "source": [ 147 | "!python3 -m pip install -r LoRA/examples/NLG/requirement.txt" 148 | ], 149 | "metadata": { 150 | "id": "x8oqUQjHjmj4" 151 | }, 152 | "execution_count": null, 153 | "outputs": [] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "source": [ 158 | "!python3 -m pip install transformers" 159 | ], 160 | "metadata": { 161 | "id": "lS2ZmMO3klGw" 162 | }, 163 | "execution_count": null, 164 | "outputs": [] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "source": [ 169 | "!python3 -m pip install spacy tqdm tensorboard progress" 170 | ], 171 | "metadata": { 172 | "id": "l2uD_wpFk4gP" 173 | }, 174 | "execution_count": null, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "source": [ 180 | "import torch\n", 181 | "print(torch.__version__)" 182 | ], 183 | "metadata": { 184 | 
"id": "Q7dAMI4lkR2u" 185 | }, 186 | "execution_count": null, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "source": [ 192 | "%cd LoRA/examples/NLG" 193 | ], 194 | "metadata": { 195 | "id": "YbDu2w3FlC9l" 196 | }, 197 | "execution_count": null, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "source": [ 203 | "!ls" 204 | ], 205 | "metadata": { 206 | "id": "QVEol-7IlI1A" 207 | }, 208 | "execution_count": null, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "source": [ 214 | "!bash download_pretrained_checkpoints.sh" 215 | ], 216 | "metadata": { 217 | "id": "hTaJ7ZYzlMsf" 218 | }, 219 | "execution_count": null, 220 | "outputs": [] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "source": [ 225 | "!bash create_datasets2.sh" 226 | ], 227 | "metadata": { 228 | "id": "e2HL_HV-lQlb" 229 | }, 230 | "execution_count": null, 231 | "outputs": [] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "source": [ 236 | "%cd ./eval" 237 | ], 238 | "metadata": { 239 | "id": "JDyJhqaUlSsf" 240 | }, 241 | "execution_count": null, 242 | "outputs": [] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "source": [ 247 | "!bash download_evalscript.sh" 248 | ], 249 | "metadata": { 250 | "id": "K5Cw5xVRl6wM" 251 | }, 252 | "execution_count": null, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "source": [ 258 | "%cd .." 259 | ], 260 | "metadata": { 261 | "id": "Smyk-DKyl8rP" 262 | }, 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "source": [ 269 | "!ls" 270 | ], 271 | "metadata": { 272 | "id": "n8rIjmXNmWj3" 273 | }, 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "source": [ 280 | "!python3 -m pip install loralib" 281 | ], 282 | "metadata": { 283 | "id": "gQX04dU0oGW1" 284 | }, 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "source": [ 291 | "!ls -l data/e2e" 292 | ], 293 | "metadata": { 294 | "id": "5BvLJ03yZx_M" 295 | }, 296 | "execution_count": null, 297 | "outputs": [] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "source": [ 302 | "!python3 -m torch.distributed.launch --nproc_per_node=1 src/gpt2_ft.py \\\n", 303 | " --train_data ./data/e2e/train.jsonl \\\n", 304 | " --valid_data ./data/e2e/valid.jsonl \\\n", 305 | " --train_batch_size 4 \\\n", 306 | " --grad_acc 1 \\\n", 307 | " --valid_batch_size 2 \\\n", 308 | " --seq_len 512 \\\n", 309 | " --model_card gpt2.md \\\n", 310 | " --init_checkpoint ./pretrained_checkpoints/gpt2-medium-pytorch_model.bin \\\n", 311 | " --platform local \\\n", 312 | " --clip 0.0 \\\n", 313 | " --lr 0.0002 \\\n", 314 | " --weight_decay 0.01 \\\n", 315 | " --correct_bias \\\n", 316 | " --adam_beta2 0.999 \\\n", 317 | " --scheduler linear \\\n", 318 | " --warmup_step 500 \\\n", 319 | " --max_epoch 5 \\\n", 320 | " --save_interval 1000 \\\n", 321 | " --lora_dim 4 \\\n", 322 | " --lora_alpha 32 \\\n", 323 | " --lora_dropout 0.1 \\\n", 324 | " --label_smooth 0.1 \\\n", 325 | " --work_dir ./trained_models/GPT2_M/e2e \\\n", 326 | " --random_seed 110" 327 | ], 328 | "metadata": { 329 | "id": "wWn2H2somOZ5" 330 | }, 331 | "execution_count": null, 332 | "outputs": [] 333 | } 334 | ] 335 | } -------------------------------------------------------------------------------- /MachineTranslation/README.md: -------------------------------------------------------------------------------- 1 | # Machine Translation 2 | 3 | ``` 4 | # Download the 
dataset 5 | wget https://www.manythings.org/anki/fra-eng.zip 6 | 7 | unzip fra-eng.zip 8 | ``` 9 | 10 | ## Tutorials 11 | - https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html# 12 | - [Link to Colab Notebook](https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/a60617788061539b5449701ae76aee56/seq2seq_translation_tutorial.ipynb) 13 | - [TorchText and nn.Transformer](https://pytorch.org/tutorials/beginner/transformer_tutorial.html) 14 | - https://towardsdatascience.com/neural-machine-translation-15ecf6b0b 15 | - https://medium.com/analytics-vidhya/a-must-read-nlp-tutorial-on-neural-machine-translation-the-technique-powering-google-translate-c5c8d97d7587 16 | 17 | ## Datasets 18 | - [Downloads](https://tatoeba.org/eng/downloads) 19 | - [Splitting language pairs into individual text files](https://www.manythings.org/anki/) 20 | -------------------------------------------------------------------------------- /MachineTranslation/ReferenceExample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "ReferenceExample.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyNfR2AxE7LGpIZL/UWgBjMn", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "CsFfALRqWl-r" 32 | }, 33 | "source": [ 34 | "# https://github.com/andrewpeng02/transformer-translation\n", 35 | "!git clone https://github.com/andrewpeng02/transformer-translation.git" 36 | ], 37 | "execution_count": null, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "15vsyh8nXIyo" 44 | }, 45 | "source": [ 46 | "!python3 -m pip install --upgrade pip\n", 47 | "!python3 -m pip install click==7.0\n", 48 | "!python3 -m pip install dill==0.3.1.1 --use-feature=2020-resolver\n", 49 | "!python3 -m pip install einops==0.1.0\n", 50 | "!python3 -m pip install en-core-web-sm==2.1.0\n", 51 | "!python3 -m pip install fr-core-news-sm==2.1.0\n", 52 | "!python3 -m pip install joblib==0.13.2\n", 53 | "!python3 -m pip install torchtext==0.4.0" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "R4pZuNrQX_Gq" 62 | }, 63 | "source": [ 64 | "!ls transformer-translation/data\n", 65 | "%cd transformer-translation/data\n", 66 | "!wget http://www.manythings.org/anki/fra-eng.zip\n", 67 | "!unzip fra-eng.zip\n", 68 | "%cd ../../\n", 69 | "!ls\n" 70 | ], 71 | "execution_count": null, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "metadata": { 77 | "id": "0kL2dvTwZxW1" 78 | }, 79 | "source": [ 80 | "cd transformer-translation" 81 | ], 82 | "execution_count": null, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "id": "bS3Kkzrlbmzr" 89 | }, 90 | "source": [ 91 | "!python3 -m spacy download en\n", 92 | "!python3 -m spacy download fr" 93 | ], 94 | "execution_count": null, 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "metadata": { 100 | "id": "rAyZTlUKYuun" 101 | }, 102 | "source": [ 103 | "!python3 process-tatoeba-data.py\n", 104 | "!python3 preprocess-data.py" 
105 | ], 106 | "execution_count": null, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "metadata": { 112 | "id": "DH3jMH4pZM1d" 113 | }, 114 | "source": [ 115 | "!ls\n", 116 | "!echo -----------------\n", 117 | "!ls data/processed\n", 118 | "!echo -----------------\n", 119 | "!ls -l data/processed/fr" 120 | ], 121 | "execution_count": null, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "wMtzQQUQb6Bw" 128 | }, 129 | "source": [ 130 | "!python3 train.py" 131 | ], 132 | "execution_count": null, 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "metadata": { 138 | "id": "xse_C7t_prc3" 139 | }, 140 | "source": [ 141 | "!python3 translate-sentence.py" 142 | ], 143 | "execution_count": null, 144 | "outputs": [] 145 | } 146 | ] 147 | } -------------------------------------------------------------------------------- /NeuralArchitectureSearch/Autokeras.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Autokeras.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMXe9x8u7a1Wy8NJkVoxkF6", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "tKCAuX00SO_k", 32 | "colab_type": "text" 33 | }, 34 | "source": [ 35 | "# AutoKeras\n", 36 | "\n", 37 | "[Paper](https://arxiv.org/pdf/1806.10282.pdf)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "id": "0fQ17-24Vjbb", 44 | "colab_type": "text" 45 | }, 46 | "source": [ 47 | "In this example, we get an accuracy of 91.13% with only 3 trials and 3 epochs. The above minimal code AutoKeras example shows how simple and easy AutoKeras is to use.\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "id": "JeGbfKc8SAjv", 54 | "colab_type": "text" 55 | }, 56 | "source": [ 57 | "# Questions\n", 58 | "## 1. What is Edit-Distance in a Neural Network?\n", 59 | "Edit-distance is the number of operations needed to morph one architecture into another architecture." 
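To make that definition concrete, here is a toy illustration — not the actual Auto-Keras kernel, just a Levenshtein-style count over simplified layer descriptors — of how many insert/delete/change operations separate two architectures:

```python
def architecture_edit_distance(a, b):
    # Toy edit-distance between two architectures given as lists of layer
    # descriptors, e.g. ("conv", 64). Allowed operations: insert a layer,
    # delete a layer, or change a layer.
    m, n = len(a), len(b)
    dist = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dist[i][0] = i  # delete every remaining layer of a
    for j in range(n + 1):
        dist[0][j] = j  # insert every remaining layer of b
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            change = 0 if a[i - 1] == b[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,             # delete
                             dist[i][j - 1] + 1,             # insert
                             dist[i - 1][j - 1] + change)    # change
    return dist[m][n]

net_a = [("conv", 32), ("conv", 64), ("dense", 10)]
net_b = [("conv", 32), ("conv", 128), ("conv", 128), ("dense", 10)]
print(architecture_edit_distance(net_a, net_b))  # 2: widen one conv, insert one conv
```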
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "b4q_-v76Lht7", 66 | "colab_type": "code", 67 | "colab": {} 68 | }, 69 | "source": [ 70 | "!python3 -m pip install autokeras\n", 71 | "!pip install tensorflow-gpu==2.1.0" 72 | ], 73 | "execution_count": 0, 74 | "outputs": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "metadata": { 79 | "id": "s9icubivMy5Y", 80 | "colab_type": "code", 81 | "colab": {} 82 | }, 83 | "source": [ 84 | "%tensorflow_version 2.x" 85 | ], 86 | "execution_count": 0, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "iDaqLMGGMj_y", 93 | "colab_type": "code", 94 | "colab": {} 95 | }, 96 | "source": [ 97 | "import tensorflow as tf\n", 98 | "print(tf.__version__)" 99 | ], 100 | "execution_count": 0, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "metadata": { 106 | "id": "K1bwRM-ELrD1", 107 | "colab_type": "code", 108 | "colab": {} 109 | }, 110 | "source": [ 111 | "#from tensorflow.keras.datasets import mnist\n", 112 | "from tensorflow.keras.datasets import fashion_mnist as mnist\n", 113 | "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n", 114 | "print(x_train.shape) # (60000, 28, 28)\n", 115 | "print(y_train.shape) # (60000,)\n", 116 | "print(y_train[:3]) # array([7, 2, 1], dtype=uint8)" 117 | ], 118 | "execution_count": 0, 119 | "outputs": [] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "id": "qheYaZkjL8kv", 125 | "colab_type": "code", 126 | "colab": {} 127 | }, 128 | "source": [ 129 | "import autokeras as ak\n", 130 | "\n", 131 | "# Initialize the image classifier.\n", 132 | "clf = ak.ImageClassifier(max_trials=3) # It tries 3 different models.\n", 133 | "# Feed the image classifier with training data.\n", 134 | "clf.fit(x_train, y_train,epochs=3)" 135 | ], 136 | "execution_count": 0, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "h919aZ_7L-yX", 143 | "colab_type": "code", 144 | "colab": {} 145 | }, 146 | "source": [ 147 | "# Predict with the best model.\n", 148 | "predicted_y = clf.predict(x_test)\n", 149 | "print(predicted_y)" 150 | ], 151 | "execution_count": 0, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "metadata": { 157 | "id": "djrTIS6kMAk-", 158 | "colab_type": "code", 159 | "colab": {} 160 | }, 161 | "source": [ 162 | "# Evaluate the best model with testing data.\n", 163 | "print(clf.evaluate(x_test, y_test))\n", 164 | "model = clf.export_model()\n", 165 | "print(model.summary())" 166 | ], 167 | "execution_count": 0, 168 | "outputs": [] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": { 173 | "id": "R7fcbu9WT_7Z", 174 | "colab_type": "text" 175 | }, 176 | "source": [ 177 | "# Resources\n", 178 | "- [Autokeras paper](https://arxiv.org/pdf/1806.10282.pdf)\n", 179 | "- [Autokeras website](https://autokeras.com/)\n", 180 | "- [Custom Autokeras Model](https://autokeras.com/tutorial/customized/)" 181 | ] 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /NeuralArchitectureSearch/NeuralArchitectureSearch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "NeuralArchitectureSearch", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyP1Xhoo+Gdh4xH/x0QaPkqg", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": 
"Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "8k2XRoS1zFOk", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Neural Architecture Search\n", 35 | "\n", 36 | "## EfficientNet\n", 37 | "\n", 38 | "### What's the difference between MBConv1 versus MBConv6?\n", 39 | "- MBConv1 expands the number of channels by a factor of 1.\n", 40 | "- MBConv6 expands the number of channels by a factor of 6.\n", 41 | "\n", 42 | "[MBConv6](https://machinethink.net/blog/mobile-architectures/)\n", 43 | "\n", 44 | "## Do you need to do your own Neural Architecture Search?\n", 45 | "- No, you can use transfer learning to apply an existing neural net architecture and retrain the network on your own dataset\n", 46 | "- [AutoML and Neural Architecture Search](https://towardsdatascience.com/everything-you-need-to-know-about-automl-and-neural-architecture-search-8db1863682bf)\n", 47 | "\n", 48 | "\n", 49 | "## Goals\n", 50 | "1. Understand EfficientNet\n", 51 | "2. Understand why EfficientNet is better than Mobilenet V3, could be a blog post\n", 52 | "3. PyTorch EfficientDet\n", 53 | "4. Run EfficientNet interence\n", 54 | "5. Does EfficientNet produce a different network depending on the dataset?\n", 55 | "6. How does MNas work?" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "id": "ZCTfU5hmxFiW", 62 | "colab_type": "text" 63 | }, 64 | "source": [ 65 | "# Talking Points\n", 66 | "## 1. What are Inverted Residual Blocks?\n", 67 | "- 1x1 Convolution Expands the Filter Dimensions --> 3x3 Depthwise Convolution --> 1x1 Separable Convolution Reduces the number of Filter Dimensions\n", 68 | "- NOTE: The Blocks used for Skip Connections do not have non-linearities (e.g. activation functions)\n", 69 | "\n", 70 | "# How to build EfficientNet\n", 71 | "## Step 1.\n", 72 | "Find the baseline EfficientNet network using Neural Architecture Search\n", 73 | "\n", 74 | "## Step 2.\n", 75 | "Scale up the baseline network using grid search" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "id": "PMA8zOk-5kPD", 82 | "colab_type": "text" 83 | }, 84 | "source": [ 85 | "# Neural Architecture Search\n", 86 | "## Tunable Parameters\n", 87 | "- Depth, Width (Channels), Resolutions" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "metadata": { 93 | "id": "ewrzyHGT3i2x", 94 | "colab_type": "code", 95 | "colab": {} 96 | }, 97 | "source": [ 98 | "!python3 -m pip install efficientnet_pytorch\n", 99 | "!python3 -m pip install torchsummary" 100 | ], 101 | "execution_count": 0, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "metadata": { 107 | "id": "9FIc6zqIEQzY", 108 | "colab_type": "code", 109 | "colab": {} 110 | }, 111 | "source": [ 112 | "!git clone https://github.com/lukemelas/EfficientNet-PyTorch" 113 | ], 114 | "execution_count": 0, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "BicJzIuGESzR", 121 | "colab_type": "code", 122 | "colab": {} 123 | }, 124 | "source": [ 125 | "!ls EfficientNet-PyTorch/examples\n", 126 | "!ls EfficientNet-PyTorch/examples/simple/\n", 127 | "!cp EfficientNet-PyTorch/examples/simple/img.jpg .\n", 128 | "!cp EfficientNet-PyTorch/examples/simple/labels_map.txt ." 
129 | ], 130 | "execution_count": 0, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "metadata": { 136 | "id": "g_0rOBICy6zP", 137 | "colab_type": "code", 138 | "colab": {} 139 | }, 140 | "source": [ 141 | "import torch\n", 142 | "import torchsummary\n", 143 | "from efficientnet_pytorch import EfficientNet\n", 144 | "\n", 145 | "model = EfficientNet.from_name('efficientnet-b0')\n", 146 | "\n", 147 | "print(model)\n", 148 | "torchsummary.summary(model, input_size=(3, 224, 224))" 149 | ], 150 | "execution_count": 0, 151 | "outputs": [] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "metadata": { 156 | "id": "pbSWEHqzEFrn", 157 | "colab_type": "code", 158 | "colab": {} 159 | }, 160 | "source": [ 161 | "import json\n", 162 | "from PIL import Image\n", 163 | "import torch\n", 164 | "from torchvision import transforms\n", 165 | "\n", 166 | "from efficientnet_pytorch import EfficientNet\n", 167 | "model = EfficientNet.from_pretrained('efficientnet-b0')\n", 168 | "\n", 169 | "# Preprocess image\n", 170 | "tfms = transforms.Compose([transforms.Resize(224), transforms.ToTensor(),\n", 171 | " transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),])\n", 172 | "img = tfms(Image.open('img.jpg')).unsqueeze(0)\n", 173 | "print(img.shape) # torch.Size([1, 3, 224, 224])\n", 174 | "\n", 175 | "# Load ImageNet class names\n", 176 | "labels_map = json.load(open('labels_map.txt'))\n", 177 | "labels_map = [labels_map[str(i)] for i in range(1000)]\n", 178 | "\n", 179 | "# Classify\n", 180 | "model.eval()\n", 181 | "with torch.no_grad():\n", 182 | " outputs = model(img)\n", 183 | "\n", 184 | "# Print predictions\n", 185 | "print('-----')\n", 186 | "for idx in torch.topk(outputs, k=5).indices.squeeze(0).tolist():\n", 187 | " prob = torch.softmax(outputs, dim=1)[0, idx].item()\n", 188 | " print('{label:<75} ({p:.2f}%)'.format(label=labels_map[idx], p=prob*100))" 189 | ], 190 | "execution_count": 0, 191 | "outputs": [] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "id": "7WzcGARdzZjW", 197 | "colab_type": "text" 198 | }, 199 | "source": [ 200 | "# Resources\n", 201 | "\n", 202 | "* [EfficientNet Paper](https://arxiv.org/pdf/1905.11946.pdf)\n", 203 | "* [EfficientDet Paper](https://arxiv.org/pdf/1911.09070.pdf)\n", 204 | "* [Learning OpenCV EfficientNet](https://www.learnopencv.com/efficientnet-theory-code/)\n", 205 | "* [Tensorflow EfficientNet Implementation](https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py)\n", 206 | "* [PyTorch Implementation](https://github.com/lukemelas/EfficientNet-PyTorch)\n", 207 | "* [MnasFPN : Learning Latency-aware Pyramid Architecture\n", 208 | "for Object Detection on Mobile Devices](https://arxiv.org/pdf/1912.01106.pdf)\n", 209 | "* [Yolo V4](https://arxiv.org/pdf/2004.10934.pdf)\n", 210 | "* [EffResNetComparison](https://colab.research.google.com/github/rwightman/pytorch-image-models/blob/master/notebooks/EffResNetComparison.ipynb#scrollTo=SKA-MF-yShDW)\n" 211 | ] 212 | } 213 | ] 214 | } -------------------------------------------------------------------------------- /ProphetCode/main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Quick Start 3 | https://facebook.github.io/prophet/docs/quick_start.html#python-api 4 | """ 5 | 6 | import pandas as pd 7 | from prophet import Prophet 8 | 9 | def main(): 10 | print('main') 11 | # Python 12 | df = 
pd.read_csv('https://raw.githubusercontent.com/facebook/prophet/main/examples/example_wp_log_peyton_manning.csv') 13 | print(df.columns) 14 | if True: 15 | df = pd.read_csv('archive/GlobalLandTemperaturesByMajorCity.csv') 16 | df = df.rename(columns={"dt": "ds", "AverageTemperature": "y"}) 17 | df = df[df['City'] == "New York"] 18 | #df = df[df['City'] == "Sydney"] 19 | #df = df[df['City'] == "Cape Town"] 20 | #df = df[df['City'] == "New Delhi"] 21 | df = df[df.y.notnull()] 22 | print(df.columns) 23 | print(df.head()) 24 | 25 | # Python 26 | m = Prophet() 27 | m.fit(df) 28 | 29 | future = m.make_future_dataframe(periods=365) 30 | print(future.tail()) 31 | 32 | forecast = m.predict(future) 33 | print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()) 34 | 35 | fig1 = m.plot(forecast) 36 | fig1.savefig('test.png') 37 | 38 | fig2 = m.plot_components(forecast) 39 | fig2.savefig('test2.png') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepLearningDemos 2 | We're a team of Machine Learning Engineers that blog about advances in Deep Learning. 3 | 4 | # 1. Blog Posts 5 | 1. [Vision Transformers for Computer Vision](https://deepganteam.medium.com/vision-transformers-for-computer-vision-9f70418fe41a) (2021-09-07) 6 | 1. [Few Shot Learning from Scratch](https://deepganteam.medium.com/few-shot-learning-from-scratch-a3422b111e05) (2021-07-06) 7 | 1. [Basic Policy Gradients with the Reparameterization Trick](https://deepganteam.medium.com/basic-policy-gradients-with-the-reparameterization-trick-24312c7dbcd) (2021-04-13) 8 | 1. [A Little Rusty? ML Refresher on Linear Regression](https://deepganteam.medium.com/a-little-rusty-ml-refresher-on-linear-regression-76ef4afc6474) (2021-02-19) 9 | 1. [Language Translation with Transformers in PyTorch](https://chatbotslife.com/language-translation-with-transformers-in-pytorch-ff8b32cf848?gi=df7018b86372) (2021-01-22) 10 | 1. [What are Transformers?](https://medium.com/@deepganteam/what-are-transformers-b687f2bcdf49) (2020-09-02) 11 | 1. [Searching for Better Neural Architecture Search](https://medium.com/@deepganteam/searching-for-better-neural-architecture-search-ea91338caa11) (2020-06-17) 12 | 1. [Making SinGAN Double](https://medium.com/@deepganteam/making-singan-double-8568490b572e) (2020-04-15) 13 | 14 | # 2. Notebooks 15 | 1. [AutoKeras Notebook](https://github.com/wileyw/DeepLearningDemos/blob/master/NeuralArchitectureSearch/Autokeras.ipynb) (2020-06-11) 16 | 17 | # 3. Extra 18 | ## Project Ideas 19 | 1. https://www.cs.toronto.edu/~graves/handwriting.html 20 | 21 | ## Specific Interesting Architectures 22 | Note: 23 | 1. Hour glass 24 | 1. U-net 25 | 1. Dense-net/Resnet 26 | 1. 
SqueezeNet 27 | 28 | -------------------------------------------------------------------------------- /RL_from_human_feedback/RL_from_human_feedback.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNv29WPr2hSjlJeI/CBi3nZ", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU", 18 | "gpuClass": "standard" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "source": [ 34 | "!git clone https://github.com/tatsu-lab/stanford_alpaca.git" 35 | ], 36 | "metadata": { 37 | "id": "Mcfmw95BVnkk" 38 | }, 39 | "execution_count": null, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "source": [ 45 | "%%python3 -m pip install -r stanford_alpaca/requirements.txt" 46 | ], 47 | "metadata": { 48 | "id": "6VQWwYyBWGQP" 49 | }, 50 | "execution_count": null, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "source": [ 56 | "!git clone https://github.com/huggingface/transformers.git" 57 | ], 58 | "metadata": { 59 | "id": "ghX-6yKfWoGe" 60 | }, 61 | "execution_count": null, 62 | "outputs": [] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "source": [ 67 | "%cd transformers" 68 | ], 69 | "metadata": { 70 | "id": "E8iEJocrWtyz" 71 | }, 72 | "execution_count": null, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "id": "I9OAmFjMQNYF" 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "import torch\n", 84 | "\n", 85 | "import sys\n", 86 | "\n", 87 | "sys.version" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "source": [ 93 | "%cd .." 
94 | ], 95 | "metadata": { 96 | "id": "0G3yWBv_XgrN" 97 | }, 98 | "execution_count": null, 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "source": [ 104 | "%ls stanford_alpaca/\n" 105 | ], 106 | "metadata": { 107 | "id": "41QrI6gNQQLS" 108 | }, 109 | "execution_count": null, 110 | "outputs": [] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "source": [ 115 | "%cd transformers" 116 | ], 117 | "metadata": { 118 | "id": "DIjJ0b8WYy5J" 119 | }, 120 | "execution_count": null, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "source": [ 126 | "!python src/transformers/models/llama/convert_llama_weights_to_hf.py \\\n", 127 | " --input_dir /path/to/downloaded/llama/weights \\\n", 128 | " --model_size 7B \\\n", 129 | " --output_dir /output/path" 130 | ], 131 | "metadata": { 132 | "id": "Y4hVfUwoZDJF" 133 | }, 134 | "execution_count": null, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "source": [ 140 | "%cd stanford_alpaca" 141 | ], 142 | "metadata": { 143 | "id": "HXpP37VVXmd-" 144 | }, 145 | "execution_count": null, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "source": [ 151 | "mkdir output" 152 | ], 153 | "metadata": { 154 | "id": "Rph7DmASX4wU" 155 | }, 156 | "execution_count": null, 157 | "outputs": [] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "source": [ 162 | "!torchrun --nproc_per_node=1 --master_port= train.py \\\n", 163 | " --model_name_or_path \\\n", 164 | " --data_path ./alpaca_data.json \\\n", 165 | " --bf16 True \\\n", 166 | " --output_dir /content/stanford_alpaca \\\n", 167 | " --num_train_epochs 3 \\\n", 168 | " --per_device_train_batch_size 4 \\\n", 169 | " --per_device_eval_batch_size 4 \\\n", 170 | " --gradient_accumulation_steps 8 \\\n", 171 | " --evaluation_strategy \"no\" \\\n", 172 | " --save_strategy \"steps\" \\\n", 173 | " --save_steps 2000 \\\n", 174 | " --save_total_limit 1 \\\n", 175 | " --learning_rate 2e-5 \\\n", 176 | " --weight_decay 0. 
\\\n", 177 | " --warmup_ratio 0.03 \\\n", 178 | " --lr_scheduler_type \"cosine\" \\\n", 179 | " --logging_steps 1 \\\n", 180 | " --fsdp \"full_shard auto_wrap\" \\\n", 181 | " --fsdp_transformer_layer_cls_to_wrap 'LLaMADecoderLayer' \\\n", 182 | " --tf32 True" 183 | ], 184 | "metadata": { 185 | "id": "U5NIKbFvXpPW" 186 | }, 187 | "execution_count": null, 188 | "outputs": [] 189 | } 190 | ] 191 | } -------------------------------------------------------------------------------- /ReinforcmentLearning/simple_example.py: -------------------------------------------------------------------------------- 1 | import gym 2 | env = gym.make("MsPacman-ram-v0") 3 | observation = env.reset() 4 | import time 5 | for _ in range(1000): 6 | env.render() 7 | action = env.action_space.sample() # your agent here (this takes random actions) 8 | observation, reward, done, info = env.step(action) 9 | 10 | if done: 11 | observation = env.reset() 12 | env.close() 13 | -------------------------------------------------------------------------------- /SinGAN/SinGANOfficialImplementation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "SinGANOfficialImplementation.ipynb", 7 | "provenance": [], 8 | "private_outputs": true, 9 | "authorship_tag": "ABX9TyPERozuok0RMoHj/JkJzqSS", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "_teh4A4skGRT", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | "# SinGAN\n", 37 | "\n", 38 | "[Official SinGAN Repository](https://github.com/tamarott/SinGAN)\n", 39 | "\n", 40 | "In this notebook, we will implement and create a SinGAN homework assignment for other's to learn how to implement SinGAN as well." 
41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "id": "TF-QL9B0z7sl", 47 | "colab_type": "code", 48 | "colab": {} 49 | }, 50 | "source": [ 51 | "!git clone https://github.com/tamarott/SinGAN.git" 52 | ], 53 | "execution_count": 0, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "metadata": { 59 | "id": "ul_FoSC40EjR", 60 | "colab_type": "code", 61 | "colab": {} 62 | }, 63 | "source": [ 64 | "%cd /content/SinGAN\n", 65 | "!ls\n", 66 | "!pwd\n", 67 | "!python3 main_train.py --input_name birds.png" 68 | ], 69 | "execution_count": 0, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "ktEI7QEk1Wmf", 76 | "colab_type": "code", 77 | "colab": {} 78 | }, 79 | "source": [ 80 | "!python3 random_samples.py --input_name birds.png --mode random_samples_arbitrary_sizes --scale_h 1 --scale_v 1" 81 | ], 82 | "execution_count": 0, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "id": "VBErCpu1LKd8", 89 | "colab_type": "code", 90 | "colab": {} 91 | }, 92 | "source": [ 93 | "!ls\n", 94 | "!ls -l Output/RandomSamples/birds\n", 95 | "!ls -l Output/RandomSamples/birds/gen_start_scale=0" 96 | ], 97 | "execution_count": 0, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "metadata": { 103 | "id": "qTHAHNbnL5W6", 104 | "colab_type": "code", 105 | "colab": {} 106 | }, 107 | "source": [ 108 | "import cv2\n", 109 | "import glob\n", 110 | "from google.colab.patches import cv2_imshow\n", 111 | "\n", 112 | "print('original image')\n", 113 | "original_img_path = 'Input/Images/birds.png'\n", 114 | "img = cv2.imread(original_img_path)\n", 115 | "cv2_imshow(img)\n", 116 | "\n", 117 | "print('random sample')\n", 118 | "img_paths = glob.glob('Output/RandomSamples/birds/gen_start_scale=0/*.png')\n", 119 | "img = cv2.imread(img_paths[0])\n", 120 | "cv2_imshow(img)" 121 | ], 122 | "execution_count": 0, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "6jpfpY2_kFeX", 129 | "colab_type": "code", 130 | "colab": {} 131 | }, 132 | "source": [ 133 | "import torch\n", 134 | "\n", 135 | "print('Implement SinGAN here')\n", 136 | "print(torch)" 137 | ], 138 | "execution_count": 0, 139 | "outputs": [] 140 | } 141 | ] 142 | } 143 | -------------------------------------------------------------------------------- /SinGAN/SinGAN_on_custom_image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "SinGANOfficialImplementation.ipynb", 7 | "provenance": [], 8 | "private_outputs": true, 9 | "authorship_tag": "ABX9TyOqSL8ngNwZVEvzOBFhFwfA", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "_teh4A4skGRT", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | "# SinGAN\n", 37 | "\n", 38 | "[Official SinGAN Repository](https://github.com/tamarott/SinGAN)\n", 39 | "\n", 40 | "In this notebook, we will implement and create a SinGAN homework assignment for other's to learn how to implement SinGAN as well." 
41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "id": "UemrsEXyvxqQ", 47 | "colab_type": "code", 48 | "colab": {} 49 | }, 50 | "source": [ 51 | "def upload_files():\n", 52 | " from google.colab import files\n", 53 | " uploaded = files.upload()\n", 54 | " for k, v in uploaded.items():\n", 55 | " open(k, 'wb').write(v)\n", 56 | " return list(uploaded.keys())\n", 57 | "upload_files()" 58 | ], 59 | "execution_count": 0, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "TF-QL9B0z7sl", 66 | "colab_type": "code", 67 | "colab": {} 68 | }, 69 | "source": [ 70 | "!git clone https://github.com/tamarott/SinGAN.git" 71 | ], 72 | "execution_count": 0, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "vZLLI4c7sJjv", 79 | "colab_type": "code", 80 | "colab": {} 81 | }, 82 | "source": [ 83 | "!ls /content/SinGAN/Input/Images/\n", 84 | "!cp carrots_whole.4BVRRZ6FNXYQN.png /content/SinGAN/Input/Images/custom.png" 85 | ], 86 | "execution_count": 0, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "uKVDKOFetVEW", 93 | "colab_type": "code", 94 | "colab": {} 95 | }, 96 | "source": [ 97 | "%cd /content/SinGAN/Input/Images/\n", 98 | "!ls\n", 99 | "#import cv2\n", 100 | "#custom = cv2.imread('custom.jpg')\n", 101 | "#cv2.imwrite('custom.png', custom)" 102 | ], 103 | "execution_count": 0, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "metadata": { 109 | "id": "ul_FoSC40EjR", 110 | "colab_type": "code", 111 | "colab": {} 112 | }, 113 | "source": [ 114 | "%cd /content/SinGAN\n", 115 | "!ls\n", 116 | "!pwd\n", 117 | "!python3 main_train.py --input_name custom.png" 118 | ], 119 | "execution_count": 0, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "id": "ktEI7QEk1Wmf", 126 | "colab_type": "code", 127 | "colab": {} 128 | }, 129 | "source": [ 130 | "!python3 random_samples.py --input_name custom.png --mode random_samples_arbitrary_sizes --scale_h 1 --scale_v 1" 131 | ], 132 | "execution_count": 0, 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "metadata": { 138 | "id": "VBErCpu1LKd8", 139 | "colab_type": "code", 140 | "colab": {} 141 | }, 142 | "source": [ 143 | "!ls\n", 144 | "!ls -l Output/RandomSamples/custom\n", 145 | "!ls -l Output/RandomSamples/custom/gen_start_scale=0" 146 | ], 147 | "execution_count": 0, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qTHAHNbnL5W6", 154 | "colab_type": "code", 155 | "colab": {} 156 | }, 157 | "source": [ 158 | "import cv2\n", 159 | "import glob\n", 160 | "from google.colab.patches import cv2_imshow\n", 161 | "\n", 162 | "print('original image')\n", 163 | "original_img_path = 'Input/Images/custom.png'\n", 164 | "img = cv2.imread(original_img_path)\n", 165 | "cv2_imshow(img)\n", 166 | "\n", 167 | "print('random sample')\n", 168 | "img_paths = glob.glob('Output/RandomSamples/custom/gen_start_scale=0/*.png')\n", 169 | "img = cv2.imread(img_paths[0])\n", 170 | "cv2_imshow(img)" 171 | ], 172 | "execution_count": 0, 173 | "outputs": [] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "metadata": { 178 | "id": "6jpfpY2_kFeX", 179 | "colab_type": "code", 180 | "colab": {} 181 | }, 182 | "source": [ 183 | "import torch\n", 184 | "\n", 185 | "print('Implement SinGAN here')\n", 186 | "print(torch)" 187 | ], 188 | "execution_count": 0, 189 | "outputs": [] 190 | } 191 | ] 192 | } 
-------------------------------------------------------------------------------- /Transformers/README.md: -------------------------------------------------------------------------------- 1 | - [Huggingface Transformers](https://github.com/huggingface/transformers) 2 | -------------------------------------------------------------------------------- /Transformers/Transformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Transformers.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMq7zDv55D6nmR0jiBGxxXn", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "3oRmLL1owcS6", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "# Transformers\n", 35 | "\n", 36 | "Initial commit of colab notebook for Transformers." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "LllIDkqLwYuS", 43 | "colab_type": "code", 44 | "colab": {} 45 | }, 46 | "source": [ 47 | "import torch\n", 48 | "import tensorflow as tf" 49 | ], 50 | "execution_count": 2, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "wnZMPSeHyG4J", 57 | "colab_type": "text" 58 | }, 59 | "source": [ 60 | "# Resources\n", 61 | "- [Gelu Activation Function](https://mlfromscratch.com/activation-functions-explained/)" 62 | ] 63 | } 64 | ] 65 | } -------------------------------------------------------------------------------- /Transformers/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | -------------------------------------------------------------------------------- /Transformers/translation/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | """ 5 | Parameters 6 | 7 | d_model – the number of expected features in the encoder/decoder inputs (default=512). 8 | 9 | nhead – the number of heads in the multiheadattention models (default=8). 10 | 11 | num_encoder_layers – the number of sub-encoder-layers in the encoder (default=6). 12 | 13 | num_decoder_layers – the number of sub-decoder-layers in the decoder (default=6). 14 | 15 | dim_feedforward – the dimension of the feedforward network model (default=2048). 16 | 17 | dropout – the dropout value (default=0.1). 18 | 19 | activation – the activation function of encoder/decoder intermediate layer, relu or gelu (default=relu). 20 | 21 | custom_encoder – custom encoder (default=None). 22 | 23 | custom_decoder – custom decoder (default=None). 24 | """ 25 | trfmr_config = { 26 | 'd_model': 256, # number of features in embedding 27 | 'nhead': 8, # number of attention heads 28 | 'num_encoder_layers': 8, 29 | 'num_decoder_layers': 8, 30 | 'dim_feedforward': 2048, 31 | 'activation': 'relu', 32 | } 33 | 34 | opt_config = { 35 | 'lr': 3e-4, 36 | 'beta1': 0.5, 37 | 'beta2': 0.999, 38 | 'num_epochs': 300 39 | } 40 | 41 | 42 | def main(): 43 | # Initialize model. 44 | trfm_model = torch.nn.Transformer(**trmfr_config) 45 | 46 | # Initialize optimizer. 
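    # Aside (illustrative sketch only, not part of the training pipeline): with
    # the config above, torch.nn.Transformer defaults to sequence-first inputs,
    # i.e. src is (S, N, d_model) and tgt is (T, N, d_model). The sequence and
    # batch sizes below are made-up values purely for a shape check:
    #
    #   src = torch.rand(10, 32, trfmr_config['d_model'])
    #   tgt = torch.rand(20, 32, trfmr_config['d_model'])
    #   out = trfm_model(src, tgt)  # out.shape == (20, 32, 256)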
47 | opt = torch.optim.AdamW(trfm.parameters(), opt_config['lr'], 48 | [opt_config['beta1'], opt_config['beta2']]) 49 | 50 | # Set loss function. 51 | loss_fn = torch.nn.BCELoss 52 | 53 | # Load Data. 54 | # TODO: implement data loading. 55 | data = [] 56 | 57 | # Training loop 58 | for epoch in range in range(opt_config['num_epochs']): 59 | for expected_out, batch in data: 60 | opt.zero_grad() 61 | actual_out = trfm_model(batch) 62 | loss = torch.nn.BCELoss(actual_out, expected_out) 63 | loss.backward() 64 | opt.step() 65 | 66 | 67 | if __name__ == '__main__': 68 | main() -------------------------------------------------------------------------------- /handwriting-synthesis/.gitignore: -------------------------------------------------------------------------------- 1 | data/raw/ascii 2 | data/raw/lineStrokes 3 | data/raw/original 4 | data/processed 5 | 6 | logs 7 | predictions 8 | -------------------------------------------------------------------------------- /handwriting-synthesis/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | python: 4 | - 2.7 5 | - 3.6 6 | #- nightly 7 | #- pypy 8 | #- pypy3 9 | matrix: 10 | allow_failures: 11 | - python: nightly 12 | - python: pypy 13 | - python: pypy3 14 | install: 15 | #- pip install -r requirements.txt 16 | - pip install flake8 # pytest # add another testing frameworks later 17 | before_script: 18 | # stop the build if there are Python syntax errors or undefined names 19 | - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics 20 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 21 | - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 22 | script: 23 | - true # pytest --capture=sys # add other tests here 24 | notifications: 25 | on_success: change 26 | on_failure: change # `always` will be the setting once code changes slow down 27 | -------------------------------------------------------------------------------- /handwriting-synthesis/checkpoints/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model-17900" 2 | all_model_checkpoint_paths: "model-17900" 3 | -------------------------------------------------------------------------------- /handwriting-synthesis/checkpoints/model-17900.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/checkpoints/model-17900.data-00000-of-00001 -------------------------------------------------------------------------------- /handwriting-synthesis/checkpoints/model-17900.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/checkpoints/model-17900.index -------------------------------------------------------------------------------- /handwriting-synthesis/checkpoints/model-17900.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/checkpoints/model-17900.meta -------------------------------------------------------------------------------- /handwriting-synthesis/data/blacklist.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/data/blacklist.npy -------------------------------------------------------------------------------- /handwriting-synthesis/data/processed/.gitattributes: -------------------------------------------------------------------------------- 1 | *.npy filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /handwriting-synthesis/data/processed/c.npy: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c292920514ca4f12673a81b96ccdadbf99ee28ef4aa0f7b60a85706691c87abe 3 | size 871253 4 | -------------------------------------------------------------------------------- /handwriting-synthesis/data/processed/c_len.npy: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:afe6b3a07822d7f90b3974a08722b55017319fe1dcee24fc9424ee81a9683195 3 | size 11743 4 | -------------------------------------------------------------------------------- /handwriting-synthesis/data/processed/w_id.npy: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:02939b9bbc2347eca0faeb23e12a33453f69b868ef7ee410287fe0af4121d8d9 3 | size 23358 4 | -------------------------------------------------------------------------------- /handwriting-synthesis/data/processed/x.npy: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c6b95c4b53f6ac656884e0bee483b7d3eb0a2e1352de4a102fae546cac3ae3e8 3 | size 167256128 4 | -------------------------------------------------------------------------------- /handwriting-synthesis/data/processed/x_len.npy: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:dde6dd72f5017c5608c40114d68657adf72be571e7c90493832f151da88f6ab1 3 | size 23358 4 | -------------------------------------------------------------------------------- /handwriting-synthesis/data/raw/readme.md: -------------------------------------------------------------------------------- 1 | ## Model Training Instructions 2 | 3 | In order to train a model, data must be downloaded and placed in this directory. 4 | 5 | Follow the download instructions here http://www.fki.inf.unibe.ch/databases/iam-on-line-handwriting-database. 6 | 7 | Only a subset of the downloaded data is required. Move the relevant download data so the directory structure is as folllows: 8 | 9 | ``` 10 | data/ 11 | ├── raw/ 12 | │ ├── ascii/ 13 | │ ├── lineStrokes/ 14 | │ ├── original/ 15 | | blacklist.npy 16 | ``` 17 | 18 | Once this is completed, run `prepare_data.py` extract the data and dump it to numpy files. 19 | 20 | To train the model, run `rnn.py`. This takes a couple days on a single Tesla K80. 
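Before committing to a multi-day training run, it can be worth sanity-checking the arrays that `prepare_data.py` wrote. A minimal check might look like the sketch below (file names follow the `np.save` calls in `prepare_data.py`; the expected trailing dimensions come from `MAX_STROKE_LEN` and `MAX_CHAR_LEN` in `drawing.py`):

```python
# Sketch: verify the preprocessed arrays before starting training.
import numpy as np

x = np.load('data/processed/x.npy')          # stroke offsets, (N, 1200, 3)
x_len = np.load('data/processed/x_len.npy')  # valid stroke lengths, (N,)
c = np.load('data/processed/c.npy')          # encoded transcriptions, (N, 75)
c_len = np.load('data/processed/c_len.npy')  # transcription lengths, (N,)
w_id = np.load('data/processed/w_id.npy')    # writer ids, (N,)

print(x.shape, c.shape, w_id.shape)
assert x.shape[0] == c.shape[0] == w_id.shape[0] == x_len.shape[0]
```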
21 | 22 | -------------------------------------------------------------------------------- /handwriting-synthesis/data_frame.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.model_selection import train_test_split 6 | 7 | 8 | class DataFrame(object): 9 | 10 | """Minimal pd.DataFrame analog for handling n-dimensional numpy matrices with additional 11 | support for shuffling, batching, and train/test splitting. 12 | 13 | Args: 14 | columns: List of names corresponding to the matrices in data. 15 | data: List of n-dimensional data matrices ordered in correspondence with columns. 16 | All matrices must have the same leading dimension. Data can also be fed a list of 17 | instances of np.memmap, in which case RAM usage can be limited to the size of a 18 | single batch. 19 | """ 20 | 21 | def __init__(self, columns, data): 22 | assert len(columns) == len(data), 'columns length does not match data length' 23 | 24 | lengths = [mat.shape[0] for mat in data] 25 | assert len(set(lengths)) == 1, 'all matrices in data must have same first dimension' 26 | 27 | self.length = lengths[0] 28 | self.columns = columns 29 | self.data = data 30 | self.dict = dict(zip(self.columns, self.data)) 31 | self.idx = np.arange(self.length) 32 | 33 | def shapes(self): 34 | return pd.Series(dict(zip(self.columns, [mat.shape for mat in self.data]))) 35 | 36 | def dtypes(self): 37 | return pd.Series(dict(zip(self.columns, [mat.dtype for mat in self.data]))) 38 | 39 | def shuffle(self): 40 | np.random.shuffle(self.idx) 41 | 42 | def train_test_split(self, train_size, random_state=np.random.randint(1000), stratify=None): 43 | train_idx, test_idx = train_test_split( 44 | self.idx, 45 | train_size=train_size, 46 | random_state=random_state, 47 | stratify=stratify 48 | ) 49 | train_df = DataFrame(copy.copy(self.columns), [mat[train_idx] for mat in self.data]) 50 | test_df = DataFrame(copy.copy(self.columns), [mat[test_idx] for mat in self.data]) 51 | return train_df, test_df 52 | 53 | def batch_generator(self, batch_size, shuffle=True, num_epochs=10000, allow_smaller_final_batch=False): 54 | epoch_num = 0 55 | while epoch_num < num_epochs: 56 | if shuffle: 57 | self.shuffle() 58 | 59 | for i in range(0, self.length + 1, batch_size): 60 | batch_idx = self.idx[i: i + batch_size] 61 | if not allow_smaller_final_batch and len(batch_idx) != batch_size: 62 | break 63 | yield DataFrame( 64 | columns=copy.copy(self.columns), 65 | data=[mat[batch_idx].copy() for mat in self.data] 66 | ) 67 | 68 | epoch_num += 1 69 | 70 | def iterrows(self): 71 | for i in self.idx: 72 | yield self[i] 73 | 74 | def mask(self, mask): 75 | return DataFrame(copy.copy(self.columns), [mat[mask] for mat in self.data]) 76 | 77 | def concat(self, other_df): 78 | mats = [] 79 | for column in self.columns: 80 | mats.append(np.concatenate([self[column], other_df[column]], axis=0)) 81 | return DataFrame(copy.copy(self.columns), mats) 82 | 83 | def items(self): 84 | return self.dict.items() 85 | 86 | def __iter__(self): 87 | return self.dict.items().__iter__() 88 | 89 | def __len__(self): 90 | return self.length 91 | 92 | def __getitem__(self, key): 93 | if isinstance(key, str): 94 | return self.dict[key] 95 | 96 | elif isinstance(key, int): 97 | return pd.Series(dict(zip(self.columns, [mat[self.idx[key]] for mat in self.data]))) 98 | 99 | def __setitem__(self, key, value): 100 | assert value.shape[0] == len(self), 'matrix first dimension does not 
match' 101 | if key not in self.columns: 102 | self.columns.append(key) 103 | self.data.append(value) 104 | self.dict[key] = value 105 | -------------------------------------------------------------------------------- /handwriting-synthesis/demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | 4 | import numpy as np 5 | import svgwrite 6 | 7 | import drawing 8 | import lyrics 9 | from rnn import rnn 10 | 11 | 12 | class Hand(object): 13 | 14 | def __init__(self): 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 16 | self.nn = rnn( 17 | log_dir='logs', 18 | checkpoint_dir='checkpoints', 19 | prediction_dir='predictions', 20 | learning_rates=[.0001, .00005, .00002], 21 | batch_sizes=[32, 64, 64], 22 | patiences=[1500, 1000, 500], 23 | beta1_decays=[.9, .9, .9], 24 | validation_batch_size=32, 25 | optimizer='rms', 26 | num_training_steps=100000, 27 | warm_start_init_step=17900, 28 | regularization_constant=0.0, 29 | keep_prob=1.0, 30 | enable_parameter_averaging=False, 31 | min_steps_to_checkpoint=2000, 32 | log_interval=20, 33 | logging_level=logging.CRITICAL, 34 | grad_clip=10, 35 | lstm_size=400, 36 | output_mixture_components=20, 37 | attention_mixture_components=10 38 | ) 39 | self.nn.restore() 40 | 41 | def write(self, filename, lines, biases=None, styles=None, stroke_colors=None, stroke_widths=None): 42 | valid_char_set = set(drawing.alphabet) 43 | for line_num, line in enumerate(lines): 44 | if len(line) > 75: 45 | raise ValueError( 46 | ( 47 | "Each line must be at most 75 characters. " 48 | "Line {} contains {}" 49 | ).format(line_num, len(line)) 50 | ) 51 | 52 | for char in line: 53 | if char not in valid_char_set: 54 | raise ValueError( 55 | ( 56 | "Invalid character {} detected in line {}. 
" 57 | "Valid character set is {}" 58 | ).format(char, line_num, valid_char_set) 59 | ) 60 | 61 | strokes = self._sample(lines, biases=biases, styles=styles) 62 | self._draw(strokes, lines, filename, stroke_colors=stroke_colors, stroke_widths=stroke_widths) 63 | 64 | def _sample(self, lines, biases=None, styles=None): 65 | num_samples = len(lines) 66 | max_tsteps = 40*max([len(i) for i in lines]) 67 | biases = biases if biases is not None else [0.5]*num_samples 68 | 69 | x_prime = np.zeros([num_samples, 1200, 3]) 70 | x_prime_len = np.zeros([num_samples]) 71 | chars = np.zeros([num_samples, 120]) 72 | chars_len = np.zeros([num_samples]) 73 | 74 | if styles is not None: 75 | for i, (cs, style) in enumerate(zip(lines, styles)): 76 | x_p = np.load('styles/style-{}-strokes.npy'.format(style)) 77 | c_p = np.load('styles/style-{}-chars.npy'.format(style)).tostring().decode('utf-8') 78 | 79 | c_p = str(c_p) + " " + cs 80 | c_p = drawing.encode_ascii(c_p) 81 | c_p = np.array(c_p) 82 | 83 | x_prime[i, :len(x_p), :] = x_p 84 | x_prime_len[i] = len(x_p) 85 | chars[i, :len(c_p)] = c_p 86 | chars_len[i] = len(c_p) 87 | 88 | else: 89 | for i in range(num_samples): 90 | encoded = drawing.encode_ascii(lines[i]) 91 | chars[i, :len(encoded)] = encoded 92 | chars_len[i] = len(encoded) 93 | 94 | [samples] = self.nn.session.run( 95 | [self.nn.sampled_sequence], 96 | feed_dict={ 97 | self.nn.prime: styles is not None, 98 | self.nn.x_prime: x_prime, 99 | self.nn.x_prime_len: x_prime_len, 100 | self.nn.num_samples: num_samples, 101 | self.nn.sample_tsteps: max_tsteps, 102 | self.nn.c: chars, 103 | self.nn.c_len: chars_len, 104 | self.nn.bias: biases 105 | } 106 | ) 107 | samples = [sample[~np.all(sample == 0.0, axis=1)] for sample in samples] 108 | return samples 109 | 110 | def _draw(self, strokes, lines, filename, stroke_colors=None, stroke_widths=None): 111 | stroke_colors = stroke_colors or ['black']*len(lines) 112 | stroke_widths = stroke_widths or [2]*len(lines) 113 | 114 | line_height = 60 115 | view_width = 1000 116 | view_height = line_height*(len(strokes) + 1) 117 | 118 | dwg = svgwrite.Drawing(filename=filename) 119 | dwg.viewbox(width=view_width, height=view_height) 120 | dwg.add(dwg.rect(insert=(0, 0), size=(view_width, view_height), fill='white')) 121 | 122 | initial_coord = np.array([0, -(3*line_height / 4)]) 123 | for offsets, line, color, width in zip(strokes, lines, stroke_colors, stroke_widths): 124 | 125 | if not line: 126 | initial_coord[1] -= line_height 127 | continue 128 | 129 | offsets[:, :2] *= 1.5 130 | strokes = drawing.offsets_to_coords(offsets) 131 | strokes = drawing.denoise(strokes) 132 | strokes[:, :2] = drawing.align(strokes[:, :2]) 133 | 134 | strokes[:, 1] *= -1 135 | strokes[:, :2] -= strokes[:, :2].min() + initial_coord 136 | strokes[:, 0] += (view_width - strokes[:, 0].max()) / 2 137 | 138 | prev_eos = 1.0 139 | p = "M{},{} ".format(0, 0) 140 | for x, y, eos in zip(*strokes.T): 141 | p += '{}{},{} '.format('M' if prev_eos == 1.0 else 'L', x, y) 142 | prev_eos = eos 143 | path = svgwrite.path.Path(p) 144 | path = path.stroke(color=color, width=width, linecap='round').fill("none") 145 | dwg.add(path) 146 | 147 | initial_coord[1] -= line_height 148 | 149 | dwg.save() 150 | 151 | 152 | if __name__ == '__main__': 153 | hand = Hand() 154 | 155 | # usage demo 156 | lines = [ 157 | "Now this is a story all about how", 158 | "My life got flipped turned upside down", 159 | "And I'd like to take a minute, just sit right there", 160 | "I'll tell you how I became the prince of a town 
called Bel-Air", 161 | ] 162 | biases = [.75 for i in lines] 163 | styles = [9 for i in lines] 164 | stroke_colors = ['red', 'green', 'black', 'blue'] 165 | stroke_widths = [1, 2, 1, 2] 166 | 167 | hand.write( 168 | filename='img/usage_demo.svg', 169 | lines=lines, 170 | biases=biases, 171 | styles=styles, 172 | stroke_colors=stroke_colors, 173 | stroke_widths=stroke_widths 174 | ) 175 | 176 | # demo number 1 - fixed bias, fixed style 177 | lines = lyrics.all_star.split("\n") 178 | biases = [.75 for i in lines] 179 | styles = [12 for i in lines] 180 | 181 | hand.write( 182 | filename='img/all_star.svg', 183 | lines=lines, 184 | biases=biases, 185 | styles=styles, 186 | ) 187 | 188 | # demo number 2 - fixed bias, varying style 189 | lines = lyrics.downtown.split("\n") 190 | biases = [.75 for i in lines] 191 | styles = np.cumsum(np.array([len(i) for i in lines]) == 0).astype(int) 192 | 193 | hand.write( 194 | filename='img/downtown.svg', 195 | lines=lines, 196 | biases=biases, 197 | styles=styles, 198 | ) 199 | 200 | # demo number 3 - varying bias, fixed style 201 | lines = lyrics.give_up.split("\n") 202 | biases = .2*np.flip(np.cumsum([len(i) == 0 for i in lines]), 0) 203 | styles = [7 for i in lines] 204 | 205 | hand.write( 206 | filename='img/give_up.svg', 207 | lines=lines, 208 | biases=biases, 209 | styles=styles, 210 | ) 211 | -------------------------------------------------------------------------------- /handwriting-synthesis/drawing.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from collections import defaultdict 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | from scipy.signal import savgol_filter 7 | from scipy.interpolate import interp1d 8 | 9 | 10 | alphabet = [ 11 | '\x00', ' ', '!', '"', '#', "'", '(', ')', ',', '-', '.', 12 | '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', 13 | '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 14 | 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 15 | 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 16 | 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 17 | 'y', 'z' 18 | ] 19 | alphabet_ord = list(map(ord, alphabet)) 20 | alpha_to_num = defaultdict(int, list(map(reversed, enumerate(alphabet)))) 21 | num_to_alpha = dict(enumerate(alphabet_ord)) 22 | 23 | MAX_STROKE_LEN = 1200 24 | MAX_CHAR_LEN = 75 25 | 26 | 27 | def align(coords): 28 | """ 29 | corrects for global slant/offset in handwriting strokes 30 | """ 31 | coords = np.copy(coords) 32 | X, Y = coords[:, 0].reshape(-1, 1), coords[:, 1].reshape(-1, 1) 33 | X = np.concatenate([np.ones([X.shape[0], 1]), X], axis=1) 34 | offset, slope = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(Y).squeeze() 35 | theta = np.arctan(slope) 36 | rotation_matrix = np.array( 37 | [[np.cos(theta), -np.sin(theta)], 38 | [np.sin(theta), np.cos(theta)]] 39 | ) 40 | coords[:, :2] = np.dot(coords[:, :2], rotation_matrix) - offset 41 | return coords 42 | 43 | 44 | def skew(coords, degrees): 45 | """ 46 | skews strokes by given degrees 47 | """ 48 | coords = np.copy(coords) 49 | theta = degrees * np.pi/180 50 | A = np.array([[np.cos(-theta), 0], [np.sin(-theta), 1]]) 51 | coords[:, :2] = np.dot(coords[:, :2], A) 52 | return coords 53 | 54 | 55 | def stretch(coords, x_factor, y_factor): 56 | """ 57 | stretches strokes along x and y axis 58 | """ 59 | coords = np.copy(coords) 60 | coords[:, :2] *= np.array([x_factor, y_factor]) 61 | return coords 62 | 63 | 64 | def 
add_noise(coords, scale): 65 | """ 66 | adds gaussian noise to strokes 67 | """ 68 | coords = np.copy(coords) 69 | coords[1:, :2] += np.random.normal(loc=0.0, scale=scale, size=coords[1:, :2].shape) 70 | return coords 71 | 72 | 73 | def encode_ascii(ascii_string): 74 | """ 75 | encodes ascii string to array of ints 76 | """ 77 | return np.array(list(map(lambda x: alpha_to_num[x], ascii_string)) + [0]) 78 | 79 | 80 | def denoise(coords): 81 | """ 82 | smoothing filter to mitigate some artifacts of the data collection 83 | """ 84 | coords = np.split(coords, np.where(coords[:, 2] == 1)[0] + 1, axis=0) 85 | new_coords = [] 86 | for stroke in coords: 87 | if len(stroke) != 0: 88 | x_new = savgol_filter(stroke[:, 0], 7, 3, mode='nearest') 89 | y_new = savgol_filter(stroke[:, 1], 7, 3, mode='nearest') 90 | xy_coords = np.hstack([x_new.reshape(-1, 1), y_new.reshape(-1, 1)]) 91 | stroke = np.concatenate([xy_coords, stroke[:, 2].reshape(-1, 1)], axis=1) 92 | new_coords.append(stroke) 93 | 94 | coords = np.vstack(new_coords) 95 | return coords 96 | 97 | 98 | def interpolate(coords, factor=2): 99 | """ 100 | interpolates strokes using cubic spline 101 | """ 102 | coords = np.split(coords, np.where(coords[:, 2] == 1)[0] + 1, axis=0) 103 | new_coords = [] 104 | for stroke in coords: 105 | 106 | if len(stroke) == 0: 107 | continue 108 | 109 | xy_coords = stroke[:, :2] 110 | 111 | if len(stroke) > 3: 112 | f_x = interp1d(np.arange(len(stroke)), stroke[:, 0], kind='cubic') 113 | f_y = interp1d(np.arange(len(stroke)), stroke[:, 1], kind='cubic') 114 | 115 | xx = np.linspace(0, len(stroke) - 1, factor*(len(stroke))) 116 | yy = np.linspace(0, len(stroke) - 1, factor*(len(stroke))) 117 | 118 | x_new = f_x(xx) 119 | y_new = f_y(yy) 120 | 121 | xy_coords = np.hstack([x_new.reshape(-1, 1), y_new.reshape(-1, 1)]) 122 | 123 | stroke_eos = np.zeros([len(xy_coords), 1]) 124 | stroke_eos[-1] = 1.0 125 | stroke = np.concatenate([xy_coords, stroke_eos], axis=1) 126 | new_coords.append(stroke) 127 | 128 | coords = np.vstack(new_coords) 129 | return coords 130 | 131 | 132 | def normalize(offsets): 133 | """ 134 | normalizes strokes to median unit norm 135 | """ 136 | offsets = np.copy(offsets) 137 | offsets[:, :2] /= np.median(np.linalg.norm(offsets[:, :2], axis=1)) 138 | return offsets 139 | 140 | 141 | def coords_to_offsets(coords): 142 | """ 143 | convert from coordinates to offsets 144 | """ 145 | offsets = np.concatenate([coords[1:, :2] - coords[:-1, :2], coords[1:, 2:3]], axis=1) 146 | offsets = np.concatenate([np.array([[0, 0, 1]]), offsets], axis=0) 147 | return offsets 148 | 149 | 150 | def offsets_to_coords(offsets): 151 | """ 152 | convert from offsets to coordinates 153 | """ 154 | return np.concatenate([np.cumsum(offsets[:, :2], axis=0), offsets[:, 2:3]], axis=1) 155 | 156 | 157 | def draw( 158 | offsets, 159 | ascii_seq=None, 160 | align_strokes=True, 161 | denoise_strokes=True, 162 | interpolation_factor=None, 163 | save_file=None 164 | ): 165 | strokes = offsets_to_coords(offsets) 166 | 167 | if denoise_strokes: 168 | strokes = denoise(strokes) 169 | 170 | if interpolation_factor is not None: 171 | strokes = interpolate(strokes, factor=interpolation_factor) 172 | 173 | if align_strokes: 174 | strokes[:, :2] = align(strokes[:, :2]) 175 | 176 | fig, ax = plt.subplots(figsize=(12, 3)) 177 | 178 | stroke = [] 179 | for x, y, eos in strokes: 180 | stroke.append((x, y)) 181 | if eos == 1: 182 | coords = zip(*stroke) 183 | ax.plot(coords[0], coords[1], 'k') 184 | stroke = [] 185 | if stroke: 186 | coords = 
zip(*stroke) 187 | ax.plot(coords[0], coords[1], 'k') 188 | stroke = [] 189 | 190 | ax.set_xlim(-50, 600) 191 | ax.set_ylim(-40, 40) 192 | 193 | ax.set_aspect('equal') 194 | plt.tick_params( 195 | axis='both', 196 | left='off', 197 | top='off', 198 | right='off', 199 | bottom='off', 200 | labelleft='off', 201 | labeltop='off', 202 | labelright='off', 203 | labelbottom='off' 204 | ) 205 | 206 | if ascii_seq is not None: 207 | if not isinstance(ascii_seq, str): 208 | ascii_seq = ''.join(list(map(chr, ascii_seq))) 209 | plt.title(ascii_seq) 210 | 211 | if save_file is not None: 212 | plt.savefig(save_file) 213 | print('saved to {}'.format(save_file)) 214 | else: 215 | plt.show() 216 | plt.close('all') 217 | -------------------------------------------------------------------------------- /handwriting-synthesis/lyrics.py: -------------------------------------------------------------------------------- 1 | """lyrics taken from https://www.azlyrics.com/""" 2 | 3 | all_star = """Somebody once told me the world is gonna roll me 4 | I ain't the sharpest tool in the shed 5 | She was looking kind of dumb with her finger and her thumb 6 | In the shape of an "L" on her forehead 7 | 8 | Well, the years start coming and they don't stop coming 9 | Fed to the rules and I hit the ground running 10 | Didn't make sense not to live for fun 11 | Your brain gets smart but your head gets dumb 12 | 13 | So much to do, so much to see 14 | So what's wrong with taking the back streets? 15 | You'll never know if you don't go 16 | You'll never shine if you don't glow 17 | 18 | Hey, now, you're an All Star, get your game on, go play 19 | Hey, now, you're a Rock Star, get the show on, get paid 20 | And all that glitters is gold 21 | Only shooting stars break the mold 22 | 23 | It's a cool place and they say it gets colder 24 | You're bundled up now wait 'til you get older 25 | But the meteor men beg to differ 26 | Judging by the hole in the satellite picture 27 | 28 | The ice we skate is getting pretty thin 29 | The water's getting warm so you might as well swim 30 | My world's on fire. How about yours? 31 | That's the way I like it and I'll never get bored. 32 | 33 | Somebody once asked could I spare some change for gas 34 | I need to get myself away from this place 35 | I said yep, what a concept 36 | I could use a little fuel myself 37 | And we could all use a little change 38 | 39 | Well, the years start coming and they don't stop coming 40 | Fed to the rules and I hit the ground running 41 | Didn't make sense not to live for fun 42 | Your brain gets smart but your head gets dumb 43 | 44 | So much to do, so much to see 45 | So what's wrong with taking the back streets? 46 | You'll never know if you don't go 47 | You'll never shine if you don't glow. 48 | 49 | And all that glitters is gold 50 | Only shooting stars break the mold""" 51 | 52 | downtown = """Making my way downtown 53 | Walking fast 54 | Faces pass 55 | And I'm home-bound 56 | 57 | Staring blankly ahead 58 | Just making my way 59 | Making a way 60 | Through the crowd 61 | 62 | And I need you 63 | And I miss you 64 | And now I wonder 65 | 66 | If I could fall into the sky 67 | Do you think time would pass me by? 
68 | 'Cause you know I'd walk a thousand miles 69 | If I could just see you tonight 70 | 71 | It's always times like these 72 | When I think of you 73 | And I wonder if you ever think of me 74 | 'Cause everything's so wrong 75 | And I don't belong 76 | Living in your precious memory 77 | 78 | 'Cause I need you 79 | And I miss you 80 | And now I wonder 81 | 82 | If I could fall into the sky 83 | Do you think time would pass me by? 84 | 'Cause you know I'd walk a thousand miles 85 | If I could just see you tonight 86 | 87 | And I, I don't wanna let you know 88 | I, I drown in your memory 89 | I, I don't wanna let this go 90 | I, I don't 91 | 92 | Making my way downtown 93 | Walking fast 94 | Faces pass 95 | And I'm home-bound 96 | 97 | Staring blankly ahead 98 | Just making my way 99 | Making a way 100 | Through the crowd 101 | 102 | And I still need you 103 | And I still miss you 104 | And now I wonder 105 | 106 | If I could fall into the sky 107 | Do you think time would pass us by? 108 | 'Cause you know I'd walk a thousand miles 109 | If I could just see you 110 | 111 | If I could fall into the sky 112 | Do you think time would pass me by? 113 | 'Cause you know I'd walk a thousand miles 114 | If I could just see you 115 | If I could just hold you tonight""" 116 | 117 | give_up = """We're no strangers to love 118 | You know the rules and so do I 119 | A full commitment's what I'm thinking of 120 | You wouldn't get this from any other guy 121 | 122 | I just wanna tell you how I'm feeling 123 | Gotta make you understand 124 | 125 | Never gonna give you up 126 | Never gonna let you down 127 | Never gonna run around and desert you 128 | Never gonna make you cry 129 | Never gonna say goodbye 130 | Never gonna tell a lie and hurt you 131 | 132 | We've known each other for so long 133 | Your heart's been aching, but 134 | You're too shy to say it 135 | Inside, we both know what's been going on 136 | We know the game and we're gonna play it 137 | 138 | And if you ask me how I'm feeling 139 | Don't tell me you're too blind to see 140 | 141 | Never gonna give you up 142 | Never gonna let you down 143 | Never gonna run around and desert you 144 | Never gonna make you cry 145 | Never gonna say goodbye 146 | Never gonna tell a lie and hurt you 147 | 148 | Never gonna give you up 149 | Never gonna let you down 150 | Never gonna run around and desert you 151 | Never gonna make you cry 152 | Never gonna say goodbye 153 | Never gonna tell a lie and hurt you 154 | 155 | (Ooh, give you up) 156 | (Ooh, give you up) 157 | Never gonna give, never gonna give 158 | (Give you up) 159 | Never gonna give, never gonna give 160 | (Give you up) 161 | 162 | We've known each other for so long 163 | Your heart's been aching, but 164 | You're too shy to say it 165 | Inside, we both know what's been going on 166 | We know the game and we're gonna play it 167 | 168 | I just wanna tell you how I'm feeling 169 | Gotta make you understand 170 | 171 | Never gonna give you up 172 | Never gonna let you down 173 | Never gonna run around and desert you 174 | Never gonna make you cry 175 | Never gonna say goodbye 176 | Never gonna tell a lie and hurt you 177 | 178 | Never gonna give you up 179 | Never gonna let you down 180 | Never gonna run around and desert you 181 | Never gonna make you cry 182 | Never gonna say goodbye 183 | Never gonna tell a lie and hurt you 184 | 185 | Never gonna give you up 186 | Never gonna let you down 187 | Never gonna run around and desert you 188 | Never gonna make you cry 189 | Never gonna say goodbye 
190 | Never gonna tell a lie and hurt you""" 191 | -------------------------------------------------------------------------------- /handwriting-synthesis/prepare_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | from xml.etree import ElementTree 4 | 5 | import numpy as np 6 | 7 | import drawing 8 | 9 | 10 | def get_stroke_sequence(filename): 11 | tree = ElementTree.parse(filename).getroot() 12 | strokes = [i for i in tree if i.tag == 'StrokeSet'][0] 13 | 14 | coords = [] 15 | for stroke in strokes: 16 | for i, point in enumerate(stroke): 17 | coords.append([ 18 | int(point.attrib['x']), 19 | -1*int(point.attrib['y']), 20 | int(i == len(stroke) - 1) 21 | ]) 22 | coords = np.array(coords) 23 | 24 | coords = drawing.align(coords) 25 | coords = drawing.denoise(coords) 26 | offsets = drawing.coords_to_offsets(coords) 27 | offsets = offsets[:drawing.MAX_STROKE_LEN] 28 | offsets = drawing.normalize(offsets) 29 | return offsets 30 | 31 | 32 | def get_ascii_sequences(filename): 33 | sequences = open(filename, 'r').read() 34 | sequences = sequences.replace(r'%%%%%%%%%%%', '\n') 35 | sequences = [i.strip() for i in sequences.split('\n')] 36 | lines = sequences[sequences.index('CSR:') + 2:] 37 | lines = [line.strip() for line in lines if line.strip()] 38 | lines = [drawing.encode_ascii(line)[:drawing.MAX_CHAR_LEN] for line in lines] 39 | return lines 40 | 41 | 42 | def collect_data(): 43 | fnames = [] 44 | for dirpath, dirnames, filenames in os.walk('data/raw/ascii/'): 45 | if dirnames: 46 | continue 47 | for filename in filenames: 48 | if filename.startswith('.'): 49 | continue 50 | fnames.append(os.path.join(dirpath, filename)) 51 | 52 | # low quality samples (selected by collecting samples to 53 | # which the trained model assigned very low likelihood) 54 | blacklist = set(np.load('data/blacklist.npy')) 55 | 56 | stroke_fnames, transcriptions, writer_ids = [], [], [] 57 | for i, fname in enumerate(fnames): 58 | print(i, fname) 59 | if fname == 'data/raw/ascii/z01/z01-000/z01-000z.txt': 60 | continue 61 | 62 | head, tail = os.path.split(fname) 63 | last_letter = os.path.splitext(fname)[0][-1] 64 | last_letter = last_letter if last_letter.isalpha() else '' 65 | 66 | line_stroke_dir = head.replace('ascii', 'lineStrokes') 67 | line_stroke_fname_prefix = os.path.split(head)[-1] + last_letter + '-' 68 | 69 | if not os.path.isdir(line_stroke_dir): 70 | continue 71 | line_stroke_fnames = sorted([f for f in os.listdir(line_stroke_dir) 72 | if f.startswith(line_stroke_fname_prefix)]) 73 | if not line_stroke_fnames: 74 | continue 75 | 76 | original_dir = head.replace('ascii', 'original') 77 | original_xml = os.path.join(original_dir, 'strokes' + last_letter + '.xml') 78 | tree = ElementTree.parse(original_xml) 79 | root = tree.getroot() 80 | 81 | general = root.find('General') 82 | if general is not None: 83 | writer_id = int(general[0].attrib.get('writerID', '0')) 84 | else: 85 | writer_id = int('0') 86 | 87 | ascii_sequences = get_ascii_sequences(fname) 88 | assert len(ascii_sequences) == len(line_stroke_fnames) 89 | 90 | for ascii_seq, line_stroke_fname in zip(ascii_sequences, line_stroke_fnames): 91 | if line_stroke_fname in blacklist: 92 | continue 93 | 94 | stroke_fnames.append(os.path.join(line_stroke_dir, line_stroke_fname)) 95 | transcriptions.append(ascii_seq) 96 | writer_ids.append(writer_id) 97 | 98 | return stroke_fnames, transcriptions, writer_ids 99 | 100 | 101 | if __name__ == '__main__': 102 | 
print('traversing data directory...') 103 | stroke_fnames, transcriptions, writer_ids = collect_data() 104 | 105 | print('dumping to numpy arrays...') 106 | x = np.zeros([len(stroke_fnames), drawing.MAX_STROKE_LEN, 3], dtype=np.float32) 107 | x_len = np.zeros([len(stroke_fnames)], dtype=np.int16) 108 | c = np.zeros([len(stroke_fnames), drawing.MAX_CHAR_LEN], dtype=np.int8) 109 | c_len = np.zeros([len(stroke_fnames)], dtype=np.int8) 110 | w_id = np.zeros([len(stroke_fnames)], dtype=np.int16) 111 | valid_mask = np.zeros([len(stroke_fnames)], dtype=np.bool) 112 | 113 | for i, (stroke_fname, c_i, w_id_i) in enumerate(zip(stroke_fnames, transcriptions, writer_ids)): 114 | if i % 200 == 0: 115 | print(i, '\t', '/', len(stroke_fnames)) 116 | x_i = get_stroke_sequence(stroke_fname) 117 | valid_mask[i] = ~np.any(np.linalg.norm(x_i[:, :2], axis=1) > 60) 118 | 119 | x[i, :len(x_i), :] = x_i 120 | x_len[i] = len(x_i) 121 | 122 | c[i, :len(c_i)] = c_i 123 | c_len[i] = len(c_i) 124 | 125 | w_id[i] = w_id_i 126 | 127 | if not os.path.isdir('data/processed'): 128 | os.makedirs('data/processed') 129 | 130 | np.save('data/processed/x.npy', x[valid_mask]) 131 | np.save('data/processed/x_len.npy', x_len[valid_mask]) 132 | np.save('data/processed/c.npy', c[valid_mask]) 133 | np.save('data/processed/c_len.npy', c_len[valid_mask]) 134 | np.save('data/processed/w_id.npy', w_id[valid_mask]) 135 | -------------------------------------------------------------------------------- /handwriting-synthesis/readme.md: -------------------------------------------------------------------------------- 1 | ![](img/banner.svg) 2 | # Handwriting Synthesis 3 | Implementation of the handwriting synthesis experiments in the paper Generating Sequences with Recurrent Neural Networks by Alex Graves. The implementation closely follows the original paper, with a few slight deviations, and the generated samples are of similar quality to those presented in the paper. 4 | 5 | Web demo is available here. 6 | 7 | ## Usage 8 | ```python 9 | lines = [ 10 | "Now this is a story all about how", 11 | "My life got flipped turned upside down", 12 | "And I'd like to take a minute, just sit right there", 13 | "I'll tell you how I became the prince of a town called Bel-Air", 14 | ] 15 | biases = [.75 for i in lines] 16 | styles = [9 for i in lines] 17 | stroke_colors = ['red', 'green', 'black', 'blue'] 18 | stroke_widths = [1, 2, 1, 2] 19 | 20 | hand = Hand() 21 | hand.write( 22 | filename='img/usage_demo.svg', 23 | lines=lines, 24 | biases=biases, 25 | styles=styles, 26 | stroke_colors=stroke_colors, 27 | stroke_widths=stroke_widths 28 | ) 29 | ``` 30 | ![](img/usage_demo.svg) 31 | 32 | Currently, the `Hand` class must be imported from `demo.py`. If someone would like to package this project to make it more usable, please [contribute](#contribute). 33 | 34 | A pretrained model is included, but if you'd like to train your own, read these instructions. 35 | 36 | ## Demonstrations 37 | Below are a few hundred samples from the model, including some samples demonstrating the effect of priming and biasing the model. Loosely speaking, biasing controls the neatness of the samples and priming controls the style of the samples. The code for these demonstrations can be found in `demo.py`. 38 | 39 | ### Demo #1: 40 | The following samples were generated with a fixed style and fixed bias. 41 | 42 | **Smash Mouth – All Star (lyrics)** 43 | ![](img/all_star.svg) 44 | 45 | ### Demo #2 46 | The following samples were generated with varying style and fixed bias. 
Each verse is generated in a different style. 47 | 48 | **Vanessa Carlton – A Thousand Miles (lyrics)** 49 | ![](img/downtown.svg) 50 | 51 | ### Demo #3 52 | The following samples were generated with a fixed style and varying bias. Each verse has a lower bias than the previous, with the last verse being unbiased. 53 | 54 | **Leonard Cohen – Hallelujah (lyrics)** 55 | ![](img/give_up.svg) 56 | 57 | ## Contribute 58 | This project was intended to serve as a reference implementation for a research paper, but since the results are of decent quality, it may be worthwhile to make the project more broadly usable. I plan to continue focusing on the machine learning side of things. That said, I'd welcome contributors who can: 59 | 60 | - Package this, and otherwise make it look more like a usable software project and less like research code. 61 | - Add support for more sophisticated drawing, animations, or anything else in this direction. Currently, the project only creates some simple svg files. 62 | -------------------------------------------------------------------------------- /handwriting-synthesis/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=2.1.0 2 | pandas>=0.22.0 3 | scikit-learn>=0.19.1 4 | scipy>=1.0.0 5 | svgwrite>=1.1.12 6 | tensorflow==2.11.1 7 | -------------------------------------------------------------------------------- /handwriting-synthesis/rnn_cell.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import tensorflow as tf 4 | import tensorflow.contrib.distributions as tfd 5 | import numpy as np 6 | 7 | from tf_utils import dense_layer, shape 8 | 9 | 10 | LSTMAttentionCellState = namedtuple( 11 | 'LSTMAttentionCellState', 12 | ['h1', 'c1', 'h2', 'c2', 'h3', 'c3', 'alpha', 'beta', 'kappa', 'w', 'phi'] 13 | ) 14 | 15 | 16 | class LSTMAttentionCell(tf.nn.rnn_cell.RNNCell): 17 | 18 | def __init__( 19 | self, 20 | lstm_size, 21 | num_attn_mixture_components, 22 | attention_values, 23 | attention_values_lengths, 24 | num_output_mixture_components, 25 | bias, 26 | reuse=None, 27 | ): 28 | self.reuse = reuse 29 | self.lstm_size = lstm_size 30 | self.num_attn_mixture_components = num_attn_mixture_components 31 | self.attention_values = attention_values 32 | self.attention_values_lengths = attention_values_lengths 33 | self.window_size = shape(self.attention_values, 2) 34 | self.char_len = tf.shape(attention_values)[1] 35 | self.batch_size = tf.shape(attention_values)[0] 36 | self.num_output_mixture_components = num_output_mixture_components 37 | self.output_units = 6*self.num_output_mixture_components + 1 38 | self.bias = bias 39 | 40 | @property 41 | def state_size(self): 42 | return LSTMAttentionCellState( 43 | self.lstm_size, 44 | self.lstm_size, 45 | self.lstm_size, 46 | self.lstm_size, 47 | self.lstm_size, 48 | self.lstm_size, 49 | self.num_attn_mixture_components, 50 | self.num_attn_mixture_components, 51 | self.num_attn_mixture_components, 52 | self.window_size, 53 | self.char_len, 54 | ) 55 | 56 | @property 57 | def output_size(self): 58 | return self.lstm_size 59 | 60 | def zero_state(self, batch_size, dtype): 61 | return LSTMAttentionCellState( 62 | tf.zeros([batch_size, self.lstm_size]), 63 | tf.zeros([batch_size, self.lstm_size]), 64 | tf.zeros([batch_size, self.lstm_size]), 65 | tf.zeros([batch_size, self.lstm_size]), 66 | tf.zeros([batch_size, self.lstm_size]), 67 | tf.zeros([batch_size, self.lstm_size]), 68 |
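# the six tensors above are the (h, c) pairs for the three LSTM layers;
# below: attention mixture parameters (alpha, beta, kappa), the soft window w, and phi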
tf.zeros([batch_size, self.num_attn_mixture_components]), 69 | tf.zeros([batch_size, self.num_attn_mixture_components]), 70 | tf.zeros([batch_size, self.num_attn_mixture_components]), 71 | tf.zeros([batch_size, self.window_size]), 72 | tf.zeros([batch_size, self.char_len]), 73 | ) 74 | 75 | def __call__(self, inputs, state, scope=None): 76 | with tf.variable_scope(scope or type(self).__name__, reuse=tf.AUTO_REUSE): 77 | 78 | # lstm 1 79 | s1_in = tf.concat([state.w, inputs], axis=1) 80 | cell1 = tf.contrib.rnn.LSTMCell(self.lstm_size) 81 | s1_out, s1_state = cell1(s1_in, state=(state.c1, state.h1)) 82 | 83 | # attention 84 | attention_inputs = tf.concat([state.w, inputs, s1_out], axis=1) 85 | attention_params = dense_layer(attention_inputs, 3*self.num_attn_mixture_components, scope='attention') 86 | alpha, beta, kappa = tf.split(tf.nn.softplus(attention_params), 3, axis=1) 87 | kappa = state.kappa + kappa / 25.0 88 | beta = tf.clip_by_value(beta, .01, np.inf) 89 | 90 | kappa_flat, alpha_flat, beta_flat = kappa, alpha, beta 91 | kappa, alpha, beta = tf.expand_dims(kappa, 2), tf.expand_dims(alpha, 2), tf.expand_dims(beta, 2) 92 | 93 | enum = tf.reshape(tf.range(self.char_len), (1, 1, self.char_len)) 94 | u = tf.cast(tf.tile(enum, (self.batch_size, self.num_attn_mixture_components, 1)), tf.float32) 95 | phi_flat = tf.reduce_sum(alpha*tf.exp(-tf.square(kappa - u) / beta), axis=1) 96 | 97 | phi = tf.expand_dims(phi_flat, 2) 98 | sequence_mask = tf.cast(tf.sequence_mask(self.attention_values_lengths, maxlen=self.char_len), tf.float32) 99 | sequence_mask = tf.expand_dims(sequence_mask, 2) 100 | w = tf.reduce_sum(phi*self.attention_values*sequence_mask, axis=1) 101 | 102 | # lstm 2 103 | s2_in = tf.concat([inputs, s1_out, w], axis=1) 104 | cell2 = tf.contrib.rnn.LSTMCell(self.lstm_size) 105 | s2_out, s2_state = cell2(s2_in, state=(state.c2, state.h2)) 106 | 107 | # lstm 3 108 | s3_in = tf.concat([inputs, s2_out, w], axis=1) 109 | cell3 = tf.contrib.rnn.LSTMCell(self.lstm_size) 110 | s3_out, s3_state = cell3(s3_in, state=(state.c3, state.h3)) 111 | 112 | new_state = LSTMAttentionCellState( 113 | s1_state.h, 114 | s1_state.c, 115 | s2_state.h, 116 | s2_state.c, 117 | s3_state.h, 118 | s3_state.c, 119 | alpha_flat, 120 | beta_flat, 121 | kappa_flat, 122 | w, 123 | phi_flat, 124 | ) 125 | 126 | return s3_out, new_state 127 | 128 | def output_function(self, state): 129 | params = dense_layer(state.h3, self.output_units, scope='gmm', reuse=tf.AUTO_REUSE) 130 | pis, mus, sigmas, rhos, es = self._parse_parameters(params) 131 | mu1, mu2 = tf.split(mus, 2, axis=1) 132 | mus = tf.stack([mu1, mu2], axis=2) 133 | sigma1, sigma2 = tf.split(sigmas, 2, axis=1) 134 | 135 | covar_matrix = [tf.square(sigma1), rhos*sigma1*sigma2, 136 | rhos*sigma1*sigma2, tf.square(sigma2)] 137 | covar_matrix = tf.stack(covar_matrix, axis=2) 138 | covar_matrix = tf.reshape(covar_matrix, (self.batch_size, self.num_output_mixture_components, 2, 2)) 139 | 140 | mvn = tfd.MultivariateNormalFullCovariance(loc=mus, covariance_matrix=covar_matrix) 141 | b = tfd.Bernoulli(probs=es) 142 | c = tfd.Categorical(probs=pis) 143 | 144 | sampled_e = b.sample() 145 | sampled_coords = mvn.sample() 146 | sampled_idx = c.sample() 147 | 148 | idx = tf.stack([tf.range(self.batch_size), sampled_idx], axis=1) 149 | coords = tf.gather_nd(sampled_coords, idx) 150 | return tf.concat([coords, tf.cast(sampled_e, tf.float32)], axis=1) 151 | 152 | def termination_condition(self, state): 153 | char_idx = tf.cast(tf.argmax(state.phi, axis=1), tf.int32) 154 | 
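# generation stops once the attention window sits on the last character and an
# end-of-stroke is emitted, or once it has moved past the end of the text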
final_char = char_idx >= self.attention_values_lengths - 1 155 | past_final_char = char_idx >= self.attention_values_lengths 156 | output = self.output_function(state) 157 | es = tf.cast(output[:, 2], tf.int32) 158 | is_eos = tf.equal(es, np.ones_like(es)) 159 | return tf.logical_or(tf.logical_and(final_char, is_eos), past_final_char) 160 | 161 | def _parse_parameters(self, gmm_params, eps=1e-8, sigma_eps=1e-4): 162 | pis, sigmas, rhos, mus, es = tf.split( 163 | gmm_params, 164 | [ 165 | 1*self.num_output_mixture_components, 166 | 2*self.num_output_mixture_components, 167 | 1*self.num_output_mixture_components, 168 | 2*self.num_output_mixture_components, 169 | 1 170 | ], 171 | axis=-1 172 | ) 173 | pis = pis*(1 + tf.expand_dims(self.bias, 1)) 174 | sigmas = sigmas - tf.expand_dims(self.bias, 1) 175 | 176 | pis = tf.nn.softmax(pis, axis=-1) 177 | pis = tf.where(pis < .01, tf.zeros_like(pis), pis) 178 | sigmas = tf.clip_by_value(tf.exp(sigmas), sigma_eps, np.inf) 179 | rhos = tf.clip_by_value(tf.tanh(rhos), eps - 1.0, 1.0 - eps) 180 | es = tf.clip_by_value(tf.nn.sigmoid(es), eps, 1.0 - eps) 181 | es = tf.where(es < .01, tf.zeros_like(es), es) 182 | 183 | return pis, mus, sigmas, rhos, es 184 | -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-0-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-0-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-0-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-0-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-1-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-1-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-1-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-1-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-1.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-1.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-10-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-10-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-10-strokes.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-10-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-11-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-11-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-11-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-11-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-12-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-12-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-12-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-12-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-2-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-2-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-2-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-2-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-2.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-3-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-3-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-3-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-3-strokes.npy -------------------------------------------------------------------------------- 
/handwriting-synthesis/styles/style-4-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-4-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-4-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-4-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-5-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-5-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-5-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-5-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-6-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-6-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-6-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-6-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-7-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-7-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-7-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-7-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-8-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-8-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-8-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-8-strokes.npy 
-------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-9-chars.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-9-chars.npy -------------------------------------------------------------------------------- /handwriting-synthesis/styles/style-9-strokes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting-synthesis/styles/style-9-strokes.npy -------------------------------------------------------------------------------- /handwriting-synthesis/test_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | 4 | import numpy as np 5 | 6 | import drawing 7 | from data_frame import DataFrame 8 | from drawing import alphabet 9 | 10 | import svgwrite 11 | 12 | 13 | class DataReader(object): 14 | 15 | def __init__(self, data_dir): 16 | data_cols = ['x', 'x_len', 'c', 'c_len'] 17 | data = [np.load(os.path.join(data_dir, '{}.npy'.format(i))) for i in data_cols] 18 | 19 | self.test_df = DataFrame(columns=data_cols, data=data) 20 | self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.95, random_state=2018) 21 | 22 | print('train size', len(self.train_df)) 23 | print('val size', len(self.val_df)) 24 | print('test size', len(self.test_df)) 25 | 26 | def train_batch_generator(self, batch_size): 27 | return self.batch_generator( 28 | batch_size=batch_size, 29 | df=self.train_df, 30 | shuffle=True, 31 | num_epochs=10000, 32 | mode='train' 33 | ) 34 | 35 | def val_batch_generator(self, batch_size): 36 | return self.batch_generator( 37 | batch_size=batch_size, 38 | df=self.val_df, 39 | shuffle=True, 40 | num_epochs=10000, 41 | mode='val' 42 | ) 43 | 44 | def test_batch_generator(self, batch_size): 45 | return self.batch_generator( 46 | batch_size=batch_size, 47 | df=self.test_df, 48 | shuffle=False, 49 | num_epochs=1, 50 | mode='test' 51 | ) 52 | 53 | def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, mode='train'): 54 | gen = df.batch_generator( 55 | batch_size=batch_size, 56 | shuffle=shuffle, 57 | num_epochs=num_epochs, 58 | allow_smaller_final_batch=(mode == 'test') 59 | ) 60 | for batch in gen: 61 | batch['x_len'] = batch['x_len'] - 1 62 | max_x_len = np.max(batch['x_len']) 63 | max_c_len = np.max(batch['c_len']) 64 | batch['y'] = batch['x'][:, 1:max_x_len + 1, :] 65 | batch['x'] = batch['x'][:, :max_x_len, :] 66 | batch['c'] = batch['c'][:, :max_c_len] 67 | yield batch 68 | 69 | 70 | def _draw(strokes, lines, filename, stroke_colors=None, stroke_widths=None): 71 | stroke_colors = stroke_colors or ['black']*len(lines) 72 | stroke_widths = stroke_widths or [2]*len(lines) 73 | 74 | line_height = 60 75 | view_width = 1000 76 | view_height = line_height*(len(strokes) + 1) 77 | 78 | dwg = svgwrite.Drawing(filename=filename) 79 | dwg.viewbox(width=view_width, height=view_height) 80 | dwg.add(dwg.rect(insert=(0, 0), size=(view_width, view_height), fill='white')) 81 | 82 | initial_coord = np.array([0, -(3*line_height / 4)]) 83 | for offsets, line, color, width in zip(strokes, lines, stroke_colors, stroke_widths): 84 | 85 | if not line: 86 | initial_coord[1] -= line_height 87 | continue 88 | 89 | offsets[:, 
:2] *= 1.5 90 | strokes = drawing.offsets_to_coords(offsets) 91 | strokes = drawing.denoise(strokes) 92 | strokes[:, :2] = drawing.align(strokes[:, :2]) 93 | 94 | strokes[:, 1] *= -1 95 | strokes[:, :2] -= strokes[:, :2].min() + initial_coord 96 | strokes[:, 0] += (view_width - strokes[:, 0].max()) / 2 97 | 98 | prev_eos = 1.0 99 | p = "M{},{} ".format(0, 0) 100 | for x, y, eos in zip(*strokes.T): 101 | p += '{}{},{} '.format('M' if prev_eos == 1.0 else 'L', x, y) 102 | prev_eos = eos 103 | path = svgwrite.path.Path(p) 104 | path = path.stroke(color=color, width=width, linecap='round').fill("none") 105 | dwg.add(path) 106 | 107 | initial_coord[1] -= line_height 108 | 109 | dwg.save() 110 | 111 | 112 | def num_to_string(c, c_len): 113 | indices = c[:c_len - 1] 114 | str_out = ''.join([alphabet[x] for x in indices]) 115 | return str_out 116 | 117 | 118 | if __name__ == '__main__': 119 | dr = DataReader(data_dir='data/processed/') 120 | # import ipdb; ipdb.set_trace() 121 | 122 | stroke_colors = ['red', 'green', 'black', 'blue'] 123 | stroke_widths = [1, 2, 1, 2] 124 | 125 | lines = [ 126 | num_to_string(dr.test_df['c'][0], dr.test_df['c_len'][0]), 127 | num_to_string(dr.test_df['c'][1], dr.test_df['c_len'][1]), 128 | ] 129 | strokes = [ 130 | dr.test_df['x'][0][:dr.test_df['x_len'][0]], 131 | dr.test_df['x'][1][:dr.test_df['x_len'][1]], 132 | ] 133 | 134 | import ipdb; ipdb.set_trace() 135 | 136 | _draw(strokes, lines, "test.svg", stroke_colors=stroke_colors, stroke_widths=stroke_widths) 137 | -------------------------------------------------------------------------------- /handwriting-synthesis/tf_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def dense_layer(inputs, output_units, bias=True, activation=None, batch_norm=None, 5 | dropout=None, scope='dense-layer', reuse=False): 6 | """ 7 | Applies a dense layer to a 2D tensor of shape [batch_size, input_units] 8 | to produce a tensor of shape [batch_size, output_units]. 9 | Args: 10 | inputs: Tensor of shape [batch size, input_units]. 11 | output_units: Number of output units. 12 | activation: activation function. 13 | dropout: dropout keep prob. 14 | Returns: 15 | Tensor of shape [batch size, output_units]. 16 | """ 17 | with tf.variable_scope(scope, reuse=reuse): 18 | W = tf.get_variable( 19 | name='weights', 20 | initializer=tf.contrib.layers.variance_scaling_initializer(), 21 | shape=[shape(inputs, -1), output_units] 22 | ) 23 | z = tf.matmul(inputs, W) 24 | if bias: 25 | b = tf.get_variable( 26 | name='biases', 27 | initializer=tf.constant_initializer(), 28 | shape=[output_units] 29 | ) 30 | z = z + b 31 | 32 | if batch_norm is not None: 33 | z = tf.layers.batch_normalization(z, training=batch_norm, reuse=reuse) 34 | 35 | z = activation(z) if activation else z 36 | z = tf.nn.dropout(z, dropout) if dropout is not None else z 37 | return z 38 | 39 | 40 | def time_distributed_dense_layer( 41 | inputs, output_units, bias=True, activation=None, batch_norm=None, 42 | dropout=None, scope='time-distributed-dense-layer', reuse=False): 43 | """ 44 | Applies a shared dense layer to each timestep of a tensor of shape 45 | [batch_size, max_seq_len, input_units] to produce a tensor of shape 46 | [batch_size, max_seq_len, output_units]. 47 | 48 | Args: 49 | inputs: Tensor of shape [batch size, max sequence length, ...]. 50 | output_units: Number of output units. 51 | activation: activation function. 52 | dropout: dropout keep prob. 
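bias: whether to add a learned bias term.
batch_norm: if not None, a training flag passed to batch normalization.
scope: variable scope name for the layer's weights.
reuse: whether to reuse variables within that scope.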
53 | 54 | Returns: 55 | Tensor of shape [batch size, max sequence length, output_units]. 56 | """ 57 | with tf.variable_scope(scope, reuse=reuse): 58 | W = tf.get_variable( 59 | name='weights', 60 | initializer=tf.contrib.layers.variance_scaling_initializer(), 61 | shape=[shape(inputs, -1), output_units] 62 | ) 63 | z = tf.einsum('ijk,kl->ijl', inputs, W) 64 | if bias: 65 | b = tf.get_variable( 66 | name='biases', 67 | initializer=tf.constant_initializer(), 68 | shape=[output_units] 69 | ) 70 | z = z + b 71 | 72 | if batch_norm is not None: 73 | z = tf.layers.batch_normalization(z, training=batch_norm, reuse=reuse) 74 | 75 | z = activation(z) if activation else z 76 | z = tf.nn.dropout(z, dropout) if dropout is not None else z 77 | return z 78 | 79 | 80 | def shape(tensor, dim=None): 81 | """Get tensor shape/dimension as list/int""" 82 | if dim is None: 83 | return tensor.shape.as_list() 84 | else: 85 | return tensor.shape.as_list()[dim] 86 | 87 | 88 | def rank(tensor): 89 | """Get tensor rank as python list""" 90 | return len(tensor.shape.as_list()) 91 | -------------------------------------------------------------------------------- /handwriting-synthesis/upgrade_tf2.sh: -------------------------------------------------------------------------------- 1 | # NOTE: tf_upgrade_v2 does not successfully convert all these files to tf2 2 | tf_upgrade_v2 --infile rnn_cell.py --outfile rnn_cell.py 3 | tf_upgrade_v2 --infile rnn.py --outfile rnn.py 4 | tf_upgrade_v2 --infile rnn_ops.py --outfile rnn_ops.py 5 | tf_upgrade_v2 --infile tf_utils.py --outfile tf_utils.py 6 | -------------------------------------------------------------------------------- /handwriting_generator/saved.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/handwriting_generator/saved.tgz -------------------------------------------------------------------------------- /minGPT/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /minGPT/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) Copyright (c) 2020 Andrej Karpathy 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /minGPT/README.md: -------------------------------------------------------------------------------- 1 | 2 | # minGPT 3 | 4 | ![mingpt](mingpt.jpg) 5 | 6 | A PyTorch re-implementation of [GPT](https://github.com/openai/gpt-3) training. minGPT tries to be small, clean, interpretable and educational, as most of the currently available ones are a bit sprawling. GPT is not a complicated model and this implementation is appropriately about 300 lines of code, including boilerplate and a totally unnecessary custom causal self-attention module. Anyway, all that's going on is that a sequence of indices goes into a sequence of transformer blocks, and a probability distribution of the next index comes out. The rest of the complexity is just being clever with batching (both across examples and over sequence length) so that training is efficient. 7 | 8 | The core minGPT "library" (hah) is two files: `mingpt/model.py` contains the actual Transformer model definition and `mingpt/trainer.py` is (GPT-independent) PyTorch boilerplate that trains the model. The attached Jupyter notebooks then show how the "library" (hah) can be used to train sequence models: 9 | 10 | - `play_math.ipynb` trains a GPT focused on addition (inspired by the addition section in the GPT-3 paper) 11 | - `play_char.ipynb` trains a GPT to be a character-level language model on arbitrary text, similar to my older char-rnn but with a transformer instead of an RNN 12 | - `play_image.ipynb` trains a GPT on (small) images (CIFAR-10), showing that we can model images just as text, as both can be reduced to just a sequence of integers 13 | - `play_words.ipynb` a BPE version that does not yet exist 14 | 15 | With a bpe encoder, distributed training and maybe fp16 this implementation may be able to reproduce GPT-1/GPT-2 results, though I haven't tried $$$. GPT-3 is likely out of reach as my understanding is that it does not fit into GPU memory and requires a more careful model-parallel treatment. 16 | 17 | ### Example usage 18 | 19 | This code is simple enough to just hack inline, not "used", but current API looks something like: 20 | 21 | ```python 22 | 23 | # you're on your own to define a class that returns individual examples as PyTorch LongTensors 24 | from torch.utils.data import Dataset 25 | train_dataset = MyDataset(...) 26 | test_dataset = MyDataset(...) 27 | 28 | # construct a GPT model 29 | from mingpt.model import GPT, GPTConfig 30 | mconf = GPTConfig(vocab_size, block_size, n_layer=12, n_head=12, n_embd=768) # a GPT-1 31 | model = GPT(mconf) 32 | 33 | # construct a trainer 34 | from mingpt.trainer import Trainer, TrainerConfig 35 | tconf = TrainerConfig(max_epochs=10, batch_size=256) 36 | trainer = Trainer(model, train_dataset, test_dataset, tconf) 37 | trainer.train() 38 | # (... enjoy the show for a while... ) 39 | 40 | # sample from the model (the [None, ...] and [0] are to push/pop a needed dummy batch dimension) 41 | from mingpt.utils import sample 42 | x = torch.tensor([1, 2, 3], dtype=torch.long)[None, ...] 
# context conditioning 43 | y = sample(model, x, steps=30, temperature=1.0, sample=True, top_k=5)[0] 44 | print(y) # our model filled in the integer sequence with 30 additional likely integers 45 | ``` 46 | 47 | ### References 48 | 49 | Code: 50 | 51 | - [openai/gpt-2](https://github.com/openai/gpt-2) has the model but not the training code, and in TensorFlow 52 | - [openai/image-gpt](https://github.com/openai/image-gpt) has some more modern gpt-3 like modification in its code, good reference as well 53 | - huggingface/transformers has a [language-modeling example](https://github.com/huggingface/transformers/tree/master/examples/language-modeling). It is full-featured but as a result also somewhat challenging to trace. E.g. some large functions have as much as 90% unused code behind various branching statements that is unused in the default setting of simple language modeling. 54 | 55 | Papers + some implementation notes: 56 | 57 | #### Improving Language Understanding by Generative Pre-Training (GPT-1) 58 | 59 | - Our model largely follows the original transformer work 60 | - We trained a 12-layer decoder-only transformer with masked self-attention heads (768 dimensional states and 12 attention heads). For the position-wise feed-forward networks, we used 3072 dimensional inner states. 61 | - Adam max learning rate of 2.5e-4. (later GPT-3 for this model size uses 6e-4) 62 | - LR decay: increased linearly from zero over the first 2000 updates and annealed to 0 using a cosine schedule 63 | - We train for 100 epochs on minibatches of 64 randomly sampled, contiguous sequences of 512 tokens. 64 | - Since layernorm is used extensively throughout the model, a simple weight initialization of N(0, 0.02) was sufficient 65 | - bytepair encoding (BPE) vocabulary with 40,000 merges 66 | - residual, embedding, and attention dropouts with a rate of 0.1 for regularization. 67 | - modified version of L2 regularization proposed in (37), with w = 0.01 on all non bias or gain weights 68 | - For the activation function, we used the Gaussian Error Linear Unit (GELU). 69 | - We used learned position embeddings instead of the sinusoidal version proposed in the original work 70 | - For finetuning: We add dropout to the classifier with a rate of 0.1. learning rate of 6.25e-5 and a batchsize of 32. 3 epochs. We use a linear learning rate decay schedule with warmup over 0.2% of training. λ was set to 0.5. 71 | - GPT-1 model is 12 layers and d_model 768, ~117M params 72 | 73 | #### Language Models are Unsupervised Multitask Learners (GPT-2) 74 | 75 | - LayerNorm was moved to the input of each sub-block, similar to a pre-activation residual network 76 | - an additional layer normalization was added after the final self-attention block. 77 | - modified initialization which accounts for the accumulation on the residual path with model depth is used. We scale the weights of residual layers at initialization by a factor of 1/√N where N is the number of residual layers. (weird because in their released code i can only find a simple use of the old 0.02... in their release of image-gpt I found it used for c_proj, and even then only for attn, not for mlp. huh. https://github.com/openai/image-gpt/blob/master/src/model.py) 78 | - the vocabulary is expanded to 50,257 79 | - increase the context size from 512 to 1024 tokens 80 | - larger batchsize of 512 is used 81 | - GPT-2 used 48 layers and d_model 1600 (vs. original 12 layers and d_model 768). 
~1.542B params 82 | 83 | #### Language Models are Few-Shot Learners (GPT-3) 84 | 85 | - GPT-3: 96 layers, 96 heads, with d_model of 12,288 (175B parameters). 86 | - GPT-1-like: 12 layers, 12 heads, d_model 768 (125M) 87 | - We use the same model and architecture as GPT-2, including the modified initialization, pre-normalization, and reversible tokenization described therein 88 | - we use alternating dense and locally banded sparse attention patterns in the layers of the transformer, similar to the Sparse Transformer 89 | - we always have the feedforward layer four times the size of the bottleneck layer, dff = 4 ∗ dmodel 90 | - all models use a context window of nctx = 2048 tokens. 91 | - Adam with β1 = 0.9, β2 = 0.95, and eps = 10−8 92 | - All models use weight decay of 0.1 to provide a small amount of regularization. (NOTE: GPT-1 used 0.01 I believe, see above) 93 | - clip the global norm of the gradient at 1.0 94 | - Linear LR warmup over the first 375 million tokens. Then use cosine decay for learning rate down to 10% of its value, over 260 billion tokens. (See the sketch after these lists.) 95 | - gradually increase the batch size linearly from a small value (32k tokens) to the full value over the first 4-12 billion tokens of training, depending on the model size. 96 | - full 2048-sized time context window is always used, with a special END OF DOCUMENT token delimiter 97 | 98 | #### Generative Pretraining from Pixels (Image GPT) 99 | 100 | - When working with images, we pick the identity permutation πi = i for 1 ≤ i ≤ n, also known as raster order. 101 | - we create our own 9-bit color palette by clustering (R, G, B) pixel values using k-means with k = 512. 102 | - Our largest model, iGPT-XL, contains L = 60 layers and uses an embedding size of d = 3072 for a total of 6.8B parameters. 103 | - Our next largest model, iGPT-L, is essentially identical to GPT-2 with L = 48 layers, but contains a slightly smaller embedding size of d = 1536 (vs 1600) for a total of 1.4B parameters. 104 | - We use the same model code as GPT-2, except that we initialize weights in the layer-dependent fashion as in Sparse Transformer (Child et al., 2019) and zero-initialize all projections producing logits. 105 | - We also train iGPT-M, a 455M parameter model with L = 36 and d = 1024 106 | - iGPT-S, a 76M parameter model with L = 24 and d = 512 (okay, and how many heads? looks like the Github code claims 8) 107 | - When pre-training iGPT-XL, we use a batch size of 64 and train for 2M iterations, and for all other models we use a batch size of 128 and train for 1M iterations. 108 | - Adam with β1 = 0.9 and β2 = 0.95 109 | - The learning rate is warmed up for one epoch, and then decays to 0 110 | - We did not use weight decay because applying a small weight decay of 0.01 did not change representation quality. 111 | - iGPT-S lr 0.003 112 | - No dropout is used.
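As a quick illustration of the warmup-then-cosine learning-rate schedule quoted in the GPT-3 notes above (and implemented in the `lr_decay` branch of `mingpt/trainer.py`), here is a minimal, self-contained sketch. The function name `lr_multiplier` is just for illustration; the `warmup_tokens`/`final_tokens` defaults are the GPT-3 figures quoted above.

```python
import math

def lr_multiplier(tokens_processed, warmup_tokens=375e6, final_tokens=260e9):
    # linear warmup from 0 to the base learning rate over the first warmup_tokens
    if tokens_processed < warmup_tokens:
        return tokens_processed / max(1, warmup_tokens)
    # then cosine decay, never dropping below 10% of the base learning rate
    progress = (tokens_processed - warmup_tokens) / max(1, final_tokens - warmup_tokens)
    return max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))

# usage: lr = base_lr * lr_multiplier(tokens_seen_so_far)
```

This mirrors the token-based schedule applied per step in the included `Trainer.train()` loop.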
113 | 114 | ### License 115 | 116 | MIT 117 | -------------------------------------------------------------------------------- /minGPT/mingpt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/minGPT/mingpt.jpg -------------------------------------------------------------------------------- /minGPT/mingpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/minGPT/mingpt/__init__.py -------------------------------------------------------------------------------- /minGPT/mingpt/trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple training loop; Boilerplate that could apply to any arbitrary neural network, 3 | so nothing in this file really has anything to do with GPT specifically. 4 | """ 5 | 6 | import math 7 | import logging 8 | 9 | from tqdm import tqdm 10 | import numpy as np 11 | 12 | import torch 13 | import torch.optim as optim 14 | from torch.optim.lr_scheduler import LambdaLR 15 | from torch.utils.data.dataloader import DataLoader 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | class TrainerConfig: 20 | # optimization parameters 21 | max_epochs = 10 22 | batch_size = 64 23 | learning_rate = 3e-4 24 | betas = (0.9, 0.95) 25 | grad_norm_clip = 1.0 26 | weight_decay = 0.1 # only applied on matmul weights 27 | # learning rate decay params: linear warmup followed by cosine decay to 10% of original 28 | lr_decay = False 29 | warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere 30 | final_tokens = 260e9 # (at what point we reach 10% of original LR) 31 | # checkpoint settings 32 | ckpt_path = None 33 | num_workers = 0 # for DataLoader 34 | 35 | def __init__(self, **kwargs): 36 | for k,v in kwargs.items(): 37 | setattr(self, k, v) 38 | 39 | class Trainer: 40 | 41 | def __init__(self, model, train_dataset, test_dataset, config): 42 | self.model = model 43 | self.train_dataset = train_dataset 44 | self.test_dataset = test_dataset 45 | self.config = config 46 | 47 | # take over whatever gpus are on the system 48 | self.device = 'cpu' 49 | if torch.cuda.is_available(): 50 | self.device = torch.cuda.current_device() 51 | self.model = torch.nn.DataParallel(self.model).to(self.device) 52 | 53 | def save_checkpoint(self): 54 | # DataParallel wrappers keep raw model object in .module attribute 55 | raw_model = self.model.module if hasattr(self.model, "module") else self.model 56 | logger.info("saving %s", self.config.ckpt_path) 57 | torch.save(raw_model.state_dict(), self.config.ckpt_path) 58 | 59 | def train(self): 60 | model, config = self.model, self.config 61 | raw_model = model.module if hasattr(self.model, "module") else model 62 | optimizer = raw_model.configure_optimizers(config) 63 | 64 | def run_epoch(split): 65 | is_train = split == 'train' 66 | model.train(is_train) 67 | data = self.train_dataset if is_train else self.test_dataset 68 | loader = DataLoader(data, shuffle=True, pin_memory=True, 69 | batch_size=config.batch_size, 70 | num_workers=config.num_workers) 71 | 72 | losses = [] 73 | pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader) 74 | for it, (x, y) in pbar: 75 | 76 | # place data on the correct device 77 | x = x.to(self.device) 78 | y = 
y.to(self.device) 79 | 80 | # forward the model 81 | with torch.set_grad_enabled(is_train): 82 | logits, loss = model(x, y) 83 | loss = loss.mean() # collapse all losses if they are scattered on multiple gpus 84 | losses.append(loss.item()) 85 | 86 | if is_train: 87 | 88 | # backprop and update the parameters 89 | model.zero_grad() 90 | loss.backward() 91 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip) 92 | optimizer.step() 93 | 94 | # decay the learning rate based on our progress 95 | if config.lr_decay: 96 | self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100) 97 | if self.tokens < config.warmup_tokens: 98 | # linear warmup 99 | lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens)) 100 | else: 101 | # cosine learning rate decay 102 | progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens)) 103 | lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress))) 104 | lr = config.learning_rate * lr_mult 105 | for param_group in optimizer.param_groups: 106 | param_group['lr'] = lr 107 | else: 108 | lr = config.learning_rate 109 | 110 | # report progress 111 | pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}") 112 | 113 | if not is_train: 114 | test_loss = float(np.mean(losses)) 115 | logger.info("test loss: %f", test_loss) 116 | return test_loss 117 | 118 | best_loss = float('inf') 119 | self.tokens = 0 # counter used for learning rate decay 120 | for epoch in range(config.max_epochs): 121 | 122 | run_epoch('train') 123 | if self.test_dataset is not None: 124 | test_loss = run_epoch('test') 125 | 126 | # supports early stopping based on the test loss, or just save always if no test set is provided 127 | good_model = self.test_dataset is None or test_loss < best_loss 128 | if self.config.ckpt_path is not None and good_model: 129 | best_loss = test_loss 130 | self.save_checkpoint() 131 | -------------------------------------------------------------------------------- /minGPT/mingpt/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import functional as F 6 | 7 | def set_seed(seed): 8 | random.seed(seed) 9 | np.random.seed(seed) 10 | torch.manual_seed(seed) 11 | torch.cuda.manual_seed_all(seed) 12 | 13 | def top_k_logits(logits, k): 14 | v, ix = torch.topk(logits, k) 15 | out = logits.clone() 16 | out[out < v[:, [-1]]] = -float('Inf') 17 | return out 18 | 19 | @torch.no_grad() 20 | def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): 21 | """ 22 | take a conditioning sequence of indices in x (of shape (b,t)) and predict the next token in 23 | the sequence, feeding the predictions back into the model each time. Clearly the sampling 24 | has quadratic complexity unlike an RNN that is only linear, and has a finite context window 25 | of block_size, unlike an RNN that has an infinite context window. 
26 | """ 27 | block_size = model.get_block_size() 28 | model.eval() 29 | for k in range(steps): 30 | x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed 31 | logits, _ = model(x_cond) 32 | # pluck the logits at the final step and scale by temperature 33 | logits = logits[:, -1, :] / temperature 34 | # optionally crop probabilities to only the top k options 35 | if top_k is not None: 36 | logits = top_k_logits(logits, top_k) 37 | # apply softmax to convert to probabilities 38 | probs = F.softmax(logits, dim=-1) 39 | # sample from the distribution or take the most likely 40 | if sample: 41 | ix = torch.multinomial(probs, num_samples=1) 42 | else: 43 | _, ix = torch.topk(probs, k=1, dim=-1) 44 | # append to the sequence and continue 45 | x = torch.cat((x, ix), dim=1) 46 | 47 | return x 48 | -------------------------------------------------------------------------------- /sound/preprocess/README.md: -------------------------------------------------------------------------------- 1 | # Preprocess 2 | ``` 3 | python3 to_16000_wav.py INPUT_DIR OUTPUT_DIR 4 | python3 trim.py INPUT_DIR OUTPUT_DIR 5 | ``` 6 | -------------------------------------------------------------------------------- /sound/preprocess/mp3_to_wav.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import os 5 | import glob 6 | import sys 7 | 8 | def main(): 9 | if len(sys.argv) < 3: 10 | print('Usage: python3 mp3_to_wav.py INPUT_DIR OUTPUT_DIR') 11 | return 12 | 13 | INPUT_DIR = sys.argv[1] 14 | OUTPUT_DIR = sys.argv[2] 15 | if not os.path.exists(OUTPUT_DIR): 16 | os.makedirs(OUTPUT_DIR) 17 | 18 | for mp3_path in glob.glob(os.path.join(INPUT_DIR, '*.mp3')): 19 | name = os.path.split(mp3_path)[1][:-len('.mp3')] 20 | output_path = os.path.join(OUTPUT_DIR, name + '.wav') 21 | os.system('ffmpeg -i "{}" -ar 16000 "{}"'.format(mp3_path, output_path)) 22 | print(mp3_path) 23 | print(output_path) 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /sound/preprocess/to_16000_wav.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import os 5 | import glob 6 | import sys 7 | 8 | def main(): 9 | if len(sys.argv) < 3: 10 | print('Usage: python3 to_16000_wav.py INPUT_DIR OUTPUT_DIR') 11 | return 12 | 13 | INPUT_DIR = sys.argv[1] 14 | OUTPUT_DIR = sys.argv[2] 15 | if not os.path.exists(OUTPUT_DIR): 16 | os.makedirs(OUTPUT_DIR) 17 | 18 | for wav_path in glob.glob(os.path.join(INPUT_DIR, '*.wav')): 19 | name = os.path.split(wav_path)[1][:-len('.wav')] 20 | output_path = os.path.join(OUTPUT_DIR, name + '.wav') 21 | os.system('ffmpeg -i "{}" -ar 16000 "{}"'.format(wav_path, output_path)) 22 | print(wav_path) 23 | print(output_path) 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /sound/preprocess/trim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert to .wav 3 | ffmpeg -i input.m4a output.wav 4 | 5 | Split .wav files with ffmpeg: 6 | NAME=name.wav 7 | ffmpeg -i $NAME.wav -f segment -segment_time 2 -c copy one_second/$NAME%03d.wav 8 | 9 | python3 to_16000_wav.py INPUT_DIR OUTPUT_DIR 10 | python3 trim.py INPUT_DIR OUTPUT_DIR 11 | 12 | https://petewarden.com/2017/07/17/a-quick-hack-to-align-single-word-audio-recordings/ 13 | 14 | NOTE: Run make from the extract_loudest_section repo before running
this script 15 | """ 16 | import glob 17 | import os 18 | import sys 19 | 20 | def main(): 21 | if len(sys.argv) < 3: 22 | print('Usage: python3 trim.py INPUT_DIR OUTPUT_DIR') 23 | return 24 | 25 | if not os.path.exists(sys.argv[2]): 26 | os.makedirs(sys.argv[2]) 27 | 28 | file_names = glob.glob(os.path.join(sys.argv[1], '*.wav')) 29 | for filename in file_names: 30 | print(filename) 31 | os.system('/tmp/extract_loudest_section/gen/bin/extract_loudest_section "{}" "{}"'.format(filename, sys.argv[2])) 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/.DS_Store -------------------------------------------------------------------------------- /sound/project-keyword-spotter/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/Icon : -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/Icon -------------------------------------------------------------------------------- /sound/project-keyword-spotter/audio_recorder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Interface to asynchronously capture continuous audio from PyAudio. 16 | 17 | 18 | This module requires pyaudio. See here for installation instructions: 19 | http://people.csail.mit.edu/hubert/pyaudio/ 20 | 21 | This module provides one class, AudioRecorder, which buffers chunks of audio 22 | from PyAudio. 23 | """ 24 | 25 | from __future__ import absolute_import 26 | from __future__ import division 27 | from __future__ import print_function 28 | 29 | import logging 30 | 31 | import math 32 | import time 33 | 34 | import numpy as np 35 | import pyaudio 36 | import queue 37 | 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | class TimeoutError(Exception): 42 | """A timeout while waiting for pyaudio to buffer samples.""" 43 | pass 44 | 45 | 46 | class AudioRecorder(object): 47 | """Asynchronously record and buffer audio using pyaudio. 48 | 49 | This class wraps the pyaudio interface. It contains a queue.Queue object to 50 | hold chunks of raw audio, and a callback function _enqueue_raw_audio() which 51 | places raw audio into this queue. This allows the pyaudio.Stream object to 52 | record asynchronously at low latency. 53 | 54 | The class acts as a context manager. When entering the context it creates a 55 | pyaudio.Stream object and starts recording; it stops recording on exit. The 56 | Stream saves all of its audio to the Queue as two-tuples of 57 | (raw_audio, timestamp). The raw_audio is available from the queue as a numpy 58 | array using the get_audio() function. 59 | 60 | This class uses the term "frame" in the same sense that PortAudio does, so 61 | "frame" means something different here than elsewhere in the daredevil stack. 62 | A frame in PortAudio is one audio sample across all channels, so one frame of 63 | 16-bit stereo audio is four bytes of data as two 16-bit integers. 64 | """ 65 | pyaudio_format = pyaudio.paInt16 66 | numpy_format = np.int16 67 | num_channels = 1 68 | 69 | # How many frames of audio PyAudio will fetch at once. 70 | # Higher numbers will increase the latency. 71 | frames_per_chunk = 2**9 72 | 73 | # Limit queue to this number of audio chunks. 74 | max_queue_chunks = 1200 75 | 76 | # Timeout if we can't get a chunk from the queue for timeout_factor times the 77 | # chunk duration.
78 | timeout_factor = 8 79 | 80 | def __init__(self, raw_audio_sample_rate_hz=48000, 81 | downsample_factor=3, 82 | device_index=None): 83 | self._downsample_factor = downsample_factor 84 | self._raw_audio_sample_rate_hz = raw_audio_sample_rate_hz 85 | self.audio_sample_rate_hz = self._raw_audio_sample_rate_hz // self._downsample_factor 86 | self._raw_audio_queue = queue.Queue(self.max_queue_chunks) 87 | self._audio = pyaudio.PyAudio() 88 | self._print_input_devices() 89 | self._device_index = device_index 90 | 91 | def __enter__(self): 92 | if self._device_index is None: 93 | self._device_index = self._audio.get_default_input_device_info()["index"] 94 | kwargs = { 95 | "input_device_index": self._device_index 96 | } 97 | device_info = self._audio.get_device_info_by_host_api_device_index( 98 | 0, self._device_index) 99 | if device_info.get("maxInputChannels") <= 0: 100 | raise ValueError("Audio device has insufficient input channels.") 101 | print("Using audio device '%s' for index %d" % ( 102 | device_info["name"], device_info["index"])) 103 | self._stream = self._audio.open( 104 | format=self.pyaudio_format, 105 | channels=self.num_channels, 106 | rate=self._raw_audio_sample_rate_hz, 107 | input=True, 108 | output=False, 109 | frames_per_buffer=self.frames_per_chunk, 110 | start=True, 111 | stream_callback=self._enqueue_raw_audio, 112 | **kwargs) 113 | logger.info("Started audio stream.") 114 | return self 115 | 116 | def __exit__(self, exception_type, exception_value, traceback): 117 | self._stream.stop_stream() 118 | self._stream.close() 119 | logger.info("Stopped and closed audio stream.") 120 | 121 | def __del__(self): 122 | self._audio.terminate() 123 | logger.info("Terminated PyAudio/PortAudio.") 124 | 125 | @property 126 | def is_active(self): 127 | return self._stream.is_active() 128 | 129 | @property 130 | def bytes_per_sample(self): 131 | return pyaudio.get_sample_size(self.pyaudio_format) 132 | 133 | @property 134 | def _chunk_duration_seconds(self): 135 | return self.frames_per_chunk / self._raw_audio_sample_rate_hz 136 | 137 | def _print_input_devices(self): 138 | info = self._audio.get_host_api_info_by_index(0) 139 | print("\nInput microphone devices:") 140 | for i in range(0, info.get("deviceCount")): 141 | device_info = self._audio.get_device_info_by_host_api_device_index(0, i) 142 | if device_info.get("maxInputChannels") <= 0: continue 143 | print(" ID: ", i, " - ", device_info.get("name")) 144 | 145 | def _enqueue_raw_audio(self, in_data, *_): # unused args to match expected 146 | try: 147 | self._raw_audio_queue.put((in_data, time.time()), block=False) 148 | return None, pyaudio.paContinue 149 | except queue.Full: 150 | error_message = "Raw audio buffer full." 
151 | logger.critical(error_message) 152 | raise TimeoutError(error_message) 153 | 154 | def _get_chunk(self, timeout=None): 155 | raw_data, timestamp = self._raw_audio_queue.get(timeout=timeout) 156 | array_data = np.fromstring(raw_data, self.numpy_format).reshape( 157 | -1, self.num_channels) 158 | return array_data, timestamp 159 | 160 | def get_audio_device_info(self): 161 | if self._device_index is None: 162 | return self._audio.get_default_input_device_info() 163 | else: 164 | return self._audio.get_device_info_by_index(self._device_index) 165 | 166 | def sample_duration_seconds(self, num_samples): 167 | return num_samples / self.audio_sample_rate_hz / self.num_channels 168 | 169 | def clear_queue(self): 170 | logger.debug("Purging %d chunks from queue.", self._raw_audio_queue.qsize()) 171 | while not self._raw_audio_queue.empty(): 172 | self._raw_audio_queue.get() 173 | 174 | def get_audio(self, num_audio_frames): 175 | """Grab at least num_audio_frames frames of audio. 176 | 177 | Record at least num_audio_frames of audio and transform it into a 178 | numpy array. The term "frame" is in the sense used by PortAudio; see the 179 | note in the class docstring for details. 180 | 181 | Audio returned will be the earliest audio in the queue; it could be from 182 | before this function was called. 183 | 184 | Args: 185 | num_audio_frames: minimum number of samples of audio to grab. 186 | 187 | Returns: 188 | A tuple of (audio, first_timestamp, last_timestamp). 189 | """ 190 | num_audio_chunks = int(math.ceil(num_audio_frames * 191 | self._downsample_factor / self.frames_per_chunk)) 192 | logger.debug("Capturing %d chunks to get at least %d frames.", 193 | num_audio_chunks, num_audio_frames) 194 | if num_audio_chunks < 1: 195 | num_audio_chunks = 1 196 | try: 197 | timeout = self.timeout_factor * self._chunk_duration_seconds 198 | chunks, timestamps = zip( 199 | *[self._get_chunk(timeout=timeout) for _ in range(num_audio_chunks)]) 200 | except queue.Empty: 201 | error_message = "Audio capture timed out after %.1f seconds." % timeout 202 | logger.critical(error_message) 203 | raise TimeoutError(error_message) 204 | 205 | assert len(chunks) == num_audio_chunks 206 | logger.debug("Got %d chunks. 
Chunk 0 has shape %s and dtype %s.", 207 | len(chunks), chunks[0].shape, chunks[0].dtype) 208 | if self._raw_audio_queue.qsize() > (0.8 * self.max_queue_chunks): 209 | logger.warning("%d chunks remain in the queue.", 210 | self._raw_audio_queue.qsize()) 211 | else: 212 | logger.debug("%d chunks remain in the queue.", 213 | self._raw_audio_queue.qsize()) 214 | 215 | audio = np.concatenate(chunks) 216 | if self._downsample_factor != 1: 217 | audio = audio[::self._downsample_factor] 218 | logging.debug("Audio array has shape %s and dtype %s.", audio.shape, 219 | audio.dtype) 220 | return audio * 0.5, timestamps[0], timestamps[-1] 221 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/config/Icon : -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/config/Icon -------------------------------------------------------------------------------- /sound/project-keyword-spotter/config/commands_v2.txt: -------------------------------------------------------------------------------- 1 | volume_up,up, 2 | volume_down,down, 3 | next_song,shift+n, 4 | next_video,shift+n, 5 | next_game,shift+n, 6 | last_song,shift+p, 7 | last_video,shift+p, 8 | last_game,shift+p, 9 | random_song,r, 10 | random_video,r, 11 | pause_song, , 12 | pause_video, , 13 | pause_game, , 14 | stop_song, , 15 | stop_video, , 16 | start_song, , 17 | start_video, , 18 | previous_song,shift+p, 19 | previous_video,shift+p, 20 | move_backwards,left, 21 | move_forwards,right, 22 | go_backwards,left, 23 | go_forwards,right,0.8 24 | position_zero,0, 25 | position_one,1, 26 | position_two,2, 27 | position_three,3, 28 | position_four,4, 29 | position_five,5, 30 | position_six,6, 31 | position_seven,7, 32 | position_eight,8, 33 | position_nine,9, 34 | mute,m, 35 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/config/commands_v2_snake.txt: -------------------------------------------------------------------------------- 1 | start_application,go, 2 | start_game,go, 3 | start_program,go, 4 | start_task,go, 5 | begin_window,go, 6 | begin_application,go, 7 | begin_game,go, 8 | begin_program,go, 9 | begin_task,go, 10 | launch_window,go, 11 | launch_application,go, 12 | launch_game,go, 13 | launch_program,go, 14 | launch_task,go, 15 | close_window,stop, 16 | close_application,stop, 17 | close_game,stop, 18 | close_program,stop, 19 | close_task,stop, 20 | stop_window,stop, 21 | stop_application,stop, 22 | stop_game,stop, 23 | stop_program,stop, 24 | stop_task,stop, 25 | exit_window,stop, 26 | exit_application,stop, 27 | exit_game,stop, 28 | exit_program,stop, 29 | exit_task,stop, 30 | kill_window,stop, 31 | kill_application,stop, 32 | kill_game,stop, 33 | kill_program,stop, 34 | kill_task,stop, 35 | kill_tab,stop, 36 | engage,go, 37 | switch_on,go, 38 | switch_off,stop, 39 | move_up,up, 40 | move_down,down, 41 | move_left,left, 42 | move_right,right, 43 | turn_up,up, 44 | turn_down,down, 45 | turn_left,left, 46 | turn_right,right, 47 | go_up,up, 48 | go_down,down, 49 | go_left,left, 50 | go_right,right, 51 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/config/labels_gc2.raw.txt: -------------------------------------------------------------------------------- 1 | what_can_i_say 2 | what_can_you_do 3 | yes 4 
| no 5 | start_window 6 | start_application 7 | start_game 8 | start_program 9 | start_task 10 | start_tab 11 | begin_window 12 | begin_application 13 | begin_game 14 | begin_program 15 | begin_task 16 | begin_tab 17 | launch_window 18 | launch_application 19 | launch_game 20 | launch_program 21 | launch_task 22 | launch_tab 23 | open_window 24 | open_application 25 | open_game 26 | open_program 27 | open_task 28 | open_tab 29 | close_window 30 | close_application 31 | close_game 32 | close_program 33 | close_task 34 | close_tab 35 | stop_window 36 | stop_application 37 | stop_game 38 | stop_program 39 | stop_task 40 | stop_tab 41 | terminate_window 42 | terminate_application 43 | terminate_game 44 | terminate_program 45 | terminate_task 46 | terminate_tab 47 | exit_window 48 | exit_application 49 | exit_game 50 | exit_program 51 | exit_task 52 | exit_tab 53 | kill_window 54 | kill_application 55 | kill_game 56 | kill_program 57 | kill_task 58 | kill_tab 59 | engage 60 | target 61 | switch_on 62 | switch_off 63 | pick_up 64 | volume_up 65 | volume_down 66 | remove 67 | delete 68 | mute 69 | unmute 70 | silence 71 | reverse 72 | next_song 73 | next_video 74 | next_game 75 | last_song 76 | last_video 77 | last_game 78 | random_song 79 | random_video 80 | random_game 81 | pause_song 82 | pause_video 83 | pause_game 84 | stop_song 85 | stop_video 86 | start_song 87 | start_video 88 | previous_song 89 | previous_video 90 | insert 91 | select 92 | unselect 93 | move_up 94 | move_down 95 | move_left 96 | move_right 97 | move_backwards 98 | move_forwards 99 | turn_up 100 | turn_down 101 | turn_left 102 | turn_right 103 | turn_backwards 104 | turn_forwards 105 | go_up 106 | go_down 107 | go_left 108 | go_right 109 | go_backwards 110 | go_forwards 111 | channel_zero 112 | position_zero 113 | one_o_clock 114 | channel_one 115 | position_one 116 | two_o_clock 117 | channel_two 118 | position_two 119 | three_o_clock 120 | channel_three 121 | position_three 122 | four_o_clock 123 | channel_four 124 | position_four 125 | five_o_clock 126 | channel_five 127 | position_five 128 | six_o_clock 129 | channel_six 130 | position_six 131 | seven_o_clock 132 | channel_seven 133 | position_seven 134 | eight_o_clock 135 | channel_eight 136 | position_eight 137 | nine_o_clock 138 | channel_nine 139 | position_nine 140 | ten_o_clock 141 | channel_ten 142 | position_ten 143 | eleven_o_clock 144 | channel_eleven 145 | position_eleven 146 | twelve_o_clock 147 | channel_twelve 148 | position_twelve 149 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/config/labels_simple_audio.txt: -------------------------------------------------------------------------------- 1 | cough 2 | unknown 3 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/features.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Feature computation for YAMNet.""" 17 | 18 | import numpy as np 19 | import tensorflow as tf 20 | 21 | 22 | def waveform_to_log_mel_spectrogram_patches(waveform, params): 23 | """Compute log mel spectrogram patches of a 1-D waveform.""" 24 | with tf.name_scope('log_mel_features'): 25 | # waveform has shape [<# samples>] 26 | 27 | # Convert waveform into spectrogram using a Short-Time Fourier Transform. 28 | # Note that tf.signal.stft() uses a periodic Hann window by default. 29 | window_length_samples = int( 30 | round(params.sample_rate * params.stft_window_seconds)) 31 | hop_length_samples = int( 32 | round(params.sample_rate * params.stft_hop_seconds)) 33 | fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0))) 34 | num_spectrogram_bins = fft_length // 2 + 1 35 | if params.tflite_compatible: 36 | magnitude_spectrogram = _tflite_stft_magnitude( 37 | signal=waveform, 38 | frame_length=window_length_samples, 39 | frame_step=hop_length_samples, 40 | fft_length=fft_length) 41 | else: 42 | magnitude_spectrogram = tf.abs(tf.signal.stft( 43 | signals=waveform, 44 | frame_length=window_length_samples, 45 | frame_step=hop_length_samples, 46 | fft_length=fft_length)) 47 | # magnitude_spectrogram has shape [<# STFT frames>, num_spectrogram_bins] 48 | 49 | # Convert spectrogram into log mel spectrogram. 50 | linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix( 51 | num_mel_bins=params.mel_bands, 52 | num_spectrogram_bins=num_spectrogram_bins, 53 | sample_rate=params.sample_rate, 54 | lower_edge_hertz=params.mel_min_hz, 55 | upper_edge_hertz=params.mel_max_hz) 56 | mel_spectrogram = tf.matmul( 57 | magnitude_spectrogram, linear_to_mel_weight_matrix) 58 | log_mel_spectrogram = tf.math.log(mel_spectrogram + params.log_offset) 59 | # log_mel_spectrogram has shape [<# STFT frames>, params.mel_bands] 60 | 61 | # Frame spectrogram (shape [<# STFT frames>, params.mel_bands]) into patches 62 | # (the input examples). Only complete frames are emitted, so if there is 63 | # less than params.patch_window_seconds of waveform then nothing is emitted 64 | # (to avoid this, zero-pad before processing). 65 | spectrogram_hop_length_samples = int( 66 | round(params.sample_rate * params.stft_hop_seconds)) 67 | spectrogram_sample_rate = params.sample_rate / spectrogram_hop_length_samples 68 | patch_window_length_samples = int( 69 | round(spectrogram_sample_rate * params.patch_window_seconds)) 70 | patch_hop_length_samples = int( 71 | round(spectrogram_sample_rate * params.patch_hop_seconds)) 72 | features = tf.signal.frame( 73 | signal=log_mel_spectrogram, 74 | frame_length=patch_window_length_samples, 75 | frame_step=patch_hop_length_samples, 76 | axis=0) 77 | # features has shape [<# patches>, <# STFT frames in an patch>, params.mel_bands] 78 | 79 | return log_mel_spectrogram, features 80 | 81 | 82 | def pad_waveform(waveform, params): 83 | """Pads waveform with silence if needed to get an integral number of patches.""" 84 | # In order to produce one patch of log mel spectrogram input to YAMNet, we 85 | # need at least one patch window length of waveform plus enough extra samples 86 | # to complete the final STFT analysis window. 
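  # Concretely, with the default Params defined later in this listing
  # (sample_rate = 16000.0, patch_window_seconds = 0.96,
  # stft_window_seconds = 0.025, stft_hop_seconds = 0.010), that minimum is
  # 0.96 + 0.025 - 0.010 = 0.975 s, i.e. 15600 samples.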
87 | min_waveform_seconds = ( 88 | params.patch_window_seconds + 89 | params.stft_window_seconds - params.stft_hop_seconds) 90 | min_num_samples = tf.cast(min_waveform_seconds * params.sample_rate, tf.int32) 91 | num_samples = tf.shape(waveform)[0] 92 | num_padding_samples = tf.maximum(0, min_num_samples - num_samples) 93 | 94 | # In addition, there might be enough waveform for one or more additional 95 | # patches formed by hopping forward. If there are more samples than one patch, 96 | # round up to an integral number of hops. 97 | num_samples = tf.maximum(num_samples, min_num_samples) 98 | num_samples_after_first_patch = num_samples - min_num_samples 99 | hop_samples = tf.cast(params.patch_hop_seconds * params.sample_rate, tf.int32) 100 | num_hops_after_first_patch = tf.cast(tf.math.ceil( 101 | tf.cast(num_samples_after_first_patch, tf.float32) / 102 | tf.cast(hop_samples, tf.float32)), tf.int32) 103 | num_padding_samples += ( 104 | hop_samples * num_hops_after_first_patch - num_samples_after_first_patch) 105 | 106 | padded_waveform = tf.pad(waveform, [[0, num_padding_samples]], 107 | mode='CONSTANT', constant_values=0.0) 108 | return padded_waveform 109 | 110 | 111 | def _tflite_stft_magnitude(signal, frame_length, frame_step, fft_length): 112 | """TF-Lite-compatible version of tf.abs(tf.signal.stft()).""" 113 | def _hann_window(): 114 | return tf.reshape( 115 | tf.constant( 116 | (0.5 - 0.5 * np.cos(2 * np.pi * np.arange(0, 1.0, 1.0 / frame_length)) 117 | ).astype(np.float32), 118 | name='hann_window'), [1, frame_length]) 119 | 120 | def _dft_matrix(dft_length): 121 | """Calculate the full DFT matrix in NumPy.""" 122 | # See https://en.wikipedia.org/wiki/DFT_matrix 123 | omega = (0 + 1j) * 2.0 * np.pi / float(dft_length) 124 | # Don't include 1/sqrt(N) scaling, tf.signal.rfft doesn't apply it. 125 | return np.exp(omega * np.outer(np.arange(dft_length), np.arange(dft_length))) 126 | 127 | def _rdft(framed_signal, fft_length): 128 | """Implement real-input Discrete Fourier Transform by matmul.""" 129 | # We are right-multiplying by the DFT matrix, and we are keeping only the 130 | # first half ("positive frequencies"). So discard the second half of rows, 131 | # but transpose the array for right-multiplication. The DFT matrix is 132 | # symmetric, so we could have done it more directly, but this reflects our 133 | # intention better. 134 | complex_dft_matrix_kept_values = _dft_matrix(fft_length)[:( 135 | fft_length // 2 + 1), :].transpose() 136 | real_dft_matrix = tf.constant( 137 | np.real(complex_dft_matrix_kept_values).astype(np.float32), 138 | name='real_dft_matrix') 139 | imag_dft_matrix = tf.constant( 140 | np.imag(complex_dft_matrix_kept_values).astype(np.float32), 141 | name='imaginary_dft_matrix') 142 | signal_frame_length = tf.shape(framed_signal)[-1] 143 | half_pad = (fft_length - signal_frame_length) // 2 144 | padded_frames = tf.pad( 145 | framed_signal, 146 | [ 147 | # Don't add any padding in the frame dimension. 148 | [0, 0], 149 | # Pad before and after the signal within each frame. 
150 | [half_pad, fft_length - signal_frame_length - half_pad] 151 | ], 152 | mode='CONSTANT', 153 | constant_values=0.0) 154 | real_stft = tf.matmul(padded_frames, real_dft_matrix) 155 | imag_stft = tf.matmul(padded_frames, imag_dft_matrix) 156 | return real_stft, imag_stft 157 | 158 | def _complex_abs(real, imag): 159 | return tf.sqrt(tf.add(real * real, imag * imag)) 160 | 161 | framed_signal = tf.signal.frame(signal, frame_length, frame_step) 162 | windowed_signal = framed_signal * _hann_window() 163 | real_stft, imag_stft = _rdft(windowed_signal, fft_length) 164 | stft_magnitude = _complex_abs(real_stft, imag_stft) 165 | return stft_magnitude 166 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/hearing_snake_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1.0, 3 | "best_scores": [ 4 | 169, 5 | 34, 6 | 21, 7 | 0, 8 | 0, 9 | 0, 10 | 0, 11 | 0, 12 | 0, 13 | 0 14 | ] 15 | } -------------------------------------------------------------------------------- /sound/project-keyword-spotter/install_requirements.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | #!/bin/bash 15 | sudo apt-get install -y python3 python3-pyaudio python3-numpy python3-scipy 16 | 17 | sudo apt-get install -y python3-dev libsdl-image1.2-dev libsdl-mixer1.2-dev libsdl-ttf2.0-dev libsdl1.2-dev libsmpeg-dev python-numpy subversion libportmidi-dev ffmpeg libswscale-dev libavformat-dev libavcodec-dev libfreetype6-dev 18 | 19 | sudo apt-get install -y python3-pyaudio 20 | 21 | pip3 install pygame 22 | 23 | pip3 install PyUserInput 24 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/media/Icon : -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/media/Icon -------------------------------------------------------------------------------- /sound/project-keyword-spotter/media/startscreen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/media/startscreen.png -------------------------------------------------------------------------------- /sound/project-keyword-spotter/models/Icon : -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/models/Icon -------------------------------------------------------------------------------- /sound/project-keyword-spotter/models/model-backup1.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/models/model-backup1.tflite -------------------------------------------------------------------------------- /sound/project-keyword-spotter/models/model.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/models/model.tflite -------------------------------------------------------------------------------- /sound/project-keyword-spotter/models/model_quantized_edgetpu.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/models/model_quantized_edgetpu.tflite -------------------------------------------------------------------------------- /sound/project-keyword-spotter/models/voice_commands_v0.7_edgetpu.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/models/voice_commands_v0.7_edgetpu.tflite -------------------------------------------------------------------------------- /sound/project-keyword-spotter/models/voice_commands_v0.8_edgetpu.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/models/voice_commands_v0.8_edgetpu.tflite 
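The .tflite files above are the keyword and YAMNet models that run_model.py and run_model_yamnet.py (further down in this listing) load through their `model` / `model_yamnet` wrapper modules. As a rough sketch of what that loading amounts to (not the repo's own wrapper API), the snippet below drives one of the plain-CPU models with the stock TensorFlow Lite Interpreter. The *_edgetpu.tflite variants additionally require the Edge TPU delegate, and the zero-filled input is only a stand-in for the real log-mel feature window computed from microphone audio.

import numpy as np
import tensorflow as tf

# Load the CPU model from models/; the *_edgetpu.tflite files will not run
# without the libedgetpu delegate.
interpreter = tf.lite.Interpreter(model_path="models/model.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()[0]
output_details = interpreter.get_output_details()[0]

# Use whatever input shape/dtype the model itself declares; in the real
# pipeline this tensor holds a window of audio features, not zeros.
dummy = np.zeros(input_details["shape"], dtype=input_details["dtype"])
interpreter.set_tensor(input_details["index"], dummy)
interpreter.invoke()

scores = np.squeeze(interpreter.get_tensor(output_details["index"]))
print("top label index:", int(np.argmax(scores)), "score:", float(np.max(scores)))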
-------------------------------------------------------------------------------- /sound/project-keyword-spotter/params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Hyperparameters for YAMNet.""" 17 | 18 | from dataclasses import dataclass 19 | 20 | # The following hyperparameters (except patch_hop_seconds) were used to train YAMNet, 21 | # so expect some variability in performance if you change these. The patch hop can 22 | # be changed arbitrarily: a smaller hop should give you more patches from the same 23 | # clip and possibly better performance at a larger computational cost. 24 | @dataclass(frozen=True) # Instances of this class are immutable. 25 | class Params: 26 | sample_rate: float = 16000.0 27 | stft_window_seconds: float = 0.025 28 | stft_hop_seconds: float = 0.010 29 | mel_bands: int = 64 30 | mel_min_hz: float = 125.0 31 | mel_max_hz: float = 7500.0 32 | log_offset: float = 0.001 33 | patch_window_seconds: float = 0.96 34 | patch_hop_seconds: float = 0.48 35 | 36 | @property 37 | def patch_frames(self): 38 | return int(round(self.patch_window_seconds / self.stft_hop_seconds)) 39 | 40 | @property 41 | def patch_bands(self): 42 | return self.mel_bands 43 | 44 | num_classes: int = 521 45 | conv_padding: str = 'same' 46 | batchnorm_center: bool = True 47 | batchnorm_scale: bool = False 48 | batchnorm_epsilon: float = 1e-4 49 | classifier_activation: str = 'sigmoid' 50 | 51 | tflite_compatible: bool = False 52 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/pygame_images/Icon : -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/pygame_images/Icon -------------------------------------------------------------------------------- /sound/project-keyword-spotter/pygame_images/apple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/pygame_images/apple.png -------------------------------------------------------------------------------- /sound/project-keyword-spotter/pygame_images/bg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/pygame_images/bg.jpg -------------------------------------------------------------------------------- /sound/project-keyword-spotter/pygame_images/snake_head_with_ears.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/pygame_images/snake_head_with_ears.png -------------------------------------------------------------------------------- /sound/project-keyword-spotter/pygame_images/snake_tail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wileyw/DeepLearningDemos/efa7e1bc2caabad488c8420b90bb617b9af5c424/sound/project-keyword-spotter/pygame_images/snake_tail.png -------------------------------------------------------------------------------- /sound/project-keyword-spotter/run_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Runs a model on the edgetpu. 16 | 17 | Useage: 18 | python3 run_model.py --model_file model_edgetpu.tflite 19 | """ 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | import argparse 25 | import sys 26 | import model 27 | import numpy as np 28 | 29 | 30 | def print_results(result, commands, labels, top=1): 31 | """Example callback function that prints the passed detections.""" 32 | top_results = np.argsort(-result)[:top] 33 | for p in range(top): 34 | l = labels[top_results[p]] 35 | if l in commands.keys(): 36 | threshold = commands[labels[top_results[p]]]["conf"] 37 | else: 38 | threshold = 0.5 39 | if top_results[p] and result[top_results[p]] > threshold: 40 | sys.stdout.write("\033[1m\033[93m*%15s*\033[0m (%.3f)" % 41 | (l, result[top_results[p]])) 42 | elif result[top_results[p]] > 0.005: 43 | sys.stdout.write(" %15s (%.3f)" % (l, result[top_results[p]])) 44 | sys.stdout.write("\n") 45 | 46 | 47 | def main(): 48 | parser = argparse.ArgumentParser() 49 | model.add_model_flags(parser) 50 | args = parser.parse_args() 51 | interpreter = model.make_interpreter(args.model_file) 52 | interpreter.allocate_tensors() 53 | mic = args.mic if args.mic is None else int(args.mic) 54 | model.classify_audio(mic, interpreter, 55 | labels_file="config/labels_simple_audio.txt", 56 | result_callback=print_results, 57 | sample_rate_hz=int(args.sample_rate_hz), 58 | num_frames_hop=int(args.num_frames_hop)) 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/run_model_yamnet.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Runs a model on the edgetpu. 16 | 17 | Useage: 18 | python3 run_model.py --model_file model_edgetpu.tflite 19 | """ 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | import argparse 25 | import sys 26 | import model_yamnet 27 | import numpy as np 28 | 29 | 30 | def print_results(result, commands, labels, top=1): 31 | """Example callback function that prints the passed detections.""" 32 | top_results = np.argsort(-result)[:top] 33 | for p in range(top): 34 | l = labels[top_results[p]] 35 | if l in commands.keys(): 36 | threshold = commands[labels[top_results[p]]]["conf"] 37 | else: 38 | threshold = 0.5 39 | if top_results[p] and result[top_results[p]] > threshold: 40 | sys.stdout.write("\033[1m\033[93m*%15s*\033[0m (%.3f)" % 41 | (l, result[top_results[p]])) 42 | elif result[top_results[p]] > 0.005: 43 | sys.stdout.write(" %15s (%.3f)" % (l, result[top_results[p]])) 44 | sys.stdout.write("\n") 45 | 46 | 47 | def main(): 48 | parser = argparse.ArgumentParser() 49 | model_yamnet.add_model_flags(parser) 50 | args = parser.parse_args() 51 | interpreter = model_yamnet.make_interpreter(args.model_file) 52 | interpreter.allocate_tensors() 53 | mic = args.mic if args.mic is None else int(args.mic) 54 | model_yamnet.classify_audio(mic, interpreter, 55 | labels_file="config/labels_simple_audio.txt", 56 | result_callback=print_results, 57 | sample_rate_hz=int(args.sample_rate_hz), 58 | num_frames_hop=int(args.num_frames_hop)) 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/run_snake.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2019 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | export DISPLAY="${DISPLAY:-:0}" 16 | python3 run_hearing_snake.py 17 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/run_yt_voice_control.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Controls a YouTube using voice commands. 16 | 17 | 18 | Usage: 19 | Requires YouTube to be running in a browser tab and focus to be on the 20 | YouTube player. 21 | 22 | python3 run_yt_voice_control.py 23 | """ 24 | from __future__ import absolute_import 25 | from __future__ import division 26 | from __future__ import print_function 27 | 28 | import argparse 29 | import sys 30 | import model 31 | from pykeyboard import PyKeyboard 32 | 33 | 34 | class YoutubeControl(object): 35 | """Maps voice command detections to youtube controls.""" 36 | 37 | def __init__(self): 38 | """Creates an instance of `YoutubeControl`.""" 39 | self._keyboard = PyKeyboard() 40 | self._command_lookup = { 41 | "left": self._keyboard.left_key, 42 | "right": self._keyboard.right_key, 43 | "up": self._keyboard.up_key, 44 | "down": self._keyboard.down_key, 45 | "shift": self._keyboard.shift_key 46 | } 47 | 48 | def run_command(self, command): 49 | """Parses and excecuted a command.""" 50 | if len(command) == 1: 51 | self._keyboard.tap_key(command) 52 | elif command in self._command_lookup.keys(): 53 | self._keyboard.tap_key(self._command_lookup[command]) 54 | elif "+" in command: 55 | keys = command.split("+") 56 | press_list = [] 57 | for key in keys: 58 | if len(key) == 1: 59 | press_list.append(key) 60 | elif key in self._command_lookup.keys(): 61 | press_list.append(self._command_lookup[key]) 62 | else: 63 | print("Can't parse: ", command) 64 | return 65 | self._keyboard.press_keys(press_list) 66 | 67 | 68 | def main(): 69 | parser = argparse.ArgumentParser() 70 | model.add_model_flags(parser) 71 | args = parser.parse_args() 72 | interpreter = model.make_interpreter(args.model_file) 73 | interpreter.allocate_tensors() 74 | mic = args.mic if args.mic is None else int(args.mic) 75 | yt_control = YoutubeControl() 76 | sys.stdout.write("--------------------\n") 77 | sys.stdout.write("This script will control Youtube.\n") 78 | sys.stdout.write("Just ensure that focus is on the YouTube player.\n") 79 | sys.stdout.write("--------------------\n") 80 | 81 | model.classify_audio(mic, interpreter, 82 | labels_file="config/labels_gc2.raw.txt", 83 | commands_file="config/commands_v2.txt", 84 | dectection_callback=yt_control.run_command, 85 | sample_rate_hz=int(args.sample_rate_hz), 86 | num_frames_hop=int(args.num_frames_hop)) 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/run_yt_voice_control.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2019 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | export DISPLAY="${DISPLAY:-:0}" 16 | python3 run_yt_voice_control.py 17 | -------------------------------------------------------------------------------- /sound/project-keyword-spotter/yamnet.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Core model definition of YAMNet.""" 17 | 18 | import csv 19 | 20 | import numpy as np 21 | import tensorflow as tf 22 | from tensorflow.keras import Model, layers 23 | 24 | import features as features_lib 25 | 26 | 27 | def _batch_norm(name, params): 28 | def _bn_layer(layer_input): 29 | return layers.BatchNormalization( 30 | name=name, 31 | center=params.batchnorm_center, 32 | scale=params.batchnorm_scale, 33 | epsilon=params.batchnorm_epsilon)(layer_input) 34 | return _bn_layer 35 | 36 | 37 | def _conv(name, kernel, stride, filters, params): 38 | def _conv_layer(layer_input): 39 | output = layers.Conv2D(name='{}/conv'.format(name), 40 | filters=filters, 41 | kernel_size=kernel, 42 | strides=stride, 43 | padding=params.conv_padding, 44 | use_bias=False, 45 | activation=None)(layer_input) 46 | output = _batch_norm('{}/conv/bn'.format(name), params)(output) 47 | output = layers.ReLU(name='{}/relu'.format(name))(output) 48 | return output 49 | return _conv_layer 50 | 51 | 52 | def _separable_conv(name, kernel, stride, filters, params): 53 | def _separable_conv_layer(layer_input): 54 | output = layers.DepthwiseConv2D(name='{}/depthwise_conv'.format(name), 55 | kernel_size=kernel, 56 | strides=stride, 57 | depth_multiplier=1, 58 | padding=params.conv_padding, 59 | use_bias=False, 60 | activation=None)(layer_input) 61 | output = _batch_norm('{}/depthwise_conv/bn'.format(name), params)(output) 62 | output = layers.ReLU(name='{}/depthwise_conv/relu'.format(name))(output) 63 | output = layers.Conv2D(name='{}/pointwise_conv'.format(name), 64 | filters=filters, 65 | kernel_size=(1, 1), 66 | strides=1, 67 | padding=params.conv_padding, 68 | use_bias=False, 69 | activation=None)(output) 70 | output = _batch_norm('{}/pointwise_conv/bn'.format(name), params)(output) 71 | output = layers.ReLU(name='{}/pointwise_conv/relu'.format(name))(output) 72 | return output 73 | return _separable_conv_layer 74 | 75 | 76 | _YAMNET_LAYER_DEFS = [ 77 | # (layer_function, kernel, stride, num_filters) 78 | (_conv, [3, 3], 2, 32), 79 | (_separable_conv, [3, 3], 
1, 64), 80 | (_separable_conv, [3, 3], 2, 128), 81 | (_separable_conv, [3, 3], 1, 128), 82 | (_separable_conv, [3, 3], 2, 256), 83 | (_separable_conv, [3, 3], 1, 256), 84 | (_separable_conv, [3, 3], 2, 512), 85 | (_separable_conv, [3, 3], 1, 512), 86 | (_separable_conv, [3, 3], 1, 512), 87 | (_separable_conv, [3, 3], 1, 512), 88 | (_separable_conv, [3, 3], 1, 512), 89 | (_separable_conv, [3, 3], 1, 512), 90 | (_separable_conv, [3, 3], 2, 1024), 91 | (_separable_conv, [3, 3], 1, 1024) 92 | ] 93 | 94 | 95 | def yamnet(features, params): 96 | """Define the core YAMNet mode in Keras.""" 97 | net = layers.Reshape( 98 | (params.patch_frames, params.patch_bands, 1), 99 | input_shape=(params.patch_frames, params.patch_bands))(features) 100 | for (i, (layer_fun, kernel, stride, filters)) in enumerate(_YAMNET_LAYER_DEFS): 101 | net = layer_fun('layer{}'.format(i + 1), kernel, stride, filters, params)(net) 102 | embeddings = layers.GlobalAveragePooling2D()(net) 103 | logits = layers.Dense(units=params.num_classes, use_bias=True)(embeddings) 104 | predictions = layers.Activation(activation=params.classifier_activation)(logits) 105 | return predictions, embeddings 106 | 107 | 108 | def yamnet_frames_model(params): 109 | """Defines the YAMNet waveform-to-class-scores model. 110 | 111 | Args: 112 | params: An instance of Params containing hyperparameters. 113 | 114 | Returns: 115 | A model accepting (num_samples,) waveform input and emitting: 116 | - predictions: (num_patches, num_classes) matrix of class scores per time frame 117 | - embeddings: (num_patches, embedding size) matrix of embeddings per time frame 118 | - log_mel_spectrogram: (num_spectrogram_frames, num_mel_bins) spectrogram feature matrix 119 | """ 120 | waveform = layers.Input(batch_shape=(None,), dtype=tf.float32) 121 | waveform_padded = features_lib.pad_waveform(waveform, params) 122 | log_mel_spectrogram, features = features_lib.waveform_to_log_mel_spectrogram_patches( 123 | waveform_padded, params) 124 | predictions, embeddings = yamnet(features, params) 125 | frames_model = Model( 126 | name='yamnet_frames', inputs=waveform, 127 | outputs=[predictions, embeddings, log_mel_spectrogram]) 128 | return frames_model 129 | 130 | 131 | def class_names(class_map_csv): 132 | """Read the class name definition file and return a list of strings.""" 133 | if tf.is_tensor(class_map_csv): 134 | class_map_csv = class_map_csv.numpy() 135 | with open(class_map_csv) as csv_file: 136 | reader = csv.reader(csv_file) 137 | next(reader) # Skip header 138 | return np.array([display_name for (_, _, display_name) in reader]) 139 | -------------------------------------------------------------------------------- /sound/sound.ipynb: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
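To tie the YAMNet pieces together, here is a minimal usage sketch built only from the files above: construct the default hyperparameters from params.py, build the waveform-to-scores Keras model from yamnet.py, and run it on a short buffer of audio. The pretrained checkpoint name (yamnet.h5, from the upstream TensorFlow models repository) is an assumption and is not part of this repo, so it is left commented out.

import numpy as np

import params as params_lib
import yamnet as yamnet_lib

hparams = params_lib.Params()
model = yamnet_lib.yamnet_frames_model(hparams)
# model.load_weights("yamnet.h5")  # assumed external checkpoint, not included here

# One second of silence at the 16 kHz rate the default Params expect.
waveform = np.zeros(int(hparams.sample_rate), dtype=np.float32)
predictions, embeddings, log_mel_spectrogram = model(waveform)

# Per the docstrings above: predictions is (num_patches, 521) class scores,
# embeddings is (num_patches, 1024), log_mel_spectrogram is (num_frames, 64).
print(predictions.shape, embeddings.shape, log_mel_spectrogram.shape)

Without trained weights the scores are meaningless, but the shapes confirm the padding and patch-framing behaviour implemented in features.py.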