├── examples ├── resnetv2 │ ├── requirements.txt │ ├── deepkit.yml │ └── model.py ├── keras-cifar10 │ ├── requirements.txt │ ├── local.deepkit.yml │ ├── deepkit.yml │ └── model.py ├── tf2-keras-mnist │ ├── requirements.txt │ ├── deepkit.yml │ └── model.py ├── generic │ ├── deepkit.yml │ └── train.py ├── torch │ ├── local.deepkit.yml │ ├── resnet.py │ └── train.py ├── alot │ └── train.py ├── dynamic-experiments │ ├── sub-experiments.py │ └── threaded.py ├── ray │ └── dqn.py └── alexnet │ └── model.py ├── setup.cfg ├── README.md ├── deepkit ├── globals.py ├── home.py ├── utils │ ├── __init__.py │ ├── pilutil.py │ └── image.py ├── model.py ├── __init__.py ├── debugger.py ├── deepkit_keras.py ├── pytorch_graph.py ├── pytorch.py ├── client.py ├── keras_tf.py └── experiment.py ├── Makefile ├── tests └── test_home.py └── setup.py /examples/resnetv2/requirements.txt: -------------------------------------------------------------------------------- 1 | keras>2.0.0 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /examples/keras-cifar10/requirements.txt: -------------------------------------------------------------------------------- 1 | keras==2.3.0 2 | -------------------------------------------------------------------------------- /examples/tf2-keras-mnist/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow>=2.0 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deepkit Python SDK 2 | 3 | This is the Python SDK for Deepkit. 4 | 5 | [View documentation](https://deepkit.ai/documentation/python-sdk/getting-started) -------------------------------------------------------------------------------- /deepkit/globals.py: -------------------------------------------------------------------------------- 1 | import io 2 | from typing import Optional 3 | 4 | from deepkit.experiment import Experiment 5 | 6 | last_experiment: Optional[Experiment] = None 7 | 8 | loaded_job_config = None 9 | 10 | last_logs = io.StringIO('') 11 | -------------------------------------------------------------------------------- /examples/generic/deepkit.yml: -------------------------------------------------------------------------------- 1 | label: Generic data generation 2 | image: python:3.7 3 | build: 4 | - ADD ../../deepkit:/deepkit-sdk/deepkit 5 | - ADD ../../setup.py:/deepkit-sdk/setup.py 6 | - pip install -e /deepkit-sdk/ 7 | 8 | command: python train.py 9 | -------------------------------------------------------------------------------- /examples/torch/local.deepkit.yml: -------------------------------------------------------------------------------- 1 | config: 2 | batch_size: 32 3 | 4 | ignore: 5 | - data 6 | - checkpoint 7 | - runs 8 | 9 | env: 10 | - PYTHONPATH=/Users/marc/bude/deepkit-python-sdk/ 11 | 12 | command: /usr/local/Cellar/python/3.7.6_1/bin/python3 train.py -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | zip: 2 | rm -f deepkit.zip 3 | zip deepkit.zip deepkit/*.py deepkit/utils/*.py README.md setup.cfg setup.py 4 | 5 | publish: 6 | rm -rf dist/* 7 | python3 setup.py sdist bdist_wheel 8 | python3 -m twine upload --repository-url https://upload.pypi.org/legacy/ dist/* -------------------------------------------------------------------------------- /examples/keras-cifar10/local.deepkit.yml: -------------------------------------------------------------------------------- 1 | label: TF1 Keras Cifar10 Partial Host/Local 2 | 3 | ignore: 4 | - 'report.*' 5 | - saved_models 6 | 7 | output: saved_models 8 | 9 | config: 10 | lr: 0.8 11 | batch_size: 128 12 | epochs: 15 13 | train_samples: 60000 14 | test_samples: 10000 15 | 16 | command: /usr/local/Cellar/python/3.7.6_1/bin/python3 model.py -------------------------------------------------------------------------------- /deepkit/home.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import typedload 5 | 6 | from deepkit.model import HomeConfig 7 | 8 | 9 | def get_home_config() -> HomeConfig: 10 | path = os.path.expanduser('~') + '/.deepkit/config' 11 | if not os.path.exists(path): 12 | raise Exception("No ~/.deepkit/config file found") 13 | 14 | with open(path, 'r') as h: 15 | return typedload.load(json.load(h), HomeConfig) -------------------------------------------------------------------------------- /examples/alot/train.py: -------------------------------------------------------------------------------- 1 | import random 2 | import deepkit 3 | 4 | experiment = deepkit.experiment() 5 | experiment.add_file(__file__) 6 | 7 | test = experiment.define_metric('test') 8 | 9 | for i in range(10): 10 | experiment.set_info(i, random.random()) 11 | 12 | total = 100_000 13 | 14 | for i in range(total): 15 | test.send(i, random.gauss(25, 25/3)) 16 | experiment.epoch(i, total) 17 | 18 | print("Bye.") 19 | -------------------------------------------------------------------------------- /examples/dynamic-experiments/sub-experiments.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | 3 | import deepkit 4 | 5 | experiment = deepkit.experiment(project='sub-experiments') 6 | print('root job', experiment.id) 7 | 8 | experiments = 10 9 | 10 | for i in range(experiments): 11 | sub_experiment = experiment.create_sub_experiment() 12 | print('sub job', sub_experiment.id) 13 | 14 | sub_experiment.done() 15 | 16 | sleep(5) 17 | -------------------------------------------------------------------------------- /examples/generic/train.py: -------------------------------------------------------------------------------- 1 | import random 2 | import deepkit 3 | from time import sleep 4 | 5 | experiment = deepkit.experiment() 6 | experiment.add_file(__file__) 7 | 8 | test = experiment.define_metric('test') 9 | 10 | for i in range(10): 11 | experiment.set_info(str(i), random.random()) 12 | 13 | total = 1_000 14 | 15 | for i in range(total): 16 | test.send(random.gauss(25, 25/3), x=i) 17 | experiment.epoch(i, total) 18 | sleep(0.005) 19 | 20 | print("Bye.") 21 | -------------------------------------------------------------------------------- /examples/tf2-keras-mnist/deepkit.yml: -------------------------------------------------------------------------------- 1 | title: TF2 Keras Fashion Mnist 2 | image: tensorflow/tensorflow:2.0.1-gpu-py3 3 | build: 4 | - pip install --upgrade pip 5 | - python -c 'from tensorflow.keras import datasets; datasets.fashion_mnist.load_data()' 6 | - ADD ../../deepkit:/deepkit-sdk/deepkit 7 | - ADD ../../setup.py:/deepkit-sdk/setup.py 8 | - pip install -e /deepkit-sdk/ 9 | 10 | ignore: 11 | - logs 12 | 13 | #resources: 14 | # minCpu: 2 15 | # minMemory: 2 16 | 17 | command: python model.py 18 | -------------------------------------------------------------------------------- /examples/resnetv2/deepkit.yml: -------------------------------------------------------------------------------- 1 | title: Keras TF1 Resnet Cifar10 2 | label: 'keras' 3 | 4 | image: tensorflow/tensorflow:1.15.2-gpu-py3 5 | build: 6 | - pip install --upgrade pip 7 | - ADD requirements.txt 8 | - pip install -r requirements.txt 9 | - python -c 'from keras.datasets import cifar10; cifar10.load_data()' 10 | - ADD ../../deepkit:/deepkit-sdk/deepkit 11 | - ADD ../../setup.py:/deepkit-sdk/setup.py 12 | - pip install -e /deepkit-sdk/ 13 | 14 | ignore: 15 | - report.* 16 | - saved_models 17 | 18 | output: saved_models 19 | 20 | command: python model.py 21 | -------------------------------------------------------------------------------- /tests/test_home.py: -------------------------------------------------------------------------------- 1 | import typedload 2 | 3 | from deepkit.model import HomeConfig 4 | 5 | 6 | def test_home_config_convert(): 7 | config = typedload.load({ 8 | 'accounts': [ 9 | {'id': '1', 'name': 'peter', 'port': 8080, 'ssl': False, 'username': '', 'host': 'deepkit.ai', 'token': 'abc'}, 10 | {'id': '2', 'name': 'localhost', 'port': 8080, 'ssl': False, 'username': '', 'host': 'deepkit.ai', 'token': 'abc'} 11 | ], 12 | 'folderLinks': [] 13 | }, HomeConfig) 14 | 15 | assert config.get_account_for_id('1').name == 'peter' 16 | assert config.get_account_for_id('2').name == 'localhost' 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_packages 3 | __version__ = '1.0.9' 4 | 5 | setup(name='deepkit', 6 | version=__version__, 7 | description='Python SDK for Deepkit', 8 | author='Marc J. Schmidt', 9 | author_email='marc@marcjschmidt.de', 10 | url='https://github.com/deepkit/deepkit-python-sdk', 11 | download_url='https://github.com/deepkit/deepkit-python-sdk/tarball/' + __version__, 12 | license='MIT', 13 | packages=find_packages(), 14 | install_requires=[ 15 | 'numpy', 16 | 'Pillow>=4.0.0', 17 | 'rx>=1.5', 18 | 'typedload>=1.20', 19 | 'PyYAML>=5.0.0', 20 | 'psutil>=5.7.0', 21 | 'websockets>=8.1' 22 | ], 23 | extras_require={ 24 | 'pytorch': ["torch"] 25 | }) 26 | -------------------------------------------------------------------------------- /examples/keras-cifar10/deepkit.yml: -------------------------------------------------------------------------------- 1 | title: TF1 Keras Cifar10 Partial 2 | list: keras-cifar10 3 | label: 'keras' 4 | 5 | image: tensorflow/tensorflow:1.15.2-gpu-py3 6 | build: 7 | - pip install --upgrade pip 8 | - ADD requirements.txt 9 | - pip install -r requirements.txt 10 | - python -c 'from keras.datasets import cifar10; cifar10.load_data()' 11 | - ADD ../../deepkit:/deepkit-sdk/deepkit 12 | - ADD ../../setup.py:/deepkit-sdk/setup.py 13 | - pip install -e /deepkit-sdk/ 14 | 15 | ignore: 16 | - report.* 17 | - saved_models 18 | 19 | output: saved_models 20 | 21 | resources: 22 | minCpu: 2 23 | minMemory: 2 24 | minGpu: 1 25 | 26 | config: 27 | lr: 0.8 28 | batch_size: 128 29 | epochs: 25 30 | train_samples: 60000 31 | test_samples: 10000 32 | data_augmentation: false 33 | 34 | command: python model.py -------------------------------------------------------------------------------- /examples/ray/dqn.py: -------------------------------------------------------------------------------- 1 | import deepkit 2 | import ray 3 | from ray.rllib.agents import dqn 4 | 5 | # note: ray overwrites sys.path[0], Dunno why, but that breaks deepkit looking for the project link 6 | experiment = deepkit.experiment(account='localhost', project='deepkit-python-sdk') 7 | 8 | # Initialize Ray with host that makes docker happy 9 | ray.init(webui_host='127.0.0.1') 10 | 11 | # Initialize DQN Trainer with default config and built-in gym cart-pole environment. 12 | trainer = dqn.DQNTrainer(config=dqn.DEFAULT_CONFIG, env="CartPole-v0") 13 | 14 | # Extract several layers of models 15 | ray_policy = trainer.get_policy() 16 | ray_model = ray_policy.model 17 | # This is the one I think we should "watch" 18 | keras_model = ray_model.base_model 19 | 20 | experiment.watch_keras_model(keras_model) 21 | 22 | experiment.log('lets go') 23 | 24 | # Manually train for a couple of iterations 25 | for i in range(20): 26 | result = trainer.train() 27 | 28 | experiment.log('Done') 29 | -------------------------------------------------------------------------------- /deepkit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | 4 | import numpy as np 5 | 6 | 7 | def in_self_execution(): 8 | """ 9 | Returns true if the script is directly executed without `deepkit` cli. 10 | """ 11 | return 'DEEPKIT_JOB_ACCESSTOKEN' not in os.environ 12 | 13 | 14 | def array_to_img(x, scale=True): 15 | """ 16 | x should be shape (channels, width, height) 17 | """ 18 | from PIL import Image 19 | if x.ndim != 3: 20 | raise Exception('Unsupported shape : ', str(x.shape), '. Need (channels, width, height)') 21 | if scale: 22 | x += max(-np.min(x), 0) 23 | x /= np.max(x) 24 | x *= 255 25 | if x.shape[0] == 3: 26 | # RGB 27 | if x.dtype != 'uint8': 28 | x = x.astype('uint8') 29 | return Image.fromarray(x.astype('uint8'), 'RGB') 30 | elif x.shape[0] == 1: 31 | # grayscale 32 | if x.dtype != 'uint8': 33 | x = x.astype('uint8') 34 | return Image.fromarray(x.reshape(x.shape[1], x.shape[2]), 'L') 35 | else: 36 | raise Exception('Unsupported channel number: ', x.shape[0]) 37 | 38 | 39 | def numpy_to_binary(array): 40 | buffer = io.BytesIO() 41 | 42 | if isinstance(array, np.ndarray): 43 | np.save(buffer, array) 44 | 45 | return buffer.getvalue() 46 | 47 | 48 | def get_parameter_by_path(dictionary, path): 49 | if not dictionary: 50 | return None 51 | 52 | if path in dictionary: 53 | return dictionary[path] 54 | 55 | current = dictionary 56 | 57 | for item in path.split('.'): 58 | if item not in current: 59 | return None 60 | 61 | current = current[item] 62 | 63 | return current 64 | -------------------------------------------------------------------------------- /examples/dynamic-experiments/threaded.py: -------------------------------------------------------------------------------- 1 | # this script starts multiple experiments 2 | import random 3 | import threading 4 | from time import sleep 5 | 6 | import deepkit 7 | 8 | experiment_optimization_id = '1' 9 | 10 | hyper_parameters_base = { 11 | 'lr': 0.1, 12 | 'optimizer': 'adam', 13 | } 14 | 15 | root_experiment = deepkit.experiment(project='threaded') 16 | experiments = 10 17 | 18 | 19 | class ExperimentExecutor(threading.Thread): 20 | def __init__(self, id: int, root_experiment: deepkit.Experiment, hyper_parameters: dict): 21 | super().__init__() 22 | self.daemon = True 23 | self.id = id 24 | self.root_experiment = root_experiment 25 | self.hyper_parameters = hyper_parameters 26 | 27 | def run(self): 28 | experiment = self.root_experiment.create_sub_experiment() 29 | experiment.set_info('id', id) 30 | experiment.set_info('optimization_id', experiment_optimization_id) 31 | experiment.set_full_config(hyper_parameters) 32 | experiment.add_file(__file__) 33 | 34 | total = 1_000 35 | for epoch in range(total): 36 | experiment.log_metric('test', random.gauss(25, 25 / 3), x=epoch) 37 | experiment.epoch(epoch + 1, total) 38 | sleep(0.05) 39 | 40 | if self.id == 2: 41 | experiment.set_description('Aborted on purpose') 42 | experiment.abort() 43 | else: 44 | experiment.done() 45 | 46 | print(f"Experiment #{self.id} ended.") 47 | 48 | 49 | threads = [] 50 | for i in range(experiments): 51 | hyper_parameters = hyper_parameters_base.copy() 52 | hyper_parameters['lr'] += i * 0.1 # poor man's hyper-parameter optimization :o) 53 | 54 | executor = ExperimentExecutor(i, root_experiment, hyper_parameters) 55 | executor.start() 56 | threads.append(executor) 57 | 58 | for executor in threads: 59 | executor.join() 60 | 61 | print("All done") 62 | -------------------------------------------------------------------------------- /examples/alexnet/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import keras 4 | 5 | keras.backend.set_image_data_format('channels_first') 6 | from keras.models import Sequential 7 | from keras.layers import * 8 | from keras.utils import np_utils 9 | import numpy as np 10 | import deepkit 11 | 12 | experiment = deepkit.experiment() 13 | 14 | batch_size = 64 15 | nb_classes = 1000 16 | 17 | img_rows, img_cols = 224, 224 18 | 19 | if keras.backend.image_data_format() == 'channels_first': 20 | X_train = np.random.random((batch_size, 3, img_rows, img_cols)).astype('float32') 21 | else: 22 | X_train = np.random.random((batch_size, img_rows, img_cols, 3)).astype('float32') 23 | Y_train = np.random.random((batch_size,)).astype('int32') 24 | Y_train = np_utils.to_categorical(Y_train, nb_classes) 25 | 26 | 27 | def gen(): 28 | while True: 29 | yield (X_train, Y_train) 30 | 31 | 32 | model = Sequential() 33 | model.add(Convolution2D(64, 11, strides=4, padding='valid', input_shape=X_train.shape[1:])) 34 | model.add(Activation('relu')) 35 | model.add(MaxPooling2D(pool_size=(3, 3), strides=2)) 36 | model.add(Convolution2D(192, 5, padding='same')) 37 | model.add(Activation('relu')) 38 | model.add(MaxPooling2D(pool_size=(3, 3), strides=2)) 39 | 40 | model.add(Convolution2D(384, 3, padding='same')) 41 | model.add(Activation('relu')) 42 | model.add(Convolution2D(256, 3, padding='same')) 43 | model.add(Activation('relu')) 44 | model.add(Convolution2D(256, 3, padding='same')) 45 | model.add(Activation('relu')) 46 | model.add(MaxPooling2D(pool_size=(3, 3), strides=2)) 47 | 48 | model.add(Flatten()) 49 | model.add(Dense(4096)) 50 | model.add(Activation('relu')) 51 | model.add(Dropout(0.5)) 52 | model.add(Dense(4096)) 53 | model.add(Activation('relu')) 54 | model.add(Dropout(0.5)) 55 | model.add(Dense(nb_classes)) 56 | model.add(Activation('softmax')) 57 | 58 | for l in model.layers: 59 | print(l.input_shape, l.output_shape) 60 | 61 | # Let's train the model using RMSprop 62 | model.compile(loss='categorical_crossentropy', 63 | optimizer='rmsprop', 64 | metrics=['accuracy']) 65 | 66 | model.fit_generator(gen(), epochs=100, steps_per_epoch=200, callbacks=[experiment.create_keras_callback(next(gen())[0])]) 67 | -------------------------------------------------------------------------------- /deepkit/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import NamedTuple, Optional, List 3 | 4 | 5 | class ExperimentOptions(NamedTuple): 6 | """ 7 | Per default the account linked to this folder is used (see `deepkit link`), this is on a new system `localhost`. 8 | You can overwrite which account is used by specifying the name here (see `deepkit id` for 9 | available accounts in your system). 10 | """ 11 | account: Optional[str] = None 12 | 13 | """ 14 | Per default the project linked to this folder is used (see `deepkit link`). 15 | You can overwrite which proect is used. 16 | Names is format of either `my-project`, or `user/my-project`, or `org/my-project`. 17 | 18 | If the current folder is not linked and you don't specify a project here, an error is raised since 19 | Deepkit isn't able to know to which project the experiments data should be sent. 20 | """ 21 | project: Optional[str] = None 22 | 23 | 24 | class Account(NamedTuple): 25 | id: str 26 | port: int 27 | ssl: bool 28 | username: str 29 | token: str 30 | host: str 31 | name: str 32 | 33 | 34 | class FolderLink(NamedTuple): 35 | accountId: str 36 | name: str 37 | path: str 38 | projectId: str 39 | 40 | 41 | class HomeConfig(NamedTuple): 42 | accounts: List[Account] 43 | folderLinks: List[FolderLink] 44 | 45 | def get_first_account(self) -> Account: 46 | if len(self.accounts) is 0: raise Exception(f'No Deepkit accounts configured.') 47 | return self.accounts[0] 48 | 49 | def get_account_for_name(self, name: str) -> Account: 50 | for account in self.accounts: 51 | if account.name == name: 52 | return account 53 | raise Exception(f'No account for name {name} configured. Use `deepkit login` to add new accounts.') 54 | 55 | def get_account_for_id(self, id: str) -> Account: 56 | for account in self.accounts: 57 | if account.id == id: 58 | return account 59 | raise Exception(f'No account for id {id}') 60 | 61 | def get_folder_link_of_directory(self, dir: str) -> FolderLink: 62 | link_map = {} 63 | for item in self.folderLinks: 64 | link_map[item.path] = item 65 | 66 | while dir not in link_map: 67 | dir = os.path.realpath(os.path.join(dir, '..')) 68 | if dir == os.path.realpath(os.path.join(dir, '..')): 69 | # reached root 70 | break 71 | 72 | if dir in link_map: 73 | return link_map[dir] 74 | 75 | raise Exception('No project linked for folder ' + dir) 76 | -------------------------------------------------------------------------------- /examples/tf2-keras-mnist/model.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import tensorflow as tf 4 | from tensorflow.keras import Model, layers, optimizers, datasets 5 | import deepkit 6 | 7 | experiment = deepkit.experiment() 8 | experiment.add_file('model.py') 9 | 10 | (x, y), (x_val, y_val) = datasets.fashion_mnist.load_data() 11 | x = x.reshape(x.shape[0], 28, 28, 1) 12 | x_val = x_val.reshape(x_val.shape[0], 28, 28, 1) 13 | x = x / 255.0 14 | y = tf.one_hot(y, depth=10, dtype=tf.float32) 15 | y_val = tf.one_hot(y_val, depth=10) 16 | print('x/y shape:', x.shape, y.shape) 17 | 18 | 19 | def train_gen(): 20 | global x, y 21 | for x2, y2 in zip(x, y): 22 | yield (x2, x2), y2 23 | # yield x2, y2 24 | 25 | 26 | train_dataset = tf.data.Dataset.from_generator( 27 | train_gen, 28 | ((tf.float32, tf.float32), tf.float32), 29 | # (tf.TensorShape([28, 28]), tf.TensorShape([10])) 30 | ((tf.TensorShape([28, 28, 1]), tf.TensorShape([28, 28, 1])), tf.TensorShape([10])) 31 | ) 32 | train_dataset = train_dataset.batch(100) 33 | # val_dataset = train_dataset.batch(10) 34 | 35 | # train_dataset, val_dataset = mnist_dataset() 36 | 37 | # resnet = tf.keras.applications.ResNet50( 38 | # include_top=True, 39 | # weights=None, 40 | # input_tensor=None, 41 | # input_shape=None, 42 | # pooling=None, 43 | # classes=10 44 | # ) 45 | 46 | # model = tf.keras.Sequential([ 47 | # resnet, 48 | # layers.Dense(10, name='asd') 49 | # ]) 50 | 51 | input1 = layers.Input((28, 28, 1)) 52 | input2 = layers.Input((28, 28, 1)) 53 | 54 | conv1 = layers.Convolution2D(64, (1, 1), activation='relu')(input1) 55 | conv2 = layers.Convolution2D(64, (1, 1), activation='relu')(conv1) 56 | rs1 = layers.Flatten()(conv2) 57 | rs2 = layers.Flatten()(input2) 58 | 59 | d1 = layers.Dense(64, activation='relu')(rs1) 60 | d2 = layers.Dense(64, activation='relu')(rs2) 61 | c1 = layers.Concatenate()([d1, d2]) 62 | d3 = layers.Dense(64, name='YoloDense', activation='relu')(c1) 63 | 64 | output1 = layers.Dense(10)(d3) 65 | model = Model(inputs=[input1, input2], outputs=[output1]) 66 | 67 | model.summary() 68 | 69 | experiment.watch_keras_model(model) 70 | deepkit_callback = experiment.create_keras_callback() 71 | 72 | log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") 73 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1) 74 | # no need to use compile if you have no loss/optimizer/metrics involved here. 75 | model.compile(optimizer=optimizers.Adam(0.001), 76 | loss=tf.losses.CategoricalCrossentropy(from_logits=True), 77 | metrics=['accuracy']) 78 | 79 | model.fit(train_dataset.repeat(), epochs=30, steps_per_epoch=500, 80 | validation_data=train_dataset.repeat(), 81 | validation_steps=2, 82 | callbacks=[tensorboard_callback, deepkit_callback] 83 | ) 84 | -------------------------------------------------------------------------------- /examples/torch/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | bla = 10 6 | 7 | 8 | def my_activation(x): 9 | return x ** 2 - 1 10 | 11 | 12 | class BasicBlock(nn.Module): 13 | expansion = 1 14 | 15 | def __init__(self, in_planes, planes, stride=1): 16 | super(BasicBlock, self).__init__() 17 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 18 | self.bn1 = nn.BatchNorm2d(planes) 19 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) 20 | self.bn2 = nn.BatchNorm2d(planes) 21 | 22 | self.shortcut = nn.Sequential() 23 | if stride != 1 or in_planes != self.expansion * planes: 24 | self.shortcut = nn.Sequential( 25 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 26 | nn.BatchNorm2d(self.expansion * planes) 27 | ) 28 | 29 | def ups(self, x): 30 | i = 1 + 1.9 - 1 / bla 31 | return x * i 32 | 33 | def forward(self, x): 34 | out = self.bn2(self.conv2(F.relu(self.bn1(self.conv1(x))))) 35 | out += self.shortcut(x) 36 | out = F.relu(out) 37 | return out 38 | 39 | 40 | class Bottleneck(nn.Module): 41 | expansion = 4 42 | 43 | def __init__(self, in_planes, planes, stride=1): 44 | super(Bottleneck, self).__init__() 45 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 46 | self.bn1 = nn.BatchNorm2d(planes) 47 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 48 | self.bn2 = nn.BatchNorm2d(planes) 49 | self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) 50 | self.bn3 = nn.BatchNorm2d(self.expansion * planes) 51 | 52 | self.shortcut = nn.Sequential() 53 | if stride != 1 or in_planes != self.expansion * planes: 54 | self.shortcut = nn.Sequential( 55 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 56 | nn.BatchNorm2d(self.expansion * planes) 57 | ) 58 | 59 | def forward(self, x): 60 | out = F.relu(self.bn1(self.conv1(x))) 61 | out = F.relu(self.bn2(self.conv2(out))) 62 | out = self.bn3(self.conv3(out)) 63 | out += self.shortcut(x) 64 | out = F.relu(out) 65 | return out 66 | 67 | 68 | class ResNet(nn.Module): 69 | def __init__(self, block, num_blocks, num_classes=10): 70 | super(ResNet, self).__init__() 71 | self.in_planes = 64 72 | 73 | self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) 74 | self.bn1 = nn.BatchNorm2d(64) 75 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 76 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 77 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 78 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 79 | self.linear = nn.Linear(512 * block.expansion, num_classes) 80 | 81 | def _make_layer(self, block, planes, num_blocks, stride): 82 | strides = [stride] + [1] * (num_blocks - 1) 83 | layers = [] 84 | for stride in strides: 85 | layers.append(block(self.in_planes, planes, stride)) 86 | self.in_planes = planes * block.expansion 87 | return nn.Sequential(*layers) 88 | 89 | def forward(self, x): 90 | out = F.relu(self.bn1(self.conv1(x))) 91 | out = self.layer1(out) 92 | out = self.layer2(out) 93 | out = self.layer3(out) 94 | out = self.layer4(out) 95 | out = F.avg_pool2d(out, 4) 96 | out = out.view(out.size(0), -1) 97 | out = self.linear(out) 98 | return out 99 | 100 | 101 | def ResNet18(): 102 | return ResNet(BasicBlock, [2, 2, 2, 2]) 103 | 104 | 105 | def ResNet34(): 106 | return ResNet(BasicBlock, [3, 4, 6, 3]) 107 | 108 | 109 | def ResNet50(): 110 | return ResNet(Bottleneck, [3, 4, 6, 3]) 111 | 112 | 113 | def ResNet101(): 114 | return ResNet(Bottleneck, [3, 4, 23, 3]) 115 | 116 | 117 | def ResNet152(): 118 | return ResNet(Bottleneck, [3, 8, 36, 3]) 119 | -------------------------------------------------------------------------------- /examples/torch/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | import torchvision 5 | import torchvision.transforms as transforms 6 | from torch import nn, optim 7 | from torch.backends import cudnn 8 | 9 | import deepkit 10 | from examples.torch.resnet import ResNet18 11 | 12 | experiment = deepkit.experiment() 13 | 14 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 15 | best_acc = 0 # best test accuracy 16 | start_epoch = experiment.intconfig('start_epoch', 0) # start from epoch 0 or last checkpoint epoch 17 | 18 | batch_size = experiment.intconfig('batch_size', 32) 19 | 20 | # Data 21 | print('==> Preparing data..') 22 | transform_train = transforms.Compose([ 23 | transforms.RandomCrop(32, padding=4), 24 | transforms.RandomHorizontalFlip(), 25 | transforms.ToTensor(), 26 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 27 | ]) 28 | 29 | transform_test = transforms.Compose([ 30 | transforms.ToTensor(), 31 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 32 | ]) 33 | 34 | trainset = torchvision.datasets.CIFAR10(root='~/.data', train=True, download=True, transform=transform_train) 35 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0) 36 | 37 | testset = torchvision.datasets.CIFAR10(root='~/.data', train=False, download=True, transform=transform_test) 38 | testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=0) 39 | 40 | classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') 41 | 42 | # Model 43 | print('==> Building model..') 44 | # net = VGG('VGG19') 45 | net = ResNet18() 46 | # net = ResNet152() 47 | # net = PreActResNet18() 48 | # net = GoogLeNet() 49 | # net = DenseNet121() 50 | # net = ResNeXt29_2x64d() 51 | # net = MobileNet() 52 | # net = MobileNetV2() 53 | # net = DPN92() 54 | # net = ShuffleNetG2() 55 | # net = SENet18() 56 | # net = ShuffleNetV2(1) 57 | # net = EfficientNetB0() 58 | if device == 'cuda': 59 | net = torch.nn.DataParallel(net) 60 | cudnn.benchmark = True 61 | 62 | if experiment.boolconfig('resume'): 63 | # Load checkpoint. 64 | print('==> Resuming from checkpoint..') 65 | assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' 66 | checkpoint = torch.load('./checkpoint/ckpt.pth') 67 | net.load_state_dict(checkpoint['net']) 68 | best_acc = checkpoint['acc'] 69 | start_epoch = checkpoint['epoch'] 70 | 71 | criterion = nn.CrossEntropyLoss() 72 | optimizer = optim.SGD(net.parameters(), lr=experiment.floatconfig('lr', 0.1), momentum=0.9, weight_decay=5e-4) 73 | 74 | experiment.watch_torch_model(net) 75 | 76 | 77 | # Training 78 | def train(epoch): 79 | print('\nEpoch: %d' % epoch) 80 | net.train() 81 | train_loss = 0 82 | correct = 0 83 | total = 0 84 | total_batches = len(trainloader) 85 | for batch_idx, (inputs, targets) in enumerate(trainloader): 86 | inputs, targets = inputs.to(device), targets.to(device) 87 | optimizer.zero_grad() 88 | outputs = net(inputs) 89 | loss = criterion(outputs, targets) 90 | loss.backward() 91 | optimizer.step() 92 | 93 | train_loss += loss.item() 94 | _, predicted = outputs.max(1) 95 | total += targets.size(0) 96 | correct += predicted.eq(targets).sum().item() 97 | 98 | experiment.batch(batch_idx, total_batches, targets.size(0)) 99 | experiment.log_metric('loss/train', epoch + (batch_idx / total_batches), (train_loss / (batch_idx + 1))) 100 | experiment.log_metric('accuracy/train', epoch + (batch_idx / total_batches), correct / total) 101 | # progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' 102 | # % (train_loss / (batch_idx + 1), 100. * correct / total, correct, total)) 103 | 104 | 105 | def test(epoch): 106 | global best_acc 107 | net.eval() 108 | test_loss = 0 109 | correct = 0 110 | total = 0 111 | with torch.no_grad(): 112 | for batch_idx, (inputs, targets) in enumerate(testloader): 113 | inputs, targets = inputs.to(device), targets.to(device) 114 | outputs = net(inputs) 115 | loss = criterion(outputs, targets) 116 | 117 | test_loss += loss.item() 118 | _, predicted = outputs.max(1) 119 | total += targets.size(0) 120 | correct += predicted.eq(targets).sum().item() 121 | 122 | experiment.log_metric('loss/val', epoch, test_loss / (batch_idx + 1)) 123 | experiment.log_metric('accuracy/val', epoch, correct / total) 124 | # progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' 125 | # % (test_loss / (batch_idx + 1), 100. * correct / total, correct, total)) 126 | 127 | # Save checkpoint. 128 | acc = 100. * correct / total 129 | if acc > best_acc: 130 | print('Saving..') 131 | state = { 132 | 'net': net.state_dict(), 133 | 'acc': acc, 134 | 'epoch': epoch, 135 | } 136 | if not os.path.isdir('checkpoint'): 137 | os.mkdir('checkpoint') 138 | torch.save(state, './checkpoint/ckpt.pth') 139 | best_acc = acc 140 | 141 | 142 | for epoch in range(start_epoch, start_epoch + 200): 143 | experiment.epoch(epoch, 200) 144 | train(epoch) 145 | test(epoch) 146 | -------------------------------------------------------------------------------- /deepkit/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import deepkit.globals 5 | import deepkit.utils 6 | from deepkit.client import Client 7 | from deepkit.experiment import Experiment, ExperimentOptions 8 | import getpass 9 | 10 | from deepkit.home import get_home_config 11 | 12 | 13 | def log(s): 14 | if deepkit.globals.last_experiment: 15 | deepkit.globals.last_experiment.log(s) 16 | else: 17 | deepkit.globals.last_logs.write(s) 18 | 19 | 20 | def experiment(project=None, account=None) -> Experiment: 21 | """ 22 | Per default this method returns a singleton. 23 | 24 | If you start an experiment using the Deepkit cli (`deepkit run`) or the Deepkit app, the experiment 25 | is created beforehand and this method picks it up. If an experiment is run without cli or app, 26 | then this method creates a new one. In any case, this method returns always the same instance, so 27 | you don't strictly need to save or pass around its return value. 28 | 29 | If you want to create new sub experiments you should use: 30 | 31 | import deepkit 32 | root_experiment = deepkit.experiment() 33 | sub_experiment = root_experiment.create_sub_experiment() 34 | 35 | This will create _always_ a new child experiments. In this cases, make sure to call `experiment.done()`, 36 | (or abort, crashed, failed) manually to end the created experiment and pass around the created experiment 37 | instance manually (since its not tracked). 38 | 39 | :param project: If the current folder is not linked and you don't specify a project here, an error is raised since 40 | Deepkit isn't able to know to which project the experiments data should be sent. 41 | :param account: Per default the first account linked to this folder is used (see `deepkit link` or `deepkit-sdk auth -l`), 42 | this is on a new system `localhost`. 43 | You can overwrite which account is used by specifying the name here (see `deepkit id` for 44 | available accounts in your system). 45 | :return: 46 | """ 47 | """ 48 | :return: returns either a new experiment or the last created one. 49 | """ 50 | if not deepkit.globals.last_experiment: 51 | deepkit.globals.last_experiment = Experiment(project=project, account=account, monitoring=True, 52 | try_pick_up=True) 53 | 54 | return deepkit.globals.last_experiment 55 | 56 | 57 | if deepkit.utils.in_self_execution(): 58 | class StdHook: 59 | def __init__(self, s): 60 | self.s = s 61 | 62 | def fileno(self): 63 | return self.s.fileno() 64 | 65 | def isatty(self): 66 | return self.s.isatty() 67 | 68 | def flush(self): 69 | self.s.flush() 70 | 71 | def write(self, s): 72 | self.s.write(s) 73 | log(s) 74 | 75 | 76 | sys.stdout = StdHook(sys.__stdout__) 77 | sys.stderr = StdHook(sys.__stderr__) 78 | 79 | 80 | def get_credentials(): 81 | username = input("Username: ") 82 | try: 83 | password = getpass.getpass() 84 | return username, password 85 | except Exception as error: 86 | print('ERROR', error) 87 | 88 | 89 | access_key_map = dict() 90 | 91 | 92 | def access_key_map_cache_key(host, port, ssl): 93 | return host + '-' + str(port) + str(ssl) 94 | 95 | 96 | def login( 97 | access_key=None, 98 | host='app.deepkit.ai', 99 | port=443, 100 | ssl=True, 101 | ): 102 | """ 103 | In environments (like Jupyter Notebooks/Google Colab) where its not possible to use the Deepkit CLI to authenticate 104 | with a Deepkit server (deepkit auth) or where "deepkit run" is not used, it's required to provide an access-key 105 | or login via username/password. 106 | 107 | It's important to call this method BEFORE deepkit.experiment() is called. 108 | """ 109 | if host is 'localhost': 110 | ssl = False 111 | 112 | if port == 443: 113 | port = 8960 114 | 115 | try: 116 | config = get_home_config() 117 | account_config = config.get_account_for_name('localhost') 118 | access_key = account_config.token 119 | except Exception: 120 | pass 121 | 122 | if access_key is None: 123 | cache_key = access_key_map_cache_key(host, port, ssl) 124 | if cache_key in access_key_map: 125 | access_key = access_key_map[cache_key] 126 | else: 127 | print("No access_key provided. Please provide username and password.") 128 | print( 129 | f"Note: You can create an access_key directly in the CLI using `deepkit access-key {host} --port {port}`") 130 | client = Client() 131 | client.host = host 132 | client.port = port 133 | client.ssl = ssl 134 | 135 | username, password = get_credentials() 136 | 137 | print(f"Connecting {client.host}:{client.port}") 138 | client.connect_anon() 139 | access_key = client.app_action_threadsafe('login', [username, password]).result() 140 | if not access_key: 141 | raise Exception("Credentials check failed") 142 | 143 | print("Login successful. Access key is " + access_key) 144 | access_key_map[cache_key] = access_key 145 | 146 | os.environ['DEEPKIT_HOST'] = host 147 | os.environ['DEEPKIT_SSL'] = '1' if ssl else '0' 148 | os.environ['DEEPKIT_PORT'] = str(port) 149 | 150 | if 'DEEPKIT_JOB_ACCESSTOKEN' in os.environ: 151 | del os.environ['DEEPKIT_JOB_ACCESSTOKEN'] 152 | 153 | if 'DEEPKIT_JOB_ID' in os.environ: 154 | del os.environ['DEEPKIT_JOB_ID'] 155 | 156 | os.environ['DEEPKIT_ACCESSTOKEN'] = access_key 157 | -------------------------------------------------------------------------------- /deepkit/debugger.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import time 3 | from typing import NamedTuple, List, Dict 4 | 5 | import PIL.Image 6 | 7 | from deepkit.utils.image import pil_image_to_jpeg 8 | 9 | 10 | class DebuggerFetchItem(NamedTuple): 11 | name: str 12 | output: any 13 | ahistogram: any 14 | whistogram: any 15 | bhistogram: any 16 | 17 | 18 | class DebuggerFetchConfig(NamedTuple): 19 | x: int 20 | layers: List[str] 21 | all: bool 22 | 23 | def needs_fetch(self, name: str) -> bool: 24 | if self.all: return True 25 | return name in self.layers 26 | 27 | 28 | class DebuggerManager: 29 | def __init__(self, experiment): 30 | import deepkit 31 | self.experiment: deepkit.Experiment = experiment 32 | 33 | self.live_last_sent = time.time() 34 | self.x = 0 35 | self.record_snapshot_created = False 36 | self.record_last_sent = time.time() 37 | self.record_last_epoch = 0 38 | self.debuggers = [] 39 | self.active_debug_data_for_this_run = False 40 | self.record_needed = False 41 | self.live_needed = False 42 | self.send_data_futures = [] 43 | 44 | def register_debugger(self, debugger): 45 | self.debuggers.append(debugger) 46 | 47 | def on_disconnect(self): 48 | for f in self.send_data_futures: 49 | f.set_result(False) 50 | 51 | self.send_data_futures = [] 52 | 53 | def create_snapshot(self, x, layers): 54 | self.experiment.client.job_action_threadsafe('addSnapshot', [ 55 | x, 56 | time.time(), 57 | layers, 58 | self.experiment.job_iteration, 59 | self.experiment.job_step, 60 | ]) 61 | 62 | def tick(self): 63 | """ 64 | Checks whether a new snapshot or live data needs to be fetched and sent. If so we trigger on each debugger 65 | instance a fetch() call and send that data to the server. 66 | """ 67 | if self.active_debug_data_for_this_run: return 68 | if not self.experiment.client.is_connected(): return 69 | 70 | state = self.experiment.debugger_controller.state 71 | if not state: return 72 | 73 | self.record_needed = state.recording 74 | fetch_all = False 75 | 76 | if state.recordingMode == 'second': 77 | diff = time.time() - self.record_last_sent 78 | if diff <= state.recordingSecond: 79 | # not enough time past, wait for next call 80 | self.record_needed = False 81 | 82 | if state.recordingMode == 'epoch': 83 | # if not epoch_end: record_needed = False 84 | if self.experiment.job_iteration == self.record_last_epoch: 85 | # nothing to do for records 86 | self.record_needed = False 87 | self.record_last_epoch = self.experiment.job_iteration 88 | 89 | self.live_needed = state.live and (time.time() - self.live_last_sent) > 1 90 | layers = list(state.watchingLayers.keys()) 91 | 92 | if not self.live_needed and not self.record_needed: 93 | return 94 | 95 | self.active_debug_data_for_this_run = True 96 | self.record_snapshot_created = False 97 | 98 | if self.record_needed and state.recordingLayers == 'all': 99 | fetch_all = True 100 | 101 | # wait for all previous to be sent first. 102 | try: 103 | for f in self.send_data_futures: f.result() 104 | except Exception as e: 105 | print('Failing sending debug data', e) 106 | pass 107 | 108 | self.x += 1 109 | 110 | fetch_config = DebuggerFetchConfig(x=self.x, layers=layers, all=fetch_all) 111 | 112 | fetch_layers: Dict[str, DebuggerFetchItem] = dict() 113 | for debugger in self.debuggers: 114 | fetch_layers.update(debugger.fetch(fetch_config)) 115 | 116 | if self.record_needed and len(fetch_layers): 117 | self.create_snapshot(self.x, list(fetch_layers.keys())) 118 | 119 | for fetch in fetch_layers.values(): 120 | output = fetch.output 121 | output_image = None 122 | if isinstance(fetch.output, PIL.Image.Image): 123 | output = None 124 | output_image = base64.b64encode(pil_image_to_jpeg(fetch.output)).decode() 125 | 126 | if self.record_needed: 127 | self.send_data_futures.append(self.experiment.client.job_action_threadsafe('setSnapshotLayerData', [ 128 | fetch_config.x, 129 | self.live_needed, 130 | fetch.name, 131 | output, 132 | output_image, 133 | base64.b64encode(fetch.ahistogram).decode() if fetch.ahistogram else None, 134 | base64.b64encode(fetch.whistogram).decode() if fetch.whistogram else None, 135 | base64.b64encode(fetch.bhistogram).decode() if fetch.bhistogram else None, 136 | ])) 137 | else: 138 | self.send_data_futures.append(self.experiment.client.job_action_threadsafe('addLiveLayerData', [ 139 | fetch.name, 140 | output, 141 | output_image, 142 | base64.b64encode(fetch.ahistogram).decode() if fetch.ahistogram else None, 143 | base64.b64encode(fetch.whistogram).decode() if fetch.whistogram else None, 144 | base64.b64encode(fetch.bhistogram).decode() if fetch.bhistogram else None, 145 | ])) 146 | 147 | self.live_last_sent = time.time() 148 | 149 | self.active_debug_data_for_this_run = False 150 | 151 | if self.record_needed: 152 | self.record_last_sent = time.time() 153 | 154 | if self.live_needed: 155 | self.live_last_sent = time.time() 156 | 157 | self.record_needed = False 158 | self.live_needed = False 159 | self.record_snapshot_created = False 160 | -------------------------------------------------------------------------------- /examples/keras-cifar10/model.py: -------------------------------------------------------------------------------- 1 | '''Train a simple deep CNN on the CIFAR10 small images dataset. 2 | It gets to 75% validation accuracy in 25 epochs, and 79% after 50 epochs. 3 | (it's still underfitting at that point, though). 4 | ''' 5 | import os 6 | 7 | # os.environ["KERAS_BACKEND"] = "plaidml.keras.backend" 8 | # os.environ["RUNFILES_DIR"] = "/usr/local/share/plaidml" 9 | # os.environ["PLAIDML_NATIVE_PATH"] = "/usr/local/lib/libplaidml.dylib" 10 | 11 | import keras 12 | from keras.datasets import cifar10 13 | from keras.preprocessing.image import ImageDataGenerator 14 | from keras.models import Sequential 15 | from keras.layers import Dense, Dropout, Flatten 16 | from keras.layers import Conv2D, MaxPooling2D, Activation 17 | 18 | import deepkit 19 | 20 | experiment = deepkit.experiment() 21 | experiment.add_file(__file__) 22 | 23 | batch_size = experiment.intconfig('batch_size', 16) 24 | num_classes = 10 25 | epochs = experiment.intconfig('epochs', 15) 26 | data_augmentation = experiment.boolconfig('data_augmentation', False) 27 | num_predictions = 20 28 | 29 | save_dir = os.path.join(os.getcwd(), 'saved_models') 30 | model_name = 'keras_cifar10_trained_model.h5' 31 | 32 | # The data, split between train and test sets: 33 | (x_train, y_train), (x_test, y_test) = cifar10.load_data() 34 | 35 | labels = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'] 36 | 37 | x_train = x_train[0:experiment.intconfig('train_samples', 10000)] 38 | y_train = y_train[0:experiment.intconfig('train_samples', 10000)] 39 | 40 | x_test = x_test[0:experiment.intconfig('test_samples', 10000)] 41 | y_test = y_test[0:experiment.intconfig('test_samples', 10000)] 42 | 43 | experiment.log_insight(*x_train[0:50], name='samples/train/sample') 44 | 45 | for i, x in enumerate(x_test[0:20]): 46 | experiment.log_insight(x, name='samples/test/sample_' + str(i), meta=labels[y_test[i][0]]) 47 | 48 | experiment.log_insight({'my-data': 123, 'more': True}, name='json-like/sample1') 49 | experiment.log_insight({'my-data': 234, 'more': False}, name='json-like/sample2') 50 | experiment.log_insight(12312312.333, name='json-like/sample3') 51 | experiment.log_insight("This is just text\nYay.", name='json-like/sample4') 52 | experiment.log_insight( 53 | "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's " 54 | "standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make " 55 | "a type specimen book. It has survived not only five centuries.", 56 | name='json-like/sample5') 57 | experiment.log_insight(x_test[0], name='numpy-shizzle/sample1', image_convertion=False) 58 | experiment.log_insight(x_test[1], name='numpy-shizzle/sample2', image_convertion=False) 59 | experiment.log_insight(x_test[2], name='numpy-shizzle/sample3', image_convertion=False) 60 | experiment.log_insight(x_test[3], name='numpy-shizzle/sample4', image_convertion=False) 61 | experiment.log_insight(y_test[0:50], name='numpy-shizzle/y_test', image_convertion=False) 62 | 63 | print('x_train shape:', x_train.shape) 64 | print(x_train.shape[0], 'train samples') 65 | print(x_test.shape[0], 'test samples') 66 | print(x_test.shape[0], 'test samples') 67 | 68 | # Convert class vectors to binary class matrices. 69 | y_train = keras.utils.to_categorical(y_train, num_classes) 70 | y_test = keras.utils.to_categorical(y_test, num_classes) 71 | 72 | model = Sequential() 73 | model.add(Conv2D(12, kernel_size=(3, 3), input_shape=x_train.shape[1:])) 74 | model.add(Activation('relu')) 75 | model.add(Conv2D(64, (3, 3))) 76 | model.add(Activation('relu')) 77 | model.add(MaxPooling2D(pool_size=(2, 2))) 78 | model.add(Dropout(0.25)) 79 | 80 | model.add(Conv2D(64, (3, 3), padding='same')) 81 | model.add(Activation('relu')) 82 | model.add(Conv2D(64, (3, 3))) 83 | model.add(Activation('relu')) 84 | model.add(MaxPooling2D(pool_size=(2, 2))) 85 | model.add(Dropout(0.25)) 86 | 87 | model.add(Flatten()) 88 | model.add(Dense(512)) 89 | model.add(Activation('relu')) 90 | model.add(Dropout(0.5)) 91 | model.add(Dense(num_classes)) 92 | model.add(Activation('softmax')) 93 | 94 | opt = keras.optimizers.Adadelta(lr=experiment.floatconfig('lr', 0.1)) 95 | 96 | deepkit_callback = experiment.create_keras_callback(model) 97 | 98 | callbacks = [deepkit_callback] 99 | 100 | # Let's train the model using RMSprop 101 | model.compile(loss='categorical_crossentropy', 102 | optimizer=opt, 103 | metrics=['accuracy']) 104 | 105 | model.summary() 106 | 107 | x_train = x_train.astype('float32') 108 | x_test = x_test.astype('float32') 109 | x_train /= 255 110 | x_test /= 255 111 | 112 | if not data_augmentation: 113 | print('Not using data augmentation.') 114 | model.fit(x_train, y_train, 115 | batch_size=batch_size, 116 | epochs=epochs, 117 | callbacks=callbacks, 118 | validation_data=(x_test, y_test), 119 | shuffle=True) 120 | else: 121 | print('Using real-time data augmentation.') 122 | # This will do preprocessing and realtime data augmentation: 123 | datagen = ImageDataGenerator( 124 | featurewise_center=False, # set input mean to 0 over the dataset 125 | samplewise_center=False, # set each sample mean to 0 126 | featurewise_std_normalization=False, # divide inputs by std of the dataset 127 | samplewise_std_normalization=False, # divide each input by its std 128 | zca_whitening=False, # apply ZCA whitening 129 | zca_epsilon=1e-06, # epsilon for ZCA whitening 130 | rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180) 131 | # randomly shift images horizontally (fraction of total width) 132 | width_shift_range=0.1, 133 | # randomly shift images vertically (fraction of total height) 134 | height_shift_range=0.1, 135 | shear_range=0., # set range for random shear 136 | zoom_range=0., # set range for random zoom 137 | channel_shift_range=0., # set range for random channel shifts 138 | # set mode for filling points outside the input boundaries 139 | fill_mode='nearest', 140 | cval=0., # value used for fill_mode = "constant" 141 | horizontal_flip=True, # randomly flip images 142 | vertical_flip=False, # randomly flip images 143 | # set rescaling factor (applied before any other transformation) 144 | rescale=None, 145 | # set function that will be applied on each input 146 | preprocessing_function=None, 147 | # image data format, either "channels_first" or "channels_last" 148 | data_format=None, 149 | # fraction of images reserved for validation (strictly between 0 and 1) 150 | validation_split=0.0) 151 | 152 | # Compute quantities required for feature-wise normalization 153 | # (std, mean, and principal components if ZCA whitening is applied). 154 | datagen.fit(x_train) 155 | 156 | # Fit the model on the batches generated by datagen.flow(). 157 | model.fit_generator(datagen.flow(x_train, y_train, 158 | batch_size=batch_size), 159 | epochs=epochs, 160 | steps_per_epoch=len(x_train)/batch_size, 161 | verbose=0, 162 | callbacks=callbacks, 163 | validation_data=(x_test, y_test), 164 | workers=4) 165 | 166 | # Save model and weights 167 | if not os.path.isdir(save_dir): 168 | os.makedirs(save_dir) 169 | model_path = os.path.join(save_dir, model_name) 170 | model.save(model_path) 171 | print('Saved trained model at %s ' % model_path) 172 | 173 | # Score trained model. 174 | scores = model.evaluate(x_test, y_test, verbose=1) 175 | print('Test loss:', scores[0]) 176 | print('Test accuracy:', scores[1]) 177 | -------------------------------------------------------------------------------- /deepkit/deepkit_keras.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | 4 | import math 5 | import os 6 | import sys 7 | import time 8 | import numpy as np 9 | 10 | if 'keras' in sys.modules: 11 | import keras 12 | else: 13 | import tensorflow.keras as keras 14 | 15 | import deepkit 16 | 17 | 18 | def is_generator(obj): 19 | import inspect 20 | 21 | return obj is not None and ( 22 | inspect.isgeneratorfunction(obj) 23 | or inspect.isgenerator(obj) or hasattr(obj, 'next') or hasattr(obj, '__next__')) 24 | 25 | 26 | def ensure_dir(d): 27 | if not os.path.isdir(d): 28 | if os.path.isfile(d): # but a file, so delete it 29 | print("Deleted", d, "because it was a file, but needs to be an directory.") 30 | os.remove(d) 31 | 32 | os.makedirs(d) 33 | 34 | 35 | def get_total_params(model): 36 | total_params = 0 37 | 38 | flattened_layers = model.flattened_layers if hasattr(model, 'flattened_layers') else model.layers 39 | 40 | for i in range(len(flattened_layers)): 41 | total_params += flattened_layers[i].count_params() 42 | 43 | return total_params 44 | 45 | 46 | class KerasCallback(keras.callbacks.Callback): 47 | def __init__(self, debug_model_input=None): 48 | super(KerasCallback, self).__init__() 49 | 50 | self.experiment = deepkit.experiment() 51 | 52 | self.debug_model_input = debug_model_input 53 | 54 | self.data_validation = None 55 | self.data_validation_size = None 56 | 57 | self.current = {} 58 | self.last_batch_time = time.time() 59 | self.start_time = time.time() 60 | self.accuracy_metric = None 61 | self.all_losses = None 62 | self.loss_metric = None 63 | self.learning_rate_metric = None 64 | self.learning_rate_start = 0 65 | 66 | def set_model(self, model): 67 | super().set_model(model) 68 | self.experiment.watch_keras_model(model, self.debug_model_input) 69 | 70 | def on_train_begin(self, logs={}): 71 | self.start_time = time.time() 72 | self.last_batch_time = time.time() 73 | 74 | self.experiment.set_info('parameters', get_total_params(self.model)) 75 | self.experiment.set_info('keras.image_data_format', keras.backend.image_data_format()) 76 | self.experiment.set_info('keras.backend', keras.backend.backend()) 77 | 78 | # self.job_backend.upload_keras_graph(self.model) 79 | 80 | if self.model.optimizer and hasattr(self.model.optimizer, 'get_config'): 81 | config = self.model.optimizer.get_config() 82 | self.experiment.set_info('optimizer', str(type(self.model.optimizer).__name__)) 83 | for i, v in config.items(): 84 | self.experiment.set_info('optimizer.' + str(i), v) 85 | 86 | # compatibility with keras 1.x 87 | if 'epochs' not in self.params and 'nb_epoch' in self.params: 88 | self.params['epochs'] = self.params['nb_epoch'] 89 | if 'samples' not in self.params and 'nb_sample' in self.params: 90 | self.params['samples'] = self.params['nb_sample'] 91 | 92 | traces = ['training', 'validation'] 93 | if hasattr(self.model, 'output_layers') and len(self.model.output_layers) > 1: 94 | traces = [] 95 | for output in self.model.output_layers: 96 | traces.append('train_' + output.name) 97 | traces.append('val_' + output.name) 98 | 99 | self.accuracy_metric = self.experiment.define_metric('accuracy', traces=traces) 100 | self.loss_metric = self.experiment.define_metric('loss', traces=['train', 'val']) 101 | self.learning_rate_metric = self.experiment.define_metric('learning rate', traces=['start', 'end']) 102 | 103 | self.experiment.epoch(1, self.params['epochs']) 104 | if hasattr(self.model, 'output_layers') and len(self.model.output_layers) > 1: 105 | loss_traces = [] 106 | for output in self.model.output_layers: 107 | loss_traces.append('train_' + output.name) 108 | loss_traces.append('val_' + output.name) 109 | 110 | self.all_losses = self.experiment.define_metric('loss_all', traces=loss_traces) 111 | 112 | # if self.force_insights or self.job_model.insights_enabled: 113 | # images = self.build_insight_images() 114 | # self.job_backend.job_add_insight(0, images, None) 115 | 116 | def on_batch_begin(self, batch, logs={}): 117 | if 'nb_batches' not in self.current: 118 | batch_size = logs.get('size', 1) 119 | if 'samples' in self.params and batch_size > 0: 120 | nb_batches = math.ceil(self.params['samples'] / batch_size) # normal nb batches 121 | elif 'steps' in self.params: 122 | nb_batches = self.params['steps'] 123 | else: 124 | nb_batches = 1 125 | 126 | self.current['nb_batches'] = nb_batches 127 | 128 | def on_batch_end(self, batch, logs={}): 129 | self.filter_invalid_json_values(logs) 130 | self.experiment.batch(batch + 1, self.current['nb_batches'], logs.get('size', 1)) 131 | 132 | def on_epoch_begin(self, epoch, logs={}): 133 | self.experiment.epoch(epoch + 1, self.params['epochs']) 134 | self.learning_rate_start = self.get_learning_rate() 135 | 136 | def on_epoch_end(self, epoch, logs={}): 137 | log = logs.copy() 138 | 139 | self.filter_invalid_json_values(log) 140 | 141 | log['created'] = time.time() 142 | log['epoch'] = epoch + 1 143 | 144 | self.send_metrics(logs, log['epoch']) 145 | self.send_optimizer_info(log['epoch']) 146 | 147 | def send_metrics(self, log, x): 148 | if 'acc' in log: 149 | # tf 1 150 | accuracy_log_name = 'acc' 151 | val_accuracy_log_name = 'val_acc' 152 | else: 153 | # tf2 154 | accuracy_log_name = 'accuracy' 155 | val_accuracy_log_name = 'val_accuracy' 156 | 157 | total_accuracy_validation = log.get(val_accuracy_log_name, None) 158 | total_accuracy_training = log.get(accuracy_log_name, None) 159 | 160 | if total_accuracy_validation: total_accuracy_validation = float(total_accuracy_validation) 161 | if total_accuracy_training: total_accuracy_training = float(total_accuracy_training) 162 | 163 | loss = log.get('loss', None) 164 | val_loss = log.get('val_loss', None) 165 | if loss is not None or val_loss is not None: 166 | if loss: loss = float(loss) 167 | if val_loss: val_loss = float(val_loss) 168 | print('loss, val_loss', loss, val_loss) 169 | self.loss_metric.send(loss, val_loss, x=x) 170 | 171 | accuracy = [total_accuracy_training, total_accuracy_validation] 172 | if hasattr(self.model, 'output_layers') and len(self.model.output_layers) > 1: 173 | accuracy = [] 174 | losses = [] 175 | for layer in self.model.output_layers: 176 | accuracy.append(log.get(layer.name + '_acc', None)) 177 | accuracy.append(log.get('val_' + layer.name + '_acc', None)) 178 | 179 | losses.append(log.get(layer.name + '_loss', None)) 180 | losses.append(log.get('val_' + layer.name + '_loss', None)) 181 | 182 | self.all_losses.send(*losses, x=x) 183 | 184 | self.accuracy_metric.send(*accuracy, x=x) 185 | 186 | def send_optimizer_info(self, epoch): 187 | self.learning_rate_metric.send(self.learning_rate_start, self.get_learning_rate(), x=epoch) 188 | 189 | def get_learning_rate(self): 190 | if hasattr(self.model, 'optimizer'): 191 | config = self.model.optimizer.get_config() 192 | 193 | if 'lr' in config and 'decay' in config and hasattr(self.model.optimizer, 'iterations'): 194 | iterations = self.model.optimizer.iterations 195 | # if hasattr(iterations, 'var') and hasattr(iterations.var, 'as_ndarray'): 196 | # # plaidML 197 | # ndarray = iterations.var.as_ndarray(None) 198 | # iterations = float(ndarray) 199 | # else: 200 | iterations = float(keras.backend.get_value(iterations)) 201 | 202 | return config['lr'] * (1. / (1. + config['decay'] * iterations)) 203 | 204 | elif 'lr' in config: 205 | return config['lr'] 206 | 207 | def has_multiple_inputs(self): 208 | return len(self.model.inputs) > 1 209 | 210 | def filter_invalid_json_values(self, dict: dict): 211 | for k, v in dict.items(): 212 | if isinstance(v, (np.ndarray, np.generic)): 213 | dict[k] = v.tolist() 214 | if math.isnan(v) or math.isinf(v): 215 | dict[k] = -1 216 | -------------------------------------------------------------------------------- /deepkit/utils/pilutil.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2001, 2002 Enthought, Inc. 3 | All rights reserved. 4 | Copyright (c) 2003-2017 SciPy Developers. 5 | All rights reserved. 6 | A collection of image utilities using the Python Imaging Library (PIL). 7 | Note that PIL is not a dependency of SciPy and this module is not 8 | available on systems that don't have PIL installed. 9 | 10 | This source is coming from the scipy package and is here copied since building the 11 | scipy package on various computers takes a long time and is not necessary to have 12 | such a big package as requirement when you only need 2-3 methods. Also scipy has a lot of different 13 | licenes in their package, so we used here only code with the MIT license. 14 | """ 15 | 16 | 17 | import numpy 18 | from PIL import Image 19 | from numpy import (amin, amax, ravel, asarray, arange, ones, newaxis, 20 | transpose, iscomplexobj, uint8, issubdtype, array) 21 | 22 | 23 | def fromimage(im, flatten=False, mode=None): 24 | """ 25 | Return a copy of a PIL image as a numpy array. 26 | Parameters 27 | ---------- 28 | im : PIL image 29 | Input image. 30 | flatten : bool 31 | If true, convert the output to grey-scale. 32 | mode : str, optional 33 | Mode to convert image to, e.g. ``'RGB'``. See the Notes of the 34 | `imread` docstring for more details. 35 | Returns 36 | ------- 37 | fromimage : ndarray 38 | The different colour bands/channels are stored in the 39 | third dimension, such that a grey-image is MxN, an 40 | RGB-image MxNx3 and an RGBA-image MxNx4. 41 | """ 42 | if not Image.isImageType(im): 43 | raise TypeError("Input is not a PIL image.") 44 | 45 | if mode is not None: 46 | if mode != im.mode: 47 | im = im.convert(mode) 48 | elif im.mode == 'P': 49 | # Mode 'P' means there is an indexed "palette". If we leave the mode 50 | # as 'P', then when we do `a = array(im)` below, `a` will be a 2-D 51 | # containing the indices into the palette, and not a 3-D array 52 | # containing the RGB or RGBA values. 53 | if 'transparency' in im.info: 54 | im = im.convert('RGBA') 55 | else: 56 | im = im.convert('RGB') 57 | 58 | if flatten: 59 | im = im.convert('F') 60 | elif im.mode == '1': 61 | # Workaround for crash in PIL. When im is 1-bit, the call array(im) 62 | # can cause a seg. fault, or generate garbage. See 63 | # https://github.com/scipy/scipy/issues/2138 and 64 | # https://github.com/python-pillow/Pillow/issues/350. 65 | # 66 | # This converts im from a 1-bit image to an 8-bit image. 67 | im = im.convert('L') 68 | 69 | a = array(im) 70 | return a 71 | 72 | _errstr = "Mode is unknown or incompatible with input array shape." 73 | 74 | 75 | # Returns a byte-scaled image 76 | def bytescale(data, cmin=None, cmax=None, high=255, low=0): 77 | """ 78 | Byte scales an array (image). 79 | Byte scaling means converting the input image to uint8 dtype and scaling 80 | the range to ``(low, high)`` (default 0-255). 81 | If the input image already has dtype uint8, no scaling is done. 82 | Parameters 83 | ---------- 84 | data : ndarray 85 | PIL image data array. 86 | cmin : scalar, optional 87 | Bias scaling of small values. Default is ``data.min()``. 88 | cmax : scalar, optional 89 | Bias scaling of large values. Default is ``data.max()``. 90 | high : scalar, optional 91 | Scale max value to `high`. Default is 255. 92 | low : scalar, optional 93 | Scale min value to `low`. Default is 0. 94 | Returns 95 | ------- 96 | img_array : uint8 ndarray 97 | The byte-scaled array. 98 | Examples 99 | -------- 100 | >>> from scipy.misc import bytescale 101 | >>> img = np.array([[ 91.06794177, 3.39058326, 84.4221549 ], 102 | ... [ 73.88003259, 80.91433048, 4.88878881], 103 | ... [ 51.53875334, 34.45808177, 27.5873488 ]]) 104 | >>> bytescale(img) 105 | array([[255, 0, 236], 106 | [205, 225, 4], 107 | [140, 90, 70]], dtype=uint8) 108 | >>> bytescale(img, high=200, low=100) 109 | array([[200, 100, 192], 110 | [180, 188, 102], 111 | [155, 135, 128]], dtype=uint8) 112 | >>> bytescale(img, cmin=0, cmax=255) 113 | array([[91, 3, 84], 114 | [74, 81, 5], 115 | [52, 34, 28]], dtype=uint8) 116 | """ 117 | if data.dtype == uint8: 118 | return data 119 | 120 | if high > 255: 121 | raise ValueError("`high` should be less than or equal to 255.") 122 | if low < 0: 123 | raise ValueError("`low` should be greater than or equal to 0.") 124 | if high < low: 125 | raise ValueError("`high` should be greater than or equal to `low`.") 126 | 127 | if cmin is None: 128 | cmin = data.min() 129 | if cmax is None: 130 | cmax = data.max() 131 | 132 | cscale = cmax - cmin 133 | if cscale < 0: 134 | raise ValueError("`cmax` should be larger than `cmin`.") 135 | elif cscale == 0: 136 | cscale = 1 137 | 138 | scale = float(high - low) / cscale 139 | bytedata = (data - cmin) * scale + low 140 | return (bytedata.clip(low, high) + 0.5).astype(uint8) 141 | 142 | 143 | def toimage(arr, high=255, low=0, cmin=None, cmax=None, pal=None, 144 | mode=None, channel_axis=None): 145 | """Takes a numpy array and returns a PIL image. 146 | The mode of the PIL image depends on the array shape and the `pal` and 147 | `mode` keywords. 148 | For 2-D arrays, if `pal` is a valid (N,3) byte-array giving the RGB values 149 | (from 0 to 255) then ``mode='P'``, otherwise ``mode='L'``, unless mode 150 | is given as 'F' or 'I' in which case a float and/or integer array is made. 151 | Notes 152 | ----- 153 | For 3-D arrays, the `channel_axis` argument tells which dimension of the 154 | array holds the channel data. 155 | For 3-D arrays if one of the dimensions is 3, the mode is 'RGB' 156 | by default or 'YCbCr' if selected. 157 | The numpy array must be either 2 dimensional or 3 dimensional. 158 | """ 159 | data = asarray(arr) 160 | if iscomplexobj(data): 161 | raise ValueError("Cannot convert a complex-valued array.") 162 | shape = list(data.shape) 163 | valid = len(shape) == 2 or ((len(shape) == 3) and 164 | ((3 in shape) or (4 in shape))) 165 | if not valid: 166 | raise ValueError("'arr' does not have a suitable array shape for " 167 | "any mode.") 168 | if len(shape) == 2: 169 | shape = (shape[1], shape[0]) # columns show up first 170 | if mode == 'F': 171 | data32 = data.astype(numpy.float32) 172 | image = Image.frombytes(mode, shape, data32.tostring()) 173 | return image 174 | if mode in [None, 'L', 'P']: 175 | bytedata = bytescale(data, high=high, low=low, 176 | cmin=cmin, cmax=cmax) 177 | image = Image.frombytes('L', shape, bytedata.tostring()) 178 | if pal is not None: 179 | image.putpalette(asarray(pal, dtype=uint8).tostring()) 180 | # Becomes a mode='P' automagically. 181 | elif mode == 'P': # default gray-scale 182 | pal = (arange(0, 256, 1, dtype=uint8)[:, newaxis] * 183 | ones((3,), dtype=uint8)[newaxis, :]) 184 | image.putpalette(asarray(pal, dtype=uint8).tostring()) 185 | return image 186 | if mode == '1': # high input gives threshold for 1 187 | bytedata = (data > high) 188 | image = Image.frombytes('1', shape, bytedata.tostring()) 189 | return image 190 | if cmin is None: 191 | cmin = amin(ravel(data)) 192 | if cmax is None: 193 | cmax = amax(ravel(data)) 194 | data = (data*1.0 - cmin)*(high - low)/(cmax - cmin) + low 195 | if mode == 'I': 196 | data32 = data.astype(numpy.uint32) 197 | image = Image.frombytes(mode, shape, data32.tostring()) 198 | else: 199 | raise ValueError(_errstr) 200 | return image 201 | 202 | # if here then 3-d array with a 3 or a 4 in the shape length. 203 | # Check for 3 in datacube shape --- 'RGB' or 'YCbCr' 204 | if channel_axis is None: 205 | if (3 in shape): 206 | ca = numpy.flatnonzero(asarray(shape) == 3)[0] 207 | else: 208 | ca = numpy.flatnonzero(asarray(shape) == 4) 209 | if len(ca): 210 | ca = ca[0] 211 | else: 212 | raise ValueError("Could not find channel dimension.") 213 | else: 214 | ca = channel_axis 215 | 216 | numch = shape[ca] 217 | if numch not in [3, 4]: 218 | raise ValueError("Channel axis dimension is not valid.") 219 | 220 | bytedata = bytescale(data, high=high, low=low, cmin=cmin, cmax=cmax) 221 | if ca == 2: 222 | strdata = bytedata.tostring() 223 | shape = (shape[1], shape[0]) 224 | elif ca == 1: 225 | strdata = transpose(bytedata, (0, 2, 1)).tostring() 226 | shape = (shape[2], shape[0]) 227 | elif ca == 0: 228 | strdata = transpose(bytedata, (1, 2, 0)).tostring() 229 | shape = (shape[2], shape[1]) 230 | if mode is None: 231 | if numch == 3: 232 | mode = 'RGB' 233 | else: 234 | mode = 'RGBA' 235 | 236 | if mode not in ['RGB', 'RGBA', 'YCbCr', 'CMYK']: 237 | raise ValueError(_errstr) 238 | 239 | if mode in ['RGB', 'YCbCr']: 240 | if numch != 3: 241 | raise ValueError("Invalid array shape for mode.") 242 | if mode in ['RGBA', 'CMYK']: 243 | if numch != 4: 244 | raise ValueError("Invalid array shape for mode.") 245 | 246 | # Here we know data and mode is correct 247 | image = Image.frombytes(mode, shape, strdata) 248 | return image 249 | 250 | 251 | def imresize(arr, size, interp='bilinear', mode=None): 252 | """ 253 | Resize an image. 254 | Parameters 255 | ---------- 256 | arr : ndarray 257 | The array of image to be resized. 258 | size : int, float or tuple 259 | * int - Percentage of current size. 260 | * float - Fraction of current size. 261 | * tuple - Size of the output image. 262 | interp : str, optional 263 | Interpolation to use for re-sizing ('nearest', 'lanczos', 'bilinear', 'bicubic' 264 | or 'cubic'). 265 | mode : str, optional 266 | The PIL image mode ('P', 'L', etc.) to convert `arr` before resizing. 267 | Returns 268 | ------- 269 | imresize : ndarray 270 | The resized array of image. 271 | See Also 272 | -------- 273 | toimage : Implicitly used to convert `arr` according to `mode`. 274 | scipy.ndimage.zoom : More generic implementation that does not use PIL. 275 | """ 276 | im = toimage(arr, mode=mode) 277 | ts = type(size) 278 | if issubdtype(ts, int): 279 | percent = size / 100.0 280 | size = tuple((array(im.size)*percent).astype(int)) 281 | elif issubdtype(type(size), float): 282 | size = tuple((array(im.size)*size).astype(int)) 283 | else: 284 | size = (size[1], size[0]) 285 | func = {'nearest': 0, 'lanczos': 1, 'bilinear': 2, 'bicubic': 3, 'cubic': 3} 286 | imnew = im.resize(size, resample=func[interp]) 287 | return fromimage(imnew) -------------------------------------------------------------------------------- /deepkit/pytorch_graph.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import torch 3 | 4 | methods_OP = ['attributeNames', 'hasMultipleOutputs', 'hasUses', 'inputs', 5 | 'kind', 'outputs', 'outputsSize', 'scopeName'] 6 | # Some additional methods to explure for methods_IO are 7 | # 8 | # 'unique' (type int) 9 | # 'type' (type >) 10 | # 11 | # But the below are sufficient for now. 12 | methods_IO = ['node', 'offset', 'debugName'] 13 | 14 | GETATTR_KIND = 'prim::GetAttr' 15 | CLASSTYPE_KIND = 'ClassType' 16 | 17 | class NodeBase(object): 18 | def __init__(self, debugName=None, inputs=None, scope=None, tensor_size=None, op_type='UnSpecified', attributes='', node=None): 19 | # TODO; Specify a __slots__ for this class or potentially 20 | # used namedtuple instead 21 | self.node = node 22 | self.debugName = debugName 23 | self.inputs = inputs 24 | self.tensor_size = tensor_size 25 | self.kind = op_type 26 | self.attributes = attributes 27 | self.scope = scope 28 | 29 | def __repr__(self): 30 | repr = [] 31 | repr.append(str(type(self))) 32 | for m in dir(self): 33 | if '__' not in m: 34 | repr.append(m + ': ' + str(getattr(self, m)) + str(type(getattr(self, m)))) 35 | return '\n'.join(repr) + '\n\n' 36 | 37 | 38 | class NodePy(NodeBase): 39 | def __init__(self, node_cpp, valid_methods): 40 | super(NodePy, self).__init__(node_cpp) 41 | valid_methods = valid_methods[:] 42 | self.inputs = [] 43 | 44 | for m in valid_methods: 45 | if m == 'inputs' or m == 'outputs': 46 | list_of_node = list(getattr(node_cpp, m)()) 47 | io_unique_names = [] 48 | io_tensor_sizes = [] 49 | for n in list_of_node: 50 | io_unique_names.append(n.debugName()) 51 | if n.isCompleteTensor(): 52 | io_tensor_sizes.append(n.type().sizes()) 53 | else: 54 | io_tensor_sizes.append(None) 55 | 56 | setattr(self, m, io_unique_names) 57 | setattr(self, m + 'tensor_size', io_tensor_sizes) 58 | 59 | else: 60 | setattr(self, m, getattr(node_cpp, m)()) 61 | 62 | 63 | class NodePyIO(NodePy): 64 | def __init__(self, node_cpp, input_or_output=None): 65 | super(NodePyIO, self).__init__(node_cpp, methods_IO) 66 | try: 67 | tensor_size = node_cpp.type().sizes() 68 | except RuntimeError: 69 | tensor_size = [1, ] # fail when constant model is used. 70 | self.tensor_size = tensor_size 71 | # Kind attribute string is purely descriptive and will be shown 72 | # in detailed information for the node in TensorBoard's graph plugin. 73 | # 74 | # NodePyOP nodes get this from their kind() method. 75 | self.kind = 'Parameter' 76 | if input_or_output: 77 | self.input_or_output = input_or_output 78 | self.kind = 'IO Node' 79 | 80 | 81 | class NodePyOP(NodePy): 82 | def __init__(self, node_cpp): 83 | super(NodePyOP, self).__init__(node_cpp, methods_OP) 84 | # Replace single quote which causes strange behavior in TensorBoard 85 | # TODO: See if we can remove this in the future 86 | self.attributes = str({k: node_cpp[k] for k in node_cpp.attributeNames()}).replace("'", ' ') 87 | self.kind = node_cpp.kind() 88 | 89 | 90 | class GraphPy(object): 91 | """Helper class to convert torch.nn.Module to GraphDef proto and visualization 92 | with TensorBoard. 93 | 94 | GraphDef generation operates in two passes: 95 | 96 | In the first pass, all nodes are read and saved to two lists. 97 | One list is for input/output nodes (nodes_io), which only have inbound 98 | or outbound connections, but not both. Another list is for internal 99 | operator nodes (nodes_op). The first pass also saves all scope name 100 | appeared in the nodes in scope_name_appeared list for later processing. 101 | 102 | In the second pass, scope names are fully applied to all nodes. 103 | debugNameToScopedName is a mapping from a node's ID to its fully qualified 104 | scope name. e.g. Net1/Linear[0]/1. Unfortunately torch.jit doesn't have 105 | totally correct scope output, so this is nontrivial. The function 106 | populate_namespace_from_OP_to_IO and find_common_root are used to 107 | assign scope name to a node based on the connection between nodes 108 | in a heuristic kind of way. Bookkeeping is done with shallowest_scope_name 109 | and scope_name_appeared. 110 | """ 111 | def __init__(self): 112 | self.nodes_op = [] 113 | self.nodes_io = OrderedDict() 114 | self.unique_name_to_scoped_name = {} 115 | self.shallowest_scope_name = 'default' 116 | self.scope_name_appeared = [] 117 | 118 | def append(self, x): 119 | if isinstance(x, NodePyIO): 120 | self.nodes_io[x.debugName] = x 121 | if isinstance(x, NodePyOP): 122 | self.nodes_op.append(x) 123 | 124 | def printall(self): 125 | print('all nodes') 126 | for node in self.nodes_op: 127 | print(node) 128 | for key in self.nodes_io: 129 | print(self.nodes_io[key]) 130 | 131 | def find_common_root(self): 132 | for fullscope in self.scope_name_appeared: 133 | if fullscope: 134 | self.shallowest_scope_name = fullscope.split('/')[0] 135 | 136 | def populate_namespace_from_OP_to_IO(self): 137 | for node in self.nodes_op: 138 | for node_output, outputSize in zip(node.outputs, node.outputstensor_size): 139 | self.scope_name_appeared.append(node.scopeName) 140 | self.nodes_io[node_output] = NodeBase(node_output, 141 | node.inputs, 142 | node.scopeName, 143 | outputSize, 144 | op_type=node.kind, 145 | node=node, 146 | attributes=node.attributes) 147 | 148 | self.find_common_root() 149 | 150 | for node in self.nodes_op: 151 | for input_node_id in node.inputs: 152 | self.unique_name_to_scoped_name[input_node_id] = node.scopeName + '/' + input_node_id 153 | 154 | for key, node in self.nodes_io.items(): 155 | if type(node) == NodeBase: 156 | self.unique_name_to_scoped_name[key] = node.scope + '/' + node.debugName 157 | if hasattr(node, 'input_or_output'): 158 | self.unique_name_to_scoped_name[key] = node.input_or_output + '/' + node.debugName 159 | 160 | if hasattr(node, 'scope') and node.scope is not None: 161 | self.unique_name_to_scoped_name[key] = node.scope + '/' + node.debugName 162 | if node.scope == '' and self.shallowest_scope_name: 163 | self.unique_name_to_scoped_name[node.debugName] = self.shallowest_scope_name + '/' + node.debugName 164 | 165 | # replace name 166 | for key, node in self.nodes_io.items(): 167 | self.nodes_io[key].inputs = [self.unique_name_to_scoped_name[node_input_id] for node_input_id in node.inputs] 168 | if node.debugName in self.unique_name_to_scoped_name: 169 | self.nodes_io[key].debugName = self.unique_name_to_scoped_name[node.debugName] 170 | 171 | 172 | def parse(graph, trace, args=None, omit_useless_nodes=True): 173 | """This method parses an optimized PyTorch model graph and produces 174 | a list of nodes and node stats for eventual conversion to TensorBoard 175 | protobuf format. 176 | 177 | Args: 178 | graph (PyTorch module): The model graph to be parsed. 179 | trace (PyTorch JIT TracedModule): The model trace to be parsed. 180 | args (tuple): input tensor[s] for the model. 181 | omit_useless_nodes (boolean): Whether to remove nodes from the graph. 182 | """ 183 | n_inputs = len(args) 184 | 185 | scope = {} 186 | nodes_py = GraphPy() 187 | for node in graph.inputs(): 188 | if omit_useless_nodes: 189 | if len(node.uses()) == 0: # number of user of the node (= number of outputs/ fanout) 190 | continue 191 | 192 | if node.type().kind() != CLASSTYPE_KIND: 193 | nodes_py.append(NodePyIO(node, 'input')) 194 | 195 | attr_to_scope = dict() 196 | for node in graph.nodes(): 197 | if node.kind() == GETATTR_KIND: 198 | attr_name = node.s('name') 199 | parent = node.input().node() 200 | if parent.kind() == GETATTR_KIND: # If the parent node is not the top-level "self" node 201 | parent_attr_name = parent.s('name') 202 | parent_scope = attr_to_scope[parent_attr_name] 203 | attr_scope = parent_scope.split('/')[-1] 204 | attr_to_scope[attr_name] = '{}/{}.{}'.format(parent_scope, attr_scope, attr_name) 205 | else: 206 | attr_to_scope[attr_name] = '__module.{}'.format(attr_name) 207 | # We don't need classtype nodes; scope will provide this information 208 | if node.output().type().kind() != CLASSTYPE_KIND: 209 | node_py = NodePyOP(node) 210 | node_py.scopeName = attr_to_scope[attr_name] 211 | nodes_py.append(node_py) 212 | else: 213 | nodes_py.append(NodePyOP(node)) 214 | 215 | for i, node in enumerate(graph.outputs()): # Create sink nodes for output ops 216 | node_py = NodePyIO(node, 'output') 217 | node_py.debugName = "output.{}".format(i + 1) 218 | node_py.inputs = [node.debugName()] 219 | nodes_py.append(node_py) 220 | 221 | def parse_traced_name(module_name): 222 | prefix = 'TracedModule[' 223 | suffix = ']' 224 | if module_name.startswith(prefix) and module_name.endswith(suffix): 225 | module_name = module_name[len(prefix):-len(suffix)] 226 | return module_name 227 | 228 | alias_to_name = dict() 229 | base_name = parse_traced_name(trace._name) 230 | for name, module in trace.named_modules(prefix='__module'): 231 | mod_name = parse_traced_name(module._name) 232 | attr_name = name.split('.')[-1] 233 | alias_to_name[name] = '{}[{}]'.format(mod_name, attr_name) 234 | 235 | for node in nodes_py.nodes_op: 236 | module_aliases = node.scopeName.split('/') 237 | replacements = [ 238 | alias_to_name[alias] 239 | if alias in alias_to_name 240 | else alias.split('.')[-1] 241 | for alias in module_aliases 242 | ] 243 | node.scopeName = base_name 244 | if any(replacements): 245 | node.scopeName += '/' + '/'.join(replacements) 246 | 247 | nodes_py.populate_namespace_from_OP_to_IO() 248 | return nodes_py 249 | 250 | 251 | def build_graph(model, args): 252 | try: 253 | trace = torch.jit.trace(model, args) 254 | graph = trace.graph 255 | torch._C._jit_pass_inline(graph) 256 | except RuntimeError as e: 257 | print(e) 258 | print('Error occurs, No graph saved') 259 | raise e 260 | 261 | list_of_nodes = parse(graph, trace, args) 262 | return graph, list_of_nodes.nodes_io 263 | -------------------------------------------------------------------------------- /examples/resnetv2/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | #os.environ["KERAS_BACKEND"] = "plaidml.keras.backend" 4 | #os.environ["RUNFILES_DIR"] = "/usr/local/share/plaidml" 5 | #os.environ["PLAIDML_NATIVE_PATH"] = "/usr/local/lib/libplaidml.dylib" 6 | 7 | import keras 8 | from keras.layers import Dense, Conv2D, BatchNormalization, Activation 9 | from keras.layers import AveragePooling2D, Input, Flatten 10 | from keras.optimizers import Adam 11 | from keras.callbacks import ModelCheckpoint, LearningRateScheduler 12 | from keras.callbacks import ReduceLROnPlateau 13 | from keras.preprocessing.image import ImageDataGenerator 14 | from keras.regularizers import l2 15 | from keras.models import Model 16 | from keras.datasets import cifar10 17 | import numpy as np 18 | import deepkit 19 | 20 | experiment = deepkit.experiment() 21 | 22 | experiment.add_label('resnet', 'keras') 23 | 24 | # Training parameters 25 | batch_size = 128 # orig paper trained all networks with batch_size=128 26 | epochs = 200 27 | data_augmentation = False 28 | num_classes = 10 29 | 30 | # Subtracting pixel mean improves accuracy 31 | subtract_pixel_mean = True 32 | 33 | # Model parameter 34 | # ---------------------------------------------------------------------------- 35 | # | | 200-epoch | Orig Paper| 200-epoch | Orig Paper| sec/epoch 36 | # Model | n | ResNet v1 | ResNet v1 | ResNet v2 | ResNet v2 | GTX1080Ti 37 | # |v1(v2)| %Accuracy | %Accuracy | %Accuracy | %Accuracy | v1 (v2) 38 | # ---------------------------------------------------------------------------- 39 | # ResNet20 | 3 (2)| 92.16 | 91.25 | ----- | ----- | 35 (---) 40 | # ResNet32 | 5(NA)| 92.46 | 92.49 | NA | NA | 50 ( NA) 41 | # ResNet44 | 7(NA)| 92.50 | 92.83 | NA | NA | 70 ( NA) 42 | # ResNet56 | 9 (6)| 92.71 | 93.03 | 93.01 | NA | 90 (100) 43 | # ResNet110 |18(12)| 92.65 | 93.39+-.16| 93.15 | 93.63 | 165(180) 44 | # ResNet164 |27(18)| ----- | 94.07 | ----- | 94.54 | ---(---) 45 | # ResNet1001| (111)| ----- | 92.39 | ----- | 95.08+-.14| ---(---) 46 | # --------------------------------------------------------------------------- 47 | n = 3 48 | 49 | # Model version 50 | # Orig paper: version = 1 (ResNet v1), Improved ResNet: version = 2 (ResNet v2) 51 | version = 1 52 | 53 | # Computed depth from supplied model parameter n 54 | if version == 1: 55 | depth = n * 6 + 2 56 | elif version == 2: 57 | depth = n * 9 + 2 58 | 59 | # Model name, depth and version 60 | model_type = 'ResNet%dv%d' % (depth, version) 61 | 62 | # Load the CIFAR10 data. 63 | (x_train, y_train), (x_test, y_test) = cifar10.load_data() 64 | 65 | # Input image dimensions. 66 | input_shape = x_train.shape[1:] 67 | 68 | # Normalize data. 69 | x_train = x_train.astype('float32') / 255 70 | x_test = x_test.astype('float32') / 255 71 | 72 | # If subtract pixel mean is enabled 73 | if subtract_pixel_mean: 74 | x_train_mean = np.mean(x_train, axis=0) 75 | x_train -= x_train_mean 76 | x_test -= x_train_mean 77 | 78 | print('x_train shape:', x_train.shape) 79 | print(x_train.shape[0], 'train samples') 80 | print(x_test.shape[0], 'test samples') 81 | print('y_train shape:', y_train.shape) 82 | 83 | # Convert class vectors to binary class matrices. 84 | y_train = keras.utils.to_categorical(y_train, num_classes) 85 | y_test = keras.utils.to_categorical(y_test, num_classes) 86 | 87 | 88 | def lr_schedule(epoch): 89 | """Learning Rate Schedule 90 | 91 | Learning rate is scheduled to be reduced after 80, 120, 160, 180 epochs. 92 | Called automatically every epoch as part of callbacks during training. 93 | 94 | # Arguments 95 | epoch (int): The number of epochs 96 | 97 | # Returns 98 | lr (float32): learning rate 99 | """ 100 | lr = 1e-3 101 | if epoch > 180: 102 | lr *= 0.5e-3 103 | elif epoch > 160: 104 | lr *= 1e-3 105 | elif epoch > 120: 106 | lr *= 1e-2 107 | elif epoch > 80: 108 | lr *= 1e-1 109 | print('Learning rate: ', lr) 110 | return lr 111 | 112 | 113 | def resnet_layer(inputs, 114 | num_filters=16, 115 | kernel_size=3, 116 | strides=1, 117 | activation='relu', 118 | batch_normalization=True, 119 | conv_first=True): 120 | """2D Convolution-Batch Normalization-Activation stack builder 121 | 122 | # Arguments 123 | inputs (tensor): input tensor from input image or previous layer 124 | num_filters (int): Conv2D number of filters 125 | kernel_size (int): Conv2D square kernel dimensions 126 | strides (int): Conv2D square stride dimensions 127 | activation (string): activation name 128 | batch_normalization (bool): whether to include batch normalization 129 | conv_first (bool): conv-bn-activation (True) or 130 | bn-activation-conv (False) 131 | 132 | # Returns 133 | x (tensor): tensor as input to the next layer 134 | """ 135 | conv = Conv2D(num_filters, 136 | kernel_size=kernel_size, 137 | strides=strides, 138 | padding='same', 139 | kernel_initializer='he_normal', 140 | kernel_regularizer=l2(1e-4)) 141 | 142 | x = inputs 143 | if conv_first: 144 | x = conv(x) 145 | if batch_normalization: 146 | x = BatchNormalization()(x) 147 | if activation is not None: 148 | x = Activation(activation)(x) 149 | else: 150 | if batch_normalization: 151 | x = BatchNormalization()(x) 152 | if activation is not None: 153 | x = Activation(activation)(x) 154 | x = conv(x) 155 | return x 156 | 157 | 158 | def resnet_v1(input_shape, depth, num_classes=10): 159 | """ResNet Version 1 Model builder [a] 160 | 161 | Stacks of 2 x (3 x 3) Conv2D-BN-ReLU 162 | Last ReLU is after the shortcut connection. 163 | At the beginning of each stage, the feature map size is halved (downsampled) 164 | by a convolutional layer with strides=2, while the number of filters is 165 | doubled. Within each stage, the layers have the same number filters and the 166 | same number of filters. 167 | Features maps sizes: 168 | stage 0: 32x32, 16 169 | stage 1: 16x16, 32 170 | stage 2: 8x8, 64 171 | The Number of parameters is approx the same as Table 6 of [a]: 172 | ResNet20 0.27M 173 | ResNet32 0.46M 174 | ResNet44 0.66M 175 | ResNet56 0.85M 176 | ResNet110 1.7M 177 | 178 | # Arguments 179 | input_shape (tensor): shape of input image tensor 180 | depth (int): number of core convolutional layers 181 | num_classes (int): number of classes (CIFAR10 has 10) 182 | 183 | # Returns 184 | model (Model): Keras model instance 185 | """ 186 | if (depth - 2) % 6 != 0: 187 | raise ValueError('depth should be 6n+2 (eg 20, 32, 44 in [a])') 188 | # Start model definition. 189 | num_filters = 16 190 | num_res_blocks = int((depth - 2) / 6) 191 | 192 | inputs = Input(shape=input_shape) 193 | x = resnet_layer(inputs=inputs) 194 | # Instantiate the stack of residual units 195 | for stack in range(3): 196 | for res_block in range(num_res_blocks): 197 | strides = 1 198 | if stack > 0 and res_block == 0: # first layer but not first stack 199 | strides = 2 # downsample 200 | y = resnet_layer(inputs=x, 201 | num_filters=num_filters, 202 | strides=strides) 203 | y = resnet_layer(inputs=y, 204 | num_filters=num_filters, 205 | activation=None) 206 | if stack > 0 and res_block == 0: # first layer but not first stack 207 | # linear projection residual shortcut connection to match 208 | # changed dims 209 | x = resnet_layer(inputs=x, 210 | num_filters=num_filters, 211 | kernel_size=1, 212 | strides=strides, 213 | activation=None, 214 | batch_normalization=False) 215 | x = keras.layers.add([x, y]) 216 | x = Activation('relu')(x) 217 | num_filters *= 2 218 | 219 | # Add classifier on top. 220 | # v1 does not use BN after last shortcut connection-ReLU 221 | x = AveragePooling2D(pool_size=8)(x) 222 | y = Flatten()(x) 223 | outputs = Dense(num_classes, 224 | activation='softmax', 225 | kernel_initializer='he_normal')(y) 226 | 227 | # Instantiate model. 228 | model = Model(inputs=inputs, outputs=outputs) 229 | return model 230 | 231 | 232 | def resnet_v2(input_shape, depth, num_classes=10): 233 | """ResNet Version 2 Model builder [b] 234 | 235 | Stacks of (1 x 1)-(3 x 3)-(1 x 1) BN-ReLU-Conv2D or also known as 236 | bottleneck layer 237 | First shortcut connection per layer is 1 x 1 Conv2D. 238 | Second and onwards shortcut connection is identity. 239 | At the beginning of each stage, the feature map size is halved (downsampled) 240 | by a convolutional layer with strides=2, while the number of filter maps is 241 | doubled. Within each stage, the layers have the same number filters and the 242 | same filter map sizes. 243 | Features maps sizes: 244 | conv1 : 32x32, 16 245 | stage 0: 32x32, 64 246 | stage 1: 16x16, 128 247 | stage 2: 8x8, 256 248 | 249 | # Arguments 250 | input_shape (tensor): shape of input image tensor 251 | depth (int): number of core convolutional layers 252 | num_classes (int): number of classes (CIFAR10 has 10) 253 | 254 | # Returns 255 | model (Model): Keras model instance 256 | """ 257 | if (depth - 2) % 9 != 0: 258 | raise ValueError('depth should be 9n+2 (eg 56 or 110 in [b])') 259 | # Start model definition. 260 | num_filters_in = 16 261 | num_res_blocks = int((depth - 2) / 9) 262 | 263 | inputs = Input(shape=input_shape) 264 | # v2 performs Conv2D with BN-ReLU on input before splitting into 2 paths 265 | x = resnet_layer(inputs=inputs, 266 | num_filters=num_filters_in, 267 | conv_first=True) 268 | 269 | # Instantiate the stack of residual units 270 | for stage in range(3): 271 | for res_block in range(num_res_blocks): 272 | activation = 'relu' 273 | batch_normalization = True 274 | strides = 1 275 | if stage == 0: 276 | num_filters_out = num_filters_in * 4 277 | if res_block == 0: # first layer and first stage 278 | activation = None 279 | batch_normalization = False 280 | else: 281 | num_filters_out = num_filters_in * 2 282 | if res_block == 0: # first layer but not first stage 283 | strides = 2 # downsample 284 | 285 | # bottleneck residual unit 286 | y = resnet_layer(inputs=x, 287 | num_filters=num_filters_in, 288 | kernel_size=1, 289 | strides=strides, 290 | activation=activation, 291 | batch_normalization=batch_normalization, 292 | conv_first=False) 293 | y = resnet_layer(inputs=y, 294 | num_filters=num_filters_in, 295 | conv_first=False) 296 | y = resnet_layer(inputs=y, 297 | num_filters=num_filters_out, 298 | kernel_size=1, 299 | conv_first=False) 300 | if res_block == 0: 301 | # linear projection residual shortcut connection to match 302 | # changed dims 303 | x = resnet_layer(inputs=x, 304 | num_filters=num_filters_out, 305 | kernel_size=1, 306 | strides=strides, 307 | activation=None, 308 | batch_normalization=False) 309 | x = keras.layers.add([x, y]) 310 | 311 | num_filters_in = num_filters_out 312 | 313 | # Add classifier on top. 314 | # v2 has BN-ReLU before Pooling 315 | x = BatchNormalization()(x) 316 | x = Activation('relu')(x) 317 | x = AveragePooling2D(pool_size=8)(x) 318 | y = Flatten()(x) 319 | outputs = Dense(num_classes, 320 | activation='softmax', 321 | kernel_initializer='he_normal')(y) 322 | 323 | # Instantiate model. 324 | model = Model(inputs=inputs, outputs=outputs) 325 | return model 326 | 327 | 328 | if version == 2: 329 | model = resnet_v2(input_shape=input_shape, depth=depth) 330 | else: 331 | model = resnet_v1(input_shape=input_shape, depth=depth) 332 | 333 | model.compile(loss='categorical_crossentropy', 334 | optimizer=Adam(), 335 | metrics=['accuracy']) 336 | model.summary() 337 | print(model_type) 338 | 339 | # Prepare model model saving directory. 340 | save_dir = os.path.join(os.getcwd(), 'saved_models') 341 | model_name = 'cifar10_%s_model.{epoch:03d}.h5' % model_type 342 | if not os.path.isdir(save_dir): 343 | os.makedirs(save_dir) 344 | filepath = os.path.join(save_dir, model_name) 345 | 346 | # Prepare callbacks for model saving and for learning rate adjustment. 347 | checkpoint = ModelCheckpoint(filepath=filepath, 348 | monitor='val_acc', 349 | verbose=1, 350 | save_best_only=True) 351 | 352 | lr_scheduler = LearningRateScheduler(lr_schedule) 353 | 354 | lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), 355 | cooldown=0, 356 | patience=5, 357 | min_lr=0.5e-6) 358 | 359 | experiment.watch_keras_model(model) 360 | callbacks = [checkpoint, lr_reducer, lr_scheduler, experiment.create_keras_callback()] 361 | 362 | # Run training, with or without data augmentation. 363 | if not data_augmentation: 364 | print('Not using data augmentation.') 365 | model.fit(x_train, y_train, 366 | batch_size=batch_size, 367 | epochs=epochs, 368 | validation_data=(x_test, y_test), 369 | shuffle=True, 370 | callbacks=callbacks) 371 | else: 372 | print('Using real-time data augmentation.') 373 | # This will do preprocessing and realtime data augmentation: 374 | datagen = ImageDataGenerator( 375 | # set input mean to 0 over the dataset 376 | featurewise_center=False, 377 | # set each sample mean to 0 378 | samplewise_center=False, 379 | # divide inputs by std of dataset 380 | featurewise_std_normalization=False, 381 | # divide each input by its std 382 | samplewise_std_normalization=False, 383 | # apply ZCA whitening 384 | zca_whitening=False, 385 | # epsilon for ZCA whitening 386 | zca_epsilon=1e-06, 387 | # randomly rotate images in the range (deg 0 to 180) 388 | rotation_range=0, 389 | # randomly shift images horizontally 390 | width_shift_range=0.1, 391 | # randomly shift images vertically 392 | height_shift_range=0.1, 393 | # set range for random shear 394 | shear_range=0., 395 | # set range for random zoom 396 | zoom_range=0., 397 | # set range for random channel shifts 398 | channel_shift_range=0., 399 | # set mode for filling points outside the input boundaries 400 | fill_mode='nearest', 401 | # value used for fill_mode = "constant" 402 | cval=0., 403 | # randomly flip images 404 | horizontal_flip=True, 405 | # randomly flip images 406 | vertical_flip=False, 407 | # set rescaling factor (applied before any other transformation) 408 | rescale=None, 409 | # set function that will be applied on each input 410 | preprocessing_function=None, 411 | # image data format, either "channels_first" or "channels_last" 412 | data_format=None, 413 | # fraction of images reserved for validation (strictly between 0 and 1) 414 | validation_split=0.0) 415 | 416 | # Compute quantities required for featurewise normalization 417 | # (std, mean, and principal components if ZCA whitening is applied). 418 | datagen.fit(x_train) 419 | 420 | # Fit the model on the batches generated by datagen.flow(). 421 | model.fit_generator(datagen.flow(x_train, y_train, batch_size=batch_size), 422 | validation_data=(x_test, y_test), 423 | epochs=epochs, verbose=1, workers=4, 424 | callbacks=callbacks) 425 | 426 | # Score trained model. 427 | scores = model.evaluate(x_test, y_test, verbose=1) 428 | print('Test loss:', scores[0]) 429 | print('Test accuracy:', scores[1]) 430 | -------------------------------------------------------------------------------- /deepkit/pytorch.py: -------------------------------------------------------------------------------- 1 | import math 2 | import re 3 | from struct import pack 4 | from typing import Dict, Optional 5 | 6 | import PIL.Image 7 | import numpy as np 8 | 9 | import deepkit.experiment 10 | import deepkit.debugger 11 | from deepkit.pytorch_graph import build_graph 12 | from deepkit.utils import array_to_img 13 | from deepkit.utils.image import get_layer_vis_square, get_image_tales, make_image_from_dense 14 | 15 | blacklist_attributes = {'weight', 'dump_patches'} 16 | 17 | 18 | def extract_attributes(module): 19 | res = {} 20 | for attr in dir(module): 21 | if attr in blacklist_attributes: continue 22 | if attr.startswith('_'): continue 23 | val = getattr(module, attr) 24 | if not isinstance(val, (str, bool, int, float, list, tuple)): 25 | continue 26 | res[attr] = val 27 | 28 | return res 29 | 30 | 31 | scope_name_prog = re.compile(r'^([a-zA-Z0-9_\-]+)/') 32 | short_name_prog = re.compile(r'\[([a-zA-Z0-9_]+)\]') 33 | is_variable = re.compile(r'/([a-zA-Z0-9_]+(?:\.[0-9]+)?)$') 34 | 35 | 36 | def get_layer_id(name: str): 37 | """ 38 | Takes a name like 'ResNet/Conv2d[conv1]/1504' and converts it to a shorter version 39 | 40 | Examples 41 | 1. 'ResNet/Sequential[layer1]/BasicBlock[1]/Conv2d[conv2]/1658' 42 | -> layer1.1.conv2/1657 43 | 2. 'ResNet/Sequential[layer2]/BasicBlock[0]/BatchNorm2d[bn1]/1714' 44 | -> layer2.0.bn1/1714 45 | 3. 'ResNet/Sequential[layer1]/BasicBlock[0]/input.4' 46 | -> layer1.0/input.4 47 | 4. 'input/input.1' 48 | -> input-1 49 | 5. 'output/output.1' 50 | -> output-1 51 | """ 52 | res = short_name_prog.findall(name) 53 | var = is_variable.search(name) 54 | if not res: 55 | return name 56 | if var: 57 | return '.'.join(res) + '/' + var.group(1) 58 | return '.'.join(res) 59 | 60 | 61 | def get_scope_id(name: str): 62 | """ 63 | Takes a name like 'ResNet/Conv2d[conv1]/1504' and converts it to 64 | its scope variant, which could be later used for `named_modules` method. 65 | Examples 66 | 1. 'ResNet/Sequential[layer1]/BasicBlock[1]/Conv2d[conv2]/1658' 67 | -> Resnet.layer1.1.conv2 68 | 2. 'ResNet/Sequential[layer2]/BasicBlock[0]/BatchNorm2d[bn1]/1714' 69 | -> Resnet.layer2.0.bn1 70 | 2. 'ResNet/Sequential[layer2]/BasicBlock[0]/BatchNorm2d[bn1]/input.2' 71 | -> Resnet.layer2.0.bn1 72 | 3. 'ResNet/Sequential[layer1]/BasicBlock[0]/input.4' 73 | -> Resnet.layer1.0 74 | 3. 'ResNet/x.1' 75 | -> Resnet.x 76 | """ 77 | res = short_name_prog.findall(name) 78 | if not res: 79 | # no groups mean its something like Resnet/x.2, which we normalize to Resnet 80 | return name.split('/')[0] 81 | 82 | scope = scope_name_prog.findall(name) 83 | 84 | return scope[0] + '.' + ('.'.join(res)) 85 | 86 | 87 | def get_pytorch_graph(net, inputs): 88 | names_from_id = dict() 89 | nodes_from_id = dict() 90 | names_from_debug = dict() 91 | scopes_from_debug = dict() 92 | names_to_scope = dict() 93 | scope_nodes = dict() 94 | # names_to_scope = dict() 95 | 96 | container_names = dict() 97 | known_modules_map = dict() 98 | known_modules_name_map = dict() 99 | 100 | torch_graph, torch_nodes = build_graph(net, inputs) 101 | 102 | for name, module in net.named_modules(prefix=type(net).__name__): 103 | known_modules_map[module] = name 104 | known_modules_name_map[name] = module 105 | 106 | def get_parent(name, go_up=1) -> str: 107 | return '.'.join(name.split('.')[:go_up * -1]) 108 | 109 | for node in torch_nodes.values(): 110 | if node.kind == 'prim::Constant': continue 111 | if node.kind == 'prim::GetAttr': continue 112 | layer_id = get_layer_id(node.debugName) 113 | scope_id = get_scope_id(node.debugName) 114 | 115 | if node.kind == 'prim::ListConstruct': 116 | # if that list constructor has only inputs of the same scope, ignore it 117 | all_scope = True 118 | for input in node.inputs: 119 | if get_scope_id(input) != scope_id: 120 | all_scope = False 121 | break 122 | if all_scope: 123 | continue 124 | 125 | # if node.kind == 'aten::t': continue 126 | 127 | names_from_id[layer_id] = node.debugName 128 | nodes_from_id[layer_id] = node 129 | names_from_debug[node.debugName] = layer_id 130 | scopes_from_debug[node.debugName] = scope_id 131 | names_to_scope[layer_id] = scopes_from_debug[node.debugName] 132 | if scope_id not in scope_nodes: 133 | scope_nodes[scope_id] = [layer_id] 134 | else: 135 | scope_nodes[scope_id].append(layer_id) 136 | 137 | edges = dict() 138 | edges_internal = dict() 139 | 140 | for node in torch_nodes.values(): 141 | if node.debugName not in names_from_debug: continue 142 | layer_id = names_from_debug[node.debugName] 143 | scope_id = scopes_from_debug[node.debugName] 144 | 145 | # print(node.debugName, '=>', layer_id, short_layer_id, node.kind, node.tensor_size) 146 | edges[layer_id] = set() 147 | 148 | for input in node.inputs: 149 | if layer_id not in edges_internal: edges_internal[layer_id] = [] 150 | edges_internal[layer_id].append(input) 151 | 152 | # filter unknown nodes 153 | if input not in names_from_debug: continue 154 | 155 | # reference to itself is forbidden 156 | if layer_id == names_from_debug[input]: continue 157 | 158 | # reference to its scope is forbidden 159 | if scope_id == names_from_debug[input]: continue 160 | 161 | # print(' outgoing', names_from_debug[input], scopes_from_debug[input], input, 162 | # nodes_from_id[names_from_debug[input]].kind) 163 | # this node points out of itself, so create an edge 164 | edge_to = names_from_debug[input] 165 | edges[layer_id].add(edge_to) 166 | 167 | deepkit_nodes = [] 168 | 169 | nodes_names_to_display = set() 170 | 171 | def collect_nodes_to_display(inputs): 172 | for input in inputs: 173 | if input not in nodes_names_to_display: 174 | nodes_names_to_display.add(input) 175 | if input in edges: 176 | collect_nodes_to_display(edges[input]) 177 | 178 | def find_outputs(name: str, outputs: set): 179 | kind = nodes_from_id[name].kind 180 | 181 | if kind == 'IO Node' and len(edges[name]) != 1: 182 | # an IO node with multiple inputs is probably correct already 183 | outputs.add(name) 184 | return 185 | 186 | if kind == 'IO Node' or kind == 'prim::TupleConstruct': 187 | # resolve inputs 188 | for input in edges[name]: 189 | find_outputs(input, outputs) 190 | else: 191 | outputs.add(name) 192 | 193 | for name in edges.copy().keys(): 194 | if name.startswith('output/'): 195 | collect_nodes_to_display(edges[name]) 196 | 197 | # resolve first to first nodes with available shape, and then use those as output 198 | # this is necessary since tuple outputs come via prim::TupleConstruct and no shape. 199 | found_outputs = set() 200 | find_outputs(name, found_outputs) 201 | i = 0 202 | # print('found new outputs', name, found_outputs) 203 | 204 | for output in found_outputs: 205 | i += 1 206 | new_name = 'output/output.' + str(i) 207 | edges[new_name] = edges[name] 208 | nodes_from_id[new_name] = nodes_from_id[output] 209 | names_to_scope[new_name] = '' 210 | 211 | nodes_names_to_display.add(new_name) 212 | 213 | activation_functions = set(map(str.lower, [ 214 | 'ReLU6', 215 | 'LogSigmoid', 216 | 'LeakyReLU', 217 | 'MultiheadAttention', 218 | 'elu', 'hardshrink', 'hardtanh', 'leaky_relu', 'logsigmoid', 'prelu', 219 | 'rrelu', 'relu', 220 | 'sigmoid', 'elu', 'celu', 'selu', 'glu', 'gelu', 'softplus', 'softshrink', 'softsign', 221 | 'tanh', 'tanhshrink', 222 | 'softmin', 'softmax', 'softmax2d', 'log_softmax', 'LogSoftmax', 223 | 'AdaptiveLogSoftmaxWithLoss' 224 | ])) 225 | 226 | input_names = [] 227 | output_names = [] 228 | 229 | record_map = dict() 230 | for name in nodes_names_to_display: 231 | inputs = edges[name] if name in edges else [] 232 | # for [name, inputs] in edges.items(): 233 | torch_node = nodes_from_id[name] 234 | scope_name = names_to_scope[name] 235 | if not name: 236 | raise Exception('No name given') 237 | 238 | node_type = 'layer' 239 | scope_id = scope_name 240 | recordable = False 241 | 242 | # filterer_inputs = [] 243 | if name.startswith('input/'): 244 | recordable = True 245 | node_type = 'input' 246 | input_names.append(name) 247 | 248 | if name.startswith('output/'): 249 | recordable = True 250 | node_type = 'output' 251 | output_names.append(name) 252 | 253 | # for input in inputs: 254 | # # second_parent = get_parent(names_to_scope[input], 2) 255 | # # if second_parent and not scope_name.startswith(second_parent): 256 | # # continue 257 | # if input.startswith('input/input'): 258 | # filterer_inputs.append(input) 259 | # continue 260 | # if input in edges: filterer_inputs.append(input) 261 | 262 | attributes = {} 263 | node_sub_type = '' 264 | node_label = name 265 | 266 | if node_type != 'output': 267 | if scope_name and scope_name in scope_nodes and len( 268 | scope_nodes[scope_name]) == 1 and scope_name in known_modules_name_map: 269 | # this node is at the same time a module(and thus scope), since it only has one node. 270 | recordable = True 271 | record_map[scope_name] = name 272 | node_label = scope_name 273 | module = known_modules_name_map[scope_name] 274 | node_sub_type = type(module).__name__ 275 | scope_id = get_parent(scope_name) 276 | attributes = extract_attributes(module) 277 | else: 278 | if str(torch_node.kind).startswith('aten::'): 279 | node_type = 'op' 280 | node_sub_type = torch_node.kind.replace('aten::', '').strip('_') 281 | 282 | if str(torch_node.kind).startswith('prim::'): 283 | node_type = 'primitive' 284 | node_sub_type = torch_node.kind.replace('prim::', '').strip('_') 285 | 286 | if node_sub_type.lower() in activation_functions: 287 | node_type = 'activation' 288 | node_sub_type = node_sub_type 289 | 290 | # attributes['torch.debugName'] = torch_node.debugName 291 | # attributes['torch.kind'] = torch_node.kind 292 | # attributes['torch.inputs'] = ', '.join(torch_node.inputs) 293 | 294 | # source = str(torch_node.node.debugName).split(' # ')[1].strip() \ 295 | # if hasattr(torch_node.node, 'debugName') and ' # ' in str(torch_node.node.debugName) else None 296 | 297 | node = { 298 | 'id': name, 299 | 'label': node_label, 300 | 'type': node_type, 301 | 'subType': node_sub_type, 302 | # 'source': source, 303 | 'input': list(inputs), 304 | 'attributes': attributes, 305 | 'recordable': recordable, 306 | 'scope': scope_id.replace('.', '/'), 307 | 'shape': torch_node.tensor_size, 308 | } 309 | deepkit_nodes.append(node) 310 | 311 | scopes = [] 312 | for name, module in known_modules_name_map.items(): 313 | # skip modules that are already added as nodes 314 | if name in scope_nodes and len(scope_nodes[name]) == 1: 315 | continue 316 | 317 | scope_id = name.replace('.', '/') 318 | record_map[name] = scope_id 319 | 320 | # the root scope is not recordable. For that we have global input and outputs 321 | recordable = '/' in scope_id 322 | 323 | scope = { 324 | 'id': scope_id, 325 | 'label': scope_id, 326 | 'subType': type(module).__name__, 327 | 'recordable': recordable, 328 | 'attributes': extract_attributes(module) 329 | } 330 | scopes.append(scope) 331 | 332 | graph = { 333 | 'nodes': deepkit_nodes, 334 | 'scopes': scopes, 335 | } 336 | 337 | return graph, record_map, input_names, output_names 338 | 339 | 340 | class TorchDebugger: 341 | def __init__(self, debugger: deepkit.debugger.DebuggerManager, net, graph_name: str, resolve_map): 342 | self.known_modules_map = dict() 343 | self.known_modules_name_map = dict() 344 | self.debugger = debugger 345 | 346 | for name, module in net.named_modules(prefix=type(net).__name__): 347 | self.known_modules_map[module] = name 348 | self.known_modules_name_map[name] = module 349 | 350 | self.net = net 351 | self.graph_name = graph_name 352 | self.resolve_map = resolve_map 353 | 354 | # contains a map of recording map, names from nodes of the full graph to actual modules 355 | # this is necessary since we map certain internal nodes to a scope/layer/module. 356 | self.record_map = dict() 357 | self.model_input_names = [] 358 | self.model_output_names = [] 359 | self.model_input = None 360 | self.extract_graph = False 361 | 362 | self.fetch_result: Dict[str, deepkit.debugger.DebuggerFetchItem] = dict() 363 | self.fetch_config: Optional[deepkit.debugger.DebuggerFetchConfig] = None 364 | 365 | def root_hook(module, input): 366 | if self.extract_graph: return 367 | if self.debugger.active_debug_data_for_this_run: return 368 | 369 | if self.model_input is None: 370 | self.model_input = input 371 | self.extract_graph = True 372 | self.record_map, self.model_input_names, self.model_output_names = self.resolve_map(input) 373 | self.extract_graph = False 374 | else: 375 | self.debugger.tick() 376 | 377 | net.register_forward_pre_hook(root_hook) 378 | 379 | self.net.apply(self.register_hook) 380 | 381 | def fetch(self, fetch_config: deepkit.debugger.DebuggerFetchConfig) -> Dict[ 382 | str, deepkit.debugger.DebuggerFetchItem]: 383 | self.fetch_config = fetch_config 384 | self.fetch_result = dict() 385 | 386 | if not self.model_input: 387 | return self.fetch_result 388 | 389 | if len(self.model_input_names) > 1: 390 | for i, name in enumerate(self.model_input_names): 391 | self.send_debug(name, self.net, self.model_input[i]) 392 | elif len(self.model_input_names) == 1: 393 | self.send_debug(self.model_input_names[0], self.net, self.model_input) 394 | 395 | self.net(*self.model_input) 396 | 397 | return self.fetch_result 398 | 399 | def register_hook(self, module): 400 | def hook(module, input, output): 401 | if self.extract_graph: return 402 | if not self.debugger.active_debug_data_for_this_run: 403 | # we don't care about hook calls outside of our debug tracking 404 | return 405 | 406 | module_id = self.known_modules_map[module] 407 | node_id = module_id 408 | if '.' not in module_id: 409 | # we are in the root module, so we use that for global output tracking 410 | if len(self.model_output_names) > 1: 411 | for i, name in enumerate(self.model_output_names): 412 | self.send_debug(name, module, output[i]) 413 | elif len(self.model_output_names) == 1: 414 | self.send_debug(self.model_output_names[0], module, output) 415 | else: 416 | # sub node 417 | self.send_debug(node_id, module, output) 418 | 419 | module.register_forward_hook(hook) 420 | 421 | def get_histogram(self, x, tensor): 422 | h = np.histogram(tensor.cpu().detach().numpy(), bins=20) 423 | # <...x><...y>, little endian 424 | # uint8|Uint32|Uint16|...Float32|...Uint32 425 | # B|L|H|...f|...L 426 | return pack(' 0: 432 | output = output[0] 433 | 434 | if hasattr(output, 'shape'): 435 | activations = self.get_histogram(x, output) 436 | 437 | if len(output.shape) > 1: 438 | # outputs come in batch usually, so pick first 439 | sample = output[0].cpu().detach().numpy() 440 | if len(sample.shape) == 3: 441 | if sample.shape[0] == 3: 442 | image = PIL.Image.fromarray(get_layer_vis_square(sample)) 443 | else: 444 | image = PIL.Image.fromarray(get_image_tales(sample)) 445 | elif len(sample.shape) > 1: 446 | image = PIL.Image.fromarray(get_layer_vis_square(sample)) 447 | elif len(sample.shape) == 1: 448 | if sample.shape[0] == 1: 449 | # we got a single number 450 | output = sample[0] 451 | else: 452 | image = make_image_from_dense(sample) 453 | # elif isinstance(output[0], (float, str, int)): 454 | # image = output 455 | 456 | whistogram = None 457 | bhistogram = None 458 | 459 | if hasattr(module, 'weight') and module.weight is not None: 460 | whistogram = self.get_histogram(x, module.weight) 461 | 462 | if hasattr(module, 'bias') and module.bias is not None: 463 | bhistogram = self.get_histogram(x, module.bias) 464 | 465 | output_rep = None 466 | if isinstance(image, PIL.Image.Image): 467 | output_rep = image 468 | elif isinstance(output, (float, np.floating)): 469 | output_rep = float(output) 470 | elif isinstance(output, (int, np.integer)): 471 | output_rep = int(output) 472 | 473 | return output_rep, activations, whistogram, bhistogram 474 | 475 | def send_debug(self, node_id, module, output): 476 | if node_id in self.record_map: 477 | node_id = self.record_map[node_id] 478 | node_id = self.graph_name + ':' + node_id 479 | 480 | if self.fetch_config.needs_fetch(node_id): 481 | output_rep, ahistogram, whistogram, bhistogram = self.get_debug_data( 482 | self.fetch_config.x, module, output 483 | ) 484 | 485 | self.fetch_result[node_id] = deepkit.debugger.DebuggerFetchItem( 486 | name=node_id, 487 | output=output_rep, 488 | ahistogram=ahistogram, 489 | whistogram=whistogram, 490 | bhistogram=bhistogram, 491 | ) 492 | -------------------------------------------------------------------------------- /deepkit/client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | import json 4 | import os 5 | import sys 6 | import threading 7 | from asyncio import Future 8 | import datetime 9 | from enum import Enum 10 | from typing import Dict, Optional 11 | 12 | import numpy as np 13 | import websockets 14 | from rx.subject import BehaviorSubject 15 | 16 | import deepkit.globals 17 | from deepkit.home import get_home_config 18 | from deepkit.model import FolderLink 19 | 20 | 21 | def is_in_directory(filepath, directory): 22 | return os.path.realpath(filepath).startswith(os.path.realpath(directory)) 23 | 24 | 25 | class ApiError(Exception): 26 | pass 27 | 28 | 29 | def json_converter(obj): 30 | if isinstance(obj, np.integer): 31 | return int(obj) 32 | elif isinstance(obj, np.floating): 33 | return float(obj) 34 | elif isinstance(obj, np.float): 35 | return float(obj) 36 | elif isinstance(obj, np.ndarray): 37 | return obj.tolist() 38 | elif isinstance(obj, datetime.datetime): 39 | # we assume all datetime instances are UTC 40 | return obj.strftime('%Y-%m-%dT%H:%M:%S.%fZ') 41 | else: 42 | return str(obj) 43 | 44 | 45 | class JobStatus(Enum): 46 | done = 150 # when all tasks are done 47 | aborted = 200 # when at least one task aborted 48 | failed = 250 # when at least one task failed 49 | crashed = 300 # when at least one task crashed 50 | 51 | 52 | class Client(threading.Thread): 53 | connection: websockets.WebSocketClientProtocol 54 | 55 | def __init__(self, project: Optional[str] = None, 56 | account: Optional[str] = None, 57 | try_pick_up=False, 58 | parent_experiment=None, 59 | silent=False): 60 | self.connected = BehaviorSubject(False) 61 | self.project = project 62 | self.account = account 63 | self.parent_experiment = parent_experiment 64 | self.silent = silent 65 | 66 | self.host = os.environ.get('DEEPKIT_HOST', '127.0.0.1') 67 | self.socket_path = os.environ.get('DEEPKIT_SOCKET', None) 68 | self.ssl = os.environ.get('DEEPKIT_SSL', '0') is '1' 69 | self.port = int(os.environ.get('DEEPKIT_PORT', '8960')) 70 | 71 | self.job_token = None 72 | self.job_id = None 73 | 74 | if try_pick_up: 75 | # is set by Deepkit cli 76 | self.job_token = os.environ.get('DEEPKIT_JOB_ACCESSTOKEN', None) 77 | self.job_id = os.environ.get('DEEPKIT_JOB_ID', None) 78 | 79 | # is set by deepkit.login() 80 | self.token = os.environ.get('DEEPKIT_ACCESSTOKEN', None) 81 | 82 | self.result_status = None 83 | 84 | self.message_id = 0 85 | self.callbacks: Dict[int, asyncio.Future] = {} 86 | self.subscriber: Dict[int, any] = {} 87 | self.stopping = False 88 | self.queue = [] 89 | self.controllers = {} 90 | self.patches = {} 91 | self.offline = False 92 | self.connections = 0 93 | self.lock = threading.Lock() 94 | threading.Thread.__init__(self) 95 | self.daemon = True 96 | self.loop = asyncio.new_event_loop() 97 | self.start() 98 | 99 | def is_connected(self): 100 | return self.connected.value 101 | 102 | def run(self): 103 | self.connecting = self.loop.create_future() 104 | self.loop.run_forever() 105 | 106 | def connect(self): 107 | asyncio.run_coroutine_threadsafe(self._connect(), self.loop) 108 | 109 | def connect_anon(self): 110 | asyncio.run_coroutine_threadsafe(self._connect_anon(), self.loop).result() 111 | 112 | def shutdown(self): 113 | if self.offline: return 114 | promise = asyncio.run_coroutine_threadsafe(self.stop_and_sync(), self.loop) 115 | promise.result() 116 | if not self.connection.closed: 117 | raise Exception('Connection still active') 118 | self.loop.stop() 119 | 120 | async def stop_and_sync(self): 121 | self.stopping = True 122 | 123 | if deepkit.utils.in_self_execution() or self.result_status: 124 | # only when we are in self execution do we set status, time stamps etc 125 | # otherwise the CLI is doing that and the server. Or when 126 | # the experiment set result_state explicitly. 127 | 128 | # done = 150, //when all tasks are done 129 | # aborted = 200, //when at least one task aborted 130 | # failed = 250, //when at least one task failed 131 | # crashed = 300, //when at least one task crashed 132 | self.patches['status'] = 150 133 | self.patches['ended'] = datetime.datetime.utcnow() 134 | self.patches['tasks.main.ended'] = datetime.datetime.utcnow() 135 | 136 | # done = 500, 137 | # aborted = 550, 138 | # failed = 600, 139 | # crashed = 650, 140 | self.patches['tasks.main.status'] = 500 141 | self.patches['tasks.main.instances.0.ended'] = datetime.datetime.utcnow() 142 | 143 | # done = 500, 144 | # aborted = 550, 145 | # failed = 600, 146 | # crashed = 650, 147 | self.patches['tasks.main.instances.0.status'] = 500 148 | 149 | if hasattr(sys, 'last_value'): 150 | if isinstance(sys.last_value, KeyboardInterrupt): 151 | self.patches['status'] = 200 152 | self.patches['tasks.main.status'] = 550 153 | self.patches['tasks.main.instances.0.status'] = 550 154 | else: 155 | self.patches['status'] = 300 156 | self.patches['tasks.main.status'] = 650 157 | self.patches['tasks.main.instances.0.status'] = 650 158 | 159 | if self.result_status: 160 | self.patches['status'] = self.result_status.value 161 | 162 | while len(self.patches) > 0 or len(self.queue) > 0: 163 | await asyncio.sleep(0.15) 164 | 165 | await self.connection.close() 166 | 167 | def register_controller(self, name: str, controller): 168 | return asyncio.run_coroutine_threadsafe(self._register_controller(name, controller), self.loop) 169 | 170 | async def _register_controller(self, name: str, controller): 171 | self.controllers[name] = controller 172 | 173 | async def handle_peer_message(message, done): 174 | if message['type'] == 'error': 175 | done() 176 | del self.controllers[name] 177 | raise Exception('Register controller error: ' + message['error']) 178 | 179 | if message['type'] == 'ack': 180 | pass 181 | 182 | if message['type'] == 'peerController/message': 183 | data = message['data'] 184 | 185 | if not hasattr(controller, data['action']): 186 | error = f"Requested action {message['action']} not available in {name}" 187 | print(error, file=sys.stderr) 188 | await self._message({ 189 | 'name': 'peerController/message', 190 | 'controllerName': name, 191 | 'clientId': message['clientId'], 192 | 'data': {'type': 'error', 'id': data['id'], 'stack': None, 'entityName': '@error:default', 193 | 'error': error} 194 | }, no_response=True) 195 | 196 | if data['name'] == 'actionTypes': 197 | parameters = [] 198 | 199 | i = 0 200 | for arg in inspect.getfullargspec(getattr(controller, data['action'])).args: 201 | parameters.append({ 202 | 'type': 'any', 203 | 'name': '#' + str(i) 204 | }) 205 | i += 1 206 | 207 | await self._message({ 208 | 'name': 'peerController/message', 209 | 'controllerName': name, 210 | 'clientId': message['clientId'], 211 | 'data': { 212 | 'type': 'actionTypes/result', 213 | 'id': data['id'], 214 | 'parameters': parameters, 215 | 'returnType': {'type': 'any', 'name': 'result'} 216 | } 217 | }, no_response=True) 218 | 219 | if data['name'] == 'action': 220 | try: 221 | res = await getattr(controller, data['action'])(*data['args']) 222 | 223 | await self._message({ 224 | 'name': 'peerController/message', 225 | 'controllerName': name, 226 | 'clientId': message['clientId'], 227 | 'data': { 228 | 'type': 'next/json', 229 | 'id': data['id'], 230 | 'encoding': {'name': 'r', 'type': 'any'}, 231 | 'next': res, 232 | } 233 | }, no_response=True) 234 | except Exception as e: 235 | await self._message({ 236 | 'name': 'peerController/message', 237 | 'controllerName': name, 238 | 'clientId': message['clientId'], 239 | 'data': {'type': 'error', 'id': data['id'], 'stack': None, 'entityName': '@error:default', 240 | 'error': str(e)} 241 | }, no_response=True) 242 | 243 | def subscriber(message, on_done): 244 | self.loop.create_task(handle_peer_message(message, on_done)) 245 | 246 | await self._subscribe({ 247 | 'name': 'peerController/register', 248 | 'controllerName': name, 249 | }, subscriber) 250 | 251 | class Controller: 252 | def __init__(self, client): 253 | self.client = client 254 | 255 | def stop(self): 256 | self.client._message({ 257 | 'name': 'peerController/unregister', 258 | 'controllerName': name, 259 | }, no_response=True) 260 | 261 | return Controller(self) 262 | 263 | async def _action(self, controller: str, action: str, args=None, lock=True, allow_in_shutdown=False): 264 | if args is None: 265 | args = [] 266 | 267 | if lock: await self.connecting 268 | if self.offline: return 269 | if self.stopping and not allow_in_shutdown: raise Exception('In shutdown: actions disallowed') 270 | 271 | if not controller: raise Exception('No controller given') 272 | if not action: raise Exception('No action given') 273 | 274 | # print('> action', action, threading.current_thread().name) 275 | res = await self._message({ 276 | 'name': 'action', 277 | 'controller': controller, 278 | 'action': action, 279 | 'args': args, 280 | 'timeout': 60 281 | }, lock=lock) 282 | 283 | # print('< action', action) 284 | 285 | if res['type'] == 'next/json': 286 | return res['next'] if 'next' in res else None 287 | 288 | if res['type'] == 'error': 289 | print(res, file=sys.stderr) 290 | raise ApiError('API Error: ' + str(res['error'])) 291 | 292 | raise ApiError(f"Invalid action type '{res['type']}'. Not implemented") 293 | 294 | def app_action_threadsafe(self, action: str, args=None) -> Future: 295 | if args is None: args = [] 296 | return asyncio.run_coroutine_threadsafe(self._action('app', action, args), self.loop) 297 | 298 | async def job_action(self, action: str, args=None): 299 | return await self._action('job', action, args) 300 | 301 | def job_action_threadsafe(self, action: str, args=None) -> Future: 302 | """ 303 | This method is non-blocking and every try to block-wait for an answers means 304 | script execution stops when connection is broken (offline training entirely impossible). 305 | So, we just schedule the call and return a Future, which the user can subscribe to. 306 | """ 307 | if args is None: args = [] 308 | return asyncio.run_coroutine_threadsafe(self._action('job', action, args), self.loop) 309 | 310 | async def _subscribe(self, message, subscriber): 311 | await self.connecting 312 | 313 | self.message_id += 1 314 | message['id'] = self.message_id 315 | 316 | message_id = self.message_id 317 | 318 | def on_done(): 319 | del self.subscriber[message_id] 320 | 321 | def on_incoming_message(incoming_message): 322 | subscriber(incoming_message, on_done) 323 | 324 | self.subscriber[self.message_id] = on_incoming_message 325 | self.queue.append(message) 326 | 327 | def _create_message(self, message: dict, lock=True, no_response=False) -> dict: 328 | self.message_id += 1 329 | message['id'] = self.message_id 330 | if not no_response: 331 | self.callbacks[self.message_id] = self.loop.create_future() 332 | 333 | return message 334 | 335 | async def _message(self, message, lock=True, no_response=False): 336 | if lock: await self.connecting 337 | 338 | message = self._create_message(message, no_response=no_response) 339 | self.queue.append(message) 340 | 341 | if no_response: 342 | return 343 | 344 | return await self.callbacks[self.message_id] 345 | 346 | def patch(self, path: str, value: any): 347 | if self.offline: return 348 | if self.stopping: return 349 | 350 | self.patches[path] = value 351 | 352 | async def send_messages(self, connection): 353 | while not connection.closed: 354 | try: 355 | q = self.queue[:] 356 | for m in q: 357 | try: 358 | j = json.dumps(m, default=json_converter) 359 | except TypeError as e: 360 | print('Could not send message since JSON error', e, m, file=sys.stderr) 361 | continue 362 | await connection.send(j) 363 | self.queue.remove(m) 364 | except Exception as e: 365 | print("Failed sending, exit send_messages", file=sys.stderr) 366 | raise e 367 | 368 | if len(self.patches) > 0: 369 | # we have to send first all messages/actions out 370 | # before sending patches, as most of the time 371 | # patches are based on previously created entities, 372 | # so we need to make sure those entities are created 373 | # first before sending any patches. 374 | # print('patches', self.patches) 375 | try: 376 | send = self.patches.copy() 377 | await connection.send(json.dumps({ 378 | 'name': 'action', 379 | 'controller': 'job', 380 | 'action': 'patchJob', 381 | 'args': [ 382 | send 383 | ], 384 | 'timeout': 60 385 | }, default=json_converter)) 386 | 387 | for i in send.keys(): 388 | if self.patches[i] == send[i]: 389 | del self.patches[i] 390 | except websockets.exceptions.ConnectionClosed: 391 | return 392 | except ApiError: 393 | print("Patching failed. Syncing job data disabled.", file=sys.stderr) 394 | return 395 | 396 | await asyncio.sleep(0.5) 397 | 398 | async def handle_messages(self, connection): 399 | while not connection.closed: 400 | try: 401 | res = json.loads(await connection.recv()) 402 | except websockets.exceptions.ConnectionClosedError: 403 | # we need reconnect 404 | break 405 | except websockets.exceptions.ConnectionClosedOK: 406 | # we closed on purpose, so no reconnect necessary 407 | return 408 | 409 | if res and 'id' in res: 410 | if res['id'] in self.subscriber: 411 | self.subscriber[res['id']](res) 412 | 413 | if res['id'] in self.callbacks: 414 | self.callbacks[res['id']].set_result(res) 415 | del self.callbacks[res['id']] 416 | 417 | if not self.stopping: 418 | self.log("Deepkit: lost connection. reconnect ...") 419 | self.connecting = self.loop.create_future() 420 | self.connected.on_next(False) 421 | self.loop.create_task(self._connect()) 422 | 423 | async def _connected(self, id: str, token: str): 424 | try: 425 | if self.socket_path: 426 | self.connection = await websockets.unix_connect(self.socket_path) 427 | else: 428 | ws = 'wss' if self.ssl else 'ws' 429 | url = f"{ws}://{self.host}:{self.port}" 430 | self.connection = await websockets.connect(url) 431 | except Exception as e: 432 | # try again later 433 | self.log('Unable to connect', e) 434 | await asyncio.sleep(1) 435 | self.loop.create_task(self._connect()) 436 | return 437 | 438 | self.loop.create_task(self.handle_messages(self.connection)) 439 | # we don't use send_messages() since this would send all queue/patches 440 | # which would lead to permission issues when we're not first authenticated 441 | 442 | if token: 443 | message = self._create_message({ 444 | 'name': 'authenticate', 445 | 'token': { 446 | 'id': 'job', 447 | 'token': token, 448 | 'job': id 449 | } 450 | }, lock=False) 451 | 452 | await self.connection.send(json.dumps(message, default=json_converter)) 453 | 454 | res = await self.callbacks[message['id']] 455 | if not res['result'] or res['result'] is not True: 456 | raise Exception('Job token invalid') 457 | 458 | self.loop.create_task(self.send_messages(self.connection)) 459 | 460 | self.connecting.set_result(True) 461 | if self.connections > 0: 462 | self.log("Deepkit: Reconnected.") 463 | 464 | self.connected.on_next(True) 465 | self.connections += 1 466 | 467 | async def _connect_anon(self): 468 | ws = 'wss' if self.ssl else 'ws' 469 | url = f"{ws}://{self.host}:{self.port}" 470 | self.connection = await websockets.connect(url) 471 | self.loop.create_task(self.handle_messages(self.connection)) 472 | self.loop.create_task(self.send_messages(self.connection)) 473 | 474 | self.connecting.set_result(True) 475 | self.connected.on_next(True) 476 | self.connections += 1 477 | 478 | async def _connect(self): 479 | # we want to restart with a empty queue, so authentication happens always first 480 | queue_copy = self.queue[:] 481 | self.queue = [] 482 | 483 | if self.job_token: 484 | await self._connected(self.job_id, self.job_token) 485 | return 486 | 487 | try: 488 | link: Optional[FolderLink] = None 489 | 490 | user_token = self.token 491 | account_name = 'none' 492 | 493 | if not user_token: 494 | config = get_home_config() 495 | # when no user_token is given (via deepkit.login() for example) 496 | # we need to find the host, port, token from the user config in ~/.deepkit/config 497 | if not self.account and not self.project: 498 | # find both, start with 499 | link = config.get_folder_link_of_directory(sys.path[0]) 500 | account_config = config.get_account_for_id(link.accountId) 501 | elif self.account and not self.project: 502 | account_config = config.get_account_for_name(self.account) 503 | else: 504 | # default to first account configured 505 | account_config = config.get_first_account() 506 | 507 | account_name = account_config.name 508 | self.host = account_config.host 509 | self.port = account_config.port 510 | self.ssl = account_config.ssl 511 | user_token = account_config.token 512 | 513 | ws = 'wss' if self.ssl else 'ws' 514 | try: 515 | url = f"{ws}://{self.host}:{self.port}" 516 | self.connection = await websockets.connect(url) 517 | except Exception as e: 518 | self.offline = True 519 | print(f"Deepkit: App not started or server not reachable. Monitoring disabled. {e}", file=sys.stderr) 520 | self.connecting.set_result(False) 521 | return 522 | 523 | self.loop.create_task(self.handle_messages(self.connection)) 524 | self.loop.create_task(self.send_messages(self.connection)) 525 | 526 | res = await self._message({ 527 | 'name': 'authenticate', 528 | 'token': { 529 | 'id': 'user', 530 | 'token': user_token 531 | } 532 | }, lock=False) 533 | if not res['result']: 534 | raise Exception('Login invalid') 535 | 536 | project_name = '' 537 | if link: 538 | project_name = link.name 539 | projectId = link.projectId 540 | else: 541 | if not self.project: 542 | raise Exception('No project defined. Please use project="project-name" ' 543 | 'to specify which project to use.') 544 | 545 | project = await self._action('app', 'getProjectForPublicName', [self.project], lock=False) 546 | 547 | if not project: 548 | raise Exception( 549 | f'No project found for name {self.project}. Make sure it exists before using it. ' 550 | f'Do you use the correct account? (used {account_name})') 551 | project_name = project['name'] 552 | projectId = project['id'] 553 | 554 | job = await self._action('app', 'createJob', [projectId, self.parent_experiment], 555 | lock=False) 556 | 557 | prefix = "Sub experiment" if self.parent_experiment else "Experiment" 558 | self.log(f"{prefix} #{job['number']} created in project {project_name} using account {account_name}") 559 | 560 | deepkit.globals.loaded_job_config = job['config']['config'] 561 | self.job_token = await self._action('app', 'getJobAccessToken', [job['id']], lock=False) 562 | self.job_id = job['id'] 563 | 564 | # todo, implement re-authentication, so we don't have to drop the active connection 565 | await self.connection.close() 566 | await self._connected(self.job_id, self.job_token) 567 | except Exception as e: 568 | self.connecting.set_exception(e) 569 | 570 | self.queue = queue_copy + self.queue 571 | 572 | def log(self, *message: str): 573 | if not self.silent: print(*message) 574 | -------------------------------------------------------------------------------- /deepkit/keras_tf.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import sys 3 | from os.path import dirname 4 | from struct import pack 5 | from typing import Dict, Optional, List 6 | 7 | import PIL.Image 8 | import numpy as np 9 | 10 | if 'keras' in sys.modules: 11 | import keras 12 | else: 13 | import tensorflow.keras as keras 14 | 15 | import tensorflow as tf 16 | 17 | import deepkit.debugger 18 | from deepkit.utils.image import get_layer_vis_square, get_image_tales, make_image_from_dense 19 | 20 | if 'keras' in sys.modules: 21 | from keras import Model 22 | else: 23 | from tensorflow.keras import Model 24 | 25 | 26 | def count_params(weights): 27 | return int(sum(np.prod(p.shape.as_list()) for p in weights)) 28 | 29 | 30 | def get_tf_shape_as_list(tf_shape_dim): 31 | return list(map(lambda x: x.size, list(tf_shape_dim))) 32 | 33 | 34 | def extract_model_graph(model): 35 | def extract_attributes(layer): 36 | attrs = [ 37 | # InputLayer 38 | 'input_shape', 'batch_size', 'dtype', 'sparse', 'ragged', 39 | 40 | # conv, LocallyConnected1D 41 | 'rank', 'filters', 'kernel_size', 'strides', 'padding', 'data_format', 'dilation_rate', 42 | 'use_bias', 43 | 'kernel_initializer', 'bias_initializer', 'kernel_regularizer', 'bias_regularizer', 44 | 'activity_regularizer', 'kernel_constraint', 'bias_constraint', 45 | 46 | # pooling 47 | 'pool_size', 'strides', 'padding', 'data_format', 48 | 'pool_function', 49 | 50 | # RNN 51 | 'cell', 'return_sequences', 'return_state', 'go_backwards', 'stateful', 'unroll', 'time_major', 52 | # RNNCell 53 | 'units', 'recurrent_activation', 'use_bias', 'kernel_initializer', 'recurrent_initializer', 54 | 'bias_initializer', 'unit_forget_bias', 'kernel_regularizer', 'recurrent_regularizer', 55 | 'bias_regularizer', 'kernel_constraint', 'recurrent_constraint', 'bias_constraint', 56 | 'dropout', 'recurrent_dropout', 'implementation', 57 | 58 | # Embedding 59 | 'input_dim', 'output_dim', 60 | 'embeddings_initializer', 'embeddings_regularizer', 'activity_regularizer', 61 | 'embeddings_constraint', 'mask_zero', 'input_length', 'fused', 62 | 63 | # Merge 64 | 'axes', 'normalize', 65 | 66 | # Noise 67 | 'stddev', 'rate', 'noise_shape', 68 | 69 | # BatchNormalization 70 | 'momentum', 'epsilon', 'center', 'scale', 71 | 'beta_initializer', 'gamma_initializer', 'moving_mean_initializer', 'moving_variance_initializer', 72 | 'beta_regularizer', '', 'gamma_regularizer', 'beta_constraint', 'gamma_constraint', 'renorm', 73 | 'virtual_batch_size', 'adjustment' 74 | 75 | 'rate', 'noise_shape', # Dropout 76 | 'data_format', # Flatten 77 | 'target_shape', # Reshape 78 | 'dims', # Permute 79 | 'n', # RepeatVector 80 | 'function', # Lambda 81 | 'l1', 'l2', # ActivityRegularization 82 | 'mask_value', # Masking 83 | ] 84 | res = {} 85 | 86 | def normalize_value(name, v): 87 | if inspect.isfunction(v): 88 | return v.__name__ 89 | 90 | if isinstance(v, (str, int, float, bool)): 91 | return v 92 | 93 | if isinstance(v, (list, tuple)): 94 | return str(v) 95 | 96 | if type(v).__name__ != 'type': 97 | # todo, if `cell` for RNN we probably want to extract those information as well 98 | return type(v).__name__ 99 | 100 | return str(v) 101 | 102 | for attr in attrs: 103 | if hasattr(layer, attr): 104 | res[attr] = getattr(layer, attr) 105 | 106 | res[attr] = normalize_value(attr, res[attr]) 107 | 108 | if hasattr(layer, 'activation'): 109 | if layer.activation: 110 | res['activation'] = layer.activation.__name__ 111 | # todo get action parameters. `alpha`, etc 112 | 113 | if hasattr(layer, 'trainable_weights'): 114 | res['trainable_weights'] = count_params(layer.trainable_weights) 115 | if hasattr(layer, 'non_trainable_weights'): 116 | res['non_trainable_weights'] = count_params(layer.non_trainable_weights) 117 | 118 | return res 119 | 120 | def tensor_name_to_node_name(name: str) -> str: 121 | return name[0:name.rindex(':')] 122 | 123 | def get_parent(name, go_up=1) -> str: 124 | return '/'.join(name.split('/')[:go_up * -1]) 125 | 126 | def get_scope_id(name: str): 127 | """ 128 | Takes a name like 'dense_2/MatMul' and converts it to its scope `dense_2`. 129 | Examples 130 | 1. 'dense_1/MatMul/ReadVariableOp/resource' 131 | -> dense_1/MatMul/ReadVariableOp 132 | 2. 'dense_1/MatMul/ReadVariableOp' 133 | -> dense_1/MatMul 134 | """ 135 | return dirname(name) 136 | 137 | edges = dict() 138 | nodes = dict() 139 | names_to_scope = dict() 140 | scope_nodes = dict() 141 | input_names = [] 142 | output_names = [] 143 | record_map = dict() 144 | 145 | output_tensor = model.outputs[0] if hasattr(model, 'outputs') else model.output 146 | if not hasattr(output_tensor, 'graph'): 147 | # only tensorflow has `graph` defined. 148 | graph = {'nodes': [], 'scopes': []} 149 | return graph, record_map, input_names 150 | 151 | g = output_tensor.graph 152 | tf_nodes = list(g.as_graph_def(add_shapes=True).node) 153 | blacklist = {'Placeholder', 'PlaceholderWithDefault', 'Const'} 154 | 155 | model_scoped_layer_names = set() 156 | model_unique_layer_names = set() 157 | 158 | def extract_layers(model, scope_id=''): 159 | scope_prefix = ((scope_id + '/') if scope_id else '') 160 | for layer in model.layers: 161 | model_scoped_layer_names.add(scope_prefix + layer.name) 162 | model_unique_layer_names.add(layer.name) 163 | 164 | if isinstance(layer, Model): 165 | extract_layers(layer, scope_prefix + layer.name) 166 | 167 | extract_layers(model, '') 168 | 169 | def get_scoped_name(full_name: str): 170 | """ 171 | 1. 'sequential_1/conv2d_1/convolution/ReadVariableOp' 172 | => 'sequential_1', 'conv2d_1' 173 | 2. 'conv2d_1/convolution/ReadVariableOp' 174 | => '', 'conv2d_1' 175 | 3. 'dense_1/MatMul' 176 | => '', 'dense_1' 177 | """ 178 | names = full_name.split('/') 179 | scope = '' 180 | name = '' 181 | 182 | for part in names: 183 | next_scope = scope + ('/' if scope else '') + name 184 | next_name = part 185 | next_full_name = next_scope + ('/' if next_scope else '') + next_name 186 | if next_full_name not in model_scoped_layer_names: 187 | break 188 | scope = next_scope 189 | name = next_name 190 | 191 | if not scope and not name and names[0] in model_unique_layer_names: 192 | return '', names[0] 193 | 194 | return scope, name 195 | 196 | for tensor in model.inputs: 197 | input_names.append(tensor_name_to_node_name(tensor.name)) 198 | 199 | for tensor in model.outputs: 200 | output_names.append(tensor_name_to_node_name(tensor.name)) 201 | 202 | for node in tf_nodes: 203 | is_input = node.name in input_names 204 | if node.op in blacklist and not is_input: continue 205 | 206 | nodes[node.name] = node 207 | scope_id = get_scope_id(node.name) 208 | names_to_scope[node.name] = scope_id 209 | 210 | if scope_id not in scope_nodes: 211 | scope_nodes[names_to_scope[node.name]] = [] 212 | 213 | scope_nodes[scope_id].append(node.name) 214 | 215 | for node in nodes.values(): 216 | edges[node.name] = set() 217 | node_scope, node_name = get_scoped_name(node.name) 218 | 219 | for input in node.input: 220 | # filter unknown nodes 221 | if input not in nodes: continue 222 | 223 | input_scope, input_name = get_scoped_name(input) 224 | if input_name == node_name and node_scope != input_scope: 225 | # 'sequential_1/conv2d_1/convolution/ReadVariableOp' points to 226 | # its internals at 'conv2d_1/kernel', which are both conv2d_1, but 227 | # on different scopes, which mean we don't display `conv2d_1/kernel`, since 228 | # its only internals. 229 | continue 230 | 231 | edges[node.name].add(input) 232 | 233 | nodes_names_to_display = set() 234 | 235 | def collect_nodes_to_display(inputs): 236 | for input in inputs: 237 | if input not in nodes_names_to_display: 238 | nodes_names_to_display.add(input) 239 | if input in edges: 240 | collect_nodes_to_display(edges[input]) 241 | 242 | dk_nodes = [] 243 | dk_scopes = [] 244 | 245 | primitive = {'Identity'} 246 | 247 | # shows those layers activation nodes. 248 | activations = {'elu', 'softmax', 'selu', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 249 | 'exponential', 'linear', 'leakyrelu'} 250 | 251 | op_names_normalization = {'AddV2': 'add'} 252 | 253 | # show as type 'layer' when no `activation` or linear activation has been set. This 254 | # hides internals of those layers in the graph. 255 | layers = {'Embedding', 'Flatten', 'Dense', 'Dropout', 'Reshape', 'BatchNormalization', 'UpSampling2D', 'Conv2D'} 256 | 257 | for i, output in enumerate(model.outputs): 258 | name = tensor_name_to_node_name(output.name) 259 | nodes_names_to_display.add(name) 260 | collect_nodes_to_display(edges[name]) 261 | record_map[output.name] = name + ':0' 262 | 263 | shape = [] 264 | if '_output_shapes' in nodes[name].attr: 265 | shape = list(map(lambda x: x.size, list(nodes[name].attr['_output_shapes'].list.shape[0].dim))) 266 | 267 | dk_nodes.append({ 268 | 'id': output.name, 269 | 'label': name, 270 | 'type': 'output', 271 | 'subType': '', 272 | 'input': list(edges[name]), 273 | 'attributes': {}, 274 | 'recordable': True, 275 | 'scope': '', 276 | 'shape': shape, 277 | }) 278 | 279 | for name in nodes_names_to_display: 280 | scope_id = names_to_scope[name] 281 | node_label = name 282 | node_type = 'op' 283 | node_sub_type: str = nodes[name].op 284 | inputs = edges[name] if name in edges else [] 285 | recordable = True 286 | shape = [] 287 | if '_output_shapes' in nodes[name].attr: 288 | shape = list(map(lambda x: x.size, list(nodes[name].attr['_output_shapes'].list.shape[0].dim))) 289 | 290 | if name in input_names: 291 | recordable = True 292 | node_type = 'input' 293 | 294 | if name in output_names: 295 | recordable = True 296 | node_type = 'output' 297 | 298 | if node_sub_type in primitive: 299 | node_type = 'primitive' 300 | 301 | if node_sub_type.lower() in activations: 302 | node_type = 'activation' 303 | 304 | if node_sub_type in op_names_normalization: 305 | node_sub_type = op_names_normalization[node_sub_type] 306 | 307 | if recordable: 308 | # map names to tensor, which can be later used to fetch the output 309 | try: 310 | g.get_tensor_by_name(name + ':0') 311 | record_map[name] = name + ':0' 312 | except: 313 | recordable = False 314 | 315 | is_collapsible = node_sub_type != 'Sequential' 316 | if scope_id and is_collapsible and scope_id in scope_nodes and len(scope_nodes[scope_id]) == 1: 317 | # the scope has only one item, so collapse it. 318 | parent_scope_id = get_parent(scope_id) 319 | if scope_id not in nodes: 320 | scope_id = parent_scope_id 321 | 322 | # is_collapsible = node_sub_type != 'Sequential' 323 | # while scope_id and is_collapsible and scope_id in scope_nodes and len(scope_nodes[scope_id]) == 1: 324 | # # the scope has only one item, so collapse it. 325 | # scope_id = get_parent(scope_id) 326 | # if scope_id in nodes: 327 | # is_collapsible = nodes[scope_id].op != 'Sequential' 328 | # inputs = edges[scope_id] 329 | # else: 330 | # is_collapsible = False 331 | 332 | node = { 333 | 'id': name, 334 | 'label': node_label, 335 | 'type': node_type, 336 | 'subType': node_sub_type, 337 | 'input': list(inputs), 338 | 'attributes': {}, 339 | 'recordable': recordable, 340 | 'scope': scope_id, 341 | 'shape': shape, 342 | } 343 | dk_nodes.append(node) 344 | 345 | def extract_layers(model, scope_id=''): 346 | if scope_id: 347 | dk_scopes.append({ 348 | 'id': scope_id, 349 | 'label': scope_id, 350 | 'subType': type(model).__name__, 351 | 'recordable': True, 352 | }) 353 | scope_prefix = ((scope_id + '/') if scope_id else '') 354 | 355 | for layer in model.layers: 356 | recordable = True 357 | if hasattr(layer, 'outputs'): 358 | recordable = len(layer.outputs) == 1 359 | else: 360 | recordable = layer.output is not None 361 | 362 | if recordable: 363 | # we track here the actual layer, because it contains the weights/biases correctly 364 | tensor = layer.outputs[0] if hasattr(layer, 'outputs') else layer.output 365 | # sub tensors must have the layer name as prefix. If this is not the case 366 | # it references the wrong tensor. We make here sure the correct sub tensor 367 | # is chosen and not one from a shadow/sibling graph. 368 | # 1. scope_prefix='', layer=Dense1 and tensor is like 'dense_1/Relu' which is correct 369 | # 2. scope_prefix='', layer=Sequential1 and tensor is like 'activation/tanh', but we need 'sequential_1/activation/tanh' 370 | # 3. scope_prefix='sequential_1', layer=Dense2, tensor is like 'dense_1/Relu', but we need 'sequential_1/dense_2/Relu' 371 | if '/' in tensor.name and not tensor.name.startswith(scope_prefix + layer.name + '/'): 372 | try: 373 | tensor = g.get_tensor_by_name(scope_prefix + tensor.name) 374 | except: 375 | tensor = g.get_tensor_by_name(scope_prefix + layer.name + '/' + tensor.name) 376 | 377 | record_map[scope_prefix + layer.name] = { 378 | 'layer': layer, 379 | 'tensor': tensor 380 | } 381 | 382 | node_sub_type = type(layer).__name__ 383 | node_type = 'scope' 384 | 385 | if node_sub_type in layers: 386 | if not hasattr(layer, 'activation') or layer.activation is None or layer.activation.__name__ == 'linear': 387 | # once the layer has a custom activation function, we don't collapse it to a layer type 388 | # since that wouldn't be visible in the graph anymore. 389 | node_type = 'layer' 390 | 391 | dk_scopes.append({ 392 | 'id': scope_prefix + layer.name, 393 | 'label': layer.name, 394 | 'type': node_type, 395 | 'subType': node_sub_type, 396 | 'attributes': extract_attributes(layer), 397 | 'recordable': recordable, 398 | 'shape': layer.output_shape, 399 | }) 400 | 401 | if isinstance(layer, Model): 402 | extract_layers(layer, scope_prefix + layer.name) 403 | 404 | extract_layers(model, '') 405 | 406 | graph = {'nodes': dk_nodes, 'scopes': dk_scopes} 407 | return graph, record_map, input_names 408 | 409 | 410 | class TFDebugger: 411 | def __init__(self, debugger: deepkit.debugger.DebuggerManager, model, model_input, graph_name: str, 412 | record_map: dict, is_batch: bool, input_names: List[str]): 413 | self.debugger = debugger 414 | self.model = model 415 | self.model_input = model_input 416 | self.graph_name = graph_name 417 | self.input_names = input_names 418 | self.is_batch = is_batch 419 | 420 | # contains a map of recording map, names from nodes of the full graph to actual modules 421 | # this is necessary since we map certain internal nodes to a scope/layer/module. 422 | self.record_map = record_map 423 | 424 | self.fetch_result: Dict[str, deepkit.debugger.DebuggerFetchItem] = dict() 425 | self.fetch_config: Optional[deepkit.debugger.DebuggerFetchConfig] = None 426 | 427 | def set_input(self, x): 428 | # resize batches to size 1 if is_batch=True 429 | if isinstance(x, tf.data.Dataset): 430 | x = next(iter(x))[0] 431 | 432 | if len(self.input_names) == 1: 433 | self.model_input = np.array([x[0]] if self.is_batch else x) 434 | else: 435 | self.model_input = [np.array([v[0]]) if self.is_batch else v for v in x] 436 | 437 | def fetch(self, fetch_config: deepkit.debugger.DebuggerFetchConfig) -> Dict[ 438 | str, deepkit.debugger.DebuggerFetchItem]: 439 | self.fetch_config = fetch_config 440 | self.fetch_result = dict() 441 | 442 | node_names = [] 443 | for name in self.record_map: 444 | # if name is an input, we need to fetch it directly from the self.model_input 445 | # otherwise TF crashes with `input_1:0 is both fed and fetched` 446 | if name in self.input_names: 447 | continue 448 | 449 | node_id = self.graph_name + ':' + name 450 | if self.fetch_config.needs_fetch(node_id): 451 | node_names.append(name) 452 | 453 | if self.model_input is not None: 454 | if len(self.input_names) > 1: 455 | for i, name in enumerate(self.input_names): 456 | self._set_item_from_input(i, self.model_input[i]) 457 | elif len(self.input_names) == 1: 458 | self._set_item_from_input(0, self.model_input) 459 | 460 | if not len(node_names): 461 | return self.fetch_result 462 | 463 | if self.model_input is None: 464 | return self.fetch_result 465 | 466 | data = self.get_image_and_histogram_from_layers(self.fetch_config.x, node_names) 467 | 468 | for i, name in enumerate(node_names): 469 | jpeg, ahistogram = data[i] 470 | whistogram = None 471 | bhistogram = None 472 | tensor_or_layer_dict = self.record_map[name] 473 | if isinstance(tensor_or_layer_dict, dict): 474 | layer = tensor_or_layer_dict['layer'] 475 | whistogram, bhistogram = self.get_weight_histogram_from_layer(self.fetch_config.x, layer) 476 | 477 | node_id = self.graph_name + ':' + name 478 | self.fetch_result[node_id] = deepkit.debugger.DebuggerFetchItem( 479 | name=node_id, 480 | output=jpeg, 481 | ahistogram=ahistogram, 482 | whistogram=whistogram, 483 | bhistogram=bhistogram, 484 | ) 485 | 486 | return self.fetch_result 487 | 488 | def _set_item_from_input(self, index, data): 489 | name = self.input_names[index] 490 | node_id = self.graph_name + ':' + name 491 | 492 | if not self.fetch_config.needs_fetch(node_id): 493 | return 494 | 495 | jpeg, ahistogram = self._image_and_histogram(self.fetch_config.x, data) 496 | self.fetch_result[node_id] = deepkit.debugger.DebuggerFetchItem( 497 | name=node_id, 498 | output=jpeg, 499 | ahistogram=ahistogram, 500 | whistogram=None, 501 | bhistogram=None, 502 | ) 503 | 504 | def get_image_and_histogram_from_layers(self, x, names): 505 | outputs = [] 506 | output_tensor = self.model.outputs[0] if hasattr(self.model, 'outputs') else self.model.output 507 | g = output_tensor.graph 508 | for name in names: 509 | tensor_name_or_layer_dict = self.record_map[name] 510 | if isinstance(tensor_name_or_layer_dict, str): 511 | tensor = g.get_tensor_by_name(tensor_name_or_layer_dict) 512 | outputs.append(tensor) 513 | else: 514 | layer_dict = tensor_name_or_layer_dict 515 | outputs.append(layer_dict['tensor']) 516 | 517 | inputs = self.model.inputs if hasattr(self.model, 'inputs') else self.model.input 518 | 519 | fn = keras.backend.function(inputs, outputs) 520 | try: 521 | y = fn(self.model_input) 522 | 523 | result = [] 524 | 525 | for i, _ in enumerate(names): 526 | result.append(self._image_and_histogram(x, y[i])) 527 | 528 | return result 529 | except Exception as e: 530 | print(f"Failed to watch tensor. Input shape: {self.model_input.shape}, outputs={len(outputs)}") 531 | raise e 532 | 533 | def _image_and_histogram(self, x, output): 534 | image = None 535 | histogram = None 536 | if hasattr(output, 'shape'): 537 | # tf is not batch per default 538 | sample = np.copy(output) 539 | shape = output.shape 540 | 541 | if self.is_batch: 542 | # display only first item in batch 543 | sample = np.copy(output[0]) 544 | shape = output.shape[1:] # first is batch shizzle 545 | 546 | if len(shape) == 3: 547 | if keras.backend.image_data_format() == 'channels_last': 548 | sample = np.transpose(sample, (2, 0, 1)) 549 | 550 | if sample.shape[0] == 3: 551 | image = PIL.Image.fromarray(get_layer_vis_square(sample)) 552 | else: 553 | image = PIL.Image.fromarray(get_image_tales(sample)) 554 | elif len(shape) > 1: 555 | image = PIL.Image.fromarray(get_layer_vis_square(sample)) 556 | elif len(shape) == 1: 557 | if shape[0] == 1: 558 | # we got a single number 559 | output = sample[0] 560 | else: 561 | image = make_image_from_dense(sample) 562 | 563 | h = np.histogram(sample, bins=20) 564 | histogram = pack(' 0: 581 | h = np.histogram(layer_weights[0], bins=20) 582 | # <...x><...y>, little endian 583 | # uint8|Uint32|Uint16|...Float32|...Uint32 584 | # B|L|H|...f|...L 585 | weights = pack(' 1: 589 | h = np.histogram(layer_weights[1], bins=20) 590 | biases = pack(' 0: 49 | neurons = np.append(neurons, np.zeros(diff, dtype=neurons.dtype)) 50 | 51 | img = array_to_img(neurons.reshape((1, cols, cols))) 52 | img = img.resize((cols * 8, cols * 8), PIL.Image.NEAREST) 53 | 54 | return img 55 | 56 | 57 | def upscale(image, ratio): 58 | """ 59 | return upscaled image array 60 | Arguments: 61 | image -- a (H,W,C) numpy.ndarray 62 | ratio -- scaling factor (>1) 63 | """ 64 | if not isinstance(image, np.ndarray): 65 | raise ValueError('Expected ndarray') 66 | if ratio < 1: 67 | raise ValueError('Ratio must be greater than 1 (ratio=%f)' % ratio) 68 | width = int(math.floor(image.shape[1] * ratio)) 69 | height = int(math.floor(image.shape[0] * ratio)) 70 | channels = image.shape[2] 71 | out = np.ndarray((height, width, channels), dtype=np.uint8) 72 | for x, y in np.ndindex((width, height)): 73 | out[y, x] = image[int(math.floor(y / ratio)), int(math.floor(x / ratio))] 74 | return out 75 | 76 | 77 | def resize_image(image, height, width, 78 | channels=None, 79 | resize_mode=None 80 | ): 81 | """ 82 | Resizes an image and returns it as a np.array 83 | Arguments: 84 | image -- a PIL.Image or numpy.ndarray 85 | height -- height of new image 86 | width -- width of new image 87 | Keyword Arguments: 88 | channels -- channels of new image (stays unchanged if not specified) 89 | resize_mode -- can be crop, squash, fill or half_crop 90 | """ 91 | if resize_mode is None: 92 | resize_mode = 'squash' 93 | if resize_mode not in ['crop', 'squash', 'fill', 'half_crop']: 94 | raise ValueError('resize_mode "%s" not supported' % resize_mode) 95 | 96 | if channels not in [None, 1, 3]: 97 | raise ValueError('unsupported number of channels: %s' % channels) 98 | 99 | if isinstance(image, PIL.Image.Image): 100 | # Convert image mode (channels) 101 | if channels is None: 102 | image_mode = image.mode 103 | if image_mode == 'L': 104 | channels = 1 105 | elif image_mode == 'RGB': 106 | channels = 3 107 | else: 108 | raise ValueError('unknown image mode "%s"' % image_mode) 109 | elif channels == 1: 110 | # 8-bit pixels, black and white 111 | image_mode = 'L' 112 | elif channels == 3: 113 | # 3x8-bit pixels, true color 114 | image_mode = 'RGB' 115 | if image.mode != image_mode: 116 | image = image.convert(image_mode) 117 | image = np.array(image) 118 | elif isinstance(image, np.ndarray): 119 | if image.dtype != np.uint8: 120 | image = image.astype(np.uint8) 121 | if image.ndim == 3 and image.shape[2] == 1: 122 | image = image.reshape(image.shape[:2]) 123 | if channels is None: 124 | if image.ndim == 2: 125 | channels = 1 126 | elif image.ndim == 3 and image.shape[2] == 3: 127 | channels = 3 128 | else: 129 | raise ValueError('invalid image shape: %s' % (image.shape,)) 130 | elif channels == 1: 131 | if image.ndim != 2: 132 | if image.ndim == 3 and image.shape[2] == 3: 133 | # color to grayscale 134 | image = np.dot(image, [0.299, 0.587, 0.114]).astype(np.uint8) 135 | else: 136 | raise ValueError('invalid image shape: %s' % (image.shape,)) 137 | elif channels == 3: 138 | if image.ndim == 2: 139 | # grayscale to color 140 | image = np.repeat(image, 3).reshape(image.shape + (3,)) 141 | elif image.shape[2] != 3: 142 | raise ValueError('invalid image shape: %s' % (image.shape,)) 143 | else: 144 | raise ValueError('resize_image() expected a PIL.Image.Image or a numpy.ndarray') 145 | 146 | # No need to resize 147 | if image.shape[0] == height and image.shape[1] == width: 148 | return image 149 | 150 | # Resize 151 | interp = 'bilinear' 152 | 153 | width_ratio = float(image.shape[1]) / width 154 | height_ratio = float(image.shape[0]) / height 155 | if resize_mode == 'squash' or width_ratio == height_ratio: 156 | return imresize(image, (height, width), interp=interp) 157 | elif resize_mode == 'crop': 158 | # resize to smallest of ratios (relatively larger image), keeping aspect ratio 159 | if width_ratio > height_ratio: 160 | resize_height = height 161 | resize_width = int(round(image.shape[1] / height_ratio)) 162 | else: 163 | resize_width = width 164 | resize_height = int(round(image.shape[0] / width_ratio)) 165 | image = imresize(image, (resize_height, resize_width), interp=interp) 166 | 167 | # chop off ends of dimension that is still too long 168 | if width_ratio > height_ratio: 169 | start = int(round((resize_width - width) / 2.0)) 170 | return image[:, start:start + width] 171 | else: 172 | start = int(round((resize_height - height) / 2.0)) 173 | return image[start:start + height, :] 174 | else: 175 | if resize_mode == 'fill': 176 | # resize to biggest of ratios (relatively smaller image), keeping aspect ratio 177 | if width_ratio > height_ratio: 178 | resize_width = width 179 | resize_height = int(round(image.shape[0] / width_ratio)) 180 | if (height - resize_height) % 2 == 1: 181 | resize_height += 1 182 | else: 183 | resize_height = height 184 | resize_width = int(round(image.shape[1] / height_ratio)) 185 | if (width - resize_width) % 2 == 1: 186 | resize_width += 1 187 | image = imresize(image, (resize_height, resize_width), interp=interp) 188 | elif resize_mode == 'half_crop': 189 | # resize to average ratio keeping aspect ratio 190 | new_ratio = (width_ratio + height_ratio) / 2.0 191 | resize_width = int(round(image.shape[1] / new_ratio)) 192 | resize_height = int(round(image.shape[0] / new_ratio)) 193 | if width_ratio > height_ratio and (height - resize_height) % 2 == 1: 194 | resize_height += 1 195 | elif width_ratio < height_ratio and (width - resize_width) % 2 == 1: 196 | resize_width += 1 197 | image = imresize(image, (resize_height, resize_width), interp=interp) 198 | # chop off ends of dimension that is still too long 199 | if width_ratio > height_ratio: 200 | start = int(round((resize_width - width) / 2.0)) 201 | image = image[:, start:start + width] 202 | else: 203 | start = int(round((resize_height - height) / 2.0)) 204 | image = image[start:start + height, :] 205 | else: 206 | raise Exception('unrecognized resize_mode "%s"' % resize_mode) 207 | 208 | # fill ends of dimension that is too short with random noise 209 | if width_ratio > height_ratio: 210 | padding = (height - resize_height) / 2 211 | noise_size = (padding, width) 212 | if channels > 1: 213 | noise_size += (channels,) 214 | noise = np.random.randint(0, 255, noise_size).astype('uint8') 215 | image = np.concatenate((noise, image, noise), axis=0) 216 | else: 217 | padding = (width - resize_width) / 2 218 | noise_size = (height, padding) 219 | if channels > 1: 220 | noise_size += (channels,) 221 | noise = np.random.randint(0, 255, noise_size).astype('uint8') 222 | image = np.concatenate((noise, image, noise), axis=1) 223 | 224 | return image 225 | 226 | 227 | def add_bboxes_to_image(image, bboxes, color='red', width=1): 228 | """ 229 | Draw rectangles on the image for the bounding boxes 230 | Returns a PIL.Image 231 | Arguments: 232 | image -- input image 233 | bboxes -- bounding boxes in the [((l, t), (r, b)), ...] format 234 | Keyword arguments: 235 | color -- color to draw the rectangles 236 | width -- line width of the rectangles 237 | Example: 238 | image = Image.open(filename) 239 | add_bboxes_to_image(image, bboxes[filename], width=2, color='#FF7700') 240 | image.show() 241 | """ 242 | 243 | def expanded_bbox(bbox, n): 244 | """ 245 | Grow the bounding box by n pixels 246 | """ 247 | l = min(bbox[0][0], bbox[1][0]) 248 | r = max(bbox[0][0], bbox[1][0]) 249 | t = min(bbox[0][1], bbox[1][1]) 250 | b = max(bbox[0][1], bbox[1][1]) 251 | return ((l - n, t - n), (r + n, b + n)) 252 | 253 | from PIL import ImageDraw 254 | draw = ImageDraw.Draw(image) 255 | for bbox in bboxes: 256 | for n in range(width): 257 | draw.rectangle(expanded_bbox(bbox, n), outline=color) 258 | 259 | return image 260 | 261 | 262 | def get_layer_vis_square(data, 263 | allow_heatmap=True, 264 | normalize=True, 265 | min_img_dim=100, 266 | max_width=1200, 267 | channel_order='RGB', 268 | colormap='jet', 269 | ): 270 | """ 271 | Returns a vis_square for the given layer data 272 | Arguments: 273 | data -- a np.ndarray 274 | Keyword arguments: 275 | allow_heatmap -- if True, convert single channel images to heatmaps 276 | normalize -- whether to normalize the data when visualizing 277 | max_width -- maximum width for the vis_square 278 | """ 279 | if channel_order not in ['RGB', 'BGR']: 280 | raise ValueError('Unsupported channel_order %s' % channel_order) 281 | if data.ndim == 1: 282 | # interpret as 1x1 grayscale images 283 | # (N, 1, 1) 284 | data = data[:, np.newaxis, np.newaxis] 285 | elif data.ndim == 2: 286 | # interpret as 1x1 grayscale images 287 | # (N, 1, 1) 288 | data = data.reshape((data.shape[0] * data.shape[1], 1, 1)) 289 | elif data.ndim == 3: 290 | if data.shape[0] == 3: 291 | # interpret as a color image 292 | # (1, H, W, 3) 293 | if channel_order == 'BGR': 294 | data = data[[2, 1, 0], ...] # BGR to RGB (see issue #59) 295 | data = data.transpose(1, 2, 0) 296 | data = data[np.newaxis, ...] 297 | else: 298 | # interpret as grayscale images 299 | # (N, H, W) 300 | pass 301 | elif data.ndim == 4: 302 | if data.shape[0] == 3: 303 | # interpret as HxW color images 304 | # (N, H, W, 3) 305 | data = data.transpose(1, 2, 3, 0) 306 | if channel_order == 'BGR': 307 | data = data[:, :, :, [2, 1, 0]] # BGR to RGB (see issue #59) 308 | elif data.shape[1] == 3: 309 | # interpret as HxW color images 310 | # (N, H, W, 3) 311 | data = data.transpose(0, 2, 3, 1) 312 | if channel_order == 'BGR': 313 | data = data[:, :, :, [2, 1, 0]] # BGR to RGB (see issue #59) 314 | else: 315 | # interpret as HxW grayscale images 316 | # (N, H, W) 317 | data = data.reshape((data.shape[0] * data.shape[1], data.shape[2], data.shape[3])) 318 | else: 319 | raise RuntimeError('unrecognized data shape: %s' % (data.shape,)) 320 | 321 | return get_layer_vis_square_raw(data, 322 | allow_heatmap, 323 | normalize, 324 | min_img_dim, 325 | max_width, 326 | colormap, 327 | ) 328 | 329 | 330 | def get_image_tales(images, colormap='jet', min_img_dim=100, max_width=1000): 331 | padsize = 1 332 | # convert to float since we're going to do some math 333 | images = images.astype('float32') 334 | 335 | images -= images.min() 336 | if images.max() > 0: 337 | images /= images.max() 338 | images *= 255 339 | 340 | if images.ndim == 3: 341 | # they're grayscale - convert to a colormap 342 | redmap, greenmap, bluemap = get_color_map(colormap) 343 | 344 | red = np.interp(images * (len(redmap) - 1) / 255.0, range(len(redmap)), redmap) 345 | green = np.interp(images * (len(greenmap) - 1) / 255.0, range(len(greenmap)), greenmap) 346 | blue = np.interp(images * (len(bluemap) - 1) / 255.0, range(len(bluemap)), bluemap) 347 | 348 | # Slap the channels back together 349 | images = np.concatenate( 350 | (red[..., np.newaxis], green[..., np.newaxis], blue[..., np.newaxis]), axis=3) 351 | images = np.minimum(images, 255) 352 | images = np.maximum(images, 0) 353 | 354 | # convert back to uint8 355 | images = images.astype('uint8') 356 | 357 | # Compute the output image matrix dimensions 358 | n = int(np.ceil(np.sqrt(images.shape[0]))) 359 | ny = n 360 | nx = n 361 | length = images.shape[0] 362 | if n * (n - 1) >= length: 363 | nx = n - 1 364 | 365 | # Add padding between the images 366 | padding = ((0, nx * ny - length), (0, padsize), (0, padsize)) + ((0, 0),) * (images.ndim - 3) 367 | padded = np.pad(images, padding, mode='constant', constant_values=0) 368 | 369 | # Tile the images beside each other 370 | tiles = padded.reshape( 371 | (ny, nx) + padded.shape[1:]).transpose((0, 2, 1, 3) + tuple(range(4, padded.ndim + 1))) 372 | tiles = tiles.reshape((ny * tiles.shape[1], nx * tiles.shape[3]) + tiles.shape[4:]) 373 | 374 | return tiles 375 | 376 | 377 | def get_layer_vis_square_raw(data, 378 | allow_heatmap=True, 379 | normalize=True, 380 | min_img_dim=100, 381 | max_width=1200, 382 | colormap='jet', 383 | ): 384 | # chop off data so that it will fit within max_width 385 | padsize = 0 386 | width = data.shape[2] 387 | if width > max_width: 388 | data = data[:1, :max_width, :max_width] 389 | else: 390 | if width > 1: 391 | padsize = 1 392 | width += 1 393 | n = max(max_width // width, 1) 394 | n *= n 395 | data = data[:n] 396 | 397 | if not allow_heatmap and data.ndim == 3: 398 | data = data[..., np.newaxis] 399 | 400 | vis = vis_square(data, 401 | padsize=padsize, 402 | normalize=normalize, 403 | colormap=colormap 404 | ) 405 | 406 | # find minimum dimension and upscale if necessary 407 | _min = sorted(vis.shape[:2])[0] 408 | if _min < min_img_dim: 409 | # upscale image 410 | ratio = min_img_dim / float(_min) 411 | vis = upscale(vis, ratio) 412 | return vis 413 | 414 | 415 | def vis_square(images, 416 | padsize=1, 417 | normalize=False, 418 | colormap='jet', 419 | ): 420 | """ 421 | Visualize each image in a grid of size approx sqrt(n) by sqrt(n) 422 | Returns a np.array image 423 | (Based on Caffe's filter_visualization notebook) 424 | Arguments: 425 | images -- an array of shape (N, H, W) or (N, H, W, C) 426 | if C is not set, a heatmap is computed for the result 427 | Keyword arguments: 428 | padsize -- how many pixels go inbetween the tiles 429 | normalize -- if true, scales (min, max) across all images out to (0, 1) 430 | colormap -- a string representing one of the supported colormaps 431 | """ 432 | assert 3 <= images.ndim <= 4, 'images.ndim must be 3 or 4' 433 | # convert to float since we're going to do some math 434 | images = images.astype('float32') 435 | if normalize: 436 | images -= images.min() 437 | if images.max() > 0: 438 | images /= images.max() 439 | images *= 255 440 | 441 | if images.ndim == 3: 442 | # they're grayscale - convert to a colormap 443 | redmap, greenmap, bluemap = get_color_map(colormap) 444 | 445 | red = np.interp(images * (len(redmap) - 1) / 255.0, range(len(redmap)), redmap) 446 | green = np.interp(images * (len(greenmap) - 1) / 255.0, range(len(greenmap)), greenmap) 447 | blue = np.interp(images * (len(bluemap) - 1) / 255.0, range(len(bluemap)), bluemap) 448 | 449 | # Slap the channels back together 450 | images = np.concatenate( 451 | (red[..., np.newaxis], green[..., np.newaxis], blue[..., np.newaxis]), axis=3) 452 | images = np.minimum(images, 255) 453 | images = np.maximum(images, 0) 454 | 455 | # convert back to uint8 456 | images = images.astype('uint8') 457 | 458 | # Compute the output image matrix dimensions 459 | n = int(np.ceil(np.sqrt(images.shape[0]))) 460 | ny = n 461 | nx = n 462 | length = images.shape[0] 463 | if n * (n - 1) >= length: 464 | nx = n - 1 465 | 466 | # Add padding between the images 467 | padding = ((0, nx * ny - length), (0, padsize), (0, padsize)) + ((0, 0),) * (images.ndim - 3) 468 | padded = np.pad(images, padding, mode='constant', constant_values=255) 469 | 470 | # Tile the images beside each other 471 | tiles = padded.reshape( 472 | (ny, nx) + padded.shape[1:]).transpose((0, 2, 1, 3) + tuple(range(4, padded.ndim + 1))) 473 | tiles = tiles.reshape((ny * tiles.shape[1], nx * tiles.shape[3]) + tiles.shape[4:]) 474 | 475 | if tiles.shape[-1] == 1: 476 | # grayscale to color 477 | tiles = np.dstack([tiles.squeeze()] * 3) 478 | 479 | return tiles 480 | 481 | 482 | def get_color_map(name): 483 | """ 484 | Return a colormap as (redmap, greenmap, bluemap) 485 | Arguments: 486 | name -- the name of the colormap. If unrecognized, will default to 'jet'. 487 | """ 488 | redmap = [0] 489 | greenmap = [0] 490 | bluemap = [0] 491 | if name == 'white': 492 | # essentially a noop 493 | redmap = [0, 1] 494 | greenmap = [0, 1] 495 | bluemap = [0, 1] 496 | elif name == 'simple': 497 | redmap = [0, 1, 1, 1] 498 | greenmap = [0, 0, 1, 1] 499 | bluemap = [0, 0, 0, 1] 500 | elif name == 'hot': 501 | redmap = [0, 0.03968253968253968, 0.07936507936507936, 0.119047619047619, 0.1587301587301587, 502 | 0.1984126984126984, 0.2380952380952381, 0.2777777777777778, 0.3174603174603174, 0.3571428571428571, 503 | 0.3968253968253968, 0.4365079365079365, 0.4761904761904762, 0.5158730158730158, 0.5555555555555556, 504 | 0.5952380952380952, 505 | 0.6349206349206349, 0.6746031746031745, 0.7142857142857142, 0.753968253968254, 0.7936507936507936, 506 | 0.8333333333333333, 0.873015873015873, 0.9126984126984127, 0.9523809523809523, 0.992063492063492, 1, 507 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 508 | 1, 1, 1] 509 | greenmap = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.03174603174603163, 510 | 0.0714285714285714, 0.1111111111111112, 0.1507936507936507, 0.1904761904761905, 0.23015873015873, 511 | 0.2698412698412698, 0.3095238095238093, 0.3492063492063491, 0.3888888888888888, 0.4285714285714284, 512 | 0.4682539682539679, 0.5079365079365079, 0.5476190476190477, 0.5873015873015872, 0.6269841269841268, 513 | 0.6666666666666665, 0.7063492063492065, 0.746031746031746, 0.7857142857142856, 0.8253968253968254, 514 | 0.8650793650793651, 0.9047619047619047, 0.9444444444444442, 0.984126984126984, 1, 1, 1, 1, 1, 1, 1, 515 | 1, 1, 1, 1, 1, 1] 516 | bluemap = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 517 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.04761904761904745, 0.1269841269841265, 518 | 0.2063492063492056, 0.2857142857142856, 0.3650793650793656, 0.4444444444444446, 0.5238095238095237, 519 | 0.6031746031746028, 0.6825396825396828, 0.7619047619047619, 0.8412698412698409, 0.92063492063492, 1] 520 | elif name == 'rainbow': 521 | redmap = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.9365079365079367, 522 | 0.8571428571428572, 0.7777777777777777, 0.6984126984126986, 0.6190476190476191, 0.53968253968254, 523 | 0.4603174603174605, 0.3809523809523814, 0.3015873015873018, 0.2222222222222223, 0.1428571428571432, 524 | 0.06349206349206415, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.03174603174603208, 0.08465608465608465, 525 | 0.1375661375661377, 0.1904761904761907, 0.2433862433862437, 0.2962962962962963, 0.3492063492063493, 526 | 0.4021164021164023, 0.4550264550264553, 0.5079365079365079, 0.5608465608465609, 0.6137566137566139, 527 | 0.666666666666667] 528 | greenmap = [0, 0.03968253968253968, 0.07936507936507936, 0.119047619047619, 0.1587301587301587, 529 | 0.1984126984126984, 0.2380952380952381, 0.2777777777777778, 0.3174603174603174, 0.3571428571428571, 530 | 0.3968253968253968, 0.4365079365079365, 0.4761904761904762, 0.5158730158730158, 0.5555555555555556, 531 | 0.5952380952380952, 0.6349206349206349, 0.6746031746031745, 0.7142857142857142, 0.753968253968254, 532 | 0.7936507936507936, 533 | 0.8333333333333333, 0.873015873015873, 0.9126984126984127, 0.9523809523809523, 0.992063492063492, 1, 534 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.9841269841269842, 0.9047619047619047, 0.8253968253968256, 535 | 0.7460317460317465, 0.666666666666667, 0.587301587301587, 0.5079365079365079, 0.4285714285714288, 536 | 0.3492063492063493, 0.2698412698412698, 0.1904761904761907, 0.1111111111111116, 0.03174603174603208, 537 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 538 | bluemap = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 539 | 0, 0, 0, 0, 0.01587301587301582, 0.09523809523809534, 0.1746031746031744, 0.2539682539682535, 540 | 0.333333333333333, 0.412698412698413, 0.4920634920634921, 0.5714285714285712, 0.6507936507936507, 541 | 0.7301587301587302, 0.8095238095238093, 0.8888888888888884, 0.9682539682539679, 1, 1, 1, 1, 1, 1, 1, 542 | 1, 1, 1, 1, 1, 1] 543 | elif name == 'winter': 544 | greenmap = [0, 1] 545 | bluemap = [1, 0.5] 546 | else: 547 | if name != 'jet': 548 | print('Warning: colormap "%s" not supported. Using jet instead.' % name) 549 | redmap = [0, 0, 0, 0, 0.5, 1, 1, 1, 0.5] 550 | greenmap = [0, 0, 0.5, 1, 1, 1, 0.5, 0, 0] 551 | bluemap = [0.5, 1, 1, 1, 0.5, 0, 0, 0, 0] 552 | return 255.0 * np.array(redmap), 255.0 * np.array(greenmap), 255.0 * np.array(bluemap) 553 | -------------------------------------------------------------------------------- /deepkit/experiment.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import atexit 3 | import base64 4 | import json 5 | import os 6 | import signal 7 | import struct 8 | import sys 9 | import time 10 | from datetime import datetime 11 | from threading import Lock 12 | from typing import Optional, Callable, NamedTuple, Dict, List 13 | import math 14 | 15 | import PIL.Image 16 | import numpy as np 17 | import psutil 18 | import typedload 19 | from rx import interval 20 | 21 | import deepkit.client 22 | import deepkit.debugger 23 | import deepkit.globals 24 | import deepkit.utils 25 | from deepkit.model import ExperimentOptions 26 | from deepkit.utils.image import pil_image_to_jpeg, get_layer_vis_square, get_image_tales, make_image_from_dense 27 | from deepkit.utils import numpy_to_binary 28 | 29 | 30 | def get_job_config(): 31 | if deepkit.globals.loaded_job_config is None: 32 | if 'DEEPKIT_JOB_CONFIG' in os.environ: 33 | deepkit.globals.loaded_job_config = json.loads(os.environ['DEEPKIT_JOB_CONFIG']) 34 | else: 35 | deepkit.globals.loaded_job_config = { 36 | } 37 | 38 | return deepkit.globals.loaded_job_config 39 | 40 | 41 | class JobController: 42 | def stop(self): 43 | """ 44 | Raising the SIGINT signal in the current process and all sub-processes. 45 | os.kill() only issues a signal in the current process (without subprocesses). 46 | CTRL+C on the console sends the signal to the process group (which we need). 47 | """ 48 | if hasattr(signal, 'CTRL_C_EVENT'): 49 | # windows. Need CTRL_C_EVENT to raise the signal in the whole process group 50 | os.kill(os.getpid(), signal.CTRL_C_EVENT) 51 | else: 52 | # unix. 53 | pgid = os.getpgid(os.getpid()) 54 | if pgid == 1: 55 | os.kill(os.getpid(), signal.SIGINT) 56 | else: 57 | os.killpg(os.getpgid(os.getpid()), signal.SIGINT) 58 | 59 | 60 | class JobDebuggingState(NamedTuple): 61 | watchingLayers: Dict[str, bool] 62 | live: bool 63 | recording: bool 64 | 65 | # 'epoch' | 'second' 66 | recordingMode: str 67 | 68 | # 'watched' | 'all' 69 | recordingLayers: str 70 | 71 | recordingSecond: int 72 | 73 | 74 | class JobDebuggerController: 75 | def __init__(self, client: deepkit.client.Client): 76 | self.state: Optional[JobDebuggingState] = None 77 | self.client = client 78 | 79 | async def connected(self): 80 | await self._update_watching_layers() 81 | 82 | # registered RPC function 83 | async def updateWatchingLayer(self): 84 | await self._update_watching_layers() 85 | 86 | async def _update_watching_layers(self): 87 | self.state = typedload.load(await self.client.job_action('getDebuggingState'), JobDebuggingState) 88 | 89 | 90 | class Experiment: 91 | def __init__(self, project=None, account=None, monitoring=True, try_pick_up=False, parent_experiment=None, 92 | silent=False): 93 | """ 94 | :type project: str If the current folder is not linked and you don't specify a project here, an error is raised since 95 | Deepkit isn't able to know to which project the experiments data should be sent. 96 | 97 | :type account: str Per default the first account linked to this folder is used (see `deepkit link` or `deepkit-sdk auth -l`), 98 | this is on a new system `localhost`. 99 | You can overwrite which account is used by specifying the name here (see `deepkit id` for 100 | available accounts in your system). 101 | 102 | :type monitoring: bool When true this experiment sends current stdout as experiment logs and monitors 103 | hardware utilisation of the current process. 104 | 105 | :type try_pick_up: bool Whether it should be tried to pick up an existing experiment created by the 106 | CLI/App (determined by environment variables). For manually created experiments this should be False. 107 | 108 | :tye parent_experiment: str When defined moves this experiment as sub experiment 109 | """ 110 | self.account = account 111 | self.project = project 112 | self.monitoring = monitoring 113 | self.parent_experiment = parent_experiment 114 | self.silent = silent 115 | 116 | if not self.parent_experiment: 117 | if deepkit.globals.last_experiment: 118 | self.parent_experiment = deepkit.globals.last_experiment.id 119 | else: 120 | self.parent_experiment = os.environ.get('DEEPKIT_JOB_ID', None) 121 | 122 | self.metric_buffer = [] 123 | self.speed_buffer = [] 124 | self.logs_buffer = [] 125 | self.last_throttle_call = dict() 126 | 127 | self.client = deepkit.client.Client(project=project, account=account, try_pick_up=try_pick_up, 128 | parent_experiment=self.parent_experiment, silent=self.silent) 129 | 130 | self.log_lock = Lock() 131 | self.defined_metrics = {} 132 | self.shutting_down = False 133 | 134 | self.last_iteration_time = 0 135 | self.last_batch_time = 0 136 | self.job_iteration = 0 137 | self.job_iterations = 0 138 | self.job_step = 0 139 | self.job_steps = 0 140 | 141 | self.model_watching = dict() 142 | 143 | self.auto_x_of_metrix = dict() 144 | self.auto_x_of_insight = dict() 145 | self.created_insights = dict() 146 | 147 | self.seconds_per_iteration = 0 148 | self.seconds_per_iterations = [] 149 | self.debugger = deepkit.debugger.DebuggerManager(self) 150 | 151 | if deepkit.utils.in_self_execution(): 152 | self.job_controller = JobController() 153 | 154 | self.debugger_controller: JobDebuggerController = JobDebuggerController(self.client) 155 | 156 | # runs in the client Thread 157 | def on_connect(connected): 158 | if connected: 159 | if deepkit.utils.in_self_execution(): 160 | self.client.register_controller('job/' + self.client.job_id, self.job_controller) 161 | 162 | self.client.register_controller('job/' + self.client.job_id + '/debugger', self.debugger_controller) 163 | 164 | asyncio.run_coroutine_threadsafe(self.debugger_controller.connected(), loop=self.client.loop) 165 | else: 166 | self.debugger.on_disconnect() 167 | 168 | self.client.connected.subscribe(on_connect) 169 | 170 | atexit.register(self.shutdown) 171 | try: 172 | self.client.connect() 173 | self.wait_for_connect() 174 | except Exception as e: 175 | print("Error connecting to Deepkit. Experiment data sync aborted.", e) 176 | if deepkit.globals.last_experiment is self: 177 | deepkit.globals.last_experiment = None 178 | 179 | if deepkit.utils.in_self_execution() and monitoring: 180 | # the CLI handles output logging otherwise 181 | if len(deepkit.globals.last_logs.getvalue()) > 0: 182 | self.logs_buffer.append(deepkit.globals.last_logs.getvalue()) 183 | 184 | if deepkit.utils.in_self_execution() and monitoring: 185 | # the CLI handles hardware monitoring otherwise 186 | p = psutil.Process() 187 | 188 | def on_hardware_metrics(dummy): 189 | net = psutil.net_io_counters() 190 | disk = psutil.disk_io_counters() 191 | data = struct.pack( 192 | ' last_time: 223 | self.last_throttle_call[fn] = time.time() 224 | fn() 225 | 226 | def create_sub_experiment(self): 227 | return Experiment( 228 | project=self.project, 229 | account=self.account, 230 | parent_experiment=self.id, 231 | silent=self.silent 232 | ) 233 | 234 | def drain_speed_report(self): 235 | # only save latest value, each second 236 | if len(self.speed_buffer) == 0: return 237 | item = self.speed_buffer[-1] 238 | self.speed_buffer = [] 239 | self.client.job_action_threadsafe( 240 | 'streamInternalFile', 241 | ['.deepkit/speed.metric', base64.b64encode(item).decode('utf8')] 242 | ) 243 | 244 | def drain_logs(self): 245 | if len(self.logs_buffer) == 0: return 246 | packed = '' 247 | buffer = self.logs_buffer.copy() 248 | self.logs_buffer = [] 249 | for d in buffer: 250 | packed += d 251 | 252 | self.client.job_action_threadsafe('log', ['main_0', packed]) 253 | 254 | def drain_metric_buffer(self): 255 | if len(self.metric_buffer) == 0: 256 | return 257 | buffer = self.metric_buffer.copy() 258 | self.metric_buffer = [] 259 | try: 260 | packed = {} 261 | items = {} 262 | for d in buffer: 263 | if d['id'] not in packed: 264 | packed[d['id']] = b'' 265 | items[d['id']] = 0 266 | 267 | items[d['id']] += 1 268 | packed[d['id']] += d['row'] 269 | 270 | for i, v in packed.items(): 271 | # print('channelData', items[i], len(v) / 27) 272 | 273 | self.client.job_action_threadsafe('channelData', [i, base64.b64encode(v).decode('utf8')]) 274 | except Exception as e: 275 | print('on_metric failed', e) 276 | 277 | def wait_for_connect(self): 278 | async def wait(): 279 | await self.client.connecting 280 | 281 | asyncio.run_coroutine_threadsafe(wait(), self.client.loop).result() 282 | 283 | def done(self): 284 | self.client.result_status = deepkit.client.JobStatus.done 285 | self.shutdown() 286 | 287 | def abort(self): 288 | self.client.result_status = deepkit.client.JobStatus.aborted 289 | self.shutdown() 290 | 291 | def crash(self): 292 | self.client.result_status = deepkit.client.JobStatus.crashed 293 | self.shutdown() 294 | 295 | def failed(self): 296 | self.client.result_status = deepkit.client.JobStatus.failed 297 | self.shutdown() 298 | 299 | def shutdown(self): 300 | if self.shutting_down: return 301 | self.shutting_down = True 302 | atexit.unregister(self.shutdown) 303 | self.drain_metric_buffer() 304 | self.drain_speed_report() 305 | self.drain_logs() 306 | self.client.shutdown() 307 | 308 | def epoch(self, current: int, total: Optional[int]): 309 | self.iteration(current, total) 310 | self.debugger.tick() 311 | 312 | def iteration(self, current: int, total: Optional[int]): 313 | if current and self.job_iteration == current: 314 | # nothing to do 315 | return 316 | 317 | self.job_iteration = current 318 | if total: 319 | self.job_iterations = total 320 | 321 | now = time.time() 322 | if self.last_iteration_time: 323 | self.seconds_per_iterations.append({ 324 | 'diff': now - self.last_iteration_time, 325 | 'when': now, 326 | }) 327 | 328 | self.last_iteration_time = now 329 | self.last_batch_time = now 330 | 331 | # remove all older than twenty seconds 332 | self.seconds_per_iterations = [x for x in self.seconds_per_iterations if (now - x['when']) < 20] 333 | self.seconds_per_iterations = self.seconds_per_iterations[-30:] 334 | 335 | if len(self.seconds_per_iterations) > 0: 336 | diffs = [x['diff'] for x in self.seconds_per_iterations] 337 | self.seconds_per_iteration = sum(diffs) / len(diffs) 338 | 339 | if self.seconds_per_iteration: 340 | self.client.patch('secondsPerIteration', self.seconds_per_iteration) 341 | 342 | self.client.patch('iteration', self.job_iteration) 343 | if total: 344 | self.client.patch('iterations', self.job_iterations) 345 | 346 | iterations_left = self.job_iterations - self.job_iteration 347 | if iterations_left > 0: 348 | self.client.patch('eta', self.seconds_per_iteration * iterations_left) 349 | else: 350 | self.client.patch('eta', 0) 351 | 352 | def batch(self, current: int, total: int = None, size: int = 1): 353 | self.step(current, total, size) 354 | 355 | def step(self, current: int, total: int = None, size: int = 1): 356 | if current and self.job_steps == current: 357 | # nothing to do 358 | return 359 | 360 | if current < self.job_step: 361 | # it was reset, new epoch/iteration basically 362 | self.job_step = 0 363 | 364 | steps_made = current - self.job_step 365 | 366 | self.job_step = current 367 | if total is not None: 368 | self.job_steps = total 369 | if total is None: 370 | total = self.job_steps 371 | 372 | self.client.patch('step', current) 373 | now = time.time() 374 | 375 | x = self.job_iteration + (current / total) 376 | speed_per_second = 0 377 | if size: 378 | speed_per_second = size / (now - self.last_batch_time) if self.last_batch_time else size 379 | 380 | if self.last_batch_time: 381 | time_per_step = step_since_last_took = (now - self.last_batch_time) 382 | if steps_made > 0: 383 | time_per_step = step_since_last_took / steps_made 384 | 385 | self.seconds_per_iterations.append({ 386 | 'diff': time_per_step * total, 387 | 'when': now 388 | }) 389 | 390 | # remove all older than twenty seconds 391 | self.seconds_per_iterations = [x for x in self.seconds_per_iterations if (now - x['when']) < 20] 392 | self.seconds_per_iterations = self.seconds_per_iterations[-30:] 393 | 394 | if len(self.seconds_per_iterations) > 0: 395 | diffs = [x['diff'] for x in self.seconds_per_iterations] 396 | self.seconds_per_iteration = sum(diffs) / len(diffs) 397 | 398 | iterations_left = self.job_iterations - self.job_iteration 399 | self.client.patch('eta', self.seconds_per_iteration * iterations_left) 400 | 401 | self.last_batch_time = now 402 | 403 | if self.seconds_per_iteration: 404 | self.client.patch('secondsPerIteration', self.seconds_per_iteration) 405 | 406 | self.client.patch('speed', speed_per_second) 407 | 408 | speed = struct.pack(' 0: 628 | x = self.job_iteration + (self.job_step / self.job_steps) 629 | elif self.job_iteration > 0: 630 | x = self.job_iteration 631 | else: 632 | if name not in self.auto_x_of_insight: 633 | self.auto_x_of_insight[name] = 0 634 | self.auto_x_of_insight[name] += 1 635 | x = self.auto_x_of_insight[name] 636 | 637 | if not isinstance(x, (int, float)): 638 | raise Exception('x needs to be integer or float') 639 | 640 | if x not in self.created_insights: 641 | self.created_insights[x] = True 642 | self.client.job_action_threadsafe('addInsight', [ 643 | x, 644 | time.time(), 645 | self.job_iteration, 646 | self.job_step, 647 | ]) 648 | 649 | for i, d in enumerate(data): 650 | file_type = '' 651 | if isinstance(d, PIL.Image.Image): 652 | file_type = 'png' 653 | d = pil_image_to_jpeg(d) 654 | elif isinstance(d, np.ndarray): 655 | # tf is not batch per default 656 | 657 | if image_convertion: 658 | sample = np.copy(d) 659 | shape = d.shape 660 | image = False 661 | if len(shape) == 3: 662 | try: 663 | if 'keras' in sys.modules: 664 | import keras 665 | if keras.backend.image_data_format() == 'channels_last': 666 | sample = np.transpose(sample, (2, 0, 1)) 667 | elif 'tensorflow.keras' in sys.modules: 668 | import tensorflow.keras as keras 669 | if keras.backend.image_data_format() == 'channels_last': 670 | sample = np.transpose(sample, (2, 0, 1)) 671 | except: 672 | pass 673 | 674 | if sample.shape[0] == 3: 675 | d = PIL.Image.fromarray(get_layer_vis_square(sample)) 676 | image = True 677 | else: 678 | d = PIL.Image.fromarray(get_image_tales(sample)) 679 | image = True 680 | elif len(shape) > 1: 681 | d = PIL.Image.fromarray(get_layer_vis_square(sample)) 682 | image = True 683 | elif len(shape) == 1: 684 | if shape[0] != 1: 685 | # we got a single number 686 | d = sample[0] 687 | else: 688 | d = make_image_from_dense(sample) 689 | image = True 690 | if image: 691 | file_type = 'png' 692 | d = pil_image_to_jpeg(d) 693 | else: 694 | file_type = 'npy' 695 | d = numpy_to_binary(d) 696 | else: 697 | file_type = 'npy' 698 | d = numpy_to_binary(d) 699 | else: 700 | file_type = 'json' 701 | d = bytes(json.dumps(d), encoding='utf-8') 702 | 703 | if len(data) > 1: 704 | file_name = name + '_' + str(i) + '.' + file_type 705 | else: 706 | file_name = name + '.' + file_type 707 | 708 | self.client.job_action_threadsafe('addInsightEntry', [ 709 | x, 710 | file_name, 711 | datetime.utcnow().isoformat(), 712 | { 713 | 'type': file_type, 714 | 'meta': meta 715 | }, 716 | base64.b64encode(d).decode(), 717 | ]) 718 | 719 | def log_metric(self, name: str, *y, x=None): 720 | if y is None: 721 | y = 0 722 | 723 | if not isinstance(y, (list, tuple)): 724 | y = [y] 725 | 726 | def convert(v): 727 | if v is None: return 0 728 | if math.isnan(v): return 0 729 | return float(v) 730 | 731 | y = [convert(v) for v in y] 732 | 733 | if x is None: 734 | if self.job_steps > 0: 735 | x = self.job_iteration + (self.job_step / self.job_steps) 736 | else: 737 | if name not in self.auto_x_of_metrix: 738 | self.auto_x_of_metrix[name] = 0 739 | self.auto_x_of_metrix[name] += 1 740 | x = self.auto_x_of_metrix[name] 741 | 742 | name = name.replace('.', '/') 743 | 744 | if name not in self.defined_metrics: 745 | traces = [str(i) for i, _ in enumerate(y)] 746 | self.define_metric(name, traces=traces) 747 | else: 748 | if 'traces' in self.defined_metrics[name] and len(self.defined_metrics[name]['traces']) != len(y): 749 | traces = self.defined_metrics[name]['traces'] 750 | raise Exception(f'Metric {name} has {len(traces)} traces defined, but you provided {len(y)}') 751 | 752 | row_binary = struct.pack('