├── .gitignore ├── LICENSE ├── LICENSE_DATASET ├── README.md ├── launch_configs ├── ray_gcp.yaml └── ray_local_cluster.yaml ├── requirements.txt ├── robonet ├── README ├── __init__.py ├── datasets │ ├── __init__.py │ ├── base_dataset.py │ ├── record_dataset.py │ ├── robonet_dataset.py │ ├── util │ │ ├── __init__.py │ │ ├── convert_all.sh │ │ ├── dataset_utils.py │ │ ├── hdf5_2_records.py │ │ ├── hdf5_loader.py │ │ ├── metadata_helper.py │ │ └── tensor_multiplexer.py │ └── variants │ │ ├── __init__.py │ │ ├── annotation_benchmark_dataset.py │ │ └── val_filter_dataset_variants.py ├── inverse_model │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── base_inverse_model.py │ │ ├── deterministic_inverse_model.py │ │ ├── discretized_inverse_model.py │ │ ├── graphs │ │ │ ├── __init__.py │ │ │ ├── base_graph.py │ │ │ └── lstm_baseline.py │ │ └── layers │ │ │ ├── __init__.py │ │ │ └── vgg_pretrain.py │ ├── testing │ │ ├── __init__.py │ │ └── action_inference_interface.py │ └── training │ │ ├── __init__.py │ │ └── inverse_trainable.py ├── video_prediction │ ├── __init__.py │ ├── flow_ops.py │ ├── functional_ops.py │ ├── layers │ │ ├── __init__.py │ │ ├── deterministic_embedding_rnn_cell.py │ │ ├── dnaflow_rnn_cell.py │ │ ├── encoder_layers.py │ │ ├── normalization.py │ │ └── vgg_network.py │ ├── losses.py │ ├── metrics.py │ ├── models │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── deterministc_embedding_utils.py │ │ ├── deterministic_generator.py │ │ └── graphs │ │ │ ├── __init__.py │ │ │ ├── base_graph.py │ │ │ ├── deterministic_graph.py │ │ │ ├── dnaflow_graph.py │ │ │ └── vgg_conv_graph.py │ ├── ops.py │ ├── rnn_ops.py │ ├── testing │ │ ├── __init__.py │ │ └── model_evaluation_interface.py │ ├── training │ │ ├── __init__.py │ │ ├── data_filter.py │ │ ├── finetuning_trainable_interface.py │ │ ├── ray_util │ │ │ ├── __init__.py │ │ │ └── gif_logger.py │ │ ├── trainable_interface.py │ │ └── util.py │ └── utils │ │ ├── __init__.py │ │ ├── encode_img.py │ │ ├── ffmpeg_gif.py │ │ ├── html.py │ │ └── tf_utils.py └── yaml_util.py ├── robonet_experiments ├── classifier_control │ └── params.yaml ├── gpu │ ├── capacity_test │ │ ├── base_model │ │ │ ├── flow.yaml │ │ │ └── noflow.yaml │ │ └── large_model │ │ │ ├── flow.yaml │ │ │ └── noflow.yaml │ ├── finetune_baxter.yaml │ ├── pretrain_models │ │ ├── all_robonet │ │ │ ├── large.yaml │ │ │ └── medium.yaml │ │ └── autograsp │ │ │ ├── large.yaml │ │ │ └── medium.yaml │ └── sawyer_grid_search.yaml ├── inverse_model │ ├── discretized_inverse.yaml │ └── inverse.yaml └── tpu │ ├── capacity_test_flow.yaml │ └── capacity_test_noflow.yaml ├── scripts ├── examples │ ├── create_prediction_gifs.py │ └── test_franka_flow.py ├── templates │ ├── index_template.html │ └── traj_template.html ├── train_model.py ├── train_vpred_tpu.py └── visualize_dataset.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.gif 2 | *.pyc 3 | __pycache__/* 4 | *.egg-info/ 5 | .idea/ 6 | scratch/ 7 | node_modules 8 | bower_components 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Sudeep Dasari, Frederik Ebert, Stephen Tian 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation 
the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RoboNet 2 | Code for loading and manipulating the RoboNet dataset, as well as for training supervised inverse models and video prediction models on the dataset. 3 | 4 | Please refer to the [project wiki](https://github.com/SudeepDasari/RoboNet/wiki) for more detailed documentation. 5 | 6 | If you find the codebase or dataset useful please consider citing our paper. 7 | ``` 8 | @inproceedings{dasari2019robonet, 9 | title={RoboNet: Large-Scale Multi-Robot Learning}, 10 | author={Sudeep Dasari and Frederik Ebert and Stephen Tian and Suraj Nair and Bernadette Bucher and Karl Schmeckpeper and Siddharth Singh and Sergey Levine and Chelsea Finn}, 11 | year={2019}, 12 | eprint={1910.11215}, 13 | archivePrefix={arXiv}, 14 | primaryClass={cs.RO}, 15 | booktitle={CoRL 2019: Volume 100 Proceedings of Machine Learning Research} 16 | } 17 | ``` 18 | 19 | ## Downloading the Dataset 20 | You can find instructions for downloading the dataset on the [project wiki](https://github.com/SudeepDasari/RoboNet/wiki/Getting-Started) as well. All data is provided under the [Creative Commons BY 4.0](https://creativecommons.org/licenses/by/4.0/legalcode) license. 21 | -------------------------------------------------------------------------------- /launch_configs/ray_gcp.yaml: -------------------------------------------------------------------------------- 1 | # An unique identifier for the head node and workers of this cluster. 2 | cluster_name: gcpcluster 3 | 4 | # The minimum number of workers nodes to launch in addition to the head 5 | # node. This number should be >= 0. 6 | min_workers: 4 7 | 8 | # The maximum number of workers nodes to launch in addition to the head 9 | # node. This takes precedence over min_workers. 10 | max_workers: 4 11 | 12 | # The initial number of worker nodes to launch in addition to the head 13 | # node. When the cluster is first brought up (or when it is refreshed with a 14 | # subsequent `ray up`) this number of nodes will be started. 15 | initial_workers: 4 16 | 17 | # This executes all commands on all nodes in the docker container, 18 | # and opens all the necessary ports to support the Ray cluster. 19 | # Empty string means disabled. 20 | docker: 21 | image: "" 22 | container_name: "" # e.g. ray_docker 23 | # container_name: "softlearning" 24 | 25 | # The autoscaler will scale up the cluster to this target fraction of resource 26 | # usage. 
For example, if a cluster of 10 nodes is 100% busy and 27 | # target_utilization is 0.8, it would resize the cluster to 13. This fraction 28 | # can be decreased to increase the aggressiveness of upscaling. 29 | # This value must be less than 1.0 for scaling to happen. 30 | target_utilization_fraction: 0.8 31 | 32 | # If a node is idle for this many minutes, it will be removed. 33 | idle_timeout_minutes: 5 34 | 35 | # Cloud-provider specific configuration. 36 | provider: 37 | type: gcp 38 | region: us-central1 39 | availability_zone: us-central1-a 40 | project_id: visualmpc-210823 41 | 42 | # How Ray will authenticate with newly launched nodes. 43 | auth: 44 | ssh_user: sudeep 45 | # By default Ray creates a new private keypair, but you can also use your own. 46 | # If you do so, make sure to also set "KeyName" in the head and worker node 47 | # configurations below. 48 | # ssh_private_key: /path/to/your/key.pem 49 | 50 | # Provider-specific config for the head node, e.g. instance type. By default 51 | # Ray will auto-configure unspecified fields such as SubnetId and KeyName. 52 | # For more documentation on available fields, see: 53 | # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances 54 | head_node: 55 | machineType: n1-standard-4 # n1-highcpu-16 56 | disks: 57 | - boot: true 58 | autoDelete: true 59 | type: PERSISTENT 60 | initializeParams: 61 | diskSizeGb: 1000 62 | # See https://cloud.google.com/compute/docs/images for more images 63 | sourceImage: projects/visualmpc-210823/global/images/robonet-image-newray 64 | 65 | # Additional options can be found in in the compute docs at 66 | # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert 67 | 68 | # Provider-specific config for worker nodes, e.g. instance type. By default 69 | # Ray will auto-configure unspecified fields such as SubnetId and KeyName. 70 | # For more documentation on available fields, see: 71 | # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances 72 | worker_nodes: 73 | machineType: n1-standard-16 # n1-highcpu-8 74 | disks: 75 | - boot: true 76 | autoDelete: true 77 | type: PERSISTENT 78 | initializeParams: 79 | diskSizeGb: 1000 80 | # See https://cloud.google.com/compute/docs/images for more images 81 | sourceImage: projects/visualmpc-210823/global/images/robonet-image-newray 82 | # workers have p100 83 | guestAccelerators: 84 | - acceleratorType: projects/visualmpc-210823/zones/us-central1-a/acceleratorTypes/nvidia-tesla-v100 85 | acceleratorCount: 2 86 | # Run workers on preemtible instance by default. 87 | # Note that GCP preemptible instances automatically shut down after 24h. 88 | # Comment this out to use on-demand. 89 | scheduling: 90 | - preemptible: true 91 | - onHostMaintenance: TERMINATE 92 | 93 | # Additional options can be found in in the compute docs at 94 | # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert 95 | 96 | # Files or directories to copy to the head and worker nodes. The format is a 97 | # dictionary from REMOTE_PATH: LOCAL_PATH, e.g. 98 | file_mounts: {} 99 | 100 | # List of shell commands to run to set up nodes. 101 | setup_commands: 102 | - >- 103 | pip install cryptography 104 | && pip install --upgrade google-api-python-client 105 | && rm -rf ~/ray_results 106 | && cd ~/Documents/RoboNet 107 | && git stash 108 | && git pull origin inverse_model 109 | 110 | 111 | # Custom commands that will be run on the head node after common setup. 
112 | head_setup_commands: [] 113 | 114 | # Custom commands that will be run on worker nodes after common setup. 115 | worker_setup_commands: [] 116 | 117 | # Command to start ray on the head node. You don't need to change this. 118 | head_start_ray_commands: 119 | - ray stop 120 | - >- 121 | ray start 122 | --head 123 | --redis-port=6379 124 | --object-manager-port=8076 125 | --autoscaling-config=~/ray_bootstrap_config.yaml 126 | --internal-config={\"initial_reconstruction_timeout_milliseconds\":2000\,\"num_heartbeats_timeout\":100} 127 | 128 | # Command to start ray on worker nodes. You don't need to change this. 129 | worker_start_ray_commands: 130 | - ray stop 131 | - >- 132 | ray start 133 | --redis-address=$RAY_HEAD_IP:6379 134 | --object-manager-port=8076 135 | -------------------------------------------------------------------------------- /launch_configs/ray_local_cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: default 2 | min_workers: 1 3 | max_workers: 4 4 | docker: 5 | image: "" 6 | container_name: "" 7 | target_utilization_fraction: 0.8 8 | idle_timeout_minutes: 5 9 | provider: 10 | type: local 11 | head_ip: deepthought 12 | worker_ips: [newton5] 13 | auth: 14 | ssh_user: sudeep 15 | ssh_private_key: ~/.ssh/id_rsa 16 | head_node: {} 17 | worker_nodes: {} 18 | file_mounts: {} 19 | head_setup_commands: [] 20 | worker_setup_commands: [] 21 | initialization_commands: [] 22 | setup_commands: 23 | - source ~/rayrc && cd ~/Documents/RoboNet && git checkout ray && git pull origin ray 24 | # - source activate ray && cd ray/python && pip install -e . 25 | head_start_ray_commands: 26 | - source ~/rayrc && ray stop 27 | - source ~/rayrc && ulimit -c unlimited && ray start --head --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml 28 | worker_start_ray_commands: 29 | - source ~/rayrc && ray stop 30 | - source ~/rayrc && ray start --redis-address=$RAY_HEAD_IP:6379 31 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu==1.14 2 | opencv-python 3 | scipy 4 | scikit-image 5 | h5py 6 | imageio-ffmpeg 7 | pandas 8 | tqdm 9 | requests 10 | ray 11 | -------------------------------------------------------------------------------- /robonet/README: -------------------------------------------------------------------------------- 1 | #for setup install 2 | sudo apt-get install ffmpeg 3 | -------------------------------------------------------------------------------- /robonet/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from robonet.video_prediction.training import GIFLogger 3 | from robonet.video_prediction.training import get_trainable as vpred_trainable 4 | from robonet.inverse_model.training import get_trainable as inverse_trainable 5 | except: 6 | print('could not import trainables!') 7 | 8 | 9 | def get_trainable(class_name): 10 | available_trainables = [vpred_trainable, inverse_trainable] 11 | for a in available_trainables: 12 | try: 13 | return a(class_name) 14 | except NotImplementedError: 15 | pass 16 | raise NotImplementedError 17 | 18 | -------------------------------------------------------------------------------- /robonet/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .util.metadata_helper import load_metadata 2 | 3 | 4 | def 
get_dataset_class(name): 5 | if name == 'RoboNet': 6 | from .robonet_dataset import RoboNetDataset 7 | return RoboNetDataset 8 | elif name == 'AnnotatedRoboNet': 9 | from .variants.annotation_benchmark_dataset import AnnotationBenchmarkDataset 10 | return AnnotationBenchmarkDataset 11 | elif name == 'AnnotationHeldoutRobotDataset': 12 | from .variants.val_filter_dataset_variants import AnnotationHeldoutRobotDataset 13 | return AnnotationHeldoutRobotDataset 14 | elif name == 'HeldoutRobotDataset': 15 | from .variants.val_filter_dataset_variants import HeldoutRobotDataset 16 | return HeldoutRobotDataset 17 | elif name == 'TPU' or name == 'TFRecords': 18 | from .record_dataset import TFRecordVideoDataset 19 | return TFRecordVideoDataset 20 | else: 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /robonet/datasets/base_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | from tensorflow.contrib.training import HParams 4 | import glob 5 | import copy 6 | from .util.metadata_helper import load_metadata, MetaDataContainer 7 | import random 8 | import numpy as np 9 | 10 | 11 | class BaseVideoDataset(object): 12 | def __init__(self, batch_size, dataset_files_or_metadata, hparams=dict()): 13 | assert isinstance(batch_size, int), "batch_size must be an integer" 14 | self._batch_size = batch_size 15 | 16 | if isinstance(dataset_files_or_metadata, str): 17 | self._metadata = [load_metadata(dataset_files_or_metadata)] 18 | elif isinstance(dataset_files_or_metadata, MetaDataContainer): 19 | self._metadata = [dataset_files_or_metadata] 20 | elif isinstance(dataset_files_or_metadata, (list, tuple)): 21 | self._metadata = [] 22 | for d in dataset_files_or_metadata: 23 | assert isinstance(d, (str, MetaDataContainer)), "potential dataset must be folder containing files or meta-data instance" 24 | if isinstance(d, str): 25 | self._metadata.append(load_metadata(d)) 26 | else: 27 | self._metadata.append(d) 28 | 29 | # initialize hparams and store metadata_frame 30 | self._hparams = self._get_default_hparams().override_from_dict(hparams) 31 | 32 | self._init_rng() 33 | 34 | #initialize dataset 35 | self._num_ex_per_epoch = self._init_dataset() 36 | print('loaded {} train files'.format(self._num_ex_per_epoch)) 37 | 38 | def _init_dataset(self): 39 | return 0 40 | 41 | def _init_rng(self): 42 | # if RNG is not supplied then initialize new RNG 43 | self._random_generator = {} 44 | 45 | seeds = [None for _ in range(len(self.modes) + 1)] 46 | if self._hparams.RNG: 47 | seeds = [i + self._hparams.RNG for i in range(len(seeds))] 48 | 49 | for k, seed in zip(self.modes + ['base'], seeds): 50 | if k == 'train' and self._hparams.use_random_train_seed: 51 | seed = None 52 | self._random_generator[k] = random.Random(seed) 53 | self._np_rng = np.random.RandomState(self._random_generator['base'].getrandbits(32)) 54 | 55 | def _get(self, key, mode): 56 | raise NotImplementedError 57 | 58 | @staticmethod 59 | def _get_default_hparams(): 60 | default_dict = { 61 | 'RNG': 11381294392481135266, 62 | 'use_random_train_seed': False 63 | } 64 | return HParams(**default_dict) 65 | 66 | def get(self, key, mode='train'): 67 | if mode not in self.modes: 68 | raise ValueError('Mode {} not valid! 
Dataset has following modes: {}'.format(mode, self.modes)) 69 | return self._get(key, mode) 70 | 71 | def __getitem__(self, item): 72 | if isinstance(item, tuple): 73 | if len(item) != 2: 74 | raise KeyError('Index should be in format: [Key, Mode] or [Key] (assumes default train mode)') 75 | key, mode = item 76 | return self.get(key, mode) 77 | 78 | return self.get(item) 79 | 80 | def __contains__(self, item): 81 | raise NotImplementedError 82 | 83 | @property 84 | def batch_size(self): 85 | return self._batch_size 86 | 87 | @property 88 | def hparams(self): 89 | return copy.deepcopy(self._hparams) 90 | 91 | @property 92 | def num_examples_per_epoch(self): 93 | return self._num_ex_per_epoch 94 | 95 | @property 96 | def modes(self): 97 | return ['train', 'val', 'test'] 98 | 99 | @property 100 | def primary_mode(self): 101 | return 'train' 102 | 103 | def build_feed_dict(self, mode): 104 | return {} 105 | -------------------------------------------------------------------------------- /robonet/datasets/record_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | from tensorflow.contrib.training import HParams 4 | import glob 5 | from robonet.datasets.base_dataset import BaseVideoDataset 6 | import random 7 | import functools 8 | import json 9 | from robonet.datasets.util.dataset_utils import color_augment 10 | 11 | 12 | class TFRecordVideoDataset(BaseVideoDataset): 13 | def __init__(self, dataset_batches, dataset_paths, hparams=dict()): 14 | self._hparams = self._get_default_hparams().override_from_dict(hparams) # initialize hparams and store metadata_frame 15 | self._init_rng() # init rng objects 16 | 17 | assert isinstance(dataset_batches, (list, tuple)), "dataset_batches must be a list of batch_sizes per source" 18 | assert isinstance(dataset_paths, (list, tuple)), "dataset_batches must be a list of paths per source" 19 | self._batch_size = sum(dataset_batches) 20 | 21 | self._source_batch_sizes = dataset_batches 22 | self._source_dataset_paths = dataset_paths 23 | 24 | self._init_dataset() 25 | 26 | def _init_dataset(self): 27 | self._mode_datasets = {} 28 | for m in self.modes: 29 | self._mode_datasets[m] = [] 30 | 31 | for batch_size, dataset_path in zip(self._source_batch_sizes, self._source_dataset_paths): 32 | assert batch_size > 0 33 | assert 0 < self._hparams.train_frac < 1 34 | assert self._hparams.load_T > 1 35 | 36 | dataset_metadata = json.load(open('{}/format.json'.format(dataset_path), 'r')) 37 | 38 | if self._hparams.bucket_dir: 39 | print('loading files from: {}'.format(dataset_path + '/files.json')) 40 | all_files = json.load(open(dataset_path + '/files.json')) 41 | all_files = ['{}/{}'.format(self._hparams.bucket_dir, f) for f in all_files] 42 | else: 43 | all_files = glob.glob('{}/*.tfrecord'.format(dataset_path)) 44 | all_files.sort(key=lambda x: x.split('/')[-1]) 45 | 46 | self._random_generator['base'].shuffle(all_files) 47 | pivot = max(int(len(all_files) * self._hparams.train_frac), 1) 48 | train_f, val_f = all_files[:pivot], all_files[pivot:] 49 | 50 | self._random_generator['val'].shuffle(val_f) 51 | self._random_generator['train'].shuffle(train_f) 52 | 53 | for m, files in zip(self.modes, [train_f, val_f]): 54 | outputs = self._build_dataset(files, m, dataset_metadata, batch_size) 55 | 56 | # enforces static shapes constraint 57 | height, width = dataset_metadata['img_dim'] 58 | outputs['images'] = tf.cast(tf.reshape(outputs['images'], [batch_size, self._hparams.load_T, height, 
width, 3]), tf.float32) / 255 59 | if self._hparams.color_augmentation: 60 | outputs['images'] = color_augment(outputs['images'], self._hparams.color_augmentation) 61 | outputs['actions'] = tf.reshape(outputs['actions'], [batch_size, self._hparams.load_T - 1, dataset_metadata['adim']]) 62 | outputs['states'] = tf.reshape(outputs['states'], [batch_size, self._hparams.load_T, dataset_metadata['sdim']]) 63 | 64 | self._mode_datasets[m].append(outputs) 65 | 66 | for m in self.modes: 67 | tensor_list = self._mode_datasets.pop(m) 68 | self._mode_datasets[m] = {} 69 | for key in ['images', 'states', 'actions']: 70 | self._mode_datasets[m][key] = tf.concat([out_dict[key] for out_dict in tensor_list], axis=0) 71 | 72 | def _build_dataset(self, files, mode, dataset_metadata, batch_size): 73 | dataset = tf.data.Dataset.from_tensor_slices(files) 74 | if mode == 'train': 75 | dataset = dataset.repeat(self._hparams.n_epochs) 76 | else: 77 | dataset = dataset.repeat(None) # always have infinite val records 78 | 79 | ignore_order = tf.data.Options() 80 | ignore_order.experimental_deterministic = False 81 | dataset = dataset.with_options(ignore_order) 82 | dataset = dataset.interleave(tf.data.TFRecordDataset, 83 | cycle_length=min(len(files), 32), 84 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 85 | 86 | parse_fn = functools.partial(self._parse_records, metadata=dataset_metadata) 87 | dataset = dataset.map(parse_fn) 88 | dataset = dataset.shuffle(buffer_size=self._hparams.shuffle_buffer) 89 | dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE) 90 | outputs = dataset.make_one_shot_iterator().get_next() 91 | return outputs 92 | 93 | def _parse_records(self, serialized_example, metadata): 94 | feat_names = {} 95 | feat_names['images'] = tf.FixedLenFeature([], tf.string) 96 | feat_names['actions'] = tf.FixedLenFeature([(metadata['T'] - 1) * metadata['adim']], tf.float32) 97 | feat_names['states'] = tf.FixedLenFeature([metadata['T'] * metadata['sdim']], tf.float32) 98 | 99 | feature = tf.parse_single_example(serialized_example, features=feat_names) 100 | 101 | rand_start = tf.random.uniform((), 0, metadata['T'] - self._hparams.load_T, dtype=tf.int32) 102 | rand_cam = tf.random.uniform((), 0, metadata['ncam'], dtype=tf.int32) 103 | 104 | decoded_feat = {} 105 | height, width = metadata['img_dim'] 106 | 107 | vid_decode = tf.reshape(tf.image.decode_jpeg(feature['images'], channels=3), (metadata['T'] * metadata['ncam'] * height, width, 3)) 108 | decoded_feat['images'] = tf.reshape(vid_decode, [metadata['T'], metadata['ncam'], height, width, 3])[rand_start:rand_start+self._hparams.load_T, rand_cam] 109 | decoded_feat['actions'] = tf.reshape(feature['actions'], [metadata['T'] - 1, metadata['adim']])[rand_start:rand_start+self._hparams.load_T - 1] 110 | decoded_feat['states'] = tf.reshape(feature['states'], [metadata['T'], metadata['sdim']])[rand_start:rand_start+self._hparams.load_T] 111 | 112 | return decoded_feat 113 | 114 | def _get(self, key, mode): 115 | return self._mode_datasets[mode][key] 116 | 117 | @staticmethod 118 | def _get_default_hparams(): 119 | default_dict = { 120 | 'RNG': 11381294392481135266, 121 | 'use_random_train_seed': False, 122 | 'shuffle_buffer': 500, 123 | 'n_epochs': None, 124 | 'buffer_size': 10, 125 | 'train_frac': 0.9, # train, val 126 | 'load_T': 15, 127 | 'bucket_dir': '', 128 | 'color_augmentation': 0 129 | } 130 | return HParams(**default_dict) 131 | 132 | def __contains__(self, item): 133 | return item in ['images', 'actions', 
'states'] 134 | 135 | @property 136 | def modes(self): 137 | return ['train', 'val'] 138 | 139 | @property 140 | def num_examples_per_epoch(self): 141 | raise NotImplementedError 142 | 143 | 144 | if __name__ == '__main__': 145 | import argparse 146 | import imageio 147 | import numpy as np 148 | import time 149 | 150 | 151 | parser = argparse.ArgumentParser(description="tfrecord dataset tester") 152 | parser.add_argument('--path', type=str, required=True, help='path to tfrecord files') 153 | parser.add_argument('--batch_size', type=int, default=10, help='batch size for loaded data') 154 | args = parser.parse_args() 155 | 156 | loader = TFRecordVideoDataset([args.batch_size], [args.path], {'train_frac': 0.5, 'shuffle_buffer': 10}) 157 | print(loader['images'], loader['actions'], loader['states']) 158 | s = tf.Session() 159 | for j in range(10): 160 | t = time.time() 161 | img, act, state = s.run([loader['images'], loader['actions'], loader['states']]) 162 | print(time.time() - t) 163 | print('actions', act) 164 | print('state', state) 165 | 166 | w = imageio.get_writer('./out{}.gif'.format(j)) 167 | for t in range(img.shape[1]): 168 | w.append_data((np.concatenate(img[:, t], axis=-2) * 255).astype(np.uint8)) 169 | 170 | import pdb; pdb.set_trace() 171 | print(img.shape) 172 | -------------------------------------------------------------------------------- /robonet/datasets/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SudeepDasari/RoboNet/d83eee20f39653c3f8e7c349df7350e8a9e9f7a7/robonet/datasets/util/__init__.py -------------------------------------------------------------------------------- /robonet/datasets/util/convert_all.sh: -------------------------------------------------------------------------------- 1 | python robonet/datasets/util/hdf5_2_records.py ~/hdf5 --robot sawyer --save_dir records_all_small/sawyer --n_workers 40; 2 | python robonet/datasets/util/hdf5_2_records.py ~/hdf5 --robot kuka --save_dir records_all_small/kuka --n_workers 40; 3 | python robonet/datasets/util/hdf5_2_records.py ~/hdf5 --robot R3 --save_dir records_all_small/R3 --n_workers 40; 4 | python robonet/datasets/util/hdf5_2_records.py ~/hdf5 --robot widowx --save_dir records_all_small/widowx --n_workers 40; 5 | python robonet/datasets/util/hdf5_2_records.py ~/hdf5 --robot baxter --save_dir records_all_small/baxter --n_workers 40; 6 | python robonet/datasets/util/hdf5_2_records.py ~/hdf5 --robot fetch --save_dir records_all_small/fetch --n_workers 40; 7 | python robonet/datasets/util/hdf5_2_records.py ~/hdf5 --robot franka --save_dir records_all_small/franka --n_workers 40; 8 | -------------------------------------------------------------------------------- /robonet/datasets/util/dataset_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import pdb 3 | import numpy as np 4 | 5 | 6 | def color_augment(image, noise_range=0.2): 7 | assert noise_range > 0, "noise_range must be positive" 8 | 9 | bs = image.get_shape().as_list()[0] 10 | shape = [bs] + [1 for _ in range(len(image.get_shape().as_list()) - 1)] 11 | min_noise = -noise_range 12 | max_noise = noise_range 13 | rand_h = tf.random_uniform(shape, minval=min_noise, maxval=max_noise) 14 | rand_s = tf.random_uniform(shape, minval=min_noise, maxval=max_noise) 15 | rand_v = tf.random_uniform(shape, minval=min_noise, maxval=max_noise) 16 | image_hsv = tf.image.rgb_to_hsv(image) 17 | h_, s_, v_ = 
tf.split(image_hsv, 3, -1) 18 | stack_mod = tf.clip_by_value(tf.concat([h_ + rand_h, s_ + rand_s, v_ + rand_v], axis=-1), 0, 1.) 19 | image_rgb = tf.image.hsv_to_rgb(stack_mod) 20 | return image_rgb 21 | 22 | 23 | def split_train_val_test(metadata, splits=None, train_ex=None, rng=None): 24 | assert (splits is None) != (train_ex is None), "exactly one of splits or train_ex should be supplied" 25 | files = metadata.get_shuffled_files(rng) 26 | train_files, val_files, test_files = None, None, None 27 | 28 | if splits is not None: 29 | assert len(splits) == 3, "function requires 3 split parameteres ordered (train, val ,test)" 30 | splits = np.cumsum([int(i * len(files)) for i in splits]).tolist() 31 | else: 32 | assert len(files) >= train_ex, "not enough files for train examples!" 33 | val_split = int(0.5 * (len(files) + train_ex)) 34 | splits = [train_ex, val_split, len(files)] 35 | 36 | # give extra fat to val set 37 | if splits[-1] < len(files): 38 | diff = len(files) - splits[-1] 39 | for i in range(1, len(splits)): 40 | splits[i] += diff 41 | 42 | if splits[0]: 43 | train_files = files[:splits[0]] 44 | if splits[1]: 45 | val_files = files[splits[0]: splits[1]] 46 | if splits[2]: 47 | test_files = files[splits[1]: splits[2]] 48 | 49 | return train_files, val_files, test_files 50 | -------------------------------------------------------------------------------- /robonet/datasets/util/hdf5_2_records.py: -------------------------------------------------------------------------------- 1 | """ 2 | Converts data from hdf5 format to TFRecord format 3 | """ 4 | 5 | import tensorflow as tf 6 | from robonet.datasets.util.hdf5_loader import load_data, default_loader_hparams 7 | from tqdm import tqdm 8 | import cv2 9 | 10 | 11 | def float_feature(value): 12 | return tf.train.Feature(float_list=tf.train.FloatList(value=value)) 13 | 14 | 15 | def bytes_feature(value): 16 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 17 | 18 | 19 | def save_record(filename, trajs): 20 | writer = tf.python_io.TFRecordWriter(filename) 21 | for traj in tqdm(trajs): 22 | images, actions, states = traj 23 | image_bytes = cv2.imencode('.jpg', images.reshape((-1, images.shape[-2], images.shape[-1]))[:, :, ::-1])[1].tostring() 24 | 25 | feature = {} 26 | feature['images'] = bytes_feature(image_bytes) 27 | feature['actions'] = float_feature(actions.flatten().tolist()) 28 | feature['states'] = float_feature(states.flatten().tolist()) 29 | example = tf.train.Example(features=tf.train.Features(feature=feature)) 30 | writer.write(example.SerializeToString()) 31 | writer.close() 32 | 33 | 34 | def _load_hdf5(inputs): 35 | if len(inputs) == 3: 36 | f_name, file_metadata, hparams = inputs 37 | return load_data(f_name, file_metadata, hparams) 38 | elif len(inputs) == 4: 39 | f_name, file_metadata, hparams, rng = inputs 40 | return load_data(f_name, file_metadata, hparams, rng) 41 | raise ValueError 42 | 43 | 44 | if __name__ == '__main__': 45 | import argparse 46 | from robonet.datasets import load_metadata 47 | from tensorflow.contrib.training import HParams 48 | import multiprocessing 49 | import json 50 | import copy 51 | import random 52 | import os 53 | 54 | 55 | parser = argparse.ArgumentParser(description="converts data into tfrecord format for fast TPU loading") 56 | parser.add_argument('path', type=str, default='./', help='path to input file archive') 57 | parser.add_argument('--robot', type=str, default='', help='if flag supplied only converts data corresponding to given robot') 58 | 
parser.add_argument('--filter_primitive', type=str, default='', help='if flag supplied only converts data with given primitive') 59 | parser.add_argument('--n_workers', type=int, default=1, help='number of worker threads') 60 | parser.add_argument('--target_adim', type=int, default=5, help='target action dimension for loading') 61 | parser.add_argument('--target_sdim', type=int, default=5, help='target state dimension for loading') 62 | parser.add_argument('--img_dims', type=int, nargs='+', default=[48, 64], help='(height, width) to resize images') 63 | parser.add_argument('--save_dir', type=str, default='./', help='where to save records') 64 | parser.add_argument('--ex_per_record', type=int, default=512, help='examples per record file') 65 | args = parser.parse_args() 66 | 67 | name_dir = 'record_names/' + '/'.join(args.save_dir.split('/')[1:]) 68 | if not os.path.exists(args.save_dir): 69 | os.makedirs(args.save_dir) 70 | if not os.path.exists(name_dir): 71 | os.makedirs(name_dir) 72 | 73 | metadata = load_metadata(args.path) 74 | if args.robot: 75 | metadata = metadata[metadata['robot'] == args.robot] 76 | if args.filter_primitive: 77 | metadata = metadata[metadata['primitives'] == args.filter_primitive] 78 | 79 | ncam = min(metadata['ncam'].frame.unique().tolist()) 80 | print('loaded {} records with robot={} and primitive={}'.format(len(metadata), args.robot, args.filter_primitive)) 81 | 82 | hparams = HParams(**default_loader_hparams()) 83 | hparams.target_adim = args.target_adim 84 | hparams.target_sdim = args.target_sdim 85 | hparams.action_mismatch = 3 86 | hparams.state_mismatch = 3 87 | hparams.cams_to_load = list(range(ncam)) 88 | hparams.load_T = min(min(metadata['state_T']),min(metadata['img_T'])).frame 89 | assert len(args.img_dims) == 2, "should be (height, width) tuple" 90 | hparams.img_size = tuple(args.img_dims) 91 | 92 | print('saving images with adim-{}, sdim-{}, img_dims-{}, T-{}'.format(hparams.target_adim, hparams.target_sdim, hparams.img_size, hparams.load_T)) 93 | 94 | record_metadata = {'adim': int(hparams.target_adim), 'sdim': int(hparams.target_sdim), 'img_dim': list(hparams.img_size), 'T': int(hparams.load_T) , 'ncam': ncam} 95 | json.dump(record_metadata, open('{}/format.json'.format(args.save_dir), 'w')) 96 | json.dump(record_metadata, open('{}/format.json'.format(name_dir), 'w')) 97 | pool = multiprocessing.Pool(args.n_workers) 98 | 99 | all_files = metadata.files 100 | random.shuffle(all_files) 101 | f_ind, r_cntr = 0, 0 102 | f_names = [] 103 | while f_ind < len(all_files): 104 | f_load = all_files[f_ind:f_ind + args.ex_per_record] 105 | fm_load = [metadata.get_file_metadata(f) for f in f_load] 106 | f_hparams = [copy.deepcopy(hparams) for _ in f_load] 107 | 108 | loaded_data = pool.map(_load_hdf5, [(f, fm, fh) for f, fm, fh in zip(f_load, fm_load, f_hparams)]) 109 | f_name = '{}/record{}.tfrecord'.format(args.save_dir, r_cntr) 110 | save_record(f_name, loaded_data) 111 | print('saved record{}.tfrecord'.format(r_cntr)) 112 | f_names.append(f_name) 113 | 114 | r_cntr += 1 115 | f_ind += len(loaded_data) 116 | 117 | json.dump(f_names, open('{}/files.json'.format(args.save_dir), 'w')) 118 | json.dump(f_names, open('{}/files.json'.format(name_dir), 'w')) 119 | -------------------------------------------------------------------------------- /robonet/datasets/util/hdf5_loader.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import cv2 3 | import pdb 4 | import imageio 5 | import io 6 | import hashlib 7 | 
import numpy as np 8 | import os 9 | import random 10 | 11 | 12 | class ACTION_MISMATCH: 13 | ERROR = 0 14 | PAD_ZERO = 1 15 | CLEAVE = 2 16 | 17 | 18 | class STATE_MISMATCH: 19 | ERROR = 0 20 | PAD_ZERO = 1 21 | CLEAVE = 2 22 | 23 | 24 | def default_loader_hparams(): 25 | return { 26 | 'target_adim': 4, 27 | 'target_sdim': 5, 28 | 'state_mismatch': STATE_MISMATCH.ERROR, # TODO make better flag parsing 29 | 'action_mismatch': ACTION_MISMATCH.ERROR, # TODO make better flag parsing 30 | 'img_size': [48, 64], 31 | 'cams_to_load': [0], 32 | 'impute_autograsp_action': True, 33 | 'load_annotations': False, 34 | 'zero_if_missing_annotation': False, 35 | 'load_T': 0 # TODO implement error checking here for jagged reading 36 | } 37 | 38 | 39 | def load_camera_imgs(cam_index, file_pointer, file_metadata, target_dims, start_time=0, n_load=None): 40 | cam_group = file_pointer['env']['cam{}_video'.format(cam_index)] 41 | old_dims = file_metadata['frame_dim'] 42 | length = file_metadata['img_T'] 43 | encoding = file_metadata['img_encoding'] 44 | image_format = file_metadata['image_format'] 45 | 46 | if n_load is None: 47 | n_load = length 48 | 49 | old_height, old_width = old_dims 50 | target_height, target_width = target_dims 51 | resize_method = cv2.INTER_CUBIC 52 | if target_height * target_width < old_height * old_width: 53 | resize_method = cv2.INTER_AREA 54 | 55 | images = np.zeros((n_load, target_height, target_width, 3), dtype=np.uint8) 56 | if encoding == 'mp4': 57 | buf = io.BytesIO(cam_group['frames'][:].tostring()) 58 | img_buffer = [img for t, img in enumerate(imageio.get_reader(buf, format='mp4')) if start_time <= t < n_load + start_time] 59 | elif encoding == 'jpg': 60 | img_buffer = [cv2.imdecode(cam_group['frame{}'.format(t)][:], cv2.IMREAD_COLOR)[:, :, ::-1] 61 | for t in range(start_time, start_time + n_load)] 62 | else: 63 | raise ValueError("encoding not supported") 64 | 65 | for t, img in enumerate(img_buffer): 66 | if (old_height, old_width) == (target_height, target_width): 67 | images[t] = img 68 | else: 69 | images[t] = cv2.resize(img, (target_width, target_height), interpolation=resize_method) 70 | 71 | if image_format == 'RGB': 72 | return images 73 | elif image_format == 'BGR': 74 | return images[:, :, :, ::-1] 75 | raise NotImplementedError 76 | 77 | 78 | def load_states(file_pointer, meta_data, hparams): 79 | s_T, sdim = meta_data['state_T'], meta_data['sdim'] 80 | if hparams.target_sdim == sdim: 81 | return file_pointer['env']['state'][:] 82 | 83 | elif sdim < hparams.target_sdim and hparams.state_mismatch & STATE_MISMATCH.PAD_ZERO: 84 | pad = np.zeros((s_T, hparams.target_sdim - sdim), dtype=np.float32) 85 | return np.concatenate((file_pointer['env']['state'][:], pad), axis=-1) 86 | 87 | elif sdim > hparams.target_sdim and hparams.state_mismatch & STATE_MISMATCH.CLEAVE: 88 | return file_pointer['env']['state'][:][:, :hparams.target_sdim] 89 | 90 | else: 91 | raise ValueError("file sdim - {}, target sdim - {}, pad behavior - {}".format(sdim, hparams.target_sdim, hparams.state_mismatch)) 92 | 93 | 94 | def load_actions(file_pointer, meta_data, hparams): 95 | a_T, adim = meta_data['action_T'], meta_data['adim'] 96 | if hparams.target_adim == adim: 97 | return file_pointer['policy']['actions'][:] 98 | 99 | elif hparams.target_adim == adim + 1 and hparams.impute_autograsp_action and meta_data['primitives'] == 'autograsp': 100 | action_append, old_actions = np.zeros((a_T, 1)), file_pointer['policy']['actions'][:] 101 | next_state = file_pointer['env']['state'][:][1:, -1] 102 | 
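# this autograsp trajectory stores one fewer action dimension than requested; impute the gripper
# command by thresholding the next-step gripper state against the midpoint of its low/high bounds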
103 | high_val, low_val = meta_data['high_bound'][-1], meta_data['low_bound'][-1] 104 | midpoint = (high_val + low_val) / 2.0 105 | 106 | for t, s in enumerate(next_state): 107 | if s > midpoint: 108 | action_append[t, 0] = high_val 109 | else: 110 | action_append[t, 0] = low_val 111 | return np.concatenate((old_actions, action_append), axis=-1) 112 | 113 | elif adim < hparams.target_adim and hparams.action_mismatch & ACTION_MISMATCH.PAD_ZERO: 114 | pad = np.zeros((a_T, hparams.target_adim - adim), dtype=np.float32) 115 | return np.concatenate((file_pointer['policy']['actions'][:], pad), axis=-1) 116 | 117 | elif adim > hparams.target_adim and hparams.action_mismatch & ACTION_MISMATCH.CLEAVE: 118 | return file_pointer['policy']['actions'][:][:, :hparams.target_adim] 119 | 120 | else: 121 | raise ValueError("file adim - {}, target adim - {}, pad behavior - {}".format(adim, hparams.target_adim, hparams.action_mismatch)) 122 | 123 | 124 | def load_annotations(file_pointer, metadata, hparams, cams_to_load): 125 | old_height, old_width = metadata['frame_dim'] 126 | target_height, target_width = hparams.img_size 127 | scale_height, scale_width = target_height / float(old_height), target_width / float(old_width) 128 | annot = np.zeros((metadata['img_T'], len(cams_to_load), target_height, target_width, 2), dtype=np.float32) 129 | if metadata.get('contains_annotation', False) != True and hparams.zero_if_missing_annotation: 130 | return annot 131 | 132 | assert metadata['contains_annotation'], "no annotations to load!" 133 | point_mat = file_pointer['env']['bbox_annotations'][:].astype(np.int32) 134 | 135 | for t in range(metadata['img_T']): 136 | for n, chosen_cam in enumerate(cams_to_load): 137 | for obj in range(point_mat.shape[2]): 138 | h1, w1 = point_mat[t, chosen_cam, obj, 0] * [scale_height, scale_width] - 1 139 | h2, w2 = point_mat[t, chosen_cam, obj, 1] * [scale_height, scale_width] - 1 140 | h, w = int((h1 + h2) / 2), int((w1 + w2) / 2) 141 | annot[t, n, h, w, obj] = 1 142 | return annot 143 | 144 | 145 | def load_data(f_name, file_metadata, hparams, rng=None): 146 | rng = random.Random(rng) 147 | 148 | assert os.path.exists(f_name) and os.path.isfile(f_name), "invalid f_name" 149 | with open(f_name, 'rb') as f: 150 | buf = f.read() 151 | assert hashlib.sha256(buf).hexdigest() == file_metadata['sha256'], "file hash doesn't match meta-data. maybe delete pkl and re-generate?" 152 | 153 | with h5py.File(io.BytesIO(buf)) as hf: 154 | start_time, n_states = 0, min([file_metadata['state_T'], file_metadata['img_T'], file_metadata['action_T'] + 1]) 155 | assert n_states > 1, "must be more than one state in loaded tensor!" 156 | if 1 < hparams.load_T < n_states: 157 | start_time = rng.randint(0, n_states - hparams.load_T) 158 | n_states = hparams.load_T 159 | 160 | assert all([0 <= i < file_metadata['ncam'] for i in hparams.cams_to_load]), "cams_to_load out of bounds!" 
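# load each requested camera stream, then swap axes so time comes first: (T, n_cams, height, width, 3)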
161 | images, selected_cams = [], [] 162 | for cam_index in hparams.cams_to_load: 163 | images.append(load_camera_imgs(cam_index, hf, file_metadata, hparams.img_size, start_time, n_states)[None]) 164 | selected_cams.append(cam_index) 165 | images = np.swapaxes(np.concatenate(images, 0), 0, 1) 166 | 167 | actions = load_actions(hf, file_metadata, hparams).astype(np.float32)[start_time:start_time + n_states-1] 168 | states = load_states(hf, file_metadata, hparams).astype(np.float32)[start_time:start_time + n_states] 169 | 170 | if hparams.load_annotations: 171 | annotations = load_annotations(hf, file_metadata, hparams, selected_cams)[start_time:start_time + n_states] 172 | return images, actions, states, annotations 173 | 174 | return images, actions, states 175 | 176 | 177 | if __name__ == '__main__': 178 | import argparse 179 | import tensorflow as tf 180 | import robonet.datasets as datasets 181 | import random 182 | import matplotlib.pyplot as plt 183 | 184 | parser = argparse.ArgumentParser(description="tests hdf5 data loader without tensorflow dataset wrapper") 185 | parser.add_argument('file', type=str, help="path to hdf5 you want to load") 186 | parser.add_argument('--load_annotations', action='store_true', help="loads annotations if supplied") 187 | parser.add_argument('--load_steps', type=int, default=0, help="loads steps from the dataset instead of everything") 188 | args = parser.parse_args() 189 | 190 | assert 'hdf5' in args.file 191 | data_folder = '/'.join(args.file.split('/')[:-1]) 192 | meta_data = datasets.load_metadata(data_folder) 193 | 194 | hparams = tf.contrib.training.HParams(**default_loader_hparams()) 195 | hparams.load_T = args.load_steps 196 | if args.load_annotations: 197 | hparams.load_annotations = True 198 | print(meta_data[meta_data['contains_annotation'] == True]) 199 | meta_data = meta_data[meta_data['contains_annotation'] == True] 200 | imgs, actions, states, annot = load_data(args.file, meta_data.get_file_metadata(args.file), hparams) 201 | else: 202 | imgs, actions, states = load_data(args.file, meta_data.get_file_metadata(args.file), hparams) 203 | 204 | print('actions', actions.shape) 205 | print('states', states.shape) 206 | print('images', imgs.shape) 207 | 208 | if args.load_annotations: 209 | for o in range(2): 210 | w = imageio.get_writer('out{}.gif'.format(o)) 211 | for t, i in enumerate(imgs): 212 | dist_render = plt.cm.viridis(annot[t, :, :, o])[:, :, :3] 213 | w.append_data((i * dist_render).astype(np.uint8)) 214 | w.close() 215 | else: 216 | w = imageio.get_writer('out.gif') 217 | for i in imgs: 218 | w.append_data(i) 219 | w.close() 220 | 221 | -------------------------------------------------------------------------------- /robonet/datasets/util/metadata_helper.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import pandas as pd 3 | import numpy as np 4 | import glob 5 | import os 6 | from tqdm import tqdm 7 | from multiprocessing import Pool, cpu_count 8 | import hashlib 9 | import io 10 | import random 11 | 12 | 13 | class MetaDataContainer: 14 | def __init__(self, base_path, meta_data): 15 | self._meta_data = meta_data 16 | self._base_path = base_path 17 | 18 | def get_file_metadata(self, fname): 19 | fname = fname.split('/')[-1] 20 | return self._meta_data.loc[fname] 21 | 22 | def select_objects(self, obj_class_name): 23 | if isinstance(obj_class_name, str): 24 | return self._meta_data[[obj_class_name in x for x in self._meta_data['object_classes']]] 25 | return 
self._meta_data[[set(obj_class_name) == set(x) for x in self._meta_data['object_classes']]] 26 | 27 | @property 28 | def frame(self): 29 | return self._meta_data 30 | 31 | @property 32 | def files(self): 33 | return ['{}/{}'.format(self._base_path, f) for f in self.frame.index] 34 | 35 | def get_shuffled_files(self, rng=None): 36 | files = ['{}/{}'.format(self._base_path, f) for f in self.frame.index] 37 | if rng: 38 | rng.shuffle(files) 39 | else: 40 | random.shuffle(files) 41 | return files 42 | 43 | @property 44 | def base_path(self): 45 | return self._base_path 46 | 47 | def __getitem__(self, arg): 48 | return MetaDataContainer(self._base_path, self._meta_data[arg]) 49 | 50 | def __contains__(self, item): 51 | return item in self._meta_data 52 | 53 | def __repr__(self): 54 | return repr(self._meta_data) 55 | 56 | def __str__(self): 57 | return str(self._meta_data) 58 | 59 | def __eq__(self, other): 60 | return self._meta_data == other 61 | 62 | def __ne__(self, other): 63 | return self._meta_data != other 64 | 65 | def __lt__(self, other): 66 | return self._meta_data < other 67 | 68 | def __le__(self, other): 69 | return self._meta_data <= other 70 | 71 | def __gt__(self, other): 72 | return self._meta_data > other 73 | 74 | def __ge__(self, other): 75 | return self._meta_data >= other 76 | 77 | def keys(self): 78 | return self._meta_data.keys() 79 | 80 | def __len__(self): 81 | return len(self._meta_data) 82 | 83 | 84 | def load_metadata_dict(fname): 85 | if not os.path.exists(fname) or not os.path.isfile(fname): 86 | raise IOError("can't find {}".format(fname)) 87 | buf = open(fname, 'rb').read() 88 | 89 | with h5py.File(io.BytesIO(buf)) as hf: 90 | meta_data_dict = {'file_version': hf['file_version'][()]} 91 | 92 | meta_data_dict['sha256'] = hashlib.sha256(buf).hexdigest() 93 | meta_data_dict['sdim'] = hf['env']['state'].shape[1] 94 | meta_data_dict['state_T'] = hf['env']['state'].shape[0] 95 | 96 | meta_data_dict['adim'] = hf['policy']['actions'].shape[1] 97 | meta_data_dict['action_T'] =hf['policy']['actions'].shape[0] 98 | 99 | # assumes all cameras have same attributes (if they exist) 100 | n_cams = hf['env'].attrs.get('n_cams', 0) 101 | if n_cams: 102 | meta_data_dict['ncam'] = n_cams 103 | 104 | if hf['env'].attrs['cam_encoding'] == 'mp4': 105 | meta_data_dict['frame_dim'] = hf['env']['cam0_video']['frames'].attrs['shape'][:2] 106 | meta_data_dict['img_T'] = hf['env']['cam0_video']['frames'].attrs['T'] 107 | meta_data_dict['img_encoding'] = 'mp4' 108 | meta_data_dict['image_format'] = hf['env']['cam0_video']['frames'].attrs['image_format'] 109 | else: 110 | meta_data_dict['frame_dim'] = hf['env']['cam0_video']['frame0'].attrs['shape'][:2] 111 | meta_data_dict['image_format'] = hf['env']['cam0_video']['frame0'].attrs['image_format'] 112 | meta_data_dict['img_encoding'] = 'jpg' 113 | meta_data_dict['img_T'] = len(hf['env']['cam0_video']) 114 | 115 | # TODO: remove misc field and shift all to meta-data 116 | for k in hf['misc'].keys(): 117 | assert k not in meta_data_dict, "key {} already present!".format(k) 118 | meta_data_dict[k] = hf['misc'][k][()] 119 | 120 | 121 | for k in hf['metadata'].attrs.keys(): 122 | assert k not in meta_data_dict, "key {} already present!".format(k) 123 | meta_data_dict[k] = hf['metadata'].attrs[k] 124 | 125 | if 'low_bound' not in meta_data_dict and 'low_bound' in hf['env']: 126 | meta_data_dict['low_bound'] = hf['env']['low_bound'][0] 127 | 128 | if 'high_bound' not in meta_data_dict and 'high_bound' in hf['env']: 129 | meta_data_dict['high_bound'] 
= hf['env']['high_bound'][0] 130 | 131 | return meta_data_dict 132 | 133 | def get_metadata_frame(files): 134 | if isinstance(files, str): 135 | base_path = files 136 | files = sorted(glob.glob('{}/*.hdf5'.format(files))) 137 | if not files: 138 | raise ValueError('no hdf5 files found!') 139 | 140 | if os.path.exists('{}/meta_data.pkl'.format(base_path)): 141 | meta_data = pd.read_pickle('{}/meta_data.pkl'.format(base_path), compression='gzip') 142 | 143 | registered_fnames = set([f for f in meta_data.index]) 144 | loaded_fnames = set([f.split('/')[-1] for f in files]) 145 | 146 | if loaded_fnames == registered_fnames: 147 | return meta_data 148 | os.remove('{}/meta_data.pkl'.format(base_path)) 149 | print('regenerating meta_data file!') 150 | elif isinstance(files, (list, tuple)): 151 | base_path=None 152 | files = sorted(files) 153 | else: 154 | raise ValueError("Must be path to files or list/tuple of filenames") 155 | 156 | with Pool(cpu_count()) as p: 157 | meta_data = list(tqdm(p.imap(load_metadata_dict, files), total=len(files))) 158 | 159 | data_frame = pd.DataFrame(meta_data, index=[f.split('/')[-1] for f in files]) 160 | if base_path: 161 | data_frame.to_pickle("{}/meta_data.pkl".format(base_path), compression='gzip') 162 | return data_frame 163 | 164 | 165 | def load_metadata(files): 166 | base_path = files 167 | if isinstance(files, (tuple, list)): 168 | base_path = '' 169 | else: 170 | files = base_path = os.path.expanduser(base_path) 171 | 172 | return MetaDataContainer(base_path, get_metadata_frame(files)) 173 | 174 | 175 | if __name__ == '__main__': 176 | import argparse 177 | import pdb 178 | 179 | parser = argparse.ArgumentParser(description="calculates or loads meta_data frame") 180 | parser.add_argument('path', help='path to files containing hdf5 dataset') 181 | args = parser.parse_args() 182 | data_frame = load_metadata(args.path) 183 | pdb.set_trace() 184 | print('loaded frame') 185 | -------------------------------------------------------------------------------- /robonet/datasets/util/tensor_multiplexer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from collections import OrderedDict 3 | 4 | 5 | def multiplex_tensors(dataset, key_name, train_cond=None): 6 | if train_cond is None: 7 | _train_cond = tf.placeholder(tf.int32, shape=[], name="train_cond") 8 | else: 9 | _train_cond = train_cond 10 | 11 | tensors = [dataset[key_name, m] for m in dataset.modes] 12 | assert len(tensors), "can't multiplex across no modes!" 
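# chain tf.cond ops so the integer fed into the train_cond placeholder selects which mode's tensor (e.g. train/val/test) is evaluated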
13 | 14 | if len(tensors) == 1: 15 | if train_cond is None: 16 | return tensors[0], _train_cond 17 | return tensors[0] 18 | 19 | top_tensor = tensors[-1] 20 | for ind in range(len(tensors) - 1, 0, -1): 21 | top_tensor = tf.cond(_train_cond < ind, lambda: tensors[ind - 1], lambda: top_tensor) 22 | 23 | if train_cond is None: 24 | return top_tensor, _train_cond 25 | return top_tensor 26 | 27 | 28 | class MultiplexedTensors: 29 | def __init__(self, dataset, tensor_names): 30 | self._dataset = dataset 31 | self._mode_ind = {} 32 | for i, k in enumerate(dataset.modes): 33 | self._mode_ind[k] = i 34 | 35 | self._train_cond = tf.placeholder(tf.int32, shape=[], name="train_cond") 36 | self._tensor_dict = OrderedDict() 37 | for t in tensor_names: 38 | self._tensor_dict[t] = multiplex_tensors(dataset, t, self._train_cond) 39 | 40 | def __getitem__(self, key): 41 | return self._tensor_dict[key] 42 | 43 | @property 44 | def dict(self): 45 | return self._tensor_dict 46 | 47 | def get_feed_dict(self, mode): 48 | dataset_feed = self._dataset.build_feed_dict(mode) 49 | if isinstance(mode, int): 50 | assert 0 <= mode < len(self._mode_ind.keys()), "mode_index must be in range 0 to len(modes) - 1" 51 | dataset_feed[self._train_cond] = mode 52 | return dataset_feed 53 | 54 | assert isinstance(mode, str) 55 | assert mode in self._mode_ind, "{} not supported! Modes are {}".format(mode, self._mode_ind.keys()) 56 | 57 | dataset_feed[self._train_cond] = self._mode_ind[mode] 58 | return dataset_feed 59 | 60 | @property 61 | def modes(self): 62 | return list(self._mode_ind.keys()) 63 | -------------------------------------------------------------------------------- /robonet/datasets/variants/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SudeepDasari/RoboNet/d83eee20f39653c3f8e7c349df7350e8a9e9f7a7/robonet/datasets/variants/__init__.py -------------------------------------------------------------------------------- /robonet/datasets/variants/annotation_benchmark_dataset.py: -------------------------------------------------------------------------------- 1 | from robonet.datasets.robonet_dataset import RoboNetDataset 2 | from robonet.datasets.util.dataset_utils import split_train_val_test 3 | 4 | 5 | class AnnotationBenchmarkDataset(RoboNetDataset): 6 | """ 7 | Separates files that have annotations and those which don't 8 | - files with annotations are loaded as validation files 9 | - all others are loaded as train/test 10 | """ 11 | def __init__(self, batch_size, dataset_files_or_metadata, hparams=dict()): 12 | self._annotated_robots = None 13 | super(AnnotationBenchmarkDataset, self).__init__(batch_size, dataset_files_or_metadata, hparams) 14 | 15 | @staticmethod 16 | def _get_default_hparams(parent_hparams=None): 17 | if parent_hparams is None: 18 | parent_hparams = RoboNetDataset._get_default_hparams() 19 | parent_hparams.load_annotations = True 20 | parent_hparams.zero_if_missing_annotation = True 21 | return parent_hparams 22 | 23 | def _split_files(self, source_number, metadata): 24 | assert self._hparams.load_annotations, "mode requires annotation loading" 25 | assert self._hparams.zero_if_missing_annotation, "mode requires some files to not be annotated" 26 | 27 | non_annotated_metadata = metadata[metadata['contains_annotation'] != True] 28 | 29 | if self._hparams.train_ex_per_source != [-1]: 30 | train_files, val_files, test_files = split_train_val_test(metadata, train_ex=self._hparams.train_ex_per_source[source_number], 
rng=self._random_generator['base']) 31 | else: 32 | train_files, val_files, test_files = split_train_val_test(non_annotated_metadata, splits=self._hparams.splits, rng=self._random_generator['base']) 33 | 34 | all_annotated = metadata[metadata['contains_annotation'] == True] 35 | robot_files = [all_annotated[all_annotated['robot'] == r].files for r in self._annotated_robots] 36 | 37 | if len(self._annotated_robots) == 1: 38 | return [train_files, val_files, test_files] + robot_files 39 | return [train_files, val_files, test_files] + [all_annotated.files] + robot_files 40 | 41 | @property 42 | def modes(self): 43 | if self._annotated_robots is None: 44 | self._annotated_robots = [] 45 | for m in self._metadata: 46 | annotated_robots_from_source = m[m['contains_annotation'] == True]['robot'].frame.unique().tolist() 47 | self._annotated_robots.extend(annotated_robots_from_source) 48 | self._annotated_robots = list(set(self._annotated_robots)) 49 | 50 | all_annotated_mode = [] 51 | if len(self._annotated_robots) > 1: 52 | all_annotated_mode = ['all_annotated'] 53 | 54 | return ['train', 'val', 'test'] + all_annotated_mode + ['{}_annotated'.format(r) for r in self._annotated_robots] 55 | 56 | 57 | if __name__ == '__main__': 58 | import argparse 59 | import tensorflow as tf 60 | import numpy as np 61 | parser = argparse.ArgumentParser(description="calculates or loads meta_data frame") 62 | parser.add_argument('path', help='path to files containing hdf5 dataset') 63 | parser.add_argument('--batch_size', type=int, default=32, help='batch size for test loader (should be even for non-time test demo to work)') 64 | parser.add_argument('--mode', type=str, default='val', help='mode to grab data from') 65 | parser.add_argument('--load_steps', type=int, default=0, help='if value is provided will load steps') 66 | args = parser.parse_args() 67 | 68 | hparams = {'ret_fnames': True, 'load_T': args.load_steps,'action_mismatch': 3, 'state_mismatch': 3, 'splits':[0.8, 0.1, 0.1], 'same_cam_across_sub_batch':False} 69 | loader = AnnotationBenchmarkDataset(args.batch_size, args.path, hparams=hparams) 70 | print('modes are', loader.modes) 71 | 72 | tensors = [loader[x, args.mode] for x in ['images', 'states', 'actions', 'annotations', 'f_names']] 73 | s = tf.Session() 74 | out_tensors = s.run(tensors, feed_dict=loader.build_feed_dict(args.mode)) 75 | 76 | import imageio 77 | writer = imageio.get_writer('test_frames.gif') 78 | for t in range(out_tensors[0].shape[1]): 79 | writer.append_data((np.concatenate([b for b in out_tensors[0][:, t, 0]], axis=-2) * 255).astype(np.uint8)) 80 | writer.close() 81 | import pdb; pdb.set_trace() 82 | print('loaded tensors!') 83 | -------------------------------------------------------------------------------- /robonet/datasets/variants/val_filter_dataset_variants.py: -------------------------------------------------------------------------------- 1 | from robonet.datasets.robonet_dataset import RoboNetDataset 2 | from tensorflow.contrib.training.python.training.hparam import HParams 3 | from robonet.datasets.variants.annotation_benchmark_dataset import AnnotationBenchmarkDataset 4 | import pdb 5 | 6 | 7 | """ 8 | Should perhaps update these to work with new API 9 | """ 10 | class ValFilterDataset(RoboNetDataset): 11 | """ 12 | Separates files that have annotations and those which don't 13 | - files with annotations are loaded as validation files 14 | - all others are loaded as train/test 15 | """ 16 | 17 | def _split_files(self, metadata): 18 | train_metadata, val_metadata = 
self.train_val_filter(metadata, metadata) 19 | 20 | train_files, test_files, val_files = [], [], [] 21 | train_test_files = train_metadata.files 22 | val_files = val_metadata.files 23 | [self.rng.shuffle(files) for files in [train_test_files, val_files]] 24 | train_pivot = int(len(train_test_files) * self._hparams.splits[0]) 25 | if self._hparams.splits[0]: 26 | train_files = train_test_files[:train_pivot] 27 | if self._hparams.splits[1]: 28 | val_files = val_files 29 | if self._hparams.splits[2]: 30 | test_files = train_test_files[train_pivot:] 31 | return train_files, val_files, test_files 32 | 33 | def train_val_filter(self, train_metadata, val_metadata): 34 | """ 35 | :param metadata: 36 | :return: train_metadata, val_metadata 37 | """ 38 | raise NotImplementedError 39 | 40 | class HeldoutRobotDataset(ValFilterDataset): 41 | """ 42 | Use files from one held-out robot for testing and files from all other robots for training 43 | """ 44 | @staticmethod 45 | def _get_default_hparams(parent_hparams=None): 46 | if parent_hparams is None: 47 | parent_hparams = ValFilterDataset._get_default_hparams() 48 | parent_hparams.add_hparam('held_out_robot', '') 49 | return parent_hparams 50 | 51 | def train_val_filter(self, train_metadata, val_metadata): 52 | train_metadata = train_metadata[train_metadata['robot'] != self._hparams.held_out_robot] 53 | val_metadata = val_metadata[val_metadata['robot'] == self._hparams.held_out_robot] 54 | print('after filtering robots: number of trainfiles {} number of val files {}'.format(len(train_metadata.files), len(val_metadata.files))) 55 | return train_metadata, val_metadata 56 | 57 | 58 | class AnnotationHeldoutRobotDataset(HeldoutRobotDataset, AnnotationBenchmarkDataset): 59 | 60 | @staticmethod 61 | def _get_default_hparams(): 62 | combined_params = RoboNetDataset._get_default_hparams() 63 | combined_params = HeldoutRobotDataset._get_default_hparams(combined_params) 64 | combined_params = AnnotationBenchmarkDataset._get_default_hparams(combined_params) 65 | return combined_params 66 | 67 | def train_val_filter(self, train_metadata, val_metadata): 68 | print('before filtering: number of trainfiles {} number of val files {}'.format(len(train_metadata.files), len(val_metadata.files))) 69 | train_metadata, val_metadata = HeldoutRobotDataset.train_val_filter(self, train_metadata, val_metadata) 70 | train_metadata, val_metadata = AnnotationBenchmarkDataset.train_val_filter(self, train_metadata, val_metadata) 71 | return train_metadata, val_metadata -------------------------------------------------------------------------------- /robonet/inverse_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SudeepDasari/RoboNet/d83eee20f39653c3f8e7c349df7350e8a9e9f7a7/robonet/inverse_model/__init__.py -------------------------------------------------------------------------------- /robonet/inverse_model/models/__init__.py: -------------------------------------------------------------------------------- 1 | def get_models(class_name): 2 | if class_name == 'DeterministicInverseModel': 3 | from .deterministic_inverse_model import DeterministicInverseModel 4 | return DeterministicInverseModel 5 | if class_name == 'DiscretizedInverseModel': 6 | from .discretized_inverse_model import DiscretizedInverseModel 7 | return DiscretizedInverseModel 8 | raise NotImplementedError 9 | 10 | -------------------------------------------------------------------------------- 
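A minimal usage sketch of the factory above (hedged: `dataset_hparams` and the positional arguments are placeholders mirroring `BaseModel.__init__` further down, not a documented entry point):

    from robonet.inverse_model.models import get_models

    InverseModel = get_models('DeterministicInverseModel')     # or 'DiscretizedInverseModel'
    model = InverseModel(dataset_hparams, 1, 'lstm_baseline')  # loader hparams, n_gpus, registered graph type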
/robonet/inverse_model/models/base_inverse_model.py: -------------------------------------------------------------------------------- 1 | from robonet.video_prediction.models.base_model import BaseModel 2 | from robonet.inverse_model.models.graphs import get_graph_class 3 | 4 | 5 | class BaseInverseModel(BaseModel): 6 | def _get_graph(self, graph_type): 7 | return get_graph_class(graph_type) 8 | 9 | def _default_scope(self): 10 | return 'inverse_model' 11 | -------------------------------------------------------------------------------- /robonet/inverse_model/models/deterministic_inverse_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Boiled down version of SAVP model from https://github.com/alexlee-gk/video_prediction 3 | """ 4 | from robonet.inverse_model.models.base_inverse_model import BaseInverseModel 5 | from robonet.video_prediction.utils import tf_utils 6 | import tensorflow as tf 7 | from collections import OrderedDict 8 | from robonet.video_prediction import losses 9 | from robonet.video_prediction.utils import tf_utils 10 | 11 | 12 | class DeterministicInverseModel(BaseInverseModel): 13 | def _model_default_hparams(self): 14 | return { 15 | "lr": 0.001, 16 | "end_lr": 0.0, 17 | "beta1": 0.9, 18 | "beta2": 0.999, 19 | } 20 | 21 | def _model_fn(self, model_inputs, model_targets, mode): 22 | inputs, targets = {}, None 23 | inputs['start_images'] = model_inputs['images'][:, 0] 24 | inputs['goal_images'] = model_inputs['images'][:, -1] 25 | if mode == tf.estimator.ModeKeys.TRAIN: 26 | inputs['T'] = model_targets['actions'].get_shape().as_list()[1] 27 | inputs['adim'] = model_targets['actions'].get_shape().as_list()[2] 28 | inputs['real_actions'] = targets = model_targets['actions'] 29 | else: 30 | inputs['adim'] = model_inputs['adim'] 31 | inputs['T'] = model_inputs['T'] 32 | 33 | # build the graph 34 | self._model_graph = model_graph = self._graph_class() 35 | 36 | if self._num_gpus <= 1: 37 | outputs = model_graph.build_graph(mode, inputs, self._hparams, self._graph_scope) 38 | else: 39 | # TODO: add multi-gpu support 40 | raise NotImplementedError 41 | 42 | # train 43 | if mode == tf.estimator.ModeKeys.TRAIN: 44 | global_step = tf.train.get_or_create_global_step() 45 | lr, optimizer = tf_utils.build_optimizer(self._hparams.lr, self._hparams.beta1, self._hparams.beta2, global_step=global_step) 46 | loss = losses.l1_loss(targets, outputs['pred_actions']) 47 | 48 | print('computing gradient and train_op') 49 | g_train_op = optimizer.minimize(loss, global_step=global_step) 50 | 51 | est = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=g_train_op) 52 | scalar_summaries = {} 53 | if 'ground_truth_sampling_mean' in outputs: 54 | scalar_summaries['ground_truth_sampling_mean'] = outputs['ground_truth_sampling_mean'] 55 | return est, scalar_summaries, {} 56 | 57 | #test 58 | return outputs['pred_actions'] 59 | 60 | -------------------------------------------------------------------------------- /robonet/inverse_model/models/discretized_inverse_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Boiled down version of SAVP model from https://github.com/alexlee-gk/video_prediction 3 | """ 4 | from robonet.inverse_model.models.base_inverse_model import BaseInverseModel 5 | from robonet.video_prediction.utils import tf_utils 6 | import tensorflow as tf 7 | from collections import OrderedDict 8 | from robonet.video_prediction import losses 9 | from robonet.video_prediction.utils import 
tf_utils 10 | 11 | 12 | def _binarize(actions, pivots): 13 | n_xy = (len(pivots[0]) + 1) * (len(pivots[1]) + 1) 14 | n_z = len(pivots[2]) + 1 15 | n_theta = len(pivots[3]) + 1 16 | 17 | B = actions.get_shape().as_list()[0] 18 | input_adim = actions.get_shape().as_list()[2] 19 | T = actions.get_shape().as_list()[1] 20 | 21 | assert input_adim == 4, "only supports [x,y,z,theta] action space for now!" 22 | assert len(pivots) == input_adim, "bad discretization pivots array!" 23 | binned_actions = [] 24 | for a in range(input_adim): 25 | binned_action = tf.zeros((B, T), dtype=tf.int32) 26 | for p in range(len(pivots[a])): 27 | pivot = pivots[a][p] 28 | binned_action = tf.where_v2(actions[:, :, a] > pivot, binned_action + 1, binned_action) 29 | binned_actions.append(binned_action) 30 | 31 | xy_act = binned_actions[0] + (len(pivots[0]) + 1) * binned_actions[1] 32 | z_act, theta_act = binned_actions[2], binned_actions[3] 33 | one_hot_actions = [tf.one_hot(tensor, n_dim) for tensor, n_dim in zip((xy_act, z_act, theta_act), (n_xy, n_z, n_theta))] 34 | return one_hot_actions 35 | 36 | 37 | class DiscretizedInverseModel(BaseInverseModel): 38 | def _model_default_hparams(self): 39 | return { 40 | "context_actions": 0, 41 | "lr": 0.001, 42 | "end_lr": 0.0, 43 | "beta1": 0.9, 44 | "beta2": 0.999, 45 | "pivots": [[-0.04483253140755173, -0.02947711320550581, -0.018373884708696702, -0.008892051974322548, -4.59881939272745e-05, 0.008815899693566963, 0.018292582474913204, 0.02938255920278165, 0.04470332342338521], 46 | [-0.044674549010427486, -0.029352782231283018, -0.018263887904468375, -0.008836470630237072, 7.81874877900302e-06, 0.00884825636063618, 0.01830693463003378, 0.029377939442953, 0.04473508111072804], 47 | [-0.10348141529525286, -0.06793363038544242, -0.042405628783200776, -0.02067683018449292, -0.0003540274691179853, 0.019988218195319766, 0.04168513725690283, 0.06726936589279635, 0.10260515613003221], 48 | [-0.22409500837108018, -0.1470685835529137, -0.09166876049855337, -0.04419968109806307, 5.580875190224738e-05, 0.044414223320168145, 0.09168509202611021, 0.1469321233733917, 0.2237400683241968]], 49 | "means": [[-0.05844043352506317, -0.0365598888753108, -0.02371854361080623, -0.01355633452537272, -0.004447061217304071, 0.004359558603466982, 0.01346781084244209, 0.02363783086130393, 0.036456939880113295, 0.05834560772861528], 50 | [-0.05831025927528526, -0.03643373528153938, -0.023609139710274608, -0.013465667182953755, -0.004399357688117235, 0.004405043570967748, 0.013491056349448851, 0.023632353969085647, 0.03646405448080863, 0.0583175660974888], 51 | [-0.14210753817324154, -0.08433897448430323, -0.054693763651882464, -0.03133158710778195, -0.010471111756616646, 0.00976338713468559, 0.030621494148596932, 0.05401384675615853, 0.08356642563278535, 0.1406814351222195], 52 | [-0.30673709675244115, -0.1826234528754964, -0.11831810064105407, -0.06747048133665953, -0.02199376800432712, 0.02209506703978301, 0.06762712392804507, 0.11832652238765545, 0.18242774553595653, 0.30635348910031857]] 53 | } 54 | 55 | def _model_fn(self, model_inputs, model_targets, mode): 56 | inputs = {} 57 | if self._hparams.context_actions: 58 | inputs['context_frames'] = model_inputs['images'][:, :self._hparams.context_actions] 59 | inputs['start_images'] = model_inputs['images'][:, self._hparams.context_actions] 60 | inputs['goal_images'] = model_inputs['images'][:, -1] 61 | 62 | n_xy = (len(self._hparams.pivots[0]) + 1) * (len(self._hparams.pivots[1]) + 1) 63 | n_z = len(self._hparams.pivots[2]) + 1 64 | n_theta = 
len(self._hparams.pivots[3]) + 1 65 | 66 | if mode == tf.estimator.ModeKeys.TRAIN: 67 | one_hot_actions = _binarize(model_targets['actions'], self._hparams.pivots) 68 | if self._hparams.context_actions: 69 | inputs['context_actions'] = tf.concat([x[:, :self._hparams.context_actions] for x in one_hot_actions], -1) 70 | real_pred_actions = [x[:, self._hparams.context_actions:] for x in one_hot_actions] 71 | inputs['real_actions'] = tf.concat(real_pred_actions, -1) 72 | inputs['T'] = model_targets['actions'].get_shape().as_list()[1] - self._hparams.context_actions 73 | else: 74 | assert model_inputs['adim'] == 4, "only supports [x,y,z,theta] action space for now!" 75 | inputs['T'] = model_inputs['T'] - self._hparams.context_actions 76 | if self._hparams.context_actions: 77 | one_hot_actions = _binarize(model_inputs['context_actions'], self._hparams.pivots) 78 | inputs['context_actions'] = tf.concat([x[:, :self._hparams.context_actions] for x in one_hot_actions], -1) 79 | 80 | inputs['adim'] = (len(self._hparams.pivots[0]) + 1) * (len(self._hparams.pivots[1]) + 1) + sum([len(arr) + 1 for arr in self._hparams.pivots[2:]]) 81 | 82 | # build the graph 83 | self._model_graph = model_graph = self._graph_class() 84 | if self._num_gpus <= 1: 85 | outputs = model_graph.build_graph(mode, inputs, self._hparams, self._graph_scope) 86 | else: 87 | # TODO: add multi-gpu support 88 | raise NotImplementedError 89 | 90 | # train 91 | if mode == tf.estimator.ModeKeys.TRAIN: 92 | global_step = tf.train.get_or_create_global_step() 93 | lr, optimizer = tf_utils.build_optimizer(self._hparams.lr, self._hparams.beta1, self._hparams.beta2, global_step=global_step) 94 | pred_xy = outputs['pred_actions'][:, :, :n_xy] 95 | pred_z = outputs['pred_actions'][:, :, n_xy:n_z + n_xy] 96 | pred_theta = outputs['pred_actions'][:, :, n_z + n_xy:] 97 | pred_one_hots = [pred_xy, pred_z, pred_theta] 98 | 99 | losses = [tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(real, pred)) for real, pred in zip(real_pred_actions, pred_one_hots)] 100 | loss = sum(losses) 101 | 102 | print('computing gradient and train_op') 103 | g_train_op = optimizer.minimize(loss, global_step=global_step) 104 | 105 | est = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=g_train_op) 106 | scalar_summaries = {} 107 | if 'ground_truth_sampling_mean' in outputs: 108 | scalar_summaries['ground_truth_sampling_mean'] = outputs['ground_truth_sampling_mean'] 109 | 110 | for k, loss in zip(['xy_loss', 'z_loss', 'theta_loss'], losses): 111 | scalar_summaries[k] = loss 112 | return est, scalar_summaries, {} 113 | 114 | #test 115 | means = tf.convert_to_tensor(self._hparams.means) 116 | pred_xy = outputs['pred_actions'][:, :, :n_xy] 117 | pred_z = outputs['pred_actions'][:, :, n_xy:n_z + n_xy] 118 | pred_theta = outputs['pred_actions'][:, :, n_z + n_xy:] 119 | 120 | pred_xy = tf.reshape(tf.random.categorical(tf.reshape(pred_xy, (-1, n_xy)), 1, dtype=tf.int32), (-1, inputs['T'])) 121 | pred_x, pred_y = tf.mod(pred_xy, len(self._hparams.pivots[0]) + 1), tf.floordiv(pred_xy, len(self._hparams.pivots[0]) + 1) 122 | pred_z = tf.reshape(tf.random.categorical(tf.reshape(pred_z, (-1, n_z)), 1, dtype=tf.int32), (-1, inputs['T'])) 123 | pred_theta = tf.reshape(tf.random.categorical(tf.reshape(pred_theta, (-1, n_theta)), 1, dtype=tf.int32), (-1, inputs['T'])) 124 | 125 | outputs['pred_actions'] = tf.concat([tf.gather(means[i], indices)[:, :, None] for i, indices in 126 | enumerate([pred_x, pred_y, pred_z, pred_theta])], axis=-1) 127 | return outputs['pred_actions'] 
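    # Hedged worked example of the discretization above (numbers follow the default pivots):
    # each of the 4 action dimensions has 9 pivots and therefore 10 bins, so n_xy = 10 * 10 = 100,
    # n_z = 10, n_theta = 10, and the one-hot action vector has 100 + 10 + 10 = 120 entries (the
    # 'adim' handed to the graph). A delta of x = +0.02 exceeds 7 of the x pivots (bin 7) while
    # y = -0.05 exceeds none of the y pivots (bin 0), giving joint xy class 7 + 10 * 0 = 7. At test
    # time each group is sampled with tf.random.categorical and mapped back to a continuous action
    # through the matching entry of the 'means' table.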
128 | -------------------------------------------------------------------------------- /robonet/inverse_model/models/graphs/__init__.py: -------------------------------------------------------------------------------- 1 | def get_graph_class(class_name): 2 | if class_name == 'lstm_baseline': 3 | from .lstm_baseline import LSTMBaseline 4 | return LSTMBaseline 5 | else: 6 | raise NotImplementedError 7 | -------------------------------------------------------------------------------- /robonet/inverse_model/models/graphs/base_graph.py: -------------------------------------------------------------------------------- 1 | from robonet.video_prediction.models.graphs.base_graph import BaseGraph as BaseVpredGraph 2 | import tensorflow as tf 3 | 4 | 5 | class BaseGraph(BaseVpredGraph): 6 | @staticmethod 7 | def default_hparams(): 8 | return { 9 | } 10 | -------------------------------------------------------------------------------- /robonet/inverse_model/models/graphs/lstm_baseline.py: -------------------------------------------------------------------------------- 1 | from robonet.inverse_model.models.graphs.base_graph import BaseGraph 2 | import itertools 3 | import tensorflow as tf 4 | import tensorflow.keras.layers as layers 5 | from robonet.inverse_model.models.layers.vgg_pretrain import get_vgg_dict, vgg_preprocess_images, vgg_conv, vgg_pool 6 | 7 | 8 | class ImageEncoder(tf.Module): 9 | def __init__(self, conv_filters, kernel_size, out_dim, vgg_path, n_convs=3, padding='same', fc_layer=256): 10 | self._vgg_dict = get_vgg_dict(vgg_path) 11 | 12 | self._convs = [[layers.Conv2D(conv_filters, kernel_size, padding="same", dilation_rate=min(c + 1, 3)), 13 | layers.BatchNormalization(axis=-1)] for c in range(n_convs)] 14 | 15 | # top layer 16 | self._fc_layer = [layers.Dense(fc_layer), layers.BatchNormalization(axis=-1)] 17 | self._top = [layers.Dense(out_dim), layers.BatchNormalization(axis=-1)] 18 | 19 | def __call__(self, input_img, training=True): 20 | preprocessed = vgg_preprocess_images(input_img) 21 | conv1_out = vgg_conv(self._vgg_dict, vgg_conv(self._vgg_dict, preprocessed, "conv1_1"), "conv1_2") 22 | conv1_out = vgg_pool(conv1_out, "pool1") 23 | 24 | conv2_out = vgg_conv(self._vgg_dict, vgg_conv(self._vgg_dict, conv1_out, "conv2_1"), "conv2_2") 25 | conv2_out = vgg_pool(conv2_out, "pool2") 26 | 27 | conv3_out = conv2_out 28 | for c in ['conv3_1', 'conv3_2', 'conv3_3', 'conv3_4']: 29 | conv3_out = vgg_conv(self._vgg_dict, conv3_out, c) 30 | conv3_out = vgg_pool(conv3_out, "pool3") 31 | 32 | conv4_out = conv3_out 33 | for c in ['conv4_1', 'conv4_2', 'conv4_3', 'conv4_4']: 34 | conv4_out = vgg_conv(self._vgg_dict, conv4_out, c) 35 | conv4_out = vgg_pool(conv4_out, "pool4") 36 | 37 | top = vgg_conv(self._vgg_dict, conv4_out, "conv5_1") 38 | for layer in self._convs: 39 | conv, norm = layer 40 | top = norm(tf.nn.relu(conv(top))) + top 41 | 42 | top = tf.nn.max_pool(top, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='top_pool') 43 | dense, norm = self._fc_layer 44 | top = norm(tf.nn.relu(dense(tf.reshape(top, (top.get_shape().as_list()[0], -1))))) 45 | 46 | dense, norm = self._top 47 | return norm(tf.nn.relu(dense(top))) 48 | 49 | 50 | class LSTMBaseline(BaseGraph): 51 | def build_graph(self, mode, inputs, hparams, scope_name='flow_generator'): 52 | is_train = mode == tf.estimator.ModeKeys.TRAIN 53 | B = inputs['start_images'].get_shape().as_list()[0] 54 | self._scope_name = scope_name 55 | outputs = {} 56 | with tf.variable_scope(scope_name): 57 | encoder = 
ImageEncoder(hparams.conv_filters, hparams.kernel_size, hparams.enc_dim, hparams.vgg_path, hparams.n_convs) 58 | start_enc = encoder(inputs['start_images'], training=is_train) 59 | goal_enc = encoder(inputs['goal_images'], training=is_train) 60 | start_goal_enc = tf.concat((start_enc, goal_enc), -1) 61 | 62 | lstm_in = layers.Dense(hparams.latent_dim * inputs['T'])(start_goal_enc) 63 | lstm_in = layers.BatchNormalization(axis=-1)(tf.nn.relu(lstm_in), training=is_train) 64 | lstm_in = tf.reshape(lstm_in, (-1, inputs['T'], hparams.latent_dim)) 65 | 66 | lstm_dim = hparams.latent_dim 67 | if hparams.append_last_action: 68 | lstm_dim += + inputs['adim'] 69 | 70 | lstm = layers.LSTM(lstm_dim) 71 | lstm.cell.build([B, inputs['T'], lstm_dim]) 72 | 73 | if 'context_actions' in inputs: 74 | last_action = inputs['context_actions'][:, -1] 75 | else: 76 | last_action = tf.zeros((B, inputs['adim'])) 77 | 78 | action_predictions = [] 79 | top_layer = layers.Dense(inputs['adim']) 80 | schedule_sample = self.schedule_sample(inputs['T'], B, hparams) 81 | for t in range(inputs['T']): 82 | if hparams.append_last_action: 83 | if t > 0 and is_train: 84 | real_action = inputs['real_actions'][:, t - 1] 85 | last_action = tf.where(schedule_sample[t - 1], real_action, action_predictions[-1][:, 0]) 86 | elif t > 0: 87 | last_action = action_predictions[-1][:, 0] 88 | in_t = tf.concat([lstm_in[:, t], last_action], axis=-1) 89 | else: 90 | in_t = lstm_in[:, t] 91 | 92 | if t == 0: 93 | if 'context_frames' in inputs: 94 | assert hparams.append_last_action 95 | context_encodings = [encoder(inputs['context_frames'][:, c]) for c in range(hparams.context_actions)] 96 | 97 | for i, c in enumerate(context_encodings): 98 | dense = layers.Dense(hparams.latent_dim)(c) 99 | context_enc = layers.BatchNormalization(axis=-1)(tf.nn.relu(dense), training=is_train) 100 | if i == 0: 101 | context_act = tf.zeros_like(inputs['context_actions'][:, 0]) 102 | else: 103 | context_act = inputs['context_actions'][:, i - 1] 104 | 105 | context_in = tf.concat((context_enc, context_act), axis=-1) 106 | if i == 0: 107 | hidden_state = lstm.get_initial_state(context_in[:, None]) 108 | _, hidden_state = lstm.cell(context_in, hidden_state) 109 | else: 110 | hidden_state = lstm.get_initial_state(in_t[:, None]) 111 | 112 | lstm_out, hidden_state = lstm.cell(in_t, hidden_state) 113 | action_predictions.append(top_layer(lstm_out)[:, None]) 114 | 115 | 116 | outputs['pred_actions'] = tf.concat(action_predictions, axis=1) 117 | if hparams.append_last_action and inputs['T'] > 1: 118 | outputs['ground_truth_sampling_mean'] = tf.reduce_mean(tf.to_float(schedule_sample)) 119 | 120 | return outputs 121 | 122 | @staticmethod 123 | def default_hparams(): 124 | default_params = { 125 | "n_convs": 3, 126 | "conv_filters": 512, 127 | "enc_dim": 128, 128 | "kernel_size": 3, 129 | 130 | "latent_dim": 20, 131 | 132 | "vgg_path": '~/', 133 | "append_last_action": True, 134 | "schedule_sampling_k": 900.0, 135 | "schedule_sampling_steps": [0, 100000], 136 | } 137 | return dict(itertools.chain(BaseGraph.default_hparams().items(), default_params.items())) 138 | 139 | def schedule_sample(self, T, B, hparams): 140 | if T == 1: 141 | return 142 | 143 | ground_truth_sampling_shape = [T - 1, B] 144 | 145 | k = hparams.schedule_sampling_k 146 | start_step = hparams.schedule_sampling_steps[0] 147 | iter_num = tf.to_float(tf.train.get_or_create_global_step()) 148 | prob = (k / (k + tf.exp((iter_num - start_step) / k))) 149 | prob = tf.cond(tf.less(iter_num, start_step), lambda: 
1.0, lambda: prob) 150 | 151 | log_probs = tf.log([1 - prob, prob]) 152 | ground_truth_sampling = tf.multinomial([log_probs] * B, ground_truth_sampling_shape[0]) 153 | ground_truth_sampling = tf.cast(tf.transpose(ground_truth_sampling, [1, 0]), dtype=tf.bool) 154 | # Ensure that eventually, the model is deterministically 155 | # autoregressive (as opposed to autoregressive with very high probability). 156 | ground_truth_sampling = tf.cond(tf.less(prob, 0.001), 157 | lambda: tf.constant(False, dtype=tf.bool, shape=ground_truth_sampling_shape), 158 | lambda: ground_truth_sampling) 159 | return ground_truth_sampling -------------------------------------------------------------------------------- /robonet/inverse_model/models/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SudeepDasari/RoboNet/d83eee20f39653c3f8e7c349df7350e8a9e9f7a7/robonet/inverse_model/models/layers/__init__.py -------------------------------------------------------------------------------- /robonet/inverse_model/models/layers/vgg_pretrain.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | 5 | 6 | def get_vgg_dict(path): 7 | return np.load(os.path.join(path, "vgg19.npy"), encoding='latin1', allow_pickle=True).item() 8 | 9 | 10 | def vgg_preprocess_images(image_tensor): 11 | """ 12 | :param image_tensor: float 32 array of Batch x Height x Width x Channel immages (range 0 - 1) 13 | :return: pre-processed images (ready to input to VGG) 14 | """ 15 | vgg_mean = tf.convert_to_tensor(np.array([103.939, 116.779, 123.68], dtype=np.float32)) 16 | red, green, blue = tf.split(axis=-1, num_or_size_splits=3, value=image_tensor * 255) 17 | 18 | return tf.concat(axis=3, values=[ 19 | blue - vgg_mean[0], 20 | green - vgg_mean[1], 21 | red - vgg_mean[2], 22 | ]) 23 | 24 | 25 | def vgg_conv(vgg_dict, bottom, name): 26 | with tf.variable_scope(name, reuse=True): 27 | filt = tf.constant(vgg_dict[name][0], name="filter") 28 | 29 | conv = tf.nn.conv2d(bottom, filt, [1, 1, 1, 1], padding='SAME') 30 | 31 | conv_biases = tf.constant(vgg_dict[name][1], name="biases") 32 | bias = tf.nn.bias_add(conv, conv_biases) 33 | 34 | relu = tf.nn.relu(bias) 35 | return relu 36 | 37 | 38 | def vgg_pool(bottom, name): 39 | return tf.nn.max_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name=name) 40 | -------------------------------------------------------------------------------- /robonet/inverse_model/testing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SudeepDasari/RoboNet/d83eee20f39653c3f8e7c349df7350e8a9e9f7a7/robonet/inverse_model/testing/__init__.py -------------------------------------------------------------------------------- /robonet/inverse_model/testing/action_inference_interface.py: -------------------------------------------------------------------------------- 1 | import ray 2 | from robonet.inverse_model.models import get_models 3 | import numpy as np 4 | from robonet.video_prediction.utils import tf_utils 5 | import tensorflow as tf 6 | from tensorflow.contrib.training import HParams 7 | import os 8 | import glob 9 | import math 10 | import yaml 11 | 12 | 13 | class ActionInferenceInterface(object): 14 | def __init__(self, model_path, test_hparams={}, n_gpus=1, first_gpu=0, sess=None): 15 | assert n_gpus == 1, "multi gpu evaluation not yet written" 16 | 
assert first_gpu == 0, "only starts building at gpu0" 17 | 18 | self._test_hparams = self._default_hparams().override_from_dict(test_hparams) 19 | self._model_path = os.path.expanduser(model_path) 20 | 21 | config_path = self._model_path + '/params.yaml' 22 | assert os.path.exists(config_path), 'Config path does not exist!' 23 | 24 | with open(config_path) as config: 25 | params = yaml.load(config, Loader=yaml.SafeLoader) 26 | self._model_hparams = params['model'] 27 | self._input_hparams = params['dataset'] 28 | 29 | # ensure vgg weights are restored correctly (a bit hacky for now) 30 | self._model_hparams['vgg_path'] = os.path.expanduser(self._test_hparams.vgg_path) 31 | 32 | print('\n\n------------------------------------ LOADED PARAMS ------------------------------------') 33 | for k, v in self._model_hparams.items(): 34 | print('{} --> {}'.format(k, v)) 35 | for k, v in self._input_hparams.items(): 36 | print('{} --> {}'.format(k, v)) 37 | print('---------------------------------------------------------------------------------------\n\n') 38 | 39 | InverseModel = get_models(self._model_hparams.pop('model')) 40 | self._model = model = InverseModel(self._input_hparams, n_gpus, self._model_hparams['graph_type'], False, self._model_hparams.pop('scope_name')) 41 | inputs, targets = self._build_input_targets() 42 | self._pred_act= model.model_fn(inputs, targets, tf.estimator.ModeKeys.PREDICT, self._model_hparams) 43 | 44 | self._sess = sess 45 | self._restored = False 46 | 47 | def _default_hparams(self): 48 | default_dict = { 49 | "run_batch_size": 1, 50 | "vgg_path": "~/" # vgg19.npy should be in vgg_path folder (aka vgg_path = /path/to/folder/containing/weights/) 51 | } 52 | return HParams(**default_dict) 53 | 54 | def _build_input_targets(self): 55 | n_context = self._model_hparams.get('context_actions', 0) 56 | height, width = self._input_hparams['img_size'] 57 | self._images_pl = tf.placeholder(tf.float32, [self._test_hparams.run_batch_size, 2 + n_context, height, width, 3]) 58 | pl_dict = {'adim': self._input_hparams['target_adim'], 'T': self._input_hparams['load_T'] - 1, 'images': self._images_pl} 59 | 60 | if n_context: 61 | self._context_pl = tf.placeholder(tf.float32, [self._test_hparams.run_batch_size, self._model_hparams['context_actions'], 62 | self._input_hparams['target_adim']]) 63 | pl_dict['context_actions'] = self._context_pl 64 | 65 | return pl_dict, {} 66 | 67 | def predict(self, start_image, goal_image, context_actions=None, context_frames=None): 68 | assert self._restored 69 | start_goal_image = np.concatenate((start_image[None, None], goal_image[None, None]), axis=1) 70 | fd = {self._images_pl: start_goal_image} 71 | if self._model_hparams.get('context_actions', 0): 72 | fd[self._images_pl] = np.concatenate((context_frames, start_goal_image), axis=1) 73 | fd[self._context_pl] = context_actions 74 | return self._sess.run(self._pred_act, feed_dict=fd) 75 | 76 | def __call__(self, start_image, goal_image, context_actions=None, context_frames=None): 77 | return self.predict(start_image, goal_image, context_actions, context_frames) 78 | 79 | def set_session(self, sess): 80 | self._sess = sess 81 | 82 | def restore(self): 83 | if self._sess is None: 84 | self._sess = tf.Session() 85 | self._sess.run(tf.global_variables_initializer()) 86 | 87 | model_paths = glob.glob('{}/model-*'.format(self._model_path)) 88 | max_model = max([int(m.split('.')[0].split('-')[-1]) for m in model_paths]) 89 | restore_path = os.path.join(self._model_path, 'model-' + str(max_model)) 90 | 
print('restoring', restore_path) 91 | 92 | checkpoints = [restore_path] 93 | # automatically skip global_step if more than one checkpoint is provided 94 | skip_global_step = len(checkpoints) > 1 95 | savers = [] 96 | for checkpoint in checkpoints: 97 | print("creating restore saver from checkpoint %s" % checkpoint) 98 | saver, _ = tf_utils.get_checkpoint_restore_saver(checkpoint, skip_global_step=skip_global_step) 99 | savers.append(saver) 100 | restore_op = [saver.saver_def.restore_op_name for saver in savers] 101 | self._sess.run(restore_op) 102 | self._restored = True 103 | 104 | @property 105 | def horizon(self): 106 | return self._input_hparams['load_T'] - 1 107 | 108 | @property 109 | def context_actions(self): 110 | return self._model_hparams.get('context_actions', 0) 111 | -------------------------------------------------------------------------------- /robonet/inverse_model/training/__init__.py: -------------------------------------------------------------------------------- 1 | def get_trainable(name): 2 | if name == 'InverseTrainable': 3 | from .inverse_trainable import InverseTrainable 4 | return InverseTrainable 5 | raise NotImplementedError 6 | 7 | -------------------------------------------------------------------------------- /robonet/inverse_model/training/inverse_trainable.py: -------------------------------------------------------------------------------- 1 | from robonet.video_prediction.training.trainable_interface import VPredTrainable 2 | from robonet.inverse_model.models import get_models 3 | import time 4 | from tensorflow.contrib.training import HParams 5 | from robonet.datasets.util.tensor_multiplexer import MultiplexedTensors 6 | 7 | 8 | class InverseTrainable(VPredTrainable): 9 | def _get_model_class(self, model_name): 10 | return get_models(model_name) 11 | 12 | def _default_hparams(self): 13 | default_dict = { 14 | 'batch_size': 16, 15 | 'restore_dir': '', 16 | 'n_gpus': 1, 17 | 'scalar_summary_freq': 100, 18 | 'train_fraction': 0.9, 19 | 'val_fraction': 0.05, 20 | 'max_to_keep': 3, 21 | 'max_steps': 300000, 22 | 'tf_log_flush_freq': 500 23 | } 24 | return HParams(**default_dict) 25 | 26 | def _get_input_targets(self, DatasetClass, metadata, dataset_hparams): 27 | data_loader = DatasetClass(self._hparams.batch_size, metadata, dataset_hparams) 28 | 29 | tensor_names = ['actions', 'images', 'states'] 30 | if 'annotations' in data_loader: 31 | tensor_names = ['actions', 'images', 'states', 'annotations'] 32 | 33 | self._tensor_multiplexer = MultiplexedTensors(data_loader, tensor_names) 34 | loaded_tensors = [self._tensor_multiplexer[k] for k in tensor_names] 35 | 36 | self._real_annotations = None 37 | assert loaded_tensors[1].get_shape().as_list()[2] == 1, "loader assumes one (potentially random) camera will be loaded in each example!" 
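    # shape note: the loader emits images as (batch, time, n_cams, height, width, channels);
    # the assert above pins n_cams to 1, and the slice on the next line drops that camera axis
    # so the model sees (batch, time, height, width, channels).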
38 | self._real_images = loaded_tensors[1] = loaded_tensors[1][:, :, 0] # grab cam 0 for images 39 | if 'annotations' in data_loader: 40 | self._real_annotations = loaded_tensors[3] = loaded_tensors[3][:, :, 0] # grab cam 0 for annotations 41 | 42 | inputs, targets = {}, {'actions': loaded_tensors[0]} 43 | for k, v in zip(tensor_names[1:], loaded_tensors[1:]): 44 | inputs[k] = v 45 | 46 | self._data_loader = data_loader 47 | return inputs, targets 48 | 49 | def _train(self): 50 | itr = self.iteration 51 | 52 | # no need to increment itr since global step is incremented by train_op 53 | loss, train_op = self._estimator.loss, self._estimator.train_op 54 | fetches = {'global_step': itr} 55 | 56 | start = time.time() 57 | train_loss = self.sess.run([loss, train_op], feed_dict=self._tensor_multiplexer.get_feed_dict('train'))[0] 58 | fetches['metric/step_time'] = time.time() - start 59 | fetches['metric/loss/train'] = train_loss 60 | 61 | if itr % self._hparams.scalar_summary_freq == 0: 62 | fetches['metric/loss/val'] = self.sess.run(loss, feed_dict=self._tensor_multiplexer.get_feed_dict('val')) 63 | for name in ['train', 'val']: 64 | metrics = self.sess.run(self._scalar_metrics, feed_dict=self._tensor_multiplexer.get_feed_dict(name)) 65 | for key, value in metrics.items(): 66 | fetches['metric/{}/{}'.format(key, name)] = value 67 | 68 | fetches['done'] = itr >= self._hparams.max_steps 69 | 70 | self._tf_log(fetches) 71 | 72 | return fetches 73 | -------------------------------------------------------------------------------- /robonet/video_prediction/__init__.py: -------------------------------------------------------------------------------- 1 | from . import losses 2 | from . import metrics 3 | from . import ops 4 | -------------------------------------------------------------------------------- /robonet/video_prediction/flow_ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def image_warp(im, flow): 5 | """Performs a backward warp of an image using the predicted flow. 6 | 7 | Args: 8 | im: Batch of images. [num_batch, height, width, channels] 9 | flow: Batch of flow vectors. [num_batch, height, width, 2] 10 | Returns: 11 | warped: transformed image of the same shape as the input image. 12 | 13 | Implementation taken from here: https://github.com/simonmeister/UnFlow 14 | 15 | maybe swap to # tf.contrib.image.dense_image_warp 16 | """ 17 | with tf.variable_scope('image_warp'): 18 | 19 | num_batch, height, width, channels = tf.unstack(tf.shape(im)) 20 | max_x = tf.cast(width - 1, 'int32') 21 | max_y = tf.cast(height - 1, 'int32') 22 | zero = tf.zeros([], dtype='int32') 23 | 24 | # We have to flatten our tensors to vectorize the interpolation 25 | im_flat = tf.reshape(im, [-1, channels]) 26 | flow_flat = tf.reshape(flow, [-1, 2]) 27 | 28 | # Floor the flow, as the final indices are integers 29 | # The fractional part is used to control the bilinear interpolation. 
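        # Worked example of the corner weights computed below: a flow of (2.3, -0.7) floors to (2, -1)
        # with fractional parts xw = 0.3, yw = 0.3, so wa = 0.7 * 0.7 = 0.49, wb = 0.7 * 0.3 = 0.21,
        # wc = 0.3 * 0.7 = 0.21 and wd = 0.3 * 0.3 = 0.09; the weights always sum to 1, making the
        # warped value a convex combination of the four neighbouring pixels.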
30 | flow_floor = tf.to_int32(tf.floor(flow_flat)) 31 | bilinear_weights = flow_flat - tf.floor(flow_flat) 32 | 33 | # Construct base indices which are displaced with the flow 34 | pos_x = tf.tile(tf.range(width), [height * num_batch]) 35 | grid_y = tf.tile(tf.expand_dims(tf.range(height), 1), [1, width]) 36 | pos_y = tf.tile(tf.reshape(grid_y, [-1]), [num_batch]) 37 | 38 | x = flow_floor[:, 0] 39 | y = flow_floor[:, 1] 40 | xw = bilinear_weights[:, 0] 41 | yw = bilinear_weights[:, 1] 42 | 43 | # Compute interpolation weights for 4 adjacent pixels 44 | # expand to num_batch * height * width x 1 for broadcasting in add_n below 45 | wa = tf.expand_dims((1 - xw) * (1 - yw), 1) # top left pixel 46 | wb = tf.expand_dims((1 - xw) * yw, 1) # bottom left pixel 47 | wc = tf.expand_dims(xw * (1 - yw), 1) # top right pixel 48 | wd = tf.expand_dims(xw * yw, 1) # bottom right pixel 49 | 50 | x0 = pos_x + x 51 | x1 = x0 + 1 52 | y0 = pos_y + y 53 | y1 = y0 + 1 54 | 55 | x0 = tf.clip_by_value(x0, zero, max_x) 56 | x1 = tf.clip_by_value(x1, zero, max_x) 57 | y0 = tf.clip_by_value(y0, zero, max_y) 58 | y1 = tf.clip_by_value(y1, zero, max_y) 59 | 60 | dim1 = width * height 61 | batch_offsets = tf.range(num_batch) * dim1 62 | base_grid = tf.tile(tf.expand_dims(batch_offsets, 1), [1, dim1]) 63 | base = tf.reshape(base_grid, [-1]) 64 | 65 | base_y0 = base + y0 * width 66 | base_y1 = base + y1 * width 67 | idx_a = base_y0 + x0 68 | idx_b = base_y1 + x0 69 | idx_c = base_y0 + x1 70 | idx_d = base_y1 + x1 71 | 72 | Ia = tf.gather(im_flat, idx_a) 73 | Ib = tf.gather(im_flat, idx_b) 74 | Ic = tf.gather(im_flat, idx_c) 75 | Id = tf.gather(im_flat, idx_d) 76 | 77 | warped_flat = tf.add_n([wa * Ia, wb * Ib, wc * Ic, wd * Id]) 78 | warped = tf.reshape(warped_flat, [num_batch, height, width, channels]) 79 | warped.set_shape(im.shape) 80 | 81 | return warped 82 | -------------------------------------------------------------------------------- /robonet/video_prediction/functional_ops.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.eager import context 2 | from tensorflow.python.framework import constant_op 3 | from tensorflow.python.framework import ops 4 | from tensorflow.python.ops import array_ops 5 | from tensorflow.python.ops import control_flow_ops 6 | from tensorflow.python.ops import tensor_array_ops 7 | from tensorflow.python.ops import variable_scope as vs 8 | from tensorflow.python.util import nest 9 | 10 | 11 | def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True, 12 | swap_memory=False, name=None): 13 | """ 14 | Same as tf.foldl but with support for a possibly nested sequence of tensors. 
15 | """ 16 | if not callable(fn): 17 | raise TypeError("fn must be callable.") 18 | 19 | input_is_sequence = nest.is_sequence(elems) 20 | input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x] 21 | def input_pack(x): 22 | return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0] 23 | 24 | if initializer is None: 25 | output_is_sequence = input_is_sequence 26 | output_flatten = input_flatten 27 | output_pack = input_pack 28 | else: 29 | output_is_sequence = nest.is_sequence(initializer) 30 | output_flatten = lambda x: nest.flatten(x) if output_is_sequence else [x] 31 | def output_pack(x): 32 | return (nest.pack_sequence_as(initializer, x) 33 | if output_is_sequence else x[0]) 34 | 35 | elems_flat = input_flatten(elems) 36 | 37 | in_graph_mode = context.in_graph_mode() 38 | with ops.name_scope(name, "foldl", [elems]): 39 | # TODO(akshayka): Remove the in_graph_mode check once caching devices are 40 | # supported in Eager 41 | if in_graph_mode: 42 | # Any get_variable calls in fn will cache the first call locally 43 | # and not issue repeated network I/O requests for each iteration. 44 | varscope = vs.get_variable_scope() 45 | varscope_caching_device_was_none = False 46 | if varscope.caching_device is None: 47 | # TODO(ebrevdo): Change to using colocate_with here and in other 48 | # methods. 49 | varscope.set_caching_device(lambda op: op.device) 50 | varscope_caching_device_was_none = True 51 | 52 | # Convert elems to tensor array. 53 | elems_flat = [ 54 | ops.convert_to_tensor(elem, name="elem") for elem in elems_flat] 55 | 56 | n = array_ops.shape(elems_flat[0])[0] 57 | 58 | # TensorArrays are always flat 59 | elems_ta = [ 60 | tensor_array_ops.TensorArray(dtype=elem.dtype, size=n, 61 | dynamic_size=False, 62 | infer_shape=True) 63 | for elem in elems_flat] 64 | # Unpack elements 65 | elems_ta = [ 66 | elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)] 67 | 68 | if initializer is None: 69 | a_flat = [elem.read(0) for elem in elems_ta] 70 | i = constant_op.constant(1) 71 | else: 72 | initializer_flat = output_flatten(initializer) 73 | a_flat = [ops.convert_to_tensor(init) for init in initializer_flat] 74 | i = constant_op.constant(0) 75 | 76 | def compute(i, a_flat): 77 | packed_elems = input_pack([elem_ta.read(i) for elem_ta in elems_ta]) 78 | packed_a = output_pack(a_flat) 79 | a_out = fn(packed_a, packed_elems) 80 | nest.assert_same_structure( 81 | elems if initializer is None else initializer, a_out) 82 | flat_a_out = output_flatten(a_out) 83 | return (i + 1, flat_a_out) 84 | 85 | _, r_a = control_flow_ops.while_loop( 86 | lambda i, a: i < n, compute, (i, a_flat), 87 | parallel_iterations=parallel_iterations, 88 | back_prop=back_prop, 89 | swap_memory=swap_memory) 90 | 91 | # TODO(akshayka): Remove the in_graph_mode check once caching devices are 92 | # supported in Eager 93 | if in_graph_mode and varscope_caching_device_was_none: 94 | varscope.set_caching_device(None) 95 | 96 | return output_pack(r_a) 97 | -------------------------------------------------------------------------------- /robonet/video_prediction/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalization import fused_instance_norm 2 | -------------------------------------------------------------------------------- /robonet/video_prediction/layers/encoder_layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from robonet.video_prediction.ops 
import lrelu, dense, conv2d, pool2d, get_norm_layer 3 | 4 | 5 | def create_n_layer_encoder(inputs, 6 | nz=8, 7 | nef=64, 8 | n_layers=3, 9 | norm_layer='instance', 10 | stochastic=True): 11 | norm_layer = get_norm_layer(norm_layer) 12 | layers = [] 13 | paddings = [[0, 0], [1, 1], [1, 1], [0, 0]] 14 | 15 | with tf.variable_scope("layer_1"): 16 | convolved = conv2d(tf.pad(inputs, paddings), nef, kernel_size=4, strides=2, padding='VALID') 17 | rectified = lrelu(convolved, 0.2) 18 | layers.append(rectified) 19 | 20 | for i in range(1, n_layers): 21 | with tf.variable_scope("layer_%d" % (len(layers) + 1)): 22 | out_channels = nef * min(2**i, 4) 23 | convolved = conv2d(tf.pad(layers[-1], paddings), out_channels, kernel_size=4, strides=2, padding='VALID') 24 | normalized = norm_layer(convolved) 25 | rectified = lrelu(normalized, 0.2) 26 | layers.append(rectified) 27 | 28 | pooled = pool2d(rectified, rectified.shape[1:3].as_list(), padding='VALID', pool_mode='avg') 29 | squeezed = tf.squeeze(pooled, [1, 2]) 30 | 31 | if stochastic: 32 | with tf.variable_scope('z_mu'): 33 | z_mu = dense(squeezed, nz) 34 | with tf.variable_scope('z_log_sigma_sq'): 35 | z_log_sigma_sq = dense(squeezed, nz) 36 | z_log_sigma_sq = tf.clip_by_value(z_log_sigma_sq, -10, 10) 37 | outputs = {'enc_zs_mu': z_mu, 'enc_zs_log_sigma_sq': z_log_sigma_sq} 38 | else: 39 | outputs = squeezed 40 | return outputs -------------------------------------------------------------------------------- /robonet/video_prediction/layers/normalization.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================= 15 | """Contains the normalization layer classes and their functional aliases.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | 21 | from tensorflow.contrib.framework.python.ops import variables 22 | from tensorflow.contrib.layers.python.layers import utils 23 | from tensorflow.python.framework import ops 24 | from tensorflow.python.ops import array_ops 25 | from tensorflow.python.ops import init_ops 26 | from tensorflow.python.ops import nn 27 | from tensorflow.python.ops import variable_scope 28 | 29 | 30 | DATA_FORMAT_NCHW = 'NCHW' 31 | DATA_FORMAT_NHWC = 'NHWC' 32 | 33 | 34 | def fused_instance_norm(inputs, 35 | center=True, 36 | scale=True, 37 | epsilon=1e-6, 38 | activation_fn=None, 39 | param_initializers=None, 40 | reuse=None, 41 | variables_collections=None, 42 | outputs_collections=None, 43 | trainable=True, 44 | data_format=DATA_FORMAT_NHWC, 45 | scope=None): 46 | """Functional interface for the instance normalization layer. 47 | 48 | Reference: https://arxiv.org/abs/1607.08022. 
49 | 50 | "Instance Normalization: The Missing Ingredient for Fast Stylization" 51 | Dmitry Ulyanov, Andrea Vedaldi, Victor Lempitsky 52 | 53 | Args: 54 | inputs: A tensor with 2 or more dimensions, where the first dimension has 55 | `batch_size`. The normalization is over all but the last dimension if 56 | `data_format` is `NHWC` and the second dimension if `data_format` is 57 | `NCHW`. 58 | center: If True, add offset of `beta` to normalized tensor. If False, `beta` 59 | is ignored. 60 | scale: If True, multiply by `gamma`. If False, `gamma` is 61 | not used. When the next layer is linear (also e.g. `nn.relu`), this can be 62 | disabled since the scaling can be done by the next layer. 63 | epsilon: Small float added to variance to avoid dividing by zero. 64 | activation_fn: Activation function, default set to None to skip it and 65 | maintain a linear activation. 66 | param_initializers: Optional initializers for beta, gamma, moving mean and 67 | moving variance. 68 | reuse: Whether or not the layer and its variables should be reused. To be 69 | able to reuse the layer scope must be given. 70 | variables_collections: Optional collections for the variables. 71 | outputs_collections: Collections to add the outputs. 72 | trainable: If `True` also add variables to the graph collection 73 | `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). 74 | data_format: A string. `NHWC` (default) and `NCHW` are supported. 75 | scope: Optional scope for `variable_scope`. 76 | 77 | Returns: 78 | A `Tensor` representing the output of the operation. 79 | 80 | Raises: 81 | ValueError: If `data_format` is neither `NHWC` nor `NCHW`. 82 | ValueError: If the rank of `inputs` is undefined. 83 | ValueError: If rank or channels dimension of `inputs` is undefined. 84 | """ 85 | inputs = ops.convert_to_tensor(inputs) 86 | inputs_shape = inputs.shape 87 | inputs_rank = inputs.shape.ndims 88 | 89 | if inputs_rank is None: 90 | raise ValueError('Inputs %s has undefined rank.' % inputs.name) 91 | if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC): 92 | raise ValueError('data_format has to be either NCHW or NHWC.') 93 | 94 | with variable_scope.variable_scope( 95 | scope, 'InstanceNorm', [inputs], reuse=reuse) as sc: 96 | if data_format == DATA_FORMAT_NCHW: 97 | reduction_axis = 1 98 | # For NCHW format, rather than relying on implicit broadcasting, we 99 | # explicitly reshape the params to params_shape_broadcast when computing 100 | # the moments and the batch normalization. 101 | params_shape_broadcast = list( 102 | [1, inputs_shape[1].value] + [1 for _ in range(2, inputs_rank)]) 103 | else: 104 | reduction_axis = inputs_rank - 1 105 | params_shape_broadcast = None 106 | moments_axes = list(range(inputs_rank)) 107 | del moments_axes[reduction_axis] 108 | del moments_axes[0] 109 | params_shape = inputs_shape[reduction_axis:reduction_axis + 1] 110 | if not params_shape.is_fully_defined(): 111 | raise ValueError('Inputs %s has undefined channels dimension %s.' % ( 112 | inputs.name, params_shape)) 113 | 114 | # Allocate parameters for the beta and gamma of the normalization. 
115 | beta, gamma = None, None 116 | dtype = inputs.dtype.base_dtype 117 | if param_initializers is None: 118 | param_initializers = {} 119 | if center: 120 | beta_collections = utils.get_variable_collections( 121 | variables_collections, 'beta') 122 | beta_initializer = param_initializers.get( 123 | 'beta', init_ops.zeros_initializer()) 124 | beta = variables.model_variable('beta', 125 | shape=params_shape, 126 | dtype=dtype, 127 | initializer=beta_initializer, 128 | collections=beta_collections, 129 | trainable=trainable) 130 | if params_shape_broadcast: 131 | beta = array_ops.reshape(beta, params_shape_broadcast) 132 | if scale: 133 | gamma_collections = utils.get_variable_collections( 134 | variables_collections, 'gamma') 135 | gamma_initializer = param_initializers.get( 136 | 'gamma', init_ops.ones_initializer()) 137 | gamma = variables.model_variable('gamma', 138 | shape=params_shape, 139 | dtype=dtype, 140 | initializer=gamma_initializer, 141 | collections=gamma_collections, 142 | trainable=trainable) 143 | if params_shape_broadcast: 144 | gamma = array_ops.reshape(gamma, params_shape_broadcast) 145 | 146 | if data_format == DATA_FORMAT_NHWC: 147 | inputs = array_ops.transpose(inputs, list(range(1, reduction_axis)) + [0, reduction_axis]) 148 | if data_format == DATA_FORMAT_NCHW: 149 | inputs = array_ops.transpose(inputs, list(range(2, inputs_rank)) + [0, reduction_axis]) 150 | hw, n, c = inputs.shape.as_list()[:-2], inputs.shape[-2].value, inputs.shape[-1].value 151 | inputs = array_ops.reshape(inputs, [1] + hw + [n * c]) 152 | if inputs.shape.ndims != 4: 153 | # combine all the spatial dimensions into only two, e.g. [D, H, W] -> [DH, W] 154 | if inputs.shape.ndims > 4: 155 | inputs_ndims4_shape = [1, hw[0], -1, n * c] 156 | else: 157 | inputs_ndims4_shape = [1, 1, -1, n * c] 158 | inputs = array_ops.reshape(inputs, inputs_ndims4_shape) 159 | beta = array_ops.reshape(array_ops.tile(beta[None, :], [n, 1]), [-1]) 160 | gamma = array_ops.reshape(array_ops.tile(gamma[None, :], [n, 1]), [-1]) 161 | 162 | outputs, _, _ = nn.fused_batch_norm( 163 | inputs, gamma, beta, epsilon=epsilon, 164 | data_format=DATA_FORMAT_NHWC, name='instancenorm') 165 | 166 | outputs = array_ops.reshape(outputs, hw + [n, c]) 167 | if data_format == DATA_FORMAT_NHWC: 168 | outputs = array_ops.transpose(outputs, [inputs_rank - 2] + list(range(inputs_rank - 2)) + [inputs_rank - 1]) 169 | if data_format == DATA_FORMAT_NCHW: 170 | outputs = array_ops.transpose(outputs, [inputs_rank - 2, inputs_rank - 1] + list(range(inputs_rank - 2))) 171 | 172 | # if data_format == DATA_FORMAT_NHWC: 173 | # inputs = array_ops.transpose(inputs, [0, reduction_axis] + list(range(1, reduction_axis))) 174 | # inputs_nchw_shape = inputs.shape 175 | # inputs = array_ops.reshape(inputs, [1, -1] + inputs_nchw_shape.as_list()[2:]) 176 | # if inputs.shape.ndims != 4: 177 | # # combine all the spatial dimensions into only two, e.g. 
[D, H, W] -> [DH, W] 178 | # if inputs.shape.ndims > 4: 179 | # inputs_ndims4_shape = inputs.shape.as_list()[:2] + [-1, inputs_nchw_shape.as_list()[-1]] 180 | # else: 181 | # inputs_ndims4_shape = inputs.shape.as_list()[:2] + [1, -1] 182 | # inputs = array_ops.reshape(inputs, inputs_ndims4_shape) 183 | # beta = array_ops.reshape(array_ops.tile(beta[None, :], [inputs_nchw_shape[0].value, 1]), [-1]) 184 | # gamma = array_ops.reshape(array_ops.tile(gamma[None, :], [inputs_nchw_shape[0].value, 1]), [-1]) 185 | # 186 | # outputs, _, _ = nn.fused_batch_norm( 187 | # inputs, gamma, beta, epsilon=epsilon, 188 | # data_format=DATA_FORMAT_NCHW, name='instancenorm') 189 | # 190 | # outputs = array_ops.reshape(outputs, inputs_nchw_shape) 191 | # if data_format == DATA_FORMAT_NHWC: 192 | # outputs = array_ops.transpose(outputs, [0] + list(range(2, inputs_rank)) + [1]) 193 | 194 | if activation_fn is not None: 195 | outputs = activation_fn(outputs) 196 | return utils.collect_named_outputs(outputs_collections, sc.name, outputs) 197 | -------------------------------------------------------------------------------- /robonet/video_prediction/layers/vgg_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def vgg_assign_from_values_fn(model='vgg16', 6 | var_name_prefix='vgg/', 7 | var_name_kernel_postfix='/kernel:0', 8 | var_name_bias_postfix='/bias:0'): 9 | if model not in ('vgg16', 'vgg19'): 10 | raise ValueError('Invalid model %s' % model) 11 | import h5py 12 | WEIGHTS_PATH_NO_TOP = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.1/' \ 13 | '%s_weights_tf_dim_ordering_tf_kernels_notop.h5' % model 14 | weights_path = tf.keras.utils.get_file( 15 | '%s_weights_tf_dim_ordering_tf_kernels_notop.h5' % model, 16 | WEIGHTS_PATH_NO_TOP, 17 | cache_subdir='models') 18 | weights_file = h5py.File(weights_path, 'r') 19 | 20 | num_blocks = 5 21 | max_num_convs_in_block = 3 if model == 'vgg16' else 4 22 | 23 | weight_name_kernel_postfix = '_W_1:0' 24 | weight_name_bias_postfix = '_b_1:0' 25 | var_names_to_values = {} 26 | for block_id in range(num_blocks): 27 | for conv_id in range(max_num_convs_in_block): 28 | if block_id < 2 and conv_id >= 2: 29 | continue 30 | name = 'block%d_conv%d' % (block_id + 1, conv_id + 1) 31 | var_names_to_values[var_name_prefix + name + var_name_kernel_postfix] = \ 32 | weights_file[name][name + weight_name_kernel_postfix][()] 33 | var_names_to_values[var_name_prefix + name + var_name_bias_postfix] = \ 34 | weights_file[name][name + weight_name_bias_postfix][()] 35 | return tf.contrib.framework.assign_from_values_fn(var_names_to_values) 36 | 37 | 38 | def vgg16(rgb_image): 39 | """ 40 | rgb_image: 4-D tensor with pixel intensities between 0 and 1. 
41 | """ 42 | bgr_mean = np.array([103.939, 116.779, 123.68], np.float32) 43 | rgb_scaled_image = rgb_image * 255.0 44 | bgr_scaled_image = rgb_scaled_image[:, :, :, ::-1] 45 | bgr_centered_image = bgr_scaled_image - tf.convert_to_tensor(bgr_mean) 46 | 47 | x = bgr_centered_image 48 | tensors = [x] 49 | features = [] 50 | 51 | # Block1 52 | x = tf.layers.conv2d(x, 64, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block1_conv1') 53 | tensors.append(x) 54 | x = tf.layers.conv2d(x, 64, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block1_conv2') 55 | tensors.append(x) 56 | features.append(x) 57 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block1_pool') 58 | tensors.append(x) 59 | 60 | # Block2 61 | x = tf.layers.conv2d(x, 128, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block2_conv1') 62 | tensors.append(x) 63 | x = tf.layers.conv2d(x, 128, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block2_conv2') 64 | tensors.append(x) 65 | features.append(x) 66 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block2_pool') 67 | tensors.append(x) 68 | 69 | # Block3 70 | x = tf.layers.conv2d(x, 256, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block3_conv1') 71 | tensors.append(x) 72 | x = tf.layers.conv2d(x, 256, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block3_conv2') 73 | tensors.append(x) 74 | x = tf.layers.conv2d(x, 256, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block3_conv3') 75 | tensors.append(x) 76 | features.append(x) 77 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block3_pool') 78 | tensors.append(x) 79 | 80 | # Block4 81 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block4_conv1') 82 | tensors.append(x) 83 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block4_conv2') 84 | tensors.append(x) 85 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block4_conv3') 86 | tensors.append(x) 87 | features.append(x) 88 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block4_pool') 89 | tensors.append(x) 90 | 91 | # Block5 92 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block5_conv1') 93 | tensors.append(x) 94 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block5_conv2') 95 | tensors.append(x) 96 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block5_conv3') 97 | tensors.append(x) 98 | features.append(x) 99 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block5_pool') 100 | tensors.append(x) 101 | 102 | return tensors, features 103 | 104 | 105 | def vgg19(rgb_image): 106 | """ 107 | rgb_image: 4-D tensor with pixel intensities between 0 and 1. 
108 | """ 109 | bgr_mean = np.array([103.939, 116.779, 123.68], np.float32) 110 | rgb_scaled_image = rgb_image * 255.0 111 | bgr_scaled_image = rgb_scaled_image[:, :, :, ::-1] 112 | bgr_centered_image = bgr_scaled_image - tf.convert_to_tensor(bgr_mean) 113 | 114 | x = bgr_centered_image 115 | tensors = [x] 116 | features = [] 117 | 118 | # Block1 119 | x = tf.layers.conv2d(x, 64, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block1_conv1') 120 | tensors.append(x) 121 | x = tf.layers.conv2d(x, 64, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block1_conv2') 122 | tensors.append(x) 123 | features.append(x) 124 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block1_pool') 125 | tensors.append(x) 126 | 127 | # Block2 128 | x = tf.layers.conv2d(x, 128, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block2_conv1') 129 | tensors.append(x) 130 | x = tf.layers.conv2d(x, 128, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block2_conv2') 131 | tensors.append(x) 132 | features.append(x) 133 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block2_pool') 134 | tensors.append(x) 135 | 136 | # Block3 137 | x = tf.layers.conv2d(x, 256, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block3_conv1') 138 | tensors.append(x) 139 | x = tf.layers.conv2d(x, 256, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block3_conv2') 140 | tensors.append(x) 141 | x = tf.layers.conv2d(x, 256, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block3_conv3') 142 | tensors.append(x) 143 | x = tf.layers.conv2d(x, 256, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block3_conv4') 144 | tensors.append(x) 145 | features.append(x) 146 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block3_pool') 147 | tensors.append(x) 148 | 149 | # Block4 150 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block4_conv1') 151 | tensors.append(x) 152 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block4_conv2') 153 | tensors.append(x) 154 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block4_conv3') 155 | tensors.append(x) 156 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block4_conv4') 157 | tensors.append(x) 158 | features.append(x) 159 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block4_pool') 160 | tensors.append(x) 161 | 162 | # Block5 163 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block5_conv1') 164 | tensors.append(x) 165 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block5_conv2') 166 | tensors.append(x) 167 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block5_conv3') 168 | tensors.append(x) 169 | x = tf.layers.conv2d(x, 512, (3, 3), padding='same', activation=tf.nn.relu, trainable=False, name='block5_conv4') 170 | tensors.append(x) 171 | features.append(x) 172 | x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='same', name='block5_pool') 173 | tensors.append(x) 174 | 175 | return tensors, features 176 | -------------------------------------------------------------------------------- 
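A hedged usage sketch for the two helpers above (the 'vgg' scope name, placeholder shape, and session handling are illustrative assumptions, not repo API): build the frozen VGG16 tower on images scaled to [0, 1], then copy the downloaded Keras weights into the graph before using the features.

    import tensorflow as tf
    from robonet.video_prediction.layers.vgg_network import vgg16, vgg_assign_from_values_fn

    images = tf.placeholder(tf.float32, [8, 48, 64, 3])        # RGB frames in [0, 1]; shape is an example
    with tf.variable_scope('vgg'):                              # must match var_name_prefix='vgg/'
        _, block_features = vgg16(images)                       # one feature map per conv block
    init_vgg_fn = vgg_assign_from_values_fn(model='vgg16')      # callable that assigns the pretrained weights

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        init_vgg_fn(sess)                                        # overwrite the frozen conv kernels/biases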
/robonet/video_prediction/losses.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from robonet.video_prediction.ops import sigmoid_kl_with_logits 4 | 5 | 6 | def l1_loss(pred, target): 7 | return tf.reduce_mean(tf.abs(target - pred)) 8 | 9 | 10 | def l2_loss(pred, target): 11 | return tf.reduce_mean(tf.square(target - pred)) 12 | 13 | 14 | def gan_loss(logits, labels, gan_loss_type): 15 | # use 1.0 (or 1.0 - discrim_label_smooth) for real data and 0.0 for fake data 16 | if gan_loss_type == 'GAN': 17 | # discrim_loss = tf.reduce_mean(-(tf.log(predict_real + EPS) + tf.log(1 - predict_fake + EPS))) 18 | # gen_loss = tf.reduce_mean(-tf.log(predict_fake + EPS)) 19 | if labels in (0.0, 1.0): 20 | labels = tf.constant(labels, dtype=logits.dtype, shape=logits.get_shape()) 21 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)) 22 | else: 23 | loss = tf.reduce_mean(sigmoid_kl_with_logits(logits, labels)) 24 | elif gan_loss_type == 'LSGAN': 25 | # discrim_loss = tf.reduce_mean((tf.square(predict_real - 1) + tf.square(predict_fake))) 26 | # gen_loss = tf.reduce_mean(tf.square(predict_fake - 1)) 27 | loss = tf.reduce_mean(tf.square(logits - labels)) 28 | elif gan_loss_type == 'SNGAN': 29 | # this is the form of the loss used in the official implementation of the SNGAN paper, but it leads to 30 | # worse results in our video prediction experiments 31 | if labels == 0.0: 32 | loss = tf.reduce_mean(tf.nn.softplus(logits)) 33 | elif labels == 1.0: 34 | loss = tf.reduce_mean(tf.nn.softplus(-logits)) 35 | else: 36 | raise NotImplementedError 37 | else: 38 | raise ValueError('Unknown GAN loss type %s' % gan_loss_type) 39 | return loss 40 | 41 | 42 | def kl_loss(mu, log_sigma_sq): 43 | sigma_sq = tf.exp(log_sigma_sq) 44 | return -0.5 * tf.reduce_mean(tf.reduce_sum(1 + log_sigma_sq - tf.square(mu) - sigma_sq, axis=-1)) 45 | -------------------------------------------------------------------------------- /robonet/video_prediction/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .graphs import get_graph_class 2 | 3 | 4 | def get_model(class_name): 5 | if class_name == 'deterministic': 6 | from .deterministic_generator import DeterministicModel 7 | return DeterministicModel 8 | else: 9 | raise NotImplementedError 10 | -------------------------------------------------------------------------------- /robonet/video_prediction/models/base_model.py: -------------------------------------------------------------------------------- 1 | from robonet.video_prediction.models import get_graph_class 2 | from tensorflow.contrib.training import HParams 3 | import itertools 4 | import copy 5 | 6 | 7 | class BaseModel(object): 8 | def __init__(self, data_loader_hparams, num_gpus, graph_type, tpu_mode=False, graph_scope=None): 9 | self._data_hparams = data_loader_hparams 10 | self._num_gpus = num_gpus 11 | self._graph_class = self._get_graph(graph_type) 12 | self._tpu_mode = tpu_mode 13 | if graph_scope is not None: 14 | self._graph_scope = graph_scope 15 | else: 16 | self._graph_scope = self._default_scope() 17 | 18 | def _default_scope(self): 19 | return 'vpred_model' 20 | 21 | def _get_graph(self, graph_type): 22 | return get_graph_class(graph_type) 23 | 24 | def init_default_hparams(self, params): 25 | graph_params = self._graph_class.default_hparams() 26 | model_hparams = self._model_default_hparams() 27 | default_hparams = 
dict(itertools.chain(graph_params.items(), model_hparams.items())) 28 | 29 | params = copy.deepcopy(params) 30 | if self._tpu_mode: 31 | self._summary_dir = params.pop('summary_dir') 32 | self._summary_queue_len = params.pop('summary_queue_len') 33 | self._image_summary_freq = params.pop('image_summary_freq') 34 | 35 | self._use_tpu = params.pop('use_tpu', None) 36 | for k in list(params.keys()): 37 | if k not in default_hparams: 38 | params.pop(k) 39 | print('key {} specified but is not in hparams!'.format(k)) 40 | 41 | self._hparams = HParams(**default_hparams).override_from_dict(params) 42 | self._hparams.use_tpu = self._use_tpu 43 | 44 | def model_fn(self, features, labels, mode, params): 45 | self.init_default_hparams(params) 46 | return self._model_fn(features, labels, mode) 47 | 48 | def _model_default_hparams(self): 49 | raise NotImplementedError 50 | 51 | def _model_fn(self, inputs, targets, mode): 52 | raise NotImplementedError 53 | 54 | @property 55 | def scope_name(self): 56 | return self._graph_scope 57 | 58 | @property 59 | def data_hparams(self): 60 | return copy.deepcopy(self._data_hparams) 61 | 62 | @property 63 | def model_hparams(self): 64 | return copy.deepcopy(self._hparams) -------------------------------------------------------------------------------- /robonet/video_prediction/models/deterministc_embedding_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | TODO: use self._data_hparams instead of hacking batch_size/sub_batch_size into the model_hparams 3 | 4 | Boiled down version of SAVP model from https://github.com/alexlee-gk/video_prediction 5 | """ 6 | import itertools 7 | from robonet.video_prediction.utils import tf_utils 8 | import tensorflow as tf 9 | from robonet.video_prediction.models import get_graph_class 10 | from tensorflow.contrib.training import HParams 11 | import logging 12 | from collections import OrderedDict 13 | from robonet.video_prediction import losses 14 | from robonet.video_prediction.ops import lrelu, dense, pad2d, conv2d, conv_pool2d, flatten, tile_concat, pool2d, get_norm_layer 15 | from tensorflow.python.util import nest 16 | from robonet.video_prediction.layers.encoder_layers import create_n_layer_encoder 17 | 18 | 19 | def onestep_encoder_fn(targets, hparams=None): 20 | image_pairs = tf.concat([targets['images'][:-1], targets['images'][1:]], axis=-1) 21 | 22 | targets = tile_concat([image_pairs, targets['actions'][:-1][:,:, None, None]], axis=-1) 23 | 24 | assert targets.shape.ndims == 5 25 | 26 | batch_shape = targets.shape[:-3].as_list() 27 | targets = flatten(targets, 0, len(batch_shape) - 1) 28 | unflatten = lambda x: tf.reshape(x, batch_shape + x.shape.as_list()[1:]) 29 | outputs = create_n_layer_encoder(targets, stochastic=hparams.stochastic) 30 | return nest.map_structure(unflatten, outputs) 31 | 32 | 33 | def split_model_inference(inputs, targets, params): 34 | """ 35 | we use separate trajectories for the encoder from the ones used for prediction training 36 | :param inputs: dict with tensors in *time-major* 37 | :param targets: dict with tensors in *time-major* 38 | :return: 39 | """ 40 | def split(inputs, bs, sbs): 41 | first_half = {} 42 | second_half = {} 43 | for key, value in inputs.items(): 44 | first_half[key] = [] 45 | second_half[key] = [] 46 | for i in range(bs // sbs): 47 | first_half[key].append(value[:, sbs * i:sbs * i + sbs // 2]) 48 | second_half[key].append(value[:, sbs * i + sbs // 2:sbs * (i + 1)]) 49 | first_half[key] = tf.concat(first_half[key], 1) 50 | 
second_half[key] = tf.concat(second_half[key], 1) 51 | return first_half, second_half 52 | 53 | sbs = params.sub_batch_size 54 | bs = params.batch_size 55 | inputs_train, inputs_inference = split(inputs, bs, sbs) 56 | targets_train, targets_inference = split(targets, bs, sbs) 57 | 58 | return {'train':inputs_train, 'inference':inputs_inference}, \ 59 | {'train':targets_train, 'inference':targets_inference} 60 | 61 | 62 | def average_and_repeat(enc, params, tlen): 63 | """ 64 | :param enc: time, batch, z_dim 65 | :param params: 66 | :param tlen: length of horizon 67 | :return: e in time-major 68 | """ 69 | 70 | enc = tf.reduce_mean(enc, axis=0) # average over time dimension 71 | hsbs = params.sub_batch_size // 2 72 | bs = params.batch_size 73 | e = [] 74 | for i in range(bs // params.sub_batch_size): 75 | averaged = tf.reduce_mean(enc[i*hsbs: (i+1)*hsbs], axis=0) # average over sub-batch dimension 76 | averaged = tf.tile(averaged[None], [hsbs, 1]) # tile across sub-batch 77 | e.append(averaged) 78 | e = tf.concat(e, axis=0) 79 | e = tf.tile(e[None], [tlen, 1, 1]) 80 | return e 81 | 82 | -------------------------------------------------------------------------------- /robonet/video_prediction/models/graphs/__init__.py: -------------------------------------------------------------------------------- 1 | def get_graph_class(class_name): 2 | if class_name == 'c_dna_flow': 3 | from .dnaflow_graph import DNAFlowGraphWrapper 4 | return DNAFlowGraphWrapper 5 | elif class_name == 'deterministic_graph': 6 | from .deterministic_graph import DeterministicWrapper 7 | return DeterministicWrapper 8 | elif class_name == 'vgg_conv': 9 | from .vgg_conv_graph import VGGConvGraph 10 | return VGGConvGraph 11 | else: 12 | raise NotImplementedError 13 | -------------------------------------------------------------------------------- /robonet/video_prediction/models/graphs/base_graph.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class BaseGraph: 5 | def build_graph(self, inputs, hparams, n_gpus=1, scope_name='graph'): 6 | raise NotImplementedError 7 | 8 | @staticmethod 9 | def default_hparams(): 10 | return { 11 | 'sequence_length': 15, 12 | 'context_frames': 2, 13 | 'use_states': False 14 | } 15 | 16 | @property 17 | def vars(self): 18 | return tf.trainable_variables(self._scope_name) 19 | -------------------------------------------------------------------------------- /robonet/video_prediction/models/graphs/deterministic_graph.py: -------------------------------------------------------------------------------- 1 | from .base_graph import BaseGraph 2 | from robonet.video_prediction.layers.dnaflow_rnn_cell import VPredCell 3 | import itertools 4 | import tensorflow as tf 5 | from robonet.video_prediction.utils import tf_utils 6 | 7 | from robonet.video_prediction.layers.deterministic_embedding_rnn_cell import DetVPredCell 8 | import pdb 9 | 10 | class DeterministicWrapper(BaseGraph): 11 | def build_graph(self, mode, inputs, hparams, n_gpus=1, scope_name='dnaflow_generator'): 12 | if hparams.use_states: 13 | assert "states" in inputs, "graph is building with states but no states in inptus" 14 | else: 15 | inputs.pop('states', None) 16 | outputs_enc = inputs.pop('outputs_enc', None) 17 | 18 | self._scope_name = scope_name 19 | with tf.variable_scope(self._scope_name) as graph_scope: 20 | # TODO: I really don't like this. 
Should just error at this point instead of padding 21 | inputs = {name: tf_utils.maybe_pad_or_slice(input, hparams.sequence_length - 1) 22 | for name, input in inputs.items()} 23 | 24 | if outputs_enc is not None: 25 | inputs['e'] = outputs_enc 26 | 27 | cell = DetVPredCell(mode, inputs, hparams) 28 | outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32, 29 | swap_memory=False, time_major=True) 30 | 31 | outputs = {name: output[hparams.context_frames - 1:] for name, output in outputs.items()} 32 | outputs['ground_truth_sampling_mean'] = tf.reduce_mean(tf.to_float(cell.ground_truth[hparams.context_frames:])) 33 | return outputs 34 | 35 | @property 36 | def vars(self): 37 | return tf.trainable_variables(self._scope_name) 38 | 39 | @staticmethod 40 | def default_hparams(): 41 | default_params = { 42 | "where_add": "all", 43 | 'last_frames': 2, 44 | 'num_transformed_images': 4, 45 | 'prev_image_background': True, 46 | 'first_image_background': True, 47 | 'context_images_background': False, 48 | 'generate_scratch_image': False, 49 | 'transformation': "flow", 50 | 'conv_rnn': "lstm", 51 | 'norm_layer': "instance", 52 | 'ablation_conv_rnn_norm': False, 53 | 'downsample_layer': "conv_pool2d", 54 | 'upsample_layer': "upsample_conv2d", 55 | 'dependent_mask': True, 56 | 'c_dna_kernel_size': [5, 5], # only used in CDNA/DNA mode 57 | 58 | 'schedule_sampling': "inverse_sigmoid", 59 | 'schedule_sampling_k': 900.0, 60 | 'schedule_sampling_steps': [0, 100000], 61 | 62 | 'renormalize_pixdistrib': True, 63 | 64 | 'e_dim': None, # gets populated inside in deterministic_embedding_generator.py 65 | 'sub_batch_size': None, # gets poplated from dataset_hparam 66 | 'batch_size': None, # gets poplated from dataset_hparam 67 | 'encoder': None, 68 | 'stochastic': False, 69 | 70 | # params below control size of model 71 | 'ngf': 32, 72 | 'encoder_layer_size_mult': [1, 2, 4], 73 | 'encoder_layer_use_rnn': [True, True, True], 74 | 'decoder_layer_size_mult': [2, 1, 1], 75 | 'decoder_layer_use_rnn': [True, True, False] 76 | } 77 | return dict(itertools.chain(BaseGraph.default_hparams().items(), default_params.items())) 78 | -------------------------------------------------------------------------------- /robonet/video_prediction/models/graphs/dnaflow_graph.py: -------------------------------------------------------------------------------- 1 | from .base_graph import BaseGraph 2 | from robonet.video_prediction.layers.dnaflow_rnn_cell import VPredCell 3 | import itertools 4 | import tensorflow as tf 5 | from robonet.video_prediction.utils import tf_utils 6 | 7 | 8 | class DNAFlowGraphWrapper(BaseGraph): 9 | def build_graph(self, mode, inputs, hparams, n_gpus=1, scope_name='dnaflow_generator'): 10 | if hparams.use_states: 11 | assert "states" in inputs, "graph is building with states but no states in inptus" 12 | else: 13 | inputs.pop('states', None) 14 | 15 | self._scope_name = scope_name 16 | outputs_enc = inputs.pop('outputs_enc', None) 17 | with tf.variable_scope(self._scope_name) as graph_scope: 18 | # TODO: I really don't like this. 
Should just error at this point instead of padding 19 | inputs = {name: tf_utils.maybe_pad_or_slice(input, hparams.sequence_length - 1) 20 | for name, input in inputs.items()} 21 | 22 | if outputs_enc is not None: 23 | inputs['e'] = outputs_enc 24 | 25 | cell = VPredCell(mode, inputs, hparams) 26 | outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=inputs['actions'].dtype, 27 | swap_memory=False, time_major=True) 28 | 29 | outputs = {name: output[hparams.context_frames - 1:] for name, output in outputs.items()} 30 | outputs['ground_truth_sampling_mean'] = tf.reduce_mean(tf.to_float(cell.ground_truth[hparams.context_frames:])) 31 | return outputs 32 | 33 | @staticmethod 34 | def default_hparams(): 35 | default_params = { 36 | "where_add": "all", 37 | 'last_frames': 2, 38 | 'num_transformed_images': 4, 39 | 'prev_image_background': True, 40 | 'first_image_background': True, 41 | 'context_images_background': False, 42 | 'generate_scratch_image': False, 43 | 'transformation': "flow", 44 | 'conv_rnn': "lstm", 45 | 'norm_layer': "instance", 46 | 'ablation_conv_rnn_norm': False, 47 | 'downsample_layer': "conv_pool2d", 48 | 'upsample_layer': "upsample_conv2d", 49 | 'dependent_mask': True, 50 | 'c_dna_kernel_size': [5, 5], # only used in CDNA/DNA mode 51 | 52 | 'schedule_sampling': "inverse_sigmoid", 53 | 'schedule_sampling_k': 900.0, 54 | 'schedule_sampling_steps': [0, 100000], 55 | 56 | 'renormalize_pixdistrib': True, 57 | 58 | # params below control size of model 59 | 'ngf': 32, 60 | 'encoder_layer_size_mult': [1, 2, 4], 61 | 'encoder_layer_use_rnn': [True, True, True], 62 | 'decoder_layer_size_mult': [2, 1, 1], 63 | 'decoder_layer_use_rnn': [True, True, False] 64 | } 65 | return dict(itertools.chain(BaseGraph.default_hparams().items(), default_params.items())) 66 | -------------------------------------------------------------------------------- /robonet/video_prediction/rnn_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Convolutional LSTM implementation.""" 17 | 18 | import tensorflow as tf 19 | from tensorflow.python.framework import dtypes 20 | from tensorflow.python.framework import tensor_shape 21 | from tensorflow.python.ops import array_ops 22 | from tensorflow.python.ops import init_ops 23 | from tensorflow.python.ops import math_ops 24 | from tensorflow.python.ops import nn_ops 25 | from tensorflow.python.ops import rnn_cell_impl 26 | from tensorflow.python.ops import variable_scope as vs 27 | 28 | 29 | class BasicConv2DLSTMCell(rnn_cell_impl.RNNCell): 30 | """2D Convolutional LSTM cell with (optional) normalization and recurrent dropout. 31 | 32 | The implementation is based on: tf.contrib.rnn.LayerNormBasicLSTMCell. 
33 | 34 | It does not allow cell clipping, a projection layer, and does not 35 | use peep-hole connections: it is the basic baseline. 36 | """ 37 | def __init__(self, input_shape, filters, kernel_size, 38 | forget_bias=1.0, activation_fn=math_ops.tanh, 39 | normalizer_fn=None, separate_norms=True, 40 | norm_gain=1.0, norm_shift=0.0, 41 | dropout_keep_prob=1.0, dropout_prob_seed=None, 42 | skip_connection=False, reuse=None): 43 | """Initializes the basic convolutional LSTM cell. 44 | 45 | Args: 46 | input_shape: int tuple, Shape of the input, excluding the batch size. 47 | filters: int, The number of filters of the conv LSTM cell. 48 | kernel_size: int tuple, The kernel size of the conv LSTM cell. 49 | forget_bias: float, The bias added to forget gates (see above). 50 | activation_fn: Activation function of the inner states. 51 | normalizer_fn: If specified, this normalization will be applied before the 52 | internal nonlinearities. 53 | separate_norms: If set to `False`, the normalizer_fn is applied to the 54 | concatenated tensor that follows the convolution, i.e. before splitting 55 | the tensor. This case is slightly faster but it might be functionally 56 | different, depending on the normalizer_fn (it's functionally the same 57 | for instance norm but not for layer norm). Default: `True`. 58 | norm_gain: float, The layer normalization gain initial value. If 59 | `normalizer_fn` is `None`, this argument will be ignored. 60 | norm_shift: float, The layer normalization shift initial value. If 61 | `normalizer_fn` is `None`, this argument will be ignored. 62 | dropout_keep_prob: unit Tensor or float between 0 and 1 representing the 63 | recurrent dropout probability value. If float and 1.0, no dropout will 64 | be applied. 65 | dropout_prob_seed: (optional) integer, the randomness seed. 66 | skip_connection: If set to `True`, concatenate the input to the 67 | output of the conv LSTM. Default: `False`. 68 | reuse: (optional) Python boolean describing whether to reuse variables 69 | in an existing scope. If not `True`, and the existing scope already has 70 | the given variables, an error is raised. 
71 | """ 72 | super(BasicConv2DLSTMCell, self).__init__(_reuse=reuse) 73 | 74 | self._input_shape = input_shape 75 | self._filters = filters 76 | self._kernel_size = list(kernel_size) if isinstance(kernel_size, (tuple, list)) else [kernel_size] * 2 77 | self._forget_bias = forget_bias 78 | self._activation_fn = activation_fn 79 | self._normalizer_fn = normalizer_fn 80 | self._separate_norms = separate_norms 81 | self._g = norm_gain 82 | self._b = norm_shift 83 | self._keep_prob = dropout_keep_prob 84 | self._seed = dropout_prob_seed 85 | self._skip_connection = skip_connection 86 | self._reuse = reuse 87 | 88 | if self._skip_connection: 89 | output_channels = self._filters + self._input_shape[-1] 90 | else: 91 | output_channels = self._filters 92 | cell_size = tensor_shape.TensorShape(self._input_shape[:-1] + [self._filters]) 93 | self._output_size = tensor_shape.TensorShape(self._input_shape[:-1] + [output_channels]) 94 | self._state_size = rnn_cell_impl.LSTMStateTuple(cell_size, self._output_size) 95 | 96 | @property 97 | def output_size(self): 98 | return self._output_size 99 | 100 | @property 101 | def state_size(self): 102 | return self._state_size 103 | 104 | def _norm(self, inputs, scope): 105 | shape = inputs.get_shape()[-1:] 106 | gamma_init = init_ops.constant_initializer(self._g) 107 | beta_init = init_ops.constant_initializer(self._b) 108 | with vs.variable_scope(scope): 109 | # Initialize beta and gamma for use by normalizer. 110 | vs.get_variable("gamma", shape=shape, initializer=gamma_init) 111 | vs.get_variable("beta", shape=shape, initializer=beta_init) 112 | normalized = self._normalizer_fn(inputs, reuse=True, scope=scope) 113 | return normalized 114 | 115 | def _conv2d(self, inputs): 116 | output_filters = 4 * self._filters 117 | input_shape = inputs.get_shape().as_list() 118 | kernel_shape = list(self._kernel_size) + [input_shape[-1], output_filters] 119 | kernel = vs.get_variable("kernel", kernel_shape, dtype=dtypes.float32, 120 | initializer=init_ops.truncated_normal_initializer(stddev=0.02)) 121 | outputs = nn_ops.conv2d(inputs, kernel, [1] * 4, padding='SAME') 122 | if not self._normalizer_fn: 123 | bias = vs.get_variable('bias', [output_filters], dtype=dtypes.float32, 124 | initializer=init_ops.zeros_initializer()) 125 | outputs = nn_ops.bias_add(outputs, bias) 126 | return outputs 127 | 128 | def call(self, inputs, state): 129 | """2D Convolutional LSTM cell with (optional) normalization and recurrent dropout.""" 130 | c, h = state 131 | args = array_ops.concat([inputs, h], -1) 132 | concat = self._conv2d(args) 133 | 134 | if self._normalizer_fn and not self._separate_norms: 135 | concat = self._norm(concat, "input_transform_forget_output") 136 | i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=-1) 137 | if self._normalizer_fn and self._separate_norms: 138 | i = self._norm(i, "input") 139 | j = self._norm(j, "transform") 140 | f = self._norm(f, "forget") 141 | o = self._norm(o, "output") 142 | 143 | g = self._activation_fn(j) 144 | if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1: 145 | g = nn_ops.dropout(g, self._keep_prob, seed=self._seed) 146 | 147 | new_c = (c * math_ops.sigmoid(f + self._forget_bias) 148 | + math_ops.sigmoid(i) * g) 149 | if self._normalizer_fn: 150 | new_c = self._norm(new_c, "state") 151 | new_h = self._activation_fn(new_c) * math_ops.sigmoid(o) 152 | 153 | if self._skip_connection: 154 | new_h = array_ops.concat([new_h, inputs], axis=-1) 155 | 156 | new_state = rnn_cell_impl.LSTMStateTuple(new_c, 
new_h) 157 | return new_h, new_state 158 | 159 | 160 | class Conv2DGRUCell(tf.nn.rnn_cell.RNNCell): 161 | """2D Convolutional GRU cell with (optional) normalization. 162 | 163 | Modified from these: 164 | https://github.com/carlthome/tensorflow-convlstm-cell/blob/master/cell.py 165 | https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/rnn_cell_impl.py 166 | """ 167 | def __init__(self, input_shape, filters, kernel_size, 168 | activation_fn=tf.tanh, 169 | normalizer_fn=None, separate_norms=True, 170 | bias_initializer=None, reuse=None): 171 | super(Conv2DGRUCell, self).__init__(_reuse=reuse) 172 | self._input_shape = input_shape 173 | self._filters = filters 174 | self._kernel_size = list(kernel_size) if isinstance(kernel_size, (tuple, list)) else [kernel_size] * 2 175 | self._activation_fn = activation_fn 176 | self._normalizer_fn = normalizer_fn 177 | self._separate_norms = separate_norms 178 | self._bias_initializer = bias_initializer 179 | self._size = tensor_shape.TensorShape(self._input_shape[:-1] + [self._filters]) 180 | 181 | @property 182 | def state_size(self): 183 | return self._size 184 | 185 | @property 186 | def output_size(self): 187 | return self._size 188 | 189 | def _norm(self, inputs, scope, bias_initializer): 190 | shape = inputs.get_shape()[-1:] 191 | gamma_init = init_ops.ones_initializer() 192 | beta_init = bias_initializer 193 | with vs.variable_scope(scope): 194 | # Initialize beta and gamma for use by normalizer. 195 | vs.get_variable("gamma", shape=shape, initializer=gamma_init) 196 | vs.get_variable("beta", shape=shape, initializer=beta_init) 197 | normalized = self._normalizer_fn(inputs, reuse=True, scope=scope) 198 | return normalized 199 | 200 | def _conv2d(self, inputs, output_filters, bias_initializer): 201 | input_shape = inputs.get_shape().as_list() 202 | kernel_shape = list(self._kernel_size) + [input_shape[-1], output_filters] 203 | kernel = vs.get_variable("kernel", kernel_shape, dtype=dtypes.float32, 204 | initializer=init_ops.truncated_normal_initializer(stddev=0.02)) 205 | outputs = nn_ops.conv2d(inputs, kernel, [1] * 4, padding='SAME') 206 | if not self._normalizer_fn: 207 | bias = vs.get_variable('bias', [output_filters], dtype=dtypes.float32, 208 | initializer=bias_initializer) 209 | outputs = nn_ops.bias_add(outputs, bias) 210 | return outputs 211 | 212 | def call(self, inputs, state): 213 | bias_ones = self._bias_initializer 214 | if self._bias_initializer is None: 215 | bias_ones = init_ops.ones_initializer() 216 | with vs.variable_scope('gates'): 217 | inputs = array_ops.concat([inputs, state], axis=-1) 218 | concat = self._conv2d(inputs, 2 * self._filters, bias_ones) 219 | if self._normalizer_fn and not self._separate_norms: 220 | concat = self._norm(concat, "reset_update", bias_ones) 221 | r, u = array_ops.split(concat, 2, axis=-1) 222 | if self._normalizer_fn and self._separate_norms: 223 | r = self._norm(r, "reset", bias_ones) 224 | u = self._norm(u, "update", bias_ones) 225 | r, u = math_ops.sigmoid(r), math_ops.sigmoid(u) 226 | 227 | bias_zeros = self._bias_initializer 228 | if self._bias_initializer is None: 229 | bias_zeros = init_ops.zeros_initializer() 230 | with vs.variable_scope('candidate'): 231 | inputs = array_ops.concat([inputs, r * state], axis=-1) 232 | candidate = self._conv2d(inputs, self._filters, bias_zeros) 233 | if self._normalizer_fn: 234 | candidate = self._norm(candidate, "state", bias_zeros) 235 | 236 | c = self._activation_fn(candidate) 237 | new_h = u * state + (1 - u) * c 238 | return new_h, 
new_h 239 | -------------------------------------------------------------------------------- /robonet/video_prediction/testing/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_evaluation_interface import VPredEvaluation 2 | -------------------------------------------------------------------------------- /robonet/video_prediction/testing/model_evaluation_interface.py: -------------------------------------------------------------------------------- 1 | import ray 2 | from robonet.video_prediction.models import get_model 3 | import numpy as np 4 | import yaml 5 | from robonet.video_prediction.utils import tf_utils 6 | import tensorflow as tf 7 | from tensorflow.contrib.training import HParams 8 | import os 9 | import math 10 | import re 11 | import glob 12 | 13 | 14 | class VPredEvaluation(object): 15 | def __init__(self, model_path, test_hparams={}, n_gpus=1, first_gpu=0, sess=None): 16 | assert first_gpu == 0, "only starts building at gpu0" 17 | 18 | self._test_hparams = self._default_hparams().override_from_dict(test_hparams) 19 | self._model_path = os.path.expanduser(model_path) 20 | 21 | config_path = self._model_path + '/params.yaml' 22 | assert os.path.exists(config_path), 'Config path does not exist!' 23 | 24 | with open(config_path) as config: 25 | params = yaml.load(config, Loader=yaml.SafeLoader) 26 | self._model_hparams = params['model'] 27 | self._input_hparams = params['dataset'] 28 | 29 | print('\n\n------------------------------------ LOADED PARAMS ------------------------------------') 30 | for k, v in self._model_hparams.items(): 31 | print('{} --> {}'.format(k, v)) 32 | for k, v in self._input_hparams.items(): 33 | print('{} --> {}'.format(k, v)) 34 | print('---------------------------------------------------------------------------------------\n\n') 35 | 36 | PredictionModel = get_model(self._model_hparams.pop('model')) 37 | self._model = PredictionModel(self._input_hparams, n_gpus, self._model_hparams.pop('graph_type'), False, self._model_hparams.pop('scope_name')) 38 | self._outputs = self._model.model_fn(self._build_inputs(), {}, tf.estimator.ModeKeys.PREDICT, self._model_hparams) 39 | 40 | self._sess = sess 41 | self._restored = False 42 | 43 | def _default_hparams(self): 44 | default_dict = { 45 | "run_batch_size": 200, 46 | 'tile_context': True, 47 | 'designated_pixel_count': 0 48 | } 49 | return HParams(**default_dict) 50 | 51 | def _build_inputs(self): 52 | B_pl = self._test_hparams.run_batch_size 53 | if self._test_hparams.tile_context: 54 | B_pl = 1 55 | 56 | context_frames = self._model_hparams['context_frames'] 57 | assert context_frames > 1, "needs at least 1 context action (so 2 frames)" 58 | 59 | input_length = self._model_hparams['sequence_length'] - 1 60 | pad_len = input_length - context_frames 61 | 62 | height, width = self._input_hparams['img_size'] 63 | self._images_pl = tf.placeholder(tf.float32, [B_pl, context_frames, height, width, 3]) 64 | self._states_pl = tf.placeholder(tf.float32, [B_pl, context_frames, self._input_hparams['target_sdim']]) 65 | self._context_actions_pl = tf.placeholder(tf.float32, [B_pl, context_frames - 1, self._input_hparams['target_adim']]) 66 | self._actions_pl = tf.placeholder(tf.float32, [self._test_hparams.run_batch_size, pad_len + 1, self._input_hparams['target_adim']]) 67 | 68 | if self._test_hparams.designated_pixel_count: 69 | self._pixel_dist_pl = tf.placeholder(tf.float32, [B_pl, context_frames, height, width, self._test_hparams.designated_pixel_count]) 70 
| pad = tf.zeros((B_pl, pad_len, height, width, self._test_hparams.designated_pixel_count), dtype=tf.float32) 71 | input_pixel_distributions = tf.concat((self._pixel_dist_pl, pad), axis=1) 72 | if self._test_hparams.tile_context: 73 | input_pixel_distributions = tf.tile(input_pixel_distributions, [self._test_hparams.run_batch_size, 1, 1, 1, 1]) 74 | 75 | input_imgs = tf.concat((self._images_pl, tf.zeros((B_pl, pad_len, height, width, 3), dtype=tf.float32)), axis=1) 76 | input_states = tf.concat((self._states_pl, tf.zeros((B_pl, pad_len, self._input_hparams['target_sdim']), dtype=tf.float32)), axis=1) 77 | if self._test_hparams.tile_context: 78 | input_states, context_actions = [tf.tile(tensor, [self._test_hparams.run_batch_size, 1, 1]) for tensor in [input_states, self._context_actions_pl]] 79 | input_imgs = tf.tile(input_imgs, [self._test_hparams.run_batch_size, 1, 1, 1, 1]) 80 | else: 81 | context_actions = self._context_actions_pl 82 | 83 | input_actions = tf.concat((context_actions, self._actions_pl), axis=1) 84 | 85 | ret_dict = {'actions': input_actions, 'images': input_imgs, 'states': input_states} 86 | if self._test_hparams.designated_pixel_count: 87 | ret_dict['pixel_distributions'] = input_pixel_distributions 88 | return ret_dict 89 | 90 | def predict(self, context_tensors, action_tensors): 91 | # assert self._restored, "must restore before testing can continue!" 92 | 93 | if self._test_hparams.tile_context: 94 | assert context_tensors['context_frames'].shape[1] == 1, "only one camera supported!" 95 | context_images = context_tensors['context_frames'][-self._model_hparams['context_frames']:, 0][None] 96 | context_actions = context_tensors['context_actions'][(1 - self._model_hparams['context_frames']):][None] 97 | context_states = context_tensors['context_states'][-self._model_hparams['context_frames']:][None] 98 | else: 99 | assert context_tensors['context_frames'].shape[2] == 1, "only one camera supported!" 
100 | context_images = context_tensors['context_frames'][:, -self._model_hparams['context_frames']:, 0] 101 | context_actions = context_tensors['context_actions'][:, (1 - self._model_hparams['context_frames']):] 102 | context_states = context_tensors['context_states'][:, -self._model_hparams['context_frames']:] 103 | 104 | if self._test_hparams.designated_pixel_count and self._test_hparams.tile_context: 105 | context_distributions = context_tensors['context_pixel_distributions'][-self._model_hparams['context_frames']:, 0][None] 106 | elif self._test_hparams.designated_pixel_count: 107 | context_distributions = context_tensors['context_pixel_distributions'][:, -self._model_hparams['context_frames']:, 0] 108 | else: 109 | context_distributions = None 110 | 111 | input_actions = action_tensors['actions'] 112 | n_runs = int(math.ceil(input_actions.shape[0] / float(self._test_hparams.run_batch_size))) 113 | assert n_runs 114 | 115 | ret_dict = None 116 | for n in range(n_runs): 117 | selected_actions = input_actions[n * self._test_hparams.run_batch_size :(n + 1) * self._test_hparams.run_batch_size] 118 | if selected_actions.shape[0] < self._test_hparams.run_batch_size: 119 | pad = np.zeros((self._test_hparams.run_batch_size - selected_actions.shape[0], selected_actions.shape[1], selected_actions.shape[2])) 120 | padded_actions = np.concatenate((selected_actions, pad), axis=0) 121 | else: 122 | padded_actions = selected_actions 123 | 124 | run_t = self._feed(context_images, context_actions, context_states, context_distributions, padded_actions) 125 | 126 | for k in run_t.keys(): 127 | run_t[k] = run_t[k][:selected_actions.shape[0]] 128 | 129 | if ret_dict is None: 130 | ret_dict = run_t 131 | else: 132 | for k, v in run_t.items(): 133 | ret_dict[k] = np.concatenate((ret_dict[k], v), axis=0) 134 | return ret_dict 135 | 136 | def _feed(self, context_images, context_actions, context_states, context_distributions, input_actions): 137 | if context_images.dtype == np.uint8: 138 | context_images = context_images.astype(np.float32) / 255 139 | 140 | feed_dict = {self._images_pl: context_images, 141 | self._states_pl: context_states, 142 | self._context_actions_pl: context_actions, 143 | self._actions_pl: input_actions} 144 | 145 | if self._test_hparams.designated_pixel_count and context_distributions is None: 146 | height, width = self._input_hparams['img_size'] 147 | context_distributions = np.zeros((self._test_hparams.batch_size, self._model_hparams['context_frames'], 148 | height, width, self._test_hparams.designated_pixel_count), dtype=np.float32) 149 | context_distributions[:, :, 0, 0] = 1.0 150 | feed_dict[self._pixel_dist_pl] = context_distributions 151 | elif self._test_hparams.designated_pixel_count: 152 | feed_dict[self._pixel_dist_pl] = context_distributions 153 | 154 | return self._sess.run(self._outputs, feed_dict=feed_dict) 155 | 156 | def __call__(self, context_tensors, action_tensors): 157 | return self.predict(context_tensors, action_tensors) 158 | 159 | def set_session(self, sess): 160 | self._sess = sess 161 | 162 | def restore(self): 163 | if self._restored: 164 | return 165 | 166 | if self._sess is None: 167 | self._sess = tf.Session() 168 | self._sess.run(tf.global_variables_initializer()) 169 | 170 | model_paths = glob.glob('{}/model*'.format(self._model_path)) 171 | assert model_paths, "models not found in {}!".format(self._model_path) 172 | max_model = max([max(re.findall('\d+', m)) for m in model_paths]) 173 | meta_file = [m for m in model_paths if '.meta' in m and 
str(max_model) in m][0] 174 | restore_path = meta_file[:meta_file.find('.meta')] 175 | print('restoring', restore_path) 176 | 177 | checkpoints = [restore_path] 178 | # automatically skip global_step if more than one checkpoint is provided 179 | skip_global_step = len(checkpoints) > 1 180 | savers = [] 181 | for checkpoint in checkpoints: 182 | print("creating restore saver from checkpoint %s" % checkpoint) 183 | saver, _ = tf_utils.get_checkpoint_restore_saver(checkpoint, skip_global_step=skip_global_step) 184 | savers.append(saver) 185 | restore_op = [saver.saver_def.restore_op_name for saver in savers] 186 | self._sess.run(restore_op) 187 | self._restored = True 188 | 189 | @property 190 | def sequence_length(self): 191 | return self._model_hparams['sequence_length'] 192 | 193 | @property 194 | def n_context(self): 195 | return self._model_hparams['context_frames'] 196 | 197 | @property 198 | def horizon(self): 199 | return self.sequence_length - self.n_context 200 | 201 | @property 202 | def n_cam(self): 203 | return 1 204 | 205 | @property 206 | def img_size(self): 207 | return self._input_hparams['img_size'] 208 | 209 | @property 210 | def adim(self): 211 | return self._input_hparams['target_adim'] 212 | 213 | @property 214 | def sdim(self): 215 | return self._input_hparams['target_sdim'] 216 | -------------------------------------------------------------------------------- /robonet/video_prediction/training/__init__.py: -------------------------------------------------------------------------------- 1 | from .ray_util.gif_logger import GIFLogger 2 | 3 | 4 | def get_trainable(class_name): 5 | if class_name == 'VPredTrainable': 6 | from .trainable_interface import VPredTrainable 7 | return VPredTrainable 8 | if class_name == 'BalancedCamFilter': 9 | from .data_filter import BalancedCamFilter 10 | return BalancedCamFilter 11 | if class_name == 'RobotSetFilter': 12 | from .data_filter import RobotSetFilter 13 | return RobotSetFilter 14 | if class_name == 'RobotObjectFilter': 15 | from .data_filter import RobotObjectFilter 16 | return RobotObjectFilter 17 | if class_name == 'BatchmixFinetuning': 18 | from .finetuning_trainable_interface import BatchmixingVPredTrainable 19 | return BatchmixingVPredTrainable 20 | raise NotImplementedError 21 | -------------------------------------------------------------------------------- /robonet/video_prediction/training/data_filter.py: -------------------------------------------------------------------------------- 1 | from robonet.video_prediction.training.trainable_interface import VPredTrainable 2 | import numpy as np 3 | 4 | 5 | class BalancedCamFilter(VPredTrainable): 6 | 7 | def _default_hparams(self): 8 | params = super()._default_hparams() 9 | params.add_hparam('balanced_camera_configurations', True) 10 | return params 11 | 12 | def _filter_metadata(self, metadata): 13 | metadata = super()._filter_metadata(metadata) 14 | 15 | if self._hparams.balanced_camera_configurations: 16 | assert self.dataset_hparams.get('sub_batch_size', 1) > 1 17 | unique_cameras = metadata['camera_configuration'].frame.unique().tolist() # all camera configs that are in he dataset 18 | all_metadata = metadata 19 | metadata = [all_metadata[all_metadata['camera_configuration'] == r] for r in unique_cameras] 20 | 21 | # print('sizes after splitting metadata in camera configurations') 22 | # for m, cam in zip(metadata, unique_cameras): 23 | # print('cam {} : numfiles {} robots: {}'.format(cam, len(m.files), m['robot'].frame.unique().tolist())) 24 | return metadata 25 | 26 | 
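# BalancedCamFilter above returns a list of metadata frames (one per unique 'camera_configuration')
# rather than a single frame; presumably the trainable builds one data source per entry so each batch
# draws its sub-batches evenly across camera setups, which is why sub_batch_size > 1 is asserted.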
27 | class RobotSetFilter(VPredTrainable): 28 | 29 | def _default_hparams(self): 30 | params = super()._default_hparams() 31 | params.add_hparam('robot_set', ['sawyer', 'widowx', 'R3', 'franka']) 32 | return params 33 | 34 | def _filter_metadata(self, metadata_list): 35 | metadata_list = super()._filter_metadata(metadata_list) 36 | 37 | assert self._hparams.balance_across_robots, "need to balance accross robots!" 38 | if self._hparams.robot_set is not None: 39 | 40 | new_metadata_list = [] 41 | for m in metadata_list: 42 | if m['robot'].frame.unique().tolist()[0] in self._hparams.robot_set: 43 | print('using robot', m['robot'].frame.unique().tolist()) 44 | new_metadata_list.append(m) 45 | return new_metadata_list 46 | 47 | 48 | class RobotObjectFilter(VPredTrainable): 49 | def _default_hparams(self): 50 | params = super()._default_hparams() 51 | params.add_hparam('target_robot', '') 52 | params.add_hparam('removed_object', '') 53 | return params 54 | 55 | def _filter_metadata(self, metadata): 56 | obj_exclude = metadata['object_classes'].frame.apply(lambda x: self._hparams.removed_object not in x) 57 | not_robot_applied_to = metadata['robot'] != self._hparams.target_robot 58 | x = metadata[np.logical_or(obj_exclude, not_robot_applied_to)] 59 | return x 60 | -------------------------------------------------------------------------------- /robonet/video_prediction/training/finetuning_trainable_interface.py: -------------------------------------------------------------------------------- 1 | 2 | import glob 3 | import pdb 4 | 5 | from robonet.video_prediction.training.trainable_interface import VPredTrainable 6 | from robonet.datasets import get_dataset_class, load_metadata 7 | 8 | class BatchmixingVPredTrainable(VPredTrainable): 9 | 10 | def _default_hparams(self): 11 | params = super()._default_hparams() 12 | params.add_hparam('robot_set', ['sawyer', 'widowx', 'R3', 'franka']) 13 | return params 14 | 15 | def make_dataloaders(self, config): 16 | DatasetClass = get_dataset_class(self.dataset_hparams.pop('dataset')) 17 | 18 | # data from new domain 19 | new_domain_metadata = self._filter_metadata(load_metadata(config['data_directory'])) 20 | 21 | # data from old domain 22 | old_domain_metadata = self._filter_metadata(load_metadata(config['batchmix_basedata'])) 23 | 24 | old_metadata_list = [] 25 | for m in old_domain_metadata: 26 | if m['robot'].frame.unique().tolist()[0] in self._hparams.robot_set: 27 | print('using robot', m['robot'].frame.unique().tolist()) 28 | old_metadata_list.append(m) 29 | 30 | assert len(new_domain_metadata) == 1 31 | metadata_list = new_domain_metadata*len(old_metadata_list) + old_metadata_list # make sure that we're using the same amount of data from old and new 32 | 33 | return self._get_input_targets(DatasetClass, metadata_list, self.dataset_hparams) 34 | 35 | -------------------------------------------------------------------------------- /robonet/video_prediction/training/ray_util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SudeepDasari/RoboNet/d83eee20f39653c3f8e7c349df7350e8a9e9f7a7/robonet/video_prediction/training/ray_util/__init__.py -------------------------------------------------------------------------------- /robonet/video_prediction/training/ray_util/gif_logger.py: -------------------------------------------------------------------------------- 1 | import ray.cloudpickle as cloudpickle 2 | from ray.tune.logger import Logger 3 | import numpy as np 4 | import os 5 | 
import pickle as pkl 6 | from robonet.video_prediction.utils.ffmpeg_gif import encode_gif 7 | from robonet.video_prediction.utils.encode_img import construct_image_tile 8 | 9 | 10 | class GIFLogger(Logger): 11 | def _init(self): 12 | self._save_dir = os.path.join(self.logdir, 'metrics') 13 | if not os.path.exists(self._save_dir): 14 | os.makedirs(self._save_dir) 15 | 16 | self._metric_file = os.path.join(self._save_dir, 'metric_summaries.pkl') 17 | if os.path.exists(self._metric_file): 18 | self._metric_logs = pkl.load(open(self._metric_file, 'rb')) 19 | else: 20 | self._metric_logs = {} 21 | self._image_logs = {} 22 | 23 | def flush(self): 24 | with open(self._metric_file, 'wb') as f: 25 | cloudpickle.dump(self._metric_logs, f) 26 | 27 | if self._image_logs: 28 | img_dir = os.path.join(self._save_dir, 'images') 29 | if not os.path.exists(img_dir): 30 | os.makedirs(img_dir) 31 | for metric_name, summaries in self._image_logs.items(): 32 | for step, encoding_type, encoded_im in summaries: 33 | assert encoding_type == 'GIF' 34 | file_name = '{}/{}_summary_{}.gif'.format(img_dir, metric_name, step) 35 | with open(os.path.join(self._save_dir, file_name), 'wb') as f: 36 | f.write(encoded_im) 37 | self._image_logs = {} 38 | 39 | def on_result(self, result): 40 | global_step = result['global_step'] 41 | 42 | report_step = False 43 | for k, v in result.items(): 44 | if 'metric/' not in k or 'step_time' in k: 45 | continue 46 | 47 | report_step = True 48 | tag = '_'.join(k.split('/')[1:]) 49 | if isinstance(v, np.ndarray): 50 | assert v.dtype == np.uint8 and len(v.shape) >= 4, 'assume np arrays are batched image data' 51 | self._image_logs[tag] = self._image_logs.get(tag, []) + [(global_step, 'GIF', encode_gif(construct_image_tile(v), 4))] 52 | else: 53 | self._metric_logs[tag] = self._metric_logs.get(tag, []) + [v] 54 | 55 | if report_step: 56 | self._metric_logs['global_step'] = self._metric_logs.get('global_step', []) + [global_step] 57 | 58 | self.flush() 59 | -------------------------------------------------------------------------------- /robonet/video_prediction/training/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def stbmajor(ten): 6 | """ 7 | swap time-batch major 8 | :param ten: npy tenosr 9 | :return: 10 | """ 11 | return np.transpose(ten, [1, 0] + list(range(2,len(ten.shape)))) 12 | 13 | 14 | def pad(real_frames, pad_amount): 15 | tensor = (real_frames * 255).astype(np.uint8) 16 | height_pad = np.zeros((tensor.shape[0], tensor.shape[1], pad_amount, tensor.shape[-2], tensor.shape[-1]), dtype=np.uint8) 17 | tensor = np.concatenate((height_pad, tensor, height_pad), axis=-3) 18 | width_pad = np.zeros((tensor.shape[0], tensor.shape[1], tensor.shape[2], pad_amount, tensor.shape[-1]), dtype=np.uint8) 19 | tensor = np.concatenate((width_pad, tensor, width_pad), axis=-2) 20 | return tensor 21 | 22 | def pad_and_concat(real_frames, pred_frames, pad_amount): 23 | real, pred = [(x * 255).astype(np.uint8) for x in (real_frames, pred_frames)] 24 | pred = np.concatenate([pred[:, 0][:, None] for _ in range(real.shape[1] - pred.shape[1])] + [pred], axis=1) 25 | image_summary_tensors = [] 26 | for tensor in [real, pred]: 27 | height_pad = np.zeros((tensor.shape[0], tensor.shape[1], pad_amount, tensor.shape[-2], tensor.shape[-1]), dtype=np.uint8) 28 | tensor = np.concatenate((height_pad, tensor, height_pad), axis=-3) 29 | width_pad = np.zeros((tensor.shape[0], tensor.shape[1], 
tensor.shape[2], pad_amount, tensor.shape[-1]), dtype=np.uint8) 30 | tensor = np.concatenate((width_pad, tensor, width_pad), axis=-2) 31 | image_summary_tensors.append(tensor) 32 | tensor = np.concatenate(image_summary_tensors, axis=2) 33 | return tensor 34 | 35 | 36 | def render_dist(dist): 37 | rendered = np.zeros((dist.shape[0], dist.shape[1], dist.shape[2], dist.shape[3], 3), dtype=np.float32) 38 | for b in range(dist.shape[0]): 39 | for t in range(dist.shape[1]): 40 | rendered[b,t] = np.squeeze(plt.cm.viridis(dist[b][t])[:, :, :3]) 41 | return rendered 42 | -------------------------------------------------------------------------------- /robonet/video_prediction/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SudeepDasari/RoboNet/d83eee20f39653c3f8e7c349df7350e8a9e9f7a7/robonet/video_prediction/utils/__init__.py -------------------------------------------------------------------------------- /robonet/video_prediction/utils/encode_img.py: -------------------------------------------------------------------------------- 1 | import imageio 2 | import io 3 | import cv2 4 | import numpy as np 5 | 6 | 7 | def construct_image_tile(tensor): 8 | assert len(tensor.shape) == 4 or len(tensor.shape) == 5, "assumes (B, H, W, C) or (B, T, H, W, C) tensor" 9 | return np.concatenate([im for im in tensor], axis=-2) 10 | 11 | 12 | def encode_images(tensor, fps=4): 13 | if len(tensor.shape) == 3: 14 | return cv2.imencode('.jpg', tensor[:, :, ::-1])[1] 15 | elif len(tensor.shape) == 4: 16 | buffer = io.BytesIO() 17 | writer = imageio.get_writer(buffer, format='gif', fps=fps) 18 | [writer.append_data(im) for im in tensor] 19 | writer.close() 20 | buffer.seek(0) 21 | return buffer.read() 22 | raise NotImplementedError 23 | -------------------------------------------------------------------------------- /robonet/video_prediction/utils/ffmpeg_gif.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | 6 | def save_gif(gif_fname, images, fps): 7 | """ 8 | To generate a gif from image files, first generate palette from images 9 | and then generate the gif from the images and the palette. 10 | ffmpeg -i input_%02d.jpg -vf palettegen -y palette.png 11 | ffmpeg -i input_%02d.jpg -i palette.png -lavfi paletteuse -y output.gif 12 | 13 | Alternatively, use a filter to map the input images to both the palette 14 | and gif commands, while also passing the palette to the gif command. 15 | ffmpeg -i input_%02d.jpg -filter_complex "[0:v]split[x][z];[z]palettegen[y];[x][y]paletteuse" -y output.gif 16 | 17 | To directly pass in numpy images, use rawvideo format and `-i -` option. 
18 | """ 19 | from subprocess import Popen, PIPE 20 | head, tail = os.path.split(gif_fname) 21 | if head and not os.path.exists(head): 22 | os.makedirs(head) 23 | h, w, c = images[0].shape 24 | cmd = ['ffmpeg', '-y', 25 | '-f', 'rawvideo', 26 | '-vcodec', 'rawvideo', 27 | '-r', '%.02f' % fps, 28 | '-s', '%dx%d' % (w, h), 29 | '-pix_fmt', {1: 'gray', 3: 'rgb24', 4: 'rgba'}[c], 30 | '-i', '-', 31 | '-filter_complex', '[0:v]split[x][z];[z]palettegen[y];[x][y]paletteuse', 32 | '-r', '%.02f' % fps, 33 | '%s' % gif_fname] 34 | proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE) 35 | for image in images: 36 | proc.stdin.write(image.tostring()) 37 | out, err = proc.communicate() 38 | if proc.returncode: 39 | err = '\n'.join([' '.join(cmd), err.decode('utf8')]) 40 | raise IOError(err) 41 | del proc 42 | 43 | 44 | def encode_gif(images, fps): 45 | from subprocess import Popen, PIPE 46 | h, w, c = images[0].shape 47 | cmd = ['ffmpeg', '-y', 48 | '-f', 'rawvideo', 49 | '-vcodec', 'rawvideo', 50 | '-r', '%.02f' % fps, 51 | '-s', '%dx%d' % (w, h), 52 | '-pix_fmt', {1: 'gray', 3: 'rgb24', 4: 'rgba'}[c], 53 | '-i', '-', 54 | '-filter_complex', '[0:v]split[x][z];[z]palettegen[y];[x][y]paletteuse', 55 | '-r', '%.02f' % fps, 56 | '-f', 'gif', 57 | '-'] 58 | proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE) 59 | for image in images: 60 | proc.stdin.write(image.tostring()) 61 | out, err = proc.communicate() 62 | if proc.returncode: 63 | err = '\n'.join([' '.join(cmd), err.decode('utf8')]) 64 | raise IOError(err) 65 | del proc 66 | return out 67 | 68 | 69 | def main(): 70 | images_shape = (12, 64, 64, 3) # num_frames, height, width, channels 71 | images = np.random.randint(256, size=images_shape).astype(np.uint8) 72 | 73 | save_gif('output_save.gif', images, 4) 74 | with open('output_save.gif', 'rb') as f: 75 | string_save = f.read() 76 | 77 | string_encode = encode_gif(images, 4) 78 | with open('output_encode.gif', 'wb') as f: 79 | f.write(string_encode) 80 | 81 | print(np.all(string_save == string_encode)) 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /robonet/video_prediction/utils/html.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import dominate 4 | from dominate.tags import * 5 | 6 | 7 | class HTML: 8 | def __init__(self, web_dir, title, reflesh=0): 9 | self.title = title 10 | self.web_dir = web_dir 11 | self.img_dir = os.path.join(self.web_dir, 'images') 12 | if not os.path.exists(self.web_dir): 13 | os.makedirs(self.web_dir) 14 | if not os.path.exists(self.img_dir): 15 | os.makedirs(self.img_dir) 16 | # print(self.img_dir) 17 | 18 | self.doc = dominate.document(title=title) 19 | if reflesh > 0: 20 | with self.doc.head: 21 | meta(http_equiv="reflesh", content=str(reflesh)) 22 | self.t = None 23 | 24 | def get_image_dir(self): 25 | return self.img_dir 26 | 27 | def add_header1(self, str): 28 | with self.doc: 29 | h1(str) 30 | 31 | def add_header2(self, str): 32 | with self.doc: 33 | h2(str) 34 | 35 | def add_header3(self, str): 36 | with self.doc: 37 | h3(str) 38 | 39 | def add_table(self, border=1): 40 | self.t = table(border=border, style="table-layout: fixed;") 41 | self.doc.add(self.t) 42 | 43 | def add_row(self, txts, colspans=None): 44 | if self.t is None: 45 | self.add_table() 46 | with self.t: 47 | with tr(): 48 | if colspans: 49 | assert len(txts) == len(colspans) 50 | colspans = [dict(colspan=str(colspan)) for colspan in colspans] 51 
| else: 52 | colspans = [dict()] * len(txts) 53 | for txt, colspan in zip(txts, colspans): 54 | style = "word-break: break-all;" if len(str(txt)) > 80 else "word-wrap: break-word;" 55 | with td(style=style, halign="center", valign="top", **colspan): 56 | with p(): 57 | if txt is not None: 58 | p(txt) 59 | 60 | def add_images(self, ims, txts, links, colspans=None, height=None, width=400): 61 | image_style = '' 62 | if height is not None: 63 | image_style += "height:%dpx;" % height 64 | if width is not None: 65 | image_style += "width:%dpx;" % width 66 | if self.t is None: 67 | self.add_table() 68 | with self.t: 69 | with tr(): 70 | if colspans: 71 | assert len(txts) == len(colspans) 72 | colspans = [dict(colspan=str(colspan)) for colspan in colspans] 73 | else: 74 | colspans = [dict()] * len(txts) 75 | for im, txt, link, colspan in zip(ims, txts, links, colspans): 76 | with td(style="word-wrap: break-word;", halign="center", valign="top", **colspan): 77 | with p(): 78 | if im is not None and link is not None: 79 | with a(href=os.path.join('images', link)): 80 | img(style=image_style, src=os.path.join('images', im)) 81 | if im is not None and link is not None and txt is not None: 82 | br() 83 | if txt is not None: 84 | p(txt) 85 | 86 | def save(self): 87 | html_file = '%s/index.html' % self.web_dir 88 | f = open(html_file, 'wt') 89 | f.write(self.doc.render()) 90 | f.close() 91 | 92 | 93 | if __name__ == '__main__': 94 | html = HTML('web/', 'test_html') 95 | html.add_header('hello world') 96 | 97 | ims = [] 98 | txts = [] 99 | links = [] 100 | for n in range(4): 101 | ims.append('image_%d.jpg' % n) 102 | txts.append('text_%d' % n) 103 | links.append('image_%d.jpg' % n) 104 | html.add_images(ims, txts, links) 105 | html.save() 106 | -------------------------------------------------------------------------------- /robonet/yaml_util.py: -------------------------------------------------------------------------------- 1 | import re, yaml, os, json 2 | 3 | 4 | def parse_tune_config(config_file): 5 | """ 6 | Configures custom yaml loading behavior and parses config file 7 | """ 8 | import ray.tune as tune 9 | search_pattern = re.compile(r".*search\/(.*?)\((.*?)\)", re.VERBOSE) 10 | def search_constructor(loader, node): 11 | value = loader.construct_scalar(node) 12 | search_type, args = search_pattern.match(value).groups() 13 | if search_type == 'grid': 14 | return tune.grid_search(json.loads(args)) 15 | raise NotImplementedError("search {} is not implemented".format(search_type)) 16 | yaml.add_implicit_resolver("!custom_search", search_pattern, Loader=yaml.SafeLoader) 17 | yaml.add_constructor('!custom_search', search_constructor, Loader=yaml.SafeLoader) 18 | 19 | env_pattern = re.compile(r"\$\{(.*?)\}(.*)", re.VERBOSE) 20 | def env_var_constructor(loader, node): 21 | """ 22 | Converts ${VAR}/* from config file to 'os.environ[VAR] + *' 23 | Modified from: https://www.programcreek.com/python/example/61563/yaml.add_implicit_resolver 24 | """ 25 | value = loader.construct_scalar(node) 26 | env_var, remainder = env_pattern.match(value).groups() 27 | if env_var not in os.environ: 28 | raise ValueError("config requires envirnonment variable {} which is not set".format(env_var)) 29 | return os.environ[env_var] + remainder 30 | yaml.add_implicit_resolver("!env", env_pattern, Loader=yaml.SafeLoader) 31 | yaml.add_constructor('!env', env_var_constructor, Loader=yaml.SafeLoader) 32 | 33 | with open(config_file) as config: 34 | return yaml.load(config, Loader=yaml.SafeLoader) 35 | 36 | 37 | def 
parse_tpu_config(config_file): 38 | """ 39 | Configures custom yaml loading behavior and parses config file 40 | """ 41 | env_pattern = re.compile(r"\$\{(.*?)\}(.*)", re.VERBOSE) 42 | def env_var_constructor(loader, node): 43 | """ 44 | Converts ${VAR}/* from config file to 'os.environ[VAR] + *' 45 | Modified from: https://www.programcreek.com/python/example/61563/yaml.add_implicit_resolver 46 | """ 47 | value = loader.construct_scalar(node) 48 | env_var, remainder = env_pattern.match(value).groups() 49 | if env_var not in os.environ: 50 | raise ValueError("config requires envirnonment variable {} which is not set".format(env_var)) 51 | return os.environ[env_var] + remainder 52 | yaml.add_implicit_resolver("!env", env_pattern, Loader=yaml.SafeLoader) 53 | yaml.add_constructor('!env', env_var_constructor, Loader=yaml.SafeLoader) 54 | 55 | with open(config_file) as config: 56 | return yaml.load(config, Loader=yaml.SafeLoader) 57 | -------------------------------------------------------------------------------- /robonet_experiments/classifier_control/params.yaml: -------------------------------------------------------------------------------- 1 | # example configuration file for training a set of video prediction model on sawyer data from RoboNet 2 | # each model is trained on a different fraction of data 3 | 4 | # general experiment configurations 5 | batch_size: 16 6 | train_class: NumericHDF5Dataset 7 | max_steps: 300000 8 | result_dir: ${VMPC_EXP}/classifier_control/vidpred_training 9 | 10 | # list of dictionaries containing data sources along with filter parameters 11 | batch_config: 12 | # selects sawyer data with autograsp enabled (adim=4, robot=sawyer) 13 | - data_directory: ${VMPC_DATA}/classifier_control/data_collection/sim/1_obj_cartgripper_xz_rejsamp 14 | 15 | # loader_hparams used to initialize loader object 16 | loader_hparams: 17 | dataset: "NumericHDF5Dataset" 18 | buffer_size: 10 19 | load_T: 15 20 | random_shifts: True 21 | resize_image: True 22 | target_adim: 2 23 | target_sdim: 3 24 | 25 | # model_hparams used to create graph and loss function 26 | model_hparams: 27 | model: deterministic 28 | graph_type: c_dna_flow 29 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/capacity_test/base_model/flow.yaml: -------------------------------------------------------------------------------- 1 | # example configuration file for training a set of video prediction model on sawyer data from RoboNet 2 | # each model is trained on a different fraction of data 3 | 4 | # general experiment configurations 5 | batch_size: 16 6 | train_class: VPredTrainable 7 | max_steps: 300000 8 | train_fraction: search/grid([0.9, 0.1, 0.01, 0.001]) 9 | 10 | # list of dictionaries containing data sources along with filter parameters 11 | batch_config: 12 | # selects sawyer data with autograsp enabled (adim=4, robot=sawyer) 13 | - data_directory: ${DATA_DIR}/hdf5 14 | robot: "sawyer" 15 | adim: 4 16 | 17 | # loader_hparams used to initialize loader object 18 | loader_hparams: 19 | dataset: "RoboNet" 20 | buffer_size: 10 21 | color_augmentation: 0.1 22 | load_T: 15 23 | 24 | # model_hparams used to create graph and loss function 25 | model_hparams: 26 | model: deterministic 27 | graph_type: vgg_conv 28 | tv_weight: 0.0 29 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/capacity_test/base_model/noflow.yaml: -------------------------------------------------------------------------------- 1 | # 
example configuration file for training a set of video prediction model on sawyer data from RoboNet 2 | # each model is trained on a different fraction of data 3 | 4 | # general experiment configurations 5 | batch_size: 16 6 | train_class: VPredTrainable 7 | max_steps: 500000 8 | train_fraction: search/grid([0.9, 0.1, 0.01, 0.001]) 9 | 10 | # list of dictionaries containing data sources along with filter parameters 11 | batch_config: 12 | # selects sawyer data with autograsp enabled (adim=4, robot=sawyer) 13 | - data_directory: ${DATA_DIR}/hdf5 14 | robot: "sawyer" 15 | adim: 4 16 | 17 | # loader_hparams used to initialize loader object 18 | loader_hparams: 19 | dataset: "RoboNet" 20 | buffer_size: 10 21 | color_augmentation: 0.1 22 | load_T: 15 23 | 24 | # model_hparams used to create graph and loss function 25 | model_hparams: 26 | model: deterministic 27 | graph_type: vgg_conv 28 | use_flows: False 29 | tv_weight: 0 30 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/capacity_test/large_model/flow.yaml: -------------------------------------------------------------------------------- 1 | # example configuration file for training a set of video prediction model on sawyer data from RoboNet 2 | # each model is trained on a different fraction of data 3 | 4 | # general experiment configurations 5 | batch_size: 12 6 | train_class: VPredTrainable 7 | max_steps: 500000 8 | train_fraction: 0.9 9 | save_freq: 1000 10 | 11 | # list of dictionaries containing data sources along with filter parameters 12 | batch_config: 13 | # selects sawyer data with autograsp enabled (adim=4, robot=sawyer) 14 | - data_directory: ${DATA_DIR}/hdf5 15 | robot: search/grid(["sawyer", ["sawyer", "franka"], ["sawyer", "baxter"], ["sawyer", "baxter", "franka"]]) 16 | adim: 4 17 | 18 | # loader_hparams used to initialize loader object 19 | loader_hparams: 20 | dataset: "RoboNet" 21 | buffer_size: 10 22 | color_augmentation: 0.1 23 | load_T: 15 24 | 25 | # model_hparams used to create graph and loss function 26 | model_hparams: 27 | model: deterministic 28 | graph_type: vgg_conv 29 | lr: 0.0001 30 | context_frames: 5 31 | schedule_sampling_k: 4000 32 | use_flows: True 33 | tv_weight: 0 34 | enc_filters: [256, 512, 1024] 35 | lstm_filters: 1024 36 | dec_filters: [1024, 512] 37 | img_flows: 32 38 | skip_flows: 16 39 | 40 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/capacity_test/large_model/noflow.yaml: -------------------------------------------------------------------------------- 1 | # example configuration file for training a set of video prediction model on sawyer data from RoboNet 2 | # each model is trained on a different fraction of data 3 | 4 | # general experiment configurations 5 | batch_size: 16 6 | train_class: VPredTrainable 7 | max_steps: 500000 8 | train_fraction: search/grid([0.9, 0.1, 0.01, 0.001]) 9 | 10 | # list of dictionaries containing data sources along with filter parameters 11 | batch_config: 12 | # selects sawyer data with autograsp enabled (adim=4, robot=sawyer) 13 | - data_directory: ${DATA_DIR}/hdf5 14 | robot: "sawyer" 15 | adim: 4 16 | 17 | # loader_hparams used to initialize loader object 18 | loader_hparams: 19 | dataset: "RoboNet" 20 | buffer_size: 10 21 | color_augmentation: 0.1 22 | load_T: 15 23 | 24 | # model_hparams used to create graph and loss function 25 | model_hparams: 26 | model: deterministic 27 | graph_type: vgg_conv 28 | lr: 0.0001 29 | context_frames: 5 30 
| schedule_sampling_k: 4000 31 | use_flows: False 32 | tv_weight: 0 33 | enc_filters: [256, 512, 1024] 34 | lstm_filters: 1024 35 | dec_filters: [1024, 512] 36 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/finetune_baxter.yaml: -------------------------------------------------------------------------------- 1 | # example configuration file for fine-tuning a pretrained video prediction model on newly collected baxter data 2 | # the pretrained checkpoint is restored from ${RESTORE_DIR} 3 | 4 | # general experiment configurations 5 | batch_size: 16 6 | train_class: VPredTrainable 7 | restore_dir: ${RESTORE_DIR} 8 | max_steps: 300000 9 | 10 | # list of dictionaries containing data sources along with filter parameters 11 | batch_config: 12 | # selects newly collected baxter data for finetuning 13 | # note there is a 50% chance to select the new cloth data and a 50% chance to sample from one of the pretraining sources 14 | - data_directory: ${DATA_DIR}/baxter_cloth 15 | source_prob: 0.5 16 | 17 | # selects non-baxter data with autograsp enabled 18 | # source_prob need not be set here - code will automatically set it to 1 - 0.5 (from above) 19 | - data_directory: ${DATA_DIR}/hdf5 20 | robot: ["sawyer", "widowx", "franka"] 21 | adim: 4 22 | 23 | # loader_hparams used to initialize loader object 24 | loader_hparams: 25 | dataset: "RoboNet" 26 | buffer_size: 10 27 | load_T: 15 28 | color_augmentation: 0.1 29 | 30 | # model_hparams used to create graph and loss function 31 | model_hparams: 32 | model: deterministic 33 | graph_type: c_dna_flow 34 | tv_weight: 0.001 35 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/pretrain_models/all_robonet/large.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 12 2 | train_class: VPredTrainable 3 | max_steps: 500000 4 | train_fraction: 0.9 5 | save_freq: 5000 6 | 7 | batch_config: 8 | - data_directory: ${DATA_DIR}/hdf5 9 | 10 | # loader_hparams used to initialize loader object 11 | loader_hparams: 12 | dataset: "RoboNet" 13 | buffer_size: 10 14 | color_augmentation: 0.1 15 | load_T: 15 16 | target_adim: 4 17 | action_mismatch: 3 18 | state_mismatch: 3 19 | 20 | # model_hparams used to create graph and loss function 21 | model_hparams: 22 | model: deterministic 23 | graph_type: vgg_conv 24 | lr: 0.0001 25 | context_frames: 5 26 | schedule_sampling_k: 4000 27 | use_flows: True 28 | tv_weight: 0 29 | enc_filters: [256, 512, 896] 30 | lstm_filters: 896 31 | dec_filters: [896, 512] 32 | img_flows: 32 33 | skip_flows: 16 34 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/pretrain_models/all_robonet/medium.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 8 2 | train_class: VPredTrainable 3 | max_steps: 500000 4 | train_fraction: 0.9 5 | save_freq: 5000 6 | 7 | batch_config: 8 | - data_directory: ${DATA_DIR}/hdf5 9 | 10 | # loader_hparams used to initialize loader object 11 | loader_hparams: 12 | dataset: "RoboNet" 13 | buffer_size: 10 14 | color_augmentation: 0.1 15 | load_T: 15 16 | target_adim: 4 17 | action_mismatch: 3 18 | state_mismatch: 3 19 | 20 | # model_hparams used to create graph and loss function 21 | model_hparams: 22 | model: deterministic 23 | graph_type: vgg_conv 24 | lr: 0.0001 25 | context_frames: 5 26 | schedule_sampling_k: 4000 27 | use_flows: True 28 | tv_weight: 0 29 |
enc_filters: [256, 512, 512] 30 | lstm_filters: 512 31 | dec_filters: [512, 512] 32 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/pretrain_models/autograsp/large.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 12 2 | train_class: VPredTrainable 3 | max_steps: 500000 4 | train_fraction: 0.9 5 | save_freq: 5000 6 | 7 | batch_config: 8 | - data_directory: ${DATA_DIR}/hdf5 9 | primitives: autograsp 10 | adim: 4 11 | robot: search/grid([["sawyer", "widowx", "baxter", "kuka"], ["sawyer", "widowx", "franka", "kuka"], ["sawyer", "widowx", "baxter", "franka"]]) 12 | 13 | # loader_hparams used to initialize loader object 14 | loader_hparams: 15 | dataset: "RoboNet" 16 | buffer_size: 10 17 | color_augmentation: 0.1 18 | load_T: 15 19 | 20 | # model_hparams used to create graph and loss function 21 | model_hparams: 22 | model: deterministic 23 | graph_type: vgg_conv 24 | lr: 0.0001 25 | context_frames: 5 26 | schedule_sampling_k: 4000 27 | use_flows: True 28 | tv_weight: 0 29 | enc_filters: [256, 512, 896] 30 | lstm_filters: 896 31 | dec_filters: [896, 512] 32 | img_flows: 32 33 | skip_flows: 16 34 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/pretrain_models/autograsp/medium.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 8 2 | train_class: VPredTrainable 3 | max_steps: 500000 4 | train_fraction: 0.9 5 | save_freq: 5000 6 | 7 | batch_config: 8 | - data_directory: ${DATA_DIR}/hdf5 9 | primitives: autograsp 10 | adim: 4 11 | robot: search/grid([["sawyer", "widowx", "baxter", "kuka"], ["sawyer", "widowx", "franka", "kuka"], ["sawyer", "widowx", "baxter", "franka"]]) 12 | 13 | # loader_hparams used to initialize loader object 14 | loader_hparams: 15 | dataset: "RoboNet" 16 | buffer_size: 10 17 | color_augmentation: 0.1 18 | load_T: 15 19 | 20 | # model_hparams used to create graph and loss function 21 | model_hparams: 22 | model: deterministic 23 | graph_type: vgg_conv 24 | lr: 0.0001 25 | context_frames: 5 26 | schedule_sampling_k: 4000 27 | use_flows: True 28 | tv_weight: 0 29 | enc_filters: [256, 512, 512] 30 | lstm_filters: 512 31 | dec_filters: [512, 512] 32 | -------------------------------------------------------------------------------- /robonet_experiments/gpu/sawyer_grid_search.yaml: -------------------------------------------------------------------------------- 1 | # example configuration file for training a set of video prediction model on sawyer data from RoboNet 2 | # each model is trained on a different fraction of data 3 | 4 | # general experiment configurations 5 | batch_size: 16 6 | train_class: VPredTrainable 7 | max_steps: 300000 8 | train_fraction: search/grid([0.9, 0.1, 0.01, 0.001]) 9 | 10 | # list of dictionaries containing data sources along with filter parameters 11 | batch_config: 12 | # selects sawyer data with autograsp enabled (adim=4, robot=sawyer) 13 | - data_directory: ${DATA_DIR}/hdf5 14 | robot: "sawyer" 15 | adim: 4 16 | 17 | # loader_hparams used to initialize loader object 18 | loader_hparams: 19 | dataset: "RoboNet" 20 | buffer_size: 10 21 | load_T: 15 22 | color_augmentation: 0.1 23 | 24 | # model_hparams used to create graph and loss function 25 | model_hparams: 26 | model: deterministic 27 | graph_type: vgg_conv 28 | tv_weight: 0.0 -------------------------------------------------------------------------------- 
/robonet_experiments/inverse_model/discretized_inverse.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 16 2 | train_class: InverseTrainable 3 | max_steps: 200000 4 | 5 | 6 | # list of dictionaries containing data sources along with filter parameters 7 | batch_config: 8 | # selects sawyer data with autograsp enabled (adim=4, robot=sawyer) 9 | - data_directory: ${DATA_DIR}/hdf5 10 | robot: "sawyer" 11 | adim: 4 12 | 13 | # loader_hparams used to initialize loader object 14 | loader_hparams: 15 | dataset: "RoboNet" 16 | buffer_size: 10 17 | load_T: 4 #search/grid([2, 4, 11]) 18 | # color_augmentation: 0.1 19 | img_size: [192, 256] 20 | 21 | # model_hparams used to create graph and loss function 22 | model_hparams: 23 | model: DiscretizedInverseModel 24 | graph_type: lstm_baseline 25 | vgg_path: ${VGG_DIR} 26 | context_actions: 2 27 | enc_dim: 256 28 | latent_dim: 64 29 | append_last_action: true 30 | -------------------------------------------------------------------------------- /robonet_experiments/inverse_model/inverse.yaml: -------------------------------------------------------------------------------- 1 | batch_size: 16 2 | train_class: InverseTrainable 3 | max_steps: 100000 4 | 5 | # list of dictionaries containing data sources along with filter parameters 6 | batch_config: 7 | # selects sawyer data with autograsp enabled (adim=4, robot=sawyer) 8 | - data_directory: ${DATA_DIR}/hdf5 9 | robot: "sawyer" 10 | adim: 4 11 | 12 | # loader_hparams used to initialize loader object 13 | loader_hparams: 14 | dataset: "RoboNet" 15 | buffer_size: 10 16 | load_T: search/grid([2, 3, 4]) 17 | load_random_cam: False 18 | # color_augmentation: 0.1 19 | img_size: [192, 256] 20 | 21 | # model_hparams used to create graph and loss function 22 | model_hparams: 23 | model: DeterministicInverseModel 24 | graph_type: lstm_baseline 25 | vgg_path: ${VGG_DIR} 26 | -------------------------------------------------------------------------------- /robonet_experiments/tpu/capacity_test_flow.yaml: -------------------------------------------------------------------------------- 1 | # general experiment and batch configs 2 | data_directory: ${DATA_DIR} 3 | save_dir: "model_save_large_lbls_flow_multibot_2" 4 | batch_sizes: [4, 4, 4, 4] 5 | robots: ['sawyer', 'franka', 'baxter', 'widowx'] 6 | max_steps: 600000 7 | # robots: ["sawyer"] 8 | 9 | # loader_hparams used to initialize loader object 10 | loader_hparams: 11 | load_T: 15 12 | 13 | # model_hparams used to create graph and loss function 14 | model_hparams: 15 | model: deterministic 16 | graph_type: vgg_conv 17 | tv_weight: 0 18 | lr: 0.0001 19 | context_frames: 5 20 | enc_filters: [512, 1024, 1536] 21 | lstm_filters: 1536 22 | dec_filters: [1024, 512] 23 | schedule_sampling_k: 4000 24 | img_flows: 32 25 | skip_flows: 16 26 | -------------------------------------------------------------------------------- /robonet_experiments/tpu/capacity_test_noflow.yaml: -------------------------------------------------------------------------------- 1 | # general experiment and batch configs 2 | data_directory: ${DATA_DIR} 3 | save_dir: "model_save_large_lbls" 4 | batch_sizes: [16] 5 | max_steps: 600000 6 | 7 | # loader_hparams used to initialize loader object 8 | loader_hparams: 9 | load_T: 15 10 | 11 | # model_hparams used to create graph and loss function 12 | model_hparams: 13 | model: deterministic 14 | graph_type: vgg_conv 15 | use_flows: False 16 | tv_weight: 0 17 | lr: 0.0001 18 | context_frames: 5 19 | enc_filters:
[512, 1024, 1792] 20 | lstm_filters: 1792 21 | dec_filters: [1024, 512] 22 | schedule_sampling_k: 4000 23 | -------------------------------------------------------------------------------- /scripts/examples/create_prediction_gifs.py: -------------------------------------------------------------------------------- 1 | from robonet.video_prediction.testing import VPredEvaluation 2 | from robonet.yaml_util import parse_tune_config as parse_config 3 | import os 4 | import argparse 5 | import tensorflow as tf 6 | from robonet.datasets import get_dataset_class, load_metadata 7 | from tensorflow.contrib.training import HParams 8 | from robonet.datasets.util.tensor_multiplexer import MultiplexedTensors 9 | import numpy as np 10 | import imageio 11 | 12 | 13 | class DataLoader: 14 | def __init__(self, config): 15 | # run hparams are passed in through config dict 16 | self.dataset_hparams, self.model_hparams, self._hparams = self._extract_hparams(config) 17 | self._inputs, self._targets = self._make_dataloaders(config) 18 | 19 | def _default_hparams(self): 20 | default_dict = { 21 | 'batch_size': 16, 22 | 'restore_dir': '', 23 | 'n_gpus': 1, 24 | 'pad_amount': 2, 25 | 'scalar_summary_freq': 100, 26 | 'image_summary_freq': 1000, 27 | 'train_fraction': 0.9, 28 | 'val_fraction': 0.05, 29 | 'max_to_keep': 3, 30 | 'max_steps': 300000, 31 | } 32 | return HParams(**default_dict) 33 | 34 | def _get_dataset_class(self, class_name): 35 | return get_dataset_class(class_name) 36 | 37 | def _extract_hparams(self, config): 38 | """ 39 | Grabs and (optionally) modifies hparams 40 | """ 41 | self._batch_config = config.pop('batch_config') 42 | dataset_hparams, model_hparams = config.pop('loader_hparams', {}), config.pop('model_hparams', {}) 43 | hparams = self._default_hparams().override_from_dict(config) 44 | 45 | if 'splits' not in dataset_hparams: 46 | dataset_hparams['splits'] = [hparams.train_fraction, hparams.val_fraction, 1 - hparams.val_fraction - hparams.train_fraction] 47 | assert all([x >= 0 for x in dataset_hparams['splits']]), "invalid train/val fractions!" 48 | 49 | if 'sequence_length' in model_hparams and 'load_T' not in dataset_hparams: 50 | dataset_hparams['load_T'] = model_hparams['sequence_length'] 51 | 52 | return dataset_hparams, model_hparams, hparams 53 | 54 | def _get_input_targets(self, DatasetClass, metadata, dataset_hparams): 55 | data_loader = DatasetClass(self._hparams.batch_size, metadata, dataset_hparams) 56 | 57 | tensor_names = ['actions', 'images', 'states'] 58 | if 'annotations' in data_loader: 59 | tensor_names = ['actions', 'images', 'states', 'annotations'] 60 | 61 | self._tensor_multiplexer = MultiplexedTensors(data_loader, tensor_names) 62 | loaded_tensors = [self._tensor_multiplexer[k] for k in tensor_names] 63 | 64 | self._real_annotations = None 65 | assert loaded_tensors[1].get_shape().as_list()[2] == 1, "loader assumes one (potentially random) camera will be loaded in each example!" 
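# the singleton camera axis checked above is squeezed out below, so image/annotation tensors are (batch, time, height, width, channels) from here on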
66 | self._real_images = loaded_tensors[1] = loaded_tensors[1][:, :, 0] # grab cam 0 for images 67 | if 'annotations' in data_loader: 68 | self._real_annotations = loaded_tensors[3] = loaded_tensors[3][:, :, 0] # grab cam 0 for annotations 69 | 70 | inputs, targets = {'actions': loaded_tensors[0]}, {} 71 | for k, v in zip(tensor_names[1:], loaded_tensors[1:]): 72 | inputs[k], targets[k] = v[:, :-1], v 73 | 74 | self._data_loader = data_loader 75 | return inputs, targets 76 | 77 | def _make_dataloaders(self, config): 78 | DatasetClass = self._get_dataset_class(self.dataset_hparams.pop('dataset')) 79 | sources, self.dataset_hparams['source_selection_probabilities'] = self._init_sources() 80 | 81 | inputs, targets = self._get_input_targets(DatasetClass, sources, self.dataset_hparams) 82 | return inputs, targets 83 | 84 | def _default_source_hparams(self): 85 | return { 86 | 'data_directory': './', 87 | 'source_prob': None, 88 | 'balance_by_attribute': ['robot'] # split data source into multiple sources where for each source meta[attr] == a, (e.g all examples in one source come from a specific robot) 89 | } 90 | 91 | def _init_sources(self): 92 | loaded_metadata = {} 93 | sources, source_probs = [], [] 94 | 95 | for source in self._batch_config: 96 | source_hparams = self._default_source_hparams() 97 | source_hparams.update(source) 98 | dir_path = os.path.realpath(os.path.expanduser(source_hparams['data_directory'])) 99 | meta_data = loaded_metadata[dir_path] = loaded_metadata.get(dir_path, load_metadata(dir_path)) 100 | 101 | for k, v in source_hparams.items(): 102 | if k not in self._default_source_hparams(): 103 | if k == 'object_classes': 104 | meta_data = meta_data.select_objects(v) 105 | elif isinstance(v, (list, tuple)): 106 | meta_data = meta_data[meta_data[k].frame.isin(v)] 107 | else: 108 | meta_data = meta_data[meta_data[k] == v] 109 | assert len(meta_data), "filters created empty data source!" 110 | 111 | if source_hparams['balance_by_attribute']: 112 | meta_data = [meta_data] 113 | for k in source_hparams['balance_by_attribute']: 114 | new_data = [] 115 | for m in meta_data: 116 | unique_elems = m[k].frame.unique().tolist() 117 | new_data.extend([m[m[k] == u] for u in unique_elems]) 118 | meta_data = new_data 119 | 120 | if source_hparams['source_prob']: 121 | new_prob = source_hparams['source_prob'] / float(len(meta_data)) 122 | source_hparams['source_prob'] = [new_prob for _ in range(len(meta_data))] 123 | else: 124 | source_hparams['source_prob'] = [None for _ in range(len(meta_data))] 125 | 126 | sources.extend(meta_data) 127 | source_probs.extend(source_hparams['source_prob']) 128 | else: 129 | source_probs.append(source_hparams['source_prob']) 130 | sources.append(meta_data) 131 | 132 | if any([s is not None for s in source_probs]): 133 | set_probs = [s for s in source_probs if s is not None] 134 | assert all([0 <= s <= 1 for s in set_probs]) and sum(set_probs) <= 1, "invalid probability distribution!" 
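# sources that did not set an explicit source_prob evenly share whatever probability mass remains (computed below)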
135 | if len(set_probs) != len(source_probs): 136 | remainder_prob = (1.0 - sum(set_probs)) / (len(source_probs) - len(set_probs)) 137 | for i in range(len(source_probs)): 138 | if source_probs[i] is None: 139 | source_probs[i] = remainder_prob 140 | else: 141 | source_probs = None 142 | 143 | return sources, source_probs 144 | 145 | def get_batch(self, sess, mode='test'): 146 | return sess.run([self._inputs, self._targets], feed_dict=self._tensor_multiplexer.get_feed_dict(mode)) 147 | 148 | 149 | def get_prediction_batches(dataset, prediction_model, mode='test'): 150 | batch = dataset.get_batch(prediction_model._sess, mode) 151 | actions = batch[0]['actions'] 152 | states, images = [batch[1][x] for x in ('states', 'images')] 153 | context = { 154 | "context_frames": images[:, :prediction_model.n_context][:, :, None], 155 | "context_actions": actions[:, :prediction_model.n_context - 1], 156 | "context_states": states[:, :prediction_model.n_context] 157 | } 158 | real_actions = actions[:, prediction_model.n_context - 1:] 159 | 160 | real_prediction_batch = {'context_tensors': context, 'action_tensors': {'actions':real_actions}} 161 | real_frames = images[:, prediction_model.n_context:] 162 | return real_prediction_batch, real_frames 163 | 164 | 165 | if __name__ == '__main__': 166 | import pickle as pkl 167 | parser = argparse.ArgumentParser() 168 | parser.add_argument('experiment_file', type=str, help='path to YAML experiment config file') 169 | parser.add_argument('prediction_checkpoint', type=str, help="path to video prediction model checkpoint folder") 170 | parser.add_argument('--N', type=int, help="number of batches to run", default=1) 171 | parser.add_argument('--n_gpus', type=int, help="number of GPUs to use during eval", default=1) 172 | args = parser.parse_args() 173 | args.experiment_file = os.path.expanduser(args.experiment_file) 174 | args.prediction_checkpoint = os.path.expanduser(args.prediction_checkpoint) 175 | 176 | config = parse_config(args.experiment_file) 177 | config.pop('train_class', None) 178 | 179 | batch_size = config['batch_size'] 180 | prediction_model = VPredEvaluation(args.prediction_checkpoint, {"run_batch_size": batch_size, 'tile_context': False}, n_gpus=args.n_gpus) 181 | config['loader_hparams']['load_T'] = prediction_model.sequence_length 182 | dataset = DataLoader(config) 183 | prediction_model.restore() 184 | 185 | l1_errors = [] 186 | for n in range(args.N): 187 | input_batch, real_frames = get_prediction_batches(dataset, prediction_model) 188 | pred_frames = prediction_model(**input_batch)['predicted_frames'][:, :, 0] 189 | n_pixels = pred_frames.shape[0] * pred_frames.shape[1] * pred_frames.shape[2] * pred_frames.shape[3] 190 | l1_errors.append(np.sum(np.abs(pred_frames - real_frames)) / n_pixels) 191 | 192 | for b in range(batch_size): 193 | for vid, name in zip([real_frames, pred_frames], ['real', 'pred']): 194 | images = (vid[b] * 255).astype(np.uint8) 195 | writer = imageio.get_writer('b{}_{}.gif'.format(n * batch_size + b, name)) 196 | for t in range(images.shape[0]): 197 | writer.append_data(images[t]) 198 | writer.close() 199 | print('average l1 error', np.mean(l1_errors)) 200 | print('std l1 error', np.std(l1_errors)) 201 | -------------------------------------------------------------------------------- /scripts/examples/test_franka_flow.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple script that shows the video-predictor API in action 3 | """ 4 | 5 | 6 | from 
robonet.video_prediction.testing.model_evaluation_interface import VPredEvaluation 7 | import numpy as np 8 | 9 | test_hparams = {} 10 | test_hparams['designated_pixel_count'] = 1 # number of selected pixels 11 | test_hparams['run_batch_size'] = 200 # number of predictions run through model concurrently 12 | N_ACTIONS = 300 # total actions to predict: can be different from run_batch_size! 13 | 14 | # feed in restore path and test-specific hyperparams 15 | model = VPredEvaluation('~/Downloads/franka_sanity/sanity_check_model/checkpoint_170000', test_hparams) 16 | model.restore() 17 | 18 | # context tensors needed for prediction 19 | context_tensors = {} 20 | context_tensors['context_actions'] = np.zeros((model.n_context - 1, model.adim)) 21 | context_tensors['context_states'] = np.zeros((model.n_context, model.sdim)) # not needed for all models 22 | height, width = model.img_size 23 | context_tensors['context_frames'] = np.zeros((model.n_context, model.n_cam, height, width, 3)) # inputs should be RGB float \in [0, 1] 24 | context_tensors['context_pixel_distributions'] = np.zeros((model.n_context, model.n_cam, height, # spatial distributions (sum across image should be 1) 25 | width, test_hparams['designated_pixel_count'])) 26 | context_tensors['context_pixel_distributions'][:, :, 24, 32, :] = 1.0 27 | 28 | # actions for frames to be predicted 29 | action_tensors = {} 30 | action_tensors['actions'] = np.zeros((N_ACTIONS, model.horizon, model.adim)) 31 | 32 | results = model(context_tensors, action_tensors) 33 | predicted_frames = results['predicted_frames'] # RGB images, shape (N_ACTIONS, HORIZON, N_CAMS, 48, 64, 3) 34 | predicted_distributions = results['predicted_pixel_distributions'] # pixel distributions, shape (N_ACTIONS, HORIZON, N_CAMS, 48, 64, designated_pixel_count) 35 | print('predicted_frames has shape', predicted_frames.shape) 36 | -------------------------------------------------------------------------------- /scripts/templates/index_template.html: -------------------------------------------------------------------------------- 1 | 2 | 12 | Trajectory Viewer 13 | 14 | 15 |

Dataset Visualizer 16 | <table> 17 | <tr> 18 | {% for name in filter_names %} 19 | <th>{{ name }}</th> 20 | {% endfor %} 21 | </tr> 22 | <tr> 23 | {% for f in filters %} 24 | <td> 25 | {% for traj in f %} 26 | <a href="{{ traj.url }}">{{ traj.text }}</a> 27 | <br> 28 | {% endfor %} 29 | </td> 30 | {% endfor %} 31 | </tr> 32 | </table> 33 | 34 | -------------------------------------------------------------------------------- /scripts/templates/traj_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | Trajectory Viewer 13 | 14 | 15 | 16 |

RoboNet Trajectory {{ traj_name }} 17 | 18 | Meta-Data 19 | <table> 20 | {% for attr in attributes %} 21 | <tr> 22 | <td>{{ attr.name }}</td> 23 | <td>{{ attr.value }}</td> 24 | </tr> 25 | {% endfor %} 26 | </table> 27 | 28 | Videos 29 | 30 | <table> 31 | <tr> 32 | {% for name in video_names %} 33 | <th>{{ name }}</th> 34 | {% endfor %} 35 | </tr> 36 | <tr> 37 | 38 | {% for video in videos %} 39 | <td> 40 | <video controls> 41 | <source src="{{ video.url }}" type="{{ video.type }}"> 42 | </video> 43 | </td> 44 | {% endfor %} 45 | </tr> 46 | </table>
47 | 48 | 49 | -------------------------------------------------------------------------------- /scripts/train_model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from robonet import get_trainable, GIFLogger 3 | import tensorflow as tf 4 | import ray 5 | import ray.tune as tune 6 | from robonet.yaml_util import parse_tune_config as parse_config 7 | import os 8 | 9 | 10 | def trial_str_creator(trial): 11 | return "{}_{}".format(str(trial), trial.trial_id) 12 | 13 | 14 | if __name__ == '__main__': 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('experiment_file', type=str, help='path to YAML experiment config file') 17 | parser.add_argument('--local_mode', action='store_true', help="if flag enables local_mode") 18 | parser.add_argument('--cluster', action='store_true', help="if flag enables cluster mode") 19 | parser.add_argument('--resume', action='store_true', help="if flag provided resume from checkpoints rather than start from scratch") 20 | parser.add_argument('--temp_dir', type=str, default=None, help="sets temp dir for ray redis (useful if permission error in /tmp/)") 21 | parser.add_argument('--name', type=str, default=None, help="sets experiment name") 22 | parser.add_argument('--n_gpus', type=int, default=1, help="number of GPUs to train on") 23 | args = parser.parse_args() 24 | config = parse_config(args.experiment_file) 25 | config['n_gpus'] = args.n_gpus 26 | 27 | redis_address, max_failures, local_mode = None, 10, False 28 | resume = config.pop('resume', args.resume) 29 | if args.cluster or config.pop('cluster', False): 30 | redis_address = ray.services.get_node_ip_address() + ':6379' 31 | max_failures = 1000 32 | elif args.local_mode or config.pop('local_mode', False): 33 | resume=False 34 | local_mode = True 35 | max_failures = 0 36 | 37 | if args.temp_dir is None: 38 | args.temp_dir = config.pop('temp_dir', None) 39 | 40 | if args.name is not None: 41 | name = args.name 42 | config.pop('name', None) 43 | else: 44 | name = config.pop('name', "{}_training".format(os.getlogin())) 45 | 46 | exp = tune.Experiment( 47 | name=name, 48 | run=get_trainable(config.pop('train_class')), 49 | trial_name_creator=tune.function(trial_str_creator), 50 | loggers=[GIFLogger], 51 | resources_per_trial= {"cpu": 1, "gpu": args.n_gpus}, 52 | checkpoint_freq=config.pop('save_freq', 5000), 53 | upload_dir=config.pop('upload_dir', None), 54 | local_dir=config.pop('local_dir', None), 55 | config=config # evaluate last to allow all popping above 56 | ) 57 | 58 | ray.init(redis_address=redis_address, local_mode=local_mode, temp_dir=args.temp_dir) 59 | trials = tune.run(exp, queue_trials=True, resume=resume, 60 | checkpoint_at_end=True, max_failures=max_failures) 61 | exit(0) 62 | -------------------------------------------------------------------------------- /scripts/train_vpred_tpu.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | from robonet.datasets import get_dataset_class 4 | from robonet.video_prediction.models import get_model 5 | import tensorflow as tf 6 | from robonet.yaml_util import parse_tpu_config as parse_config 7 | 8 | 9 | def dataset_fn(params, DatasetClass, batch_sizes, loader_files, dataset_hparams): 10 | loader = DatasetClass(batch_sizes, loader_files, dataset_hparams) 11 | inputs = {} 12 | targets = {} 13 | 14 | inputs['actions'] = loader['actions'] 15 | inputs['images'] = loader['images'][:, :-1] 16 | inputs['states'] = 
loader['states'][:, :-1] 17 | 18 | targets['images'] = loader['images'] 19 | targets['states'] = loader['states'] 20 | 21 | return inputs, targets 22 | 23 | 24 | if __name__ == '__main__': 25 | import argparse 26 | parser = argparse.ArgumentParser(description='launches video prediction training on tpu instances') 27 | parser.add_argument('experiment_file', type=str, default='', help='path of experiment file') 28 | parser.add_argument('--testing', action='store_true', help='if flag is supplied then assume testing mode (model run on cpu)') 29 | args = parser.parse_args() 30 | 31 | config = parse_config(args.experiment_file) 32 | dataset_hparams = config.pop('loader_hparams') 33 | model_hparams = config.pop('model_hparams') 34 | 35 | # add bucket_dir to hparams 36 | if 'BUCKET' in os.environ and 'bucket_dir' not in dataset_hparams: 37 | dataset_hparams['bucket_dir'] = os.environ['BUCKET'] 38 | config['save_dir'] = '{}/{}'.format(os.environ['BUCKET'], config['save_dir']) 39 | 40 | # extract train params from config 41 | input_dir = os.path.expanduser(config['data_directory']) 42 | batch_sizes = config['batch_sizes'] 43 | model_hparams['summary_dir'] = save_dir = os.path.expanduser(config['save_dir']) 44 | train_steps_per_save = config.get('train_steps_per_save', 5000) 45 | model_hparams['summary_queue_len'] = iter_per_loop = config.get('iter_per_loop', train_steps_per_save) 46 | model_hparams['image_summary_freq'] = config.get('image_summary_freq', 500) 47 | 48 | robots = config.get('robots', ['sawyer']) 49 | max_steps = config.get('max_steps', 300000) 50 | 51 | loader_files = ['{}/{}'.format(input_dir, r) for r in robots] 52 | DatasetClass = get_dataset_class(dataset_hparams.pop('dataset', 'TPU')) 53 | 54 | train_input = functools.partial(dataset_fn, DatasetClass=DatasetClass, batch_sizes=batch_sizes, 55 | loader_files=loader_files, dataset_hparams=dataset_hparams) 56 | 57 | PredictionModel = get_model(model_hparams.pop('model')) 58 | model = PredictionModel(None, 0, model_hparams.pop('graph_type'), True, '') 59 | 60 | tpu_cluster_resolver=None 61 | if not args.testing: 62 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(os.environ['TPU_NAME'], zone=os.environ['TPU_ZONE'], project=os.environ['PROJECT_ID']) 63 | 64 | tpu_config = tf.contrib.tpu.TPUConfig(iterations_per_loop=iter_per_loop) 65 | run_config = tf.contrib.tpu.RunConfig(cluster=tpu_cluster_resolver, model_dir=save_dir, save_checkpoints_steps=train_steps_per_save,tpu_config=tpu_config) 66 | 67 | tf.logging.set_verbosity(tf.logging.DEBUG) 68 | estimator = tf.contrib.tpu.TPUEstimator(model_fn=model.model_fn, 69 | use_tpu=not args.testing, 70 | train_batch_size=sum(batch_sizes), 71 | eval_batch_size=sum(batch_sizes), 72 | predict_batch_size=sum(batch_sizes), 73 | params=model_hparams, 74 | config=run_config) 75 | 76 | estimator.train(input_fn=train_input, max_steps=max_steps) 77 | -------------------------------------------------------------------------------- /scripts/visualize_dataset.py: -------------------------------------------------------------------------------- 1 | import io 2 | import imageio 3 | from flask import Flask, render_template, url_for, redirect, abort, send_file 4 | import argparse 5 | app = Flask(__name__) 6 | 7 | 8 | args=None 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser(description="Web based utility to visualize RoboNet trajectoriers (in hdf5 format). I don't even pretend like this is secure. 
Don't serve on a public website!") 11 | parser.add_argument('input_dir', type=str, help='path to stored hdf5 files') 12 | args = parser.parse_args() 13 | f = open('scripts/test.mp4', 'rb') 14 | vid = f.read() 15 | f.close() 16 | 17 | 18 | @app.route('/') 19 | def index(): 20 | filter_names = ['filt0', 'filt1'] 21 | 22 | traj0 = {'url': '/traj/0', 'text': 'here lies traj 0'} 23 | traj10 = {'url': '/traj/10', 'text': 'here lies traj 10'} 24 | filters = [[traj0], [traj0, traj10]] 25 | 26 | return render_template('index_template.html', filter_names=filter_names, filters=filters) 27 | 28 | 29 | @app.route('/traj/') 30 | def traj_page(traj_id): 31 | if traj_id != 0 and traj_id != 10: # page not found if traj id not valid 32 | abort(404) 33 | 34 | attr_list = [{'name': 'robot', 'value': 'sawyer'}, {'name': 'te', 'value': 'st'}] 35 | 36 | vid_url = '/traj/{}/cam{}.mp4'.format(traj_id, 0) 37 | name_list = ['cam0', 'cam1', 'cam2'] 38 | video_list = [{'url': vid_url, 'type':'video/mp4'}, {'url': vid_url, 'type':'video/mp4'}, {'url': vid_url, 'type':'video/mp4'}] 39 | return render_template('traj_template.html', traj_name=str(traj_id), videos=video_list, video_names=name_list, attributes=attr_list) 40 | 41 | 42 | @app.route('/traj//cam.mp4') 43 | def get_mp4(traj_id, cam_id): 44 | if traj_id != 0 and traj_id != 10: # page not found if traj id not valid 45 | abort(404) 46 | 47 | if not 0 <= cam_id < 5: # page not found if camera id is invalid 48 | abort(404) 49 | 50 | return send_file( 51 | io.BytesIO(vid), 52 | mimetype='video/mp4', 53 | as_attachment=True, 54 | attachment_filename='cam{}.mp4'.format(cam_id)) 55 | 56 | 57 | @app.after_request 58 | def add_header(r): 59 | """ 60 | Source: https://stackoverflow.com/questions/34066804/disabling-caching-in-flask 61 | Add headers to both force latest IE rendering engine or Chrome Frame, 62 | and also to cache the rendered page for 10 minutes. 63 | """ 64 | r.headers["Cache-Control"] = "no-cache, no-store, must-revalidate" 65 | r.headers["Pragma"] = "no-cache" 66 | r.headers["Expires"] = "0" 67 | r.headers['Cache-Control'] = 'public, max-age=0' 68 | return r 69 | 70 | 71 | if __name__ == '__main__': 72 | # disable caching trick 2 (same source as above) 73 | app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0 74 | app.run() 75 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='RoboNet', 6 | version='0.1.0', 7 | packages=['robonet'], 8 | ) 9 | --------------------------------------------------------------------------------
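A minimal usage sketch tying the pieces above together: the experiment YAMLs reference environment variables such as ${DATA_DIR} (resolved by the config parsers in robonet/yaml_util.py), and training is launched by pointing scripts/train_model.py at one of the config files. The data path below is a placeholder for wherever the RoboNet hdf5 files were downloaded, and launch_example.py is a hypothetical helper, not a file in the repository.

# launch_example.py -- hypothetical launcher; equivalent to invoking scripts/train_model.py from a shell
import os
import subprocess

# sawyer_grid_search.yaml reads ${DATA_DIR}/hdf5, so DATA_DIR must be set before the config is parsed
os.environ["DATA_DIR"] = os.path.expanduser("~/robonet_data")   # placeholder path to the downloaded dataset

subprocess.run(
    ["python", "scripts/train_model.py",
     "robonet_experiments/gpu/sawyer_grid_search.yaml",
     "--n_gpus", "1",
     "--name", "sawyer_fraction_search"],
    check=True)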