├── restore_dicts
│   ├── efficientnet-b0.p
│   ├── efficientnet-b1.p
│   ├── efficientnet-b2.p
│   ├── efficientnet-b3.p
│   ├── efficientnet-b4.p
│   ├── efficientnet-b5.p
│   ├── efficientnet-b6.p
│   ├── efficientnet-b7.p
│   ├── efficientnet-l2.p
│   └── efficientnet-l2-475.p
├── requirements.txt
├── ImageNet
│   ├── results_replication.txt
│   └── input_imagenet.py
├── fMoW
│   ├── match_test_gt.py
│   ├── results_replication.txt
│   ├── crop_fMoW.py
│   ├── input_fMoW.py
│   └── create_TFRecords_fMoW.py
├── CUB
│   ├── create_csv_cub.py
│   ├── create_tfrecords_cub.py
│   ├── results_replication.txt
│   └── input_cub.py
├── README.md
└── NABirds
    ├── results_replication.txt
    ├── input_nab.py
    └── create_tfrecords_nab.py
/restore_dicts/efficientnet-b0.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b0.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b1.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b1.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b2.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b2.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b3.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b3.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b4.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b4.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b5.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b5.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b6.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b6.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b7.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b7.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-l2.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-l2.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-l2-475.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-l2-475.p
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow 2 |
tensorflow-addons 3 | tensorboard 4 | tensorboard-plugin-profile 5 | Pillow 6 | scikit-learn 7 | scipy 8 | opencv-python 9 | pandas 10 | -------------------------------------------------------------------------------- /ImageNet/results_replication.txt: -------------------------------------------------------------------------------- 1 | --- Training 2 | 3 | - TNet 4 | 5 | The following command can be used to replicate the training of the TNet model reported in the paper: 6 | 7 | python train.py --to_train --to_evaluate_train --to_evaluate_val --batch_size 64 --num_classes 1000 --num_epochs 200 --initial_lr 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --keep_prob 0.5 --loc_per_grid 3.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.34375 --img_size_y 224 --img_size_x 224 --base_res_y 77 --base_res_x 77 --num_samples 1 --ls_dim 512 --num_patches_y 5 --num_patches_x 5 --num_res_levels 2 --num_do_layers 1 --descr_tag 'BagNet_77_TNet' --save_tag 'TNet_ImageNet' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 8 | 9 | 10 | 11 | - Baseline 12 | 13 | The following command can be used to replicate the training of the BagNet-77 baseline model reported in the paper: 14 | 15 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_size 64 --num_classes 1000 --num_epochs 200 --initial_lr 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --keep_prob 0.375 --ls_dim 512 --num_do_layers 1 --img_size_y 224 --img_size_x 224 --descr_tag 'BagNet_77' --save_tag 'BagNet_77_ImageNet' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 16 | 17 | 18 | 19 | 20 | 21 | --- Evaluation 22 | 23 | - TNet 24 | 25 | The following command can be used to evaluate a trained TNet model on the validation set of ImageNet: 26 | 27 | python train.py --to_evaluate_val --batch_size 64 --num_classes 1000 --loc_per_grid 3.0 --overlap 0.34375 --img_size_y 224 --img_size_x 224 --ls_dim 512 --num_patches_y 5 --num_patches_x 5 --base_res_y 77 --base_res_x 77 --num_res_levels 2 --descr_tag 'BagNet_77_TNet' --save_tag 'TNet_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 28 | 29 | The following command can be used to time the inference of TNet: 30 | 31 | python train.py --profile_step 10. 
--batches_to_time_range 50 501 --eval_epochs_num 1 --to_evaluate_val --batch_size 64 --num_classes 1000 --loc_per_grid 3.0 --overlap 0.34375 --img_size_y 224 --img_size_x 224 --ls_dim 512 --num_patches_y 5 --num_patches_x 5 --base_res_y 77 --base_res_x 77 --num_res_levels 2 --descr_tag 'BagNet_77_TNet' --save_tag 'TNet_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 32 | 33 | The following command can be used for advanced evaluation of TNet: 34 | 35 | python train.py --adv_eval_data --batches_to_time_range 0 -1 --eval_epochs_num 1 --to_evaluate_val --batch_size 64 --num_classes 1000 --loc_per_grid 3.0 --overlap 0.34375 --img_size_y 224 --img_size_x 224 --ls_dim 512 --num_patches_y 5 --num_patches_x 5 --base_res_y 77 --base_res_x 77 --num_res_levels 2 --descr_tag 'BagNet_77_TNet' --save_tag 'TNet_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --labels_file '/path/to/imagenet_lsvrc_2015_synsets.txt' --imagenet_metadata_file '/path/to/imagenet_metadata.txt' 36 | 37 | Advanced evaluation corresponds to the creation of an Excel file with information about the attended locations and the attendance probabilities of all candidate locations. 38 | 39 | 40 | - Baseline 41 | 42 | The following command can be used to evaluate a trained BagNet-77 model on the validation set of ImageNet: 43 | 44 | python train_bl.py --to_evaluate_val --batch_size 64 --num_classes 1000 --ls_dim 512 --img_size_y 224 --img_size_x 224 --descr_tag 'BagNet_77' --save_tag 'BagNet_77_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 45 | 46 | The following command can be used to time the inference of BagNet-77: 47 | 48 | python train_bl.py --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 1 --to_evaluate_val --batch_size 64 --num_classes 1000 --ls_dim 512 --img_size_y 224 --img_size_x 224 --descr_tag 'BagNet_77' --save_tag 'BagNet_77_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 49 | 50 | 51 | -------------------------------------------------------------------------------- /fMoW/match_test_gt.py: -------------------------------------------------------------------------------- 1 | """Match ground truth information with test images. 2 | Raw fMoW data can be downloaded here https://github.com/fMoW/dataset. 3 | The current script utilizes the rgb version of fMoW, and not the full version. 4 | fMoW data are split into training, validation and test sets. After download, for 5 | the training and validation sets, jpeg and json files are expected to reside 6 | in the following directory structure: 7 | /train/airport/airport_0/airport_0_0_rgb.jpg 8 | /train/airport/airport_0/airport_0_0_rgb.json 9 | ... 10 | 11 | /val/airport/airport_0/airport_0_0_rgb.jpg 12 | /val/airport/airport_0/airport_0_0_rgb.json 13 | ... 14 | 15 | For the test set, jpeg and json files are expected to reside 16 | in the following directory structure: 17 | /test/0011978/0011978_0_rgb.jpg 18 | /test/0011978/0011978_0_rgb.json 19 | ...
20 | 21 | The test set directory structure doesn't reveal the labels of the images, because 22 | it was initially released in the context of an IARPA challenge (https://www.iarpa.gov/challenges/fmow.html). 23 | However, given that the challenge is over, test set annotations are available 24 | for download with the rest of the data here https://github.com/fMoW/dataset. 25 | The downloaded ground truth test data consist of json files that 26 | reside in the following directory structure: 27 | /test_gt/airport/airport_0/airport_0_0_rgb.json 28 | /test_gt/airport/airport_0/airport_0_1_rgb.json 29 | ... 30 | 31 | The additional test_gt_mapping.json file is provided to establish a correspondence 32 | between the annotations under folder test_gt and the images under folder test. 33 | The current script organizes jpeg and json files for the test set in the following 34 | directory structure: 35 | /test_matched_with_gt/airport/airport_0/airport_0_0_rgb.jpg 36 | /test_matched_with_gt/airport/airport_0/airport_0_0_rgb.json 37 | ... 38 | """ 39 | 40 | import argparse 41 | import os 42 | import json 43 | from tqdm import tqdm 44 | import errno 45 | import shutil 46 | 47 | 48 | 49 | parser = argparse.ArgumentParser() 50 | 51 | parser.add_argument('--root_test_dir', type=str, default='/fMoW-rgb/', help='Root directory of the original test data.') 52 | parser.add_argument('--test_output_dir', type=str, default='/test_matched_with_gt/', help='Directory to output the matched data.') 53 | parser.add_argument('--match_gt_json_path', type=str, default='/test_gt_mapping.json', help='Path to test_gt_mapping.json.') 54 | 55 | FLAGS = parser.parse_args() 56 | 57 | def try_mkdir(input_dir): 58 | """Try to make directory. 59 | Args: 60 | input_dir: string; directory to create. 61 | Returns: 62 | - 63 | """ 64 | 65 | if (not os.path.isdir(input_dir)): 66 | try: 67 | os.makedirs(input_dir) 68 | except OSError as e: 69 | if (e.errno == errno.EEXIST): 70 | pass 71 | 72 | def main(argv=None): 73 | """Match data with ground truth information, 74 | and save to new directory structure.
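Each entry of test_gt_mapping.json pairs a ground-truth directory with a test directory through its 'input' and 'output' keys; for illustration only, with hypothetical values, an entry presumably looks like {"input": "test_gt/airport/airport_0", "output": "test/0011978"}.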
75 | Args: 76 | - 77 | Returns: 78 | - 79 | """ 80 | 81 | # Load test_gt_mapping.json, and iterate over its entries 82 | jsonData = json.load(open(FLAGS.match_gt_json_path)) 83 | for entry in tqdm(jsonData): 84 | src_test_dir = os.path.join(FLAGS.root_test_dir, entry['output']) 85 | src_test_gt_dir = os.path.join(FLAGS.root_test_dir, entry['input']) 86 | save_dir_suffix = entry['input'].split('/', 1)[1] 87 | save_dir = os.path.join(FLAGS.test_output_dir, save_dir_suffix) 88 | try_mkdir(save_dir) 89 | 90 | f_name_prefix_test_gt = entry['input'].split('/')[-1] 91 | f_name_prefix_test = entry['output'].split('/')[-1] 92 | 93 | for _, _, files in os.walk(src_test_dir): 94 | for f_src in files: 95 | # Ignore msrgb images 96 | if f_src.endswith('_rgb.jpg'): 97 | f_src_test_img = f_src 98 | 99 | f_scr_test_gt_json = f_src.replace('.jpg', '.json') 100 | f_scr_test_gt_json = f_scr_test_gt_json.replace(f_name_prefix_test, f_name_prefix_test_gt) 101 | 102 | f_dst_json = f_scr_test_gt_json 103 | f_dst_img = f_src_test_img.replace(f_name_prefix_test, f_name_prefix_test_gt) 104 | 105 | jsonData_src_test_gt = json.load(open(os.path.join(src_test_gt_dir, f_scr_test_gt_json))) 106 | # Ignore bounding boxes with unknown ids 107 | if not isinstance(jsonData_src_test_gt['bounding_boxes'], list): 108 | jsonData_src_test_gt['bounding_boxes'] = [jsonData_src_test_gt['bounding_boxes']] 109 | bb_lst = [] 110 | for bb in jsonData_src_test_gt['bounding_boxes']: 111 | if (bb['ID'] != -1): 112 | bb_lst.append(bb) 113 | 114 | jsonData_dst = jsonData_src_test_gt 115 | jsonData_dst['bounding_boxes'] = bb_lst 116 | 117 | # Save updated json file 118 | json.dump(jsonData_dst, open(os.path.join(save_dir, f_dst_json), 'w')) 119 | # Copy test image under the new directory 120 | shutil.copy(os.path.join(src_test_dir, f_src_test_img), os.path.join(save_dir, f_dst_img)) 121 | 122 | if __name__ == '__main__': 123 | main() 124 | -------------------------------------------------------------------------------- /CUB/create_csv_cub.py: -------------------------------------------------------------------------------- 1 | """Create csv files for the training and validation splits of the Caltech-UCSD Birds-200-2011 dataset. 2 | Each entry in the csv files contains the path to an image, its numeric label, and its human-readable 3 | label. Raw data can be downloaded here http://www.vision.caltech.edu/visipedia/CUB-200-2011.html. 4 | """ 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | import os 9 | import random 10 | import argparse 11 | import numpy as np 12 | import tensorflow as tf 13 | from tqdm import tqdm 14 | import pandas as pd 15 | 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | 20 | # This file (images.txt) contains the list of image file names, with each line corresponding to one image. 21 | # The content of the file is expected to be as follows: 22 | # <image_id> <image_name> 23 | # 24 | # where image_id is a unique numeric identifier for each image in the dataset, and image_name is the path 25 | # to the corresponding image file. An example line is the following: 26 | # 16 001.Black_footed_Albatross/Black_Footed_Albatross_0016_796067.jpg 27 | parser.add_argument('--imgs_list_txt', type=str, default='/images.txt', help='File with list of image paths.') 28 | 29 | # This file (train_test_split.txt) contains the suggested training/validation split, with each line corresponding 30 | # to one image.
# The content of the file is expected to be as follows: 31 | # <image_id> <is_training_image> 32 | # 33 | # where image_id is a unique numeric identifier for each image in the dataset (same as in images.txt), and 34 | # is_training_image takes either value 1 or 0, denoting that the file is in the training or the validation 35 | # set, respectively. An example line is the following: 36 | # 16 0 37 | parser.add_argument('--split_list_txt', type=str, default='/train_test_split.txt', help='File with information about train/validation split of the data.') 38 | 39 | parser.add_argument('--save_dir', type=str, default='/CUB_200_2011/', help='Output data directory') 40 | 41 | FLAGS = parser.parse_args() 42 | 43 | def find_image_files(imgs_list_txt): 44 | """Build lists of all image file paths, numeric labels, and 45 | human-readable labels. 46 | Args: 47 | imgs_list_txt: string; path to file with list of image paths. 48 | Returns: 49 | filenames: list of strings; it contains paths to image files. 50 | labels_values: list of ints; it contains numeric labels. 51 | labels_names: list of strings; it contains human-readable labels. 52 | """ 53 | 54 | lines = tf.io.gfile.GFile(imgs_list_txt, 'r').readlines() 55 | 56 | filenames = [] 57 | labels_values = [] 58 | labels_names = [] 59 | # Iterate over file lines 60 | for l in lines: 61 | if l: 62 | parts = l.strip().split(' ') 63 | assert len(parts) == 2 64 | filenames.append('/' + parts[1]) 65 | 66 | p = parts[1].split('.', 1) 67 | labels_values.append(int(p[0])) 68 | labels_names.append(p[1].split('/', 1)[0]) 69 | 70 | print('Found %d JPEG files across %d labels.' %(len(filenames), len(set(labels_names)))) 71 | 72 | return filenames, labels_values, labels_names 73 | 74 | def split_data(split_list_txt, filenames, labels_values, labels_names): 75 | """Create entries for csv files about the training and validation 76 | splits of the Caltech-UCSD Birds-200-2011 dataset. Each entry 77 | includes the path to an image, its numeric label, and its 78 | human-readable label. 79 | Args: 80 | split_list_txt: string; path to file with information about 81 | train/validation split of the data. 82 | filenames: list of strings; it contains paths to image files. 83 | labels_values: list of ints; it contains numeric labels. 84 | labels_names: list of strings; it contains human-readable labels. 85 | Returns: 86 | train_csv_entries: np array; it contains paths to the image files 87 | of the training split. It also contains the numeric label and 88 | the human-readable label of each image. It is of size 89 | [num_imgs_train, 3], where num_imgs_train is the number of 90 | images in the training split. 91 | validation_csv_entries: np array; it contains paths to the image 92 | files of the validation split. It also contains the numeric 93 | label and the human-readable label of each image. It is of 94 | size [num_imgs_val, 3], where num_imgs_val is the number of 95 | images in the validation split.
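Note: as a worked example of the encoding described above, split_val = ['1', '0', '1'] sends entries 0 and 2 to the training split and entry 1 to the validation split.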
96 | """ 97 | 98 | lines = tf.io.gfile.GFile(split_list_txt, 'r').readlines() 99 | 100 | split_val = [] 101 | for l in lines: 102 | if l: 103 | split_val.append(l.strip().split(' ')[1]) 104 | 105 | # Shuffle the ordering of image files to guarantee 106 | # random ordering of the images with respect to labels 107 | shuffled_index = list(range(len(filenames))) 108 | random.seed(12345) 109 | random.shuffle(shuffled_index) 110 | filenames = [filenames[i] for i in shuffled_index] 111 | labels_values = [labels_values[i] for i in shuffled_index] 112 | labels_names = [labels_names[i] for i in shuffled_index] 113 | split_val = [split_val[i] for i in shuffled_index] 114 | 115 | df_array = np.concatenate((np.expand_dims(np.asarray(filenames), 1), np.expand_dims(np.asarray(labels_values), 1), 116 | np.expand_dims(np.asarray(labels_names), 1)), axis=1) 117 | mask = np.asarray(split_val).astype(int).astype(bool)  # cast the '1'/'0' strings through int; casting non-empty strings directly to bool would mark every image as training 118 | inv_mask = (1 - mask).astype(bool) 119 | 120 | # Create entries for csv files about the training split 121 | train_csv_entries = df_array[mask, :] 122 | # Create entries for csv files about the validation split 123 | validation_csv_entries = df_array[inv_mask, :] 124 | 125 | print('Added %d entries to train split, and %d entries to validation split.' %(train_csv_entries.shape[0], validation_csv_entries.shape[0])) 126 | 127 | return train_csv_entries, validation_csv_entries 128 | 129 | def save_to_csv(csv_entries, save_dir, tag): 130 | """Save csv entries. 131 | Args: 132 | csv_entries: np array; it contains paths to image files with 133 | their numeric labels and human-readable labels. It is of size 134 | [num_imgs, 3], where num_imgs is the number of image files. 135 | save_dir: string; directory to save the csv file. 136 | tag: string; name of the csv file to save. 137 | Returns: 138 | - 139 | """ 140 | 141 | cols = ['fname', 'class_number', 'class_name'] 142 | df = pd.DataFrame(csv_entries, columns=cols) 143 | fp = os.path.join(save_dir, tag + '.csv') 144 | 145 | if (not os.path.isdir(FLAGS.save_dir)): 146 | os.makedirs(FLAGS.save_dir) 147 | df.to_csv(fp, encoding='utf-8', index=False) 148 | print('CSV saved at %s.' %fp) 149 | 150 | def main(argv=None): 151 | """Create csv files for the training and validation splits 152 | of the Caltech-UCSD Birds-200-2011 dataset. 153 | Args: 154 | - 155 | Returns: 156 | - 157 | """ 158 | 159 | # Build lists with image file paths, numeric labels, and human-readable labels 160 | filenames, labels_values, labels_names = find_image_files(FLAGS.imgs_list_txt) 161 | 162 | # Create csv entries for training and validation splits 163 | train_csv_entries, validation_csv_entries = split_data(FLAGS.split_list_txt, filenames, labels_values, labels_names) 164 | 165 | # Save csv files for training and validation splits 166 | save_to_csv(train_csv_entries, FLAGS.save_dir, 'train_anno') 167 | save_to_csv(validation_csv_entries, FLAGS.save_dir, 'validation_anno') 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /fMoW/results_replication.txt: -------------------------------------------------------------------------------- 1 | --- Training 2 | 3 | - TNet 4 | 5 | The TNet model reported in the paper is trained in two steps: first, TNet is trained on images of size 448x448 px, and then it is fine-tuned on images of size 896x896 px.
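The second step restores the checkpoint produced by the first step (via the --restore_dir flag in the command below), and extends --loc_per_grid to two values, one for each grid from which locations can be attended.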
The first step can be replicated with the following command: 6 | 7 | python train.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_classes 62 --num_epochs 40 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.5 --block_drop_rate 0.3 --loc_per_grid 2.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.2 --perFReg_reinf_weight 0.2 --overlap 0.5 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 3 --num_patches_x 3 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNet_lg3x3' --save_tag 'TNet_fMoW_448' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 8 | 9 | The second step can be replicated with the following command: 10 | 11 | python train.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_classes 62 --num_epochs 10 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.5 --block_drop_rate 0.5 --loc_per_grid 2.0 1.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.05 --perFReg_reinf_weight 0.05 --overlap 0.5 --img_size_y 896 --img_size_x 896 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 3 --num_patches_x 3 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 3 --perFReg_cap 2 --descr_tag 'EfficientNet_lg3x3' --save_tag 'TNet_fMoW' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/dir/with/ckpt/to/restore/' 12 | 13 | 14 | 15 | - Baselines 16 | 17 | The following command can be used to replicate the training of the EfficientNet-B0 model on the cropped images: 18 | 19 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 65 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --l2_reg 0.00001 --dropout_rate 0.75 --block_drop_rate 0.5 --img_size_y 224 --img_size_x 224 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet' --save_tag 'bl_fMoW_cropped' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 20 | 21 | The following command can be used to replicate the training of the EfficientNet-B0 model on images of size 224x224 px: 22 | 23 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 60 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --l2_reg 0.00001 --dropout_rate 0.75 --block_drop_rate 0.5 --img_size_y 224 --img_size_x 224 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet' --save_tag 'bl_fMoW_224' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 24 | 25 | Since this model is trained on images of relatively small size, for purposes of training efficiency, we used TFRecords created with the following command (for the other models we use TFRecords which are created as 
described in the README.md of our repository): 26 | 27 | python create_TFRecords_fMoW.py --train_directory '/path/to/training/set/dir/' --validation_directory '/path/to/validation/set/dir/' --test_directory '/path/to/test/set/dir/' --output_directory '/path/to/output/dir/' --maximum_min_dim 275 28 | 29 | The following command can be used to replicate the training of the EfficientNet-B0 model on images of size 448x448 px: 30 | 31 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 32 --num_epochs 30 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --l2_reg 0.00001 --dropout_rate 0.75 --block_drop_rate 0.3 --img_size_y 448 --img_size_x 448 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet' --save_tag 'bl_fMoW_448' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 32 | 33 | The following command can be used to replicate the training of the EfficientNet-B0 model on images of size 896x896 px: 34 | 35 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 32 --num_epochs 30 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --l2_reg 0.00001 --dropout_rate 0.3 --block_drop_rate 0.2 --img_size_y 896 --img_size_x 896 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet' --save_tag 'bl_fMoW_896' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 36 | 37 | 38 | 39 | 40 | 41 | --- Evaluation 42 | 43 | - TNet 44 | 45 | The following command can be used to evaluate a trained TNet model on the test set of fMoW, with 2 processing levels (images of size 448x448 px) and 2 attended locations: 46 | 47 | python train.py --to_evaluate_test --batch_norm --batch_size 64 --num_classes 62 --loc_per_grid 2.0 --overlap 0.5 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 3 --num_patches_x 3 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNet_lg3x3' --save_tag 'TNet_fMoW_448' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 48 | 49 | The following command can be used to evaluate a trained TNet model on the test set of fMoW, with 3 processing levels (images of size 896x896 px) and 4 attended locations: 50 | 51 | python train.py --to_evaluate_test --batch_norm --batch_size 64 --num_classes 62 --loc_per_grid 2.0 1.0 --overlap 0.5 --img_size_y 896 --img_size_x 896 --pos_dim_divisor 4 --num_patches_y 3 --num_patches_x 3 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 3 --descr_tag 'EfficientNet_lg3x3' --save_tag 'TNet_fMoW' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 52 | 53 | The following flags can be added to the previous evaluation commands in order to time the inference of TNet: 54 | 55 | --profile_step 10. 
--batches_to_time_range 50 501 --eval_epochs_num 1 56 | 57 | The following flags can be added to the previous evaluation commands for advanced evaluation of TNet: 58 | 59 | --adv_eval_data --batches_to_time_range 0 -1 --eval_epochs_num 1 60 | 61 | Advanced evaluation corresponds to the creation of an Excel file with information about the attended locations and the attendance probabilities of all candidate locations. 62 | 63 | 64 | 65 | - Baselines 66 | 67 | The following command can be used to evaluate, on the test set of fMoW, an EfficientNet-B0 model trained on cropped images of size 224x224 px: 68 | 69 | python train_bl.py --to_evaluate_test --batch_norm --batch_size 64 --img_size_y 224 --img_size_x 224 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet0' --save_tag 'bl_fMoW_cropped' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 70 | 71 | The following command can be used to evaluate, on the test set of fMoW, an EfficientNet-B0 model trained on images of size 224x224 px: 72 | 73 | python train_bl.py --to_evaluate_test --batch_norm --batch_size 64 --img_size_y 224 --img_size_x 224 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet0' --save_tag 'bl_fMoW_224' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 74 | 75 | The following command can be used to evaluate, on the test set of fMoW, an EfficientNet-B0 model trained on images of size 448x448 px: 76 | 77 | python train_bl.py --to_evaluate_test --batch_norm --batch_size 64 --img_size_y 448 --img_size_x 448 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet0' --save_tag 'bl_fMoW_448' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 78 | 79 | The following command can be used to evaluate, on the test set of fMoW, an EfficientNet-B0 model trained on images of size 896x896 px: 80 | 81 | python train_bl.py --to_evaluate_test --batch_norm --batch_size 64 --img_size_y 896 --img_size_x 896 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet0' --save_tag 'bl_fMoW_896' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 82 | 83 | The following flags can be added to the previous evaluation commands in order to time the inference of the baselines: 84 | 85 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 1 86 | 87 | 88 | -------------------------------------------------------------------------------- /fMoW/crop_fMoW.py: -------------------------------------------------------------------------------- 1 | """Crop fMoW images based on bounding box annotations.
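A context margin is added around every annotated box before cropping, following the _process_file strategy of the official fMoW baseline (see the link in _process_image_files_batch). As a worked example of the multipliers used there: a box narrower than 10% of the image width gets contextMultWidth = 2, i.e. a buffer of int((box_width * 2) / 2.0) = box_width pixels on each side, so the crop is roughly three times as wide as the original box.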
2 | """ 3 | 4 | from __future__ import absolute_import, division, print_function 5 | 6 | import argparse 7 | from datetime import datetime 8 | import os 9 | import random 10 | import sys 11 | import threading 12 | import json 13 | from multiprocessing import cpu_count 14 | import cv2 15 | import copy 16 | 17 | import numpy as np 18 | import six 19 | import tensorflow as tf 20 | 21 | 22 | 23 | parser = argparse.ArgumentParser() 24 | 25 | parser.add_argument('--train_directory', type=str, default='/train/', help='Training data directory.') 26 | parser.add_argument('--validation_directory', type=str, default='/val/', help='Validation data directory.') 27 | parser.add_argument('--test_directory', type=str, default='/test_matched_with_gt/', help='Test data directory.') 28 | parser.add_argument('--output_directory', type=str, default='/data_cropped/', help='Output data directory.') 29 | parser.add_argument('--num_threads', type=int, default=16, help='Number of threads to parallelize processing.') 30 | 31 | FLAGS = parser.parse_args() 32 | 33 | category_names = ['airport', 'airport_hangar', 'airport_terminal', 'amusement_park', 'aquaculture', 'archaeological_site', 'barn', 'border_checkpoint', 'burial_site', 'car_dealership', 'construction_site', 34 | 'crop_field', 'dam', 'debris_or_rubble', 'educational_institution', 'electric_substation', 'factory_or_powerplant', 'fire_station', 'flooded_road', 'fountain', 'gas_station', 'golf_course', 35 | 'ground_transportation_station', 'helipad', 'hospital', 'interchange', 'lake_or_pond', 'lighthouse', 'military_facility', 'multi-unit_residential', 'nuclear_powerplant', 'office_building', 36 | 'oil_or_gas_facility', 'park', 'parking_lot_or_garage', 'place_of_worship', 'police_station', 'port', 'prison', 'race_track', 'railway_bridge', 'recreational_facility', 'impoverished_settlement', 37 | 'road_bridge', 'runway', 'shipyard', 'shopping_mall', 'single-unit_residential', 'smokestack', 'solar_farm', 'space_facility', 'stadium', 'storage_tank','surface_mine', 'swimming_pool', 38 | 'toll_booth', 'tower', 'tunnel_opening', 'waste_disposal', 'water_treatment_facility', 'wind_farm', 'zoo'] 39 | 40 | def _process_image_files_batch(thread_index, ranges, file_paths, categories, outDir): 41 | """Execute 1 thread that processes images and saves crops according 42 | to bounding box annotations. 43 | Args: 44 | thread_index: int; unique thread identifier. 45 | ranges: list of ints; it contains the range of images to process. 46 | file_paths: list of strings; it contains paths to image files. 47 | categories: list of strings; it contains human-readable labels. 48 | outDir: string; directory to save output data. 49 | Returns: 50 | - 51 | """ 52 | 53 | # Process each file 54 | files_in_thread = np.arange(ranges[thread_index][0], ranges[thread_index][1], dtype=int) 55 | img_num = 0 56 | bbox_num = 0 57 | for i in files_in_thread: 58 | img_num += 1 59 | f_src_img = file_paths[i] 60 | f_src_json = f_src_img.replace('.jpg', '.json') 61 | 62 | # Load image 63 | img = cv2.imread(f_src_img).astype(np.float32) 64 | 65 | # Load json file with image information 66 | jsonData = json.load(open(f_src_json)) 67 | if not isinstance(jsonData['bounding_boxes'], list): 68 | jsonData['bounding_boxes'] = [jsonData['bounding_boxes']] 69 | 70 | label = categories[i] 71 | for bb in jsonData['bounding_boxes']: 72 | category = bb['category'] 73 | if ((category != label) or (bb['ID'] == -1)): 74 | continue 75 | bbox_num += 1 76 | # Each bounding box is a list of 4 ints. 
The first two entries (box[0] and box[1]) 77 | # are the coordinates in pixels of the top left corner of the box (first the horizontal 78 | # and then the vertical coordinate), and the last two entries (box[2] and box[3]) 79 | # are the width and the height of the box 80 | box = bb['box'] 81 | 82 | # Ignore tiny boxes 83 | if box[2] <= 2 or box[3] <= 2: 84 | continue 85 | 86 | # Add margin around a bounding box for more contextual information. 87 | # The followed strategy is based on _process_file function from 88 | # https://github.com/fMoW/baseline/blob/master/code/data_ml_functions/dataFunctions.py 89 | contextMultWidth = 0.15 90 | contextMultHeight = 0.15 91 | 92 | wRatio = float(box[2]) / img.shape[1] 93 | hRatio = float(box[3]) / img.shape[0] 94 | 95 | if ((wRatio < 0.5) and (wRatio >= 0.4)): 96 | contextMultWidth = 0.2 97 | if ((wRatio < 0.4) and (wRatio >= 0.3)): 98 | contextMultWidth = 0.3 99 | if ((wRatio < 0.3) and (wRatio >= 0.2)): 100 | contextMultWidth = 0.5 101 | if ((wRatio < 0.2) and (wRatio >= 0.1)): 102 | contextMultWidth = 1 103 | if (wRatio < 0.1): 104 | contextMultWidth = 2 105 | 106 | if ((hRatio < 0.5) and (hRatio >= 0.4)): 107 | contextMultHeight = 0.2 108 | if ((hRatio < 0.4) and (hRatio >= 0.3)): 109 | contextMultHeight = 0.3 110 | if ((hRatio < 0.3) and (hRatio >= 0.2)): 111 | contextMultHeight = 0.5 112 | if ((hRatio < 0.2) and (hRatio >= 0.1)): 113 | contextMultHeight = 1 114 | if (hRatio < 0.1): 115 | contextMultHeight = 2 116 | 117 | widthBuffer = int((box[2] * contextMultWidth) / 2.0) 118 | heightBuffer = int((box[3] * contextMultHeight) / 2.0) 119 | 120 | r1 = box[1] - heightBuffer 121 | r2 = box[1] + box[3] + heightBuffer 122 | c1 = box[0] - widthBuffer 123 | c2 = box[0] + box[2] + widthBuffer 124 | 125 | if (r1 < 0): 126 | r1 = 0 127 | if (r2 > img.shape[0]): 128 | r2 = img.shape[0] 129 | if (c1 < 0): 130 | c1 = 0 131 | if (c2 > img.shape[1]): 132 | c2 = img.shape[1] 133 | 134 | if ((r1 >= r2) or (c1 >= c2)): 135 | continue 136 | 137 | subImg = img[r1:r2, c1:c2, :] 138 | 139 | jsonData_dst = copy.deepcopy(jsonData) 140 | bb['box'] = [0., 0., 1.0, 1.0] 141 | jsonData_dst['bounding_boxes'] = [bb] 142 | jsonData_dst['img_height'] = r2 - r1  # record the crop size in the json that is saved alongside the crop 143 | jsonData_dst['img_width'] = c2 - c1 144 | 145 | # Determine output directory and save files 146 | slashes = [k for k, ltr in enumerate(f_src_img) if ltr == '/'] 147 | outBaseName = '%s_%s' %(category, bb['ID']) 148 | currOut = os.path.join(outDir, f_src_img[(slashes[-3] + 1):slashes[-1]], outBaseName) 149 | 150 | if (not os.path.isdir(currOut)): 151 | try: 152 | os.makedirs(currOut) 153 | except OSError: 154 | print("Directory already created.") 155 | 156 | f_name = os.path.basename(f_src_img) 157 | f_dst_img = os.path.join(currOut, f_name) 158 | f_dst_json = f_dst_img.replace('.jpg', '.json') 159 | 160 | cv2.imwrite(f_dst_img, subImg) 161 | json.dump(jsonData_dst, open(f_dst_json, 'w')) 162 | 163 | print('%s [thread %d]: Wrote %d images with %d bboxes.' %(datetime.now(), thread_index, img_num, bbox_num)) 164 | sys.stdout.flush() 165 | 166 | def _process_image_files(file_paths, categories, outDir): 167 | """Process images and save crops according to bounding box annotations. 168 | Args: 169 | file_paths: list of strings; it contains paths to image files. 170 | categories: list of strings; it contains human-readable labels. 171 | outDir: string; directory to save output data.
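Note: the file list is split with np.linspace into num_threads contiguous index ranges; for example, 100 images and 4 threads yield the per-thread ranges [0, 25), [25, 50), [50, 75), [75, 100).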
172 | Returns: 173 | - 174 | """ 175 | 176 | # Break images into batches 177 | num_threads = FLAGS.num_threads 178 | spacing = np.linspace(0, len(file_paths), num_threads + 1).astype(int) 179 | ranges = [] 180 | for i in range(len(spacing) - 1): 181 | ranges.append([spacing[i], spacing[i + 1]]) 182 | 183 | # Launch a thread for each batch 184 | print('Launching %d threads for spacings: %s' % (num_threads, ranges)) 185 | sys.stdout.flush() 186 | 187 | # Create a mechanism for monitoring when all threads are finished. 188 | coord = tf.train.Coordinator() 189 | 190 | # Run threads 191 | threads = [] 192 | for thread_index in range(len(ranges)): 193 | args = (thread_index, ranges, file_paths, categories, outDir) 194 | t = threading.Thread(target=_process_image_files_batch, args=args) 195 | t.start() 196 | threads.append(t) 197 | 198 | # Wait for all the threads to terminate. 199 | coord.join(threads) 200 | print('%s: Finished writing all %d images in data set.' %(datetime.now(), len(file_paths))) 201 | sys.stdout.flush() 202 | 203 | def _find_image_files(data_dir): 204 | """Build lists of all image file paths and human-readable labels in 205 | a data set. 206 | Args: 207 | data_dir: string; path to data set. 208 | Returns: 209 | file_paths: list of strings; it contains paths to image files. 210 | categories: list of strings; it contains human-readable labels. 211 | """ 212 | 213 | # Construct the lists of image files and categories 214 | print('Determining list of input files and categories from %s.' % data_dir) 215 | file_paths = [] 216 | categories = [] 217 | label_index = 1 218 | for category in category_names: 219 | jpeg_file_path = os.path.join(data_dir, category, '*', category + '_*_rgb.jpg') 220 | matching_files = tf.io.gfile.glob(jpeg_file_path) 221 | 222 | file_paths.extend(matching_files) 223 | categories.extend([category] * len(matching_files)) 224 | 225 | if (not (label_index % 10)): 226 | print('Finished finding files in %d of %d classes.' %(label_index, len(category_names))) 227 | label_index += 1 228 | 229 | # Shuffle images to distribute large images to different threads 230 | # and avoid bottlenecks, since image size seems to be class specific 231 | shuffled_index = list(range(len(file_paths))) 232 | random.seed(12345) 233 | random.shuffle(shuffled_index) 234 | 235 | file_paths = [file_paths[i] for i in shuffled_index] 236 | categories = [categories[i] for i in shuffled_index] 237 | 238 | print('Found %d .jpg files across %d labels inside %s.' %(len(file_paths), len(category_names), data_dir)) 239 | 240 | return file_paths, categories 241 | 242 | def _process_dataset(directory, outDir): 243 | """Process a complete data set (training, validation or test). 244 | Args: 245 | directory: string; path to data set. 246 | outDir: string; directory to save output data. 247 | Returns: 248 | - 249 | """ 250 | 251 | file_paths, categories = _find_image_files(directory) 252 | _process_image_files(file_paths, categories, outDir) 253 | 254 | def main(argv=None): 255 | """Crop fMoW training, validation and testing images 256 | based on bounding box annotations.
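Crops are written under FLAGS.output_directory in 'val', 'test' and 'train' subdirectories, with one folder per annotated object named '<category>_<ID>'.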
257 | Args: 258 | - 259 | Returns: 260 | - 261 | """ 262 | 263 | _process_dataset(FLAGS.validation_directory, os.path.join(FLAGS.output_directory, 'val')) 264 | _process_dataset(FLAGS.test_directory, os.path.join(FLAGS.output_directory, 'test')) 265 | _process_dataset(FLAGS.train_directory, os.path.join(FLAGS.output_directory, 'train')) 266 | 267 | if __name__ == '__main__': 268 | main() -------------------------------------------------------------------------------- /CUB/create_tfrecords_cub.py: -------------------------------------------------------------------------------- 1 | """Convert Caltech-UCSD Birds-200-2011 images to TFRecords. Information about the training and 2 | validation splits of the data reside in csv files, which are created by using create_csv_cub.py. 3 | Raw data can be downloaded here http://www.vision.caltech.edu/visipedia/CUB-200-2011.html, and 4 | are assumed to reside in the following directory structure: 5 | images/001.Black_footed_Albatross/Black_Footed_Albatross_0001_796111.jpg 6 | images/002.Laysan_Albatross/Laysan_Albatross_0001_545.jpg 7 | ... 8 | """ 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import argparse 13 | from datetime import datetime 14 | import os 15 | import random 16 | import sys 17 | import threading 18 | import scipy.io 19 | import pandas as pd 20 | import six 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | 26 | 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('--img_dir', type=str, default='/CUB_200_2011/images/', help='Directory with raw image data.') 30 | parser.add_argument('--train_csv_path', type=str, default='/CUB_200_2011/train_anno.csv', help='Path to csv file with information about the images in the training split.') 31 | parser.add_argument('--dev_csv_path', type=str, default='/CUB_200_2011/validation_anno.csv', help='Path to csv file with information about the images in the validation split.') 32 | parser.add_argument('--output_dir', type=str, default='/TFRecords/', help='Output data directory.') 33 | 34 | parser.add_argument('--train_shards_num', type=int, default=16, help='Number of shards in training TFRecord files.') 35 | parser.add_argument('--dev_shards_num', type=int, default=16, help='Number of shards in validation TFRecord files.') 36 | parser.add_argument('--num_threads', type=int, default=16, help='Number of threads to parallelize processing.') 37 | 38 | FLAGS = parser.parse_args() 39 | 40 | def _int64_feature(value): 41 | """Insert int features into Example proto. 42 | Args: 43 | value: int or list of ints; features to insert 44 | in Example proto. 45 | Returns: 46 | feature: example proto; it contains a list of ints. 47 | """ 48 | 49 | if ((not isinstance(value, list)) and (not isinstance(value, np.ndarray))): 50 | value = [value] 51 | 52 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=value)) 53 | 54 | return feature 55 | 56 | def _float_feature(value): 57 | """Insert float features into Example proto. 58 | Args: 59 | value: float or list of floats; features to insert 60 | in Example proto. 61 | Returns: 62 | feature: example proto; it contains a list of floats. 63 | """ 64 | 65 | if ((not isinstance(value, list)) and (not isinstance(value, np.ndarray))): 66 | value = [value] 67 | 68 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=value)) 69 | 70 | return feature 71 | 72 | def _bytes_feature(value): 73 | """Insert byte features into Example proto. 
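Eager tensors are unwrapped via .numpy(), and Python 3 text strings are UTF-8 encoded before being wrapped in a BytesList.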
74 | Args: 75 | value: string or list of strings; features to 76 | insert in Example proto. 77 | Returns: 78 | feature: example proto; it contains a byte list. 79 | """ 80 | 81 | if (isinstance(value, type(tf.constant(0)))): 82 | value = value.numpy() 83 | if (six.PY3 and isinstance(value, six.text_type)): 84 | value = six.binary_type(value, encoding='utf-8') 85 | 86 | feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 87 | 88 | return feature 89 | 90 | def _convert_to_example(filename, image_buffer, label_value, label_name, height, width): 91 | """Build an Example proto for an image. 92 | Args: 93 | filename: string; path to image file. 94 | image_buffer: string; JPEG encoded image. 95 | label_value: int; numeric ground truth label. 96 | label_name: string; human-readable label. 97 | height: int; image height in pixels. 98 | width: int; image width in pixels. 99 | Returns: 100 | example: example proto; it contains the following fields: 101 | image/height: int; image height in pixels. 102 | image/width: int; image width in pixels. 103 | image/colorspace: string; colorspace, always 'RGB'. 104 | image/channels: int; number of channels, always 3. 105 | image/class/label: int; index of a classification label in range [1, 200]. 106 | image/class/text: string; human-readable label. 107 | image/format: string; image format, always 'JPEG'. 108 | image/filename: string; image file basename. 109 | image/encoded: string; JPEG encoded image. 110 | """ 111 | 112 | colorspace = 'RGB' 113 | channels = 3 114 | image_format = 'JPEG' 115 | 116 | example = tf.train.Example(features=tf.train.Features(feature={ 117 | 'image/height': _int64_feature(height), 118 | 'image/width': _int64_feature(width), 119 | 'image/colorspace': _bytes_feature(colorspace), 120 | 'image/channels': _int64_feature(channels), 121 | 'image/class/label': _int64_feature(label_value), 122 | 'image/class/text': _bytes_feature(label_name), 123 | 'image/format': _bytes_feature(image_format), 124 | 'image/filename': _bytes_feature(os.path.basename(filename)), 125 | 'image/encoded': _bytes_feature(image_buffer) 126 | })) 127 | 128 | return example 129 | 130 | def _process_image(filename): 131 | """Process a single image file. 132 | Args: 133 | filename: string; path to an image file. 134 | Returns: 135 | image_buffer: string; JPEG encoded image. 136 | height: int; image height in pixels. 137 | width: int; image width in pixels. 138 | """ 139 | 140 | # Read image file 141 | image_data = tf.io.read_file(filename) 142 | 143 | # Decode image 144 | try: 145 | image = tf.io.decode_jpeg(image_data, channels=3) 146 | except Exception: 147 | print("Oops! %s." %filename) 148 | raise  # re-raise instead of returning a sentinel, since the caller unpacks three values 149 | 150 | # Assert that the image has the appropriate dimensions 151 | assert len(image.shape) == 3 152 | height = image.shape[0] 153 | width = image.shape[1] 154 | assert image.shape[2] == 3 155 | 156 | return image_data, height, width 157 | 158 | def _process_image_files_batch(thread_index, ranges, name, filenames, 159 | labels_values, labels_names, num_shards): 160 | """Execute 1 thread that processes images and saves them as TFRecords 161 | of Example protos. 162 | Args: 163 | thread_index: int; unique thread identifier. 164 | ranges: list of ints; it contains the range of images to 165 | process. 166 | name: string; unique identifier specifying the data set. 167 | filenames: list of strings; it contains paths to image files. 168 | labels_values: list of ints; it contains numeric labels.
169 | labels_names: list of strings; it contains human-readable labels. 170 | num_shards: int; number of shards. 171 | Returns: 172 | - 173 | """ 174 | 175 | # Each thread produces N shards where N = int(num_shards / num_threads). 176 | # For instance, if num_shards = 128, and the num_threads = 2, then the first 177 | # thread would produce shards [0, 64) 178 | num_threads = len(ranges) 179 | assert not num_shards % num_threads 180 | num_shards_per_batch = int(num_shards / num_threads) 181 | 182 | shard_ranges = np.linspace(ranges[thread_index][0], 183 | ranges[thread_index][1], 184 | num_shards_per_batch + 1).astype(int) 185 | num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0] 186 | 187 | # Generate each shard 188 | counter = 0 189 | for s in range(num_shards_per_batch): 190 | shard = thread_index * num_shards_per_batch + s 191 | output_filename = '%s-%.4d-of-%.4d' % (name, (shard+1), num_shards) 192 | output_file = os.path.join(FLAGS.output_dir, output_filename) 193 | writer = tf.io.TFRecordWriter(output_file) 194 | 195 | # Process each file for a shard 196 | shard_counter = 0 197 | files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) 198 | for i in files_in_shard: 199 | filename = filenames[i] 200 | label_value = labels_values[i] 201 | label_name = labels_names[i] 202 | 203 | # Process an image 204 | image_buffer, height, width = _process_image(filename) 205 | 206 | # Create an Example proto 207 | example = _convert_to_example(filename, image_buffer, label_value, 208 | label_name, height, width) 209 | 210 | # Write to TFRecord 211 | writer.write(example.SerializeToString()) 212 | shard_counter += 1 213 | counter += 1 214 | 215 | if (not (counter % 1000)): 216 | print('%s [thread %d]: Processed %d of %d images in thread batch.' % 217 | (datetime.now(), thread_index, counter, num_files_in_thread)) 218 | sys.stdout.flush() 219 | 220 | writer.close() 221 | print('%s [thread %d]: Wrote %d images to %s' % 222 | (datetime.now(), thread_index, shard_counter, output_file)) 223 | sys.stdout.flush() 224 | shard_counter = 0 225 | print('%s [thread %d]: Wrote %d images to %d shards.' 226 | %(datetime.now(), thread_index, counter, num_files_in_thread)) 227 | sys.stdout.flush() 228 | 229 | def _process_image_files(name, filenames, labels_values, labels_names, num_shards): 230 | """Process images and save them as TFRecords of Example protos. 231 | Args: 232 | name: string; unique identifier specifying the data set. 233 | filenames: list of strings; it contains paths to image files. 234 | labels_values: list of ints; it contains numeric labels. 235 | labels_names: list of strings; it contains human-readable labels. 236 | num_shards: int; number of shards. 
237 | Returns: 238 | - 239 | """ 240 | 241 | assert len(filenames) == len(labels_values) == len(labels_names) 242 | 243 | # Break images into batches 244 | spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(int) 245 | ranges = [] 246 | for i in range(len(spacing) - 1): 247 | ranges.append([spacing[i], spacing[i + 1]]) 248 | 249 | # Launch a thread for each batch 250 | print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges)) 251 | sys.stdout.flush() 252 | 253 | # Create a mechanism for monitoring threads' execution 254 | coord = tf.train.Coordinator() 255 | 256 | # Run threads 257 | threads = [] 258 | for thread_index in range(len(ranges)): 259 | args = (thread_index, ranges, name, filenames, 260 | labels_values, labels_names, num_shards) 261 | t = threading.Thread(target=_process_image_files_batch, args=args) 262 | t.start() 263 | threads.append(t) 264 | 265 | # Wait for all the threads to terminate 266 | coord.join(threads) 267 | print('%s: Finished writing all %d images in data set.' % 268 | (datetime.now(), len(filenames))) 269 | sys.stdout.flush() 270 | 271 | def _find_image_files(name, data_dir, csv_file): 272 | """Build lists of image file paths, numeric labels, and 273 | human-readable labels. 274 | Args: 275 | name: string; unique identifier specifying the data set. 276 | data_dir: string; path to data set. 277 | csv_file: string; path to csv file with information about 278 | the data. 279 | Returns: 280 | filenames: list of strings; it contains paths to image files. 281 | labels_values: list of ints; it contains numeric labels. 282 | labels_names: list of strings; it contains human-readable labels. 283 | """ 284 | 285 | df = pd.read_csv(csv_file) 286 | filenames = df['fname'].tolist() 287 | filenames = [os.path.join(data_dir, f.lstrip('/')) for f in filenames] 288 | 289 | labels_values = df.to_numpy()[:, 1].astype(int).tolist() 290 | labels_names = df.to_numpy()[:, 2].tolist() 291 | 292 | print('Found %d JPEG files across %d labels inside %s.' % 293 | (len(filenames), len(np.unique(labels_values)), data_dir)) 294 | 295 | return filenames, labels_values, labels_names 296 | 297 | def _process_dataset(name, directory, num_shards, csv_file): 298 | """Process a complete data set and save it in TFRecords. 299 | Args: 300 | name: string; unique identifier specifying the data set. 301 | directory: string; path to data set. 302 | num_shards: int; number of shards. 303 | csv_file: string; path to csv file with information about 304 | the data. 305 | Returns: 306 | - 307 | """ 308 | 309 | filenames, labels_values, labels_names = _find_image_files(name, directory, csv_file) 310 | _process_image_files(name, filenames, labels_values, labels_names, num_shards) 311 | 312 | def main(argv=None): 313 | """Convert Caltech-UCSD Birds-200-2011 training and validation 314 | images to TFRecords.
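Records written by this script can be parsed back with a feature spec that mirrors _convert_to_example; a minimal sketch (variable names are illustrative only):
        spec = {'image/encoded': tf.io.FixedLenFeature([], tf.string),
                'image/class/label': tf.io.FixedLenFeature([], tf.int64)}
        features = tf.io.parse_single_example(serialized_example, spec)
        image = tf.io.decode_jpeg(features['image/encoded'], channels=3)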
315 | Args: 316 | - 317 | Returns: 318 | - 319 | """ 320 | 321 | assert not FLAGS.train_shards_num % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.train_shards_num') 322 | assert not FLAGS.dev_shards_num % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.dev_shards_num') 323 | 324 | if (not os.path.isdir(FLAGS.output_dir)): 325 | os.makedirs(FLAGS.output_dir) 326 | print('Saving results to %s' % FLAGS.output_dir) 327 | sys.stdout.flush() 328 | 329 | # Create TFRecords 330 | _process_dataset('validation', FLAGS.img_dir, FLAGS.dev_shards_num, FLAGS.dev_csv_path) 331 | _process_dataset('train', FLAGS.img_dir, FLAGS.train_shards_num, FLAGS.train_csv_path) 332 | 333 | if __name__ == '__main__': 334 | main() 335 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Traversal Network (TNet) 2 | 3 | We provide the TensorFlow implementation of the Traversal Network (TNet) architecture, presented in "Hard-Attention for Scalable Image Classification" (https://arxiv.org/pdf/2102.10212.pdf). The code is organized according to the datasets used for the experimental evaluation of TNet. Each folder contains code to convert raw data to TFRecords, to stream input batches, to build TNet and baseline models, and to train and evaluate the models. Learned weights, along with instructions to replicate the results presented in the paper, are provided as well. 4 | 5 | ## ImageNet ILSVRC 2012 6 | 7 | All related files can be found under the `/ImageNet/` folder. 8 | 9 | ### Data preparation 10 | 11 | Detailed instructions to download the raw data, and to create related metadata files, are provided in `create_tfrecords_imagenet.py`. Once the necessary files are created and the data directories are organized appropriately, the following command can be used to convert raw data to TFRecords: 12 | 13 | ``` 14 | python create_tfrecords_imagenet.py --output_directory '/path/to/output/dir/' 15 | --labels_file '/path/to/imagenet_lsvrc_2015_synsets.txt' 16 | --imagenet_metadata_file '/path/to/imagenet_metadata.txt' 17 | --bounding_box_file '/path/to/imagenet_2012_bounding_boxes.csv' 18 | ``` 19 | 20 | ### Training 21 | 22 | There are many different flags that can be used to customize the training of TNet and the BagNet-77 baseline.
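All flags are defined with argparse in `train.py` and `train_bl.py`; assuming the same argparse setup used by the data preparation scripts in this repository, the complete list, along with default values, can be printed with:

```
python train.py --help
python train_bl.py --help
```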
An example command for training TNet is the following: 23 | 24 | ``` 25 | python train.py --to_train 26 | --batch_size 64 27 | --num_epochs 200 28 | --initial_lr 0.0001 29 | --lr_scedule_1step 30 | --keep_prob 0.5 31 | --loc_per_grid 3.0 32 | --reinfornce_reg_w 0.1 33 | --perFReg_ce_weight 0.3 34 | --perFReg_reinf_weight 0.3 35 | --overlap 0.34375 36 | --num_patches_y 5 37 | --num_patches_x 5 38 | --base_res_y 77 39 | --base_res_x 77 40 | --num_res_levels 2 41 | --num_do_layers 1 42 | --descr_tag 'BagNet_77_TNet' 43 | --save_tag 'TNet_imagenet' 44 | --num_gpus 2 45 | --data_dir '/path/to/TFRecords/dir/' 46 | --ckpt_dir '/path/to/ckpts/dir/' 47 | --summaries_dir '/path/to/summaries/dir/' 48 | --keep_weights_summary 49 | ``` 50 | 51 | An example command for training the BagNet-77 baseline is the following: 52 | 53 | ``` 54 | python train_bl.py --to_train 55 | --batch_size 64 56 | --num_epochs 200 57 | --initial_lr 0.0001 58 | --lr_scedule_1step 59 | --keep_prob 0.375 60 | --num_do_layers 1 61 | --descr_tag 'BagNet_77' 62 | --save_tag 'BagNet_77_imagenet' 63 | --num_gpus 2 64 | --data_dir '/path/to/TFRecords/dir/' 65 | --ckpt_dir '/path/to/ckpts/dir/' 66 | --summaries_dir '/path/to/summaries/dir/' 67 | --keep_weights_summary 68 | ``` 69 | 70 | Commands to replicate the training of the networks presented in the paper can be found in `results_replication.txt`. 71 | 72 | The weights of the TNet model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1JmKOP6aN2tYUsD4-zWMfRCbXlId6gcko&export=download).
73 | The weights of the BagNet-77 baseline reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1jB3ouvcVhxYnTrIlagUawBkVqhL8Wfro&export=download). 74 | 75 | ### Evaluation 76 | 77 | A trained TNet model can be evaluated on the training and validation sets by using a command similar to the following example: 78 | 79 | ``` 80 | python train.py --to_evaluate_train 81 | --to_evaluate_val 82 | --batch_size 64 83 | --loc_per_grid 3.0 84 | --overlap 0.34375 85 | --num_patches_y 5 86 | --num_patches_x 5 87 | --base_res_y 77 88 | --base_res_x 77 89 | --num_res_levels 2 90 | --descr_tag 'BagNet_77_TNet' 91 | --save_tag 'BagNet_77_imagenet' 92 | --num_gpus 1 93 | --data_dir '/path/to/TFRecords/dir/' 94 | --ckpt_dir '/path/to/ckpts/dir/' 95 | --summaries_dir '/path/to/summaries/dir/' 96 | --restore_dir '/path/to/dir/with/ckpt/to/restore/' 97 | ``` 98 | 99 | An example command for evaluating a trained BagNet-77 baseline network is the following: 100 | 101 | ``` 102 | python train_bl.py --to_evaluate_train 103 | --to_evaluate_val 104 | --batch_size 64 105 | --descr_tag 'BagNet_77' 106 | --save_tag 'BagNet_77_imagenet' 107 | --num_gpus 1 108 | --data_dir '/path/to/TFRecords/dir/' 109 | --ckpt_dir '/path/to/ckpts/dir/' 110 | --summaries_dir '/path/to/summaries/dir/' 111 | --restore_dir '/path/to/dir/with/ckpt/to/restore/' 112 | ``` 113 | 114 | Commands to evaluate the networks presented in the paper can be found in `results_replication.txt`. 115 | 116 | ## Functional Map of the World (fMoW) 117 | 118 | All related files can be found under the `/fMoW/` folder. 119 | 120 | ### Data preparation 121 | 122 | Details about how to download the raw data are provided in `create_TFRecords_fMoW.py`. As explained there, test set data should be manually matched to ground truth labels. This can be done with the following command: 123 | 124 | ``` 125 | python match_test_gt.py --root_test_dir '/path/to/original/test/data/root/dir/' 126 | --test_output_dir '/path/to/output/dir/' 127 | --match_gt_json_path '/path/to/test_gt_mapping.json' 128 | ``` 129 | 130 | Once the training, validation, and test set directories are organized uniformly, the following command can be used to convert raw data to TFRecords: 131 | 132 | ``` 133 | python create_TFRecords_fMoW.py --train_directory '/path/to/training/set/dir/' 134 | --validation_directory '/path/to/validation/set/dir/' 135 | --test_directory '/path/to/test/set/dir/' 136 | --output_directory '/path/to/output/dir/' 137 | ``` 138 | 139 | In order to crop images according to the provided bounding boxes, the following command can be used: 140 | 141 | ``` 142 | python crop_fMoW.py --train_directory '/path/to/training/set/dir/' 143 | --validation_directory '/path/to/validation/set/dir/' 144 | --test_directory '/path/to/test/set/dir/' 145 | --output_directory '/path/to/output/dir/' 146 | ``` 147 | 148 | TFRecords for cropped images can be created with the following command: 149 | 150 | ``` 151 | python create_TFRecords_fMoW.py --cropped_data 152 | --train_directory '/path/to/training/set/dir/' 153 | --validation_directory '/path/to/validation/set/dir/' 154 | --test_directory '/path/to/test/set/dir/' 155 | --output_directory '/path/to/output/dir/' 156 | --maximum_min_dim 224 157 | ``` 158 | 159 | ### Training and evaluation 160 | 161 | Training and evaluation commands are similar to the ones provided for ImageNet.
The commands used to train and evaluate the networks presented in the paper can be found in `results_replication.txt`. 162 | 163 | The weights of the TNet model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=13d9qE1DOwm93pVrCUEPtBZZ3sQWZycXa&export=download).
164 | The weights of the EfficientNet-B0 model trained on cropped images can be downloaded [here](https://drive.google.com/u/1/uc?id=1BsY-3EphqSviOx_OMS_x0gEWL2tUpRn5&export=download).
165 | The weights of the EfficientNet-B0 model trained on images of size 224x224 px can be downloaded [here](https://drive.google.com/u/1/uc?id=15weVtPPnXv-H6wilP820TDAf_zNe3WQj&export=download).
166 | The weights of the EfficientNet-B0 model trained on images of size 448x448 px can be downloaded [here](https://drive.google.com/u/1/uc?id=1ZiJMOsNU4LLMdzenO3ITUI9HyhvLhlF7&export=download).
167 | The weights of the EfficientNet-B0 model trained on images of size 896x896 px can be downloaded [here](https://drive.google.com/u/1/uc?id=1l9n1EvQ1FkGu1U-C0IaQzVeGjCeP_t5D&export=download). 168 | 169 | ## CUB-200-2011 170 | 171 | All related files can be found under the `/CUB/` folder. 172 | 173 | ### Data preparation 174 | 175 | The link to download raw data is provided in `create_tfrecords_cub.py`. Before the creation of TFRecords, data can be split into training and validation sets with the following command (a csv file is created for each split): 176 | 177 | ``` 178 | python create_csv_cub.py --imgs_list_txt '/path/to/images.txt' 179 | --split_list_txt '/path/to/train_test_split.txt' 180 | --save_dir '/path/to/output/dir/' 181 | ``` 182 | 183 | Once the csv files for each data split are created, the following command can be used to convert raw data to TFRecords: 184 | 185 | ``` 186 | python create_tfrecords_cub.py --img_dir '/path/to/images/dir/' 187 | --train_csv_path '/path/to/train_anno.csv' 188 | --dev_csv_path '/path/to/validation_anno.csv' 189 | --output_dir '/path/to/output/dir/' 190 | ``` 191 | 192 | ### Training and evaluation 193 | 194 | Training and evaluation commands are similar to the ones provided for ImageNet. As noted in the paper, the pre-trained EfficientNet weights used for fine-tuning can be downloaded here. They correspond to the weights of models trained with NoisyStudent and RandAugment, with the extra JFT-300M unlabeled data. The `/restore_dicts/` folder provides dictionaries that are used to load the pre-trained weights into TNet and the baselines (a minimal inspection sketch follows the weights list below). The commands used to train and evaluate the networks presented in the paper can be found in `results_replication.txt`. 195 | 196 | The weights of the TNet-B0 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1o8idpT73OCMba57oFTTn5SxSGOfZpqBG&export=download).
197 | The weights of the TNet-B1 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1oEGcgjMCbBWi5JrM5HEkfb4F4iI7CaXX&export=download).
198 | The weights of the TNet-B2 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1iADseI0T3s-_P5-KNeU31k8t9S1Bb4Df&export=download).
199 | The weights of the TNet-B3 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1YA7zfOw78hl7AwRaoSeo51mqpz1-vke2&export=download).
200 | The weights of the TNet-B4 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1MEHkmT3br2DpPK8Im_En9oQp3SP5cURJ&export=download).
201 | The weights of the EfficientNet-B0 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=15ZHqMZib058qhILYi8a04iM-YPWod_6_&export=download).
202 | The weights of the EfficientNet-B1 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1iT_QJregdanQjaKA1QjCdqLmEBBLdrsP&export=download).
203 | The weights of the EfficientNet-B2 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1_bbScPS4nGt0aBij9YIN0snQsLS2jC3g&export=download).
204 | The weights of the EfficientNet-B3 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1l6QeqGVrYc1k1r4n6fnNyInZ0vzq19iw&export=download).
205 | The weights of the EfficientNet-B4 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1CFcS8eFy63SSRpB0uB9CHkEVDns-umBc&export=download).
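For a quick sanity check before fine-tuning, the following is a minimal sketch of how one of the `/restore_dicts/` dictionaries could be inspected. It assumes the `.p` files are plain Python pickles; the exact key/value layout is an assumption and should be verified against `train.py`:

```
import pickle

# Hypothetical inspection of a restore dictionary. The .p file is assumed
# to be a plain pickle; the key/value layout below is an assumption.
with open('restore_dicts/efficientnet-b0.p', 'rb') as f:
    restore_dict = pickle.load(f)

print(type(restore_dict), len(restore_dict))
# Print a few entries to see how pre-trained variables are mapped.
for key, value in list(restore_dict.items())[:5]:
    print(key, '->', value)
```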
206 | 207 | ## NABirds 208 | 209 | All related files can be found under the `/NABirds/` folder. 210 | 211 | ### Data preparation 212 | 213 | Details about the NABirds data are provided in `create_tfrecords_nab.py`. The following command can be used to convert raw data to TFRecords (the shard naming the input pipeline expects is sketched after the weights list below): 214 | 215 | ``` 216 | python create_tfrecords_nab.py --root_directory '/path/to/dir/with/all/downloaded/data/' 217 | --data_directory '/path/to/images/dir/' 218 | --output_directory '/path/to/output/dir/' 219 | ``` 220 | 221 | ### Training and evaluation 222 | 223 | Training and evaluation commands are similar to the ones provided for ImageNet. As noted in the paper, the pre-trained EfficientNet weights used for fine-tuning can be downloaded here. They correspond to the weights of models trained with NoisyStudent and RandAugment, with the extra JFT-300M unlabeled data. The `/restore_dicts/` folder provides dictionaries that are used to load the pre-trained weights into TNet and the baselines. The commands used to train and evaluate the networks presented in the paper can be found in `results_replication.txt`. 224 | 225 | The weights of the TNet-B0 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1FsHt1duv-3cIWPuoPToYdk-k6E65huwH&export=download).
226 | The weights of the TNet-B1 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=15p0O25f_ysGjd2F51uQEt42T0E5fNRtE&export=download).
227 | The weights of the TNet-B2 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1tI7eGNUXyEPiC0LIlSHaM9gGPHm3CS9d&export=download).
228 | The weights of the TNet-B3 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1QtUv-WKBqtYzKQUNhUspWQmQ6NgVO1pg&export=download).
229 | The weights of the TNet-B4 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1gAw7bfjG3LsVmZl3XbT5cVN7vZcUcy2H&export=download).
230 | The weights of the EfficientNet-B0 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1cbSie1e7BhlyNDdWQXLlJrgdbudmxUJl&export=download).
231 | The weights of the EfficientNet-B1 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1oLooePT4tyXXBgcJ-Cf_XnWkxjIRUXV2&export=download).
232 | The weights of the EfficientNet-B2 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1rg1uD1ISM-8-anZkAgqebwrisGfpC5NT&export=download).
233 | The weights of the EfficientNet-B3 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1Dxi8BhURHVonguIWQNYt-WU1SuLLr78e&export=download).
234 | The weights of the EfficientNet-B4 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1qqfbL-nfAftIByQUVUZNm5bYGRsoiX4S&export=download).
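As a reference for the data preparation step above, the sketch below reconstructs the TFRecord shard names that the input pipeline expects; the shard counts mirror the `TRAIN_SHARDS_NUM` and `VAL_SHARDS_NUM` constants in `input_nab.py`, and `data_dir` is a placeholder path:

```
import os

# Shard counts mirror TRAIN_SHARDS_NUM / VAL_SHARDS_NUM in input_nab.py.
TRAIN_SHARDS_NUM = 16
VAL_SHARDS_NUM = 16
data_dir = '/path/to/TFRecords/dir/'

# get_filenames() in input_nab.py joins data_dir with this exact pattern.
train_files = [os.path.join(data_dir, 'train-%04d-of-%04d' % (i + 1, TRAIN_SHARDS_NUM))
               for i in range(TRAIN_SHARDS_NUM)]
val_files = [os.path.join(data_dir, 'validation-%04d-of-%04d' % (i + 1, VAL_SHARDS_NUM))
             for i in range(VAL_SHARDS_NUM)]

print(train_files[0])  # /path/to/TFRecords/dir/train-0001-of-0016
```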
235 | 236 | -------------------------------------------------------------------------------- /NABirds/results_replication.txt: -------------------------------------------------------------------------------- 1 | --- Training 2 | 3 | - TNet 4 | 5 | The following command can be used to replicate the training of the TNet-B0 model: 6 | 7 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB0_origWD' --save_tag 'TNet-B0_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b0.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 8 | 9 | The following command can be used to replicate the training of the TNet-B1 model: 10 | 11 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB1_origWD' --save_tag 'TNet-B1_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b1.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 12 | 13 | The following command can be used to replicate the training of the TNet-B2 model: 14 | 15 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB2_origWD' --save_tag 'TNet-B2_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir 
'/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b2.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 16 | 17 | The following command can be used to replicate the training of the TNet-B3 model: 18 | 19 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 3.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB3_origWD' --save_tag 'TNet-B3_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b3.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 20 | 21 | The following command can be used to replicate the training of the TNet-B4 model: 22 | 23 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 3.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB4_origWD' --save_tag 'TNet-B4_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b4.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 24 | 25 | 26 | 27 | - Baselines 28 | 29 | The following command can be used to replicate the training of the EfficientNet-B0 model: 30 | 31 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB0_origWD' --save_tag 'EN-B0_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' 
--keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b0.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 32 | 33 | The following command can be used to replicate the training of the EfficientNet-B1 model: 34 | 35 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB1_origWD' --save_tag 'EN-B1_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b1.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 36 | 37 | The following command can be used to replicate the training of the EfficientNet-B2 model: 38 | 39 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB2_origWD' --save_tag 'EN-B2_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b2.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 40 | 41 | The following command can be used to replicate the training of the EfficientNet-B3 model: 42 | 43 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB3_origWD' --save_tag 'EN-B3_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b3.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 44 | 45 | The following command can be used to replicate the training of the EfficientNet-B4 model: 46 | 47 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 
--img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB4_origWD' --save_tag 'EN-B4_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b4.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 48 | 49 | 50 | 51 | 52 | 53 | --- Evaluation 54 | 55 | - TNet 56 | 57 | The following command can be used to evaluate a trained TNet-B0 model on the validation set of NABirds: 58 | 59 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB0_origWD' --save_tag 'TNet-B0_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 60 | 61 | The following command can be used to evaluate a trained TNet-B1 model on the validation set of NABirds: 62 | 63 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB1_origWD' --save_tag 'TNet-B1_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 64 | 65 | The following command can be used to evaluate a trained TNet-B2 model on the validation set of NABirds: 66 | 67 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB2_origWD' --save_tag 'TNet-B2_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 68 | 69 | The following command can be used to evaluate a trained TNet-B3 model on the validation set of NABirds: 70 | 71 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB3_origWD' --save_tag 'TNet-B3_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 72 | 73 | The following command can be used to evaluate a trained TNet-B4
model on the validation set of NABirds: 74 | 75 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB4_origWD' --save_tag 'TNet-B4_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 76 | 77 | The following flags can be added to the previous evaluation commands in order to time the inference of TNet: 78 | 79 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 10 80 | 81 | The following flags can be added to the previous evaluation commands for advanced evaluation of TNet: 82 | 83 | --adv_eval_data --batches_to_time_range 0 -1 --eval_epochs_num 1 84 | 85 | Advanced evaluation corresponds to the creation of an excel file with information about the attended locations, the attendance probabilities of all candidate locations, and the weights estimated by the feature weighting module. 86 | 87 | 88 | 89 | - Baselines 90 | 91 | The following command can be used to evaluate a trained EfficientNet-B0 model on the validation set of NABirds: 92 | 93 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB0_origWD' --save_tag 'EN-B0_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 94 | 95 | The following command can be used to evaluate a trained EfficientNet-B1 model on the validation set of NABirds: 96 | 97 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB1_origWD' --save_tag 'EN-B1_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 98 | 99 | The following command can be used to evaluate a trained EfficientNet-B2 model on the validation set of NABirds: 100 | 101 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB2_origWD' --save_tag 'EN-B2_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 102 | 103 | The following command can be used to evaluate a trained EfficientNet-B3 model on the validation set of NABirds: 104 | 105 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB3_origWD' --save_tag 'EN-B3_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 106 | 107 | The following command can be used to evaluate a trained EfficientNet-B4 model on the validation set of NABirds: 108 | 109 | python train_bl.py --to_evaluate_val --batch_norm --batch_size
64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB4_origWD' --save_tag 'EN-B4_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 110 | 111 | The following flags can be added to the previous evaluation commands in order to time the inference of the baselines: 112 | 113 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 10 114 | 115 | 116 | -------------------------------------------------------------------------------- /CUB/results_replication.txt: -------------------------------------------------------------------------------- 1 | --- Training 2 | 3 | - TNet 4 | 5 | The following command can be used to replicate the training of the TNet-B0 model: 6 | 7 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 200 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB0_origWD' --save_tag 'TNet-B0_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b0.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 8 | 9 | The following command can be used to replicate the training of the TNet-B1 model: 10 | 11 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 200 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB1_origWD' --save_tag 'TNet-B1_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b1.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 12 | 13 | The following command can be used to replicate the training of the TNet-B2 model: 14 | 15 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 200 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step 
--lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB2_origWD' --save_tag 'TNet-B2_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b2.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 16 | 17 | The following command can be used to replicate the training of the TNet-B3 model: 18 | 19 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 200 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB3_origWD' --save_tag 'TNet-B3_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b3.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 20 | 21 | The following command can be used to replicate the training of the TNet-B4 model: 22 | 23 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 125 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB4_origWD' --save_tag 'TNet-B4_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b4.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 24 | 25 | 26 | 27 | - Baselines 28 | 29 | The following command can be used to replicate the training of the EfficientNet-B0 model: 30 | 31 | 
python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB0_origWD' --save_tag 'EN-B0_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b0.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 32 | 33 | The following command can be used to replicate the training of the EfficientNet-B1 model: 34 | 35 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB1_origWD' --save_tag 'EN-B1_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b1.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 36 | 37 | The following command can be used to replicate the training of the EfficientNet-B2 model: 38 | 39 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB2_origWD' --save_tag 'EN-B2_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b2.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 40 | 41 | The following command can be used to replicate the training of the EfficientNet-B3 model: 42 | 43 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB3_origWD' --save_tag 'EN-B3_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b3.p' --vars_to_exclude 'logits_layer' 
--two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 44 | 45 | The following command can be used to replicate the training of the EfficientNet-B4 model: 46 | 47 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB4_origWD' --save_tag 'EN-B4_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b4.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 48 | 49 | 50 | 51 | 52 | 53 | --- Evaluation 54 | 55 | - TNet 56 | 57 | The following command can be used to evaluate a trained TNet-B0 model on the validation set of CUB-200-2011: 58 | 59 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB0_origWD' --save_tag 'TNet-B0_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 60 | 61 | The following command can be used to evaluate a trained TNet-B1 model on the validation set of CUB-200-2011: 62 | 63 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB1_origWD' --save_tag 'TNet-B1_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 64 | 65 | The following command can be used to evaluate a trained TNet-B2 model on the validation set of CUB-200-2011: 66 | 67 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB2_origWD' --save_tag 'TNet-B2_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 68 | 69 | The following command can be used to evaluate a trained TNet-B3 model on the validation set of CUB-200-2011: 70 | 71 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 
--pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB3_origWD' --save_tag 'TNet-B3_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 72 | 73 | The following command can be used to evaluate a trained TNet-B4 model on the validation set of CUB-200-2011: 74 | 75 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB4_origWD' --save_tag 'TNet-B4_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 76 | 77 | The following flags can be added to the previous evaluation commands in order to time the inference of TNet: 78 | 79 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 10 80 | 81 | The following flags can be added to the previous evaluation commands for advanced evaluation of TNet: 82 | 83 | --adv_eval_data --batches_to_time_range 0 -1 --eval_epochs_num 1 84 | 85 | Advanced evaluation corresponds to the creation of an excel file with information about the attended locations, the attendance probabilities of all candidate locations, and the weights estimated by the feature weighting module. 86 | 87 | 88 | 89 | - Baselines 90 | 91 | The following command can be used to evaluate a trained EfficientNet-B0 model on the validation set of CUB-200-2011: 92 | 93 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB0_origWD' --save_tag 'EN-B0_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 94 | 95 | The following command can be used to evaluate a trained EfficientNet-B1 model on the validation set of CUB-200-2011: 96 | 97 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB1_origWD' --save_tag 'EN-B1_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 98 | 99 | The following command can be used to evaluate a trained EfficientNet-B2 model on the validation set of CUB-200-2011: 100 | 101 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB2_origWD' --save_tag 'EN-B2_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 102 | 103 | The following command can be used to evaluate a trained EfficientNet-B3 model on the validation set of CUB-200-2011: 104 | 105 | python train_bl.py --to_evaluate_val --batch_norm 
--batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB3_origWD' --save_tag 'EN-B3_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 106 | 107 | The following command can be used to evaluate a trained EfficientNet-B4 model on the validation set of CUB-200-2011: 108 | 109 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB4_origWD' --save_tag 'EN-B4_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 110 | 111 | The following flags can be added to the previous evaluation commands in order to time the inference of the baselines: 112 | 113 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 10 114 | 115 | 116 | -------------------------------------------------------------------------------- /NABirds/input_nab.py: -------------------------------------------------------------------------------- 1 | """Prepare input batches. 2 | """ 3 | 4 | from __future__ import absolute_import, division, print_function 5 | 6 | import os 7 | 8 | from absl import logging 9 | import numpy as np 10 | import tensorflow as tf 11 | import tensorflow_addons as tfa 12 | from tensorflow.python.ops import control_flow_ops 13 | 14 | 15 | 16 | _SHUFFLE_BUFFER = 10000 17 | NUM_CHANNELS = 3 18 | TRAIN_SHARDS_NUM = 16 19 | VAL_SHARDS_NUM = 16 20 | 21 | def get_filenames(dataset_type, data_dir): 22 | """Return filenames for dataset. 23 | Args: 24 | dataset_type: string; type of dataset. 25 | data_dir: string; directory containing the input data. 26 | Returns: 27 | data_filemames: list of strings; it contains paths to TFRecords. 28 | """ 29 | 30 | # Data are assumed to be stored in TFRecords 31 | if (dataset_type == 'train'): 32 | data_filemames = [os.path.join(data_dir, 'train-%04d-of-%04d' % (i+1, TRAIN_SHARDS_NUM)) for i in range(TRAIN_SHARDS_NUM)] 33 | elif (dataset_type == 'validation'): 34 | data_filemames = [os.path.join(data_dir, 'validation-%04d-of-%04d' % (i+1, VAL_SHARDS_NUM)) for i in range(VAL_SHARDS_NUM)] 35 | 36 | return data_filemames 37 | 38 | def parse_example_proto(example_serialized, adv_eval_data=False): 39 | """Parse an Example proto that corresponds to an image. 40 | Args: 41 | example_serialized: string; serialized Example protocol buffer. 42 | adv_eval_data: boolean; whether to include information for advanced 43 | evaluation in the input batches. 44 | Returns: 45 | to_batch: tuple; it contains the following entries: 46 | encoded_img: string; encoded JPEG file. 47 | label: int; numeric image label. 48 | img_filename (optional): string; the filename of an image. 49 | img_label_text (optional): string; the human-readable label of an image. 
50 | """ 51 | 52 | # Extract dense features in Example proto 53 | feature_map = { 54 | 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 55 | 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1), 56 | 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 57 | 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, default_value='') 58 | } 59 | 60 | features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) 61 | encoded_img = features['image/encoded'] 62 | label = tf.cast(features['image/class/label'], dtype=tf.int32) 63 | label = tf.cast(tf.reshape(label, shape=[1]), dtype=tf.float32) 64 | 65 | if (not adv_eval_data): 66 | to_batch = (encoded_img, label) 67 | else: 68 | img_filename = features['image/filename'] 69 | img_label_text = features['image/class/text'] 70 | to_batch = (encoded_img, label, img_filename, img_label_text) 71 | 72 | return to_batch 73 | 74 | def apply_with_random_selector(x, func, cases): 75 | """Compute func(x, cases[sel]), with sel sampled from cases. 76 | Args: 77 | x: Tensor; input Tensor to process. 78 | func: function; python function to apply. 79 | num_cases: list; cases to sample from. 80 | Returns: 81 | The result of func(x, cases[sel]), sel is sampled dynamically. 82 | """ 83 | 84 | sel = tf.random.uniform([], maxval=len(cases), dtype=tf.int32) 85 | # Pass the input only to one of the func calls 86 | return control_flow_ops.merge([ 87 | func(control_flow_ops.switch(x, tf.equal(sel, i))[1], cases[i]) 88 | for i in range(len(cases))])[0] 89 | 90 | def distort_image(image_buffer, output_height, output_width, num_channels, bbox): 91 | """Distort an image for data augmentation. 92 | Args: 93 | image_buffer: string; raw JPEG image buffer. 94 | output_height: int; height of the image after preprocessing. 95 | output_width: int; width of the image after preprocessing. 96 | num_channels: int; depth of the image buffer for decoding. 97 | bbox: 3-D float Tensor; it contains the bounding boxes related to 98 | an image. Bounding box coordinates are in range [0, 1], 99 | arranged in order [ymin, xmin, ymax, xmax]. The Tensor is of 100 | shape [1, num_boxes, 4], where num_boxes is the number of 101 | bounding boxes related to the image. 102 | Returns: 103 | distorted_image: 3-D float Tensor; it contains an image. It is of 104 | size [H, W, C], where H is the image height, W is the image 105 | width, and C is the number of channels. 106 | """ 107 | 108 | # Create a bounding box by distorting an existing one (if it is provided). 109 | # The new bounding box should respect specific constraints, e.g., be within 110 | # a range of aspect ratios. If no bounding box is provided, the entire 111 | # image is considered the initial bounding box to be distorted. 
112 | sampled_distorted_bounding_box = tf.image.sample_distorted_bounding_box( 113 | tf.io.extract_jpeg_shape(image_buffer), 114 | bounding_boxes=bbox, 115 | min_object_covered=0.1, 116 | aspect_ratio_range=[0.5, 2.0], 117 | area_range=[0.85, 1.0], 118 | max_attempts=50, 119 | use_image_if_no_bounding_boxes=True, 120 | seed=0) 121 | bbox_begin, bbox_size, _ = sampled_distorted_bounding_box 122 | 123 | # Reassemble and crop the bounding box 124 | offset_y, offset_x, _ = tf.unstack(bbox_begin) 125 | target_height, target_width, _ = tf.unstack(bbox_size) 126 | crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) 127 | distorted_image = tf.image.decode_and_crop_jpeg(image_buffer, crop_window, channels=num_channels) 128 | distorted_image = tf.image.convert_image_dtype(distorted_image, dtype=tf.float32) 129 | 130 | # Resize the image. Select a resize method randomly. The image aspect ratio may change. 131 | resize_methods = [tf.image.ResizeMethod.BILINEAR, 132 | tf.image.ResizeMethod.LANCZOS3, 133 | tf.image.ResizeMethod.LANCZOS5, 134 | tf.image.ResizeMethod.BICUBIC, 135 | tf.image.ResizeMethod.GAUSSIAN, 136 | tf.image.ResizeMethod.NEAREST_NEIGHBOR, 137 | tf.image.ResizeMethod.AREA, 138 | tf.image.ResizeMethod.MITCHELLCUBIC] 139 | # Resize the gated branch input x (not the closed-over tensor), so only the selected branch is live. 140 | distorted_image = apply_with_random_selector(distorted_image, 141 | lambda x, resize_method: tf.image.resize(x, 142 | [output_height, output_width], 143 | method=resize_method, antialias=False), 144 | cases=resize_methods) 145 | 146 | # Restore image shape 147 | distorted_image.set_shape([output_height, output_width, num_channels]) 148 | 149 | # Perform a random horizontal flip of the image 150 | distorted_image = tf.image.random_flip_left_right(distorted_image) 151 | 152 | # Perform a random translation of the image 153 | distorted_image = tf.expand_dims(distorted_image, 0) 154 | s = 0.1 155 | vy = s * tf.cast(tf.shape(distorted_image)[1], tf.float32) 156 | vx = s * tf.cast(tf.shape(distorted_image)[2], tf.float32) 157 | dy = tf.random.uniform(shape=[tf.shape(distorted_image)[0], 1], minval=-vy, maxval=vy) 158 | dx = tf.random.uniform(shape=[tf.shape(distorted_image)[0], 1], minval=-vx, maxval=vx) 159 | d = tf.concat([dx, dy], axis=-1) 160 | distorted_image = tfa.image.translate(distorted_image, translations=d) 161 | 162 | # Perform a random rotation of the image 163 | r_limit = 20.0 * np.pi / 180.0 164 | r = tf.random.uniform(shape=[tf.shape(distorted_image)[0]], minval=-r_limit, maxval=r_limit) 165 | distorted_image = tfa.image.rotate(distorted_image, angles=r) 166 | 167 | distorted_image = tf.squeeze(distorted_image) 168 | 169 | return distorted_image 170 | 171 | def preprocess_image(image_buffer, bbox, output_height, output_width, 172 | num_channels, dataset_type, is_training): 173 | """Preprocess an image. 174 | Args: 175 | image_buffer: string; encoded JPEG file. 176 | bbox: 3-D float Tensor; it contains the bounding boxes related to an 177 | image. Bounding box coordinates are in range [0, 1], arranged in 178 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape 179 | [1, num_boxes, 4], where num_boxes is the number of bounding 180 | boxes related to the image. 181 | output_height: int; height of the image after preprocessing. 182 | output_width: int; width of the image after preprocessing. 183 | num_channels: int; depth of the image buffer for decoding. 184 | dataset_type: string; type of dataset. 185 | is_training: boolean; whether the input will be used for training.
185 | Returns: 186 | image: 3-D float Tensor; it contains an image. It is of 187 | size [H, W, C], where H is the image height, W is 188 | the image width, and C is the number of channels. 189 | """ 190 | 191 | if ((dataset_type == 'train') and (is_training)): 192 | # For training data during training, apply random distortions for data augmentation 193 | image = distort_image(image_buffer, output_height, output_width, num_channels, bbox) 194 | else: 195 | # Decode and resize the input image 196 | image = tf.image.decode_jpeg(image_buffer, channels=num_channels) 197 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 198 | image = tf.expand_dims(image, 0) 199 | image = tf.image.resize(image, [output_height, output_width], method=tf.image.ResizeMethod.BILINEAR, antialias=False) 200 | image = tf.squeeze(image, [0]) 201 | 202 | # Transform image values from range [0, 1], to [-1, 1] 203 | image = tf.subtract(image, 0.5) 204 | image = tf.multiply(image, 2.0) 205 | 206 | return image 207 | 208 | def parse_record(raw_record, dataset_type, is_training, 209 | img_size_y, img_size_x, dtype, adv_eval_data): 210 | """Parse a record containing a training example that corresponds to an image. 211 | Args: 212 | raw_record: string; serialized Example protocol buffer. 213 | dataset_type: string; type of dataset. 214 | is_training: boolean; whether the input will be used for training. 215 | img_size_y: int; image height in pixels. 216 | img_size_x: int; image width in pixels. 217 | dtype: string; data type to use for images/features. 218 | adv_eval_data: boolean; whether to include information for advanced 219 | evaluation in the input batches. 220 | Returns: 221 | batch: tuple; it contains the following entries: 222 | image: 3-D float Tensor; it contains an image. It is of 223 | size [H, W, C], where H is the image height, W is 224 | the image width, and C is the number of channels. 225 | label: int; numeric image label. 226 | img_filename (optional): string; the filename of an image. 227 | img_label_text (optional): string; the human-readable label of an image. 228 | """ 229 | 230 | # Parse Example protocol buffer 231 | if (not adv_eval_data): 232 | image_buffer, label = parse_example_proto(raw_record, adv_eval_data) 233 | else: 234 | (image_buffer, label, 235 | img_filename, img_label_text) = parse_example_proto(raw_record, adv_eval_data) 236 | 237 | # Pre-process image 238 | bbox = tf.constant([[[0., 0., 1., 1.]]], dtype=tf.float32) 239 | image = preprocess_image(image_buffer=image_buffer, 240 | bbox=bbox, 241 | output_height=img_size_y, 242 | output_width=img_size_x, 243 | num_channels=NUM_CHANNELS, 244 | dataset_type=dataset_type, 245 | is_training=is_training) 246 | 247 | # Return batch 248 | if (not adv_eval_data): 249 | batch = (image, label) 250 | else: 251 | batch = (image, label, img_filename, img_label_text) 252 | 253 | return batch 254 | 255 | def process_record_dataset(dataset, 256 | dataset_type, 257 | is_training, 258 | batch_size, 259 | img_size_y, 260 | img_size_x, 261 | shuffle_buffer, 262 | parse_record_fn, 263 | num_epochs=-1, 264 | dtype=tf.float32, 265 | drop_remainder=False, 266 | adv_eval_data=False): 267 | """Create input dataset from raw records. 268 | Args: 269 | dataset: tf dataset; dataset with raw records. 270 | dataset_type: string; type of dataset. 271 | is_training: boolean; whether the input will be used for training. 272 | batch_size: int; number of samples per batch (global, not per replica). 273 | img_size_y: int; image height in pixels. 
274 | img_size_x: int; image width in pixels. 275 | shuffle_buffer: int; buffer size to use when shuffling records. A larger 276 | value results in higher randomness, but a smaller one reduces startup 277 | time and uses less memory. 278 | parse_record_fn: function; function that processes raw records. 279 | num_epochs: int; number of times to repeat the dataset. 280 | dtype: string; data type to use for images/features. 281 | drop_remainder: boolean; whether to drop the remainder of the 282 | batches. If True, the batch dimension will be static. 283 | adv_eval_data: boolean; whether to include information for advanced 284 | evaluation in the input batches. 285 | Returns: 286 | dataset: tf dataset; iterable input dataset. 287 | """ 288 | 289 | # Shuffle records before repeating, to respect epoch boundaries 290 | if (is_training): 291 | dataset = dataset.shuffle(buffer_size=shuffle_buffer) 292 | 293 | # Repeat dataset for the number of epochs to train 294 | if (num_epochs < 1): 295 | dataset = dataset.repeat() 296 | else: 297 | dataset = dataset.repeat(num_epochs) 298 | 299 | # Parse raw records 300 | dataset = dataset.map(lambda value: parse_record_fn(value, dataset_type, is_training, 301 | img_size_y, img_size_x, dtype, adv_eval_data), 302 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 303 | dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) 304 | 305 | # Operations between the final prefetch and the get_next call to the iterator 306 | # will happen synchronously during run time. Prefetch here again to 307 | # background all of the above processing work and keep it out of the 308 | # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE 309 | # allows DistributionStrategies to adjust how many batches to fetch based 310 | # on how many devices are present. 311 | dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 312 | 313 | return dataset 314 | 315 | def input_fn(dataset_type, 316 | is_training, 317 | data_dir, 318 | batch_size, 319 | img_size_y, 320 | img_size_x, 321 | num_epochs=-1, 322 | dtype=tf.float32, 323 | parse_record_fn=parse_record, 324 | drop_remainder=False, 325 | filenames=None, 326 | adv_eval_data=False): 327 | """Prepare input batches. 328 | Args: 329 | dataset_type: string; type of dataset. 330 | is_training: boolean; whether the input will be used for training. 331 | data_dir: string; directory containing the input data. 332 | batch_size: int; number of samples per batch (global, not per replica). 333 | img_size_y: int; image height in pixels. 334 | img_size_x: int; image width in pixels. 335 | num_epochs: int; number of times to repeat the dataset. 336 | dtype: string; data type to use for images/features. 337 | parse_record_fn: function; function that processes raw records. 338 | drop_remainder: boolean; indicates whether to drop the remainder of the 339 | batches. If True, the batch dimension will be static. 340 | filenames: list of strings; it contains paths to TFRecords. 341 | adv_eval_data: boolean; whether to include information for advanced 342 | evaluation in the input batches. 343 | Returns: 344 | input_dataset: tf dataset; iterable input dataset. 
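Example; a minimal sketch of a typical call (the directory path, batch
size, image sizes, and epoch count below are hypothetical placeholders):
    dataset = input_fn('train', True, '/path/to/TFRecords/', 64, 224, 224, num_epochs=200)
Note that with the default num_epochs=-1 the dataset repeats indefinitely,
so consumers should bound iteration themselves (e.g., dataset.take(n)).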
345 | """ 346 | 347 | # Get TFRecords paths 348 | if (filenames is None): 349 | filenames = get_filenames(dataset_type, data_dir) 350 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 351 | 352 | # Shuffle input files 353 | if (is_training): 354 | if (dataset_type == 'train'): 355 | dataset = dataset.shuffle(buffer_size=TRAIN_SHARDS_NUM) 356 | elif (dataset_type == 'validation'): 357 | dataset = dataset.shuffle(buffer_size=VAL_SHARDS_NUM) 358 | 359 | # Process input files concurrently 360 | dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE) 361 | 362 | # Process TFRecords 363 | input_dataset = process_record_dataset(dataset=dataset, 364 | dataset_type=dataset_type, 365 | is_training=is_training, 366 | batch_size=batch_size, 367 | img_size_y=img_size_y, 368 | img_size_x=img_size_x, 369 | shuffle_buffer=_SHUFFLE_BUFFER, 370 | parse_record_fn=parse_record_fn, 371 | num_epochs=num_epochs, 372 | dtype=dtype, 373 | drop_remainder=drop_remainder, 374 | adv_eval_data=adv_eval_data) 375 | 376 | return input_dataset 377 | -------------------------------------------------------------------------------- /CUB/input_cub.py: -------------------------------------------------------------------------------- 1 | """Prepare input batches. 2 | """ 3 | 4 | from __future__ import absolute_import, division, print_function 5 | 6 | import os 7 | import numpy as np 8 | import tensorflow as tf 9 | import tensorflow_addons as tfa 10 | from tensorflow.python.ops import control_flow_ops 11 | 12 | 13 | 14 | _SHUFFLE_BUFFER = 10000 15 | NUM_CHANNELS = 3 16 | TRAIN_SHARDS_NUM = 16 17 | VAL_SHARDS_NUM = 16 18 | 19 | def get_filenames(dataset_type, data_dir): 20 | """Return filenames for dataset. 21 | Args: 22 | dataset_type: string; type of dataset. 23 | data_dir: string; directory containing the input data. 24 | Returns: 25 | data_filemames: list of strings; it contains paths to TFRecords. 26 | """ 27 | 28 | # Data are assumed to be stored in TFRecords 29 | if (dataset_type == 'train'): 30 | data_filemames = [os.path.join(data_dir, 'train-%04d-of-%04d' % (i+1, TRAIN_SHARDS_NUM)) for i in range(TRAIN_SHARDS_NUM)] 31 | elif (dataset_type == 'validation'): 32 | data_filemames = [os.path.join(data_dir, 'validation-%04d-of-%04d' % (i+1, VAL_SHARDS_NUM)) for i in range(VAL_SHARDS_NUM)] 33 | 34 | return data_filemames 35 | 36 | def parse_example_proto(example_serialized, adv_eval_data=False): 37 | """Parse an Example proto that corresponds to an image. 38 | Args: 39 | example_serialized: string; serialized Example protocol buffer. 40 | adv_eval_data: boolean; whether to include information for advanced 41 | evaluation in the input batches. 42 | Returns: 43 | to_batch: tuple; it contains the following entries: 44 | encoded_img: string; encoded JPEG file. 45 | label: int; numeric image label. 46 | img_filename (optional): string; the filename of an image. 47 | img_label_text (optional): string; the human-readable label of an image. 
48 | """ 49 | 50 | # Extract dense features in Example proto 51 | feature_map = { 52 | 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 53 | 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1), 54 | 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 55 | 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, default_value='') 56 | } 57 | 58 | features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) 59 | encoded_img = features['image/encoded'] 60 | label = tf.cast(features['image/class/label'], dtype=tf.int32) 61 | 62 | if (not adv_eval_data): 63 | to_batch = (encoded_img, label) 64 | else: 65 | img_filename = features['image/filename'] 66 | img_label_text = features['image/class/text'] 67 | to_batch = (encoded_img, label, img_filename, img_label_text) 68 | 69 | return to_batch 70 | 71 | def apply_with_random_selector(x, func, cases): 72 | """Compute func(x, cases[sel]), with sel sampled from cases. 73 | Args: 74 | x: Tensor; input Tensor to process. 75 | func: function; python function to apply. 76 | num_cases: list; cases to sample from. 77 | Returns: 78 | The result of func(x, cases[sel]), sel is sampled dynamically. 79 | """ 80 | 81 | sel = tf.random.uniform([], maxval=len(cases), dtype=tf.int32) 82 | # Pass the input only to one of the func calls 83 | return control_flow_ops.merge([ 84 | func(control_flow_ops.switch(x, tf.equal(sel, i))[1], cases[i]) 85 | for i in range(len(cases))])[0] 86 | 87 | def distort_image(image_buffer, output_height, output_width, num_channels, bbox): 88 | """Distort an image for data augmentation. 89 | Args: 90 | image_buffer: string; raw JPEG image buffer. 91 | output_height: int; height of the image after preprocessing. 92 | output_width: int; width of the image after preprocessing. 93 | num_channels: int; depth of the image buffer for decoding. 94 | bbox: 3-D float Tensor; it contains the bounding boxes related to 95 | an image. Bounding box coordinates are in range [0, 1], 96 | arranged in order [ymin, xmin, ymax, xmax]. The Tensor is of 97 | shape [1, num_boxes, 4], where num_boxes is the number of 98 | bounding boxes related to the image. 99 | Returns: 100 | distorted_image: 3-D float Tensor; it contains an image. It is of 101 | size [H, W, C], where H is the image height, W is the image 102 | width, and C is the number of channels. 103 | """ 104 | 105 | # Create a bounding box by distorting an existing one (if it is provided). 106 | # The new bounding box should respect specific constraints, e.g., be within 107 | # a range of aspect ratios. If no bounding box is provided, the entire 108 | # image is considered the initial bounding box to be distorted. 
109 | sampled_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
110 | tf.io.extract_jpeg_shape(image_buffer),
111 | bounding_boxes=bbox,
112 | min_object_covered=0.1,
113 | aspect_ratio_range=[0.5, 2.0],
114 | area_range=[0.85, 1.0],
115 | max_attempts=50,
116 | use_image_if_no_bounding_boxes=True,
117 | seed=0)
118 | bbox_begin, bbox_size, _ = sampled_distorted_bounding_box
119 | 
120 | # Reassemble and crop the bounding box
121 | offset_y, offset_x, _ = tf.unstack(bbox_begin)
122 | target_height, target_width, _ = tf.unstack(bbox_size)
123 | crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
124 | distorted_image = tf.image.decode_and_crop_jpeg(image_buffer, crop_window, channels=num_channels)
125 | distorted_image = tf.image.convert_image_dtype(distorted_image, dtype=tf.float32)
126 | 
127 | # Resize the image. Select a resize method randomly. The image aspect ratio may change.
128 | resize_methods = [tf.image.ResizeMethod.BILINEAR,
129 | tf.image.ResizeMethod.LANCZOS3,
130 | tf.image.ResizeMethod.LANCZOS5,
131 | tf.image.ResizeMethod.BICUBIC,
132 | tf.image.ResizeMethod.GAUSSIAN,
133 | tf.image.ResizeMethod.NEAREST_NEIGHBOR,
134 | tf.image.ResizeMethod.AREA,
135 | tf.image.ResizeMethod.MITCHELLCUBIC]
136 | distorted_image = apply_with_random_selector(distorted_image,
137 | lambda x, resize_method: tf.image.resize(x,
138 | [output_height, output_width],
139 | method=resize_method, antialias=False),
140 | cases=resize_methods)
141 | 
142 | # Restore image shape
143 | distorted_image.set_shape([output_height, output_width, num_channels])
144 | 
145 | # Perform a random horizontal flip of the image
146 | distorted_image = tf.image.random_flip_left_right(distorted_image)
147 | 
148 | # Perform a random translation of the image
149 | distorted_image = tf.expand_dims(distorted_image, 0)
150 | s = 0.1
151 | vy = s * tf.cast(tf.shape(distorted_image)[1], tf.float32)
152 | vx = s * tf.cast(tf.shape(distorted_image)[2], tf.float32)
153 | dy = tf.random.uniform(shape=[tf.shape(distorted_image)[0], 1], minval=-vy, maxval=vy)
154 | dx = tf.random.uniform(shape=[tf.shape(distorted_image)[0], 1], minval=-vx, maxval=vx)
155 | d = tf.concat([dx, dy], axis=-1)
156 | distorted_image = tfa.image.translate(distorted_image, translations=d)
157 | 
158 | # Perform a random rotation of the image
159 | r_limit = 20.0 * np.pi / 180.0
160 | r = tf.random.uniform(shape=[tf.shape(distorted_image)[0]], minval=-r_limit, maxval=r_limit)
161 | distorted_image = tfa.image.rotate(distorted_image, angles=r)
162 | 
163 | distorted_image = tf.squeeze(distorted_image, [0])
164 | 
165 | return distorted_image
166 | 
167 | def preprocess_image(image_buffer, bbox, output_height, output_width,
168 | num_channels, dataset_type, is_training):
169 | """Preprocess an image.
170 | Args:
171 | image_buffer: string; encoded JPEG file.
172 | bbox: 3-D float Tensor; it contains the bounding boxes related to an
173 | image. Bounding box coordinates are in range [0, 1], arranged in
174 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape
175 | [1, num_boxes, 4], where num_boxes is the number of bounding
176 | boxes related to the image.
177 | output_height: int; height of the image after preprocessing.
178 | output_width: int; width of the image after preprocessing.
179 | num_channels: int; depth of the image buffer for decoding.
180 | dataset_type: string; type of dataset.
181 | is_training: boolean; whether the input will be used for training.
182 | Returns: 183 | image: 3-D float Tensor; it contains an image. It is of 184 | size [H, W, C], where H is the image height, W is 185 | the image width, and C is the number of channels. 186 | """ 187 | 188 | if ((dataset_type == 'train') and (is_training)): 189 | # For training data during training, apply random distortions for data augmentation 190 | image = distort_image(image_buffer, output_height, output_width, num_channels, bbox) 191 | else: 192 | # Decode and resize the input image 193 | image = tf.image.decode_jpeg(image_buffer, channels=num_channels) 194 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 195 | image = tf.expand_dims(image, 0) 196 | image = tf.image.resize(image, [output_height, output_width], method=tf.image.ResizeMethod.BILINEAR, antialias=False) 197 | image = tf.squeeze(image, [0]) 198 | 199 | # Transform image values from range [0, 1], to [-1, 1] 200 | image = tf.subtract(image, 0.5) 201 | image = tf.multiply(image, 2.0) 202 | 203 | return image 204 | 205 | def parse_record(raw_record, dataset_type, is_training, 206 | img_size_y, img_size_x, dtype, adv_eval_data): 207 | """Parse a record containing a training example that corresponds to an image. 208 | Args: 209 | raw_record: string; serialized Example protocol buffer. 210 | dataset_type: string; type of dataset. 211 | is_training: boolean; whether the input will be used for training. 212 | img_size_y: int; image height in pixels. 213 | img_size_x: int; image width in pixels. 214 | dtype: string; data type to use for images/features. 215 | adv_eval_data: boolean; whether to include information for advanced 216 | evaluation in the input batches. 217 | Returns: 218 | batch: tuple; it contains the following entries: 219 | image: 3-D float Tensor; it contains an image. It is of 220 | size [H, W, C], where H is the image height, W is 221 | the image width, and C is the number of channels. 222 | label: int; numeric image label. 223 | img_filename (optional): string; the filename of an image. 224 | img_label_text (optional): string; the human-readable label of an image. 225 | """ 226 | 227 | # Parse Example protocol buffer 228 | if (not adv_eval_data): 229 | image_buffer, label = parse_example_proto(raw_record, adv_eval_data) 230 | else: 231 | (image_buffer, label, 232 | img_filename, img_label_text) = parse_example_proto(raw_record, adv_eval_data) 233 | 234 | # Pre-process image 235 | bbox = tf.constant([[[0., 0., 1., 1.]]], dtype=tf.float32) 236 | image = preprocess_image(image_buffer=image_buffer, 237 | bbox=bbox, 238 | output_height=img_size_y, 239 | output_width=img_size_x, 240 | num_channels=NUM_CHANNELS, 241 | dataset_type=dataset_type, 242 | is_training=is_training) 243 | 244 | # Subtract 1 so that labels are in [0, 199] range 245 | label = tf.cast(tf.cast(tf.reshape(label, shape=[1]), dtype=tf.int32) - 1, dtype=tf.float32) 246 | 247 | # Return batch 248 | if (not adv_eval_data): 249 | batch = (image, label) 250 | else: 251 | batch = (image, label, img_filename, img_label_text) 252 | 253 | return batch 254 | 255 | def process_record_dataset(dataset, 256 | dataset_type, 257 | is_training, 258 | batch_size, 259 | img_size_y, 260 | img_size_x, 261 | shuffle_buffer, 262 | parse_record_fn, 263 | num_epochs=-1, 264 | dtype=tf.float32, 265 | drop_remainder=False, 266 | adv_eval_data=False): 267 | """Create input dataset from raw records. 268 | Args: 269 | dataset: tf dataset; dataset with raw records. 270 | dataset_type: string; type of dataset. 
271 | is_training: boolean; whether the input will be used for training. 272 | batch_size: int; number of samples per batch (global, not per replica). 273 | img_size_y: int; image height in pixels. 274 | img_size_x: int; image width in pixels. 275 | shuffle_buffer: int; buffer size to use when shuffling records. A larger 276 | value results in higher randomness, but a smaller one reduces startup 277 | time and uses less memory. 278 | parse_record_fn: function; function that processes raw records. 279 | num_epochs: int; number of times to repeat the dataset. 280 | dtype: string; data type to use for images/features. 281 | drop_remainder: boolean; whether to drop the remainder of the 282 | batches. If True, the batch dimension will be static. 283 | adv_eval_data: boolean; whether to include information for advanced 284 | evaluation in the input batches. 285 | Returns: 286 | dataset: tf dataset; iterable input dataset. 287 | """ 288 | 289 | # Shuffle records before repeating, to respect epoch boundaries 290 | if (is_training): 291 | dataset = dataset.shuffle(buffer_size=shuffle_buffer) 292 | 293 | # Repeat dataset for the number of epochs to train 294 | if (num_epochs < 1): 295 | dataset = dataset.repeat() 296 | else: 297 | dataset = dataset.repeat(num_epochs) 298 | 299 | # Parse raw records 300 | dataset = dataset.map(lambda value: parse_record_fn(value, dataset_type, is_training, 301 | img_size_y, img_size_x, dtype, adv_eval_data), 302 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 303 | dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) 304 | 305 | # Operations between the final prefetch and the get_next call to the iterator 306 | # will happen synchronously during run time. Prefetch here again to 307 | # background all of the above processing work and keep it out of the 308 | # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE 309 | # allows DistributionStrategies to adjust how many batches to fetch based 310 | # on how many devices are present. 311 | dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 312 | 313 | return dataset 314 | 315 | def input_fn(dataset_type, 316 | is_training, 317 | data_dir, 318 | batch_size, 319 | img_size_y, 320 | img_size_x, 321 | num_epochs=-1, 322 | dtype=tf.float32, 323 | parse_record_fn=parse_record, 324 | drop_remainder=False, 325 | filenames=None, 326 | adv_eval_data=False): 327 | """Prepare input batches. 328 | Args: 329 | dataset_type: string; type of dataset. 330 | is_training: boolean; whether the input will be used for training. 331 | data_dir: string; directory containing the input data. 332 | batch_size: int; number of samples per batch (global, not per replica). 333 | img_size_y: int; image height in pixels. 334 | img_size_x: int; image width in pixels. 335 | num_epochs: int; number of times to repeat the dataset. 336 | dtype: string; data type to use for images/features. 337 | parse_record_fn: function; function that processes raw records. 338 | drop_remainder: boolean; indicates whether to drop the remainder of the 339 | batches. If True, the batch dimension will be static. 340 | filenames: list of strings; it contains paths to TFRecords. 341 | adv_eval_data: boolean; whether to include information for advanced 342 | evaluation in the input batches. 343 | Returns: 344 | input_dataset: tf dataset; iterable input dataset. 
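Example; an illustrative evaluation-style call (the path, batch size, and
image sizes are hypothetical placeholders):
    ds = input_fn('validation', False, '/path/to/TFRecords/', 32, 448, 448)
    for images, labels in ds.take(1):
        pass  # images: [32, 448, 448, 3] floats in [-1, 1]; labels: [32, 1] in [0, 199]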
345 | """ 346 | 347 | # Get TFRecords paths 348 | if (filenames is None): 349 | filenames = get_filenames(dataset_type, data_dir) 350 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 351 | 352 | # Shuffle input files 353 | if (is_training): 354 | if (dataset_type == 'train'): 355 | dataset = dataset.shuffle(buffer_size=TRAIN_SHARDS_NUM) 356 | elif (dataset_type == 'validation'): 357 | dataset = dataset.shuffle(buffer_size=VAL_SHARDS_NUM) 358 | 359 | # Process input files concurrently 360 | dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE) 361 | 362 | # Process TFRecords 363 | input_dataset = process_record_dataset(dataset=dataset, 364 | dataset_type=dataset_type, 365 | is_training=is_training, 366 | batch_size=batch_size, 367 | img_size_y=img_size_y, 368 | img_size_x=img_size_x, 369 | shuffle_buffer=_SHUFFLE_BUFFER, 370 | parse_record_fn=parse_record_fn, 371 | num_epochs=num_epochs, 372 | dtype=dtype, 373 | drop_remainder=drop_remainder, 374 | adv_eval_data=adv_eval_data) 375 | 376 | return input_dataset 377 | -------------------------------------------------------------------------------- /NABirds/create_tfrecords_nab.py: -------------------------------------------------------------------------------- 1 | """Convert NABirds images to TFRecords. Raw data can be downloaded here 2 | https://dl.allaboutbirds.org/nabirds, and are assumed to reside in the following 3 | directory structure: 4 | images/0295/01f53d6bf5e449438d2bb79e0854bca4.jpg 5 | images/0296/069519c379574fb285d7bb920443ea89.jpg 6 | ... 7 | Metadata files that can be downloaded with the raw data, are utilized as well. 8 | In particular, the following files are used: images.txt, train_test_split.txt, 9 | sizes.txt, classes.txt, and image_class_labels.txt. 10 | 11 | images.txt contains the list of image file names, with each line corresponding to one image. 12 | The content of the file is expected to be as follows: 13 | 14 | where image_id is a numeric identifier for each image in the dataset, and image_name is the path 15 | to the corresponding image file. 16 | An example line is the following: 17 | 0000139e-21dc-4d0c-bfe1-4cae3c85c829 0817/0000139e21dc4d0cbfe14cae3c85c829.jpg 18 | 19 | train_test_split.txt contains the suggested training/validation split, with each line corresponding 20 | to one image. The content of the file is expected to be as follows: 21 | 22 | where image_id is a unique identifier for each image in the dataset (same as in images.txt), and 23 | is_training_image takes either value 1 or 0, denoting that the file is in the training or the validation 24 | set, respectively. 25 | An example line is the following: 26 | 0000139e-21dc-4d0c-bfe1-4cae3c85c829 0 27 | 28 | sizes.txt contains the spatial dimensions of each image, with each line corresponding to one image. 29 | The content of the file is expected to be as follows: 30 | 31 | where image_id is a unique identifier for each image in the dataset (same as in images.txt), width 32 | is the width of the corresponding image in pixels, and height is the height of the image in pixels. 33 | An example line is the following: 34 | 0000139e-21dc-4d0c-bfe1-4cae3c85c829 296 341 35 | 36 | classes.txt contains the list of human-readable labels (not all of them are represented in the image data), 37 | with each line corresponding to a different label. 
38 | The content of the file is expected to be as follows:
39 | <class_id> <class_name>
40 | where class_id is a unique numeric identifier for each class, and class_name is the corresponding human-readable label.
41 | An example line is the following:
42 | 37 Barn Owl
43 | 
44 | image_class_labels.txt contains the mapping between images and ground truth labels.
45 | The content of the file is expected to be as follows:
46 | <image_id> <class_id>
47 | where image_id is a unique identifier for each image in the dataset (same as in images.txt), and class_id is a unique
48 | numeric identifier for each class.
49 | An example line is the following:
50 | 0000139e-21dc-4d0c-bfe1-4cae3c85c829 817
51 | """
52 | 
53 | from __future__ import absolute_import, division, print_function
54 | 
55 | import argparse
56 | from datetime import datetime
57 | import os
58 | import random
59 | import sys
60 | import threading
61 | import scipy.io
62 | import pickle
63 | import six
64 | 
65 | import numpy as np
66 | import tensorflow as tf
67 | 
68 | 
69 | 
70 | parser = argparse.ArgumentParser()
71 | 
72 | parser.add_argument('--data_directory', type=str, default='/images/', help='Directory with raw image data.')
73 | parser.add_argument('--root_directory', type=str, default='/NABirds/data/', help='Directory with metadata files.')
74 | parser.add_argument('--output_directory', type=str, default='/TFRecords/', help='Output data directory.')
75 | parser.add_argument('--image_ids_struct_path', type=str, default=None, help='Path to txt file with python dictionary that contains metadata needed for the creation of TFRecord files.')
76 | 
77 | parser.add_argument('--train_shards', type=int, default=16, help='Number of shards in training TFRecord files.')
78 | parser.add_argument('--validation_shards', type=int, default=16, help='Number of shards in validation TFRecord files.')
79 | parser.add_argument('--num_threads', type=int, default=16, help='Number of threads to parallelize processing.')
80 | 
81 | FLAGS = parser.parse_args()
82 | 
83 | IMAGE_IDS_STRUCT_FNAME = 'image_ids_struct.txt'
84 | 
85 | def create_image_ids_struct():
86 | """Create dictionary with information about the NABirds dataset.
87 | It includes image filenames, ground truth numeric labels,
88 | human-readable labels, image spatial dimensions, and indicators
89 | that distinguish between the training and validation splits.
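For illustration, a resulting entry has the following form (the values are
hypothetical placeholders; the id/name pairing reuses the docstring examples):
    image_ids['0000139e-...'] = {'image_name': '0817/0000139e....jpg',
                                 'is_training_image': 0,
                                 'height': 341, 'width': 296,
                                 'class_id': '37', 'class_name': 'Barn Owl',
                                 'label': 0}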
90 | Args: 91 | - 92 | Returns: 93 | - 94 | """ 95 | 96 | image_ids = {} 97 | 98 | fname = 'images.txt' 99 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 100 | for line in f: 101 | tokens = line.strip().split() 102 | image_ids[tokens[0]] = {} 103 | image_ids[tokens[0]]['image_name'] = tokens[1] 104 | 105 | fname = 'train_test_split.txt' 106 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 107 | for line in f: 108 | tokens = line.strip().split() 109 | image_ids[tokens[0]]['is_training_image'] = int(tokens[1]) 110 | 111 | fname = 'sizes.txt' 112 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 113 | for line in f: 114 | tokens = line.strip().split() 115 | image_ids[tokens[0]]['height'] = int(tokens[2]) 116 | image_ids[tokens[0]]['width'] = int(tokens[1]) 117 | 118 | class_ids = {} 119 | fname = 'classes.txt' 120 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 121 | for line in f: 122 | tokens = line.strip().split() 123 | class_ids[tokens[0]] = tokens[1] 124 | 125 | fname = 'image_class_labels.txt' 126 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 127 | for line in f: 128 | tokens = line.strip().split() 129 | image_ids[tokens[0]]['class_id'] = tokens[1] 130 | image_ids[tokens[0]]['class_name'] = class_ids[tokens[1]] 131 | 132 | labels = {} 133 | label_id = 0 134 | for e in image_ids: 135 | if (image_ids[e]['class_id'] not in labels): 136 | labels[image_ids[e]['class_id']] = label_id 137 | image_ids[e]['label'] = label_id 138 | label_id += 1 139 | else: 140 | image_ids[e]['label'] = labels[image_ids[e]['class_id']] 141 | 142 | with open(os.path.join(FLAGS.root_directory, IMAGE_IDS_STRUCT_FNAME), "wb") as fp: 143 | pickle.dump(image_ids, fp) 144 | 145 | def _int64_feature(value): 146 | """Insert int features into Example proto. 147 | Args: 148 | value: int or list of ints; features to insert 149 | in Example proto. 150 | Returns: 151 | feature: example proto; it contains a list of ints. 152 | """ 153 | 154 | if ((not isinstance(value, list)) and (not isinstance(value, np.ndarray))): 155 | value = [value] 156 | 157 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=value)) 158 | 159 | return feature 160 | 161 | def _float_feature(value): 162 | """Insert float features into Example proto. 163 | Args: 164 | value: float or list of floats; features to insert 165 | in Example proto. 166 | Returns: 167 | feature: example proto; it contains a list of floats. 168 | """ 169 | 170 | if ((not isinstance(value, list)) and (not isinstance(value, np.ndarray))): 171 | value = [value] 172 | 173 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=value)) 174 | 175 | return feature 176 | 177 | def _bytes_feature(value): 178 | """Insert byte features into Example proto. 179 | Args: 180 | value: string or list of strings; features to 181 | insert in Example proto. 182 | Returns: 183 | feature: example proto; it contains a byte list. 184 | """ 185 | 186 | if (isinstance(value, type(tf.constant(0)))): 187 | value = value.numpy() 188 | if (six.PY3 and isinstance(value, six.text_type)): 189 | value = six.binary_type(value, encoding='utf-8') 190 | 191 | feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 192 | 193 | return feature 194 | 195 | def _convert_to_example(filename, image_buffer, label, human_label, height, width): 196 | """Build an Example proto for an image. 197 | Args: 198 | filename: string; path to image file. 199 | image_buffer: string; JPEG encoded image. 
200 | label: int; numeric ground truth label.
201 | human_label: string; human-readable label.
202 | height: int; image height in pixels.
203 | width: int; image width in pixels.
204 | Returns:
205 | example: example proto; it contains the following fields:
206 | image/height: int; image height in pixels.
207 | image/width: int; image width in pixels.
208 | image/colorspace: string; colorspace, always 'RGB'.
209 | image/channels: int; number of channels, always 3.
210 | image/class/label: int; numeric ground truth label, as assigned (0-indexed) during dataset creation.
211 | image/class/text: string; human-readable label.
212 | image/format: string; image format, always 'JPEG'.
213 | image/filename: string; image file basename.
214 | image/encoded: string; JPEG encoded image.
215 | """
216 | 
217 | colorspace = 'RGB'
218 | channels = 3
219 | image_format = 'JPEG'
220 | 
221 | example = tf.train.Example(features=tf.train.Features(feature={
222 | 'image/height': _int64_feature(height),
223 | 'image/width': _int64_feature(width),
224 | 'image/colorspace': _bytes_feature(colorspace),
225 | 'image/channels': _int64_feature(channels),
226 | 'image/class/label': _int64_feature(label),
227 | 'image/class/text': _bytes_feature(human_label),
228 | 'image/format': _bytes_feature(image_format),
229 | 'image/filename': _bytes_feature(os.path.basename(filename)),
230 | 'image/encoded': _bytes_feature(image_buffer)
231 | }))
232 | 
233 | return example
234 | 
235 | def _process_image(filename):
236 | """Process a single image file.
237 | Args:
238 | filename: string; path to an image file.
239 | Returns:
240 | image_buffer: string; JPEG encoded image.
241 | height: int; image height in pixels.
242 | width: int; image width in pixels.
243 | """
244 | 
245 | # Read image file
246 | image_data = tf.io.read_file(filename)
247 | 
248 | # Decode image
249 | try:
250 | image = tf.io.decode_image(image_data, channels=3)
251 | except Exception:
252 | print('Failed to decode image %s' % filename); raise
253 | 
254 | # Assert that the image has the appropriate dimensions
255 | assert (image.shape[2] == 3)
256 | height = image.shape[0]
257 | width = image.shape[1]
258 | assert ((height > 0) and (width > 0))
259 | 
260 | image_data = tf.io.encode_jpeg(image, format='rgb', quality=100)
261 | 
262 | return image_data, height, width
263 | 
264 | def _process_image_files_batch(thread_index, ranges, name, filenames,
265 | labels, human_labels, num_shards):
266 | """Execute one thread that processes images and saves them as TFRecords
267 | of Example protos.
268 | Args:
269 | thread_index: int; unique thread identifier.
270 | ranges: list of pairs of ints; it contains the [start, end) range of
271 | images to process.
272 | name: string; unique identifier specifying the data set.
273 | filenames: list of strings; it contains paths to image files.
274 | labels: list of ints; it contains numeric labels.
275 | human_labels: list of strings; it contains human-readable labels.
276 | num_shards: int; number of shards.
277 | Returns:
278 | -
279 | """
280 | 
281 | # Each thread produces N shards where N = int(num_shards / num_threads).
282 | # For instance, if num_shards = 128 and num_threads = 2, then the first
283 | # thread would produce shards [0, 64)
284 | num_threads = len(ranges)
285 | assert not num_shards % num_threads
286 | num_shards_per_batch = int(num_shards / num_threads)
287 | 
288 | shard_ranges = np.linspace(ranges[thread_index][0],
289 | ranges[thread_index][1],
290 | num_shards_per_batch + 1).astype(int)
291 | num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
292 | 
293 | # Generate each shard
294 | counter = 0
295 | for s in range(num_shards_per_batch):
296 | shard = thread_index * num_shards_per_batch + s
297 | output_filename = '%s-%.4d-of-%.4d' % (name, (shard+1), num_shards)
298 | output_file = os.path.join(FLAGS.output_directory, output_filename)
299 | writer = tf.io.TFRecordWriter(output_file)
300 | 
301 | # Process each file for a shard
302 | shard_counter = 0
303 | files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
304 | for i in files_in_shard:
305 | filename = filenames[i]
306 | label = labels[i]
307 | human_label = human_labels[i]
308 | 
309 | # Process an image
310 | image_buffer, height, width = _process_image(filename)
311 | 
312 | # Create an Example proto
313 | example = _convert_to_example(filename, image_buffer, label,
314 | human_label, height, width)
315 | 
316 | # Write to TFRecord
317 | writer.write(example.SerializeToString())
318 | shard_counter += 1
319 | counter += 1
320 | 
321 | if (not (counter % 1000)):
322 | print('%s [thread %d]: Processed %d of %d images in thread batch.' %
323 | (datetime.now(), thread_index, counter, num_files_in_thread))
324 | sys.stdout.flush()
325 | 
326 | writer.close()
327 | print('%s [thread %d]: Wrote %d images to %s' %
328 | (datetime.now(), thread_index, shard_counter, output_file))
329 | sys.stdout.flush()
330 | shard_counter = 0
331 | print('%s [thread %d]: Wrote %d images to %d shards.'
332 | %(datetime.now(), thread_index, counter, num_shards_per_batch))
333 | sys.stdout.flush()
334 | 
335 | def _process_image_files(name, filenames, labels, human_labels, num_shards):
336 | """Process images and save them as TFRecords of Example protos.
337 | Args:
338 | name: string; unique identifier specifying the data set.
339 | filenames: list of strings; it contains paths to image files.
340 | labels: list of ints; it contains numeric labels.
341 | human_labels: list of strings; it contains human-readable labels.
342 | num_shards: int; number of shards.
343 | Returns:
344 | -
345 | """
346 | 
347 | assert len(filenames) == len(labels) == len(human_labels)
348 | 
349 | # Break images into batches
350 | spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(int)
351 | ranges = []
352 | for i in range(len(spacing) - 1):
353 | ranges.append([spacing[i], spacing[i + 1]])
354 | 
355 | # Launch a thread for each batch
356 | print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
357 | sys.stdout.flush()
358 | 
359 | # Create a mechanism for monitoring threads' execution
360 | coord = tf.train.Coordinator()
361 | 
362 | # Run threads
363 | threads = []
364 | for thread_index in range(len(ranges)):
365 | args = (thread_index, ranges, name, filenames,
366 | labels, human_labels, num_shards)
367 | t = threading.Thread(target=_process_image_files_batch, args=args)
368 | t.start()
369 | threads.append(t)
370 | 
371 | # Wait for all the threads to terminate
372 | coord.join(threads)
373 | print('%s: Finished writing all %d images in data set.'
%(datetime.now(), len(filenames)))
374 | sys.stdout.flush()
375 | 
376 | def _find_image_files(name, data_dir):
377 | """Build lists of image file paths, numeric labels, and
378 | human-readable labels.
379 | Args:
380 | name: string; unique identifier specifying the data set.
381 | data_dir: string; path to data set.
382 | Returns:
383 | filenames: list of strings; it contains paths to image files.
384 | labels: list of ints; it contains numeric labels.
385 | human_labels: list of strings; it contains human-readable labels.
386 | """
387 | 
388 | data_type_bool = int(name == 'train')
389 | 
390 | with open(os.path.join(FLAGS.root_directory, IMAGE_IDS_STRUCT_FNAME), "rb") as fp:
391 | image_ids = pickle.load(fp)
392 | 
393 | # Iterate over the image files
394 | filenames = []
395 | labels = []
396 | human_labels = []
397 | label_num = 0
398 | for e in image_ids:
399 | im_struct = image_ids[e]
400 | if (im_struct['is_training_image'] == data_type_bool):
401 | filenames.append(os.path.join(data_dir, im_struct['image_name']))
402 | 
403 | if (im_struct['label'] not in labels):
404 | label_num += 1
405 | labels.append(im_struct['label'])
406 | human_labels.append(im_struct['class_name'])
407 | 
408 | # Shuffle the ordering of all image files in order to guarantee
409 | # random ordering of the images with respect to labels in the
410 | # saved TFRecord files. Make the randomization repeatable.
411 | shuffled_index = list(range(len(filenames)))
412 | random.seed(12345)
413 | random.shuffle(shuffled_index)
414 | 
415 | filenames = [filenames[i] for i in shuffled_index]
416 | labels = [labels[i] for i in shuffled_index]
417 | human_labels = [human_labels[i] for i in shuffled_index]
418 | 
419 | print('Found %d .jpg files across %d labels inside %s.' %(len(filenames), label_num, data_dir))
420 | sys.stdout.flush()
421 | 
422 | return filenames, labels, human_labels
423 | 
424 | def _process_dataset(name, directory, num_shards):
425 | """Process a complete data set and save it in TFRecords.
426 | Args:
427 | name: string; unique identifier specifying the data set.
428 | directory: string; path to data set.
429 | num_shards: int; number of shards.
430 | Returns:
431 | -
432 | """
433 | 
434 | filenames, labels, human_labels = _find_image_files(name, directory)
435 | _process_image_files(name, filenames, labels, human_labels, num_shards)
436 | 
437 | def main(argv=None):
438 | """Convert NABirds training and validation images to TFRecords.
439 | Args:
440 | -
441 | Returns:
442 | -
443 | """
444 | 
445 | assert not FLAGS.train_shards % FLAGS.num_threads, ('Please make FLAGS.num_threads commensurate with FLAGS.train_shards')
446 | assert not FLAGS.validation_shards % FLAGS.num_threads, ('Please make FLAGS.num_threads commensurate with FLAGS.validation_shards')
447 | 
448 | if (not os.path.isdir(FLAGS.output_directory)):
449 | os.makedirs(FLAGS.output_directory)
450 | print('Saving results to %s' % FLAGS.output_directory)
451 | sys.stdout.flush()
452 | 
453 | # Create dictionary with metadata information
454 | if (not FLAGS.image_ids_struct_path):
455 | create_image_ids_struct()
456 | 
457 | # Create TFRecords
458 | _process_dataset('validation', FLAGS.data_directory, FLAGS.validation_shards)
459 | _process_dataset('train', FLAGS.data_directory, FLAGS.train_shards)
460 | 
461 | if __name__ == '__main__':
462 | main()
463 | 
--------------------------------------------------------------------------------
/fMoW/input_fMoW.py:
--------------------------------------------------------------------------------
1 | """Prepare input batches.
2 | """
3 | 
4 | from __future__ import absolute_import, division, print_function
5 | 
6 | import os
7 | import numpy as np
8 | import tensorflow as tf
9 | from tensorflow.python.ops import control_flow_ops
10 | 
11 | 
12 | 
13 | _SHUFFLE_BUFFER = 10000
14 | NUM_CHANNELS = 3
15 | TRAIN_SHARDS_NUM = 512
16 | VAL_SHARDS_NUM = 128
17 | TEST_SHARDS_NUM = 128
18 | 
19 | def get_filenames(dataset_type, data_dir):
20 | """Return filenames for dataset.
21 | Args:
22 | dataset_type: string; type of dataset.
23 | data_dir: string; directory containing the input data.
24 | Returns:
25 | data_filenames: list of strings; it contains paths to TFRecords.
26 | """
27 | 
28 | # Data are assumed to be stored in TFRecords
29 | if (dataset_type == 'train'):
30 | data_filenames = [os.path.join(data_dir, 'train-%05d-of-%05d' % (i, TRAIN_SHARDS_NUM)) for i in range(TRAIN_SHARDS_NUM)]
31 | elif (dataset_type == 'validation'):
32 | data_filenames = [os.path.join(data_dir, 'validation-%05d-of-%05d' % (i, VAL_SHARDS_NUM)) for i in range(VAL_SHARDS_NUM)]
33 | elif (dataset_type == 'test'):
34 | data_filenames = [os.path.join(data_dir, 'test-%05d-of-%05d' % (i, TEST_SHARDS_NUM)) for i in range(TEST_SHARDS_NUM)]
35 | 
36 | return data_filenames
37 | 
38 | def parse_example_proto(example_serialized, adv_eval_data=False):
39 | """Parse an Example proto that corresponds to an image.
40 | Args:
41 | example_serialized: string; serialized Example protocol buffer.
42 | adv_eval_data: boolean; whether to include information for advanced
43 | evaluation in the input batches.
44 | Returns:
45 | to_batch: tuple; it contains the following entries:
46 | encoded_img: string; encoded JPEG file.
47 | label: int; numeric image label.
48 | bbox: 3-D float Tensor; it contains the bounding boxes related to an
49 | image. Bounding box coordinates are in range [0, 1], arranged in
50 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape
51 | [1, num_boxes, 4], where num_boxes is the number of bounding
52 | boxes related to the image.
53 | img_filename (optional): string; the filename of an image.
54 | img_label_text (optional): string; the human-readable label of an image.
55 | """ 56 | 57 | # Extract dense features in Example proto 58 | feature_map = { 59 | 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 60 | 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1), 61 | 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 62 | 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, default_value='') 63 | } 64 | # Extract sparse features in Example proto 65 | sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) 66 | feature_map.update( 67 | {k: sparse_float32 for k in ['image/object/bbox/xmin', 'image/object/bbox/ymin', 68 | 'image/object/bbox/xmax', 'image/object/bbox/ymax']}) 69 | 70 | features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) 71 | encoded_img = features['image/encoded'] 72 | label = tf.cast(features['image/class/label'], dtype=tf.int32) 73 | 74 | ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) 75 | xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) 76 | ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) 77 | xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) 78 | 79 | # Make the variable number of bounding boxes into 80 | # the shape [1, num_boxes, coords] 81 | bbox = tf.concat([ymin, xmin, ymax, xmax], 0) 82 | bbox = tf.expand_dims(bbox, 0) 83 | bbox = tf.transpose(a=bbox, perm=[0, 2, 1]) 84 | 85 | if (not adv_eval_data): 86 | to_batch = (encoded_img, label, bbox) 87 | else: 88 | img_filename = features['image/filename'] 89 | img_label_text = features['image/class/text'] 90 | to_batch = (encoded_img, label, bbox, img_filename, img_label_text) 91 | 92 | return to_batch 93 | 94 | def apply_with_random_selector(x, func, cases): 95 | """Compute func(x, cases[sel]), with sel sampled from cases. 96 | Args: 97 | x: Tensor; input Tensor to process. 98 | func: function; python function to apply. 99 | num_cases: list; cases to sample from. 100 | Returns: 101 | The result of func(x, cases[sel]), sel is sampled dynamically. 102 | """ 103 | 104 | sel = tf.random.uniform([], maxval=len(cases), dtype=tf.int32) 105 | # Pass the input only to one of the func calls 106 | return control_flow_ops.merge([ 107 | func(control_flow_ops.switch(x, tf.equal(sel, i))[1], cases[i]) 108 | for i in range(len(cases))])[0] 109 | 110 | def distort_color(image, color_ordering): 111 | """Distort the color of an image. 112 | Args: 113 | image: 3-D float Tensor; it contains an image. It is of 114 | size [H, W, C], where H is the image height, W is 115 | the image width, and C is the number of channels. 116 | color_ordering: int; denotes the kind of color distortion. 117 | Returns: 118 | image: 3-D float Tensor; it contains an image. It is of 119 | size [H, W, C], where H is the image height, W is 120 | the image width, and C is the number of channels. 121 | """ 122 | 123 | if color_ordering == 0: 124 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 125 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 126 | image = tf.image.random_hue(image, max_delta=0.2) 127 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 128 | elif color_ordering == 1: 129 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 130 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 
131 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 132 | image = tf.image.random_hue(image, max_delta=0.2) 133 | elif color_ordering == 2: 134 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 135 | image = tf.image.random_hue(image, max_delta=0.2) 136 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 137 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 138 | elif color_ordering == 3: 139 | image = tf.image.random_hue(image, max_delta=0.2) 140 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 141 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 142 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 143 | elif color_ordering == 4: 144 | return image 145 | else: 146 | raise ValueError('color_ordering must be in [0, 4]') 147 | 148 | # The random_* ops do not necessarily clamp 149 | image = tf.clip_by_value(image, 0.0, 1.0) 150 | 151 | return image 152 | 153 | def distort_image(image_buffer, output_height, output_width, num_channels, bbox): 154 | """Distort an image for data augmentation. 155 | Args: 156 | image_buffer: string; raw JPEG image buffer. 157 | output_height: int; height of the image after preprocessing. 158 | output_width: int; width of the image after preprocessing. 159 | num_channels: int; depth of the image buffer for decoding. 160 | bbox: 3-D float Tensor; it contains the bounding boxes related to 161 | an image. Bounding box coordinates are in range [0, 1], 162 | arranged in order [ymin, xmin, ymax, xmax]. The Tensor is of 163 | shape [1, num_boxes, 4], where num_boxes is the number of 164 | bounding boxes related to the image. 165 | Returns: 166 | distorted_image: 3-D float Tensor; it contains an image. It is of 167 | size [H, W, C], where H is the image height, W is the image 168 | width, and C is the number of channels. 169 | """ 170 | 171 | # Create a bounding box by distorting an existing one (if it is provided). 172 | # The new bounding box should respect specific constraints, e.g., be within 173 | # a range of aspect ratios. If no bounding box is provided, the entire 174 | # image is considered the initial bounding box to be distorted. 175 | sampled_distorted_bounding_box = tf.image.sample_distorted_bounding_box( 176 | tf.io.extract_jpeg_shape(image_buffer), 177 | bounding_boxes=bbox, 178 | min_object_covered=0.1, 179 | aspect_ratio_range=[0.5, 2.0], 180 | area_range=[0.85, 1.0], 181 | max_attempts=50, 182 | use_image_if_no_bounding_boxes=True, 183 | seed=0) 184 | bbox_begin, bbox_size, _ = sampled_distorted_bounding_box 185 | 186 | # Reassemble and crop the bounding box 187 | offset_y, offset_x, _ = tf.unstack(bbox_begin) 188 | target_height, target_width, _ = tf.unstack(bbox_size) 189 | crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) 190 | distorted_image = tf.image.decode_and_crop_jpeg(image_buffer, crop_window, channels=num_channels) 191 | distorted_image = tf.image.convert_image_dtype(distorted_image, dtype=tf.float32) 192 | 193 | # Resize the image. Select a resize method randomly. The image aspect ratio may change. 
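# The selector below draws one resize method uniformly at random per example.
# A rough eager-mode sketch of the intent (illustrative only; the code instead
# uses graph-mode switch/merge via apply_with_random_selector) would be:
#
#   import random
#   method = random.choice(resize_methods)
#   distorted_image = tf.image.resize(distorted_image,
#                                     [output_height, output_width],
#                                     method=method, antialias=False)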
194 | resize_methods = [tf.image.ResizeMethod.BILINEAR,
195 | tf.image.ResizeMethod.LANCZOS3,
196 | tf.image.ResizeMethod.LANCZOS5,
197 | tf.image.ResizeMethod.BICUBIC,
198 | tf.image.ResizeMethod.GAUSSIAN,
199 | tf.image.ResizeMethod.NEAREST_NEIGHBOR,
200 | tf.image.ResizeMethod.AREA,
201 | tf.image.ResizeMethod.MITCHELLCUBIC]
202 | distorted_image = apply_with_random_selector(distorted_image,
203 | lambda x, resize_method: tf.image.resize(x,
204 | [output_height, output_width],
205 | method=resize_method, antialias=False),
206 | cases=resize_methods)
207 | 
208 | # Restore image shape
209 | distorted_image.set_shape([output_height, output_width, num_channels])
210 | 
211 | # Perform a random horizontal flip of the image
212 | distorted_image = tf.image.random_flip_left_right(distorted_image)
213 | 
214 | # Perform random color distortions
215 | distorted_image = apply_with_random_selector(distorted_image,
216 | lambda x, color_ordering: distort_color(x, color_ordering),
217 | cases=np.arange(5))
218 | 
219 | return distorted_image
220 | 
221 | def preprocess_image(image_buffer, bbox, output_height, output_width,
222 | num_channels, dataset_type, is_training):
223 | """Preprocess an image.
224 | Args:
225 | image_buffer: string; encoded JPEG file.
226 | bbox: 3-D float Tensor; it contains the bounding boxes related to an
227 | image. Bounding box coordinates are in range [0, 1], arranged in
228 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape
229 | [1, num_boxes, 4], where num_boxes is the number of bounding
230 | boxes related to the image.
231 | output_height: int; height of the image after preprocessing.
232 | output_width: int; width of the image after preprocessing.
233 | num_channels: int; depth of the image buffer for decoding.
234 | dataset_type: string; type of dataset.
235 | is_training: boolean; whether the input will be used for training.
236 | Returns:
237 | image: 3-D float Tensor; it contains an image. It is of
238 | size [H, W, C], where H is the image height, W is
239 | the image width, and C is the number of channels.
240 | """
241 | 
242 | if ((dataset_type == 'train') and (is_training)):
243 | # For training data during training, apply random distortions for data augmentation
244 | image = distort_image(image_buffer, output_height, output_width, num_channels, bbox)
245 | else:
246 | # Decode and resize the input image
247 | image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
248 | image = tf.image.convert_image_dtype(image, dtype=tf.float32)
249 | image = tf.expand_dims(image, 0)
250 | image = tf.image.resize(image, [output_height, output_width], method=tf.image.ResizeMethod.BILINEAR, antialias=False)
251 | image = tf.squeeze(image, [0])
252 | 
253 | # Transform image values from range [0, 1] to [-1, 1]
254 | image = tf.subtract(image, 0.5)
255 | image = tf.multiply(image, 2.0)
256 | 
257 | return image
258 | 
259 | def parse_record(raw_record, dataset_type, is_training,
260 | img_size_y, img_size_x, dtype, adv_eval_data):
261 | """Parse a record containing a training example that corresponds to an image.
262 | Args:
263 | raw_record: string; serialized Example protocol buffer.
264 | dataset_type: string; type of dataset.
265 | is_training: boolean; whether the input will be used for training.
266 | img_size_y: int; image height in pixels.
267 | img_size_x: int; image width in pixels.
268 | dtype: string; data type to use for images/features.
269 | adv_eval_data: boolean; whether to include information for advanced 270 | evaluation in the input batches. 271 | Returns: 272 | batch: tuple; it contains the following entries: 273 | image: 3-D float Tensor; it contains an image. It is of 274 | size [H, W, C], where H is the image height, W is 275 | the image width, and C is the number of channels. 276 | label: int; numeric image label. 277 | img_filename (optional): string; the filename of an image. 278 | img_label_text (optional): string; the human-readable label of an image. 279 | """ 280 | 281 | # Parse Example protocol buffer 282 | if (not adv_eval_data): 283 | image_buffer, label, bbox = parse_example_proto(raw_record, adv_eval_data) 284 | else: 285 | (image_buffer, label, bbox, 286 | img_filename, img_label_text) = parse_example_proto(raw_record, adv_eval_data) 287 | 288 | # Pre-process image 289 | image = preprocess_image(image_buffer=image_buffer, 290 | bbox=bbox, 291 | output_height=img_size_y, 292 | output_width=img_size_x, 293 | num_channels=NUM_CHANNELS, 294 | dataset_type=dataset_type, 295 | is_training=is_training) 296 | image = tf.cast(image, dtype) 297 | 298 | label = tf.cast(tf.reshape(label, shape=[1]), dtype=tf.float32) 299 | 300 | # Return batch 301 | if (not adv_eval_data): 302 | batch = (image, label) 303 | else: 304 | batch = (image, label, img_filename, img_label_text) 305 | 306 | return batch 307 | 308 | def process_record_dataset(dataset, 309 | dataset_type, 310 | is_training, 311 | batch_size, 312 | img_size_y, 313 | img_size_x, 314 | shuffle_buffer, 315 | parse_record_fn, 316 | num_epochs=-1, 317 | dtype=tf.float32, 318 | datasets_num_private_threads=None, 319 | drop_remainder=False, 320 | tf_data_experimental_slack=False, 321 | adv_eval_data=False): 322 | """Create input dataset from raw records. 323 | Args: 324 | dataset: tf dataset; dataset with raw records. 325 | dataset_type: string; type of dataset. 326 | is_training: boolean; whether the input will be used for training. 327 | batch_size: int; number of samples per batch (global, not per replica). 328 | img_size_y: int; image height in pixels. 329 | img_size_x: int; image width in pixels. 330 | shuffle_buffer: int; buffer size to use when shuffling records. A larger 331 | value results in higher randomness, but a smaller one reduces startup 332 | time and uses less memory. 333 | parse_record_fn: function; function that processes raw records. 334 | num_epochs: int; number of times to repeat the dataset. 335 | dtype: string; data type to use for images/features. 336 | drop_remainder: boolean; whether to drop the remainder of the 337 | batches. If True, the batch dimension will be static. 338 | adv_eval_data: boolean; whether to include information for advanced 339 | evaluation in the input batches. 340 | Returns: 341 | dataset: tf dataset; iterable input dataset. 
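Note: datasets_num_private_threads and tf_data_experimental_slack are
accepted in the signature but are not referenced in the body below; they
appear to be kept for interface compatibility.
Example; an illustrative call (raw_ds is a hypothetical TFRecordDataset
and the sizes are placeholders):
    ds = process_record_dataset(raw_ds, 'train', True, 64, 224, 224,
                                _SHUFFLE_BUFFER, parse_record, num_epochs=50)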
342 | """ 343 | 344 | # Shuffle records before repeating, to respect epoch boundaries 345 | if (is_training): 346 | dataset = dataset.shuffle(buffer_size=shuffle_buffer) 347 | 348 | # Repeat dataset for the number of epochs to train 349 | if (num_epochs < 1): 350 | dataset = dataset.repeat() 351 | else: 352 | dataset = dataset.repeat(num_epochs) 353 | 354 | # Parse raw records 355 | dataset = dataset.map(lambda value: parse_record_fn(value, dataset_type, is_training, 356 | img_size_y, img_size_x, dtype, adv_eval_data), 357 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 358 | dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) 359 | 360 | # Operations between the final prefetch and the get_next call to the iterator 361 | # will happen synchronously during run time. Prefetch here again to 362 | # background all of the above processing work and keep it out of the 363 | # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE 364 | # allows DistributionStrategies to adjust how many batches to fetch based 365 | # on how many devices are present. 366 | dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 367 | 368 | return dataset 369 | 370 | def input_fn(dataset_type, 371 | is_training, 372 | data_dir, 373 | batch_size, 374 | img_size_y, 375 | img_size_x, 376 | num_epochs=-1, 377 | dtype=tf.float32, 378 | parse_record_fn=parse_record, 379 | drop_remainder=False, 380 | filenames=None, 381 | adv_eval_data=False): 382 | """Prepare input batches. 383 | Args: 384 | dataset_type: string; type of dataset. 385 | is_training: boolean; whether the input will be used for training. 386 | data_dir: string; directory containing the input data. 387 | batch_size: int; number of samples per batch (global, not per replica). 388 | img_size_y: int; image height in pixels. 389 | img_size_x: int; image width in pixels. 390 | num_epochs: int; number of times to repeat the dataset. 391 | dtype: string; data type to use for images/features. 392 | parse_record_fn: function; function that processes raw records. 393 | drop_remainder: boolean; indicates whether to drop the remainder of the 394 | batches. If True, the batch dimension will be static. 395 | filenames: list of strings; it contains paths to TFRecords. 396 | adv_eval_data: boolean; whether to include information for advanced 397 | evaluation in the input batches. 398 | Returns: 399 | input_dataset: tf dataset; iterable input dataset. 
400 | """ 401 | 402 | # Get TFRecords paths 403 | if (filenames is None): 404 | filenames = get_filenames(dataset_type, data_dir) 405 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 406 | 407 | # Shuffle input files 408 | if (is_training): 409 | if (dataset_type == 'train'): 410 | dataset = dataset.shuffle(buffer_size=TRAIN_SHARDS_NUM) 411 | elif (dataset_type == 'validation'): 412 | dataset = dataset.shuffle(buffer_size=VAL_SHARDS_NUM) 413 | elif (dataset_type == 'test'): 414 | dataset = dataset.shuffle(buffer_size=TEST_SHARDS_NUM) 415 | 416 | # Process input files concurrently 417 | dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE) 418 | 419 | # Process TFRecords 420 | input_dataset = process_record_dataset(dataset=dataset, 421 | dataset_type=dataset_type, 422 | is_training=is_training, 423 | batch_size=batch_size, 424 | img_size_y=img_size_y, 425 | img_size_x=img_size_x, 426 | shuffle_buffer=_SHUFFLE_BUFFER, 427 | parse_record_fn=parse_record_fn, 428 | num_epochs=num_epochs, 429 | dtype=dtype, 430 | drop_remainder=drop_remainder, 431 | adv_eval_data=adv_eval_data) 432 | 433 | return input_dataset 434 | -------------------------------------------------------------------------------- /ImageNet/input_imagenet.py: -------------------------------------------------------------------------------- 1 | """Prepare input batches, based on 2 | https://github.com/tensorflow/models/blob/master/official/vision/image_classification/resnet/imagenet_preprocessing.py. 3 | """ 4 | 5 | from __future__ import absolute_import, division, print_function 6 | 7 | import os 8 | import numpy as np 9 | import tensorflow as tf 10 | from tensorflow.python.ops import control_flow_ops 11 | 12 | 13 | 14 | _SHUFFLE_BUFFER = 10000 15 | NUM_CHANNELS = 3 16 | TRAIN_SHARDS_NUM = 1024 17 | VAL_SHARDS_NUM = 128 18 | 19 | def get_filenames(is_train_dataset, data_dir): 20 | """Return filenames for dataset. 21 | Args: 22 | is_train_dataset: boolean; whether the input is the training 23 | or the validation set. 24 | data_dir: string; directory containing the input data. 25 | Returns: 26 | data_filemames: list of strings; it contains paths to TFRecords. 27 | """ 28 | 29 | # Data are assumed to be stored in TFRecords 30 | if (is_train_dataset): 31 | data_filemames = [os.path.join(data_dir, 'train-%05d-of-%05d' % (i, TRAIN_SHARDS_NUM)) for i in range(TRAIN_SHARDS_NUM)] 32 | else: 33 | data_filemames = [os.path.join(data_dir, 'validation-%05d-of-%05d' % (i, VAL_SHARDS_NUM)) for i in range(VAL_SHARDS_NUM)] 34 | 35 | return data_filemames 36 | 37 | def parse_example_proto(example_serialized, adv_eval_data=False): 38 | """Parse an Example proto that corresponds to an image. 39 | Each Example proto contains the following fields (values are included as examples): 40 | image/height: 462 41 | image/width: 581 42 | image/colorspace: 'RGB' 43 | image/channels: 3 44 | image/class/label: 1 - 1000 # label value 0 was left empty for the background class when building the dataset 45 | image/class/synset: 'n03623198' 46 | image/class/text: 'knee pad' 47 | image/object/bbox/xmin: 0.1 48 | image/object/bbox/xmax: 0.9 49 | image/object/bbox/ymin: 0.2 50 | image/object/bbox/ymax: 0.6 51 | image/object/bbox/label: 615 52 | image/format: 'JPEG' 53 | image/filename: 'ILSVRC2012_val_00041207.JPEG' 54 | image/encoded: 55 | Args: 56 | example_serialized: string; serialized Example protocol buffer. 
57 | adv_eval_data: boolean; whether to include information for advanced
58 | evaluation in the input batches.
59 | Returns:
60 | to_batch: tuple; it contains the following entries:
61 | encoded_img: string; encoded JPEG file.
62 | label: int; numeric image label.
63 | bbox: 3-D float Tensor; it contains the bounding boxes related to an
64 | image. Bounding box coordinates are in range [0, 1], arranged in
65 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape [1, num_boxes, 4],
66 | where num_boxes is the number of bounding boxes related to the image.
67 | img_filename (optional): string; the filename of an image.
68 | img_synset (optional): string; the synset of an image.
69 | img_label_text (optional): string; the human-readable label of an image.
70 | """
71 | 
72 | # Extract dense features in Example proto
73 | feature_map = {
74 | 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
75 | 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1),
76 | 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
77 | 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
78 | 'image/class/synset': tf.io.FixedLenFeature([], dtype=tf.string, default_value='')
79 | }
80 | # Extract sparse features in Example proto
81 | sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32)
82 | feature_map.update(
83 | {k: sparse_float32 for k in ['image/object/bbox/xmin', 'image/object/bbox/ymin',
84 | 'image/object/bbox/xmax', 'image/object/bbox/ymax']})
85 | 
86 | features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map)
87 | encoded_img = features['image/encoded']
88 | label = tf.cast(features['image/class/label'], dtype=tf.int32)
89 | 
90 | xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
91 | ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
92 | xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
93 | ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
94 | 
95 | # Make the variable number of bounding boxes into
96 | # the shape [1, num_boxes, coords]
97 | bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
98 | bbox = tf.expand_dims(bbox, 0)
99 | bbox = tf.transpose(a=bbox, perm=[0, 2, 1])
100 | 
101 | if (not adv_eval_data):
102 | to_batch = (encoded_img, label, bbox)
103 | else:
104 | img_filename = features['image/filename']
105 | img_synset = features['image/class/synset']
106 | img_label_text = features['image/class/text']
107 | to_batch = (encoded_img, label, bbox, img_filename, img_synset, img_label_text)
108 | 
109 | return to_batch
110 | 
111 | def apply_with_random_selector(x, func, cases):
112 | """Compute func(x, cases[sel]), where sel is a uniformly sampled index into cases.
113 | Args:
114 | x: Tensor; input Tensor to process.
115 | func: function; Python function to apply.
116 | cases: list; cases to sample from.
117 | Returns:
118 | The result of func(x, cases[sel]), where sel is sampled dynamically.
119 | """
120 | 
121 | sel = tf.random.uniform([], maxval=len(cases), dtype=tf.int32)
122 | # Pass the input only to one of the func calls
123 | return control_flow_ops.merge([
124 | func(control_flow_ops.switch(x, tf.equal(sel, i))[1], cases[i])
125 | for i in range(len(cases))])[0]
126 | 
127 | def distort_color(image, color_ordering):
128 | """Distort the color of an image.
129 | Args:
130 | image: 3-D float Tensor; it contains an image. It is of
131 | size [H, W, C], where H is the image height, W is
132 | the image width, and C is the number of channels.
133 | color_ordering: int; denotes the kind of color distortion. 134 | Returns: 135 | image: 3-D float Tensor; it contains an image. It is of 136 | size [H, W, C], where H is the image height, W is 137 | the image width, and C is the number of channels. 138 | """ 139 | 140 | if color_ordering == 0: 141 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 142 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 143 | image = tf.image.random_hue(image, max_delta=0.2) 144 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 145 | elif color_ordering == 1: 146 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 147 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 148 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 149 | image = tf.image.random_hue(image, max_delta=0.2) 150 | elif color_ordering == 2: 151 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 152 | image = tf.image.random_hue(image, max_delta=0.2) 153 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 154 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 155 | elif color_ordering == 3: 156 | image = tf.image.random_hue(image, max_delta=0.2) 157 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 158 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 159 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 160 | elif color_ordering == 4: 161 | return image 162 | else: 163 | raise ValueError('color_ordering must be in [0, 4]') 164 | 165 | # The random_* ops do not necessarily clamp 166 | image = tf.clip_by_value(image, 0.0, 1.0) 167 | 168 | return image 169 | 170 | def distort_image(image_buffer, output_height, output_width, num_channels, bbox): 171 | """Distort an image for data augmentation. 172 | Args: 173 | image_buffer: string; raw JPEG image buffer. 174 | output_height: int; height of the image after preprocessing. 175 | output_width: int; width of the image after preprocessing. 176 | num_channels: int; depth of the image buffer for decoding. 177 | bbox: 3-D float Tensor; it contains the bounding boxes related to 178 | an image. Bounding box coordinates are in range [0, 1], 179 | arranged in order [ymin, xmin, ymax, xmax]. The Tensor is of 180 | shape [1, num_boxes, 4], where num_boxes is the number of 181 | bounding boxes related to the image. 182 | Returns: 183 | distorted_image: 3-D float Tensor; it contains an image. It is of 184 | size [H, W, C], where H is the image height, W is the image 185 | width, and C is the number of channels. 186 | """ 187 | 188 | # Create a bounding box by distorting an existing one (if it is provided). 189 | # The new bounding box should respect specific constraints, e.g., be within 190 | # a range of aspect ratios. If no bounding box is provided, the entire 191 | # image is considered the initial bounding box to be distorted. 
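# For illustration, with the arguments passed below: the sampled crop must
# contain at least 10% of a supplied bounding box (min_object_covered=0.1),
# have an aspect ratio in [0.5, 2.0], and cover 85-100% of the image area;
# if no valid crop is found after 50 attempts, the whole image is used.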
192 | sampled_distorted_bounding_box = tf.image.sample_distorted_bounding_box( 193 | tf.io.extract_jpeg_shape(image_buffer), 194 | bounding_boxes=bbox, 195 | min_object_covered=0.1, 196 | aspect_ratio_range=[0.5, 2.0], 197 | area_range=[0.85, 1.0], 198 | max_attempts=50, 199 | use_image_if_no_bounding_boxes=True, 200 | seed=0) 201 | bbox_begin, bbox_size, _ = sampled_distorted_bounding_box 202 | 203 | # Reassemble and crop the bounding box 204 | offset_y, offset_x, _ = tf.unstack(bbox_begin) 205 | target_height, target_width, _ = tf.unstack(bbox_size) 206 | crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) 207 | distorted_image = tf.image.decode_and_crop_jpeg(image_buffer, crop_window, channels=num_channels) 208 | distorted_image = tf.image.convert_image_dtype(distorted_image, dtype=tf.float32) 209 | 210 | # Resize the image. Select a resize method randomly. The image aspect ratio may change. 211 | resize_methods = [tf.image.ResizeMethod.BILINEAR, 212 | tf.image.ResizeMethod.LANCZOS3, 213 | tf.image.ResizeMethod.LANCZOS5, 214 | tf.image.ResizeMethod.BICUBIC, 215 | tf.image.ResizeMethod.GAUSSIAN, 216 | tf.image.ResizeMethod.NEAREST_NEIGHBOR, 217 | tf.image.ResizeMethod.AREA, 218 | tf.image.ResizeMethod.MITCHELLCUBIC] 219 | distorted_image = apply_with_random_selector(distorted_image, 220 | lambda x, resize_method: tf.image.resize(distorted_image, 221 | [output_height, output_width], 222 | method=resize_method, antialias=False), 223 | cases=resize_methods) 224 | 225 | # Restore image shape 226 | distorted_image.set_shape([output_height, output_width, num_channels]) 227 | 228 | # Perform a random horizontal flip of the image 229 | distorted_image = tf.image.random_flip_left_right(distorted_image) 230 | 231 | # Perform random color distortions 232 | distorted_image = apply_with_random_selector(distorted_image, 233 | lambda x, color_ordering: distort_color(x, color_ordering), 234 | cases=np.arange(5)) 235 | 236 | return distorted_image 237 | 238 | def preprocess_image(image_buffer, bbox, output_height, output_width, 239 | num_channels, is_train_dataset, is_training): 240 | """Preprocess an image. 241 | Args: 242 | image_buffer: string; encoded JPEG file. 243 | bbox: 3-D float Tensor; it contains the bounding boxes related to an 244 | image. Bounding box coordinates are in range [0, 1], arranged in 245 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape 246 | [1, num_boxes, 4], where num_boxes is the number of bounding 247 | boxes related to the image. 248 | output_height: int; height of the image after preprocessing. 249 | output_width: int; width of the image after preprocessing. 250 | num_channels: int; depth of the image buffer for decoding. 251 | is_train_dataset: boolean; whether the input is the training 252 | or the validation set. 253 | is_training: boolean; whether the input will be used for training. 254 | Returns: 255 | image: 3-D float Tensor; it contains an image. It is of 256 | size [H, W, C], where H is the image height, W is 257 | the image width, and C is the number of channels. 
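Example:
A minimal sketch of the evaluation path (the image path and the
224x224 output size are hypothetical); pixel values end up in [-1, 1]:

img_bytes = tf.io.read_file('/path/to/image.JPEG')
img = preprocess_image(image_buffer=img_bytes,
bbox=tf.zeros([1, 0, 4], tf.float32),  # ignored on this path
output_height=224, output_width=224,
num_channels=3,
is_train_dataset=False, is_training=False)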
258 | """ 259 | 260 | if (is_train_dataset and is_training): 261 | # For training data during training, apply random distortions for data augmentation 262 | image = distort_image(image_buffer, output_height, output_width, num_channels, bbox) 263 | else: 264 | # Decode and resize the input image 265 | image = tf.image.decode_jpeg(image_buffer, channels=num_channels) 266 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 267 | image = tf.expand_dims(image, 0) 268 | image = tf.image.resize(image, [output_height, output_width], method=tf.image.ResizeMethod.BILINEAR, antialias=False) 269 | image = tf.squeeze(image, [0]) 270 | 271 | # Transform image values from range [0, 1], to [-1, 1] 272 | image = tf.subtract(image, 0.5) 273 | image = tf.multiply(image, 2.0) 274 | 275 | return image 276 | 277 | def parse_record(raw_record, is_train_dataset, is_training, 278 | img_size_y, img_size_x, dtype, adv_eval_data): 279 | """Parse a record containing a training example that corresponds to an image. 280 | Args: 281 | raw_record: string; serialized Example protocol buffer. 282 | is_train_dataset: boolean; whether the input is the training 283 | or the validation set. 284 | is_training: boolean; whether the input will be used for training. 285 | img_size_y: int; image height in pixels. 286 | img_size_x: int; image width in pixels. 287 | dtype: string; data type to use for images/features. 288 | adv_eval_data: boolean; whether to include information for advanced 289 | evaluation in the input batches. 290 | Returns: 291 | batch: tuple; it contains the following entries: 292 | image: 3-D float Tensor; it contains an image. It is of 293 | size [H, W, C], where H is the image height, W is 294 | the image width, and C is the number of channels. 295 | label: int; numeric image label. 296 | img_filename (optional): string; the filename of an image. 297 | img_synset (optional): string; the synset of an image. 298 | img_label_text (optional): string; the human-readable label of an image. 299 | """ 300 | 301 | # Parse Example protocol buffer 302 | if (not adv_eval_data): 303 | image_buffer, label, bbox = parse_example_proto(raw_record, adv_eval_data) 304 | else: 305 | (image_buffer, label, bbox, 306 | img_filename, img_synset, img_label_text) = parse_example_proto(raw_record, adv_eval_data) 307 | 308 | # Pre-process image 309 | image = preprocess_image(image_buffer=image_buffer, 310 | bbox=bbox, 311 | output_height=img_size_y, 312 | output_width=img_size_x, 313 | num_channels=NUM_CHANNELS, 314 | is_train_dataset=is_train_dataset, 315 | is_training=is_training) 316 | image = tf.cast(image, dtype) 317 | 318 | # Subtract 1 so that labels are in [0, 1000) range 319 | label = tf.cast(tf.cast(tf.reshape(label, shape=[1]), dtype=tf.int32) - 1, dtype=tf.float32) 320 | 321 | # Return batch 322 | if (not adv_eval_data): 323 | batch = (image, label) 324 | else: 325 | batch = (image, label, img_filename, img_synset, img_label_text) 326 | 327 | return batch 328 | 329 | def process_record_dataset(dataset, 330 | is_train_dataset, 331 | is_training, 332 | batch_size, 333 | img_size_y, 334 | img_size_x, 335 | shuffle_buffer, 336 | parse_record_fn, 337 | num_epochs=-1, 338 | dtype=tf.float32, 339 | drop_remainder=False, 340 | adv_eval_data=False): 341 | """Create input dataset from raw records. 342 | Args: 343 | dataset: tf dataset; dataset with raw records. 344 | is_train_dataset: boolean; whether the input is the training 345 | or the validation set. 
346 | is_training: boolean; whether the input will be used for training. 347 | batch_size: int; number of samples per batch (global, not per replica). 348 | img_size_y: int; image height in pixels. 349 | img_size_x: int; image width in pixels. 350 | shuffle_buffer: int; buffer size to use when shuffling records. A larger 351 | value results in higher randomness, but a smaller one reduces startup 352 | time and uses less memory. 353 | parse_record_fn: function; function that processes raw records. 354 | num_epochs: int; number of times to repeat the dataset. 355 | dtype: string; data type to use for images/features. 356 | drop_remainder: boolean; whether to drop the remainder of the 357 | batches. If True, the batch dimension will be static. 358 | adv_eval_data: boolean; whether to include information for advanced 359 | evaluation in the input batches. 360 | Returns: 361 | dataset: tf dataset; iterable input dataset. 362 | """ 363 | 364 | # Shuffle records before repeating, to respect epoch boundaries 365 | if (is_training): 366 | dataset = dataset.shuffle(buffer_size=shuffle_buffer) 367 | 368 | # Repeat dataset for the number of epochs to train 369 | if (num_epochs < 1): 370 | dataset = dataset.repeat() 371 | else: 372 | dataset = dataset.repeat(num_epochs) 373 | 374 | # Parse raw records 375 | dataset = dataset.map(lambda value: parse_record_fn(value, is_train_dataset, is_training, 376 | img_size_y, img_size_x, dtype, adv_eval_data), 377 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 378 | dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) 379 | 380 | # Operations between the final prefetch and the get_next call to the iterator 381 | # will happen synchronously during run time. Prefetch here again to 382 | # background all of the above processing work and keep it out of the 383 | # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE 384 | # allows DistributionStrategies to adjust how many batches to fetch based 385 | # on how many devices are present. 386 | dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 387 | 388 | return dataset 389 | 390 | def input_fn(is_train_dataset, 391 | is_training, 392 | data_dir, 393 | batch_size, 394 | img_size_y, 395 | img_size_x, 396 | num_epochs=-1, 397 | dtype=tf.float32, 398 | parse_record_fn=parse_record, 399 | drop_remainder=True, 400 | filenames=None, 401 | adv_eval_data=False): 402 | """Prepare input batches. 403 | Args: 404 | is_train_dataset: boolean; whether the input is the training 405 | or the validation set. 406 | is_training: boolean; whether the input will be used for training. 407 | data_dir: string; directory containing the input data. 408 | batch_size: int; number of samples per batch (global, not per replica). 409 | img_size_y: int; image height in pixels. 410 | img_size_x: int; image width in pixels. 411 | num_epochs: int; number of times to repeat the dataset. 412 | dtype: string; data type to use for images/features. 413 | parse_record_fn: function; function that processes raw records. 414 | drop_remainder: boolean; indicates whether to drop the remainder of the 415 | batches. If True, the batch dimension will be static. 416 | filenames: list of strings; it contains paths to TFRecords. 417 | adv_eval_data: boolean; whether to include information for advanced 418 | evaluation in the input batches. 419 | Returns: 420 | input_dataset: tf dataset; iterable input dataset. 
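Example:
A minimal sketch for building a validation pipeline (the directory
path is hypothetical; shapes assume the default drop_remainder=True):

val_ds = input_fn(is_train_dataset=False, is_training=False,
data_dir='/path/to/TFRecords/dir/',
batch_size=64, img_size_y=224, img_size_x=224,
num_epochs=1)
images, labels = next(iter(val_ds))
# images: [64, 224, 224, 3], values in [-1, 1]; labels: [64, 1], values in [0, 1000)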
421 | """ 422 | 423 | # Get TFRecords paths 424 | if (filenames is None): 425 | filenames = get_filenames(is_train_dataset, data_dir) 426 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 427 | 428 | # Shuffle input files 429 | if (is_training): 430 | if (is_train_dataset): 431 | dataset = dataset.shuffle(buffer_size=TRAIN_SHARDS_NUM) 432 | else: 433 | dataset = dataset.shuffle(buffer_size=VAL_SHARDS_NUM) 434 | 435 | # Process input files concurrently 436 | dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE) 437 | 438 | # Process TFRecords 439 | input_dataset = process_record_dataset(dataset=dataset, 440 | is_train_dataset=is_train_dataset, 441 | is_training=is_training, 442 | batch_size=batch_size, 443 | img_size_y=img_size_y, 444 | img_size_x=img_size_x, 445 | shuffle_buffer=_SHUFFLE_BUFFER, 446 | parse_record_fn=parse_record_fn, 447 | num_epochs=num_epochs, 448 | dtype=dtype, 449 | drop_remainder=drop_remainder, 450 | adv_eval_data=adv_eval_data) 451 | 452 | return input_dataset 453 | -------------------------------------------------------------------------------- /fMoW/create_TFRecords_fMoW.py: -------------------------------------------------------------------------------- 1 | """Convert fMoW images to TFRecords. 2 | Raw fMoW data can be downloaded here https://github.com/fMoW/dataset. 3 | The current script utilizes the rgb version of fMoW, and not the full version. 4 | fMoW data are split in training, validation and test sets. After download, for 5 | the training and validations sets, jpeg and json files are expected to reside 6 | in the following directory structure: 7 | /train/airport/airport_0/airport_0_0_rgb.jpg 8 | /train/airport/airport_0/airport_0_0_rgb.json 9 | ... 10 | 11 | /val/airport/airport_0/airport_0_0_rgb.jpg 12 | /val/airport/airport_0/airport_0_0_rgb.json 13 | ... 14 | 15 | For the test set, jpeg and json files are expected to reside 16 | in the following directory structure: 17 | /test/0011978/0011978_0_rgb.jpg 18 | /test/0011978/0011978_0_rgb.json 19 | ... 20 | 21 | Test set directory structure doesn't reveal the labels of the images, because 22 | it was initially realeased in the context of an IARPA challenge (https://www.iarpa.gov/challenges/fmow.html). 23 | However, given that the challenge is over, test set annotations are available 24 | for download with the rest of the data here https://github.com/fMoW/dataset. 25 | After downloding the ground truth test data, they consist of json files that 26 | reside in the following directory structure: 27 | /test_gt/airport/airport_0/airport_0_0_rgb.json 28 | /test_gt/airport/airport_0/airport_0_1_rgb.json 29 | ... 30 | 31 | The additional test_gt_mapping.json file is provided to establish a correspondance 32 | between the annotations under folder test_gt, and the images under folder test. To 33 | this end, we provide match_test_gt.py script, which organizes jpeg and json files 34 | for the test set, in the following directory structure: 35 | /test_matched_with_gt/airport/airport_0/airport_0_0_rgb.jpeg 36 | /test_matched_with_gt/airport/airport_0/airport_0_0_rgb.json 37 | ... 38 | 39 | Given the desired uniformity in the directory organization of the training, 40 | validation, and test sets is established, the current script converts image 41 | data to TFRecord files. Each record within a TFRecord file is a serialized 42 | Example proto. 
43 | """ 44 | 45 | from __future__ import absolute_import, division, print_function 46 | 47 | import argparse 48 | from datetime import datetime 49 | import os 50 | import random 51 | import sys 52 | import threading 53 | import json 54 | 55 | import numpy as np 56 | import six 57 | import tensorflow as tf 58 | 59 | 60 | 61 | parser = argparse.ArgumentParser() 62 | 63 | parser.add_argument('--train_directory', type=str, default='/train/', help='Training data directory.') 64 | parser.add_argument('--validation_directory', type=str, default='/val/', help='Validation data directory.') 65 | parser.add_argument('--test_directory', type=str, default='/test_matched_with_gt/', help='Test data directory.') 66 | parser.add_argument('--output_directory', type=str, default='/TFRecords/', help='Output data directory.') 67 | 68 | parser.add_argument('--train_shards', type=int, default=512, help='Number of shards in training TFRecord files.') 69 | parser.add_argument('--validation_shards', type=int, default=128, help='Number of shards in validation TFRecord files.') 70 | parser.add_argument('--test_shards', type=int, default=128, help='Number of shards in test TFRecord files.') 71 | parser.add_argument('--num_threads', type=int, default=32, help='Number of threads to parallelize processing.') 72 | parser.add_argument('--maximum_min_dim', type=int, default=1000, help='Maximum size allowed for the smallest image spatial dimension.') 73 | parser.add_argument('--cropped_data', action='store_true', help='Whether the provided data are cropped acoording to bounding boxes annotations.') 74 | 75 | FLAGS = parser.parse_args() 76 | 77 | category_names = ['airport', 'airport_hangar', 'airport_terminal', 'amusement_park', 'aquaculture', 'archaeological_site', 'barn', 'border_checkpoint', 'burial_site', 'car_dealership', 'construction_site', 78 | 'crop_field', 'dam', 'debris_or_rubble', 'educational_institution', 'electric_substation', 'factory_or_powerplant', 'fire_station', 'flooded_road', 'fountain', 'gas_station', 'golf_course', 79 | 'ground_transportation_station', 'helipad', 'hospital', 'interchange', 'lake_or_pond', 'lighthouse', 'military_facility', 'multi-unit_residential', 'nuclear_powerplant', 'office_building', 80 | 'oil_or_gas_facility', 'park', 'parking_lot_or_garage', 'place_of_worship', 'police_station', 'port', 'prison', 'race_track', 'railway_bridge', 'recreational_facility', 'impoverished_settlement', 81 | 'road_bridge', 'runway', 'shipyard', 'shopping_mall', 'single-unit_residential', 'smokestack', 'solar_farm', 'space_facility', 'stadium', 'storage_tank','surface_mine', 'swimming_pool', 82 | 'toll_booth', 'tower', 'tunnel_opening', 'waste_disposal', 'water_treatment_facility', 'wind_farm', 'zoo'] 83 | 84 | def clip_0_1(x): 85 | """Clip given float number within [0, 1] range. 86 | Args: 87 | x: float; value to clip. 88 | Returns: 89 | x: float; value within [0, 1] range. 90 | """ 91 | 92 | if (x < 0.): 93 | x = 0. 94 | elif (x > 1.0): 95 | x = 1.0 96 | 97 | return x 98 | 99 | def _int64_feature(value): 100 | """Insert int features into Example proto. 101 | Args: 102 | value: int or list of ints; features to insert 103 | in Example proto. 104 | Returns: 105 | feature: example proto; it contains a list of ints. 106 | """ 107 | 108 | if (not isinstance(value, list)): 109 | value = [value] 110 | 111 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=value)) 112 | 113 | return feature 114 | 115 | def _float_feature(value): 116 | """Insert float features into Example proto. 
117 | Args:
118 | value: float or list of floats; features to insert
119 | in Example proto.
120 | Returns:
121 | feature: Feature proto; it contains a list of floats.
122 | """
123 | 
124 | if (not isinstance(value, list)):
125 | value = [value]
126 | 
127 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=value))
128 | 
129 | return feature
130 | 
131 | def _bytes_feature(value):
132 | """Insert byte features into Example proto.
133 | Args:
134 | value: string or list of strings; features to
135 | insert in Example proto.
136 | Returns:
137 | feature: Feature proto; it contains a byte list.
138 | """
139 | 
140 | if (isinstance(value, type(tf.constant(0)))):
141 | value = value.numpy()
142 | if (six.PY3 and isinstance(value, six.text_type)):
143 | value = six.binary_type(value, encoding='utf-8')
144 | 
145 | feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
146 | 
147 | return feature
148 | 
149 | def _convert_to_example(filename, image_buffer, label,
150 | category, bbox, height, width):
151 | """Build an Example proto for an image.
152 | Args:
153 | filename: string; path to image file.
154 | image_buffer: string; JPEG encoded image.
155 | label: int; numeric ground truth label.
156 | category: string; human-readable label.
157 | bbox: list; it contains coordinates of bounding boxes.
158 | height: int; image height in pixels.
159 | width: int; image width in pixels.
160 | Returns:
161 | example: Example proto; it contains the following fields:
162 | image/height: int; image height in pixels.
163 | image/width: int; image width in pixels.
164 | image/colorspace: string; colorspace, always 'RGB'.
165 | image/channels: int; number of channels, always 3.
166 | image/class/label: int; index of a classification label in range [0, 61].
167 | image/class/text: string; human-readable label.
168 | image/object/bbox/ymin: list of floats; denotes the minimum vertical pixel value
169 | of a bounding box, in proportion to the image height. It takes values in
170 | [0, 1]. Each entry in the list corresponds to a different bounding box.
171 | image/object/bbox/xmin: list of floats; denotes the minimum horizontal pixel
172 | value of a bounding box, in proportion to the image width. It takes values
173 | in [0, 1]. Each entry in the list corresponds to a different bounding box.
174 | image/object/bbox/ymax: list of floats; denotes the maximum vertical pixel value
175 | of a bounding box, in proportion to the image height. It takes values in
176 | [0, 1]. Each entry in the list corresponds to a different bounding box.
177 | image/object/bbox/xmax: list of floats; denotes the maximum horizontal pixel
178 | value of a bounding box, in proportion to the image width. It takes values
179 | in [0, 1]. Each entry in the list corresponds to a different bounding box.
180 | image/object/bbox/label: list of ints; one entry per bounding box, each
181 | identical to the image label.
182 | image/format: string; image format, always 'JPEG'.
183 | image/filename: string; image file basename.
184 | image/encoded: string; JPEG encoded image.
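Example:
A minimal usage sketch (the filename, image buffer, box coordinates,
and image dimensions are hypothetical):

example = _convert_to_example(filename='/train/airport/airport_0/airport_0_0_rgb.jpg',
image_buffer=image_buffer, label=0, category='airport',
bbox=[[0.2, 0.1, 0.6, 0.9]],  # one box: [ymin, xmin, ymax, xmax]
height=1000, width=1200)
writer.write(example.SerializeToString())  # writer: an open tf.io.TFRecordWriter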
185 | """ 186 | 187 | b_ymin = [] 188 | b_xmin = [] 189 | b_ymax = [] 190 | b_xmax = [] 191 | for b in bbox: 192 | assert len(b) == 4 193 | [l.append(point) for l, point in zip([b_ymin, b_xmin, b_ymax, b_xmax], b)] 194 | 195 | colorspace = 'RGB' 196 | channels = 3 197 | image_format = 'JPEG' 198 | 199 | example = tf.train.Example(features=tf.train.Features(feature={ 200 | 'image/height': _int64_feature(height), 201 | 'image/width': _int64_feature(width), 202 | 'image/colorspace': _bytes_feature(colorspace), 203 | 'image/channels': _int64_feature(channels), 204 | 'image/class/label': _int64_feature(label), 205 | 'image/class/text': _bytes_feature(category), 206 | 'image/object/bbox/ymin': _float_feature(b_ymin), 207 | 'image/object/bbox/xmin': _float_feature(b_xmin), 208 | 'image/object/bbox/ymax': _float_feature(b_ymax), 209 | 'image/object/bbox/xmax': _float_feature(b_xmax), 210 | 'image/object/bbox/label': _int64_feature([label] * len(b_xmin)), 211 | 'image/format': _bytes_feature(image_format), 212 | 'image/filename': _bytes_feature(os.path.basename(filename)), 213 | 'image/encoded': _bytes_feature(image_buffer) 214 | })) 215 | 216 | return example 217 | 218 | def _process_image(filename, img_size): 219 | """Process a single image file. 220 | Args: 221 | filename: string; path to an image file. 222 | img_sizes: tuple of ints; it contains the spatial 223 | dimensions of an image. 224 | Returns: 225 | image_buffer: string; JPEG encoded image. 226 | height: int; image height in pixels. 227 | width: int; image width in pixels. 228 | """ 229 | 230 | # Read image file 231 | image_data = tf.io.read_file(filename) 232 | 233 | # Calculate decoding ratio to avoid overflow due to huge images 234 | min_dim = min(img_size) 235 | if (min_dim > 8 * FLAGS.maximum_min_dim): 236 | ratio = 8 237 | elif (min_dim > 4 * FLAGS.maximum_min_dim): 238 | ratio = 4 239 | elif (min_dim > 2 * FLAGS.maximum_min_dim): 240 | ratio = 2 241 | else: 242 | ratio = 1 243 | image = tf.io.decode_jpeg(image_data, ratio=ratio, channels=3) 244 | 245 | # Ensure smallest image dimension does not exceed FLAGS.maximum_min_dim 246 | height = image.shape[0] 247 | width = image.shape[1] 248 | min_dim = min([height, width]) 249 | if (min_dim > FLAGS.maximum_min_dim): 250 | if (height == min_dim): 251 | new_height = FLAGS.maximum_min_dim 252 | new_width = np.ceil(float(new_height) * (float(width)/float(height))) 253 | else: 254 | new_width = FLAGS.maximum_min_dim 255 | new_height = np.ceil(float(new_width) * (float(height)/float(width))) 256 | 257 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 258 | image = tf.image.resize(tf.expand_dims(image, axis=0), size=[int(new_height), int(new_width)], 259 | preserve_aspect_ratio=True, method=tf.image.ResizeMethod.BILINEAR) 260 | image = tf.squeeze(image) 261 | 262 | # Assert that the image has the appropriate dimensions 263 | assert (len(image.shape) == 3) 264 | assert (image.shape[2] == 3) 265 | height = image.shape[0] 266 | width = image.shape[1] 267 | assert ((height <= FLAGS.maximum_min_dim) or (width <= FLAGS.maximum_min_dim)) 268 | 269 | # Encode the image, if it was processed 270 | if ((min_dim > FLAGS.maximum_min_dim) or (ratio != 1)): 271 | image = tf.image.convert_image_dtype(image, dtype=tf.uint8) 272 | image_data = tf.image.encode_jpeg(image, format='rgb', quality=100) 273 | 274 | return image_data, height, width 275 | 276 | def _process_image_files_batch(thread_index, ranges, name, filenames, 277 | labels, categories, bboxes, img_sizes, num_shards): 278 | """Execute 
1 thread that processes images and saves them as TFRecords 279 | of Example protos. 280 | Args: 281 | thread_index: int; unique thread identifier. 282 | ranges: list of ints; it contains the range of images to 283 | process. 284 | name: string; unique identifier specifying the data set. 285 | filenames: list of strings; it contains paths to image files. 286 | labels: list of ints; it contains numeric ground truth labels. 287 | categories: list of strings; it contains human-readable ground 288 | truth labels. 289 | bboxes: list; it contains bounding boxes for each image. 290 | img_sizes: list of tuples; each tuple contains the spatial 291 | dimensions of an image. 292 | num_shards: int; number of shards. 293 | Returns: 294 | - 295 | """ 296 | 297 | # Each thread produces N shards where N = int(num_shards / num_threads). 298 | # For instance, if num_shards = 128, and the num_threads = 2, then the first 299 | # thread would produce shards [0, 64) 300 | num_threads = len(ranges) 301 | assert not num_shards % num_threads 302 | num_shards_per_batch = int(num_shards / num_threads) 303 | 304 | shard_ranges = np.linspace(ranges[thread_index][0], 305 | ranges[thread_index][1], 306 | num_shards_per_batch + 1).astype(int) 307 | num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0] 308 | 309 | # Generate each shard 310 | counter = 0 311 | for s in range(num_shards_per_batch): 312 | shard = thread_index * num_shards_per_batch + s 313 | output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards) 314 | output_file = os.path.join(FLAGS.output_directory, output_filename) 315 | writer = tf.io.TFRecordWriter(output_file) 316 | 317 | # Process each file for a shard 318 | shard_counter = 0 319 | files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) 320 | for i in files_in_shard: 321 | filename = filenames[i] 322 | label = labels[i] 323 | category = categories[i] 324 | bbox = bboxes[i] 325 | img_size = img_sizes[i] 326 | 327 | # Process an image 328 | image_buffer, height, width = _process_image(filename, img_size) 329 | 330 | # Create an Example proto 331 | example = _convert_to_example(filename, image_buffer, label, 332 | category, bbox, height, width) 333 | 334 | # Write to TFRecord 335 | writer.write(example.SerializeToString()) 336 | shard_counter += 1 337 | counter += 1 338 | 339 | if (not (counter % 1000)): 340 | print('%s [thread %d]: Processed %d of %d images in thread batch.' 341 | %(datetime.now(), thread_index, counter, num_files_in_thread)) 342 | sys.stdout.flush() 343 | 344 | writer.close() 345 | print('%s [thread %d]: Wrote %d images to %s' 346 | %(datetime.now(), thread_index, shard_counter, output_file)) 347 | sys.stdout.flush() 348 | shard_counter = 0 349 | print('%s [thread %d]: Wrote %d images to %d shards.' 350 | %(datetime.now(), thread_index, counter, num_files_in_thread)) 351 | sys.stdout.flush() 352 | 353 | def _process_image_files(name, filenames, labels, categories, bboxes, img_sizes, num_shards): 354 | """Process images and save them as TFRecords of Example protos. 355 | Args: 356 | name: string; unique identifier specifying the data set. 357 | filenames: list of strings; it contains paths to image files. 358 | labels: list of ints; it contains numeric ground truth labels. 359 | categories: list of strings; it contains human-readable ground 360 | truth labels. 361 | bboxes: list; it contains bounding boxes for each image. 362 | img_sizes: list of tuples; each tuple contains the spatial 363 | dimensions of an image. 
364 | num_shards: int; number of shards.
365 | Returns:
366 | -
367 | """
368 | 
369 | assert len(filenames) == len(labels) == len(categories) == len(bboxes) == len(img_sizes)
370 | 
371 | # Break images into batches
372 | spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(int)  # np.int was removed in recent NumPy; use the builtin int
373 | ranges = []
374 | for i in range(len(spacing) - 1):
375 | ranges.append([spacing[i], spacing[i + 1]])
376 | 
377 | # Launch a thread for each batch
378 | print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
379 | sys.stdout.flush()
380 | 
381 | # Create a mechanism for monitoring threads' execution
382 | coord = tf.train.Coordinator()
383 | 
384 | # Run threads
385 | threads = []
386 | for thread_index in range(len(ranges)):
387 | args = (thread_index, ranges, name, filenames,
388 | labels, categories, bboxes, img_sizes, num_shards)
389 | t = threading.Thread(target=_process_image_files_batch, args=args)
390 | t.start()
391 | threads.append(t)
392 | 
393 | # Wait for all the threads to terminate
394 | coord.join(threads)
395 | print('%s: Finished writing all %d images in data set.' %(datetime.now(), len(filenames)))
396 | sys.stdout.flush()
397 | 
398 | def _find_image_files(data_dir):
399 | """Build lists of all image file paths, numeric labels, and
400 | human-readable labels in a data set.
401 | Args:
402 | data_dir: string; path to data set.
403 | Returns:
404 | filenames: list of strings; it contains paths to image files.
405 | labels: list of ints; it contains numeric ground truth labels.
406 | categories: list of strings; it contains human-readable ground
407 | truth labels.
408 | """
409 | 
410 | print('Determining list of input files and labels from %s.' % data_dir)
411 | sys.stdout.flush()
412 | filenames = []
413 | labels = []
414 | categories = []
415 | 
416 | # Construct the list of JPEG files and labels
417 | label_index = 0
418 | for category in category_names:
419 | if (not FLAGS.cropped_data):
420 | jpeg_file_path = os.path.join(data_dir, category, '*', category + '_*_rgb.jpg')
421 | else:
422 | jpeg_file_path = os.path.join(data_dir, category, '*', '*', category + '_*_rgb.jpg')
423 | matching_files = tf.io.gfile.glob(jpeg_file_path)
424 | 
425 | filenames.extend(matching_files)
426 | labels.extend([label_index] * len(matching_files))
427 | categories.extend([category] * len(matching_files))
428 | 
429 | if (not (label_index % 10)):
430 | print('Finished finding files in %d of %d classes.' % (label_index, len(category_names)))
431 | sys.stdout.flush()
432 | 
433 | label_index += 1
434 | 
435 | # Shuffle the ordering of all image files in order to guarantee
436 | # random ordering of the images with respect to labels in the
437 | # saved TFRecord files. Make the randomization repeatable.
438 | shuffled_index = list(range(len(filenames)))
439 | random.seed(12345)
440 | random.shuffle(shuffled_index)
441 | 
442 | filenames = [filenames[i] for i in shuffled_index]
443 | labels = [labels[i] for i in shuffled_index]
444 | categories = [categories[i] for i in shuffled_index]
445 | 
446 | print('Found %d .jpg files across %d labels inside %s.'
447 | %(len(filenames), len(category_names), data_dir))
448 | sys.stdout.flush()
449 | 
450 | return filenames, labels, categories
451 | 
452 | def _find_image_bounding_boxes(filenames, categories):
453 | """Find the bounding boxes for given image files.
454 | Args:
455 | filenames: list of strings; it contains paths to image files.
456 | categories: list of strings; it contains human-readable ground
457 | truth labels.
458 | Returns:
459 | bboxes: list; it contains bounding boxes for each image.
460 | img_sizes: list of tuples; each tuple contains the spatial
461 | dimensions of an image.
462 | """
463 | 
464 | num_image_bbox = 0
465 | bbox_num = 0
466 | bboxes = []
467 | img_sizes = []
468 | # Iterate over image files
469 | for i in range(len(filenames)):
470 | f = filenames[i]
471 | category = categories[i]
472 | 
473 | f_json = f.replace('.jpg', '.json')
474 | jsonData = json.load(open(f_json))
475 | 
476 | json_bboxes = jsonData['bounding_boxes']
477 | if not isinstance(json_bboxes, list):
478 | json_bboxes = [json_bboxes]
479 | 
480 | h = float(jsonData['img_height'])
481 | w = float(jsonData['img_width'])
482 | # Iterate over available bounding boxes for an image file
483 | bb_lst = []
484 | if (not FLAGS.cropped_data):
485 | for bb in json_bboxes:
486 | if ((bb['category'] != category) or (bb['ID'] == -1)):
487 | continue
488 | # Change box format from [xmin, ymin, width, height] to
489 | # [ymin, xmin, ymax, xmax], with values as fractions of the image size
490 | bb['box'] = [float(e) for e in bb['box']]
491 | ymin = bb['box'][1] / h
492 | ymin = clip_0_1(ymin)
493 | xmin = bb['box'][0] / w
494 | xmin = clip_0_1(xmin)
495 | ymax = (bb['box'][1] + bb['box'][3]) / h
496 | ymax = clip_0_1(ymax)
497 | xmax = (bb['box'][0] + bb['box'][2]) / w
498 | xmax = clip_0_1(xmax)
499 | bb_lst.append([ymin, xmin, ymax, xmax])
500 | 
501 | if (len(bb_lst) > 0):
502 | num_image_bbox += 1
503 | bbox_num += len(bb_lst)
504 | else:
505 | # Cropped images result from crop_fMoW.py,
506 | # and bounding boxes are standardized
507 | assert len(jsonData['bounding_boxes']) == 1
508 | 
509 | bb = jsonData['bounding_boxes'][0]
510 | assert bb['category'] == category
511 | 
512 | box = bb['box']
513 | assert ((box[0] == 0.) and (box[1] == 0.) and (box[2] == 1.0) and (box[3] == 1.0))
514 | 
515 | bb_lst.append(box)
516 | num_image_bbox += 1
517 | bbox_num += 1
518 | 
519 | bboxes.append(bb_lst)
520 | img_sizes.append([h, w])
521 | 
522 | print('Found %d images with %d bboxes out of %d images'
523 | %(num_image_bbox, bbox_num, len(filenames)))
524 | sys.stdout.flush()
525 | 
526 | return bboxes, img_sizes
527 | 
528 | def _process_dataset(name, directory, num_shards):
529 | """Process a complete data set and save it in TFRecords.
530 | Args:
531 | name: string; unique identifier specifying the data set.
532 | directory: string; path to data set.
533 | num_shards: int; number of shards.
534 | Returns:
535 | -
536 | """
537 | 
538 | filenames, labels, categories = _find_image_files(directory)
539 | bboxes, img_sizes = _find_image_bounding_boxes(filenames, categories)
540 | _process_image_files(name, filenames, labels, categories, bboxes, img_sizes, num_shards)
541 | 
542 | def main(argv=None):
543 | """Convert fMoW training, validation, and test images to TFRecords.
544 | Args:
545 | -
546 | Returns:
547 | -
548 | """
549 | 
550 | assert not FLAGS.train_shards % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
551 | assert not FLAGS.validation_shards % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.validation_shards')
552 | assert not FLAGS.test_shards % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.test_shards')
553 | 
554 | if (not os.path.isdir(FLAGS.output_directory)):
555 | os.makedirs(FLAGS.output_directory)
556 | print('Saving results to %s' % FLAGS.output_directory)
557 | sys.stdout.flush()
558 | 
559 | # Create TFRecords
560 | _process_dataset('validation', FLAGS.validation_directory, FLAGS.validation_shards)
561 | _process_dataset('test', FLAGS.test_directory, FLAGS.test_shards)
562 | _process_dataset('train', FLAGS.train_directory, FLAGS.train_shards)
563 | 
564 | if __name__ == '__main__':
565 | main()
--------------------------------------------------------------------------------