├── restore_dicts
│   ├── efficientnet-b0.p
│   ├── efficientnet-b1.p
│   ├── efficientnet-b2.p
│   ├── efficientnet-b3.p
│   ├── efficientnet-b4.p
│   ├── efficientnet-b5.p
│   ├── efficientnet-b6.p
│   ├── efficientnet-b7.p
│   ├── efficientnet-l2.p
│   └── efficientnet-l2-475.p
├── requirements.txt
├── ImageNet
│   ├── results_replication.txt
│   └── input_imagenet.py
├── fMoW
│   ├── match_test_gt.py
│   ├── results_replication.txt
│   ├── crop_fMoW.py
│   ├── input_fMoW.py
│   └── create_TFRecords_fMoW.py
├── CUB
│   ├── create_csv_cub.py
│   ├── create_tfrecords_cub.py
│   ├── results_replication.txt
│   └── input_cub.py
├── README.md
└── NABirds
    ├── results_replication.txt
    ├── input_nab.py
    └── create_tfrecords_nab.py
/restore_dicts/efficientnet-b0.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b0.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b1.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b1.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b2.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b2.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b3.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b3.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b4.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b4.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b5.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b5.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b6.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b6.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-b7.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-b7.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-l2.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-l2.p
--------------------------------------------------------------------------------
/restore_dicts/efficientnet-l2-475.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Tpap/TNet/HEAD/restore_dicts/efficientnet-l2-475.p
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow 2 |
tensorflow-addons 3 | tensorboard 4 | tensorboard-plugin-profile 5 | Pillow 6 | scikit-learn 7 | scipy 8 | opencv-python 9 | pandas 10 | -------------------------------------------------------------------------------- /ImageNet/results_replication.txt: -------------------------------------------------------------------------------- 1 | --- Training 2 | 3 | - TNet 4 | 5 | The following command can be used to replicate the training of the TNet model reported in the paper: 6 | 7 | python train.py --to_train --to_evaluate_train --to_evaluate_val --batch_size 64 --num_classes 1000 --num_epochs 200 --initial_lr 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --keep_prob 0.5 --loc_per_grid 3.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.34375 --img_size_y 224 --img_size_x 224 --base_res_y 77 --base_res_x 77 --num_samples 1 --ls_dim 512 --num_patches_y 5 --num_patches_x 5 --num_res_levels 2 --num_do_layers 1 --descr_tag 'BagNet_77_TNet' --save_tag 'TNet_ImageNet' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 8 | 9 | 10 | 11 | - Baseline 12 | 13 | The following command can be used to replicate the training of the BagNet-77 baseline model reported in the paper: 14 | 15 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_size 64 --num_classes 1000 --num_epochs 200 --initial_lr 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --keep_prob 0.375 --ls_dim 512 --num_do_layers 1 --img_size_y 224 --img_size_x 224 --descr_tag 'BagNet_77' --save_tag 'BagNet_77_ImageNet' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 16 | 17 | 18 | 19 | 20 | 21 | --- Evaluation 22 | 23 | - TNet 24 | 25 | The following command can be used to evaluate a trained TNet model on the validation set of ImageNet: 26 | 27 | python train.py --to_evaluate_val --batch_size 64 --num_classes 1000 --loc_per_grid 3.0 --overlap 0.34375 --img_size_y 224 --img_size_x 224 --ls_dim 512 --num_patches_y 5 --num_patches_x 5 --base_res_y 77 --base_res_x 77 --num_res_levels 2 --descr_tag 'BagNet_77_TNet' --save_tag 'TNet_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 28 | 29 | The following command can be used to time the inference of TNet: 30 | 31 | python train.py --profile_step 10. 
--batches_to_time_range 50 501 --eval_epochs_num 1 --to_evaluate_val --batch_size 64 --num_classes 1000 --loc_per_grid 3.0 --overlap 0.34375 --img_size_y 224 --img_size_x 224 --ls_dim 512 --num_patches_y 5 --num_patches_x 5 --base_res_y 77 --base_res_x 77 --num_res_levels 2 --descr_tag 'BagNet_77_TNet' --save_tag 'TNet_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 32 | 33 | The following command can be used for advanced evaluation of TNet: 34 | 35 | python train.py --adv_eval_data --batches_to_time_range 0 -1 --eval_epochs_num 1 --to_evaluate_val --batch_size 64 --num_classes 1000 --loc_per_grid 3.0 --overlap 0.34375 --img_size_y 224 --img_size_x 224 --ls_dim 512 --num_patches_y 5 --num_patches_x 5 --base_res_y 77 --base_res_x 77 --num_res_levels 2 --descr_tag 'BagNet_77_TNet' --save_tag 'TNet_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --labels_file '/path/to/imagenet_lsvrc_2015_synsets.txt' --imagenet_metadata_file '/path/to/imagenet_metadata.txt' 36 | 37 | Advanced evaluation corresponds to the creation of an Excel file with information about the attended locations and the attendance probabilities of all candidate locations. 38 | 39 | 40 | - Baseline 41 | 42 | The following command can be used to evaluate a trained BagNet-77 model on the validation set of ImageNet: 43 | 44 | python train_bl.py --to_evaluate_val --batch_size 64 --num_classes 1000 --ls_dim 512 --img_size_y 224 --img_size_x 224 --descr_tag 'BagNet_77' --save_tag 'BagNet_77_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 45 | 46 | The following command can be used to time the inference of BagNet-77: 47 | 48 | python train_bl.py --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 1 --to_evaluate_val --batch_size 64 --num_classes 1000 --ls_dim 512 --img_size_y 224 --img_size_x 224 --descr_tag 'BagNet_77' --save_tag 'BagNet_77_ImageNet' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 49 | 50 | 51 | -------------------------------------------------------------------------------- /fMoW/match_test_gt.py: -------------------------------------------------------------------------------- 1 | """Match ground truth information with test images. 2 | Raw fMoW data can be downloaded here https://github.com/fMoW/dataset. 3 | The current script utilizes the rgb version of fMoW, and not the full version. 4 | fMoW data are split into training, validation and test sets. After download, for 5 | the training and validation sets, jpeg and json files are expected to reside 6 | in the following directory structure: 7 | /train/airport/airport_0/airport_0_0_rgb.jpg 8 | /train/airport/airport_0/airport_0_0_rgb.json 9 | ... 10 | 11 | /val/airport/airport_0/airport_0_0_rgb.jpg 12 | /val/airport/airport_0/airport_0_0_rgb.json 13 | ... 14 | 15 | For the test set, jpeg and json files are expected to reside 16 | in the following directory structure: 17 | /test/0011978/0011978_0_rgb.jpg 18 | /test/0011978/0011978_0_rgb.json 19 | ...
20 | 21 | The test set directory structure doesn't reveal the labels of the images, because 22 | it was initially released in the context of an IARPA challenge (https://www.iarpa.gov/challenges/fmow.html). 23 | However, given that the challenge is over, test set annotations are available 24 | for download with the rest of the data here https://github.com/fMoW/dataset. 25 | The downloaded ground truth test data consist of json files that 26 | reside in the following directory structure: 27 | /test_gt/airport/airport_0/airport_0_0_rgb.json 28 | /test_gt/airport/airport_0/airport_0_1_rgb.json 29 | ... 30 | 31 | The additional test_gt_mapping.json file is provided to establish a correspondence 32 | between the annotations under folder test_gt and the images under folder test. 33 | The current script organizes jpeg and json files for the test set in the following 34 | directory structure: 35 | /test_matched_with_gt/airport/airport_0/airport_0_0_rgb.jpg 36 | /test_matched_with_gt/airport/airport_0/airport_0_0_rgb.json 37 | ... 38 | """ 39 | 40 | import argparse 41 | import os 42 | import json 43 | from tqdm import tqdm 44 | import errno 45 | import shutil 46 | 47 | 48 | 49 | parser = argparse.ArgumentParser() 50 | 51 | parser.add_argument('--root_test_dir', type=str, default='/fMoW-rgb/', help='Root directory of the original test data.') 52 | parser.add_argument('--test_output_dir', type=str, default='/test_matched_with_gt/', help='Directory to output the matched data.') 53 | parser.add_argument('--match_gt_json_path', type=str, default='/test_gt_mapping.json', help='Path to test_gt_mapping.json.') 54 | 55 | FLAGS = parser.parse_args() 56 | 57 | def try_mkdir(input_dir): 58 | """Try to make directory. 59 | Args: 60 | input_dir: string; directory to create. 61 | Returns: 62 | - 63 | """ 64 | 65 | if (not os.path.isdir(input_dir)): 66 | try: 67 | os.makedirs(input_dir) 68 | except OSError as e: 69 | if (e.errno == errno.EEXIST): 70 | pass 71 | 72 | def main(argv=None): 73 | """Match data with ground truth information, 74 | and save to new directory structure.
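Each entry of test_gt_mapping.json pairs a ground-truth directory with a test directory through its 'input' and 'output' keys; for illustration only, with hypothetical values, an entry presumably looks like {"input": "test_gt/airport/airport_0", "output": "test/0011978"}.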
75 | Args: 76 | - 77 | Returns: 78 | - 79 | """ 80 | 81 | # Load test_gt_mapping.json, and iterate over its entries 82 | jsonData = json.load(open(FLAGS.match_gt_json_path)) 83 | for entry in tqdm(jsonData): 84 | src_test_dir = os.path.join(FLAGS.root_test_dir, entry['output']) 85 | src_test_gt_dir = os.path.join(FLAGS.root_test_dir, entry['input']) 86 | save_dir_suffix = entry['input'].split('/', 1)[1] 87 | save_dir = os.path.join(FLAGS.test_output_dir, save_dir_suffix) 88 | try_mkdir(save_dir) 89 | 90 | f_name_prefix_test_gt = entry['input'].split('/')[-1] 91 | f_name_prefix_test = entry['output'].split('/')[-1] 92 | 93 | for _, _, files in os.walk(src_test_dir): 94 | for f_src in files: 95 | # Ignore msrgb images 96 | if f_src.endswith('_rgb.jpg'): 97 | f_src_test_img = f_src 98 | 99 | f_scr_test_gt_json = f_src.replace('.jpg', '.json') 100 | f_scr_test_gt_json = f_scr_test_gt_json.replace(f_name_prefix_test, f_name_prefix_test_gt) 101 | 102 | f_dst_json = f_scr_test_gt_json 103 | f_dst_img = f_src_test_img.replace(f_name_prefix_test, f_name_prefix_test_gt) 104 | 105 | jsonData_src_test_gt = json.load(open(os.path.join(src_test_gt_dir, f_scr_test_gt_json))) 106 | # Ignore bounding boxes with unknown ids 107 | if not isinstance(jsonData_src_test_gt['bounding_boxes'], list): 108 | jsonData_src_test_gt['bounding_boxes'] = [jsonData_src_test_gt['bounding_boxes']] 109 | bb_lst = [] 110 | for bb in jsonData_src_test_gt['bounding_boxes']: 111 | if (bb['ID'] != -1): 112 | bb_lst.append(bb) 113 | 114 | jsonData_dst = jsonData_src_test_gt 115 | jsonData_dst['bounding_boxes'] = bb_lst 116 | 117 | # Save updated json file 118 | json.dump(jsonData_dst, open(os.path.join(save_dir, f_dst_json), 'w')) 119 | # Copy test image under the new directory 120 | shutil.copy(os.path.join(src_test_dir, f_src_test_img), os.path.join(save_dir, f_dst_img)) 121 | 122 | if __name__ == '__main__': 123 | main() 124 | -------------------------------------------------------------------------------- /CUB/create_csv_cub.py: -------------------------------------------------------------------------------- 1 | """Create csv files for the training and validation splits of the Caltech-UCSD Birds-200-2011 dataset. 2 | Each entry in the csv files contains the path to an image, its numeric label, and its human-readable 3 | label. Raw data can be downloaded here http://www.vision.caltech.edu/visipedia/CUB-200-2011.html. 4 | """ 5 | 6 | from __future__ import absolute_import, division, print_function 7 | 8 | import os 9 | import random 10 | import argparse 11 | import numpy as np 12 | import tensorflow as tf 13 | from tqdm import tqdm 14 | import pandas as pd 15 | 16 | 17 | 18 | parser = argparse.ArgumentParser() 19 | 20 | # This file (images.txt) contains the list of image file names, with each line corresponding to one image. 21 | # The content of the file is expected to be as follows: 22 | # <image_id> <image_name> 23 | # 24 | # where image_id is a unique numeric identifier for each image in the dataset, and image_name is the path 25 | # to the corresponding image file. An example line is the following: 26 | # 16 001.Black_footed_Albatross/Black_Footed_Albatross_0016_796067.jpg 27 | parser.add_argument('--imgs_list_txt', type=str, default='/images.txt', help='File with list of image paths.') 28 | 29 | # This file (train_test_split.txt) contains the suggested training/validation split, with each line corresponding 30 | # to one image.
# The content of the file is expected to be as follows: 31 | # <image_id> <is_training_image> 32 | # 33 | # where image_id is a unique numeric identifier for each image in the dataset (same as in images.txt), and 34 | # is_training_image takes either value 1 or 0, denoting that the file is in the training or the validation 35 | # set, respectively. An example line is the following: 36 | # 16 0 37 | parser.add_argument('--split_list_txt', type=str, default='/train_test_split.txt', help='File with information about train/validation split of the data.') 38 | 39 | parser.add_argument('--save_dir', type=str, default='/CUB_200_2011/', help='Output data directory') 40 | 41 | FLAGS = parser.parse_args() 42 | 43 | def find_image_files(imgs_list_txt): 44 | """Build lists of all image file paths, numeric labels, and 45 | human-readable labels. 46 | Args: 47 | imgs_list_txt: string; path to file with list of image paths. 48 | Returns: 49 | filenames: list of strings; it contains paths to image files. 50 | labels_values: list of ints; it contains numeric labels. 51 | labels_names: list of strings; it contains human-readable labels. 52 | """ 53 | 54 | lines = tf.io.gfile.GFile(imgs_list_txt, 'r').readlines() 55 | 56 | filenames = [] 57 | labels_values = [] 58 | labels_names = [] 59 | # Iterate over file lines 60 | for l in lines: 61 | if l: 62 | parts = l.strip().split(' ') 63 | assert len(parts) == 2 64 | filenames.append('/' + parts[1]) 65 | 66 | p = parts[1].split('.', 1) 67 | labels_values.append(int(p[0])) 68 | labels_names.append(p[1].split('/', 1)[0]) 69 | 70 | print('Found %d JPEG files across %d labels.' %(len(filenames), len(set(labels_names)))) 71 | 72 | return filenames, labels_values, labels_names 73 | 74 | def split_data(split_list_txt, filenames, labels_values, labels_names): 75 | """Create entries for csv files about the training and validation 76 | splits of the Caltech-UCSD Birds-200-2011 dataset. Each entry 77 | includes the path to an image, its numeric label, and its 78 | human-readable label. 79 | Args: 80 | split_list_txt: string; path to file with information about 81 | train/validation split of the data. 82 | filenames: list of strings; it contains paths to image files. 83 | labels_values: list of ints; it contains numeric labels. 84 | labels_names: list of strings; it contains human-readable labels. 85 | Returns: 86 | train_csv_entries: np array; it contains paths to the image files 87 | of the training split. It also contains the numeric label and 88 | the human-readable label of each image. It is of size 89 | [num_imgs_train, 3], where num_imgs_train is the number of 90 | images in the training split. 91 | validation_csv_entries: np array; it contains paths to the image 92 | files of the validation split. It also contains the numeric 93 | label and the human-readable label of each image. It is of 94 | size [num_imgs_val, 3], where num_imgs_val is the number of 95 | images in the validation split.
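Note: as a worked example of the encoding described above, split_val = ['1', '0', '1'] sends entries 0 and 2 to the training split and entry 1 to the validation split.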
96 | """ 97 | 98 | lines = tf.io.gfile.GFile(split_list_txt, 'r').readlines() 99 | 100 | split_val = [] 101 | for l in lines: 102 | if l: 103 | split_val.append(l.strip().split(' ')[1]) 104 | 105 | # Shuffle the ordering of image files to guarantee 106 | # random ordering of the images with respect to labels 107 | shuffled_index = list(range(len(filenames))) 108 | random.seed(12345) 109 | random.shuffle(shuffled_index) 110 | filenames = [filenames[i] for i in shuffled_index] 111 | labels_values = [labels_values[i] for i in shuffled_index] 112 | labels_names = [labels_names[i] for i in shuffled_index] 113 | split_val = [split_val[i] for i in shuffled_index] 114 | 115 | df_array = np.concatenate((np.expand_dims(np.asarray(filenames), 1), np.expand_dims(np.asarray(labels_values), 1), 116 | np.expand_dims(np.asarray(labels_names), 1)), axis=1) 117 | mask = np.asarray(split_val).astype(int).astype(bool)  # cast the '1'/'0' strings through int; casting non-empty strings directly to bool would mark every image as training 118 | inv_mask = (1 - mask).astype(bool) 119 | 120 | # Create entries for csv files about the training split 121 | train_csv_entries = df_array[mask, :] 122 | # Create entries for csv files about the validation split 123 | validation_csv_entries = df_array[inv_mask, :] 124 | 125 | print('Added %d entries to train split, and %d entries to validation split.' %(train_csv_entries.shape[0], validation_csv_entries.shape[0])) 126 | 127 | return train_csv_entries, validation_csv_entries 128 | 129 | def save_to_csv(csv_entries, save_dir, tag): 130 | """Save csv entries. 131 | Args: 132 | csv_entries: np array; it contains paths to image files with 133 | their numeric labels and human-readable labels. It is of size 134 | [num_imgs, 3], where num_imgs is the number of image files. 135 | save_dir: string; directory to save the csv file. 136 | tag: string; name of the csv file to save. 137 | Returns: 138 | - 139 | """ 140 | 141 | cols = ['fname', 'class_number', 'class_name'] 142 | df = pd.DataFrame(csv_entries, columns=cols) 143 | fp = os.path.join(save_dir, tag + '.csv') 144 | 145 | if (not os.path.isdir(FLAGS.save_dir)): 146 | os.makedirs(FLAGS.save_dir) 147 | df.to_csv(fp, encoding='utf-8', index=False) 148 | print('CSV saved at %s.' %fp) 149 | 150 | def main(argv=None): 151 | """Create csv files for the training and validation splits 152 | of the Caltech-UCSD Birds-200-2011 dataset. 153 | Args: 154 | - 155 | Returns: 156 | - 157 | """ 158 | 159 | # Build lists with image file paths, numeric labels, and human-readable labels 160 | filenames, labels_values, labels_names = find_image_files(FLAGS.imgs_list_txt) 161 | 162 | # Create csv entries for training and validation splits 163 | train_csv_entries, validation_csv_entries = split_data(FLAGS.split_list_txt, filenames, labels_values, labels_names) 164 | 165 | # Save csv files for training and validation splits 166 | save_to_csv(train_csv_entries, FLAGS.save_dir, 'train_anno') 167 | save_to_csv(validation_csv_entries, FLAGS.save_dir, 'validation_anno') 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /fMoW/results_replication.txt: -------------------------------------------------------------------------------- 1 | --- Training 2 | 3 | - TNet 4 | 5 | The TNet model reported in the paper is trained in two steps: first, TNet is trained on images of size 448x448 px, and then it is fine-tuned on images of size 896x896 px.
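The second step restores the checkpoint produced by the first step (via the --restore_dir flag in the command below), and extends --loc_per_grid to two values, one for each grid from which locations can be attended.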
The first step can be replicated with the following command: 6 | 7 | python train.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_classes 62 --num_epochs 40 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.5 --block_drop_rate 0.3 --loc_per_grid 2.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.2 --perFReg_reinf_weight 0.2 --overlap 0.5 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 3 --num_patches_x 3 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNet_lg3x3' --save_tag 'TNet_fMoW_448' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 8 | 9 | The second step can be replicated with the following command: 10 | 11 | python train.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_classes 62 --num_epochs 10 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.5 --block_drop_rate 0.5 --loc_per_grid 2.0 1.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.05 --perFReg_reinf_weight 0.05 --overlap 0.5 --img_size_y 896 --img_size_x 896 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 3 --num_patches_x 3 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 3 --perFReg_cap 2 --descr_tag 'EfficientNet_lg3x3' --save_tag 'TNet_fMoW' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/dir/with/ckpt/to/restore/' 12 | 13 | 14 | 15 | - Baselines 16 | 17 | The following command can be used to replicate the training of the EfficientNet-B0 model on the cropped images: 18 | 19 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 65 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --l2_reg 0.00001 --dropout_rate 0.75 --block_drop_rate 0.5 --img_size_y 224 --img_size_x 224 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet' --save_tag 'bl_fMoW_cropped' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 20 | 21 | The following command can be used to replicate the training of the EfficientNet-B0 model on images of size 224x224 px: 22 | 23 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 60 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --l2_reg 0.00001 --dropout_rate 0.75 --block_drop_rate 0.5 --img_size_y 224 --img_size_x 224 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet' --save_tag 'bl_fMoW_224' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 24 | 25 | Since this model is trained on images of relatively small size, for purposes of training efficiency, we used TFRecords created with the following command (for the other models we use TFRecords which are created as 
described in the README.md of our repository): 26 | 27 | python create_TFRecords_fMoW.py --train_directory '/path/to/training/set/dir/' --validation_directory '/path/to/validation/set/dir/' --test_directory '/path/to/test/set/dir/' --output_directory '/path/to/output/dir/' --maximum_min_dim 275 28 | 29 | The following command can be used to replicate the training of the EfficientNet-B0 model on images of size 448x448 px: 30 | 31 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 32 --num_epochs 30 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --l2_reg 0.00001 --dropout_rate 0.75 --block_drop_rate 0.3 --img_size_y 448 --img_size_x 448 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet' --save_tag 'bl_fMoW_448' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 32 | 33 | The following command can be used to replicate the training of the EfficientNet-B0 model on images of size 896x896 px: 34 | 35 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 32 --num_epochs 30 --initial_lr 0.001 --lr_scedule_1step --lr_decay_factor 0.1 --l2_reg 0.00001 --dropout_rate 0.3 --block_drop_rate 0.2 --img_size_y 896 --img_size_x 896 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet' --save_tag 'bl_fMoW_896' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary 36 | 37 | 38 | 39 | 40 | 41 | --- Evaluation 42 | 43 | - TNet 44 | 45 | The following command can be used to evaluate a trained TNet model on the test set of fMoW, with 2 processing levels (images of size 448x448 px) and 2 attended locations: 46 | 47 | python train.py --to_evaluate_test --batch_norm --batch_size 64 --num_classes 62 --loc_per_grid 2.0 --overlap 0.5 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 3 --num_patches_x 3 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNet_lg3x3' --save_tag 'TNet_fMoW_448' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 48 | 49 | The following command can be used to evaluate a trained TNet model on the test set of fMoW, with 3 processing levels (images of size 896x896 px) and 4 attended locations: 50 | 51 | python train.py --to_evaluate_test --batch_norm --batch_size 64 --num_classes 62 --loc_per_grid 2.0 1.0 --overlap 0.5 --img_size_y 896 --img_size_x 896 --pos_dim_divisor 4 --num_patches_y 3 --num_patches_x 3 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 3 --descr_tag 'EfficientNet_lg3x3' --save_tag 'TNet_fMoW' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 52 | 53 | The following flags can be added to the previous evaluation commands in order to time the inference of TNet: 54 | 55 | --profile_step 10. 
--batches_to_time_range 50 501 --eval_epochs_num 1 56 | 57 | The following flags can be added to the previous evaluation commands for advanced evaluation of TNet: 58 | 59 | --adv_eval_data --batches_to_time_range 0 -1 --eval_epochs_num 1 60 | 61 | Advanced evaluation corresponds to the creation of an Excel file with information about the attended locations and the attendance probabilities of all candidate locations. 62 | 63 | 64 | 65 | - Baselines 66 | 67 | The following command can be used to evaluate, on the test set of fMoW, an EfficientNet-B0 model trained on cropped images of size 224x224 px: 68 | 69 | python train_bl.py --to_evaluate_test --batch_norm --batch_size 64 --img_size_y 224 --img_size_x 224 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet0' --save_tag 'bl_fMoW_cropped' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 70 | 71 | The following command can be used to evaluate, on the test set of fMoW, an EfficientNet-B0 model trained on images of size 224x224 px: 72 | 73 | python train_bl.py --to_evaluate_test --batch_norm --batch_size 64 --img_size_y 224 --img_size_x 224 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet0' --save_tag 'bl_fMoW_224' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 74 | 75 | The following command can be used to evaluate, on the test set of fMoW, an EfficientNet-B0 model trained on images of size 448x448 px: 76 | 77 | python train_bl.py --to_evaluate_test --batch_norm --batch_size 64 --img_size_y 448 --img_size_x 448 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet0' --save_tag 'bl_fMoW_448' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 78 | 79 | The following command can be used to evaluate, on the test set of fMoW, an EfficientNet-B0 model trained on images of size 896x896 px: 80 | 81 | python train_bl.py --to_evaluate_test --batch_norm --batch_size 64 --img_size_y 896 --img_size_x 896 --width_coefficient 1.0 --depth_coefficient 1.0 --activation 'swish' --descr_tag 'EfficientNet0' --save_tag 'bl_fMoW_896' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 82 | 83 | The following flags can be added to the previous evaluation commands in order to time the inference of the baselines: 84 | 85 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 1 86 | 87 | 88 | -------------------------------------------------------------------------------- /fMoW/crop_fMoW.py: -------------------------------------------------------------------------------- 1 | """Crop fMoW images based on bounding box annotations.
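A context margin is added around every annotated box before cropping, following the _process_file strategy of the official fMoW baseline (see the link in _process_image_files_batch). As a worked example of the multipliers used there: a box narrower than 10% of the image width gets contextMultWidth = 2, i.e. a buffer of int((box_width * 2) / 2.0) = box_width pixels on each side, so the crop is roughly three times as wide as the original box.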
2 | """ 3 | 4 | from __future__ import absolute_import, division, print_function 5 | 6 | import argparse 7 | from datetime import datetime 8 | import os 9 | import random 10 | import sys 11 | import threading 12 | import json 13 | from multiprocessing import cpu_count 14 | import cv2 15 | import copy 16 | 17 | import numpy as np 18 | import six 19 | import tensorflow as tf 20 | 21 | 22 | 23 | parser = argparse.ArgumentParser() 24 | 25 | parser.add_argument('--train_directory', type=str, default='/train/', help='Training data directory.') 26 | parser.add_argument('--validation_directory', type=str, default='/val/', help='Validation data directory.') 27 | parser.add_argument('--test_directory', type=str, default='/test_matched_with_gt/', help='Test data directory.') 28 | parser.add_argument('--output_directory', type=str, default='/data_cropped/', help='Output data directory.') 29 | parser.add_argument('--num_threads', type=int, default=16, help='Number of threads to parallelize processing.') 30 | 31 | FLAGS = parser.parse_args() 32 | 33 | category_names = ['airport', 'airport_hangar', 'airport_terminal', 'amusement_park', 'aquaculture', 'archaeological_site', 'barn', 'border_checkpoint', 'burial_site', 'car_dealership', 'construction_site', 34 | 'crop_field', 'dam', 'debris_or_rubble', 'educational_institution', 'electric_substation', 'factory_or_powerplant', 'fire_station', 'flooded_road', 'fountain', 'gas_station', 'golf_course', 35 | 'ground_transportation_station', 'helipad', 'hospital', 'interchange', 'lake_or_pond', 'lighthouse', 'military_facility', 'multi-unit_residential', 'nuclear_powerplant', 'office_building', 36 | 'oil_or_gas_facility', 'park', 'parking_lot_or_garage', 'place_of_worship', 'police_station', 'port', 'prison', 'race_track', 'railway_bridge', 'recreational_facility', 'impoverished_settlement', 37 | 'road_bridge', 'runway', 'shipyard', 'shopping_mall', 'single-unit_residential', 'smokestack', 'solar_farm', 'space_facility', 'stadium', 'storage_tank','surface_mine', 'swimming_pool', 38 | 'toll_booth', 'tower', 'tunnel_opening', 'waste_disposal', 'water_treatment_facility', 'wind_farm', 'zoo'] 39 | 40 | def _process_image_files_batch(thread_index, ranges, file_paths, categories, outDir): 41 | """Execute 1 thread that processes images and saves crops according 42 | to bounding box annotations. 43 | Args: 44 | thread_index: int; unique thread identifier. 45 | ranges: list of ints; it contains the range of images to process. 46 | file_paths: list of strings; it contains paths to image files. 47 | categories: list of strings; it contains human-readable labels. 48 | outDir: string; directory to save output data. 49 | Returns: 50 | - 51 | """ 52 | 53 | # Process each file 54 | files_in_thread = np.arange(ranges[thread_index][0], ranges[thread_index][1], dtype=int) 55 | img_num = 0 56 | bbox_num = 0 57 | for i in files_in_thread: 58 | img_num += 1 59 | f_src_img = file_paths[i] 60 | f_src_json = f_src_img.replace('.jpg', '.json') 61 | 62 | # Load image 63 | img = cv2.imread(f_src_img).astype(np.float32) 64 | 65 | # Load json file with image information 66 | jsonData = json.load(open(f_src_json)) 67 | if not isinstance(jsonData['bounding_boxes'], list): 68 | jsonData['bounding_boxes'] = [jsonData['bounding_boxes']] 69 | 70 | label = categories[i] 71 | for bb in jsonData['bounding_boxes']: 72 | category = bb['category'] 73 | if ((category != label) or (bb['ID'] == -1)): 74 | continue 75 | bbox_num += 1 76 | # Each bounding box is a list of 4 ints. 
The first two entries (box[0] and box[1]) 77 | # are the coordinates in pixels of the top left corner of the box (first the horizontal 78 | # and then the vertical coordinate), and the last two entries (box[2] and box[3]) 79 | # are the width and the height of the box 80 | box = bb['box'] 81 | 82 | # Ignore tiny boxes 83 | if box[2] <= 2 or box[3] <= 2: 84 | continue 85 | 86 | # Add margin around a bounding box for more contextual information. 87 | # The followed strategy is based on _process_file function from 88 | # https://github.com/fMoW/baseline/blob/master/code/data_ml_functions/dataFunctions.py 89 | contextMultWidth = 0.15 90 | contextMultHeight = 0.15 91 | 92 | wRatio = float(box[2]) / img.shape[1] 93 | hRatio = float(box[3]) / img.shape[0] 94 | 95 | if ((wRatio < 0.5) and (wRatio >= 0.4)): 96 | contextMultWidth = 0.2 97 | if ((wRatio < 0.4) and (wRatio >= 0.3)): 98 | contextMultWidth = 0.3 99 | if ((wRatio < 0.3) and (wRatio >= 0.2)): 100 | contextMultWidth = 0.5 101 | if ((wRatio < 0.2) and (wRatio >= 0.1)): 102 | contextMultWidth = 1 103 | if (wRatio < 0.1): 104 | contextMultWidth = 2 105 | 106 | if ((hRatio < 0.5) and (hRatio >= 0.4)): 107 | contextMultHeight = 0.2 108 | if ((hRatio < 0.4) and (hRatio >= 0.3)): 109 | contextMultHeight = 0.3 110 | if ((hRatio < 0.3) and (hRatio >= 0.2)): 111 | contextMultHeight = 0.5 112 | if ((hRatio < 0.2) and (hRatio >= 0.1)): 113 | contextMultHeight = 1 114 | if (hRatio < 0.1): 115 | contextMultHeight = 2 116 | 117 | widthBuffer = int((box[2] * contextMultWidth) / 2.0) 118 | heightBuffer = int((box[3] * contextMultHeight) / 2.0) 119 | 120 | r1 = box[1] - heightBuffer 121 | r2 = box[1] + box[3] + heightBuffer 122 | c1 = box[0] - widthBuffer 123 | c2 = box[0] + box[2] + widthBuffer 124 | 125 | if (r1 < 0): 126 | r1 = 0 127 | if (r2 > img.shape[0]): 128 | r2 = img.shape[0] 129 | if (c1 < 0): 130 | c1 = 0 131 | if (c2 > img.shape[1]): 132 | c2 = img.shape[1] 133 | 134 | if ((r1 >= r2) or (c1 >= c2)): 135 | continue 136 | 137 | subImg = img[r1:r2, c1:c2, :] 138 | 139 | jsonData_dst = copy.deepcopy(jsonData) 140 | bb['box'] = [0., 0., 1.0, 1.0] 141 | jsonData_dst['bounding_boxes'] = [bb] 142 | jsonData_dst['img_height'] = r2 - r1  # record the crop size in the json that is saved alongside the crop 143 | jsonData_dst['img_width'] = c2 - c1 144 | 145 | # Determine output directory and save files 146 | slashes = [k for k, ltr in enumerate(f_src_img) if ltr == '/'] 147 | outBaseName = '%s_%s' %(category, bb['ID']) 148 | currOut = os.path.join(outDir, f_src_img[(slashes[-3] + 1):slashes[-1]], outBaseName) 149 | 150 | if (not os.path.isdir(currOut)): 151 | try: 152 | os.makedirs(currOut) 153 | except OSError: 154 | print("Directory already created.") 155 | 156 | f_name = os.path.basename(f_src_img) 157 | f_dst_img = os.path.join(currOut, f_name) 158 | f_dst_json = f_dst_img.replace('.jpg', '.json') 159 | 160 | cv2.imwrite(f_dst_img, subImg) 161 | json.dump(jsonData_dst, open(f_dst_json, 'w')) 162 | 163 | print('%s [thread %d]: Wrote %d images with %d bboxes.' %(datetime.now(), thread_index, img_num, bbox_num)) 164 | sys.stdout.flush() 165 | 166 | def _process_image_files(file_paths, categories, outDir): 167 | """Process images and save crops according to bounding box annotations. 168 | Args: 169 | file_paths: list of strings; it contains paths to image files. 170 | categories: list of strings; it contains human-readable labels. 171 | outDir: string; directory to save output data.
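Note: the file list is split with np.linspace into num_threads contiguous index ranges; for example, 100 images and 4 threads yield the per-thread ranges [0, 25), [25, 50), [50, 75), [75, 100).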
172 | Returns: 173 | - 174 | """ 175 | 176 | # Break images into batches 177 | num_threads = FLAGS.num_threads 178 | spacing = np.linspace(0, len(file_paths), num_threads + 1).astype(int) 179 | ranges = [] 180 | for i in range(len(spacing) - 1): 181 | ranges.append([spacing[i], spacing[i + 1]]) 182 | 183 | # Launch a thread for each batch 184 | print('Launching %d threads for spacings: %s' % (num_threads, ranges)) 185 | sys.stdout.flush() 186 | 187 | # Create a mechanism for monitoring when all threads are finished. 188 | coord = tf.train.Coordinator() 189 | 190 | # Run threads 191 | threads = [] 192 | for thread_index in range(len(ranges)): 193 | args = (thread_index, ranges, file_paths, categories, outDir) 194 | t = threading.Thread(target=_process_image_files_batch, args=args) 195 | t.start() 196 | threads.append(t) 197 | 198 | # Wait for all the threads to terminate. 199 | coord.join(threads) 200 | print('%s: Finished writing all %d images in data set.' %(datetime.now(), len(file_paths))) 201 | sys.stdout.flush() 202 | 203 | def _find_image_files(data_dir): 204 | """Build lists of all image file paths and human-readable labels in 205 | a data set. 206 | Args: 207 | data_dir: string; path to data set. 208 | Returns: 209 | file_paths: list of strings; it contains paths to image files. 210 | categories: list of strings; it contains human-readable labels. 211 | """ 212 | 213 | # Construct the lists of image files and categories 214 | print('Determining list of input files and categories from %s.' % data_dir) 215 | file_paths = [] 216 | categories = [] 217 | label_index = 1 218 | for category in category_names: 219 | jpeg_file_path = os.path.join(data_dir, category, '*', category + '_*_rgb.jpg') 220 | matching_files = tf.io.gfile.glob(jpeg_file_path) 221 | 222 | file_paths.extend(matching_files) 223 | categories.extend([category] * len(matching_files)) 224 | 225 | if (not (label_index % 10)): 226 | print('Finished finding files in %d of %d classes.' %(label_index, len(category_names))) 227 | label_index += 1 228 | 229 | # Shuffle images to distribute large images to different threads 230 | # and avoid bottlenecks, since image size seems to be class specific 231 | shuffled_index = list(range(len(file_paths))) 232 | random.seed(12345) 233 | random.shuffle(shuffled_index) 234 | 235 | file_paths = [file_paths[i] for i in shuffled_index] 236 | categories = [categories[i] for i in shuffled_index] 237 | 238 | print('Found %d .jpg files across %d labels inside %s.' %(len(file_paths), len(category_names), data_dir)) 239 | 240 | return file_paths, categories 241 | 242 | def _process_dataset(directory, outDir): 243 | """Process a complete data set (training, validation or test). 244 | Args: 245 | directory: string; path to data set. 246 | outDir: string; directory to save output data. 247 | Returns: 248 | - 249 | """ 250 | 251 | file_paths, categories = _find_image_files(directory) 252 | _process_image_files(file_paths, categories, outDir) 253 | 254 | def main(argv=None): 255 | """Crop fMoW training, validation and testing images 256 | based on bounding box annotations.
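Crops are written under FLAGS.output_directory in 'val', 'test' and 'train' subdirectories, with one folder per annotated object named '<category>_<ID>'.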
257 | Args: 258 | - 259 | Returns: 260 | - 261 | """ 262 | 263 | _process_dataset(FLAGS.validation_directory, os.path.join(FLAGS.output_directory, 'val')) 264 | _process_dataset(FLAGS.test_directory, os.path.join(FLAGS.output_directory, 'test')) 265 | _process_dataset(FLAGS.train_directory, os.path.join(FLAGS.output_directory, 'train')) 266 | 267 | if __name__ == '__main__': 268 | main() -------------------------------------------------------------------------------- /CUB/create_tfrecords_cub.py: -------------------------------------------------------------------------------- 1 | """Convert Caltech-UCSD Birds-200-2011 images to TFRecords. Information about the training and 2 | validation splits of the data reside in csv files, which are created by using create_csv_cub.py. 3 | Raw data can be downloaded here http://www.vision.caltech.edu/visipedia/CUB-200-2011.html, and 4 | are assumed to reside in the following directory structure: 5 | images/001.Black_footed_Albatross/Black_Footed_Albatross_0001_796111.jpg 6 | images/002.Laysan_Albatross/Laysan_Albatross_0001_545.jpg 7 | ... 8 | """ 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import argparse 13 | from datetime import datetime 14 | import os 15 | import random 16 | import sys 17 | import threading 18 | import scipy.io 19 | import pandas as pd 20 | import six 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | 26 | 27 | parser = argparse.ArgumentParser() 28 | 29 | parser.add_argument('--img_dir', type=str, default='/CUB_200_2011/images/', help='Directory with raw image data.') 30 | parser.add_argument('--train_csv_path', type=str, default='/CUB_200_2011/train_anno.csv', help='Path to csv file with information about the images in the training split.') 31 | parser.add_argument('--dev_csv_path', type=str, default='/CUB_200_2011/validation_anno.csv', help='Path to csv file with information about the images in the validation split.') 32 | parser.add_argument('--output_dir', type=str, default='/TFRecords/', help='Output data directory.') 33 | 34 | parser.add_argument('--train_shards_num', type=int, default=16, help='Number of shards in training TFRecord files.') 35 | parser.add_argument('--dev_shards_num', type=int, default=16, help='Number of shards in validation TFRecord files.') 36 | parser.add_argument('--num_threads', type=int, default=16, help='Number of threads to parallelize processing.') 37 | 38 | FLAGS = parser.parse_args() 39 | 40 | def _int64_feature(value): 41 | """Insert int features into Example proto. 42 | Args: 43 | value: int or list of ints; features to insert 44 | in Example proto. 45 | Returns: 46 | feature: example proto; it contains a list of ints. 47 | """ 48 | 49 | if ((not isinstance(value, list)) and (not isinstance(value, np.ndarray))): 50 | value = [value] 51 | 52 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=value)) 53 | 54 | return feature 55 | 56 | def _float_feature(value): 57 | """Insert float features into Example proto. 58 | Args: 59 | value: float or list of floats; features to insert 60 | in Example proto. 61 | Returns: 62 | feature: example proto; it contains a list of floats. 63 | """ 64 | 65 | if ((not isinstance(value, list)) and (not isinstance(value, np.ndarray))): 66 | value = [value] 67 | 68 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=value)) 69 | 70 | return feature 71 | 72 | def _bytes_feature(value): 73 | """Insert byte features into Example proto. 
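Eager tensors are unwrapped via .numpy(), and Python 3 text strings are UTF-8 encoded before being wrapped in a BytesList.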
74 | Args: 75 | value: string or list of strings; features to 76 | insert in Example proto. 77 | Returns: 78 | feature: example proto; it contains a byte list. 79 | """ 80 | 81 | if (isinstance(value, type(tf.constant(0)))): 82 | value = value.numpy() 83 | if (six.PY3 and isinstance(value, six.text_type)): 84 | value = six.binary_type(value, encoding='utf-8') 85 | 86 | feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 87 | 88 | return feature 89 | 90 | def _convert_to_example(filename, image_buffer, label_value, label_name, height, width): 91 | """Build an Example proto for an image. 92 | Args: 93 | filename: string; path to image file. 94 | image_buffer: string; JPEG encoded image. 95 | label_value: int; numeric ground truth label. 96 | label_name: string; human-readable label. 97 | height: int; image height in pixels. 98 | width: int; image width in pixels. 99 | Returns: 100 | example: example proto; it contains the following fields: 101 | image/height: int; image height in pixels. 102 | image/width: int; image width in pixels. 103 | image/colorspace: string; colorspace, always 'RGB'. 104 | image/channels: int; number of channels, always 3. 105 | image/class/label: int; index of a classification label in range [1, 200]. 106 | image/class/text: string; human-readable label. 107 | image/format: string; image format, always 'JPEG'. 108 | image/filename: string; image file basename. 109 | image/encoded: string; JPEG encoded image. 110 | """ 111 | 112 | colorspace = 'RGB' 113 | channels = 3 114 | image_format = 'JPEG' 115 | 116 | example = tf.train.Example(features=tf.train.Features(feature={ 117 | 'image/height': _int64_feature(height), 118 | 'image/width': _int64_feature(width), 119 | 'image/colorspace': _bytes_feature(colorspace), 120 | 'image/channels': _int64_feature(channels), 121 | 'image/class/label': _int64_feature(label_value), 122 | 'image/class/text': _bytes_feature(label_name), 123 | 'image/format': _bytes_feature(image_format), 124 | 'image/filename': _bytes_feature(os.path.basename(filename)), 125 | 'image/encoded': _bytes_feature(image_buffer) 126 | })) 127 | 128 | return example 129 | 130 | def _process_image(filename): 131 | """Process a single image file. 132 | Args: 133 | filename: string; path to an image file. 134 | Returns: 135 | image_buffer: string; JPEG encoded image. 136 | height: int; image height in pixels. 137 | width: int; image width in pixels. 138 | """ 139 | 140 | # Read image file 141 | image_data = tf.io.read_file(filename) 142 | 143 | # Decode image 144 | try: 145 | image = tf.io.decode_jpeg(image_data, channels=3) 146 | except Exception: 147 | print("Oops! %s." %filename) 148 | raise  # re-raise instead of returning a sentinel, since the caller unpacks three values 149 | 150 | # Assert that the image has the appropriate dimensions 151 | assert len(image.shape) == 3 152 | height = image.shape[0] 153 | width = image.shape[1] 154 | assert image.shape[2] == 3 155 | 156 | return image_data, height, width 157 | 158 | def _process_image_files_batch(thread_index, ranges, name, filenames, 159 | labels_values, labels_names, num_shards): 160 | """Execute 1 thread that processes images and saves them as TFRecords 161 | of Example protos. 162 | Args: 163 | thread_index: int; unique thread identifier. 164 | ranges: list of ints; it contains the range of images to 165 | process. 166 | name: string; unique identifier specifying the data set. 167 | filenames: list of strings; it contains paths to image files. 168 | labels_values: list of ints; it contains numeric labels.
169 | labels_names: list of strings; it contains human-readable labels. 170 | num_shards: int; number of shards. 171 | Returns: 172 | - 173 | """ 174 | 175 | # Each thread produces N shards where N = int(num_shards / num_threads). 176 | # For instance, if num_shards = 128, and the num_threads = 2, then the first 177 | # thread would produce shards [0, 64) 178 | num_threads = len(ranges) 179 | assert not num_shards % num_threads 180 | num_shards_per_batch = int(num_shards / num_threads) 181 | 182 | shard_ranges = np.linspace(ranges[thread_index][0], 183 | ranges[thread_index][1], 184 | num_shards_per_batch + 1).astype(int) 185 | num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0] 186 | 187 | # Generate each shard 188 | counter = 0 189 | for s in range(num_shards_per_batch): 190 | shard = thread_index * num_shards_per_batch + s 191 | output_filename = '%s-%.4d-of-%.4d' % (name, (shard+1), num_shards) 192 | output_file = os.path.join(FLAGS.output_dir, output_filename) 193 | writer = tf.io.TFRecordWriter(output_file) 194 | 195 | # Process each file for a shard 196 | shard_counter = 0 197 | files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) 198 | for i in files_in_shard: 199 | filename = filenames[i] 200 | label_value = labels_values[i] 201 | label_name = labels_names[i] 202 | 203 | # Process an image 204 | image_buffer, height, width = _process_image(filename) 205 | 206 | # Create an Example proto 207 | example = _convert_to_example(filename, image_buffer, label_value, 208 | label_name, height, width) 209 | 210 | # Write to TFRecord 211 | writer.write(example.SerializeToString()) 212 | shard_counter += 1 213 | counter += 1 214 | 215 | if (not (counter % 1000)): 216 | print('%s [thread %d]: Processed %d of %d images in thread batch.' % 217 | (datetime.now(), thread_index, counter, num_files_in_thread)) 218 | sys.stdout.flush() 219 | 220 | writer.close() 221 | print('%s [thread %d]: Wrote %d images to %s' % 222 | (datetime.now(), thread_index, shard_counter, output_file)) 223 | sys.stdout.flush() 224 | shard_counter = 0 225 | print('%s [thread %d]: Wrote %d images to %d shards.' 226 | %(datetime.now(), thread_index, counter, num_files_in_thread)) 227 | sys.stdout.flush() 228 | 229 | def _process_image_files(name, filenames, labels_values, labels_names, num_shards): 230 | """Process images and save them as TFRecords of Example protos. 231 | Args: 232 | name: string; unique identifier specifying the data set. 233 | filenames: list of strings; it contains paths to image files. 234 | labels_values: list of ints; it contains numeric labels. 235 | labels_names: list of strings; it contains human-readable labels. 236 | num_shards: int; number of shards. 
237 | Returns: 238 | - 239 | """ 240 | 241 | assert len(filenames) == len(labels_values) == len(labels_names) 242 | 243 | # Break images into batches 244 | spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(int) 245 | ranges = [] 246 | for i in range(len(spacing) - 1): 247 | ranges.append([spacing[i], spacing[i + 1]]) 248 | 249 | # Launch a thread for each batch 250 | print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges)) 251 | sys.stdout.flush() 252 | 253 | # Create a mechanism for monitoring threads' execution 254 | coord = tf.train.Coordinator() 255 | 256 | # Run threads 257 | threads = [] 258 | for thread_index in range(len(ranges)): 259 | args = (thread_index, ranges, name, filenames, 260 | labels_values, labels_names, num_shards) 261 | t = threading.Thread(target=_process_image_files_batch, args=args) 262 | t.start() 263 | threads.append(t) 264 | 265 | # Wait for all the threads to terminate 266 | coord.join(threads) 267 | print('%s: Finished writing all %d images in data set.' % 268 | (datetime.now(), len(filenames))) 269 | sys.stdout.flush() 270 | 271 | def _find_image_files(name, data_dir, csv_file): 272 | """Build lists of image file paths, numeric labels, and 273 | human-readable labels. 274 | Args: 275 | name: string; unique identifier specifying the data set. 276 | data_dir: string; path to data set. 277 | csv_file: string; path to csv file with information about 278 | the data. 279 | Returns: 280 | filenames: list of strings; it contains paths to image files. 281 | labels_values: list of ints; it contains numeric labels. 282 | labels_names: list of strings; it contains human-readable labels. 283 | """ 284 | 285 | df = pd.read_csv(csv_file) 286 | filenames = df['fname'].tolist() 287 | filenames = [os.path.join(data_dir, f.lstrip('/')) for f in filenames] 288 | 289 | labels_values = df.to_numpy()[:, 1].astype(int).tolist() 290 | labels_names = df.to_numpy()[:, 2].tolist() 291 | 292 | print('Found %d JPEG files across %d labels inside %s.' % 293 | (len(filenames), len(np.unique(labels_values)), data_dir)) 294 | 295 | return filenames, labels_values, labels_names 296 | 297 | def _process_dataset(name, directory, num_shards, csv_file): 298 | """Process a complete data set and save it in TFRecords. 299 | Args: 300 | name: string; unique identifier specifying the data set. 301 | directory: string; path to data set. 302 | num_shards: int; number of shards. 303 | csv_file: string; path to csv file with information about 304 | the data. 305 | Returns: 306 | - 307 | """ 308 | 309 | filenames, labels_values, labels_names = _find_image_files(name, directory, csv_file) 310 | _process_image_files(name, filenames, labels_values, labels_names, num_shards) 311 | 312 | def main(argv=None): 313 | """Convert Caltech-UCSD Birds-200-2011 training and validation 314 | images to TFRecords.
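Records written by this script can be parsed back with a feature spec that mirrors _convert_to_example; a minimal sketch (variable names are illustrative only):
        spec = {'image/encoded': tf.io.FixedLenFeature([], tf.string),
                'image/class/label': tf.io.FixedLenFeature([], tf.int64)}
        features = tf.io.parse_single_example(serialized_example, spec)
        image = tf.io.decode_jpeg(features['image/encoded'], channels=3)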
315 | Args: 316 | - 317 | Returns: 318 | - 319 | """ 320 | 321 | assert not FLAGS.train_shards_num % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.train_shards_num') 322 | assert not FLAGS.dev_shards_num % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.dev_shards_num') 323 | 324 | if (not os.path.isdir(FLAGS.output_dir)): 325 | os.makedirs(FLAGS.output_dir) 326 | print('Saving results to %s' % FLAGS.output_dir) 327 | sys.stdout.flush() 328 | 329 | # Create TFRecords 330 | _process_dataset('validation', FLAGS.img_dir, FLAGS.dev_shards_num, FLAGS.dev_csv_path) 331 | _process_dataset('train', FLAGS.img_dir, FLAGS.train_shards_num, FLAGS.train_csv_path) 332 | 333 | if __name__ == '__main__': 334 | main() 335 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Traversal Network (TNet) 2 | 3 | We provide the TensorFlow implementation of the Traversal Network (TNet) architecture, presented in "Hard-Attention for Scalable Image Classification" (https://arxiv.org/pdf/2102.10212.pdf). The code is organized according to the datasets used for the experimental evaluation of TNet. Each folder contains code to convert raw data to TFRecords, to stream input batches, to build TNet and baseline models, and to train and evaluate the models. Learned weights, along with instructions to replicate the results presented in the paper, are provided as well. 4 | 5 | ## ImageNet ILSVRC 2012 6 | 7 | All related files can be found under the `/ImageNet/` folder. 8 | 9 | ### Data preparation 10 | 11 | Detailed instructions to download the raw data, and to create related metadata files, are provided in `create_tfrecords_imagenet.py`. Once the necessary files are created and the data directories are organized appropriately, the following command can be used to convert raw data to TFRecords: 12 | 13 | ``` 14 | python create_tfrecords_imagenet.py --output_directory '/path/to/output/dir/' 15 | --labels_file '/path/to/imagenet_lsvrc_2015_synsets.txt' 16 | --imagenet_metadata_file '/path/to/imagenet_metadata.txt' 17 | --bounding_box_file '/path/to/imagenet_2012_bounding_boxes.csv' 18 | ``` 19 | 20 | ### Training 21 | 22 | There are many different flags that can be used to customize the training of TNet and the BagNet-77 baseline.
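All flags are defined with argparse in `train.py` and `train_bl.py`; assuming the same argparse setup used by the data preparation scripts in this repository, the complete list, along with default values, can be printed with:

```
python train.py --help
python train_bl.py --help
```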
An example command for training TNet is the following: 23 | 24 | ``` 25 | python train.py --to_train 26 | --batch_size 64 27 | --num_epochs 200 28 | --initial_lr 0.0001 29 | --lr_scedule_1step 30 | --keep_prob 0.5 31 | --loc_per_grid 3.0 32 | --reinfornce_reg_w 0.1 33 | --perFReg_ce_weight 0.3 34 | --perFReg_reinf_weight 0.3 35 | --overlap 0.34375 36 | --num_patches_y 5 37 | --num_patches_x 5 38 | --base_res_y 77 39 | --base_res_x 77 40 | --num_res_levels 2 41 | --num_do_layers 1 42 | --descr_tag 'BagNet_77_TNet' 43 | --save_tag 'TNet_imagenet' 44 | --num_gpus 2 45 | --data_dir '/path/to/TFRecords/dir/' 46 | --ckpt_dir '/path/to/ckpts/dir/' 47 | --summaries_dir '/path/to/summaries/dir/' 48 | --keep_weights_summary 49 | ``` 50 | 51 | An example command for training the BagNet-77 baseline is the following: 52 | 53 | ``` 54 | python train_bl.py --to_train 55 | --batch_size 64 56 | --num_epochs 200 57 | --initial_lr 0.0001 58 | --lr_scedule_1step 59 | --keep_prob 0.375 60 | --num_do_layers 1 61 | --descr_tag 'BagNet_77' 62 | --save_tag 'BagNet_77_imagenet' 63 | --num_gpus 2 64 | --data_dir '/path/to/TFRecords/dir/' 65 | --ckpt_dir '/path/to/ckpts/dir/' 66 | --summaries_dir '/path/to/summaries/dir/' 67 | --keep_weights_summary 68 | ``` 69 | 70 | Commands to replicate the training of the networks presented in the paper can be found in `results_replication.txt`. 71 | 72 | The weights of the TNet model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1JmKOP6aN2tYUsD4-zWMfRCbXlId6gcko&export=download).
73 | The weights of the BagNet-77 baseline reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1jB3ouvcVhxYnTrIlagUawBkVqhL8Wfro&export=download). 74 | 75 | ### Evaluation 76 | 77 | A trained TNet model can be evaluated on the training and validation sets by using a command similar to the following example: 78 | 79 | ``` 80 | python train.py --to_evaluate_train 81 | --to_evaluate_val 82 | --batch_size 64 83 | --loc_per_grid 3.0 84 | --overlap 0.34375 85 | --num_patches_y 5 86 | --num_patches_x 5 87 | --base_res_y 77 88 | --base_res_x 77 89 | --num_res_levels 2 90 | --descr_tag 'BagNet_77_TNet' 91 | --save_tag 'BagNet_77_imagenet' 92 | --num_gpus 1 93 | --data_dir '/path/to/TFRecords/dir/' 94 | --ckpt_dir '/path/to/ckpts/dir/' 95 | --summaries_dir '/path/to/summaries/dir/' 96 | --restore_dir '/path/to/dir/with/ckpt/to/restore/' 97 | ``` 98 | 99 | An example command for evaluating a trained BagNet-77 baseline network is the following: 100 | 101 | ``` 102 | python train_bl.py --to_evaluate_train 103 | --to_evaluate_val 104 | --batch_size 64 105 | --descr_tag 'BagNet_77' 106 | --save_tag 'BagNet_77_imagenet' 107 | --num_gpus 1 108 | --data_dir '/path/to/TFRecords/dir/' 109 | --ckpt_dir '/path/to/ckpts/dir/' 110 | --summaries_dir '/path/to/summaries/dir/' 111 | --restore_dir '/path/to/dir/with/ckpt/to/restore/' 112 | ``` 113 | 114 | Commands to evaluate the networks presented in the paper can be found in `results_replication.txt`. 115 | 116 | ## Functional Map of the World (fMoW) 117 | 118 | All related files can be found under the `/fMoW/` folder. 119 | 120 | ### Data preparation 121 | 122 | Details about how to download the raw data are provided in `create_TFRecords_fMoW.py`. As explained there, test set data should be manually matched to ground truth labels. This can be done with the following command: 123 | 124 | ``` 125 | python match_test_gt.py --root_test_dir '/path/to/original/test/data/root/dir/' 126 | --test_output_dir '/path/to/output/dir/' 127 | --match_gt_json_path '/path/to/test_gt_mapping.json' 128 | ``` 129 | 130 | Once the training, validation, and test set directories are organized uniformly, the following command can be used to convert raw data to TFRecords: 131 | 132 | ``` 133 | python create_TFRecords_fMoW.py --train_directory '/path/to/training/set/dir/' 134 | --validation_directory '/path/to/validation/set/dir/' 135 | --test_directory '/path/to/test/set/dir/' 136 | --output_directory '/path/to/output/dir/' 137 | ``` 138 | 139 | In order to crop images according to the provided bounding boxes, the following command can be used: 140 | 141 | ``` 142 | python crop_fMoW.py --train_directory '/path/to/training/set/dir/' 143 | --validation_directory '/path/to/validation/set/dir/' 144 | --test_directory '/path/to/test/set/dir/' 145 | --output_directory '/path/to/output/dir/' 146 | ``` 147 | 148 | TFRecords for cropped images can be created with the following command: 149 | 150 | ``` 151 | python create_TFRecords_fMoW.py --cropped_data 152 | --train_directory '/path/to/training/set/dir/' 153 | --validation_directory '/path/to/validation/set/dir/' 154 | --test_directory '/path/to/test/set/dir/' 155 | --output_directory '/path/to/output/dir/' 156 | --maximum_min_dim 224 157 | ``` 158 | 159 | ### Training and evaluation 160 | 161 | Training and evaluation commands are similar to the ones provided for ImageNet.
The commands used to train and evaluate the networks presented in the paper can be found in `results_replication.txt`. 162 | 163 | The weights of the TNet model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=13d9qE1DOwm93pVrCUEPtBZZ3sQWZycXa&export=download).
164 | The weights of the EfficientNet-B0 model trained on cropped images can be downloaded [here](https://drive.google.com/u/1/uc?id=1BsY-3EphqSviOx_OMS_x0gEWL2tUpRn5&export=download).
165 | The weights of the EfficientNet-B0 model trained on images of size 224x224 px can be downloaded [here](https://drive.google.com/u/1/uc?id=15weVtPPnXv-H6wilP820TDAf_zNe3WQj&export=download).
166 | The weights of the EfficientNet-B0 model trained on images of size 448x448 px can be downloaded [here](https://drive.google.com/u/1/uc?id=1ZiJMOsNU4LLMdzenO3ITUI9HyhvLhlF7&export=download).
167 | The weights of the EfficientNet-B0 model trained on images of size 896x896 px can be downloaded [here](https://drive.google.com/u/1/uc?id=1l9n1EvQ1FkGu1U-C0IaQzVeGjCeP_t5D&export=download). 168 | 169 | ## CUB-200-2011 170 | 171 | All related files can be found under the `/CUB/` folder. 172 | 173 | ### Data preparation 174 | 175 | The link to download raw data is provided in `create_tfrecords_cub.py`. Before the creation of TFRecords, data can be split into training and validation sets with the following command (a csv file is created for each split): 176 | 177 | ``` 178 | python create_csv_cub.py --imgs_list_txt '/path/to/images.txt' 179 | --split_list_txt '/path/to/train_test_split.txt' 180 | --save_dir '/path/to/output/dir/' 181 | ``` 182 | 183 | Once the csv files for each data split are created, the following command can be used to convert raw data to TFRecords: 184 | 185 | ``` 186 | python create_tfrecords_cub.py --img_dir '/path/to/images/dir/' 187 | --train_csv_path '/path/to/train_anno.csv' 188 | --dev_csv_path '/path/to/validation_anno.csv' 189 | --output_dir '/path/to/output/dir/' 190 | ``` 191 | 192 | ### Training and evaluation 193 | 194 | Training and evaluation commands are similar to the ones provided for ImageNet. As noted in the paper, the pre-trained EfficientNet weights used for fine-tuning can be downloaded here. They correspond to the weights of models trained with NoisyStudent and RandAugment, with the extra JFT-300M unlabeled data. The `/restore_dicts/` folder provides dictionaries that are used to load the pre-trained weights into TNet and the baselines (a minimal inspection sketch follows the weights list below). The commands used to train and evaluate the networks presented in the paper can be found in `results_replication.txt`. 195 | 196 | The weights of the TNet-B0 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1o8idpT73OCMba57oFTTn5SxSGOfZpqBG&export=download).
197 | The weights of the TNet-B1 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1oEGcgjMCbBWi5JrM5HEkfb4F4iI7CaXX&export=download).
198 | The weights of the TNet-B2 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1iADseI0T3s-_P5-KNeU31k8t9S1Bb4Df&export=download).
199 | The weights of the TNet-B3 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1YA7zfOw78hl7AwRaoSeo51mqpz1-vke2&export=download).
200 | The weights of the TNet-B4 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1MEHkmT3br2DpPK8Im_En9oQp3SP5cURJ&export=download).
201 | The weights of the EfficientNet-B0 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=15ZHqMZib058qhILYi8a04iM-YPWod_6_&export=download).
202 | The weights of the EfficientNet-B1 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1iT_QJregdanQjaKA1QjCdqLmEBBLdrsP&export=download).
203 | The weights of the EfficientNet-B2 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1_bbScPS4nGt0aBij9YIN0snQsLS2jC3g&export=download).
204 | The weights of the EfficientNet-B3 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1l6QeqGVrYc1k1r4n6fnNyInZ0vzq19iw&export=download).
205 | The weights of the EfficientNet-B4 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1CFcS8eFy63SSRpB0uB9CHkEVDns-umBc&export=download).
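For a quick sanity check before fine-tuning, the following is a minimal sketch of how one of the `/restore_dicts/` dictionaries could be inspected. It assumes the `.p` files are plain Python pickles; the exact key/value layout is an assumption and should be verified against `train.py`:

```
import pickle

# Hypothetical inspection of a restore dictionary. The .p file is assumed
# to be a plain pickle; the key/value layout below is an assumption.
with open('restore_dicts/efficientnet-b0.p', 'rb') as f:
    restore_dict = pickle.load(f)

print(type(restore_dict), len(restore_dict))
# Print a few entries to see how pre-trained variables are mapped.
for key, value in list(restore_dict.items())[:5]:
    print(key, '->', value)
```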
206 | 207 | ## NABirds 208 | 209 | All related files can be found under the `/NABirds/` folder. 210 | 211 | ### Data preparation 212 | 213 | Details about the NABirds data are provided in `create_tfrecords_nab.py`. The following command can be used to convert raw data to TFRecords (the shard naming the input pipeline expects is sketched after the weights list below): 214 | 215 | ``` 216 | python create_tfrecords_nab.py --root_directory '/path/to/dir/with/all/downloaded/data/' 217 | --data_directory '/path/to/images/dir/' 218 | --output_directory '/path/to/output/dir/' 219 | ``` 220 | 221 | ### Training and evaluation 222 | 223 | Training and evaluation commands are similar to the ones provided for ImageNet. As noted in the paper, the pre-trained EfficientNet weights used for fine-tuning can be downloaded here. They correspond to the weights of models trained with NoisyStudent and RandAugment, with the extra JFT-300M unlabeled data. The `/restore_dicts/` folder provides dictionaries that are used to load the pre-trained weights into TNet and the baselines. The commands used to train and evaluate the networks presented in the paper can be found in `results_replication.txt`. 224 | 225 | The weights of the TNet-B0 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1FsHt1duv-3cIWPuoPToYdk-k6E65huwH&export=download).
226 | The weights of the TNet-B1 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=15p0O25f_ysGjd2F51uQEt42T0E5fNRtE&export=download).
227 | The weights of the TNet-B2 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1tI7eGNUXyEPiC0LIlSHaM9gGPHm3CS9d&export=download).
228 | The weights of the TNet-B3 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1QtUv-WKBqtYzKQUNhUspWQmQ6NgVO1pg&export=download).
229 | The weights of the TNet-B4 model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1gAw7bfjG3LsVmZl3XbT5cVN7vZcUcy2H&export=download).
230 | The weights of the EfficientNet-B0 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1cbSie1e7BhlyNDdWQXLlJrgdbudmxUJl&export=download).
231 | The weights of the EfficientNet-B1 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1oLooePT4tyXXBgcJ-Cf_XnWkxjIRUXV2&export=download).
232 | The weights of the EfficientNet-B2 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1rg1uD1ISM-8-anZkAgqebwrisGfpC5NT&export=download).
233 | The weights of the EfficientNet-B3 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1Dxi8BhURHVonguIWQNYt-WU1SuLLr78e&export=download).
234 | The weights of the EfficientNet-B4 baseline model reported in the paper can be downloaded [here](https://drive.google.com/u/1/uc?id=1qqfbL-nfAftIByQUVUZNm5bYGRsoiX4S&export=download).
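As a reference for the data preparation step above, the sketch below reconstructs the TFRecord shard names that the input pipeline expects; the shard counts mirror the `TRAIN_SHARDS_NUM` and `VAL_SHARDS_NUM` constants in `input_nab.py`, and `data_dir` is a placeholder path:

```
import os

# Shard counts mirror TRAIN_SHARDS_NUM / VAL_SHARDS_NUM in input_nab.py.
TRAIN_SHARDS_NUM = 16
VAL_SHARDS_NUM = 16
data_dir = '/path/to/TFRecords/dir/'

# get_filenames() in input_nab.py joins data_dir with this exact pattern.
train_files = [os.path.join(data_dir, 'train-%04d-of-%04d' % (i + 1, TRAIN_SHARDS_NUM))
               for i in range(TRAIN_SHARDS_NUM)]
val_files = [os.path.join(data_dir, 'validation-%04d-of-%04d' % (i + 1, VAL_SHARDS_NUM))
             for i in range(VAL_SHARDS_NUM)]

print(train_files[0])  # /path/to/TFRecords/dir/train-0001-of-0016
```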
235 | 236 | -------------------------------------------------------------------------------- /NABirds/results_replication.txt: -------------------------------------------------------------------------------- 1 | --- Training 2 | 3 | - TNet 4 | 5 | The following command can be used to replicate the training of the TNet-B0 model: 6 | 7 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB0_origWD' --save_tag 'TNet-B0_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b0.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 8 | 9 | The following command can be used to replicate the training of the TNet-B1 model: 10 | 11 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB1_origWD' --save_tag 'TNet-B1_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b1.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 12 | 13 | The following command can be used to replicate the training of the TNet-B2 model: 14 | 15 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB2_origWD' --save_tag 'TNet-B2_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir 
'/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b2.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 16 | 17 | The following command can be used to replicate the training of the TNet-B3 model: 18 | 19 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 3.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB3_origWD' --save_tag 'TNet-B3_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b3.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 20 | 21 | The following command can be used to replicate the training of the TNet-B4 model: 22 | 23 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --num_epochs 100 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 3.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB4_origWD' --save_tag 'TNet-B4_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b4.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 24 | 25 | 26 | 27 | - Baselines 28 | 29 | The following command can be used to replicate the training of the EfficientNet-B0 model: 30 | 31 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB0_origWD' --save_tag 'EN-B0_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' 
--keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b0.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 32 | 33 | The following command can be used to replicate the training of the EfficientNet-B1 model: 34 | 35 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB1_origWD' --save_tag 'EN-B1_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b1.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 36 | 37 | The following command can be used to replicate the training of the EfficientNet-B2 model: 38 | 39 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB2_origWD' --save_tag 'EN-B2_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b2.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 40 | 41 | The following command can be used to replicate the training of the EfficientNet-B3 model: 42 | 43 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB3_origWD' --save_tag 'EN-B3_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b3.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 44 | 45 | The following command can be used to replicate the training of the EfficientNet-B4 model: 46 | 47 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 100 --num_classes 555 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 
--img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB4_origWD' --save_tag 'EN-B4_nab' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b4.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 50.0 --contrastive_margin 0.4 48 | 49 | 50 | 51 | 52 | 53 | --- Evaluation 54 | 55 | - TNet 56 | 57 | The following command can be used to evaluate a trained TNet-B0 model on the validation set of NABirds: 58 | 59 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB0_origWD' --save_tag 'TNet-B0_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 60 | 61 | The following command can be used to evaluate a trained TNet-B1 model on the validation set of NABirds: 62 | 63 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB1_origWD' --save_tag 'TNet-B1_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 64 | 65 | The following command can be used to evaluate a trained TNet-B2 model on the validation set of NABirds: 66 | 67 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB2_origWD' --save_tag 'TNet-B2_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 68 | 69 | The following command can be used to evaluate a trained TNet-B3 model on the validation set of NABirds: 70 | 71 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB3_origWD' --save_tag 'TNet-B3_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 72 | 73 | The following command can be used to evaluate a trained TNet-B4
model on the validation set of NABirds: 74 | 75 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 555 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB4_origWD' --save_tag 'TNet-B4_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --image_ids_struct_path '/path/to/image_ids_struct.txt' 76 | 77 | The following flags can be added to the previous evaluation commands in order to time the inference of TNet: 78 | 79 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 10 80 | 81 | The following flags can be added to the previous evaluation commands for advanced evaluation of TNet: 82 | 83 | --adv_eval_data --batches_to_time_range 0 -1 --eval_epochs_num 1 84 | 85 | Advanced evaluation corresponds to the creation of an excel file with information about the attended locations, the attendance probabilities of all candidate locations, and the weights estimated by the feature weighting module. 86 | 87 | 88 | 89 | - Baselines 90 | 91 | The following command can be used to evaluate a trained EfficientNet-B0 model on the validation set of NABirds: 92 | 93 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB0_origWD' --save_tag 'EN-B0_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 94 | 95 | The following command can be used to evaluate a trained EfficientNet-B1 model on the validation set of NABirds: 96 | 97 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB1_origWD' --save_tag 'EN-B1_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 98 | 99 | The following command can be used to evaluate a trained EfficientNet-B2 model on the validation set of NABirds: 100 | 101 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB2_origWD' --save_tag 'EN-B2_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 102 | 103 | The following command can be used to evaluate a trained EfficientNet-B3 model on the validation set of NABirds: 104 | 105 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB3_origWD' --save_tag 'EN-B3_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 106 | 107 | The following command can be used to evaluate a trained EfficientNet-B4 model on the validation set of NABirds: 108 | 109 | python train_bl.py --to_evaluate_val --batch_norm --batch_size
64 --num_classes 555 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB4_origWD' --save_tag 'EN-B4_nab' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 110 | 111 | The following flags can be added to the previous evaluation commands in order to time the inference of the baselines: 112 | 113 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 10 114 | 115 | 116 | -------------------------------------------------------------------------------- /CUB/results_replication.txt: -------------------------------------------------------------------------------- 1 | --- Training 2 | 3 | - TNet 4 | 5 | The following command can be used to replicate the training of the TNet-B0 model: 6 | 7 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 200 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB0_origWD' --save_tag 'TNet-B0_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b0.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 8 | 9 | The following command can be used to replicate the training of the TNet-B1 model: 10 | 11 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 200 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB1_origWD' --save_tag 'TNet-B1_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b1.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 12 | 13 | The following command can be used to replicate the training of the TNet-B2 model: 14 | 15 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 200 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step 
--lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB2_origWD' --save_tag 'TNet-B2_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b2.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 16 | 17 | The following command can be used to replicate the training of the TNet-B3 model: 18 | 19 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 200 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB3_origWD' --save_tag 'TNet-B3_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b3.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 20 | 21 | The following command can be used to replicate the training of the TNet-B4 model: 22 | 23 | python train.py --to_train --to_evaluate_train --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --num_epochs 125 --initial_lr 0.001 --initial_lr2 0.0001 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --loc_per_grid 5.0 --reinfornce_reg_w 0.1 --perFReg_ce_weight 0.3 --perFReg_reinf_weight 0.3 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_samples 1 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB4_origWD' --save_tag 'TNet-B4_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b4.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' 'feat_weighting' 'feature_posBurn' 'location_prediction' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 24 | 25 | 26 | 27 | - Baselines 28 | 29 | The following command can be used to replicate the training of the EfficientNet-B0 model: 30 | 31 | 
python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB0_origWD' --save_tag 'EN-B0_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b0.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 32 | 33 | The following command can be used to replicate the training of the EfficientNet-B1 model: 34 | 35 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB1_origWD' --save_tag 'EN-B1_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b1.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 36 | 37 | The following command can be used to replicate the training of the EfficientNet-B2 model: 38 | 39 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB2_origWD' --save_tag 'EN-B2_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b2.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 40 | 41 | The following command can be used to replicate the training of the EfficientNet-B3 model: 42 | 43 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB3_origWD' --save_tag 'EN-B3_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b3.p' --vars_to_exclude 'logits_layer' 
--two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 44 | 45 | The following command can be used to replicate the training of the EfficientNet-B4 model: 46 | 47 | python train_bl.py --to_train --to_evaluate_train --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --initial_lr 0.001 --initial_lr2 0.00005 --lr_scedule_1step --lr_decay_factor 0.1 --dropout_rate 0.75 --block_drop_rate 0.5 --l2_reg 0.0001 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB4_origWD' --save_tag 'EN-B4_cub' --num_gpus 4 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --keep_grads_summary --keep_weights_summary --keep_activations_summary --restore_dir '/path/to/pretrained/weights/' --dictionary_to_restore_from '/path/to/efficientnet-b4.p' --vars_to_exclude 'logits_layer' --two_oprimizers --vars_to_update 'logits_layer' --contrastive_loss --l_contrastive 100.0 --contrastive_margin 0.4 48 | 49 | 50 | 51 | 52 | 53 | --- Evaluation 54 | 55 | - TNet 56 | 57 | The following command can be used to evaluate a trained TNet-B0 model on the validation set of CUB-200-2011: 58 | 59 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB0_origWD' --save_tag 'TNet-B0_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 60 | 61 | The following command can be used to evaluate a trained TNet-B1 model on the validation set of CUB-200-2011: 62 | 63 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB1_origWD' --save_tag 'TNet-B1_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 64 | 65 | The following command can be used to evaluate a trained TNet-B2 model on the validation set of CUB-200-2011: 66 | 67 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB2_origWD' --save_tag 'TNet-B2_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 68 | 69 | The following command can be used to evaluate a trained TNet-B3 model on the validation set of CUB-200-2011: 70 | 71 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 
--pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB3_origWD' --save_tag 'TNet-B3_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 72 | 73 | The following command can be used to evaluate a trained TNet-B4 model on the validation set of CUB-200-2011: 74 | 75 | python train.py --to_evaluate_val --feat_weighting --batch_norm --batch_size 64 --num_classes 200 --loc_per_grid 3.0 --overlap 0.35 --img_size_y 448 --img_size_x 448 --pos_dim_divisor 4 --num_patches_y 5 --num_patches_x 5 --activation 'swish' --base_res_y 224 --base_res_x 224 --num_res_levels 2 --descr_tag 'EfficientNetB4_origWD' --save_tag 'TNet-B4_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' --cub_classes_file_dir '/path/to/classes.txt' 76 | 77 | The following flags can be added to the previous evaluation commands in order to time the inference of TNet: 78 | 79 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 10 80 | 81 | The following flags can be added to the previous evaluation commands for advanced evaluation of TNet: 82 | 83 | --adv_eval_data --batches_to_time_range 0 -1 --eval_epochs_num 1 84 | 85 | Advanced evaluation corresponds to the creation of an excel file with information about the attended locations, the attendance probabilities of all candidate locations, and the weights estimated by the feature weighting module. 86 | 87 | 88 | 89 | - Baselines 90 | 91 | The following command can be used to evaluate a trained EfficientNet-B0 model on the validation set of CUB-200-2011: 92 | 93 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB0_origWD' --save_tag 'EN-B0_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 94 | 95 | The following command can be used to evaluate a trained EfficientNet-B1 model on the validation set of CUB-200-2011: 96 | 97 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB1_origWD' --save_tag 'EN-B1_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 98 | 99 | The following command can be used to evaluate a trained EfficientNet-B2 model on the validation set of CUB-200-2011: 100 | 101 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB2_origWD' --save_tag 'EN-B2_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 102 | 103 | The following command can be used to evaluate a trained EfficientNet-B3 model on the validation set of CUB-200-2011: 104 | 105 | python train_bl.py --to_evaluate_val --batch_norm 
--batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB3_origWD' --save_tag 'EN-B3_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 106 | 107 | The following command can be used to evaluate a trained EfficientNet-B4 model on the validation set of CUB-200-2011: 108 | 109 | python train_bl.py --to_evaluate_val --batch_norm --batch_size 64 --num_epochs 200 --num_classes 200 --img_size_y 448 --img_size_x 448 --activation 'swish' --descr_tag 'EfficientNetB4_origWD' --save_tag 'EN-B4_cub' --num_gpus 1 --data_dir '/path/to/TFRecords/dir/' --ckpt_dir '/path/to/ckpts/dir/' --summaries_dir '/path/to/summaries/dir/' --restore_dir '/path/to/dir/with/ckpt/to/restore/' 110 | 111 | The following flags can be added to the previous evaluation commands in order to time the inference of the baselines: 112 | 113 | --profile_step 10. --batches_to_time_range 50 501 --eval_epochs_num 10 114 | 115 | 116 | -------------------------------------------------------------------------------- /NABirds/input_nab.py: -------------------------------------------------------------------------------- 1 | """Prepare input batches. 2 | """ 3 | 4 | from __future__ import absolute_import, division, print_function 5 | 6 | import os 7 | 8 | from absl import logging 9 | import numpy as np 10 | import tensorflow as tf 11 | import tensorflow_addons as tfa 12 | from tensorflow.python.ops import control_flow_ops 13 | 14 | 15 | 16 | _SHUFFLE_BUFFER = 10000 17 | NUM_CHANNELS = 3 18 | TRAIN_SHARDS_NUM = 16 19 | VAL_SHARDS_NUM = 16 20 | 21 | def get_filenames(dataset_type, data_dir): 22 | """Return filenames for dataset. 23 | Args: 24 | dataset_type: string; type of dataset. 25 | data_dir: string; directory containing the input data. 26 | Returns: 27 | data_filemames: list of strings; it contains paths to TFRecords. 28 | """ 29 | 30 | # Data are assumed to be stored in TFRecords 31 | if (dataset_type == 'train'): 32 | data_filemames = [os.path.join(data_dir, 'train-%04d-of-%04d' % (i+1, TRAIN_SHARDS_NUM)) for i in range(TRAIN_SHARDS_NUM)] 33 | elif (dataset_type == 'validation'): 34 | data_filemames = [os.path.join(data_dir, 'validation-%04d-of-%04d' % (i+1, VAL_SHARDS_NUM)) for i in range(VAL_SHARDS_NUM)] 35 | 36 | return data_filemames 37 | 38 | def parse_example_proto(example_serialized, adv_eval_data=False): 39 | """Parse an Example proto that corresponds to an image. 40 | Args: 41 | example_serialized: string; serialized Example protocol buffer. 42 | adv_eval_data: boolean; whether to include information for advanced 43 | evaluation in the input batches. 44 | Returns: 45 | to_batch: tuple; it contains the following entries: 46 | encoded_img: string; encoded JPEG file. 47 | label: int; numeric image label. 48 | img_filename (optional): string; the filename of an image. 49 | img_label_text (optional): string; the human-readable label of an image. 
50 | """ 51 | 52 | # Extract dense features in Example proto 53 | feature_map = { 54 | 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 55 | 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1), 56 | 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 57 | 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, default_value='') 58 | } 59 | 60 | features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) 61 | encoded_img = features['image/encoded'] 62 | label = tf.cast(features['image/class/label'], dtype=tf.int32) 63 | label = tf.cast(tf.reshape(label, shape=[1]), dtype=tf.float32) 64 | 65 | if (not adv_eval_data): 66 | to_batch = (encoded_img, label) 67 | else: 68 | img_filename = features['image/filename'] 69 | img_label_text = features['image/class/text'] 70 | to_batch = (encoded_img, label, img_filename, img_label_text) 71 | 72 | return to_batch 73 | 74 | def apply_with_random_selector(x, func, cases): 75 | """Compute func(x, cases[sel]), with sel sampled from cases. 76 | Args: 77 | x: Tensor; input Tensor to process. 78 | func: function; python function to apply. 79 | num_cases: list; cases to sample from. 80 | Returns: 81 | The result of func(x, cases[sel]), sel is sampled dynamically. 82 | """ 83 | 84 | sel = tf.random.uniform([], maxval=len(cases), dtype=tf.int32) 85 | # Pass the input only to one of the func calls 86 | return control_flow_ops.merge([ 87 | func(control_flow_ops.switch(x, tf.equal(sel, i))[1], cases[i]) 88 | for i in range(len(cases))])[0] 89 | 90 | def distort_image(image_buffer, output_height, output_width, num_channels, bbox): 91 | """Distort an image for data augmentation. 92 | Args: 93 | image_buffer: string; raw JPEG image buffer. 94 | output_height: int; height of the image after preprocessing. 95 | output_width: int; width of the image after preprocessing. 96 | num_channels: int; depth of the image buffer for decoding. 97 | bbox: 3-D float Tensor; it contains the bounding boxes related to 98 | an image. Bounding box coordinates are in range [0, 1], 99 | arranged in order [ymin, xmin, ymax, xmax]. The Tensor is of 100 | shape [1, num_boxes, 4], where num_boxes is the number of 101 | bounding boxes related to the image. 102 | Returns: 103 | distorted_image: 3-D float Tensor; it contains an image. It is of 104 | size [H, W, C], where H is the image height, W is the image 105 | width, and C is the number of channels. 106 | """ 107 | 108 | # Create a bounding box by distorting an existing one (if it is provided). 109 | # The new bounding box should respect specific constraints, e.g., be within 110 | # a range of aspect ratios. If no bounding box is provided, the entire 111 | # image is considered the initial bounding box to be distorted. 
112 | sampled_distorted_bounding_box = tf.image.sample_distorted_bounding_box( 113 | tf.io.extract_jpeg_shape(image_buffer), 114 | bounding_boxes=bbox, 115 | min_object_covered=0.1, 116 | aspect_ratio_range=[0.5, 2.0], 117 | area_range=[0.85, 1.0], 118 | max_attempts=50, 119 | use_image_if_no_bounding_boxes=True, 120 | seed=0) 121 | bbox_begin, bbox_size, _ = sampled_distorted_bounding_box 122 | 123 | # Reassemble and crop the bounding box 124 | offset_y, offset_x, _ = tf.unstack(bbox_begin) 125 | target_height, target_width, _ = tf.unstack(bbox_size) 126 | crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) 127 | distorted_image = tf.image.decode_and_crop_jpeg(image_buffer, crop_window, channels=num_channels) 128 | distorted_image = tf.image.convert_image_dtype(distorted_image, dtype=tf.float32) 129 | 130 | # Resize the image. Select a resize method randomly. The image aspect ratio may change. 131 | resize_methods = [tf.image.ResizeMethod.BILINEAR, 132 | tf.image.ResizeMethod.LANCZOS3, 133 | tf.image.ResizeMethod.LANCZOS5, 134 | tf.image.ResizeMethod.BICUBIC, 135 | tf.image.ResizeMethod.GAUSSIAN, 136 | tf.image.ResizeMethod.NEAREST_NEIGHBOR, 137 | tf.image.ResizeMethod.AREA, 138 | tf.image.ResizeMethod.MITCHELLCUBIC] 139 | # Resize the gated branch input x (not the closed-over tensor), so only the selected branch is live. 140 | distorted_image = apply_with_random_selector(distorted_image, 141 | lambda x, resize_method: tf.image.resize(x, 142 | [output_height, output_width], 143 | method=resize_method, antialias=False), 144 | cases=resize_methods) 145 | 146 | # Restore image shape 147 | distorted_image.set_shape([output_height, output_width, num_channels]) 148 | 149 | # Perform a random horizontal flip of the image 150 | distorted_image = tf.image.random_flip_left_right(distorted_image) 151 | 152 | # Perform a random translation of the image 153 | distorted_image = tf.expand_dims(distorted_image, 0) 154 | s = 0.1 155 | vy = s * tf.cast(tf.shape(distorted_image)[1], tf.float32) 156 | vx = s * tf.cast(tf.shape(distorted_image)[2], tf.float32) 157 | dy = tf.random.uniform(shape=[tf.shape(distorted_image)[0], 1], minval=-vy, maxval=vy) 158 | dx = tf.random.uniform(shape=[tf.shape(distorted_image)[0], 1], minval=-vx, maxval=vx) 159 | d = tf.concat([dx, dy], axis=-1) 160 | distorted_image = tfa.image.translate(distorted_image, translations=d) 161 | 162 | # Perform a random rotation of the image 163 | r_limit = 20.0 * np.pi / 180.0 164 | r = tf.random.uniform(shape=[tf.shape(distorted_image)[0]], minval=-r_limit, maxval=r_limit) 165 | distorted_image = tfa.image.rotate(distorted_image, angles=r) 166 | 167 | distorted_image = tf.squeeze(distorted_image) 168 | 169 | return distorted_image 170 | 171 | def preprocess_image(image_buffer, bbox, output_height, output_width, 172 | num_channels, dataset_type, is_training): 173 | """Preprocess an image. 174 | Args: 175 | image_buffer: string; encoded JPEG file. 176 | bbox: 3-D float Tensor; it contains the bounding boxes related to an 177 | image. Bounding box coordinates are in range [0, 1], arranged in 178 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape 179 | [1, num_boxes, 4], where num_boxes is the number of bounding 180 | boxes related to the image. 181 | output_height: int; height of the image after preprocessing. 182 | output_width: int; width of the image after preprocessing. 183 | num_channels: int; depth of the image buffer for decoding. 184 | dataset_type: string; type of dataset. 185 | is_training: boolean; whether the input will be used for training.
185 | Returns: 186 | image: 3-D float Tensor; it contains an image. It is of 187 | size [H, W, C], where H is the image height, W is 188 | the image width, and C is the number of channels. 189 | """ 190 | 191 | if ((dataset_type == 'train') and (is_training)): 192 | # For training data during training, apply random distortions for data augmentation 193 | image = distort_image(image_buffer, output_height, output_width, num_channels, bbox) 194 | else: 195 | # Decode and resize the input image 196 | image = tf.image.decode_jpeg(image_buffer, channels=num_channels) 197 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 198 | image = tf.expand_dims(image, 0) 199 | image = tf.image.resize(image, [output_height, output_width], method=tf.image.ResizeMethod.BILINEAR, antialias=False) 200 | image = tf.squeeze(image, [0]) 201 | 202 | # Transform image values from range [0, 1], to [-1, 1] 203 | image = tf.subtract(image, 0.5) 204 | image = tf.multiply(image, 2.0) 205 | 206 | return image 207 | 208 | def parse_record(raw_record, dataset_type, is_training, 209 | img_size_y, img_size_x, dtype, adv_eval_data): 210 | """Parse a record containing a training example that corresponds to an image. 211 | Args: 212 | raw_record: string; serialized Example protocol buffer. 213 | dataset_type: string; type of dataset. 214 | is_training: boolean; whether the input will be used for training. 215 | img_size_y: int; image height in pixels. 216 | img_size_x: int; image width in pixels. 217 | dtype: string; data type to use for images/features. 218 | adv_eval_data: boolean; whether to include information for advanced 219 | evaluation in the input batches. 220 | Returns: 221 | batch: tuple; it contains the following entries: 222 | image: 3-D float Tensor; it contains an image. It is of 223 | size [H, W, C], where H is the image height, W is 224 | the image width, and C is the number of channels. 225 | label: int; numeric image label. 226 | img_filename (optional): string; the filename of an image. 227 | img_label_text (optional): string; the human-readable label of an image. 228 | """ 229 | 230 | # Parse Example protocol buffer 231 | if (not adv_eval_data): 232 | image_buffer, label = parse_example_proto(raw_record, adv_eval_data) 233 | else: 234 | (image_buffer, label, 235 | img_filename, img_label_text) = parse_example_proto(raw_record, adv_eval_data) 236 | 237 | # Pre-process image 238 | bbox = tf.constant([[[0., 0., 1., 1.]]], dtype=tf.float32) 239 | image = preprocess_image(image_buffer=image_buffer, 240 | bbox=bbox, 241 | output_height=img_size_y, 242 | output_width=img_size_x, 243 | num_channels=NUM_CHANNELS, 244 | dataset_type=dataset_type, 245 | is_training=is_training) 246 | 247 | # Return batch 248 | if (not adv_eval_data): 249 | batch = (image, label) 250 | else: 251 | batch = (image, label, img_filename, img_label_text) 252 | 253 | return batch 254 | 255 | def process_record_dataset(dataset, 256 | dataset_type, 257 | is_training, 258 | batch_size, 259 | img_size_y, 260 | img_size_x, 261 | shuffle_buffer, 262 | parse_record_fn, 263 | num_epochs=-1, 264 | dtype=tf.float32, 265 | drop_remainder=False, 266 | adv_eval_data=False): 267 | """Create input dataset from raw records. 268 | Args: 269 | dataset: tf dataset; dataset with raw records. 270 | dataset_type: string; type of dataset. 271 | is_training: boolean; whether the input will be used for training. 272 | batch_size: int; number of samples per batch (global, not per replica). 273 | img_size_y: int; image height in pixels. 
274 | img_size_x: int; image width in pixels. 275 | shuffle_buffer: int; buffer size to use when shuffling records. A larger 276 | value results in higher randomness, but a smaller one reduces startup 277 | time and uses less memory. 278 | parse_record_fn: function; function that processes raw records. 279 | num_epochs: int; number of times to repeat the dataset. 280 | dtype: string; data type to use for images/features. 281 | drop_remainder: boolean; whether to drop the remainder of the 282 | batches. If True, the batch dimension will be static. 283 | adv_eval_data: boolean; whether to include information for advanced 284 | evaluation in the input batches. 285 | Returns: 286 | dataset: tf dataset; iterable input dataset. 287 | """ 288 | 289 | # Shuffle records before repeating, to respect epoch boundaries 290 | if (is_training): 291 | dataset = dataset.shuffle(buffer_size=shuffle_buffer) 292 | 293 | # Repeat dataset for the number of epochs to train 294 | if (num_epochs < 1): 295 | dataset = dataset.repeat() 296 | else: 297 | dataset = dataset.repeat(num_epochs) 298 | 299 | # Parse raw records 300 | dataset = dataset.map(lambda value: parse_record_fn(value, dataset_type, is_training, 301 | img_size_y, img_size_x, dtype, adv_eval_data), 302 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 303 | dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) 304 | 305 | # Operations between the final prefetch and the get_next call to the iterator 306 | # will happen synchronously during run time. Prefetch here again to 307 | # background all of the above processing work and keep it out of the 308 | # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE 309 | # allows DistributionStrategies to adjust how many batches to fetch based 310 | # on how many devices are present. 311 | dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 312 | 313 | return dataset 314 | 315 | def input_fn(dataset_type, 316 | is_training, 317 | data_dir, 318 | batch_size, 319 | img_size_y, 320 | img_size_x, 321 | num_epochs=-1, 322 | dtype=tf.float32, 323 | parse_record_fn=parse_record, 324 | drop_remainder=False, 325 | filenames=None, 326 | adv_eval_data=False): 327 | """Prepare input batches. 328 | Args: 329 | dataset_type: string; type of dataset. 330 | is_training: boolean; whether the input will be used for training. 331 | data_dir: string; directory containing the input data. 332 | batch_size: int; number of samples per batch (global, not per replica). 333 | img_size_y: int; image height in pixels. 334 | img_size_x: int; image width in pixels. 335 | num_epochs: int; number of times to repeat the dataset. 336 | dtype: string; data type to use for images/features. 337 | parse_record_fn: function; function that processes raw records. 338 | drop_remainder: boolean; indicates whether to drop the remainder of the 339 | batches. If True, the batch dimension will be static. 340 | filenames: list of strings; it contains paths to TFRecords. 341 | adv_eval_data: boolean; whether to include information for advanced 342 | evaluation in the input batches. 343 | Returns: 344 | input_dataset: tf dataset; iterable input dataset. 
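Example; a minimal sketch of a typical call (the directory path, batch
size, image sizes, and epoch count below are hypothetical placeholders):
    dataset = input_fn('train', True, '/path/to/TFRecords/', 64, 224, 224, num_epochs=200)
Note that with the default num_epochs=-1 the dataset repeats indefinitely,
so consumers should bound iteration themselves (e.g., dataset.take(n)).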
345 | """ 346 | 347 | # Get TFRecords paths 348 | if (filenames is None): 349 | filenames = get_filenames(dataset_type, data_dir) 350 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 351 | 352 | # Shuffle input files 353 | if (is_training): 354 | if (dataset_type == 'train'): 355 | dataset = dataset.shuffle(buffer_size=TRAIN_SHARDS_NUM) 356 | elif (dataset_type == 'validation'): 357 | dataset = dataset.shuffle(buffer_size=VAL_SHARDS_NUM) 358 | 359 | # Process input files concurrently 360 | dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE) 361 | 362 | # Process TFRecords 363 | input_dataset = process_record_dataset(dataset=dataset, 364 | dataset_type=dataset_type, 365 | is_training=is_training, 366 | batch_size=batch_size, 367 | img_size_y=img_size_y, 368 | img_size_x=img_size_x, 369 | shuffle_buffer=_SHUFFLE_BUFFER, 370 | parse_record_fn=parse_record_fn, 371 | num_epochs=num_epochs, 372 | dtype=dtype, 373 | drop_remainder=drop_remainder, 374 | adv_eval_data=adv_eval_data) 375 | 376 | return input_dataset 377 | -------------------------------------------------------------------------------- /CUB/input_cub.py: -------------------------------------------------------------------------------- 1 | """Prepare input batches. 2 | """ 3 | 4 | from __future__ import absolute_import, division, print_function 5 | 6 | import os 7 | import numpy as np 8 | import tensorflow as tf 9 | import tensorflow_addons as tfa 10 | from tensorflow.python.ops import control_flow_ops 11 | 12 | 13 | 14 | _SHUFFLE_BUFFER = 10000 15 | NUM_CHANNELS = 3 16 | TRAIN_SHARDS_NUM = 16 17 | VAL_SHARDS_NUM = 16 18 | 19 | def get_filenames(dataset_type, data_dir): 20 | """Return filenames for dataset. 21 | Args: 22 | dataset_type: string; type of dataset. 23 | data_dir: string; directory containing the input data. 24 | Returns: 25 | data_filemames: list of strings; it contains paths to TFRecords. 26 | """ 27 | 28 | # Data are assumed to be stored in TFRecords 29 | if (dataset_type == 'train'): 30 | data_filemames = [os.path.join(data_dir, 'train-%04d-of-%04d' % (i+1, TRAIN_SHARDS_NUM)) for i in range(TRAIN_SHARDS_NUM)] 31 | elif (dataset_type == 'validation'): 32 | data_filemames = [os.path.join(data_dir, 'validation-%04d-of-%04d' % (i+1, VAL_SHARDS_NUM)) for i in range(VAL_SHARDS_NUM)] 33 | 34 | return data_filemames 35 | 36 | def parse_example_proto(example_serialized, adv_eval_data=False): 37 | """Parse an Example proto that corresponds to an image. 38 | Args: 39 | example_serialized: string; serialized Example protocol buffer. 40 | adv_eval_data: boolean; whether to include information for advanced 41 | evaluation in the input batches. 42 | Returns: 43 | to_batch: tuple; it contains the following entries: 44 | encoded_img: string; encoded JPEG file. 45 | label: int; numeric image label. 46 | img_filename (optional): string; the filename of an image. 47 | img_label_text (optional): string; the human-readable label of an image. 
48 | """ 49 | 50 | # Extract dense features in Example proto 51 | feature_map = { 52 | 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 53 | 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1), 54 | 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 55 | 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, default_value='') 56 | } 57 | 58 | features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) 59 | encoded_img = features['image/encoded'] 60 | label = tf.cast(features['image/class/label'], dtype=tf.int32) 61 | 62 | if (not adv_eval_data): 63 | to_batch = (encoded_img, label) 64 | else: 65 | img_filename = features['image/filename'] 66 | img_label_text = features['image/class/text'] 67 | to_batch = (encoded_img, label, img_filename, img_label_text) 68 | 69 | return to_batch 70 | 71 | def apply_with_random_selector(x, func, cases): 72 | """Compute func(x, cases[sel]), with sel sampled from cases. 73 | Args: 74 | x: Tensor; input Tensor to process. 75 | func: function; python function to apply. 76 | num_cases: list; cases to sample from. 77 | Returns: 78 | The result of func(x, cases[sel]), sel is sampled dynamically. 79 | """ 80 | 81 | sel = tf.random.uniform([], maxval=len(cases), dtype=tf.int32) 82 | # Pass the input only to one of the func calls 83 | return control_flow_ops.merge([ 84 | func(control_flow_ops.switch(x, tf.equal(sel, i))[1], cases[i]) 85 | for i in range(len(cases))])[0] 86 | 87 | def distort_image(image_buffer, output_height, output_width, num_channels, bbox): 88 | """Distort an image for data augmentation. 89 | Args: 90 | image_buffer: string; raw JPEG image buffer. 91 | output_height: int; height of the image after preprocessing. 92 | output_width: int; width of the image after preprocessing. 93 | num_channels: int; depth of the image buffer for decoding. 94 | bbox: 3-D float Tensor; it contains the bounding boxes related to 95 | an image. Bounding box coordinates are in range [0, 1], 96 | arranged in order [ymin, xmin, ymax, xmax]. The Tensor is of 97 | shape [1, num_boxes, 4], where num_boxes is the number of 98 | bounding boxes related to the image. 99 | Returns: 100 | distorted_image: 3-D float Tensor; it contains an image. It is of 101 | size [H, W, C], where H is the image height, W is the image 102 | width, and C is the number of channels. 103 | """ 104 | 105 | # Create a bounding box by distorting an existing one (if it is provided). 106 | # The new bounding box should respect specific constraints, e.g., be within 107 | # a range of aspect ratios. If no bounding box is provided, the entire 108 | # image is considered the initial bounding box to be distorted. 
109 | sampled_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
110 | tf.io.extract_jpeg_shape(image_buffer),
111 | bounding_boxes=bbox,
112 | min_object_covered=0.1,
113 | aspect_ratio_range=[0.5, 2.0],
114 | area_range=[0.85, 1.0],
115 | max_attempts=50,
116 | use_image_if_no_bounding_boxes=True,
117 | seed=0)
118 | bbox_begin, bbox_size, _ = sampled_distorted_bounding_box
119 | 
120 | # Reassemble and crop the bounding box
121 | offset_y, offset_x, _ = tf.unstack(bbox_begin)
122 | target_height, target_width, _ = tf.unstack(bbox_size)
123 | crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
124 | distorted_image = tf.image.decode_and_crop_jpeg(image_buffer, crop_window, channels=num_channels)
125 | distorted_image = tf.image.convert_image_dtype(distorted_image, dtype=tf.float32)
126 | 
127 | # Resize the image. Select a resize method randomly. The image aspect ratio may change.
128 | resize_methods = [tf.image.ResizeMethod.BILINEAR,
129 | tf.image.ResizeMethod.LANCZOS3,
130 | tf.image.ResizeMethod.LANCZOS5,
131 | tf.image.ResizeMethod.BICUBIC,
132 | tf.image.ResizeMethod.GAUSSIAN,
133 | tf.image.ResizeMethod.NEAREST_NEIGHBOR,
134 | tf.image.ResizeMethod.AREA,
135 | tf.image.ResizeMethod.MITCHELLCUBIC]
136 | distorted_image = apply_with_random_selector(distorted_image,
137 | lambda x, resize_method: tf.image.resize(x,
138 | [output_height, output_width],
139 | method=resize_method, antialias=False),
140 | cases=resize_methods)
141 | 
142 | # Restore image shape
143 | distorted_image.set_shape([output_height, output_width, num_channels])
144 | 
145 | # Perform a random horizontal flip of the image
146 | distorted_image = tf.image.random_flip_left_right(distorted_image)
147 | 
148 | # Perform a random translation of the image
149 | distorted_image = tf.expand_dims(distorted_image, 0)
150 | s = 0.1
151 | vy = s * tf.cast(tf.shape(distorted_image)[1], tf.float32)
152 | vx = s * tf.cast(tf.shape(distorted_image)[2], tf.float32)
153 | dy = tf.random.uniform(shape=[tf.shape(distorted_image)[0], 1], minval=-vy, maxval=vy)
154 | dx = tf.random.uniform(shape=[tf.shape(distorted_image)[0], 1], minval=-vx, maxval=vx)
155 | d = tf.concat([dx, dy], axis=-1)
156 | distorted_image = tfa.image.translate(distorted_image, translations=d)
157 | 
158 | # Perform a random rotation of the image
159 | r_limit = 20.0 * np.pi / 180.0
160 | r = tf.random.uniform(shape=[tf.shape(distorted_image)[0]], minval=-r_limit, maxval=r_limit)
161 | distorted_image = tfa.image.rotate(distorted_image, angles=r)
162 | 
163 | distorted_image = tf.squeeze(distorted_image, [0])
164 | 
165 | return distorted_image
166 | 
167 | def preprocess_image(image_buffer, bbox, output_height, output_width,
168 | num_channels, dataset_type, is_training):
169 | """Preprocess an image.
170 | Args:
171 | image_buffer: string; encoded JPEG file.
172 | bbox: 3-D float Tensor; it contains the bounding boxes related to an
173 | image. Bounding box coordinates are in range [0, 1], arranged in
174 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape
175 | [1, num_boxes, 4], where num_boxes is the number of bounding
176 | boxes related to the image.
177 | output_height: int; height of the image after preprocessing.
178 | output_width: int; width of the image after preprocessing.
179 | num_channels: int; depth of the image buffer for decoding.
180 | dataset_type: string; type of dataset.
181 | is_training: boolean; whether the input will be used for training.
182 | Returns: 183 | image: 3-D float Tensor; it contains an image. It is of 184 | size [H, W, C], where H is the image height, W is 185 | the image width, and C is the number of channels. 186 | """ 187 | 188 | if ((dataset_type == 'train') and (is_training)): 189 | # For training data during training, apply random distortions for data augmentation 190 | image = distort_image(image_buffer, output_height, output_width, num_channels, bbox) 191 | else: 192 | # Decode and resize the input image 193 | image = tf.image.decode_jpeg(image_buffer, channels=num_channels) 194 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 195 | image = tf.expand_dims(image, 0) 196 | image = tf.image.resize(image, [output_height, output_width], method=tf.image.ResizeMethod.BILINEAR, antialias=False) 197 | image = tf.squeeze(image, [0]) 198 | 199 | # Transform image values from range [0, 1], to [-1, 1] 200 | image = tf.subtract(image, 0.5) 201 | image = tf.multiply(image, 2.0) 202 | 203 | return image 204 | 205 | def parse_record(raw_record, dataset_type, is_training, 206 | img_size_y, img_size_x, dtype, adv_eval_data): 207 | """Parse a record containing a training example that corresponds to an image. 208 | Args: 209 | raw_record: string; serialized Example protocol buffer. 210 | dataset_type: string; type of dataset. 211 | is_training: boolean; whether the input will be used for training. 212 | img_size_y: int; image height in pixels. 213 | img_size_x: int; image width in pixels. 214 | dtype: string; data type to use for images/features. 215 | adv_eval_data: boolean; whether to include information for advanced 216 | evaluation in the input batches. 217 | Returns: 218 | batch: tuple; it contains the following entries: 219 | image: 3-D float Tensor; it contains an image. It is of 220 | size [H, W, C], where H is the image height, W is 221 | the image width, and C is the number of channels. 222 | label: int; numeric image label. 223 | img_filename (optional): string; the filename of an image. 224 | img_label_text (optional): string; the human-readable label of an image. 225 | """ 226 | 227 | # Parse Example protocol buffer 228 | if (not adv_eval_data): 229 | image_buffer, label = parse_example_proto(raw_record, adv_eval_data) 230 | else: 231 | (image_buffer, label, 232 | img_filename, img_label_text) = parse_example_proto(raw_record, adv_eval_data) 233 | 234 | # Pre-process image 235 | bbox = tf.constant([[[0., 0., 1., 1.]]], dtype=tf.float32) 236 | image = preprocess_image(image_buffer=image_buffer, 237 | bbox=bbox, 238 | output_height=img_size_y, 239 | output_width=img_size_x, 240 | num_channels=NUM_CHANNELS, 241 | dataset_type=dataset_type, 242 | is_training=is_training) 243 | 244 | # Subtract 1 so that labels are in [0, 199] range 245 | label = tf.cast(tf.cast(tf.reshape(label, shape=[1]), dtype=tf.int32) - 1, dtype=tf.float32) 246 | 247 | # Return batch 248 | if (not adv_eval_data): 249 | batch = (image, label) 250 | else: 251 | batch = (image, label, img_filename, img_label_text) 252 | 253 | return batch 254 | 255 | def process_record_dataset(dataset, 256 | dataset_type, 257 | is_training, 258 | batch_size, 259 | img_size_y, 260 | img_size_x, 261 | shuffle_buffer, 262 | parse_record_fn, 263 | num_epochs=-1, 264 | dtype=tf.float32, 265 | drop_remainder=False, 266 | adv_eval_data=False): 267 | """Create input dataset from raw records. 268 | Args: 269 | dataset: tf dataset; dataset with raw records. 270 | dataset_type: string; type of dataset. 
271 | is_training: boolean; whether the input will be used for training. 272 | batch_size: int; number of samples per batch (global, not per replica). 273 | img_size_y: int; image height in pixels. 274 | img_size_x: int; image width in pixels. 275 | shuffle_buffer: int; buffer size to use when shuffling records. A larger 276 | value results in higher randomness, but a smaller one reduces startup 277 | time and uses less memory. 278 | parse_record_fn: function; function that processes raw records. 279 | num_epochs: int; number of times to repeat the dataset. 280 | dtype: string; data type to use for images/features. 281 | drop_remainder: boolean; whether to drop the remainder of the 282 | batches. If True, the batch dimension will be static. 283 | adv_eval_data: boolean; whether to include information for advanced 284 | evaluation in the input batches. 285 | Returns: 286 | dataset: tf dataset; iterable input dataset. 287 | """ 288 | 289 | # Shuffle records before repeating, to respect epoch boundaries 290 | if (is_training): 291 | dataset = dataset.shuffle(buffer_size=shuffle_buffer) 292 | 293 | # Repeat dataset for the number of epochs to train 294 | if (num_epochs < 1): 295 | dataset = dataset.repeat() 296 | else: 297 | dataset = dataset.repeat(num_epochs) 298 | 299 | # Parse raw records 300 | dataset = dataset.map(lambda value: parse_record_fn(value, dataset_type, is_training, 301 | img_size_y, img_size_x, dtype, adv_eval_data), 302 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 303 | dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) 304 | 305 | # Operations between the final prefetch and the get_next call to the iterator 306 | # will happen synchronously during run time. Prefetch here again to 307 | # background all of the above processing work and keep it out of the 308 | # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE 309 | # allows DistributionStrategies to adjust how many batches to fetch based 310 | # on how many devices are present. 311 | dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 312 | 313 | return dataset 314 | 315 | def input_fn(dataset_type, 316 | is_training, 317 | data_dir, 318 | batch_size, 319 | img_size_y, 320 | img_size_x, 321 | num_epochs=-1, 322 | dtype=tf.float32, 323 | parse_record_fn=parse_record, 324 | drop_remainder=False, 325 | filenames=None, 326 | adv_eval_data=False): 327 | """Prepare input batches. 328 | Args: 329 | dataset_type: string; type of dataset. 330 | is_training: boolean; whether the input will be used for training. 331 | data_dir: string; directory containing the input data. 332 | batch_size: int; number of samples per batch (global, not per replica). 333 | img_size_y: int; image height in pixels. 334 | img_size_x: int; image width in pixels. 335 | num_epochs: int; number of times to repeat the dataset. 336 | dtype: string; data type to use for images/features. 337 | parse_record_fn: function; function that processes raw records. 338 | drop_remainder: boolean; indicates whether to drop the remainder of the 339 | batches. If True, the batch dimension will be static. 340 | filenames: list of strings; it contains paths to TFRecords. 341 | adv_eval_data: boolean; whether to include information for advanced 342 | evaluation in the input batches. 343 | Returns: 344 | input_dataset: tf dataset; iterable input dataset. 
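Example; an illustrative evaluation-style call (the path, batch size, and
image sizes are hypothetical placeholders):
    ds = input_fn('validation', False, '/path/to/TFRecords/', 32, 448, 448)
    for images, labels in ds.take(1):
        pass  # images: [32, 448, 448, 3] floats in [-1, 1]; labels: [32, 1] in [0, 199]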
345 | """ 346 | 347 | # Get TFRecords paths 348 | if (filenames is None): 349 | filenames = get_filenames(dataset_type, data_dir) 350 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 351 | 352 | # Shuffle input files 353 | if (is_training): 354 | if (dataset_type == 'train'): 355 | dataset = dataset.shuffle(buffer_size=TRAIN_SHARDS_NUM) 356 | elif (dataset_type == 'validation'): 357 | dataset = dataset.shuffle(buffer_size=VAL_SHARDS_NUM) 358 | 359 | # Process input files concurrently 360 | dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE) 361 | 362 | # Process TFRecords 363 | input_dataset = process_record_dataset(dataset=dataset, 364 | dataset_type=dataset_type, 365 | is_training=is_training, 366 | batch_size=batch_size, 367 | img_size_y=img_size_y, 368 | img_size_x=img_size_x, 369 | shuffle_buffer=_SHUFFLE_BUFFER, 370 | parse_record_fn=parse_record_fn, 371 | num_epochs=num_epochs, 372 | dtype=dtype, 373 | drop_remainder=drop_remainder, 374 | adv_eval_data=adv_eval_data) 375 | 376 | return input_dataset 377 | -------------------------------------------------------------------------------- /NABirds/create_tfrecords_nab.py: -------------------------------------------------------------------------------- 1 | """Convert NABirds images to TFRecords. Raw data can be downloaded here 2 | https://dl.allaboutbirds.org/nabirds, and are assumed to reside in the following 3 | directory structure: 4 | images/0295/01f53d6bf5e449438d2bb79e0854bca4.jpg 5 | images/0296/069519c379574fb285d7bb920443ea89.jpg 6 | ... 7 | Metadata files that can be downloaded with the raw data, are utilized as well. 8 | In particular, the following files are used: images.txt, train_test_split.txt, 9 | sizes.txt, classes.txt, and image_class_labels.txt. 10 | 11 | images.txt contains the list of image file names, with each line corresponding to one image. 12 | The content of the file is expected to be as follows: 13 | 14 | where image_id is a numeric identifier for each image in the dataset, and image_name is the path 15 | to the corresponding image file. 16 | An example line is the following: 17 | 0000139e-21dc-4d0c-bfe1-4cae3c85c829 0817/0000139e21dc4d0cbfe14cae3c85c829.jpg 18 | 19 | train_test_split.txt contains the suggested training/validation split, with each line corresponding 20 | to one image. The content of the file is expected to be as follows: 21 | 22 | where image_id is a unique identifier for each image in the dataset (same as in images.txt), and 23 | is_training_image takes either value 1 or 0, denoting that the file is in the training or the validation 24 | set, respectively. 25 | An example line is the following: 26 | 0000139e-21dc-4d0c-bfe1-4cae3c85c829 0 27 | 28 | sizes.txt contains the spatial dimensions of each image, with each line corresponding to one image. 29 | The content of the file is expected to be as follows: 30 | 31 | where image_id is a unique identifier for each image in the dataset (same as in images.txt), width 32 | is the width of the corresponding image in pixels, and height is the height of the image in pixels. 33 | An example line is the following: 34 | 0000139e-21dc-4d0c-bfe1-4cae3c85c829 296 341 35 | 36 | classes.txt contains the list of human-readable labels (not all of them are represented in the image data), 37 | with each line corresponding to a different label. 
38 | The content of the file is expected to be as follows:
39 | <class_id> <class_name>
40 | where class_id is a unique numeric identifier for each class, and class_name is the corresponding human-readable label.
41 | An example line is the following:
42 | 37 Barn Owl
43 | 
44 | image_class_labels.txt contains the mapping between images and ground truth labels.
45 | The content of the file is expected to be as follows:
46 | <image_id> <class_id>
47 | where image_id is a unique identifier for each image in the dataset (same as in images.txt), and class_id is a unique
48 | numeric identifier for each class.
49 | An example line is the following:
50 | 0000139e-21dc-4d0c-bfe1-4cae3c85c829 817
51 | """
52 | 
53 | from __future__ import absolute_import, division, print_function
54 | 
55 | import argparse
56 | from datetime import datetime
57 | import os
58 | import random
59 | import sys
60 | import threading
61 | import scipy.io
62 | import pickle
63 | import six
64 | 
65 | import numpy as np
66 | import tensorflow as tf
67 | 
68 | 
69 | 
70 | parser = argparse.ArgumentParser()
71 | 
72 | parser.add_argument('--data_directory', type=str, default='/images/', help='Directory with raw image data.')
73 | parser.add_argument('--root_directory', type=str, default='/NABirds/data/', help='Directory with metadata files.')
74 | parser.add_argument('--output_directory', type=str, default='/TFRecords/', help='Output data directory.')
75 | parser.add_argument('--image_ids_struct_path', type=str, default=None, help='Path to txt file with python dictionary that contains metadata needed for the creation of TFRecord files.')
76 | 
77 | parser.add_argument('--train_shards', type=int, default=16, help='Number of shards in training TFRecord files.')
78 | parser.add_argument('--validation_shards', type=int, default=16, help='Number of shards in validation TFRecord files.')
79 | parser.add_argument('--num_threads', type=int, default=16, help='Number of threads to parallelize processing.')
80 | 
81 | FLAGS = parser.parse_args()
82 | 
83 | IMAGE_IDS_STRUCT_FNAME = 'image_ids_struct.txt'
84 | 
85 | def create_image_ids_struct():
86 | """Create dictionary with information about the NABirds dataset.
87 | It includes image filenames, ground truth numeric labels,
88 | human-readable labels, image spatial dimensions, and indicators
89 | that distinguish between the training and validation splits.
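For illustration, a resulting entry has the following form (the values are
hypothetical placeholders; the id/name pairing reuses the docstring examples):
    image_ids['0000139e-...'] = {'image_name': '0817/0000139e....jpg',
                                 'is_training_image': 0,
                                 'height': 341, 'width': 296,
                                 'class_id': '37', 'class_name': 'Barn Owl',
                                 'label': 0}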
90 | Args: 91 | - 92 | Returns: 93 | - 94 | """ 95 | 96 | image_ids = {} 97 | 98 | fname = 'images.txt' 99 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 100 | for line in f: 101 | tokens = line.strip().split() 102 | image_ids[tokens[0]] = {} 103 | image_ids[tokens[0]]['image_name'] = tokens[1] 104 | 105 | fname = 'train_test_split.txt' 106 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 107 | for line in f: 108 | tokens = line.strip().split() 109 | image_ids[tokens[0]]['is_training_image'] = int(tokens[1]) 110 | 111 | fname = 'sizes.txt' 112 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 113 | for line in f: 114 | tokens = line.strip().split() 115 | image_ids[tokens[0]]['height'] = int(tokens[2]) 116 | image_ids[tokens[0]]['width'] = int(tokens[1]) 117 | 118 | class_ids = {} 119 | fname = 'classes.txt' 120 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 121 | for line in f: 122 | tokens = line.strip().split() 123 | class_ids[tokens[0]] = tokens[1] 124 | 125 | fname = 'image_class_labels.txt' 126 | with open(os.path.join(FLAGS.root_directory, fname)) as f: 127 | for line in f: 128 | tokens = line.strip().split() 129 | image_ids[tokens[0]]['class_id'] = tokens[1] 130 | image_ids[tokens[0]]['class_name'] = class_ids[tokens[1]] 131 | 132 | labels = {} 133 | label_id = 0 134 | for e in image_ids: 135 | if (image_ids[e]['class_id'] not in labels): 136 | labels[image_ids[e]['class_id']] = label_id 137 | image_ids[e]['label'] = label_id 138 | label_id += 1 139 | else: 140 | image_ids[e]['label'] = labels[image_ids[e]['class_id']] 141 | 142 | with open(os.path.join(FLAGS.root_directory, IMAGE_IDS_STRUCT_FNAME), "wb") as fp: 143 | pickle.dump(image_ids, fp) 144 | 145 | def _int64_feature(value): 146 | """Insert int features into Example proto. 147 | Args: 148 | value: int or list of ints; features to insert 149 | in Example proto. 150 | Returns: 151 | feature: example proto; it contains a list of ints. 152 | """ 153 | 154 | if ((not isinstance(value, list)) and (not isinstance(value, np.ndarray))): 155 | value = [value] 156 | 157 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=value)) 158 | 159 | return feature 160 | 161 | def _float_feature(value): 162 | """Insert float features into Example proto. 163 | Args: 164 | value: float or list of floats; features to insert 165 | in Example proto. 166 | Returns: 167 | feature: example proto; it contains a list of floats. 168 | """ 169 | 170 | if ((not isinstance(value, list)) and (not isinstance(value, np.ndarray))): 171 | value = [value] 172 | 173 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=value)) 174 | 175 | return feature 176 | 177 | def _bytes_feature(value): 178 | """Insert byte features into Example proto. 179 | Args: 180 | value: string or list of strings; features to 181 | insert in Example proto. 182 | Returns: 183 | feature: example proto; it contains a byte list. 184 | """ 185 | 186 | if (isinstance(value, type(tf.constant(0)))): 187 | value = value.numpy() 188 | if (six.PY3 and isinstance(value, six.text_type)): 189 | value = six.binary_type(value, encoding='utf-8') 190 | 191 | feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 192 | 193 | return feature 194 | 195 | def _convert_to_example(filename, image_buffer, label, human_label, height, width): 196 | """Build an Example proto for an image. 197 | Args: 198 | filename: string; path to image file. 199 | image_buffer: string; JPEG encoded image. 
200 | label: int; numeric ground truth label.
201 | human_label: string; human-readable label.
202 | height: int; image height in pixels.
203 | width: int; image width in pixels.
204 | Returns:
205 | example: example proto; it contains the following fields:
206 | image/height: int; image height in pixels.
207 | image/width: int; image width in pixels.
208 | image/colorspace: string; colorspace, always 'RGB'.
209 | image/channels: int; number of channels, always 3.
210 | image/class/label: int; numeric ground truth label, as assigned (0-indexed) during dataset creation.
211 | image/class/text: string; human-readable label.
212 | image/format: string; image format, always 'JPEG'.
213 | image/filename: string; image file basename.
214 | image/encoded: string; JPEG encoded image.
215 | """
216 | 
217 | colorspace = 'RGB'
218 | channels = 3
219 | image_format = 'JPEG'
220 | 
221 | example = tf.train.Example(features=tf.train.Features(feature={
222 | 'image/height': _int64_feature(height),
223 | 'image/width': _int64_feature(width),
224 | 'image/colorspace': _bytes_feature(colorspace),
225 | 'image/channels': _int64_feature(channels),
226 | 'image/class/label': _int64_feature(label),
227 | 'image/class/text': _bytes_feature(human_label),
228 | 'image/format': _bytes_feature(image_format),
229 | 'image/filename': _bytes_feature(os.path.basename(filename)),
230 | 'image/encoded': _bytes_feature(image_buffer)
231 | }))
232 | 
233 | return example
234 | 
235 | def _process_image(filename):
236 | """Process a single image file.
237 | Args:
238 | filename: string; path to an image file.
239 | Returns:
240 | image_buffer: string; JPEG encoded image.
241 | height: int; image height in pixels.
242 | width: int; image width in pixels.
243 | """
244 | 
245 | # Read image file
246 | image_data = tf.io.read_file(filename)
247 | 
248 | # Decode image
249 | try:
250 | image = tf.io.decode_image(image_data, channels=3)
251 | except Exception:
252 | print('Failed to decode image %s' % filename); raise
253 | 
254 | # Assert that the image has the appropriate dimensions
255 | assert (image.shape[2] == 3)
256 | height = image.shape[0]
257 | width = image.shape[1]
258 | assert ((height > 0) and (width > 0))
259 | 
260 | image_data = tf.io.encode_jpeg(image, format='rgb', quality=100)
261 | 
262 | return image_data, height, width
263 | 
264 | def _process_image_files_batch(thread_index, ranges, name, filenames,
265 | labels, human_labels, num_shards):
266 | """Execute one thread that processes images and saves them as TFRecords
267 | of Example protos.
268 | Args:
269 | thread_index: int; unique thread identifier.
270 | ranges: list of pairs of ints; it contains the [start, end) range of
271 | images to process.
272 | name: string; unique identifier specifying the data set.
273 | filenames: list of strings; it contains paths to image files.
274 | labels: list of ints; it contains numeric labels.
275 | human_labels: list of strings; it contains human-readable labels.
276 | num_shards: int; number of shards.
277 | Returns:
278 | -
279 | """
280 | 
281 | # Each thread produces N shards where N = int(num_shards / num_threads).
282 | # For instance, if num_shards = 128 and num_threads = 2, then the first
283 | # thread would produce shards [0, 64)
284 | num_threads = len(ranges)
285 | assert not num_shards % num_threads
286 | num_shards_per_batch = int(num_shards / num_threads)
287 | 
288 | shard_ranges = np.linspace(ranges[thread_index][0],
289 | ranges[thread_index][1],
290 | num_shards_per_batch + 1).astype(int)
291 | num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
292 | 
293 | # Generate each shard
294 | counter = 0
295 | for s in range(num_shards_per_batch):
296 | shard = thread_index * num_shards_per_batch + s
297 | output_filename = '%s-%.4d-of-%.4d' % (name, (shard+1), num_shards)
298 | output_file = os.path.join(FLAGS.output_directory, output_filename)
299 | writer = tf.io.TFRecordWriter(output_file)
300 | 
301 | # Process each file for a shard
302 | shard_counter = 0
303 | files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
304 | for i in files_in_shard:
305 | filename = filenames[i]
306 | label = labels[i]
307 | human_label = human_labels[i]
308 | 
309 | # Process an image
310 | image_buffer, height, width = _process_image(filename)
311 | 
312 | # Create an Example proto
313 | example = _convert_to_example(filename, image_buffer, label,
314 | human_label, height, width)
315 | 
316 | # Write to TFRecord
317 | writer.write(example.SerializeToString())
318 | shard_counter += 1
319 | counter += 1
320 | 
321 | if (not (counter % 1000)):
322 | print('%s [thread %d]: Processed %d of %d images in thread batch.' %
323 | (datetime.now(), thread_index, counter, num_files_in_thread))
324 | sys.stdout.flush()
325 | 
326 | writer.close()
327 | print('%s [thread %d]: Wrote %d images to %s' %
328 | (datetime.now(), thread_index, shard_counter, output_file))
329 | sys.stdout.flush()
330 | shard_counter = 0
331 | print('%s [thread %d]: Wrote %d images to %d shards.'
332 | %(datetime.now(), thread_index, counter, num_shards_per_batch))
333 | sys.stdout.flush()
334 | 
335 | def _process_image_files(name, filenames, labels, human_labels, num_shards):
336 | """Process images and save them as TFRecords of Example protos.
337 | Args:
338 | name: string; unique identifier specifying the data set.
339 | filenames: list of strings; it contains paths to image files.
340 | labels: list of ints; it contains numeric labels.
341 | human_labels: list of strings; it contains human-readable labels.
342 | num_shards: int; number of shards.
343 | Returns:
344 | -
345 | """
346 | 
347 | assert len(filenames) == len(labels) == len(human_labels)
348 | 
349 | # Break images into batches
350 | spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(int)
351 | ranges = []
352 | for i in range(len(spacing) - 1):
353 | ranges.append([spacing[i], spacing[i + 1]])
354 | 
355 | # Launch a thread for each batch
356 | print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
357 | sys.stdout.flush()
358 | 
359 | # Create a mechanism for monitoring threads' execution
360 | coord = tf.train.Coordinator()
361 | 
362 | # Run threads
363 | threads = []
364 | for thread_index in range(len(ranges)):
365 | args = (thread_index, ranges, name, filenames,
366 | labels, human_labels, num_shards)
367 | t = threading.Thread(target=_process_image_files_batch, args=args)
368 | t.start()
369 | threads.append(t)
370 | 
371 | # Wait for all the threads to terminate
372 | coord.join(threads)
373 | print('%s: Finished writing all %d images in data set.'
%(datetime.now(), len(filenames)))
374 | sys.stdout.flush()
375 | 
376 | def _find_image_files(name, data_dir):
377 | """Build lists of image file paths, numeric labels, and
378 | human-readable labels.
379 | Args:
380 | name: string; unique identifier specifying the data set.
381 | data_dir: string; path to data set.
382 | Returns:
383 | filenames: list of strings; it contains paths to image files.
384 | labels: list of ints; it contains numeric labels.
385 | human_labels: list of strings; it contains human-readable labels.
386 | """
387 | 
388 | data_type_bool = int(name == 'train')
389 | 
390 | with open(os.path.join(FLAGS.root_directory, IMAGE_IDS_STRUCT_FNAME), "rb") as fp:
391 | image_ids = pickle.load(fp)
392 | 
393 | # Iterate over the image files
394 | filenames = []
395 | labels = []
396 | human_labels = []
397 | label_num = 0
398 | for e in image_ids:
399 | im_struct = image_ids[e]
400 | if (im_struct['is_training_image'] == data_type_bool):
401 | filenames.append(os.path.join(data_dir, im_struct['image_name']))
402 | 
403 | if (im_struct['label'] not in labels):
404 | label_num += 1
405 | labels.append(im_struct['label'])
406 | human_labels.append(im_struct['class_name'])
407 | 
408 | # Shuffle the ordering of all image files in order to guarantee
409 | # random ordering of the images with respect to labels in the
410 | # saved TFRecord files. Make the randomization repeatable.
411 | shuffled_index = list(range(len(filenames)))
412 | random.seed(12345)
413 | random.shuffle(shuffled_index)
414 | 
415 | filenames = [filenames[i] for i in shuffled_index]
416 | labels = [labels[i] for i in shuffled_index]
417 | human_labels = [human_labels[i] for i in shuffled_index]
418 | 
419 | print('Found %d .jpg files across %d labels inside %s.' %(len(filenames), label_num, data_dir))
420 | sys.stdout.flush()
421 | 
422 | return filenames, labels, human_labels
423 | 
424 | def _process_dataset(name, directory, num_shards):
425 | """Process a complete data set and save it in TFRecords.
426 | Args:
427 | name: string; unique identifier specifying the data set.
428 | directory: string; path to data set.
429 | num_shards: int; number of shards.
430 | Returns:
431 | -
432 | """
433 | 
434 | filenames, labels, human_labels = _find_image_files(name, directory)
435 | _process_image_files(name, filenames, labels, human_labels, num_shards)
436 | 
437 | def main(argv=None):
438 | """Convert NABirds training and validation images to TFRecords.
439 | Args:
440 | -
441 | Returns:
442 | -
443 | """
444 | 
445 | assert not FLAGS.train_shards % FLAGS.num_threads, ('Please make FLAGS.num_threads commensurate with FLAGS.train_shards')
446 | assert not FLAGS.validation_shards % FLAGS.num_threads, ('Please make FLAGS.num_threads commensurate with FLAGS.validation_shards')
447 | 
448 | if (not os.path.isdir(FLAGS.output_directory)):
449 | os.makedirs(FLAGS.output_directory)
450 | print('Saving results to %s' % FLAGS.output_directory)
451 | sys.stdout.flush()
452 | 
453 | # Create dictionary with metadata information
454 | if (not FLAGS.image_ids_struct_path):
455 | create_image_ids_struct()
456 | 
457 | # Create TFRecords
458 | _process_dataset('validation', FLAGS.data_directory, FLAGS.validation_shards)
459 | _process_dataset('train', FLAGS.data_directory, FLAGS.train_shards)
460 | 
461 | if __name__ == '__main__':
462 | main()
463 | 
--------------------------------------------------------------------------------
/fMoW/input_fMoW.py:
--------------------------------------------------------------------------------
1 | """Prepare input batches.
2 | """
3 | 
4 | from __future__ import absolute_import, division, print_function
5 | 
6 | import os
7 | import numpy as np
8 | import tensorflow as tf
9 | from tensorflow.python.ops import control_flow_ops
10 | 
11 | 
12 | 
13 | _SHUFFLE_BUFFER = 10000
14 | NUM_CHANNELS = 3
15 | TRAIN_SHARDS_NUM = 512
16 | VAL_SHARDS_NUM = 128
17 | TEST_SHARDS_NUM = 128
18 | 
19 | def get_filenames(dataset_type, data_dir):
20 | """Return filenames for dataset.
21 | Args:
22 | dataset_type: string; type of dataset.
23 | data_dir: string; directory containing the input data.
24 | Returns:
25 | data_filenames: list of strings; it contains paths to TFRecords.
26 | """
27 | 
28 | # Data are assumed to be stored in TFRecords
29 | if (dataset_type == 'train'):
30 | data_filenames = [os.path.join(data_dir, 'train-%05d-of-%05d' % (i, TRAIN_SHARDS_NUM)) for i in range(TRAIN_SHARDS_NUM)]
31 | elif (dataset_type == 'validation'):
32 | data_filenames = [os.path.join(data_dir, 'validation-%05d-of-%05d' % (i, VAL_SHARDS_NUM)) for i in range(VAL_SHARDS_NUM)]
33 | elif (dataset_type == 'test'):
34 | data_filenames = [os.path.join(data_dir, 'test-%05d-of-%05d' % (i, TEST_SHARDS_NUM)) for i in range(TEST_SHARDS_NUM)]
35 | 
36 | return data_filenames
37 | 
38 | def parse_example_proto(example_serialized, adv_eval_data=False):
39 | """Parse an Example proto that corresponds to an image.
40 | Args:
41 | example_serialized: string; serialized Example protocol buffer.
42 | adv_eval_data: boolean; whether to include information for advanced
43 | evaluation in the input batches.
44 | Returns:
45 | to_batch: tuple; it contains the following entries:
46 | encoded_img: string; encoded JPEG file.
47 | label: int; numeric image label.
48 | bbox: 3-D float Tensor; it contains the bounding boxes related to an
49 | image. Bounding box coordinates are in range [0, 1], arranged in
50 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape
51 | [1, num_boxes, 4], where num_boxes is the number of bounding
52 | boxes related to the image.
53 | img_filename (optional): string; the filename of an image.
54 | img_label_text (optional): string; the human-readable label of an image.
55 | """ 56 | 57 | # Extract dense features in Example proto 58 | feature_map = { 59 | 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 60 | 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1), 61 | 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), 62 | 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, default_value='') 63 | } 64 | # Extract sparse features in Example proto 65 | sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) 66 | feature_map.update( 67 | {k: sparse_float32 for k in ['image/object/bbox/xmin', 'image/object/bbox/ymin', 68 | 'image/object/bbox/xmax', 'image/object/bbox/ymax']}) 69 | 70 | features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) 71 | encoded_img = features['image/encoded'] 72 | label = tf.cast(features['image/class/label'], dtype=tf.int32) 73 | 74 | ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) 75 | xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) 76 | ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) 77 | xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) 78 | 79 | # Make the variable number of bounding boxes into 80 | # the shape [1, num_boxes, coords] 81 | bbox = tf.concat([ymin, xmin, ymax, xmax], 0) 82 | bbox = tf.expand_dims(bbox, 0) 83 | bbox = tf.transpose(a=bbox, perm=[0, 2, 1]) 84 | 85 | if (not adv_eval_data): 86 | to_batch = (encoded_img, label, bbox) 87 | else: 88 | img_filename = features['image/filename'] 89 | img_label_text = features['image/class/text'] 90 | to_batch = (encoded_img, label, bbox, img_filename, img_label_text) 91 | 92 | return to_batch 93 | 94 | def apply_with_random_selector(x, func, cases): 95 | """Compute func(x, cases[sel]), with sel sampled from cases. 96 | Args: 97 | x: Tensor; input Tensor to process. 98 | func: function; python function to apply. 99 | num_cases: list; cases to sample from. 100 | Returns: 101 | The result of func(x, cases[sel]), sel is sampled dynamically. 102 | """ 103 | 104 | sel = tf.random.uniform([], maxval=len(cases), dtype=tf.int32) 105 | # Pass the input only to one of the func calls 106 | return control_flow_ops.merge([ 107 | func(control_flow_ops.switch(x, tf.equal(sel, i))[1], cases[i]) 108 | for i in range(len(cases))])[0] 109 | 110 | def distort_color(image, color_ordering): 111 | """Distort the color of an image. 112 | Args: 113 | image: 3-D float Tensor; it contains an image. It is of 114 | size [H, W, C], where H is the image height, W is 115 | the image width, and C is the number of channels. 116 | color_ordering: int; denotes the kind of color distortion. 117 | Returns: 118 | image: 3-D float Tensor; it contains an image. It is of 119 | size [H, W, C], where H is the image height, W is 120 | the image width, and C is the number of channels. 121 | """ 122 | 123 | if color_ordering == 0: 124 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 125 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 126 | image = tf.image.random_hue(image, max_delta=0.2) 127 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 128 | elif color_ordering == 1: 129 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 130 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 
131 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 132 | image = tf.image.random_hue(image, max_delta=0.2) 133 | elif color_ordering == 2: 134 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 135 | image = tf.image.random_hue(image, max_delta=0.2) 136 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 137 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 138 | elif color_ordering == 3: 139 | image = tf.image.random_hue(image, max_delta=0.2) 140 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 141 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 142 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 143 | elif color_ordering == 4: 144 | return image 145 | else: 146 | raise ValueError('color_ordering must be in [0, 4]') 147 | 148 | # The random_* ops do not necessarily clamp 149 | image = tf.clip_by_value(image, 0.0, 1.0) 150 | 151 | return image 152 | 153 | def distort_image(image_buffer, output_height, output_width, num_channels, bbox): 154 | """Distort an image for data augmentation. 155 | Args: 156 | image_buffer: string; raw JPEG image buffer. 157 | output_height: int; height of the image after preprocessing. 158 | output_width: int; width of the image after preprocessing. 159 | num_channels: int; depth of the image buffer for decoding. 160 | bbox: 3-D float Tensor; it contains the bounding boxes related to 161 | an image. Bounding box coordinates are in range [0, 1], 162 | arranged in order [ymin, xmin, ymax, xmax]. The Tensor is of 163 | shape [1, num_boxes, 4], where num_boxes is the number of 164 | bounding boxes related to the image. 165 | Returns: 166 | distorted_image: 3-D float Tensor; it contains an image. It is of 167 | size [H, W, C], where H is the image height, W is the image 168 | width, and C is the number of channels. 169 | """ 170 | 171 | # Create a bounding box by distorting an existing one (if it is provided). 172 | # The new bounding box should respect specific constraints, e.g., be within 173 | # a range of aspect ratios. If no bounding box is provided, the entire 174 | # image is considered the initial bounding box to be distorted. 175 | sampled_distorted_bounding_box = tf.image.sample_distorted_bounding_box( 176 | tf.io.extract_jpeg_shape(image_buffer), 177 | bounding_boxes=bbox, 178 | min_object_covered=0.1, 179 | aspect_ratio_range=[0.5, 2.0], 180 | area_range=[0.85, 1.0], 181 | max_attempts=50, 182 | use_image_if_no_bounding_boxes=True, 183 | seed=0) 184 | bbox_begin, bbox_size, _ = sampled_distorted_bounding_box 185 | 186 | # Reassemble and crop the bounding box 187 | offset_y, offset_x, _ = tf.unstack(bbox_begin) 188 | target_height, target_width, _ = tf.unstack(bbox_size) 189 | crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) 190 | distorted_image = tf.image.decode_and_crop_jpeg(image_buffer, crop_window, channels=num_channels) 191 | distorted_image = tf.image.convert_image_dtype(distorted_image, dtype=tf.float32) 192 | 193 | # Resize the image. Select a resize method randomly. The image aspect ratio may change. 
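# The selector below draws one resize method uniformly at random per example.
# A rough eager-mode sketch of the intent (illustrative only; the code instead
# uses graph-mode switch/merge via apply_with_random_selector) would be:
#
#   import random
#   method = random.choice(resize_methods)
#   distorted_image = tf.image.resize(distorted_image,
#                                     [output_height, output_width],
#                                     method=method, antialias=False)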
194 | resize_methods = [tf.image.ResizeMethod.BILINEAR,
195 | tf.image.ResizeMethod.LANCZOS3,
196 | tf.image.ResizeMethod.LANCZOS5,
197 | tf.image.ResizeMethod.BICUBIC,
198 | tf.image.ResizeMethod.GAUSSIAN,
199 | tf.image.ResizeMethod.NEAREST_NEIGHBOR,
200 | tf.image.ResizeMethod.AREA,
201 | tf.image.ResizeMethod.MITCHELLCUBIC]
202 | distorted_image = apply_with_random_selector(distorted_image,
203 | lambda x, resize_method: tf.image.resize(x,
204 | [output_height, output_width],
205 | method=resize_method, antialias=False),
206 | cases=resize_methods)
207 | 
208 | # Restore image shape
209 | distorted_image.set_shape([output_height, output_width, num_channels])
210 | 
211 | # Perform a random horizontal flip of the image
212 | distorted_image = tf.image.random_flip_left_right(distorted_image)
213 | 
214 | # Perform random color distortions
215 | distorted_image = apply_with_random_selector(distorted_image,
216 | lambda x, color_ordering: distort_color(x, color_ordering),
217 | cases=np.arange(5))
218 | 
219 | return distorted_image
220 | 
221 | def preprocess_image(image_buffer, bbox, output_height, output_width,
222 | num_channels, dataset_type, is_training):
223 | """Preprocess an image.
224 | Args:
225 | image_buffer: string; encoded JPEG file.
226 | bbox: 3-D float Tensor; it contains the bounding boxes related to an
227 | image. Bounding box coordinates are in range [0, 1], arranged in
228 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape
229 | [1, num_boxes, 4], where num_boxes is the number of bounding
230 | boxes related to the image.
231 | output_height: int; height of the image after preprocessing.
232 | output_width: int; width of the image after preprocessing.
233 | num_channels: int; depth of the image buffer for decoding.
234 | dataset_type: string; type of dataset.
235 | is_training: boolean; whether the input will be used for training.
236 | Returns:
237 | image: 3-D float Tensor; it contains an image. It is of
238 | size [H, W, C], where H is the image height, W is
239 | the image width, and C is the number of channels.
240 | """
241 | 
242 | if ((dataset_type == 'train') and (is_training)):
243 | # For training data during training, apply random distortions for data augmentation
244 | image = distort_image(image_buffer, output_height, output_width, num_channels, bbox)
245 | else:
246 | # Decode and resize the input image
247 | image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
248 | image = tf.image.convert_image_dtype(image, dtype=tf.float32)
249 | image = tf.expand_dims(image, 0)
250 | image = tf.image.resize(image, [output_height, output_width], method=tf.image.ResizeMethod.BILINEAR, antialias=False)
251 | image = tf.squeeze(image, [0])
252 | 
253 | # Transform image values from range [0, 1] to [-1, 1]
254 | image = tf.subtract(image, 0.5)
255 | image = tf.multiply(image, 2.0)
256 | 
257 | return image
258 | 
259 | def parse_record(raw_record, dataset_type, is_training,
260 | img_size_y, img_size_x, dtype, adv_eval_data):
261 | """Parse a record containing a training example that corresponds to an image.
262 | Args:
263 | raw_record: string; serialized Example protocol buffer.
264 | dataset_type: string; type of dataset.
265 | is_training: boolean; whether the input will be used for training.
266 | img_size_y: int; image height in pixels.
267 | img_size_x: int; image width in pixels.
268 | dtype: string; data type to use for images/features.
269 | adv_eval_data: boolean; whether to include information for advanced 270 | evaluation in the input batches. 271 | Returns: 272 | batch: tuple; it contains the following entries: 273 | image: 3-D float Tensor; it contains an image. It is of 274 | size [H, W, C], where H is the image height, W is 275 | the image width, and C is the number of channels. 276 | label: int; numeric image label. 277 | img_filename (optional): string; the filename of an image. 278 | img_label_text (optional): string; the human-readable label of an image. 279 | """ 280 | 281 | # Parse Example protocol buffer 282 | if (not adv_eval_data): 283 | image_buffer, label, bbox = parse_example_proto(raw_record, adv_eval_data) 284 | else: 285 | (image_buffer, label, bbox, 286 | img_filename, img_label_text) = parse_example_proto(raw_record, adv_eval_data) 287 | 288 | # Pre-process image 289 | image = preprocess_image(image_buffer=image_buffer, 290 | bbox=bbox, 291 | output_height=img_size_y, 292 | output_width=img_size_x, 293 | num_channels=NUM_CHANNELS, 294 | dataset_type=dataset_type, 295 | is_training=is_training) 296 | image = tf.cast(image, dtype) 297 | 298 | label = tf.cast(tf.reshape(label, shape=[1]), dtype=tf.float32) 299 | 300 | # Return batch 301 | if (not adv_eval_data): 302 | batch = (image, label) 303 | else: 304 | batch = (image, label, img_filename, img_label_text) 305 | 306 | return batch 307 | 308 | def process_record_dataset(dataset, 309 | dataset_type, 310 | is_training, 311 | batch_size, 312 | img_size_y, 313 | img_size_x, 314 | shuffle_buffer, 315 | parse_record_fn, 316 | num_epochs=-1, 317 | dtype=tf.float32, 318 | datasets_num_private_threads=None, 319 | drop_remainder=False, 320 | tf_data_experimental_slack=False, 321 | adv_eval_data=False): 322 | """Create input dataset from raw records. 323 | Args: 324 | dataset: tf dataset; dataset with raw records. 325 | dataset_type: string; type of dataset. 326 | is_training: boolean; whether the input will be used for training. 327 | batch_size: int; number of samples per batch (global, not per replica). 328 | img_size_y: int; image height in pixels. 329 | img_size_x: int; image width in pixels. 330 | shuffle_buffer: int; buffer size to use when shuffling records. A larger 331 | value results in higher randomness, but a smaller one reduces startup 332 | time and uses less memory. 333 | parse_record_fn: function; function that processes raw records. 334 | num_epochs: int; number of times to repeat the dataset. 335 | dtype: string; data type to use for images/features. 336 | drop_remainder: boolean; whether to drop the remainder of the 337 | batches. If True, the batch dimension will be static. 338 | adv_eval_data: boolean; whether to include information for advanced 339 | evaluation in the input batches. 340 | Returns: 341 | dataset: tf dataset; iterable input dataset. 
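Note: datasets_num_private_threads and tf_data_experimental_slack are
accepted in the signature but are not referenced in the body below; they
appear to be kept for interface compatibility.
Example; an illustrative call (raw_ds is a hypothetical TFRecordDataset
and the sizes are placeholders):
    ds = process_record_dataset(raw_ds, 'train', True, 64, 224, 224,
                                _SHUFFLE_BUFFER, parse_record, num_epochs=50)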
342 | """ 343 | 344 | # Shuffle records before repeating, to respect epoch boundaries 345 | if (is_training): 346 | dataset = dataset.shuffle(buffer_size=shuffle_buffer) 347 | 348 | # Repeat dataset for the number of epochs to train 349 | if (num_epochs < 1): 350 | dataset = dataset.repeat() 351 | else: 352 | dataset = dataset.repeat(num_epochs) 353 | 354 | # Parse raw records 355 | dataset = dataset.map(lambda value: parse_record_fn(value, dataset_type, is_training, 356 | img_size_y, img_size_x, dtype, adv_eval_data), 357 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 358 | dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) 359 | 360 | # Operations between the final prefetch and the get_next call to the iterator 361 | # will happen synchronously during run time. Prefetch here again to 362 | # background all of the above processing work and keep it out of the 363 | # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE 364 | # allows DistributionStrategies to adjust how many batches to fetch based 365 | # on how many devices are present. 366 | dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 367 | 368 | return dataset 369 | 370 | def input_fn(dataset_type, 371 | is_training, 372 | data_dir, 373 | batch_size, 374 | img_size_y, 375 | img_size_x, 376 | num_epochs=-1, 377 | dtype=tf.float32, 378 | parse_record_fn=parse_record, 379 | drop_remainder=False, 380 | filenames=None, 381 | adv_eval_data=False): 382 | """Prepare input batches. 383 | Args: 384 | dataset_type: string; type of dataset. 385 | is_training: boolean; whether the input will be used for training. 386 | data_dir: string; directory containing the input data. 387 | batch_size: int; number of samples per batch (global, not per replica). 388 | img_size_y: int; image height in pixels. 389 | img_size_x: int; image width in pixels. 390 | num_epochs: int; number of times to repeat the dataset. 391 | dtype: string; data type to use for images/features. 392 | parse_record_fn: function; function that processes raw records. 393 | drop_remainder: boolean; indicates whether to drop the remainder of the 394 | batches. If True, the batch dimension will be static. 395 | filenames: list of strings; it contains paths to TFRecords. 396 | adv_eval_data: boolean; whether to include information for advanced 397 | evaluation in the input batches. 398 | Returns: 399 | input_dataset: tf dataset; iterable input dataset. 
400 | """ 401 | 402 | # Get TFRecords paths 403 | if (filenames is None): 404 | filenames = get_filenames(dataset_type, data_dir) 405 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 406 | 407 | # Shuffle input files 408 | if (is_training): 409 | if (dataset_type == 'train'): 410 | dataset = dataset.shuffle(buffer_size=TRAIN_SHARDS_NUM) 411 | elif (dataset_type == 'validation'): 412 | dataset = dataset.shuffle(buffer_size=VAL_SHARDS_NUM) 413 | elif (dataset_type == 'test'): 414 | dataset = dataset.shuffle(buffer_size=TEST_SHARDS_NUM) 415 | 416 | # Process input files concurrently 417 | dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE) 418 | 419 | # Process TFRecords 420 | input_dataset = process_record_dataset(dataset=dataset, 421 | dataset_type=dataset_type, 422 | is_training=is_training, 423 | batch_size=batch_size, 424 | img_size_y=img_size_y, 425 | img_size_x=img_size_x, 426 | shuffle_buffer=_SHUFFLE_BUFFER, 427 | parse_record_fn=parse_record_fn, 428 | num_epochs=num_epochs, 429 | dtype=dtype, 430 | drop_remainder=drop_remainder, 431 | adv_eval_data=adv_eval_data) 432 | 433 | return input_dataset 434 | -------------------------------------------------------------------------------- /ImageNet/input_imagenet.py: -------------------------------------------------------------------------------- 1 | """Prepare input batches, based on 2 | https://github.com/tensorflow/models/blob/master/official/vision/image_classification/resnet/imagenet_preprocessing.py. 3 | """ 4 | 5 | from __future__ import absolute_import, division, print_function 6 | 7 | import os 8 | import numpy as np 9 | import tensorflow as tf 10 | from tensorflow.python.ops import control_flow_ops 11 | 12 | 13 | 14 | _SHUFFLE_BUFFER = 10000 15 | NUM_CHANNELS = 3 16 | TRAIN_SHARDS_NUM = 1024 17 | VAL_SHARDS_NUM = 128 18 | 19 | def get_filenames(is_train_dataset, data_dir): 20 | """Return filenames for dataset. 21 | Args: 22 | is_train_dataset: boolean; whether the input is the training 23 | or the validation set. 24 | data_dir: string; directory containing the input data. 25 | Returns: 26 | data_filemames: list of strings; it contains paths to TFRecords. 27 | """ 28 | 29 | # Data are assumed to be stored in TFRecords 30 | if (is_train_dataset): 31 | data_filemames = [os.path.join(data_dir, 'train-%05d-of-%05d' % (i, TRAIN_SHARDS_NUM)) for i in range(TRAIN_SHARDS_NUM)] 32 | else: 33 | data_filemames = [os.path.join(data_dir, 'validation-%05d-of-%05d' % (i, VAL_SHARDS_NUM)) for i in range(VAL_SHARDS_NUM)] 34 | 35 | return data_filemames 36 | 37 | def parse_example_proto(example_serialized, adv_eval_data=False): 38 | """Parse an Example proto that corresponds to an image. 39 | Each Example proto contains the following fields (values are included as examples): 40 | image/height: 462 41 | image/width: 581 42 | image/colorspace: 'RGB' 43 | image/channels: 3 44 | image/class/label: 1 - 1000 # label value 0 was left empty for the background class when building the dataset 45 | image/class/synset: 'n03623198' 46 | image/class/text: 'knee pad' 47 | image/object/bbox/xmin: 0.1 48 | image/object/bbox/xmax: 0.9 49 | image/object/bbox/ymin: 0.2 50 | image/object/bbox/ymax: 0.6 51 | image/object/bbox/label: 615 52 | image/format: 'JPEG' 53 | image/filename: 'ILSVRC2012_val_00041207.JPEG' 54 | image/encoded: 55 | Args: 56 | example_serialized: string; serialized Example protocol buffer. 
57 | adv_eval_data: boolean; whether to include information for advanced
58 | evaluation in the input batches.
59 | Returns:
60 | to_batch: tuple; it contains the following entries:
61 | encoded_img: string; encoded JPEG file.
62 | label: int; numeric image label.
63 | bbox: 3-D float Tensor; it contains the bounding boxes related to an
64 | image. Bounding box coordinates are in range [0, 1], arranged in
65 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape [1, num_boxes, 4],
66 | where num_boxes is the number of bounding boxes related to the image.
67 | img_filename (optional): string; the filename of an image.
68 | img_synset (optional): string; the synset of an image.
69 | img_label_text (optional): string; the human-readable label of an image.
70 | """
71 | 
72 | # Extract dense features in Example proto
73 | feature_map = {
74 | 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
75 | 'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1),
76 | 'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
77 | 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, default_value=''),
78 | 'image/class/synset': tf.io.FixedLenFeature([], dtype=tf.string, default_value='')
79 | }
80 | # Extract sparse features in Example proto
81 | sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32)
82 | feature_map.update(
83 | {k: sparse_float32 for k in ['image/object/bbox/xmin', 'image/object/bbox/ymin',
84 | 'image/object/bbox/xmax', 'image/object/bbox/ymax']})
85 | 
86 | features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map)
87 | encoded_img = features['image/encoded']
88 | label = tf.cast(features['image/class/label'], dtype=tf.int32)
89 | 
90 | xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
91 | ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
92 | xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
93 | ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
94 | 
95 | # Make the variable number of bounding boxes into
96 | # the shape [1, num_boxes, coords]
97 | bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
98 | bbox = tf.expand_dims(bbox, 0)
99 | bbox = tf.transpose(a=bbox, perm=[0, 2, 1])
100 | 
101 | if (not adv_eval_data):
102 | to_batch = (encoded_img, label, bbox)
103 | else:
104 | img_filename = features['image/filename']
105 | img_synset = features['image/class/synset']
106 | img_label_text = features['image/class/text']
107 | to_batch = (encoded_img, label, bbox, img_filename, img_synset, img_label_text)
108 | 
109 | return to_batch
110 | 
111 | def apply_with_random_selector(x, func, cases):
112 | """Compute func(x, cases[sel]), where sel is a uniformly sampled index into cases.
113 | Args:
114 | x: Tensor; input Tensor to process.
115 | func: function; Python function to apply.
116 | cases: list; cases to sample from.
117 | Returns:
118 | The result of func(x, cases[sel]), where sel is sampled dynamically.
119 | """
120 | 
121 | sel = tf.random.uniform([], maxval=len(cases), dtype=tf.int32)
122 | # Pass the input only to one of the func calls
123 | return control_flow_ops.merge([
124 | func(control_flow_ops.switch(x, tf.equal(sel, i))[1], cases[i])
125 | for i in range(len(cases))])[0]
126 | 
127 | def distort_color(image, color_ordering):
128 | """Distort the color of an image.
129 | Args:
130 | image: 3-D float Tensor; it contains an image. It is of
131 | size [H, W, C], where H is the image height, W is
132 | the image width, and C is the number of channels.
133 | color_ordering: int; denotes the kind of color distortion. 134 | Returns: 135 | image: 3-D float Tensor; it contains an image. It is of 136 | size [H, W, C], where H is the image height, W is 137 | the image width, and C is the number of channels. 138 | """ 139 | 140 | if color_ordering == 0: 141 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 142 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 143 | image = tf.image.random_hue(image, max_delta=0.2) 144 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 145 | elif color_ordering == 1: 146 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 147 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 148 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 149 | image = tf.image.random_hue(image, max_delta=0.2) 150 | elif color_ordering == 2: 151 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 152 | image = tf.image.random_hue(image, max_delta=0.2) 153 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 154 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 155 | elif color_ordering == 3: 156 | image = tf.image.random_hue(image, max_delta=0.2) 157 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 158 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 159 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 160 | elif color_ordering == 4: 161 | return image 162 | else: 163 | raise ValueError('color_ordering must be in [0, 4]') 164 | 165 | # The random_* ops do not necessarily clamp 166 | image = tf.clip_by_value(image, 0.0, 1.0) 167 | 168 | return image 169 | 170 | def distort_image(image_buffer, output_height, output_width, num_channels, bbox): 171 | """Distort an image for data augmentation. 172 | Args: 173 | image_buffer: string; raw JPEG image buffer. 174 | output_height: int; height of the image after preprocessing. 175 | output_width: int; width of the image after preprocessing. 176 | num_channels: int; depth of the image buffer for decoding. 177 | bbox: 3-D float Tensor; it contains the bounding boxes related to 178 | an image. Bounding box coordinates are in range [0, 1], 179 | arranged in order [ymin, xmin, ymax, xmax]. The Tensor is of 180 | shape [1, num_boxes, 4], where num_boxes is the number of 181 | bounding boxes related to the image. 182 | Returns: 183 | distorted_image: 3-D float Tensor; it contains an image. It is of 184 | size [H, W, C], where H is the image height, W is the image 185 | width, and C is the number of channels. 186 | """ 187 | 188 | # Create a bounding box by distorting an existing one (if it is provided). 189 | # The new bounding box should respect specific constraints, e.g., be within 190 | # a range of aspect ratios. If no bounding box is provided, the entire 191 | # image is considered the initial bounding box to be distorted. 
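# For illustration, with the arguments passed below: the sampled crop must
# contain at least 10% of a supplied bounding box (min_object_covered=0.1),
# have an aspect ratio in [0.5, 2.0], and cover 85-100% of the image area;
# if no valid crop is found after 50 attempts, the whole image is used.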
192 | sampled_distorted_bounding_box = tf.image.sample_distorted_bounding_box( 193 | tf.io.extract_jpeg_shape(image_buffer), 194 | bounding_boxes=bbox, 195 | min_object_covered=0.1, 196 | aspect_ratio_range=[0.5, 2.0], 197 | area_range=[0.85, 1.0], 198 | max_attempts=50, 199 | use_image_if_no_bounding_boxes=True, 200 | seed=0) 201 | bbox_begin, bbox_size, _ = sampled_distorted_bounding_box 202 | 203 | # Reassemble and crop the bounding box 204 | offset_y, offset_x, _ = tf.unstack(bbox_begin) 205 | target_height, target_width, _ = tf.unstack(bbox_size) 206 | crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) 207 | distorted_image = tf.image.decode_and_crop_jpeg(image_buffer, crop_window, channels=num_channels) 208 | distorted_image = tf.image.convert_image_dtype(distorted_image, dtype=tf.float32) 209 | 210 | # Resize the image. Select a resize method randomly. The image aspect ratio may change. 211 | resize_methods = [tf.image.ResizeMethod.BILINEAR, 212 | tf.image.ResizeMethod.LANCZOS3, 213 | tf.image.ResizeMethod.LANCZOS5, 214 | tf.image.ResizeMethod.BICUBIC, 215 | tf.image.ResizeMethod.GAUSSIAN, 216 | tf.image.ResizeMethod.NEAREST_NEIGHBOR, 217 | tf.image.ResizeMethod.AREA, 218 | tf.image.ResizeMethod.MITCHELLCUBIC] 219 | distorted_image = apply_with_random_selector(distorted_image, 220 | lambda x, resize_method: tf.image.resize(distorted_image, 221 | [output_height, output_width], 222 | method=resize_method, antialias=False), 223 | cases=resize_methods) 224 | 225 | # Restore image shape 226 | distorted_image.set_shape([output_height, output_width, num_channels]) 227 | 228 | # Perform a random horizontal flip of the image 229 | distorted_image = tf.image.random_flip_left_right(distorted_image) 230 | 231 | # Perform random color distortions 232 | distorted_image = apply_with_random_selector(distorted_image, 233 | lambda x, color_ordering: distort_color(x, color_ordering), 234 | cases=np.arange(5)) 235 | 236 | return distorted_image 237 | 238 | def preprocess_image(image_buffer, bbox, output_height, output_width, 239 | num_channels, is_train_dataset, is_training): 240 | """Preprocess an image. 241 | Args: 242 | image_buffer: string; encoded JPEG file. 243 | bbox: 3-D float Tensor; it contains the bounding boxes related to an 244 | image. Bounding box coordinates are in range [0, 1], arranged in 245 | order [ymin, xmin, ymax, xmax]. The Tensor is of shape 246 | [1, num_boxes, 4], where num_boxes is the number of bounding 247 | boxes related to the image. 248 | output_height: int; height of the image after preprocessing. 249 | output_width: int; width of the image after preprocessing. 250 | num_channels: int; depth of the image buffer for decoding. 251 | is_train_dataset: boolean; whether the input is the training 252 | or the validation set. 253 | is_training: boolean; whether the input will be used for training. 254 | Returns: 255 | image: 3-D float Tensor; it contains an image. It is of 256 | size [H, W, C], where H is the image height, W is 257 | the image width, and C is the number of channels. 
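Example:
A minimal sketch of the evaluation path (the image path and the
224x224 output size are hypothetical); pixel values end up in [-1, 1]:

img_bytes = tf.io.read_file('/path/to/image.JPEG')
img = preprocess_image(image_buffer=img_bytes,
bbox=tf.zeros([1, 0, 4], tf.float32),  # ignored on this path
output_height=224, output_width=224,
num_channels=3,
is_train_dataset=False, is_training=False)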
258 | """ 259 | 260 | if (is_train_dataset and is_training): 261 | # For training data during training, apply random distortions for data augmentation 262 | image = distort_image(image_buffer, output_height, output_width, num_channels, bbox) 263 | else: 264 | # Decode and resize the input image 265 | image = tf.image.decode_jpeg(image_buffer, channels=num_channels) 266 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 267 | image = tf.expand_dims(image, 0) 268 | image = tf.image.resize(image, [output_height, output_width], method=tf.image.ResizeMethod.BILINEAR, antialias=False) 269 | image = tf.squeeze(image, [0]) 270 | 271 | # Transform image values from range [0, 1], to [-1, 1] 272 | image = tf.subtract(image, 0.5) 273 | image = tf.multiply(image, 2.0) 274 | 275 | return image 276 | 277 | def parse_record(raw_record, is_train_dataset, is_training, 278 | img_size_y, img_size_x, dtype, adv_eval_data): 279 | """Parse a record containing a training example that corresponds to an image. 280 | Args: 281 | raw_record: string; serialized Example protocol buffer. 282 | is_train_dataset: boolean; whether the input is the training 283 | or the validation set. 284 | is_training: boolean; whether the input will be used for training. 285 | img_size_y: int; image height in pixels. 286 | img_size_x: int; image width in pixels. 287 | dtype: string; data type to use for images/features. 288 | adv_eval_data: boolean; whether to include information for advanced 289 | evaluation in the input batches. 290 | Returns: 291 | batch: tuple; it contains the following entries: 292 | image: 3-D float Tensor; it contains an image. It is of 293 | size [H, W, C], where H is the image height, W is 294 | the image width, and C is the number of channels. 295 | label: int; numeric image label. 296 | img_filename (optional): string; the filename of an image. 297 | img_synset (optional): string; the synset of an image. 298 | img_label_text (optional): string; the human-readable label of an image. 299 | """ 300 | 301 | # Parse Example protocol buffer 302 | if (not adv_eval_data): 303 | image_buffer, label, bbox = parse_example_proto(raw_record, adv_eval_data) 304 | else: 305 | (image_buffer, label, bbox, 306 | img_filename, img_synset, img_label_text) = parse_example_proto(raw_record, adv_eval_data) 307 | 308 | # Pre-process image 309 | image = preprocess_image(image_buffer=image_buffer, 310 | bbox=bbox, 311 | output_height=img_size_y, 312 | output_width=img_size_x, 313 | num_channels=NUM_CHANNELS, 314 | is_train_dataset=is_train_dataset, 315 | is_training=is_training) 316 | image = tf.cast(image, dtype) 317 | 318 | # Subtract 1 so that labels are in [0, 1000) range 319 | label = tf.cast(tf.cast(tf.reshape(label, shape=[1]), dtype=tf.int32) - 1, dtype=tf.float32) 320 | 321 | # Return batch 322 | if (not adv_eval_data): 323 | batch = (image, label) 324 | else: 325 | batch = (image, label, img_filename, img_synset, img_label_text) 326 | 327 | return batch 328 | 329 | def process_record_dataset(dataset, 330 | is_train_dataset, 331 | is_training, 332 | batch_size, 333 | img_size_y, 334 | img_size_x, 335 | shuffle_buffer, 336 | parse_record_fn, 337 | num_epochs=-1, 338 | dtype=tf.float32, 339 | drop_remainder=False, 340 | adv_eval_data=False): 341 | """Create input dataset from raw records. 342 | Args: 343 | dataset: tf dataset; dataset with raw records. 344 | is_train_dataset: boolean; whether the input is the training 345 | or the validation set. 
346 | is_training: boolean; whether the input will be used for training. 347 | batch_size: int; number of samples per batch (global, not per replica). 348 | img_size_y: int; image height in pixels. 349 | img_size_x: int; image width in pixels. 350 | shuffle_buffer: int; buffer size to use when shuffling records. A larger 351 | value results in higher randomness, but a smaller one reduces startup 352 | time and uses less memory. 353 | parse_record_fn: function; function that processes raw records. 354 | num_epochs: int; number of times to repeat the dataset. 355 | dtype: string; data type to use for images/features. 356 | drop_remainder: boolean; whether to drop the remainder of the 357 | batches. If True, the batch dimension will be static. 358 | adv_eval_data: boolean; whether to include information for advanced 359 | evaluation in the input batches. 360 | Returns: 361 | dataset: tf dataset; iterable input dataset. 362 | """ 363 | 364 | # Shuffle records before repeating, to respect epoch boundaries 365 | if (is_training): 366 | dataset = dataset.shuffle(buffer_size=shuffle_buffer) 367 | 368 | # Repeat dataset for the number of epochs to train 369 | if (num_epochs < 1): 370 | dataset = dataset.repeat() 371 | else: 372 | dataset = dataset.repeat(num_epochs) 373 | 374 | # Parse raw records 375 | dataset = dataset.map(lambda value: parse_record_fn(value, is_train_dataset, is_training, 376 | img_size_y, img_size_x, dtype, adv_eval_data), 377 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 378 | dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) 379 | 380 | # Operations between the final prefetch and the get_next call to the iterator 381 | # will happen synchronously during run time. Prefetch here again to 382 | # background all of the above processing work and keep it out of the 383 | # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE 384 | # allows DistributionStrategies to adjust how many batches to fetch based 385 | # on how many devices are present. 386 | dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) 387 | 388 | return dataset 389 | 390 | def input_fn(is_train_dataset, 391 | is_training, 392 | data_dir, 393 | batch_size, 394 | img_size_y, 395 | img_size_x, 396 | num_epochs=-1, 397 | dtype=tf.float32, 398 | parse_record_fn=parse_record, 399 | drop_remainder=True, 400 | filenames=None, 401 | adv_eval_data=False): 402 | """Prepare input batches. 403 | Args: 404 | is_train_dataset: boolean; whether the input is the training 405 | or the validation set. 406 | is_training: boolean; whether the input will be used for training. 407 | data_dir: string; directory containing the input data. 408 | batch_size: int; number of samples per batch (global, not per replica). 409 | img_size_y: int; image height in pixels. 410 | img_size_x: int; image width in pixels. 411 | num_epochs: int; number of times to repeat the dataset. 412 | dtype: string; data type to use for images/features. 413 | parse_record_fn: function; function that processes raw records. 414 | drop_remainder: boolean; indicates whether to drop the remainder of the 415 | batches. If True, the batch dimension will be static. 416 | filenames: list of strings; it contains paths to TFRecords. 417 | adv_eval_data: boolean; whether to include information for advanced 418 | evaluation in the input batches. 419 | Returns: 420 | input_dataset: tf dataset; iterable input dataset. 
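Example:
A minimal sketch for building a validation pipeline (the directory
path is hypothetical; shapes assume the default drop_remainder=True):

val_ds = input_fn(is_train_dataset=False, is_training=False,
data_dir='/path/to/TFRecords/dir/',
batch_size=64, img_size_y=224, img_size_x=224,
num_epochs=1)
images, labels = next(iter(val_ds))
# images: [64, 224, 224, 3], values in [-1, 1]; labels: [64, 1], values in [0, 1000)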
421 | """ 422 | 423 | # Get TFRecords paths 424 | if (filenames is None): 425 | filenames = get_filenames(is_train_dataset, data_dir) 426 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 427 | 428 | # Shuffle input files 429 | if (is_training): 430 | if (is_train_dataset): 431 | dataset = dataset.shuffle(buffer_size=TRAIN_SHARDS_NUM) 432 | else: 433 | dataset = dataset.shuffle(buffer_size=VAL_SHARDS_NUM) 434 | 435 | # Process input files concurrently 436 | dataset = dataset.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE) 437 | 438 | # Process TFRecords 439 | input_dataset = process_record_dataset(dataset=dataset, 440 | is_train_dataset=is_train_dataset, 441 | is_training=is_training, 442 | batch_size=batch_size, 443 | img_size_y=img_size_y, 444 | img_size_x=img_size_x, 445 | shuffle_buffer=_SHUFFLE_BUFFER, 446 | parse_record_fn=parse_record_fn, 447 | num_epochs=num_epochs, 448 | dtype=dtype, 449 | drop_remainder=drop_remainder, 450 | adv_eval_data=adv_eval_data) 451 | 452 | return input_dataset 453 | -------------------------------------------------------------------------------- /fMoW/create_TFRecords_fMoW.py: -------------------------------------------------------------------------------- 1 | """Convert fMoW images to TFRecords. 2 | Raw fMoW data can be downloaded here https://github.com/fMoW/dataset. 3 | The current script utilizes the rgb version of fMoW, and not the full version. 4 | fMoW data are split in training, validation and test sets. After download, for 5 | the training and validations sets, jpeg and json files are expected to reside 6 | in the following directory structure: 7 | /train/airport/airport_0/airport_0_0_rgb.jpg 8 | /train/airport/airport_0/airport_0_0_rgb.json 9 | ... 10 | 11 | /val/airport/airport_0/airport_0_0_rgb.jpg 12 | /val/airport/airport_0/airport_0_0_rgb.json 13 | ... 14 | 15 | For the test set, jpeg and json files are expected to reside 16 | in the following directory structure: 17 | /test/0011978/0011978_0_rgb.jpg 18 | /test/0011978/0011978_0_rgb.json 19 | ... 20 | 21 | Test set directory structure doesn't reveal the labels of the images, because 22 | it was initially realeased in the context of an IARPA challenge (https://www.iarpa.gov/challenges/fmow.html). 23 | However, given that the challenge is over, test set annotations are available 24 | for download with the rest of the data here https://github.com/fMoW/dataset. 25 | After downloding the ground truth test data, they consist of json files that 26 | reside in the following directory structure: 27 | /test_gt/airport/airport_0/airport_0_0_rgb.json 28 | /test_gt/airport/airport_0/airport_0_1_rgb.json 29 | ... 30 | 31 | The additional test_gt_mapping.json file is provided to establish a correspondance 32 | between the annotations under folder test_gt, and the images under folder test. To 33 | this end, we provide match_test_gt.py script, which organizes jpeg and json files 34 | for the test set, in the following directory structure: 35 | /test_matched_with_gt/airport/airport_0/airport_0_0_rgb.jpeg 36 | /test_matched_with_gt/airport/airport_0/airport_0_0_rgb.json 37 | ... 38 | 39 | Given the desired uniformity in the directory organization of the training, 40 | validation, and test sets is established, the current script converts image 41 | data to TFRecord files. Each record within a TFRecord file is a serialized 42 | Example proto. 
43 | """ 44 | 45 | from __future__ import absolute_import, division, print_function 46 | 47 | import argparse 48 | from datetime import datetime 49 | import os 50 | import random 51 | import sys 52 | import threading 53 | import json 54 | 55 | import numpy as np 56 | import six 57 | import tensorflow as tf 58 | 59 | 60 | 61 | parser = argparse.ArgumentParser() 62 | 63 | parser.add_argument('--train_directory', type=str, default='/train/', help='Training data directory.') 64 | parser.add_argument('--validation_directory', type=str, default='/val/', help='Validation data directory.') 65 | parser.add_argument('--test_directory', type=str, default='/test_matched_with_gt/', help='Test data directory.') 66 | parser.add_argument('--output_directory', type=str, default='/TFRecords/', help='Output data directory.') 67 | 68 | parser.add_argument('--train_shards', type=int, default=512, help='Number of shards in training TFRecord files.') 69 | parser.add_argument('--validation_shards', type=int, default=128, help='Number of shards in validation TFRecord files.') 70 | parser.add_argument('--test_shards', type=int, default=128, help='Number of shards in test TFRecord files.') 71 | parser.add_argument('--num_threads', type=int, default=32, help='Number of threads to parallelize processing.') 72 | parser.add_argument('--maximum_min_dim', type=int, default=1000, help='Maximum size allowed for the smallest image spatial dimension.') 73 | parser.add_argument('--cropped_data', action='store_true', help='Whether the provided data are cropped acoording to bounding boxes annotations.') 74 | 75 | FLAGS = parser.parse_args() 76 | 77 | category_names = ['airport', 'airport_hangar', 'airport_terminal', 'amusement_park', 'aquaculture', 'archaeological_site', 'barn', 'border_checkpoint', 'burial_site', 'car_dealership', 'construction_site', 78 | 'crop_field', 'dam', 'debris_or_rubble', 'educational_institution', 'electric_substation', 'factory_or_powerplant', 'fire_station', 'flooded_road', 'fountain', 'gas_station', 'golf_course', 79 | 'ground_transportation_station', 'helipad', 'hospital', 'interchange', 'lake_or_pond', 'lighthouse', 'military_facility', 'multi-unit_residential', 'nuclear_powerplant', 'office_building', 80 | 'oil_or_gas_facility', 'park', 'parking_lot_or_garage', 'place_of_worship', 'police_station', 'port', 'prison', 'race_track', 'railway_bridge', 'recreational_facility', 'impoverished_settlement', 81 | 'road_bridge', 'runway', 'shipyard', 'shopping_mall', 'single-unit_residential', 'smokestack', 'solar_farm', 'space_facility', 'stadium', 'storage_tank','surface_mine', 'swimming_pool', 82 | 'toll_booth', 'tower', 'tunnel_opening', 'waste_disposal', 'water_treatment_facility', 'wind_farm', 'zoo'] 83 | 84 | def clip_0_1(x): 85 | """Clip given float number within [0, 1] range. 86 | Args: 87 | x: float; value to clip. 88 | Returns: 89 | x: float; value within [0, 1] range. 90 | """ 91 | 92 | if (x < 0.): 93 | x = 0. 94 | elif (x > 1.0): 95 | x = 1.0 96 | 97 | return x 98 | 99 | def _int64_feature(value): 100 | """Insert int features into Example proto. 101 | Args: 102 | value: int or list of ints; features to insert 103 | in Example proto. 104 | Returns: 105 | feature: example proto; it contains a list of ints. 106 | """ 107 | 108 | if (not isinstance(value, list)): 109 | value = [value] 110 | 111 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=value)) 112 | 113 | return feature 114 | 115 | def _float_feature(value): 116 | """Insert float features into Example proto. 
117 | Args:
118 | value: float or list of floats; features to insert
119 | in Example proto.
120 | Returns:
121 | feature: Feature proto; it contains a list of floats.
122 | """
123 | 
124 | if (not isinstance(value, list)):
125 | value = [value]
126 | 
127 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=value))
128 | 
129 | return feature
130 | 
131 | def _bytes_feature(value):
132 | """Insert byte features into Example proto.
133 | Args:
134 | value: string or list of strings; features to
135 | insert in Example proto.
136 | Returns:
137 | feature: Feature proto; it contains a byte list.
138 | """
139 | 
140 | if (isinstance(value, type(tf.constant(0)))):
141 | value = value.numpy()
142 | if (six.PY3 and isinstance(value, six.text_type)):
143 | value = six.binary_type(value, encoding='utf-8')
144 | 
145 | feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
146 | 
147 | return feature
148 | 
149 | def _convert_to_example(filename, image_buffer, label,
150 | category, bbox, height, width):
151 | """Build an Example proto for an image.
152 | Args:
153 | filename: string; path to image file.
154 | image_buffer: string; JPEG encoded image.
155 | label: int; numeric ground truth label.
156 | category: string; human-readable label.
157 | bbox: list; it contains coordinates of bounding boxes.
158 | height: int; image height in pixels.
159 | width: int; image width in pixels.
160 | Returns:
161 | example: Example proto; it contains the following fields:
162 | image/height: int; image height in pixels.
163 | image/width: int; image width in pixels.
164 | image/colorspace: string; colorspace, always 'RGB'.
165 | image/channels: int; number of channels, always 3.
166 | image/class/label: int; index of a classification label in range [0, 61].
167 | image/class/text: string; human-readable label.
168 | image/object/bbox/ymin: list of floats; denotes the minimum vertical pixel value
169 | of a bounding box, in proportion to the image height. It takes values in
170 | [0, 1]. Each entry in the list corresponds to a different bounding box.
171 | image/object/bbox/xmin: list of floats; denotes the minimum horizontal pixel
172 | value of a bounding box, in proportion to the image width. It takes values
173 | in [0, 1]. Each entry in the list corresponds to a different bounding box.
174 | image/object/bbox/ymax: list of floats; denotes the maximum vertical pixel value
175 | of a bounding box, in proportion to the image height. It takes values in
176 | [0, 1]. Each entry in the list corresponds to a different bounding box.
177 | image/object/bbox/xmax: list of floats; denotes the maximum horizontal pixel
178 | value of a bounding box, in proportion to the image width. It takes values
179 | in [0, 1]. Each entry in the list corresponds to a different bounding box.
180 | image/object/bbox/label: list of ints; one entry per bounding box, each
181 | identical to the image label.
182 | image/format: string; image format, always 'JPEG'.
183 | image/filename: string; image file basename.
184 | image/encoded: string; JPEG encoded image.
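Example:
A minimal usage sketch (the filename, image buffer, box coordinates,
and image dimensions are hypothetical):

example = _convert_to_example(filename='/train/airport/airport_0/airport_0_0_rgb.jpg',
image_buffer=image_buffer, label=0, category='airport',
bbox=[[0.2, 0.1, 0.6, 0.9]],  # one box: [ymin, xmin, ymax, xmax]
height=1000, width=1200)
writer.write(example.SerializeToString())  # writer: an open tf.io.TFRecordWriter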
185 | """ 186 | 187 | b_ymin = [] 188 | b_xmin = [] 189 | b_ymax = [] 190 | b_xmax = [] 191 | for b in bbox: 192 | assert len(b) == 4 193 | [l.append(point) for l, point in zip([b_ymin, b_xmin, b_ymax, b_xmax], b)] 194 | 195 | colorspace = 'RGB' 196 | channels = 3 197 | image_format = 'JPEG' 198 | 199 | example = tf.train.Example(features=tf.train.Features(feature={ 200 | 'image/height': _int64_feature(height), 201 | 'image/width': _int64_feature(width), 202 | 'image/colorspace': _bytes_feature(colorspace), 203 | 'image/channels': _int64_feature(channels), 204 | 'image/class/label': _int64_feature(label), 205 | 'image/class/text': _bytes_feature(category), 206 | 'image/object/bbox/ymin': _float_feature(b_ymin), 207 | 'image/object/bbox/xmin': _float_feature(b_xmin), 208 | 'image/object/bbox/ymax': _float_feature(b_ymax), 209 | 'image/object/bbox/xmax': _float_feature(b_xmax), 210 | 'image/object/bbox/label': _int64_feature([label] * len(b_xmin)), 211 | 'image/format': _bytes_feature(image_format), 212 | 'image/filename': _bytes_feature(os.path.basename(filename)), 213 | 'image/encoded': _bytes_feature(image_buffer) 214 | })) 215 | 216 | return example 217 | 218 | def _process_image(filename, img_size): 219 | """Process a single image file. 220 | Args: 221 | filename: string; path to an image file. 222 | img_sizes: tuple of ints; it contains the spatial 223 | dimensions of an image. 224 | Returns: 225 | image_buffer: string; JPEG encoded image. 226 | height: int; image height in pixels. 227 | width: int; image width in pixels. 228 | """ 229 | 230 | # Read image file 231 | image_data = tf.io.read_file(filename) 232 | 233 | # Calculate decoding ratio to avoid overflow due to huge images 234 | min_dim = min(img_size) 235 | if (min_dim > 8 * FLAGS.maximum_min_dim): 236 | ratio = 8 237 | elif (min_dim > 4 * FLAGS.maximum_min_dim): 238 | ratio = 4 239 | elif (min_dim > 2 * FLAGS.maximum_min_dim): 240 | ratio = 2 241 | else: 242 | ratio = 1 243 | image = tf.io.decode_jpeg(image_data, ratio=ratio, channels=3) 244 | 245 | # Ensure smallest image dimension does not exceed FLAGS.maximum_min_dim 246 | height = image.shape[0] 247 | width = image.shape[1] 248 | min_dim = min([height, width]) 249 | if (min_dim > FLAGS.maximum_min_dim): 250 | if (height == min_dim): 251 | new_height = FLAGS.maximum_min_dim 252 | new_width = np.ceil(float(new_height) * (float(width)/float(height))) 253 | else: 254 | new_width = FLAGS.maximum_min_dim 255 | new_height = np.ceil(float(new_width) * (float(height)/float(width))) 256 | 257 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 258 | image = tf.image.resize(tf.expand_dims(image, axis=0), size=[int(new_height), int(new_width)], 259 | preserve_aspect_ratio=True, method=tf.image.ResizeMethod.BILINEAR) 260 | image = tf.squeeze(image) 261 | 262 | # Assert that the image has the appropriate dimensions 263 | assert (len(image.shape) == 3) 264 | assert (image.shape[2] == 3) 265 | height = image.shape[0] 266 | width = image.shape[1] 267 | assert ((height <= FLAGS.maximum_min_dim) or (width <= FLAGS.maximum_min_dim)) 268 | 269 | # Encode the image, if it was processed 270 | if ((min_dim > FLAGS.maximum_min_dim) or (ratio != 1)): 271 | image = tf.image.convert_image_dtype(image, dtype=tf.uint8) 272 | image_data = tf.image.encode_jpeg(image, format='rgb', quality=100) 273 | 274 | return image_data, height, width 275 | 276 | def _process_image_files_batch(thread_index, ranges, name, filenames, 277 | labels, categories, bboxes, img_sizes, num_shards): 278 | """Execute 
1 thread that processes images and saves them as TFRecords 279 | of Example protos. 280 | Args: 281 | thread_index: int; unique thread identifier. 282 | ranges: list of ints; it contains the range of images to 283 | process. 284 | name: string; unique identifier specifying the data set. 285 | filenames: list of strings; it contains paths to image files. 286 | labels: list of ints; it contains numeric ground truth labels. 287 | categories: list of strings; it contains human-readable ground 288 | truth labels. 289 | bboxes: list; it contains bounding boxes for each image. 290 | img_sizes: list of tuples; each tuple contains the spatial 291 | dimensions of an image. 292 | num_shards: int; number of shards. 293 | Returns: 294 | - 295 | """ 296 | 297 | # Each thread produces N shards where N = int(num_shards / num_threads). 298 | # For instance, if num_shards = 128, and the num_threads = 2, then the first 299 | # thread would produce shards [0, 64) 300 | num_threads = len(ranges) 301 | assert not num_shards % num_threads 302 | num_shards_per_batch = int(num_shards / num_threads) 303 | 304 | shard_ranges = np.linspace(ranges[thread_index][0], 305 | ranges[thread_index][1], 306 | num_shards_per_batch + 1).astype(int) 307 | num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0] 308 | 309 | # Generate each shard 310 | counter = 0 311 | for s in range(num_shards_per_batch): 312 | shard = thread_index * num_shards_per_batch + s 313 | output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards) 314 | output_file = os.path.join(FLAGS.output_directory, output_filename) 315 | writer = tf.io.TFRecordWriter(output_file) 316 | 317 | # Process each file for a shard 318 | shard_counter = 0 319 | files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) 320 | for i in files_in_shard: 321 | filename = filenames[i] 322 | label = labels[i] 323 | category = categories[i] 324 | bbox = bboxes[i] 325 | img_size = img_sizes[i] 326 | 327 | # Process an image 328 | image_buffer, height, width = _process_image(filename, img_size) 329 | 330 | # Create an Example proto 331 | example = _convert_to_example(filename, image_buffer, label, 332 | category, bbox, height, width) 333 | 334 | # Write to TFRecord 335 | writer.write(example.SerializeToString()) 336 | shard_counter += 1 337 | counter += 1 338 | 339 | if (not (counter % 1000)): 340 | print('%s [thread %d]: Processed %d of %d images in thread batch.' 341 | %(datetime.now(), thread_index, counter, num_files_in_thread)) 342 | sys.stdout.flush() 343 | 344 | writer.close() 345 | print('%s [thread %d]: Wrote %d images to %s' 346 | %(datetime.now(), thread_index, shard_counter, output_file)) 347 | sys.stdout.flush() 348 | shard_counter = 0 349 | print('%s [thread %d]: Wrote %d images to %d shards.' 350 | %(datetime.now(), thread_index, counter, num_files_in_thread)) 351 | sys.stdout.flush() 352 | 353 | def _process_image_files(name, filenames, labels, categories, bboxes, img_sizes, num_shards): 354 | """Process images and save them as TFRecords of Example protos. 355 | Args: 356 | name: string; unique identifier specifying the data set. 357 | filenames: list of strings; it contains paths to image files. 358 | labels: list of ints; it contains numeric ground truth labels. 359 | categories: list of strings; it contains human-readable ground 360 | truth labels. 361 | bboxes: list; it contains bounding boxes for each image. 362 | img_sizes: list of tuples; each tuple contains the spatial 363 | dimensions of an image. 
364 | num_shards: int; number of shards.
365 | Returns:
366 | -
367 | """
368 | 
369 | assert len(filenames) == len(labels) == len(categories) == len(bboxes) == len(img_sizes)
370 | 
371 | # Break images into batches
372 | spacing = np.linspace(0, len(filenames), FLAGS.num_threads + 1).astype(int)  # np.int was removed in recent NumPy; use the builtin int
373 | ranges = []
374 | for i in range(len(spacing) - 1):
375 | ranges.append([spacing[i], spacing[i + 1]])
376 | 
377 | # Launch a thread for each batch
378 | print('Launching %d threads for spacings: %s' % (FLAGS.num_threads, ranges))
379 | sys.stdout.flush()
380 | 
381 | # Create a mechanism for monitoring threads' execution
382 | coord = tf.train.Coordinator()
383 | 
384 | # Run threads
385 | threads = []
386 | for thread_index in range(len(ranges)):
387 | args = (thread_index, ranges, name, filenames,
388 | labels, categories, bboxes, img_sizes, num_shards)
389 | t = threading.Thread(target=_process_image_files_batch, args=args)
390 | t.start()
391 | threads.append(t)
392 | 
393 | # Wait for all the threads to terminate
394 | coord.join(threads)
395 | print('%s: Finished writing all %d images in data set.' %(datetime.now(), len(filenames)))
396 | sys.stdout.flush()
397 | 
398 | def _find_image_files(data_dir):
399 | """Build lists of all image file paths, numeric labels, and
400 | human-readable labels in a data set.
401 | Args:
402 | data_dir: string; path to data set.
403 | Returns:
404 | filenames: list of strings; it contains paths to image files.
405 | labels: list of ints; it contains numeric ground truth labels.
406 | categories: list of strings; it contains human-readable ground
407 | truth labels.
408 | """
409 | 
410 | print('Determining list of input files and labels from %s.' % data_dir)
411 | sys.stdout.flush()
412 | filenames = []
413 | labels = []
414 | categories = []
415 | 
416 | # Construct the list of JPEG files and labels
417 | label_index = 0
418 | for category in category_names:
419 | if (not FLAGS.cropped_data):
420 | jpeg_file_path = os.path.join(data_dir, category, '*', category + '_*_rgb.jpg')
421 | else:
422 | jpeg_file_path = os.path.join(data_dir, category, '*', '*', category + '_*_rgb.jpg')
423 | matching_files = tf.io.gfile.glob(jpeg_file_path)
424 | 
425 | filenames.extend(matching_files)
426 | labels.extend([label_index] * len(matching_files))
427 | categories.extend([category] * len(matching_files))
428 | 
429 | if (not (label_index % 10)):
430 | print('Finished finding files in %d of %d classes.' % (label_index, len(category_names)))
431 | sys.stdout.flush()
432 | 
433 | label_index += 1
434 | 
435 | # Shuffle the ordering of all image files in order to guarantee
436 | # random ordering of the images with respect to labels in the
437 | # saved TFRecord files. Make the randomization repeatable.
438 | shuffled_index = list(range(len(filenames)))
439 | random.seed(12345)
440 | random.shuffle(shuffled_index)
441 | 
442 | filenames = [filenames[i] for i in shuffled_index]
443 | labels = [labels[i] for i in shuffled_index]
444 | categories = [categories[i] for i in shuffled_index]
445 | 
446 | print('Found %d .jpg files across %d labels inside %s.'
447 | %(len(filenames), len(category_names), data_dir))
448 | sys.stdout.flush()
449 | 
450 | return filenames, labels, categories
451 | 
452 | def _find_image_bounding_boxes(filenames, categories):
453 | """Find the bounding boxes for given image files.
454 | Args:
455 | filenames: list of strings; it contains paths to image files.
456 | categories: list of strings; it contains human-readable ground
457 | truth labels.
458 | Returns:
459 | bboxes: list; it contains bounding boxes for each image.
460 | img_sizes: list of tuples; each tuple contains the spatial
461 | dimensions of an image.
462 | """
463 | 
464 | num_image_bbox = 0
465 | bbox_num = 0
466 | bboxes = []
467 | img_sizes = []
468 | # Iterate over image files
469 | for i in range(len(filenames)):
470 | f = filenames[i]
471 | category = categories[i]
472 | 
473 | f_json = f.replace('.jpg', '.json')
474 | jsonData = json.load(open(f_json))
475 | 
476 | json_bboxes = jsonData['bounding_boxes']
477 | if not isinstance(json_bboxes, list):
478 | json_bboxes = [json_bboxes]
479 | 
480 | h = float(jsonData['img_height'])
481 | w = float(jsonData['img_width'])
482 | # Iterate over available bounding boxes for an image file
483 | bb_lst = []
484 | if (not FLAGS.cropped_data):
485 | for bb in json_bboxes:
486 | if ((bb['category'] != category) or (bb['ID'] == -1)):
487 | continue
488 | # Change box format from [xmin, ymin, width, height] to
489 | # [ymin, xmin, ymax, xmax], with values as fractions of the image size
490 | bb['box'] = [float(e) for e in bb['box']]
491 | ymin = bb['box'][1] / h
492 | ymin = clip_0_1(ymin)
493 | xmin = bb['box'][0] / w
494 | xmin = clip_0_1(xmin)
495 | ymax = (bb['box'][1] + bb['box'][3]) / h
496 | ymax = clip_0_1(ymax)
497 | xmax = (bb['box'][0] + bb['box'][2]) / w
498 | xmax = clip_0_1(xmax)
499 | bb_lst.append([ymin, xmin, ymax, xmax])
500 | 
501 | if (len(bb_lst) > 0):
502 | num_image_bbox += 1
503 | bbox_num += len(bb_lst)
504 | else:
505 | # Cropped images result from crop_fMoW.py,
506 | # and bounding boxes are standardized
507 | assert len(jsonData['bounding_boxes']) == 1
508 | 
509 | bb = jsonData['bounding_boxes'][0]
510 | assert bb['category'] == category
511 | 
512 | box = bb['box']
513 | assert ((box[0] == 0.) and (box[1] == 0.) and (box[2] == 1.0) and (box[3] == 1.0))
514 | 
515 | bb_lst.append(box)
516 | num_image_bbox += 1
517 | bbox_num += 1
518 | 
519 | bboxes.append(bb_lst)
520 | img_sizes.append([h, w])
521 | 
522 | print('Found %d images with %d bboxes out of %d images'
523 | %(num_image_bbox, bbox_num, len(filenames)))
524 | sys.stdout.flush()
525 | 
526 | return bboxes, img_sizes
527 | 
528 | def _process_dataset(name, directory, num_shards):
529 | """Process a complete data set and save it in TFRecords.
530 | Args:
531 | name: string; unique identifier specifying the data set.
532 | directory: string; path to data set.
533 | num_shards: int; number of shards.
534 | Returns:
535 | -
536 | """
537 | 
538 | filenames, labels, categories = _find_image_files(directory)
539 | bboxes, img_sizes = _find_image_bounding_boxes(filenames, categories)
540 | _process_image_files(name, filenames, labels, categories, bboxes, img_sizes, num_shards)
541 | 
542 | def main(argv=None):
543 | """Convert fMoW training, validation, and test images to TFRecords.
544 | Args:
545 | -
546 | Returns:
547 | -
548 | """
549 | 
550 | assert not FLAGS.train_shards % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.train_shards')
551 | assert not FLAGS.validation_shards % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.validation_shards')
552 | assert not FLAGS.test_shards % FLAGS.num_threads, ('Please make the FLAGS.num_threads commensurate with FLAGS.test_shards')
553 | 
554 | if (not os.path.isdir(FLAGS.output_directory)):
555 | os.makedirs(FLAGS.output_directory)
556 | print('Saving results to %s' % FLAGS.output_directory)
557 | sys.stdout.flush()
558 | 
559 | # Create TFRecords
560 | _process_dataset('validation', FLAGS.validation_directory, FLAGS.validation_shards)
561 | _process_dataset('test', FLAGS.test_directory, FLAGS.test_shards)
562 | _process_dataset('train', FLAGS.train_directory, FLAGS.train_shards)
563 | 
564 | if __name__ == '__main__':
565 | main()
--------------------------------------------------------------------------------