├── .gitignore
├── LICENSE
├── README.md
├── config.json
├── create_pb.py
├── data
│   ├── create_tfrecords.py
│   ├── explore_and_prepare_CelebA.ipynb
│   ├── procrustes.py
│   └── test_input_pipeline.ipynb
├── inference
│   ├── example.jpg
│   ├── face_detector.py
│   ├── landmark_detector.py
│   ├── the_office.jpg
│   └── try_detector.ipynb
├── input_pipeline
│   ├── __init__.py
│   ├── augmentations.py
│   └── input_pipeline.py
├── loss.py
├── metrics.py
├── model.py
├── network.py
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | __pycache__
3 | models/
4 | export/
5 | *.pb
6 | *.csv
7 | *.npy
8 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Dan Antoshchenko
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Wing Loss
2 | 
3 | This is an implementation of the loss function from
4 | [Wing Loss for Robust Facial Landmark Localisation with Convolutional Neural Networks](https://arxiv.org/abs/1711.06753).
5 | 
6 | ## How to use a pretrained model
7 | 1. Download a pretrained model from [here](https://drive.google.com/drive/folders/1yCGoE6wC8ZOVDX8DekkEtZZIxfP0wnVU?usp=sharing).
8 | 2. See an example of usage in `inference/try_detector.ipynb`.
9 | 
10 | ## Example
11 | ![example](inference/example.jpg)
12 | 
13 | ## Notes
14 | 1. I didn't train on any of the datasets used in the paper.
15 | 2. I simply trained on the CelebA dataset (it has five landmark locations for each face).
16 | 3. I use a detector from [here](https://github.com/TropComplique/FaceBoxes-tensorflow) to detect faces.
17 | 4. The inference speed is ~0.15 ms per image (video card is NVIDIA GeForce GTX 1080 Ti, batch size is 8).
18 | 5. I used Procrustes analysis for data balancing (see `data/explore_and_prepare_CelebA.ipynb`).
19 | 
20 | ## Requirements
21 | 1. tensorflow 1.12
22 | 2. 
numpy, Pillow, tqdm 23 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_dir": "models/run00", 3 | 4 | "num_landmarks": 5, "weight_decay": 1e-4, 5 | "epsilon": 2.0, "w": 10.0, 6 | "initial_lr": 4e-4, 7 | 8 | "image_size": [64, 64], 9 | "batch_size": 16, 10 | "train_dataset": "/mnt/datasets/dan/CelebA/train_shards/", 11 | "val_dataset": "/mnt/datasets/dan/CelebA/val_shards/", 12 | "num_steps": 180000 13 | } 14 | -------------------------------------------------------------------------------- /create_pb.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import shutil 4 | import json 5 | from model import model_fn 6 | 7 | 8 | """ 9 | The purpose of this script is to export 10 | the inference graph as a SavedModel. 11 | 12 | Also it creates a .pb frozen inference graph. 13 | """ 14 | 15 | 16 | OUTPUT_FOLDER = 'export/' # for savedmodel 17 | PB_FILE_PATH = 'inference/model.pb' 18 | CONFIG = 'config.json' 19 | GPU_TO_USE = '0' 20 | 21 | params = json.load(open(CONFIG)) 22 | WIDTH, HEIGHT = params['image_size'] 23 | 24 | 25 | def export_savedmodel(): 26 | config = tf.ConfigProto() 27 | config.gpu_options.visible_device_list = GPU_TO_USE 28 | run_config = tf.estimator.RunConfig() 29 | run_config = run_config.replace( 30 | model_dir=params['model_dir'], 31 | session_config=config 32 | ) 33 | estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config) 34 | 35 | def serving_input_receiver_fn(): 36 | images = tf.placeholder(dtype=tf.uint8, shape=[None, HEIGHT, WIDTH, 3], name='images') 37 | features = tf.to_float(images) * (1.0/255.0) 38 | return tf.estimator.export.TensorServingInputReceiver(features=features, receiver_tensors={'images': images}) 39 | 40 | shutil.rmtree(OUTPUT_FOLDER, ignore_errors=True) 41 | os.mkdir(OUTPUT_FOLDER) 42 | estimator.export_savedmodel(OUTPUT_FOLDER, serving_input_receiver_fn) 43 | 44 | 45 | def convert_to_pb(): 46 | 47 | subfolders = os.listdir(OUTPUT_FOLDER) 48 | assert len(subfolders) == 1 49 | last_saved_model = os.path.join(OUTPUT_FOLDER, subfolders[0]) 50 | 51 | graph = tf.Graph() 52 | config = tf.ConfigProto() 53 | config.gpu_options.visible_device_list = GPU_TO_USE 54 | 55 | with graph.as_default(): 56 | with tf.Session(graph=graph, config=config) as sess: 57 | tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], last_saved_model) 58 | 59 | # output ops 60 | keep_nodes = ['landmarks'] 61 | 62 | input_graph_def = tf.graph_util.convert_variables_to_constants( 63 | sess, graph.as_graph_def(), 64 | output_node_names=keep_nodes 65 | ) 66 | output_graph_def = tf.graph_util.remove_training_nodes( 67 | input_graph_def, protected_nodes=keep_nodes 68 | ) 69 | 70 | with tf.gfile.GFile(PB_FILE_PATH, 'wb') as f: 71 | f.write(output_graph_def.SerializeToString()) 72 | print('%d ops in the final graph.' 
% len(output_graph_def.node)) 73 | 74 | 75 | tf.logging.set_verbosity('INFO') 76 | export_savedmodel() 77 | convert_to_pb() 78 | -------------------------------------------------------------------------------- /data/create_tfrecords.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import PIL.Image 4 | import tensorflow as tf 5 | import json 6 | import shutil 7 | import random 8 | import math 9 | import argparse 10 | from tqdm import tqdm 11 | 12 | 13 | """ 14 | The purpose of this script is to create a set of .tfrecords files 15 | from a folder of images and a folder of annotations. 16 | Annotations are in the json format. 17 | Images must have .jpg or .jpeg filename extension. 18 | 19 | Example of a json annotation (with filename "132416.json"): 20 | { 21 | "box": {"ymin": 1, "ymax": 248, "xmax": 1149, "xmin": 1014}, 22 | "landmarks": [[102, 98], [135, 109], [121, 132], [85, 134], [117, 144]] 23 | "filename": "132416.jpg", 24 | "size": {"depth": 3, "width": 356, "height": 570} 25 | } 26 | 27 | Landmarks are in the following order: 28 | [[lefteye_x lefteye_y] 29 | [righteye_x righteye_y] 30 | [nose_x nose_y] 31 | [leftmouth_x leftmouth_y] 32 | [rightmouth_x rightmouth_y]] 33 | 34 | Example of use: 35 | python create_tfrecords.py \ 36 | --image_dir=/mnt/datasets/dan/CelebA/train/images/ \ 37 | --annotations_dir=/mnt/datasets/dan/CelebA/train/annotations/ \ 38 | --output=/mnt/datasets/dan/CelebA/train_shards/ \ 39 | --num_shards=800 40 | 41 | python create_tfrecords.py \ 42 | --image_dir=/mnt/datasets/dan/CelebA/val/images/ \ 43 | --annotations_dir=/mnt/datasets/dan/CelebA/val/annotations/ \ 44 | --output=/mnt/datasets/dan/CelebA/val_shards/ \ 45 | --num_shards=1 46 | """ 47 | 48 | 49 | def make_args(): 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument('-i', '--image_dir', type=str) 52 | parser.add_argument('-a', '--annotations_dir', type=str) 53 | parser.add_argument('-o', '--output', type=str) 54 | parser.add_argument('-s', '--num_shards', type=int, default=1) 55 | return parser.parse_args() 56 | 57 | 58 | def dict_to_tf_example(annotation, image_dir): 59 | """Convert dict to tf.Example proto. 60 | 61 | Notice that this function normalizes the bounding 62 | box coordinates provided by the raw data. 63 | 64 | Arguments: 65 | data: a dict. 66 | image_dir: a string, path to the image directory. 67 | Returns: 68 | an instance of tf.Example. 
69 | """ 70 | image_name = annotation['filename'] 71 | assert image_name.endswith('.jpg') or image_name.endswith('.jpeg') 72 | 73 | image_path = os.path.join(image_dir, image_name) 74 | with tf.gfile.GFile(image_path, 'rb') as f: 75 | encoded_jpg = f.read() 76 | 77 | # check image format 78 | encoded_jpg_io = io.BytesIO(encoded_jpg) 79 | image = PIL.Image.open(encoded_jpg_io) 80 | assert image.format == 'JPEG' 81 | assert image.mode == 'RGB' 82 | 83 | width = int(annotation['size']['width']) 84 | height = int(annotation['size']['height']) 85 | assert width > 0 and height > 0 86 | assert image.size[0] == width and image.size[1] == height 87 | 88 | ymin = float(annotation['box']['ymin'])/height 89 | xmin = float(annotation['box']['xmin'])/width 90 | ymax = float(annotation['box']['ymax'])/height 91 | xmax = float(annotation['box']['xmax'])/width 92 | assert (ymin < ymax) and (xmin < xmax) 93 | 94 | # note that i reversing the order of the coordinates here 95 | landmarks = annotation['landmarks'] 96 | landmarks_flattened = [] 97 | for x, y in landmarks: 98 | y, x = y/height, x/width 99 | assert y <= ymax and y >= ymin 100 | assert x <= xmax and x >= xmin 101 | landmarks_flattened.extend([y, x]) 102 | 103 | example = tf.train.Example(features=tf.train.Features(feature={ 104 | 'image': _bytes_feature(encoded_jpg), 105 | 'xmin': _float_feature(xmin), 106 | 'xmax': _float_feature(xmax), 107 | 'ymin': _float_feature(ymin), 108 | 'ymax': _float_feature(ymax), 109 | 'landmarks': _float_list_feature(landmarks_flattened), 110 | })) 111 | return example 112 | 113 | 114 | def _bytes_feature(value): 115 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 116 | 117 | 118 | def _float_list_feature(value): 119 | return tf.train.Feature(float_list=tf.train.FloatList(value=value)) 120 | 121 | 122 | def _float_feature(value): 123 | return tf.train.Feature(float_list=tf.train.FloatList(value=[value])) 124 | 125 | 126 | def main(): 127 | ARGS = make_args() 128 | 129 | image_dir = ARGS.image_dir 130 | annotations_dir = ARGS.annotations_dir 131 | print('Reading images from:', image_dir) 132 | print('Reading annotations from:', annotations_dir, '\n') 133 | 134 | examples_list = os.listdir(annotations_dir) 135 | random.shuffle(examples_list) 136 | num_examples = len(examples_list) 137 | print('Number of images:', num_examples) 138 | 139 | num_shards = ARGS.num_shards 140 | shard_size = math.ceil(num_examples/num_shards) 141 | print('Number of images per shard:', shard_size) 142 | 143 | output_dir = ARGS.output 144 | shutil.rmtree(output_dir, ignore_errors=True) 145 | os.mkdir(output_dir) 146 | 147 | shard_id = 0 148 | num_examples_written = 0 149 | for example in tqdm(examples_list): 150 | 151 | if num_examples_written == 0: 152 | shard_path = os.path.join(output_dir, 'shard-%04d.tfrecords' % shard_id) 153 | writer = tf.python_io.TFRecordWriter(shard_path) 154 | 155 | path = os.path.join(annotations_dir, example) 156 | annotation = json.load(open(path)) 157 | tf_example = dict_to_tf_example(annotation, image_dir) 158 | writer.write(tf_example.SerializeToString()) 159 | num_examples_written += 1 160 | 161 | if num_examples_written == shard_size: 162 | shard_id += 1 163 | num_examples_written = 0 164 | writer.close() 165 | 166 | if num_examples_written != 0: 167 | writer.close() 168 | 169 | print('Result is here:', ARGS.output) 170 | 171 | 172 | main() 173 | -------------------------------------------------------------------------------- /data/explore_and_prepare_CelebA.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import json\n", 11 | "from PIL import Image, ImageDraw\n", 12 | "import os\n", 13 | "import cv2\n", 14 | "import pandas as pd\n", 15 | "from tqdm import tqdm\n", 16 | "import shutil\n", 17 | "import random\n", 18 | "\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from procrustes import procrustes\n", 23 | "from sklearn.decomposition import PCA\n", 24 | "\n", 25 | "import sys\n", 26 | "sys.path.append('../inference/')\n", 27 | "from face_detector import FaceDetector\n", 28 | "# this face detector is taken from here\n", 29 | "# https://github.com/TropComplique/FaceBoxes-tensorflow\n", 30 | "# (facial keypoints detector will be trained to work well with this detector)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "The purpose of this script is to explore images/annotations of the CelebA dataset. \n", 38 | "Also it cleans CelebA. \n", 39 | "Also it converts annotations into json format." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "IMAGES_DIR = '/home/gpu2/hdd/dan/CelebA/img_celeba.7z/out/'\n", 49 | "ANNOTATIONS_PATH = '/home/gpu2/hdd/dan/CelebA/list_landmarks_celeba.txt'\n", 50 | "SPLIT_PATH = '/home/gpu2/hdd/dan/CelebA/list_eval_partition.txt'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "# Read data" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# collect paths to all images\n", 67 | "\n", 68 | "all_paths = []\n", 69 | "for name in tqdm(os.listdir(IMAGES_DIR)):\n", 70 | " all_paths.append(os.path.join(IMAGES_DIR, name))\n", 71 | "\n", 72 | "metadata = pd.DataFrame(all_paths, columns=['full_path'])\n", 73 | "\n", 74 | "# strip root folder\n", 75 | "metadata['name'] = metadata.full_path.apply(lambda x: os.path.relpath(x, IMAGES_DIR))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# number of images is taken from the official website\n", 85 | "assert len(metadata) == 202599" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# see all unique endings\n", 95 | "metadata.name.apply(lambda x: x.split('.')[-1]).unique()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Detect a face on each image" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# load faceboxes detector\n", 112 | "face_detector = FaceDetector('../inference/model-step-240000.pb', visible_device_list='0')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "detections = []\n", 122 | "for p in tqdm(metadata.full_path):\n", 123 | " image = cv2.imread(p)\n", 124 | " image = image[:, :, [2, 1, 0]] # to RGB\n", 125 | " detections.append(face_detector(image))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | 
"execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# take only images where one high confidence box is detected\n", 135 | "bad_images = [metadata.name[i] for i, (b, s) in enumerate(detections) if len(b) != 1 or s.max() < 0.5]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "boxes = {}\n", 145 | "for n, (box, score) in zip(metadata.name, detections):\n", 146 | " if n not in bad_images:\n", 147 | " ymin, xmin, ymax, xmax = box[0]\n", 148 | " boxes[n] = (xmin, ymin, xmax, ymax)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Read keypoints from annotations" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "def get_numbers(s):\n", 165 | " s = s.strip().split(' ')\n", 166 | " return [s[0]] + [int(i) for i in s[1:] if i]\n", 167 | " \n", 168 | "with open(ANNOTATIONS_PATH, 'r') as f:\n", 169 | " content = f.readlines()\n", 170 | " content = content[2:]\n", 171 | " content = [get_numbers(s) for s in content]\n", 172 | "\n", 173 | "landmarks = {}\n", 174 | "more_bad_images = []\n", 175 | "for i in content:\n", 176 | " name = i[0]\n", 177 | " \n", 178 | " keypoints = [\n", 179 | " [i[1], i[2]], # lefteye_x lefteye_y \n", 180 | " [i[3], i[4]], # righteye_x righteye_y\n", 181 | " [i[5], i[6]], # nose_x nose_y \n", 182 | " [i[7], i[8]], # leftmouth_x leftmouth_y\n", 183 | " [i[9], i[10]], # rightmouth_x rightmouth_y\n", 184 | " ]\n", 185 | " \n", 186 | " # assert that landmarks are inside the box\n", 187 | " if name in bad_images:\n", 188 | " continue\n", 189 | " xmin, ymin, xmax, ymax = boxes[name]\n", 190 | " points = np.array(keypoints)\n", 191 | " is_normal = (points[:, 0] > xmin).all() and\\\n", 192 | " (points[:, 0] < xmax).all() and\\\n", 193 | " (points[:, 1] > ymin).all() and\\\n", 194 | " (points[:, 1] < ymax).all()\n", 195 | " if not is_normal:\n", 196 | " more_bad_images.append(name)\n", 197 | "\n", 198 | " landmarks[name] = keypoints " 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# number of weird landmarks\n", 208 | "len(more_bad_images)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "to_remove = more_bad_images + bad_images\n", 218 | "metadata = metadata.loc[~metadata.name.isin(to_remove)]\n", 219 | "metadata = metadata.reset_index(drop=True)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# backup results\n", 229 | "metadata.to_csv('metadata.csv')\n", 230 | "np.save('boxes.npy', boxes)\n", 231 | "np.save('landmarks.npy', landmarks)\n", 232 | "np.save('to_remove.npy', to_remove)\n", 233 | "\n", 234 | "# metadata = pd.read_csv('metadata.csv', index_col=0)\n", 235 | "# boxes = np.load('boxes.npy')[()]\n", 236 | "# landmarks = np.load('landmarks.npy')[()]\n", 237 | "# to_remove = np.load('to_remove.npy')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "# size after cleaning\n", 247 | "len(metadata)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | 
"source": [ 254 | "# Show some bounding boxes and landmarks" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "def draw_boxes_on_image(path, box, keypoints):\n", 264 | "\n", 265 | " image = Image.open(path)\n", 266 | " draw = ImageDraw.Draw(image, 'RGBA')\n", 267 | "\n", 268 | " xmin, ymin, xmax, ymax = box\n", 269 | " fill = (255, 255, 255, 45)\n", 270 | " outline = 'red'\n", 271 | " draw.rectangle(\n", 272 | " [(xmin, ymin), (xmax, ymax)],\n", 273 | " fill=fill, outline=outline\n", 274 | " )\n", 275 | " \n", 276 | " for x, y in keypoints:\n", 277 | " draw.ellipse([\n", 278 | " (x - 2.0, y - 2.0),\n", 279 | " (x + 2.0, y + 2.0)\n", 280 | " ], outline='red')\n", 281 | "\n", 282 | " return image" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "scrolled": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "i = random.randint(0, len(metadata) - 1) # choose a random image\n", 294 | "some_boxes = boxes[metadata.name[i]]\n", 295 | "keypoints = landmarks[metadata.name[i]]\n", 296 | "draw_boxes_on_image(metadata.full_path[i], some_boxes, keypoints)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "# Procrustes analysis (Pose-based Data Balancing strategy)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "landmarks_array = []\n", 313 | "boxes_array = []\n", 314 | "for n in metadata.name:\n", 315 | " landmarks_array.append(np.array(landmarks[n]))\n", 316 | " boxes_array.append(np.array(boxes[n]))\n", 317 | "\n", 318 | "landmarks_array = np.stack(landmarks_array, axis=0)\n", 319 | "landmarks_array = landmarks_array.astype('float32')\n", 320 | "boxes_array = np.stack(boxes_array)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "mean_shape = landmarks_array.mean(0) # reference shape\n", 330 | "num_images = len(landmarks_array)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "aligned = []\n", 340 | "for shape in tqdm(landmarks_array):\n", 341 | " Z, _ = procrustes(mean_shape, shape, reflection=False)\n", 342 | " aligned.append(Z)\n", 343 | "aligned = np.stack(aligned)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "pca = PCA(n_components=1)\n", 353 | "projected = pca.fit_transform(aligned.reshape((-1, 10)))\n", 354 | "projected = projected[:, 0]\n", 355 | "\n", 356 | "plt.hist(projected, bins=40);" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "# frontal faces:\n", 366 | "indices = np.where(np.abs(projected) < 5)[0]\n", 367 | "\n", 368 | "# faces turned to the left:\n", 369 | "# indices = np.where(projected > 15)[0]\n", 370 | "\n", 371 | "# faces turned to the right:\n", 372 | "# indices = np.where(projected < -30)[0]\n", 373 | "\n", 374 | "i = indices[random.randint(0, len(indices) - 1)]\n", 375 | "some_boxes = boxes[metadata.name[i]]\n", 376 | "keypoints = landmarks[metadata.name[i]]\n", 377 | "draw_boxes_on_image(metadata.full_path[i], some_boxes, keypoints)" 378 | ] 379 | 
}, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# it is not strictly a yaw angle\n", 387 | "metadata['yaw'] = projected" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "# Create train-val split" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "split = pd.read_csv(SPLIT_PATH, header=None, sep=' ')\n", 404 | "split.columns = ['name', 'assignment']" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "split = split.loc[~split.name.isin(to_remove)]\n", 414 | "split = split.reset_index(drop=True)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "split.assignment.value_counts()" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "# \"0\" represents training image, \"1\" represents validation image, \"2\" represents testing image\n", 433 | "train = list(split.loc[split.assignment.isin([0, 1]), 'name'])\n", 434 | "val = list(split.loc[split.assignment.isin([2]), 'name'])" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "# Upsample rare poses" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "metadata['is_train'] = metadata.name.isin(train).astype('int')" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "bins = [metadata.yaw.min() - 1.0, -20.0, -5.0, 5.0, 20.0, metadata.yaw.max() + 1.0]\n", 460 | "metadata['bin'] = pd.cut(metadata.yaw, bins, labels=False)\n", 461 | "metadata.loc[metadata.is_train == 1, 'bin'].value_counts()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "bins_to_upsample = [0, 1, 3, 4]\n", 471 | "num_samples = 80000" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "val_metadata = metadata.loc[metadata.is_train == 0]" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "upsampled = [metadata.loc[(metadata.is_train == 1) & (metadata.bin == 2)]]\n", 490 | "for b in bins_to_upsample:\n", 491 | " to_use = (metadata.is_train == 1) & (metadata.bin == b)\n", 492 | " m = metadata.loc[to_use].sample(n=num_samples, replace=True)\n", 493 | " upsampled.append(m)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "upsampled = pd.concat(upsampled)\n", 503 | "upsampled.bin.value_counts()" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "metadata = pd.concat([upsampled, val_metadata])" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "# Convert" 520 | ] 521 | }, 
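{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cells below crop each image to the detected face box enlarged by one box-width on the left/right and one box-height on the top/bottom (clipped to the image borders), then shift the box and landmark annotations into the crop's coordinate frame and write them as json in the format expected by `data/create_tfrecords.py`."
   ]
  },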
522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "def get_annotation(name, new_name, width, height, translation):\n", 529 | " xmin, ymin, xmax, ymax = boxes[name]\n", 530 | " keypoints = landmarks[name]\n", 531 | " \n", 532 | " tx, ty = translation\n", 533 | " keypoints = [[p[0] - tx, p[1] - ty]for p in keypoints]\n", 534 | " xmin, ymin = xmin - tx, ymin - ty\n", 535 | " xmax, ymax = xmax - tx, ymax - ty\n", 536 | " \n", 537 | " annotation = {\n", 538 | " \"filename\": new_name,\n", 539 | " \"size\": {\"depth\": 3, \"width\": width, \"height\": height},\n", 540 | " \"box\": {\"ymin\": int(ymin), \"ymax\": int(ymax), \"xmax\": int(xmax), \"xmin\": int(xmin)},\n", 541 | " \"landmarks\": keypoints\n", 542 | " }\n", 543 | " return annotation" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# create folders for the converted dataset\n", 553 | "TRAIN_DIR = '/mnt/datasets/dan/CelebA/train/'\n", 554 | "shutil.rmtree(TRAIN_DIR, ignore_errors=True)\n", 555 | "os.mkdir(TRAIN_DIR)\n", 556 | "os.mkdir(os.path.join(TRAIN_DIR, 'images'))\n", 557 | "os.mkdir(os.path.join(TRAIN_DIR, 'annotations'))\n", 558 | "\n", 559 | "VAL_DIR = '/mnt/datasets/dan/CelebA/val/'\n", 560 | "shutil.rmtree(VAL_DIR, ignore_errors=True)\n", 561 | "os.mkdir(VAL_DIR)\n", 562 | "os.mkdir(os.path.join(VAL_DIR, 'images'))\n", 563 | "os.mkdir(os.path.join(VAL_DIR, 'annotations'))" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": { 570 | "scrolled": true 571 | }, 572 | "outputs": [], 573 | "source": [ 574 | "counter = 0\n", 575 | "\n", 576 | "for T in tqdm(metadata.itertuples()):\n", 577 | " \n", 578 | " # get width and height of an image\n", 579 | " image = cv2.imread(T.full_path)\n", 580 | " h, w, c = image.shape\n", 581 | " assert c == 3\n", 582 | " \n", 583 | " # name of the image\n", 584 | " name = T.name\n", 585 | " assert name.endswith('.jpg')\n", 586 | " \n", 587 | " if name in train:\n", 588 | " result_dir = TRAIN_DIR\n", 589 | " elif name in val:\n", 590 | " result_dir = VAL_DIR\n", 591 | " else:\n", 592 | " print('WTF')\n", 593 | " break\n", 594 | " \n", 595 | " # crop the image to save space\n", 596 | " xmin, ymin, xmax, ymax = boxes[name]\n", 597 | " width, height = xmax - xmin, ymax - ymin\n", 598 | " assert width > 0 and height > 0\n", 599 | " xmin = max(int(xmin - width), 0)\n", 600 | " ymin = max(int(ymin - height), 0)\n", 601 | " xmax = min(int(xmax + width), w)\n", 602 | " ymax = min(int(ymax + height), h)\n", 603 | " crop = image[ymin:ymax, xmin:xmax, :]\n", 604 | " \n", 605 | " # we need to transform annotations after cropping\n", 606 | " translation = [xmin, ymin]\n", 607 | " \n", 608 | " # we need to rename images because of upsampling\n", 609 | " new_name = str(counter) + '.jpg'\n", 610 | " counter += 1\n", 611 | " cv2.imwrite(os.path.join(result_dir, 'images', new_name), crop)\n", 612 | "\n", 613 | " # save annotation for it\n", 614 | " d = get_annotation(name, new_name, xmax - xmin, ymax - ymin, translation)\n", 615 | " json_name = new_name[:-4] + '.json'\n", 616 | " json.dump(d, open(os.path.join(result_dir, 'annotations', json_name), 'w'))" 617 | ] 618 | } 619 | ], 620 | "metadata": { 621 | "kernelspec": { 622 | "display_name": "Python 3", 623 | "language": "python", 624 | "name": "python3" 625 | }, 626 | "language_info": { 627 | "codemirror_mode": { 628 | "name": 
"ipython", 629 | "version": 3 630 | }, 631 | "file_extension": ".py", 632 | "mimetype": "text/x-python", 633 | "name": "python", 634 | "nbconvert_exporter": "python", 635 | "pygments_lexer": "ipython3", 636 | "version": "3.6.3" 637 | } 638 | }, 639 | "nbformat": 4, 640 | "nbformat_minor": 1 641 | } 642 | -------------------------------------------------------------------------------- /data/procrustes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def procrustes(X, Y, scaling=True, reflection='best'): 5 | """ 6 | This function is taken from here: 7 | https://stackoverflow.com/a/18927641. 8 | 9 | It aligns a shape defined by Y to a shape defined by X. 10 | # Y_transformed = b * YT + c 11 | 12 | Procrustes analysis determines a linear transformation (translation, 13 | reflection, orthogonal rotation and scaling) of the points in Y to best 14 | conform them to the points in matrix X, using the sum of squared errors 15 | as the goodness of fit criterion. 16 | 17 | Arguments: 18 | X, Y: float numpy arrays with shape [n, p]. 19 | scaling: a boolean, if False, the scaling 20 | component of the transformation is forced to 1. 21 | reflection: a string or boolean, 22 | possible values are 'best', False, True. 23 | if 'best' (default), the transformation solution may or may not 24 | include a reflection component, depending on which fits the data 25 | best. setting reflection to True or False forces a solution with 26 | reflection or no reflection respectively. 27 | Returns: 28 | Z: a float numpy array with shape [n, p]. 29 | transform: a dict specifying the rotation, translation 30 | and scaling that maps X --> Y. 31 | """ 32 | muX = X.mean(0) 33 | muY = Y.mean(0) 34 | 35 | # center shapes 36 | X0 = X - muX 37 | Y0 = Y - muY 38 | 39 | # compute centered frobenius norm 40 | normX = np.sqrt((X0**2).sum()) 41 | normY = np.sqrt((Y0**2).sum()) 42 | 43 | # scale to equal (unit) norm 44 | X0 /= normX 45 | Y0 /= normY 46 | 47 | # get an optimal rotation matrix of Y 48 | A = np.matmul(X0.T, Y0) # shape [p, p] 49 | U, s, Vt = np.linalg.svd(A, full_matrices=False) 50 | # they have shapes [p, k], [k], [k, p] 51 | V = Vt.T 52 | T = np.matmul(V, U.T) # shape [p, p] 53 | # T is orthogonal 54 | 55 | if reflection is not 'best': 56 | 57 | # does the current solution use a reflection? 
58 | has_reflection = np.linalg.det(T) < 0 59 | 60 | # if that's not what was specified, force another reflection 61 | if reflection != has_reflection: 62 | V[:, -1] *= -1 63 | s[-1] *= -1 # the smallest singular value 64 | T = np.matmul(V, U.T) 65 | 66 | traceTA = s.sum() # trace of TA 67 | 68 | if scaling: 69 | 70 | # optimal scaling of Y 71 | b = traceTA * normX / normY 72 | 73 | # transformed coords 74 | Z = normX * traceTA * np.matmul(Y0, T) + muX 75 | 76 | # frobenius_norm(Y0 * T) = frobenius_norm(Y0) = 1 77 | 78 | else: 79 | b = 1 80 | Z = normY * np.matmul(Y0, T) + muX 81 | 82 | c = muX - b * np.matmul(muY, T) 83 | 84 | transform = {'rotation': T, 'scale': b, 'translation': c} 85 | """ 86 | b * Y * T + c = 87 | = b * (Y0 * normY + muY) * T + muX - b * muY * T = 88 | = b * normY * Y0 * T + muX = 89 | = Z 90 | """ 91 | return Z, transform 92 | -------------------------------------------------------------------------------- /data/test_input_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import tensorflow as tf\n", 20 | "import numpy as np\n", 21 | "from PIL import Image, ImageDraw\n", 22 | "import json\n", 23 | "\n", 24 | "import sys\n", 25 | "sys.path.append('..')\n", 26 | "from input_pipeline import Pipeline" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# Get images and boxes" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "scrolled": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "tf.reset_default_graph()\n", 45 | "\n", 46 | "pipeline = Pipeline(\n", 47 | " ['/mnt/datasets/dan/CelebA/train_shards/shard-0000.tfrecords'],\n", 48 | " batch_size=100, image_size=(64, 64),\n", 49 | " num_landmarks=5, repeat=True, \n", 50 | " shuffle=False, augmentation=True\n", 51 | ")\n", 52 | "\n", 53 | "dataset = pipeline.dataset\n", 54 | "iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)\n", 55 | "init = iterator.make_initializer(dataset)\n", 56 | "features, labels = iterator.get_next()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "with tf.Session() as sess:\n", 66 | " sess.run(init)\n", 67 | " I, L = sess.run([features, labels])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "# Show an augmented image" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "def draw_on_image(image, keypoints):\n", 84 | "\n", 85 | " image_copy = image.copy()\n", 86 | " draw = ImageDraw.Draw(image_copy, 'RGBA')\n", 87 | " \n", 88 | " for i, (y, x) in enumerate(keypoints):\n", 89 | " draw.ellipse([\n", 90 | " (x - 1.0, y - 1.0),\n", 91 | " (x + 1.0, y + 1.0)\n", 92 | " ], outline='red')\n", 93 | " draw.text((x, y), text=str(i), fill='blue')\n", 94 | "\n", 95 | " return image_copy" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# choose an image\n", 105 | "i = 0\n", 106 | 
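    "# the pipeline outputs landmarks as (y, x) pairs normalized to [0, 1],\n",
    "# so multiplying by [h, w] below converts them to pixel coordinates\n",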
"image = Image.fromarray((I[i]*255.0).astype('uint8'))\n", 107 | "w, h = image.size\n", 108 | "keypoints = L[i]*np.array([h, w])\n", 109 | "draw_on_image(image, keypoints)" 110 | ] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 3", 116 | "language": "python", 117 | "name": "python3" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.6.3" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | -------------------------------------------------------------------------------- /inference/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/wing-loss/d7335610d26cf805bf5a20ae0d70df5de85d1521/inference/example.jpg -------------------------------------------------------------------------------- /inference/face_detector.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | """ 6 | This is a face detector taken from here: 7 | https://github.com/TropComplique/FaceBoxes-tensorflow 8 | """ 9 | 10 | 11 | class FaceDetector: 12 | def __init__(self, model_path, gpu_memory_fraction=0.25, visible_device_list='0'): 13 | """ 14 | Arguments: 15 | model_path: a string, path to a pb file. 16 | gpu_memory_fraction: a float number. 17 | visible_device_list: a string. 18 | """ 19 | with tf.gfile.GFile(model_path, 'rb') as f: 20 | graph_def = tf.GraphDef() 21 | graph_def.ParseFromString(f.read()) 22 | 23 | graph = tf.Graph() 24 | with graph.as_default(): 25 | tf.import_graph_def(graph_def, name='import') 26 | 27 | self.input_image = graph.get_tensor_by_name('import/image_tensor:0') 28 | self.output_ops = [ 29 | graph.get_tensor_by_name('import/boxes:0'), 30 | graph.get_tensor_by_name('import/scores:0'), 31 | graph.get_tensor_by_name('import/num_boxes:0'), 32 | ] 33 | 34 | gpu_options = tf.GPUOptions( 35 | per_process_gpu_memory_fraction=gpu_memory_fraction, 36 | visible_device_list=visible_device_list 37 | ) 38 | config_proto = tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False) 39 | self.sess = tf.Session(graph=graph, config=config_proto) 40 | 41 | def __call__(self, image, score_threshold=0.5): 42 | """Detect faces. 43 | 44 | Arguments: 45 | image: a numpy uint8 array with shape [height, width, 3], 46 | that represents a RGB image. 47 | score_threshold: a float number. 48 | Returns: 49 | boxes: a float numpy array of shape [num_faces, 4]. 50 | scores: a float numpy array of shape [num_faces]. 51 | 52 | Note that box coordinates are in the order: ymin, xmin, ymax, xmax! 
53 | """ 54 | h, w, _ = image.shape 55 | image = np.expand_dims(image, 0) 56 | 57 | boxes, scores, num_boxes = self.sess.run( 58 | self.output_ops, feed_dict={self.input_image: image} 59 | ) 60 | num_boxes = num_boxes[0] 61 | boxes = boxes[0][:num_boxes] 62 | scores = scores[0][:num_boxes] 63 | 64 | to_keep = scores > score_threshold 65 | boxes = boxes[to_keep] 66 | scores = scores[to_keep] 67 | 68 | scaler = np.array([h, w, h, w], dtype='float32') 69 | boxes = boxes * scaler 70 | 71 | return boxes, scores 72 | -------------------------------------------------------------------------------- /inference/landmark_detector.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class KeypointDetector: 6 | def __init__(self, model_path, gpu_memory_fraction=0.25, visible_device_list='0'): 7 | """ 8 | Arguments: 9 | model_path: a string, path to a pb file. 10 | gpu_memory_fraction: a float number. 11 | visible_device_list: a string. 12 | """ 13 | with tf.gfile.GFile(model_path, 'rb') as f: 14 | graph_def = tf.GraphDef() 15 | graph_def.ParseFromString(f.read()) 16 | 17 | graph = tf.Graph() 18 | with graph.as_default(): 19 | tf.import_graph_def(graph_def, name='import') 20 | 21 | self.input_image = graph.get_tensor_by_name('import/images:0') 22 | self.output = graph.get_tensor_by_name('import/landmarks:0') 23 | 24 | gpu_options = tf.GPUOptions( 25 | per_process_gpu_memory_fraction=gpu_memory_fraction, 26 | visible_device_list=visible_device_list 27 | ) 28 | config_proto = tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False) 29 | self.sess = tf.Session(graph=graph, config=config_proto) 30 | 31 | def __call__(self, images): 32 | """ 33 | Arguments: 34 | images: a numpy uint8 array with shape [b, 64, 64, 3], 35 | that represents a batch of RGB images. 36 | Returns: 37 | a float numpy array of shape [b, 5, 2]. 38 | 39 | Note that points coordinates are in the order: (y, x). 40 | Also coordinates are relative to the image (in the [0, 1] range). 41 | """ 42 | landmarks = self.sess.run(self.output, feed_dict={self.input_image: images}) 43 | return landmarks 44 | -------------------------------------------------------------------------------- /inference/the_office.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/wing-loss/d7335610d26cf805bf5a20ae0d70df5de85d1521/inference/the_office.jpg -------------------------------------------------------------------------------- /input_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .input_pipeline import Pipeline 2 | -------------------------------------------------------------------------------- /input_pipeline/augmentations.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import cv2 3 | import math 4 | 5 | 6 | """ 7 | `image` is assumed to be a float tensor with shape [height, width, 3], 8 | it is a RGB image with pixel values in range [0, 1]. 9 | `box` is a float tensor with shape [4]. 10 | `landmarks` is a float tensor with shape [num_landmarks, 2]. 
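`box` is in the order [ymin, xmin, ymax, xmax] and `landmarks` are (y, x) pairs;
all coordinates are normalized to the [0, 1] range (the same convention as in input_pipeline.py).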
11 | """ 12 | 13 | 14 | def random_rotation(image, box, landmarks, max_angle=10): 15 | with tf.name_scope('random_rotation'): 16 | # get a random angle 17 | max_angle_radians = max_angle*(math.pi/180.0) 18 | theta = tf.random_uniform( 19 | [], minval=-max_angle_radians, 20 | maxval=max_angle_radians, dtype=tf.float32 21 | ) 22 | 23 | # find the center of the image 24 | image_height = tf.to_float(tf.shape(image)[0]) 25 | image_width = tf.to_float(tf.shape(image)[1]) 26 | scaler = tf.stack([image_height, image_width], axis=0) 27 | center = tf.reshape(0.5*scaler, [1, 2]) 28 | 29 | rotation = tf.stack([ 30 | tf.cos(theta), tf.sin(theta), 31 | -tf.sin(theta), tf.cos(theta) 32 | ], axis=0) 33 | rotation_matrix = tf.reshape(rotation, [2, 2]) 34 | 35 | inverse_rotation = tf.stack([ 36 | tf.cos(theta), -tf.sin(theta), 37 | tf.sin(theta), tf.cos(theta) 38 | ], axis=0) 39 | inverse_rotation_matrix = tf.reshape(inverse_rotation, [2, 2]) 40 | 41 | # now i want to rotate the image and annotations around the image center, 42 | # note: landmark and box coordinates are (y, x) not (x, y) 43 | 44 | # rotate box 45 | ymin, xmin, ymax, xmax = tf.unstack(box, axis=0) 46 | h, w = ymax - ymin, xmax - xmin 47 | box = tf.stack([ 48 | ymin, xmin, ymin, xmax, 49 | ymax, xmax, ymax, xmin 50 | ], axis=0) # four corners 51 | box = tf.matmul(tf.reshape(box, [4, 2])*scaler - center, rotation_matrix) + center 52 | y, x = tf.unstack(box/scaler, axis=1) 53 | ymin, ymax = tf.reduce_min(y), tf.reduce_max(y) 54 | xmin, xmax = tf.reduce_min(x), tf.reduce_max(x) 55 | box = tf.stack([ymin, xmin, ymax, xmax], axis=0) 56 | 57 | # rotate landmarks 58 | landmarks = tf.matmul(landmarks*scaler - center, rotation_matrix) + center 59 | landmarks = landmarks/scaler 60 | 61 | # rotate image 62 | translate = center - tf.matmul(center, inverse_rotation_matrix) 63 | translate_y, translate_x = tf.unstack(tf.squeeze(translate, axis=0), axis=0) 64 | transform = tf.stack([ 65 | tf.cos(theta), -tf.sin(theta), translate_x, 66 | tf.sin(theta), tf.cos(theta), translate_y, 67 | 0.0, 0.0 68 | ]) 69 | image = tf.contrib.image.transform(image, transform, interpolation='BILINEAR') 70 | 71 | return image, box, landmarks 72 | 73 | 74 | def random_box_jitter(box, landmarks, ratio=0.05): 75 | """Randomly jitter bounding box. 76 | 77 | Arguments: 78 | box: a float tensor with shape [4]. 79 | landmarks: a float tensor with shape [num_landmarks, 2]. 80 | ratio: a float number. 81 | The ratio of the box width and height that the corners can jitter. 82 | For example if the width is 100 pixels and ratio is 0.05, 83 | the corners can jitter up to 5 pixels in the x direction. 84 | Returns: 85 | a float tensor with shape [4]. 
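        The jittered box is constrained so that all landmarks remain inside it
        (assuming they were inside the original box).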
86 | """ 87 | with tf.name_scope('random_box_jitter'): 88 | 89 | # get the tight box around all landmarks 90 | y, x = tf.unstack(landmarks, axis=1) 91 | ymin_tight, ymax_tight = tf.reduce_min(y), tf.reduce_max(y) 92 | xmin_tight, xmax_tight = tf.reduce_min(x), tf.reduce_max(x) 93 | # we want to keep landmarks inside the new distorted box 94 | 95 | ymin, xmin, ymax, xmax = tf.unstack(box, axis=0) 96 | box_height, box_width = ymax - ymin, xmax - xmin 97 | 98 | # it is assumed that initially 99 | # all landmarks were inside the box 100 | new_ymin = tf.random_uniform( 101 | [], minval=ymin - box_height * ratio, 102 | maxval=tf.minimum(ymin_tight, ymin + box_height * ratio), 103 | dtype=tf.float32 104 | ) 105 | new_xmin = tf.random_uniform( 106 | [], minval=xmin - box_width * ratio, 107 | maxval=tf.minimum(xmin_tight, xmin + box_width * ratio), 108 | dtype=tf.float32 109 | ) 110 | new_ymax = tf.random_uniform( 111 | [], minval=tf.maximum(ymax_tight, ymax - box_height * ratio), 112 | maxval=ymax + box_height * ratio, 113 | dtype=tf.float32 114 | ) 115 | new_xmax = tf.random_uniform( 116 | [], minval=tf.maximum(xmax_tight, xmax - box_width * ratio), 117 | maxval=xmax + box_width * ratio, 118 | dtype=tf.float32 119 | ) 120 | distorted_box = tf.stack([new_ymin, new_xmin, new_ymax, new_xmax], axis=0) 121 | return distorted_box 122 | 123 | 124 | def random_gaussian_blur(image, probability=0.3, kernel_size=3): 125 | h, w, _ = image.shape.as_list() 126 | 127 | def blur(image): 128 | image = (image*255.0).astype('uint8') 129 | image = cv2.blur(image, (kernel_size, kernel_size)) 130 | return (image/255.0).astype('float32') 131 | 132 | with tf.name_scope('random_gaussian_blur'): 133 | do_it = tf.less(tf.random_uniform([]), probability) 134 | image = tf.cond( 135 | do_it, 136 | lambda: tf.py_func(blur, [image], tf.float32, stateful=False), 137 | lambda: image 138 | ) 139 | image.set_shape([h, w, 3]) # without this shape information is lost 140 | return image 141 | 142 | 143 | def random_color_manipulations(image, probability=0.5, grayscale_probability=0.1): 144 | 145 | def manipulate(image): 146 | br_delta = tf.random_uniform([], -32.0/255.0, 32.0/255.0) 147 | cb_factor = tf.random_uniform([], -0.1, 0.1) 148 | cr_factor = tf.random_uniform([], -0.1, 0.1) 149 | channels = tf.split(axis=2, num_or_size_splits=3, value=image) 150 | red_offset = 1.402 * cr_factor + br_delta 151 | green_offset = -0.344136 * cb_factor - 0.714136 * cr_factor + br_delta 152 | blue_offset = 1.772 * cb_factor + br_delta 153 | channels[0] += red_offset 154 | channels[1] += green_offset 155 | channels[2] += blue_offset 156 | image = tf.concat(axis=2, values=channels) 157 | image = tf.clip_by_value(image, 0.0, 1.0) 158 | return image 159 | 160 | def to_grayscale(image): 161 | image = tf.image.rgb_to_grayscale(image) 162 | image = tf.image.grayscale_to_rgb(image) 163 | return image 164 | 165 | with tf.name_scope('random_color_manipulations'): 166 | do_it = tf.less(tf.random_uniform([]), probability) 167 | image = tf.cond(do_it, lambda: manipulate(image), lambda: image) 168 | 169 | with tf.name_scope('to_grayscale'): 170 | make_gray = tf.less(tf.random_uniform([]), grayscale_probability) 171 | image = tf.cond(make_gray, lambda: to_grayscale(image), lambda: image) 172 | 173 | return image 174 | 175 | 176 | def random_flip_left_right(image, landmarks): 177 | 178 | def flip(image, landmarks): 179 | flipped_image = tf.image.flip_left_right(image) 180 | y, x = tf.unstack(landmarks, axis=1) 181 | flipped_x = tf.subtract(1.0, x) 182 | 
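        # coordinates are normalized to [0, 1], so mirroring the image maps x to 1 - x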
flipped_landmarks = tf.stack([y, flipped_x], axis=1) 183 | 184 | # landmarks order: left_eye, right_eye, nose, left_mouth, right_mouth. 185 | # so, when we flip the image we need to flip some of the landmarks 186 | correct_order = tf.constant([1, 0, 2, 4, 3], dtype=tf.int32) 187 | flipped_landmarks = tf.gather(flipped_landmarks, correct_order) 188 | 189 | return flipped_image, flipped_landmarks 190 | 191 | with tf.name_scope('random_flip_left_right'): 192 | do_it = tf.less(tf.random_uniform([]), 0.5) 193 | image, landmarks = tf.cond(do_it, lambda: flip(image, landmarks), lambda: (image, landmarks)) 194 | return image, landmarks 195 | 196 | 197 | def random_pixel_value_scale(image, minval=0.9, maxval=1.1, probability=0.5): 198 | """This function scales each pixel independently of the other ones. 199 | 200 | Arguments: 201 | image: a float tensor with shape [height, width, 3], 202 | an image with pixel values varying between [0, 1]. 203 | minval: a float number, lower ratio of scaling pixel values. 204 | maxval: a float number, upper ratio of scaling pixel values. 205 | probability: a float number. 206 | Returns: 207 | a float tensor with shape [height, width, 3]. 208 | """ 209 | def random_value_scale(image): 210 | color_coefficient = tf.random_uniform( 211 | tf.shape(image), minval=minval, 212 | maxval=maxval, dtype=tf.float32 213 | ) 214 | image = tf.multiply(image, color_coefficient) 215 | image = tf.clip_by_value(image, 0.0, 1.0) 216 | return image 217 | 218 | with tf.name_scope('random_pixel_value_scale'): 219 | do_it = tf.less(tf.random_uniform([]), probability) 220 | image = tf.cond(do_it, lambda: random_value_scale(image), lambda: image) 221 | return image 222 | -------------------------------------------------------------------------------- /input_pipeline/input_pipeline.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from .augmentations import random_color_manipulations,\ 3 | random_flip_left_right, random_pixel_value_scale, \ 4 | random_gaussian_blur, random_rotation, random_box_jitter 5 | 6 | 7 | SHUFFLE_BUFFER_SIZE = 20000 8 | NUM_THREADS = 8 9 | RESIZE_METHOD = tf.image.ResizeMethod.BILINEAR 10 | 11 | 12 | class Pipeline: 13 | def __init__(self, filenames, batch_size, image_size, num_landmarks, 14 | repeat=False, shuffle=False, augmentation=False): 15 | """ 16 | Arguments: 17 | filenames: a list of strings, paths to tfrecords files. 18 | batch_size: an integer. 19 | image_size: a list with two integers [width, height], 20 | images of this size will be in a batch 21 | num_landmarks: an integer. 22 | repeat: a boolean, whether repeat indefinitely. 23 | shuffle: whether to shuffle the dataset. 24 | augmentation: whether to do data augmentation. 
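
        A minimal usage sketch (the shard path here is just an illustration):
            pipeline = Pipeline(
                ['train_shards/shard-0000.tfrecords'], batch_size=16,
                image_size=(64, 64), num_landmarks=5,
                repeat=True, shuffle=True, augmentation=True
            )
            dataset = pipeline.dataset  # a tf.data.Dataset of (images, landmarks) batches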
25 | """ 26 | self.image_width, self.image_height = image_size 27 | self.augmentation = augmentation 28 | self.batch_size = batch_size 29 | 30 | assert num_landmarks == 5 31 | self.num_landmarks = num_landmarks 32 | 33 | def get_num_samples(filename): 34 | return sum(1 for _ in tf.python_io.tf_record_iterator(filename)) 35 | 36 | num_examples = 0 37 | for filename in filenames: 38 | num_examples_in_file = get_num_samples(filename) 39 | assert num_examples_in_file > 0 40 | num_examples += num_examples_in_file 41 | self.num_examples = num_examples 42 | assert self.num_examples > 0 43 | 44 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 45 | num_shards = len(filenames) 46 | 47 | if shuffle: 48 | dataset = dataset.shuffle(buffer_size=num_shards) 49 | dataset = dataset.flat_map(tf.data.TFRecordDataset) 50 | dataset = dataset.prefetch(buffer_size=batch_size) 51 | 52 | if shuffle: 53 | dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE) 54 | dataset = dataset.repeat(None if repeat else 1) 55 | dataset = dataset.map(self._parse_and_preprocess, num_parallel_calls=NUM_THREADS) 56 | 57 | dataset = dataset.batch(batch_size) 58 | dataset = dataset.prefetch(buffer_size=1) 59 | 60 | self.dataset = dataset 61 | 62 | def _parse_and_preprocess(self, example_proto): 63 | """What this function does: 64 | 1. Parses one record from a tfrecords file and decodes it. 65 | 2. (optionally) Augments it. 66 | 67 | Returns: 68 | image: a float tensor with shape [image_height, image_width, 3], 69 | an RGB image with pixel values in the range [0, 1]. 70 | landmarks: a float tensor with shape [num_landmarks, 2]. 71 | """ 72 | features = { 73 | 'image': tf.FixedLenFeature([], tf.string), 74 | 'ymin': tf.FixedLenFeature([], tf.float32), 75 | 'xmin': tf.FixedLenFeature([], tf.float32), 76 | 'ymax': tf.FixedLenFeature([], tf.float32), 77 | 'xmax': tf.FixedLenFeature([], tf.float32), 78 | 'landmarks': tf.FixedLenFeature([2 * self.num_landmarks], tf.float32) 79 | } 80 | parsed_features = tf.parse_single_example(example_proto, features) 81 | 82 | # get image 83 | image = tf.image.decode_jpeg(parsed_features['image'], channels=3) 84 | image = tf.image.convert_image_dtype(image, tf.float32) 85 | # now pixel values are scaled to [0, 1] range 86 | 87 | # get face box, it must be in from-zero-to-one format 88 | box = tf.stack([ 89 | parsed_features['ymin'], parsed_features['xmin'], 90 | parsed_features['ymax'], parsed_features['xmax'] 91 | ], axis=0) 92 | box = tf.clip_by_value(tf.to_float(box), clip_value_min=0.0, clip_value_max=1.0) 93 | 94 | # get facial landmarks, they must be in from-zero-to-one format 95 | landmarks = tf.to_float(parsed_features['landmarks']) 96 | landmarks = tf.reshape(landmarks, [self.num_landmarks, 2]) 97 | landmarks = tf.clip_by_value(landmarks, clip_value_min=0.0, clip_value_max=1.0) 98 | # it assumed that landmarks are inside the bounding box (or on the edges) 99 | 100 | if self.augmentation: 101 | image, landmarks = self._augmentation_fn(image, box, landmarks) 102 | else: 103 | image, landmarks = crop(image, landmarks, box) 104 | image = tf.image.resize_images( 105 | image, [self.image_height, self.image_width], 106 | method=RESIZE_METHOD 107 | ) 108 | 109 | return image, landmarks 110 | 111 | def _augmentation_fn(self, image, box, landmarks): 112 | # there are a lot of hyperparameters here, 113 | # you will need to tune them all, haha 114 | image, box, landmarks = random_rotation(image, box, landmarks, max_angle=5) 115 | box = random_box_jitter(box, landmarks, ratio=0.025) 116 | image, 
landmarks = crop(image, landmarks, box) 117 | image = tf.image.resize_images( 118 | image, [self.image_height, self.image_width], 119 | method=RESIZE_METHOD 120 | ) 121 | image = random_color_manipulations(image, probability=0.1, grayscale_probability=0.01) 122 | image = random_pixel_value_scale(image, minval=0.85, maxval=1.15, probability=0.1) 123 | image = random_gaussian_blur(image, probability=0.1, kernel_size=4) 124 | image, landmarks = random_flip_left_right(image, landmarks) 125 | return image, landmarks 126 | 127 | 128 | def crop(image, landmarks, box): 129 | """ 130 | Crops the image to the box. 131 | It also adds some margin. 132 | Finally, it transforms coordinates of the landmarks. 133 | """ 134 | image_h = tf.to_float(tf.shape(image)[0]) 135 | image_w = tf.to_float(tf.shape(image)[1]) 136 | scaler = tf.stack([image_h, image_w, image_h, image_w], axis=0) 137 | box = box * scaler 138 | ymin, xmin, ymax, xmax = tf.unstack(box, axis=0) 139 | 140 | h, w = ymax - ymin, xmax - xmin 141 | margin_y, margin_x = h / 6.0, w / 6.0 # 6.0 here is a hyperparameter 142 | 143 | ymin, xmin = ymin - 0.5 * margin_y, xmin - 0.5 * margin_x 144 | ymax, xmax = ymax + 0.5 * margin_y, xmax + 0.5 * margin_x 145 | ymin = tf.clip_by_value(ymin, 0.0, image_h) 146 | xmin = tf.clip_by_value(xmin, 0.0, image_w) 147 | ymax = tf.clip_by_value(ymax, 0.0, image_h) 148 | xmax = tf.clip_by_value(xmax, 0.0, image_w) 149 | 150 | # for some reason box width or height sometimes becomes zero, 151 | # but it happens very very rarely 152 | h, w = tf.to_int32(ymax - ymin), tf.to_int32(xmax - xmin) 153 | box_is_okay = tf.greater(h * w, 0) 154 | 155 | def do_it(image, landmarks): 156 | image = tf.image.crop_to_bounding_box( 157 | image, tf.to_int32(ymin), tf.to_int32(xmin), h, w 158 | ) 159 | # translate coordinates of the landmarks 160 | shift = tf.stack([ymin/(ymax - ymin), xmin/(xmax - xmin)], axis=0) 161 | scaler = tf.stack([image_h/(ymax - ymin), image_w/(xmax - xmin)], axis=0) 162 | landmarks = (landmarks * scaler) - shift 163 | return image, landmarks 164 | 165 | image, landmarks = tf.cond( 166 | box_is_okay, 167 | lambda: do_it(image, landmarks), 168 | lambda: (image, landmarks) 169 | ) 170 | 171 | landmarks = tf.clip_by_value(landmarks, 0.0, 1.0) 172 | return image, landmarks 173 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import math 3 | 4 | 5 | def wing_loss(landmarks, labels, w=10.0, epsilon=2.0): 6 | """ 7 | Arguments: 8 | landmarks, labels: float tensors with shape [batch_size, num_landmarks, 2]. 9 | w, epsilon: a float numbers. 10 | Returns: 11 | a float tensor with shape []. 12 | """ 13 | with tf.name_scope('wing_loss'): 14 | x = landmarks - labels 15 | c = w * (1.0 - math.log(1.0 + w/epsilon)) 16 | absolute_x = tf.abs(x) 17 | losses = tf.where( 18 | tf.greater(w, absolute_x), 19 | w * tf.log(1.0 + absolute_x/epsilon), 20 | absolute_x - c 21 | ) 22 | loss = tf.reduce_mean(tf.reduce_sum(losses, axis=[1, 2]), axis=0) 23 | return loss 24 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | """ 5 | For evaluation during the training I use NME (normalized mean error). 
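Roughly, for each face:
    NME = mean over the landmarks of ||predicted - ground truth|| / max(inter-ocular distance, 1),
where the distances are measured in pixels (labels and predictions are rescaled in model.py before evaluation).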
6 | You can find more about it here: 7 | https://arxiv.org/abs/1506.03799 (Pose-Invariant 3D Face Alignment) 8 | Note that my version here is slightly different, 9 | so you cannot compare its value with results in papers. 10 | 11 | It is assumed that num_landmarks = 5 12 | and that they are in the following order: 13 | [[lefteye_y lefteye_x] 14 | [righteye_y righteye_x] 15 | [nose_y nose_x] 16 | [leftmouth_y leftmouth_x] 17 | [rightmouth_y rightmouth_x]] 18 | """ 19 | 20 | 21 | def nme_metric_ops(labels, landmarks): 22 | """ 23 | Arguments: 24 | labels, landmarks: float tensors with shape [batch_size, num_landmarks, 2]. 25 | Returns: 26 | two ops, as in the tf.metrics API. 27 | """ 28 | norms = tf.norm(labels - landmarks, axis=2) # shape [batch_size, num_landmarks] 29 | mean_norm = tf.reduce_mean(norms, axis=1) # shape [batch_size] 30 | eye_distance = tf.norm(labels[:, 0, :] - labels[:, 1, :], axis=1) # shape [batch_size] 31 | 32 | values = mean_norm/tf.maximum(eye_distance, 1.0) 33 | mean, update_op = tf.metrics.mean(values) 34 | return mean, update_op 35 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from network import network 3 | from loss import wing_loss 4 | from metrics import nme_metric_ops 5 | 6 | 7 | MOVING_AVERAGE_DECAY = 0.995 8 | 9 | 10 | def model_fn(features, labels, mode, params): 11 | """ 12 | This is a function for creating a tensorflow computational graph. 13 | The function is in the format required by tf.estimator. 14 | """ 15 | 16 | # features are just a tensor of RGB images 17 | 18 | is_training = mode == tf.estimator.ModeKeys.TRAIN 19 | landmarks = network(features, is_training, num_landmarks=params['num_landmarks']) 20 | # landmarks are normalized to [0, 1] range 21 | 22 | if mode == tf.estimator.ModeKeys.PREDICT: 23 | predictions = {'landmarks': landmarks} 24 | export_outputs = tf.estimator.export.PredictOutput({ 25 | name: tf.identity(tensor, name) 26 | for name, tensor in predictions.items() 27 | }) 28 | return tf.estimator.EstimatorSpec( 29 | mode, predictions=predictions, 30 | export_outputs={'outputs': export_outputs} 31 | ) 32 | 33 | with tf.name_scope('rescale'): 34 | w, h = params['image_size'] 35 | scaler = tf.constant([h, w], dtype=tf.float32) 36 | labels = labels * scaler 37 | landmarks = landmarks * scaler 38 | 39 | loss = wing_loss(landmarks, labels, w=params['w'], epsilon=params['epsilon']) 40 | tf.losses.add_loss(loss) 41 | tf.summary.scalar('just_wing_loss', loss) 42 | 43 | # add L2 regularization 44 | with tf.name_scope('weight_decay'): 45 | add_weight_decay(params['weight_decay']) 46 | regularization_loss = tf.losses.get_regularization_loss() 47 | tf.summary.scalar('regularization_loss', regularization_loss) 48 | 49 | total_loss = tf.losses.get_total_loss(add_regularization_losses=True) 50 | 51 | if mode == tf.estimator.ModeKeys.EVAL: 52 | eval_metric_ops = { 53 | 'validation_mae': tf.metrics.mean_absolute_error(labels, landmarks), 54 | 'normalized_mean_error': nme_metric_ops(labels, landmarks) 55 | } 56 | return tf.estimator.EstimatorSpec( 57 | mode, loss=total_loss, 58 | eval_metric_ops=eval_metric_ops 59 | ) 60 | 61 | assert mode == tf.estimator.ModeKeys.TRAIN 62 | 63 | with tf.variable_scope('learning_rate'): 64 | global_step = tf.train.get_global_step() 65 | learning_rate = tf.train.cosine_decay( 66 | params['initial_lr'], global_step, 67 | decay_steps=params['num_steps'] 68 | ) 69 |
tf.summary.scalar('learning_rate', learning_rate) 70 | 71 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 72 | with tf.control_dependencies(update_ops), tf.variable_scope('optimizer'): 73 | optimizer = tf.train.AdamOptimizer(learning_rate) 74 | grads_and_vars = optimizer.compute_gradients(total_loss) 75 | train_op = optimizer.apply_gradients(grads_and_vars, global_step) 76 | 77 | for g, v in grads_and_vars: 78 | tf.summary.histogram(v.name[:-2] + '_hist', v) 79 | tf.summary.histogram(v.name[:-2] + '_grad_hist', g) 80 | 81 | with tf.control_dependencies([train_op]), tf.name_scope('ema'): 82 | ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY, num_updates=global_step) 83 | train_op = ema.apply(tf.trainable_variables()) 84 | 85 | with tf.name_scope('evaluation_ops'): 86 | mae = tf.reduce_mean(tf.abs(labels - landmarks), axis=[0, 1, 2]) 87 | 88 | tf.summary.scalar('train_mae', mae) 89 | return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op) 90 | 91 | 92 | def add_weight_decay(weight_decay): 93 | """Add L2 regularization to all (or some) trainable kernel weights.""" 94 | weight_decay = tf.constant( 95 | weight_decay, tf.float32, 96 | [], 'weight_decay' 97 | ) 98 | trainable_vars = tf.trainable_variables() 99 | kernels = [v for v in trainable_vars if 'weights' in v.name] 100 | for K in kernels: 101 | x = tf.multiply(weight_decay, tf.nn.l2_loss(K)) 102 | tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, x) 103 | 104 | 105 | class RestoreMovingAverageHook(tf.train.SessionRunHook): 106 | def __init__(self, model_dir): 107 | super(RestoreMovingAverageHook, self).__init__() 108 | self.model_dir = model_dir 109 | 110 | def begin(self): 111 | ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY) 112 | variables_to_restore = ema.variables_to_restore() 113 | self.load_ema = tf.contrib.framework.assign_from_checkpoint_fn( 114 | tf.train.latest_checkpoint(self.model_dir), variables_to_restore 115 | ) 116 | 117 | def after_create_session(self, sess, coord): 118 | tf.logging.info('Loading EMA weights...') 119 | self.load_ema(sess) 120 | -------------------------------------------------------------------------------- /network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.slim as slim 3 | 4 | 5 | BATCH_NORM_MOMENTUM = 0.91 6 | BATCH_NORM_EPSILON = 1e-3 7 | 8 | 9 | def network(images, is_training, num_landmarks): 10 | """ 11 | Arguments: 12 | images: a float tensor with shape [batch_size, height, width, 3], 13 | a batch of RGB images with pixel values in the range [0, 1]. 14 | is_training: a boolean. 15 | num_landmarks: an integer. 16 | Returns: 17 | a float tensor with shape [batch_size, num_landmarks, 2].
18 | """ 19 | 20 | def batch_norm(x): 21 | x = tf.layers.batch_normalization( 22 | x, axis=3, center=True, scale=True, 23 | momentum=BATCH_NORM_MOMENTUM, 24 | epsilon=BATCH_NORM_EPSILON, 25 | training=is_training, fused=True, 26 | name='batch_norm' 27 | ) 28 | return x 29 | 30 | with tf.name_scope('standardize_input'): 31 | x = (2.0 * images) - 1.0 32 | 33 | with tf.variable_scope('network'): 34 | params = { 35 | 'padding': 'SAME', 36 | 'activation_fn': tf.nn.relu, 37 | 'normalizer_fn': batch_norm, 38 | 'data_format': 'NHWC' 39 | } 40 | with slim.arg_scope([slim.conv2d], **params): 41 | with slim.arg_scope([slim.max_pool2d], stride=2, padding='SAME', data_format='NHWC'): 42 | 43 | num_filters = [32, 64, 128, 256, 512] 44 | for i, f in enumerate(num_filters, 1): 45 | x = slim.conv2d(x, f, (3, 3), stride=1, scope='conv%d' % i) 46 | x = slim.max_pool2d(x, (2, 2), scope='pool%d' % i) 47 | 48 | x = flatten(x) 49 | x = slim.fully_connected( 50 | x, 1024, activation_fn=tf.nn.relu, 51 | normalizer_fn=None, scope='fc1' 52 | ) 53 | x = slim.fully_connected( 54 | x, 2 * num_landmarks, activation_fn=None, 55 | normalizer_fn=None, scope='fc2', 56 | biases_initializer=tf.constant_initializer(0.5), 57 | ) 58 | batch_size = tf.shape(x)[0] 59 | x = tf.reshape(x, [batch_size, num_landmarks, 2]) 60 | return x 61 | 62 | 63 | def flatten(x): 64 | with tf.name_scope('flatten'): 65 | batch_size = tf.shape(x)[0] 66 | height, width, channels = x.shape.as_list()[1:] 67 | x = tf.reshape(x, [batch_size, channels * height * width]) 68 | return x 69 | 70 | 71 | def prelu(x): 72 | """It is not used here.""" 73 | with tf.variable_scope('prelu'): 74 | in_channels = x.shape[3].value 75 | alpha = tf.get_variable( 76 | 'alpha', [in_channels], 77 | initializer=tf.constant_initializer(0.1), 78 | dtype=tf.float32 79 | ) 80 | return tf.nn.relu(x) - alpha * tf.nn.relu(-x) 81 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import json 3 | import os 4 | from model import model_fn, RestoreMovingAverageHook 5 | from input_pipeline import Pipeline 6 | tf.logging.set_verbosity('INFO') 7 | 8 | 9 | CONFIG = 'config.json' 10 | GPU_TO_USE = '0' 11 | params = json.load(open(CONFIG)) 12 | 13 | 14 | def get_input_fn(is_training=True): 15 | 16 | image_size = params['image_size'] 17 | data_dir = params['train_dataset'] if is_training else params['val_dataset'] 18 | batch_size = params['batch_size'] 19 | num_landmarks = params['num_landmarks'] 20 | 21 | filenames = os.listdir(data_dir) 22 | filenames = [n for n in filenames if n.endswith('.tfrecords')] 23 | filenames = sorted(filenames) 24 | filenames = [os.path.join(data_dir, n) for n in filenames] 25 | 26 | def input_fn(): 27 | with tf.device('/cpu:0'), tf.name_scope('input_pipeline'): 28 | pipeline = Pipeline( 29 | filenames, batch_size=batch_size, image_size=image_size, num_landmarks=num_landmarks, 30 | repeat=is_training, shuffle=is_training, augmentation=is_training, 31 | ) 32 | return pipeline.dataset 33 | return input_fn 34 | 35 | 36 | config = tf.ConfigProto(allow_soft_placement=True) 37 | config.gpu_options.visible_device_list = GPU_TO_USE 38 | 39 | run_config = tf.estimator.RunConfig() 40 | run_config = run_config.replace( 41 | model_dir=params['model_dir'], 42 | session_config=config, 43 | save_summary_steps=2000, 44 | save_checkpoints_secs=1800, 45 | log_step_count_steps=1000 46 | ) 47 | 48 | train_input_fn = 
get_input_fn(is_training=True) 49 | val_input_fn = get_input_fn(is_training=False) 50 | 51 | estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config) 52 | train_spec = tf.estimator.TrainSpec( 53 | train_input_fn, max_steps=params['num_steps'] 54 | ) 55 | eval_spec = tf.estimator.EvalSpec( 56 | val_input_fn, steps=None, hooks=[RestoreMovingAverageHook(params['model_dir'])], 57 | start_delay_secs=3600, throttle_secs=3600 58 | ) 59 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) 60 | --------------------------------------------------------------------------------
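A quick sanity check of the piecewise formula in loss.py can be done outside of TensorFlow. The sketch below is my own illustration and not a file in this repository (the name wing_loss_np is made up); it evaluates the same expression element-wise with NumPy and shows how, compared with plain L1, small errors are amplified while large errors still grow only linearly:

import numpy as np

def wing_loss_np(x, w=10.0, epsilon=2.0):
    # same piecewise definition as in loss.py:
    # log region for |x| < w, shifted L1 region otherwise
    c = w * (1.0 - np.log(1.0 + w / epsilon))
    absolute_x = np.abs(x)
    return np.where(absolute_x < w, w * np.log(1.0 + absolute_x / epsilon), absolute_x - c)

errors = np.array([0.1, 1.0, 5.0, 20.0])
print(wing_loss_np(errors))  # approximately [0.49, 4.05, 12.53, 27.92]
print(np.abs(errors))        # plain L1 for comparison: [0.1, 1.0, 5.0, 20.0]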