├── .gitignore
├── LICENSE
├── README.md
├── config.json
├── create_pb.py
├── data
│   ├── create_tfrecords.py
│   ├── explore_and_prepare_CelebA.ipynb
│   ├── procrustes.py
│   └── test_input_pipeline.ipynb
├── inference
│   ├── example.jpg
│   ├── face_detector.py
│   ├── landmark_detector.py
│   ├── the_office.jpg
│   └── try_detector.ipynb
├── input_pipeline
│   ├── __init__.py
│   ├── augmentations.py
│   └── input_pipeline.py
├── loss.py
├── metrics.py
├── model.py
├── network.py
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | __pycache__
3 | models/
4 | export/
5 | *.pb
6 | *.csv
7 | *.npy
8 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Dan Antoshchenko
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Wing Loss
2 | 
3 | This is an implementation of the loss function from
4 | [Wing Loss for Robust Facial Landmark Localisation with Convolutional Neural Networks](https://arxiv.org/abs/1711.06753).
5 | 
6 | ## How to use a pretrained model
7 | 1. Download a pretrained model from [here](https://drive.google.com/drive/folders/1yCGoE6wC8ZOVDX8DekkEtZZIxfP0wnVU?usp=sharing).
8 | 2. See an example of usage in `inference/try_detector.ipynb`.
9 | 
10 | ## Example
11 | ![example](inference/example.jpg)
12 | 
13 | ## Notes
14 | 1. I didn't train on any of the datasets used in the paper.
15 | 2. I simply trained on the CelebA dataset (it has five landmark locations for each face).
16 | 3. I use a detector from [here](https://github.com/TropComplique/FaceBoxes-tensorflow) to detect faces.
17 | 4. The inference speed is ~0.15 ms per image (video card is NVIDIA GeForce GTX 1080 Ti, batch size is 8).
18 | 5. I used Procrustes analysis for data balancing (see `data/explore_and_prepare_CelebA.ipynb`).
19 | 
20 | ## Requirements
21 | 1. tensorflow 1.12
22 | 2. 
numpy, Pillow, tqdm 23 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_dir": "models/run00", 3 | 4 | "num_landmarks": 5, "weight_decay": 1e-4, 5 | "epsilon": 2.0, "w": 10.0, 6 | "initial_lr": 4e-4, 7 | 8 | "image_size": [64, 64], 9 | "batch_size": 16, 10 | "train_dataset": "/mnt/datasets/dan/CelebA/train_shards/", 11 | "val_dataset": "/mnt/datasets/dan/CelebA/val_shards/", 12 | "num_steps": 180000 13 | } 14 | -------------------------------------------------------------------------------- /create_pb.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import os 3 | import shutil 4 | import json 5 | from model import model_fn 6 | 7 | 8 | """ 9 | The purpose of this script is to export 10 | the inference graph as a SavedModel. 11 | 12 | Also it creates a .pb frozen inference graph. 13 | """ 14 | 15 | 16 | OUTPUT_FOLDER = 'export/' # for savedmodel 17 | PB_FILE_PATH = 'inference/model.pb' 18 | CONFIG = 'config.json' 19 | GPU_TO_USE = '0' 20 | 21 | params = json.load(open(CONFIG)) 22 | WIDTH, HEIGHT = params['image_size'] 23 | 24 | 25 | def export_savedmodel(): 26 | config = tf.ConfigProto() 27 | config.gpu_options.visible_device_list = GPU_TO_USE 28 | run_config = tf.estimator.RunConfig() 29 | run_config = run_config.replace( 30 | model_dir=params['model_dir'], 31 | session_config=config 32 | ) 33 | estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config) 34 | 35 | def serving_input_receiver_fn(): 36 | images = tf.placeholder(dtype=tf.uint8, shape=[None, HEIGHT, WIDTH, 3], name='images') 37 | features = tf.to_float(images) * (1.0/255.0) 38 | return tf.estimator.export.TensorServingInputReceiver(features=features, receiver_tensors={'images': images}) 39 | 40 | shutil.rmtree(OUTPUT_FOLDER, ignore_errors=True) 41 | os.mkdir(OUTPUT_FOLDER) 42 | estimator.export_savedmodel(OUTPUT_FOLDER, serving_input_receiver_fn) 43 | 44 | 45 | def convert_to_pb(): 46 | 47 | subfolders = os.listdir(OUTPUT_FOLDER) 48 | assert len(subfolders) == 1 49 | last_saved_model = os.path.join(OUTPUT_FOLDER, subfolders[0]) 50 | 51 | graph = tf.Graph() 52 | config = tf.ConfigProto() 53 | config.gpu_options.visible_device_list = GPU_TO_USE 54 | 55 | with graph.as_default(): 56 | with tf.Session(graph=graph, config=config) as sess: 57 | tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], last_saved_model) 58 | 59 | # output ops 60 | keep_nodes = ['landmarks'] 61 | 62 | input_graph_def = tf.graph_util.convert_variables_to_constants( 63 | sess, graph.as_graph_def(), 64 | output_node_names=keep_nodes 65 | ) 66 | output_graph_def = tf.graph_util.remove_training_nodes( 67 | input_graph_def, protected_nodes=keep_nodes 68 | ) 69 | 70 | with tf.gfile.GFile(PB_FILE_PATH, 'wb') as f: 71 | f.write(output_graph_def.SerializeToString()) 72 | print('%d ops in the final graph.' 
% len(output_graph_def.node)) 73 | 74 | 75 | tf.logging.set_verbosity('INFO') 76 | export_savedmodel() 77 | convert_to_pb() 78 | -------------------------------------------------------------------------------- /data/create_tfrecords.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import PIL.Image 4 | import tensorflow as tf 5 | import json 6 | import shutil 7 | import random 8 | import math 9 | import argparse 10 | from tqdm import tqdm 11 | 12 | 13 | """ 14 | The purpose of this script is to create a set of .tfrecords files 15 | from a folder of images and a folder of annotations. 16 | Annotations are in the json format. 17 | Images must have .jpg or .jpeg filename extension. 18 | 19 | Example of a json annotation (with filename "132416.json"): 20 | { 21 | "box": {"ymin": 1, "ymax": 248, "xmax": 1149, "xmin": 1014}, 22 | "landmarks": [[102, 98], [135, 109], [121, 132], [85, 134], [117, 144]] 23 | "filename": "132416.jpg", 24 | "size": {"depth": 3, "width": 356, "height": 570} 25 | } 26 | 27 | Landmarks are in the following order: 28 | [[lefteye_x lefteye_y] 29 | [righteye_x righteye_y] 30 | [nose_x nose_y] 31 | [leftmouth_x leftmouth_y] 32 | [rightmouth_x rightmouth_y]] 33 | 34 | Example of use: 35 | python create_tfrecords.py \ 36 | --image_dir=/mnt/datasets/dan/CelebA/train/images/ \ 37 | --annotations_dir=/mnt/datasets/dan/CelebA/train/annotations/ \ 38 | --output=/mnt/datasets/dan/CelebA/train_shards/ \ 39 | --num_shards=800 40 | 41 | python create_tfrecords.py \ 42 | --image_dir=/mnt/datasets/dan/CelebA/val/images/ \ 43 | --annotations_dir=/mnt/datasets/dan/CelebA/val/annotations/ \ 44 | --output=/mnt/datasets/dan/CelebA/val_shards/ \ 45 | --num_shards=1 46 | """ 47 | 48 | 49 | def make_args(): 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument('-i', '--image_dir', type=str) 52 | parser.add_argument('-a', '--annotations_dir', type=str) 53 | parser.add_argument('-o', '--output', type=str) 54 | parser.add_argument('-s', '--num_shards', type=int, default=1) 55 | return parser.parse_args() 56 | 57 | 58 | def dict_to_tf_example(annotation, image_dir): 59 | """Convert dict to tf.Example proto. 60 | 61 | Notice that this function normalizes the bounding 62 | box coordinates provided by the raw data. 63 | 64 | Arguments: 65 | data: a dict. 66 | image_dir: a string, path to the image directory. 67 | Returns: 68 | an instance of tf.Example. 
69 | """ 70 | image_name = annotation['filename'] 71 | assert image_name.endswith('.jpg') or image_name.endswith('.jpeg') 72 | 73 | image_path = os.path.join(image_dir, image_name) 74 | with tf.gfile.GFile(image_path, 'rb') as f: 75 | encoded_jpg = f.read() 76 | 77 | # check image format 78 | encoded_jpg_io = io.BytesIO(encoded_jpg) 79 | image = PIL.Image.open(encoded_jpg_io) 80 | assert image.format == 'JPEG' 81 | assert image.mode == 'RGB' 82 | 83 | width = int(annotation['size']['width']) 84 | height = int(annotation['size']['height']) 85 | assert width > 0 and height > 0 86 | assert image.size[0] == width and image.size[1] == height 87 | 88 | ymin = float(annotation['box']['ymin'])/height 89 | xmin = float(annotation['box']['xmin'])/width 90 | ymax = float(annotation['box']['ymax'])/height 91 | xmax = float(annotation['box']['xmax'])/width 92 | assert (ymin < ymax) and (xmin < xmax) 93 | 94 | # note that i reversing the order of the coordinates here 95 | landmarks = annotation['landmarks'] 96 | landmarks_flattened = [] 97 | for x, y in landmarks: 98 | y, x = y/height, x/width 99 | assert y <= ymax and y >= ymin 100 | assert x <= xmax and x >= xmin 101 | landmarks_flattened.extend([y, x]) 102 | 103 | example = tf.train.Example(features=tf.train.Features(feature={ 104 | 'image': _bytes_feature(encoded_jpg), 105 | 'xmin': _float_feature(xmin), 106 | 'xmax': _float_feature(xmax), 107 | 'ymin': _float_feature(ymin), 108 | 'ymax': _float_feature(ymax), 109 | 'landmarks': _float_list_feature(landmarks_flattened), 110 | })) 111 | return example 112 | 113 | 114 | def _bytes_feature(value): 115 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 116 | 117 | 118 | def _float_list_feature(value): 119 | return tf.train.Feature(float_list=tf.train.FloatList(value=value)) 120 | 121 | 122 | def _float_feature(value): 123 | return tf.train.Feature(float_list=tf.train.FloatList(value=[value])) 124 | 125 | 126 | def main(): 127 | ARGS = make_args() 128 | 129 | image_dir = ARGS.image_dir 130 | annotations_dir = ARGS.annotations_dir 131 | print('Reading images from:', image_dir) 132 | print('Reading annotations from:', annotations_dir, '\n') 133 | 134 | examples_list = os.listdir(annotations_dir) 135 | random.shuffle(examples_list) 136 | num_examples = len(examples_list) 137 | print('Number of images:', num_examples) 138 | 139 | num_shards = ARGS.num_shards 140 | shard_size = math.ceil(num_examples/num_shards) 141 | print('Number of images per shard:', shard_size) 142 | 143 | output_dir = ARGS.output 144 | shutil.rmtree(output_dir, ignore_errors=True) 145 | os.mkdir(output_dir) 146 | 147 | shard_id = 0 148 | num_examples_written = 0 149 | for example in tqdm(examples_list): 150 | 151 | if num_examples_written == 0: 152 | shard_path = os.path.join(output_dir, 'shard-%04d.tfrecords' % shard_id) 153 | writer = tf.python_io.TFRecordWriter(shard_path) 154 | 155 | path = os.path.join(annotations_dir, example) 156 | annotation = json.load(open(path)) 157 | tf_example = dict_to_tf_example(annotation, image_dir) 158 | writer.write(tf_example.SerializeToString()) 159 | num_examples_written += 1 160 | 161 | if num_examples_written == shard_size: 162 | shard_id += 1 163 | num_examples_written = 0 164 | writer.close() 165 | 166 | if num_examples_written != 0: 167 | writer.close() 168 | 169 | print('Result is here:', ARGS.output) 170 | 171 | 172 | main() 173 | -------------------------------------------------------------------------------- /data/explore_and_prepare_CelebA.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import json\n", 11 | "from PIL import Image, ImageDraw\n", 12 | "import os\n", 13 | "import cv2\n", 14 | "import pandas as pd\n", 15 | "from tqdm import tqdm\n", 16 | "import shutil\n", 17 | "import random\n", 18 | "\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "%matplotlib inline\n", 21 | "\n", 22 | "from procrustes import procrustes\n", 23 | "from sklearn.decomposition import PCA\n", 24 | "\n", 25 | "import sys\n", 26 | "sys.path.append('../inference/')\n", 27 | "from face_detector import FaceDetector\n", 28 | "# this face detector is taken from here\n", 29 | "# https://github.com/TropComplique/FaceBoxes-tensorflow\n", 30 | "# (facial keypoints detector will be trained to work well with this detector)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "The purpose of this script is to explore images/annotations of the CelebA dataset. \n", 38 | "Also it cleans CelebA. \n", 39 | "Also it converts annotations into json format." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "IMAGES_DIR = '/home/gpu2/hdd/dan/CelebA/img_celeba.7z/out/'\n", 49 | "ANNOTATIONS_PATH = '/home/gpu2/hdd/dan/CelebA/list_landmarks_celeba.txt'\n", 50 | "SPLIT_PATH = '/home/gpu2/hdd/dan/CelebA/list_eval_partition.txt'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "# Read data" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# collect paths to all images\n", 67 | "\n", 68 | "all_paths = []\n", 69 | "for name in tqdm(os.listdir(IMAGES_DIR)):\n", 70 | " all_paths.append(os.path.join(IMAGES_DIR, name))\n", 71 | "\n", 72 | "metadata = pd.DataFrame(all_paths, columns=['full_path'])\n", 73 | "\n", 74 | "# strip root folder\n", 75 | "metadata['name'] = metadata.full_path.apply(lambda x: os.path.relpath(x, IMAGES_DIR))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# number of images is taken from the official website\n", 85 | "assert len(metadata) == 202599" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# see all unique endings\n", 95 | "metadata.name.apply(lambda x: x.split('.')[-1]).unique()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Detect a face on each image" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# load faceboxes detector\n", 112 | "face_detector = FaceDetector('../inference/model-step-240000.pb', visible_device_list='0')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "detections = []\n", 122 | "for p in tqdm(metadata.full_path):\n", 123 | " image = cv2.imread(p)\n", 124 | " image = image[:, :, [2, 1, 0]] # to RGB\n", 125 | " detections.append(face_detector(image))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | 
"execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# take only images where one high confidence box is detected\n", 135 | "bad_images = [metadata.name[i] for i, (b, s) in enumerate(detections) if len(b) != 1 or s.max() < 0.5]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "boxes = {}\n", 145 | "for n, (box, score) in zip(metadata.name, detections):\n", 146 | " if n not in bad_images:\n", 147 | " ymin, xmin, ymax, xmax = box[0]\n", 148 | " boxes[n] = (xmin, ymin, xmax, ymax)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Read keypoints from annotations" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "def get_numbers(s):\n", 165 | " s = s.strip().split(' ')\n", 166 | " return [s[0]] + [int(i) for i in s[1:] if i]\n", 167 | " \n", 168 | "with open(ANNOTATIONS_PATH, 'r') as f:\n", 169 | " content = f.readlines()\n", 170 | " content = content[2:]\n", 171 | " content = [get_numbers(s) for s in content]\n", 172 | "\n", 173 | "landmarks = {}\n", 174 | "more_bad_images = []\n", 175 | "for i in content:\n", 176 | " name = i[0]\n", 177 | " \n", 178 | " keypoints = [\n", 179 | " [i[1], i[2]], # lefteye_x lefteye_y \n", 180 | " [i[3], i[4]], # righteye_x righteye_y\n", 181 | " [i[5], i[6]], # nose_x nose_y \n", 182 | " [i[7], i[8]], # leftmouth_x leftmouth_y\n", 183 | " [i[9], i[10]], # rightmouth_x rightmouth_y\n", 184 | " ]\n", 185 | " \n", 186 | " # assert that landmarks are inside the box\n", 187 | " if name in bad_images:\n", 188 | " continue\n", 189 | " xmin, ymin, xmax, ymax = boxes[name]\n", 190 | " points = np.array(keypoints)\n", 191 | " is_normal = (points[:, 0] > xmin).all() and\\\n", 192 | " (points[:, 0] < xmax).all() and\\\n", 193 | " (points[:, 1] > ymin).all() and\\\n", 194 | " (points[:, 1] < ymax).all()\n", 195 | " if not is_normal:\n", 196 | " more_bad_images.append(name)\n", 197 | "\n", 198 | " landmarks[name] = keypoints " 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# number of weird landmarks\n", 208 | "len(more_bad_images)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "to_remove = more_bad_images + bad_images\n", 218 | "metadata = metadata.loc[~metadata.name.isin(to_remove)]\n", 219 | "metadata = metadata.reset_index(drop=True)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# backup results\n", 229 | "metadata.to_csv('metadata.csv')\n", 230 | "np.save('boxes.npy', boxes)\n", 231 | "np.save('landmarks.npy', landmarks)\n", 232 | "np.save('to_remove.npy', to_remove)\n", 233 | "\n", 234 | "# metadata = pd.read_csv('metadata.csv', index_col=0)\n", 235 | "# boxes = np.load('boxes.npy')[()]\n", 236 | "# landmarks = np.load('landmarks.npy')[()]\n", 237 | "# to_remove = np.load('to_remove.npy')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "# size after cleaning\n", 247 | "len(metadata)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | 
"source": [ 254 | "# Show some bounding boxes and landmarks" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "def draw_boxes_on_image(path, box, keypoints):\n", 264 | "\n", 265 | " image = Image.open(path)\n", 266 | " draw = ImageDraw.Draw(image, 'RGBA')\n", 267 | "\n", 268 | " xmin, ymin, xmax, ymax = box\n", 269 | " fill = (255, 255, 255, 45)\n", 270 | " outline = 'red'\n", 271 | " draw.rectangle(\n", 272 | " [(xmin, ymin), (xmax, ymax)],\n", 273 | " fill=fill, outline=outline\n", 274 | " )\n", 275 | " \n", 276 | " for x, y in keypoints:\n", 277 | " draw.ellipse([\n", 278 | " (x - 2.0, y - 2.0),\n", 279 | " (x + 2.0, y + 2.0)\n", 280 | " ], outline='red')\n", 281 | "\n", 282 | " return image" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "scrolled": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "i = random.randint(0, len(metadata) - 1) # choose a random image\n", 294 | "some_boxes = boxes[metadata.name[i]]\n", 295 | "keypoints = landmarks[metadata.name[i]]\n", 296 | "draw_boxes_on_image(metadata.full_path[i], some_boxes, keypoints)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "# Procrustes analysis (Pose-based Data Balancing strategy)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "landmarks_array = []\n", 313 | "boxes_array = []\n", 314 | "for n in metadata.name:\n", 315 | " landmarks_array.append(np.array(landmarks[n]))\n", 316 | " boxes_array.append(np.array(boxes[n]))\n", 317 | "\n", 318 | "landmarks_array = np.stack(landmarks_array, axis=0)\n", 319 | "landmarks_array = landmarks_array.astype('float32')\n", 320 | "boxes_array = np.stack(boxes_array)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "mean_shape = landmarks_array.mean(0) # reference shape\n", 330 | "num_images = len(landmarks_array)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "aligned = []\n", 340 | "for shape in tqdm(landmarks_array):\n", 341 | " Z, _ = procrustes(mean_shape, shape, reflection=False)\n", 342 | " aligned.append(Z)\n", 343 | "aligned = np.stack(aligned)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "pca = PCA(n_components=1)\n", 353 | "projected = pca.fit_transform(aligned.reshape((-1, 10)))\n", 354 | "projected = projected[:, 0]\n", 355 | "\n", 356 | "plt.hist(projected, bins=40);" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "# frontal faces:\n", 366 | "indices = np.where(np.abs(projected) < 5)[0]\n", 367 | "\n", 368 | "# faces turned to the left:\n", 369 | "# indices = np.where(projected > 15)[0]\n", 370 | "\n", 371 | "# faces turned to the right:\n", 372 | "# indices = np.where(projected < -30)[0]\n", 373 | "\n", 374 | "i = indices[random.randint(0, len(indices) - 1)]\n", 375 | "some_boxes = boxes[metadata.name[i]]\n", 376 | "keypoints = landmarks[metadata.name[i]]\n", 377 | "draw_boxes_on_image(metadata.full_path[i], some_boxes, keypoints)" 378 | ] 379 | 
}, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# it is not strictly a yaw angle\n", 387 | "metadata['yaw'] = projected" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "# Create train-val split" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "split = pd.read_csv(SPLIT_PATH, header=None, sep=' ')\n", 404 | "split.columns = ['name', 'assignment']" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "split = split.loc[~split.name.isin(to_remove)]\n", 414 | "split = split.reset_index(drop=True)" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "split.assignment.value_counts()" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "# \"0\" represents training image, \"1\" represents validation image, \"2\" represents testing image\n", 433 | "train = list(split.loc[split.assignment.isin([0, 1]), 'name'])\n", 434 | "val = list(split.loc[split.assignment.isin([2]), 'name'])" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "# Upsample rare poses" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "metadata['is_train'] = metadata.name.isin(train).astype('int')" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "bins = [metadata.yaw.min() - 1.0, -20.0, -5.0, 5.0, 20.0, metadata.yaw.max() + 1.0]\n", 460 | "metadata['bin'] = pd.cut(metadata.yaw, bins, labels=False)\n", 461 | "metadata.loc[metadata.is_train == 1, 'bin'].value_counts()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "bins_to_upsample = [0, 1, 3, 4]\n", 471 | "num_samples = 80000" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "val_metadata = metadata.loc[metadata.is_train == 0]" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "upsampled = [metadata.loc[(metadata.is_train == 1) & (metadata.bin == 2)]]\n", 490 | "for b in bins_to_upsample:\n", 491 | " to_use = (metadata.is_train == 1) & (metadata.bin == b)\n", 492 | " m = metadata.loc[to_use].sample(n=num_samples, replace=True)\n", 493 | " upsampled.append(m)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "upsampled = pd.concat(upsampled)\n", 503 | "upsampled.bin.value_counts()" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "metadata = pd.concat([upsampled, val_metadata])" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "# Convert" 520 | ] 521 | }, 
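{
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cells below crop each image to the detected face box enlarged by one box-width on the left/right and one box-height on the top/bottom (clipped to the image borders), then shift the box and landmark annotations into the crop's coordinate frame and write them as json in the format expected by `data/create_tfrecords.py`."
   ]
  },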
522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "def get_annotation(name, new_name, width, height, translation):\n", 529 | " xmin, ymin, xmax, ymax = boxes[name]\n", 530 | " keypoints = landmarks[name]\n", 531 | " \n", 532 | " tx, ty = translation\n", 533 | " keypoints = [[p[0] - tx, p[1] - ty]for p in keypoints]\n", 534 | " xmin, ymin = xmin - tx, ymin - ty\n", 535 | " xmax, ymax = xmax - tx, ymax - ty\n", 536 | " \n", 537 | " annotation = {\n", 538 | " \"filename\": new_name,\n", 539 | " \"size\": {\"depth\": 3, \"width\": width, \"height\": height},\n", 540 | " \"box\": {\"ymin\": int(ymin), \"ymax\": int(ymax), \"xmax\": int(xmax), \"xmin\": int(xmin)},\n", 541 | " \"landmarks\": keypoints\n", 542 | " }\n", 543 | " return annotation" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# create folders for the converted dataset\n", 553 | "TRAIN_DIR = '/mnt/datasets/dan/CelebA/train/'\n", 554 | "shutil.rmtree(TRAIN_DIR, ignore_errors=True)\n", 555 | "os.mkdir(TRAIN_DIR)\n", 556 | "os.mkdir(os.path.join(TRAIN_DIR, 'images'))\n", 557 | "os.mkdir(os.path.join(TRAIN_DIR, 'annotations'))\n", 558 | "\n", 559 | "VAL_DIR = '/mnt/datasets/dan/CelebA/val/'\n", 560 | "shutil.rmtree(VAL_DIR, ignore_errors=True)\n", 561 | "os.mkdir(VAL_DIR)\n", 562 | "os.mkdir(os.path.join(VAL_DIR, 'images'))\n", 563 | "os.mkdir(os.path.join(VAL_DIR, 'annotations'))" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": { 570 | "scrolled": true 571 | }, 572 | "outputs": [], 573 | "source": [ 574 | "counter = 0\n", 575 | "\n", 576 | "for T in tqdm(metadata.itertuples()):\n", 577 | " \n", 578 | " # get width and height of an image\n", 579 | " image = cv2.imread(T.full_path)\n", 580 | " h, w, c = image.shape\n", 581 | " assert c == 3\n", 582 | " \n", 583 | " # name of the image\n", 584 | " name = T.name\n", 585 | " assert name.endswith('.jpg')\n", 586 | " \n", 587 | " if name in train:\n", 588 | " result_dir = TRAIN_DIR\n", 589 | " elif name in val:\n", 590 | " result_dir = VAL_DIR\n", 591 | " else:\n", 592 | " print('WTF')\n", 593 | " break\n", 594 | " \n", 595 | " # crop the image to save space\n", 596 | " xmin, ymin, xmax, ymax = boxes[name]\n", 597 | " width, height = xmax - xmin, ymax - ymin\n", 598 | " assert width > 0 and height > 0\n", 599 | " xmin = max(int(xmin - width), 0)\n", 600 | " ymin = max(int(ymin - height), 0)\n", 601 | " xmax = min(int(xmax + width), w)\n", 602 | " ymax = min(int(ymax + height), h)\n", 603 | " crop = image[ymin:ymax, xmin:xmax, :]\n", 604 | " \n", 605 | " # we need to transform annotations after cropping\n", 606 | " translation = [xmin, ymin]\n", 607 | " \n", 608 | " # we need to rename images because of upsampling\n", 609 | " new_name = str(counter) + '.jpg'\n", 610 | " counter += 1\n", 611 | " cv2.imwrite(os.path.join(result_dir, 'images', new_name), crop)\n", 612 | "\n", 613 | " # save annotation for it\n", 614 | " d = get_annotation(name, new_name, xmax - xmin, ymax - ymin, translation)\n", 615 | " json_name = new_name[:-4] + '.json'\n", 616 | " json.dump(d, open(os.path.join(result_dir, 'annotations', json_name), 'w'))" 617 | ] 618 | } 619 | ], 620 | "metadata": { 621 | "kernelspec": { 622 | "display_name": "Python 3", 623 | "language": "python", 624 | "name": "python3" 625 | }, 626 | "language_info": { 627 | "codemirror_mode": { 628 | "name": 
"ipython", 629 | "version": 3 630 | }, 631 | "file_extension": ".py", 632 | "mimetype": "text/x-python", 633 | "name": "python", 634 | "nbconvert_exporter": "python", 635 | "pygments_lexer": "ipython3", 636 | "version": "3.6.3" 637 | } 638 | }, 639 | "nbformat": 4, 640 | "nbformat_minor": 1 641 | } 642 | -------------------------------------------------------------------------------- /data/procrustes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def procrustes(X, Y, scaling=True, reflection='best'): 5 | """ 6 | This function is taken from here: 7 | https://stackoverflow.com/a/18927641. 8 | 9 | It aligns a shape defined by Y to a shape defined by X. 10 | # Y_transformed = b * YT + c 11 | 12 | Procrustes analysis determines a linear transformation (translation, 13 | reflection, orthogonal rotation and scaling) of the points in Y to best 14 | conform them to the points in matrix X, using the sum of squared errors 15 | as the goodness of fit criterion. 16 | 17 | Arguments: 18 | X, Y: float numpy arrays with shape [n, p]. 19 | scaling: a boolean, if False, the scaling 20 | component of the transformation is forced to 1. 21 | reflection: a string or boolean, 22 | possible values are 'best', False, True. 23 | if 'best' (default), the transformation solution may or may not 24 | include a reflection component, depending on which fits the data 25 | best. setting reflection to True or False forces a solution with 26 | reflection or no reflection respectively. 27 | Returns: 28 | Z: a float numpy array with shape [n, p]. 29 | transform: a dict specifying the rotation, translation 30 | and scaling that maps X --> Y. 31 | """ 32 | muX = X.mean(0) 33 | muY = Y.mean(0) 34 | 35 | # center shapes 36 | X0 = X - muX 37 | Y0 = Y - muY 38 | 39 | # compute centered frobenius norm 40 | normX = np.sqrt((X0**2).sum()) 41 | normY = np.sqrt((Y0**2).sum()) 42 | 43 | # scale to equal (unit) norm 44 | X0 /= normX 45 | Y0 /= normY 46 | 47 | # get an optimal rotation matrix of Y 48 | A = np.matmul(X0.T, Y0) # shape [p, p] 49 | U, s, Vt = np.linalg.svd(A, full_matrices=False) 50 | # they have shapes [p, k], [k], [k, p] 51 | V = Vt.T 52 | T = np.matmul(V, U.T) # shape [p, p] 53 | # T is orthogonal 54 | 55 | if reflection is not 'best': 56 | 57 | # does the current solution use a reflection? 
58 | has_reflection = np.linalg.det(T) < 0 59 | 60 | # if that's not what was specified, force another reflection 61 | if reflection != has_reflection: 62 | V[:, -1] *= -1 63 | s[-1] *= -1 # the smallest singular value 64 | T = np.matmul(V, U.T) 65 | 66 | traceTA = s.sum() # trace of TA 67 | 68 | if scaling: 69 | 70 | # optimal scaling of Y 71 | b = traceTA * normX / normY 72 | 73 | # transformed coords 74 | Z = normX * traceTA * np.matmul(Y0, T) + muX 75 | 76 | # frobenius_norm(Y0 * T) = frobenius_norm(Y0) = 1 77 | 78 | else: 79 | b = 1 80 | Z = normY * np.matmul(Y0, T) + muX 81 | 82 | c = muX - b * np.matmul(muY, T) 83 | 84 | transform = {'rotation': T, 'scale': b, 'translation': c} 85 | """ 86 | b * Y * T + c = 87 | = b * (Y0 * normY + muY) * T + muX - b * muY * T = 88 | = b * normY * Y0 * T + muX = 89 | = Z 90 | """ 91 | return Z, transform 92 | -------------------------------------------------------------------------------- /data/test_input_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import tensorflow as tf\n", 20 | "import numpy as np\n", 21 | "from PIL import Image, ImageDraw\n", 22 | "import json\n", 23 | "\n", 24 | "import sys\n", 25 | "sys.path.append('..')\n", 26 | "from input_pipeline import Pipeline" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# Get images and boxes" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "scrolled": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "tf.reset_default_graph()\n", 45 | "\n", 46 | "pipeline = Pipeline(\n", 47 | " ['/mnt/datasets/dan/CelebA/train_shards/shard-0000.tfrecords'],\n", 48 | " batch_size=100, image_size=(64, 64),\n", 49 | " num_landmarks=5, repeat=True, \n", 50 | " shuffle=False, augmentation=True\n", 51 | ")\n", 52 | "\n", 53 | "dataset = pipeline.dataset\n", 54 | "iterator = tf.data.Iterator.from_structure(dataset.output_types, dataset.output_shapes)\n", 55 | "init = iterator.make_initializer(dataset)\n", 56 | "features, labels = iterator.get_next()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "with tf.Session() as sess:\n", 66 | " sess.run(init)\n", 67 | " I, L = sess.run([features, labels])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "# Show an augmented image" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "def draw_on_image(image, keypoints):\n", 84 | "\n", 85 | " image_copy = image.copy()\n", 86 | " draw = ImageDraw.Draw(image_copy, 'RGBA')\n", 87 | " \n", 88 | " for i, (y, x) in enumerate(keypoints):\n", 89 | " draw.ellipse([\n", 90 | " (x - 1.0, y - 1.0),\n", 91 | " (x + 1.0, y + 1.0)\n", 92 | " ], outline='red')\n", 93 | " draw.text((x, y), text=str(i), fill='blue')\n", 94 | "\n", 95 | " return image_copy" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# choose an image\n", 105 | "i = 0\n", 106 | 
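    "# the pipeline outputs landmarks as (y, x) pairs normalized to [0, 1],\n",
    "# so multiplying by [h, w] below converts them to pixel coordinates\n",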
"image = Image.fromarray((I[i]*255.0).astype('uint8'))\n", 107 | "w, h = image.size\n", 108 | "keypoints = L[i]*np.array([h, w])\n", 109 | "draw_on_image(image, keypoints)" 110 | ] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 3", 116 | "language": "python", 117 | "name": "python3" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 3 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython3", 129 | "version": "3.6.3" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 2 134 | } 135 | -------------------------------------------------------------------------------- /inference/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/wing-loss/d7335610d26cf805bf5a20ae0d70df5de85d1521/inference/example.jpg -------------------------------------------------------------------------------- /inference/face_detector.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | """ 6 | This is a face detector taken from here: 7 | https://github.com/TropComplique/FaceBoxes-tensorflow 8 | """ 9 | 10 | 11 | class FaceDetector: 12 | def __init__(self, model_path, gpu_memory_fraction=0.25, visible_device_list='0'): 13 | """ 14 | Arguments: 15 | model_path: a string, path to a pb file. 16 | gpu_memory_fraction: a float number. 17 | visible_device_list: a string. 18 | """ 19 | with tf.gfile.GFile(model_path, 'rb') as f: 20 | graph_def = tf.GraphDef() 21 | graph_def.ParseFromString(f.read()) 22 | 23 | graph = tf.Graph() 24 | with graph.as_default(): 25 | tf.import_graph_def(graph_def, name='import') 26 | 27 | self.input_image = graph.get_tensor_by_name('import/image_tensor:0') 28 | self.output_ops = [ 29 | graph.get_tensor_by_name('import/boxes:0'), 30 | graph.get_tensor_by_name('import/scores:0'), 31 | graph.get_tensor_by_name('import/num_boxes:0'), 32 | ] 33 | 34 | gpu_options = tf.GPUOptions( 35 | per_process_gpu_memory_fraction=gpu_memory_fraction, 36 | visible_device_list=visible_device_list 37 | ) 38 | config_proto = tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False) 39 | self.sess = tf.Session(graph=graph, config=config_proto) 40 | 41 | def __call__(self, image, score_threshold=0.5): 42 | """Detect faces. 43 | 44 | Arguments: 45 | image: a numpy uint8 array with shape [height, width, 3], 46 | that represents a RGB image. 47 | score_threshold: a float number. 48 | Returns: 49 | boxes: a float numpy array of shape [num_faces, 4]. 50 | scores: a float numpy array of shape [num_faces]. 51 | 52 | Note that box coordinates are in the order: ymin, xmin, ymax, xmax! 
53 | """ 54 | h, w, _ = image.shape 55 | image = np.expand_dims(image, 0) 56 | 57 | boxes, scores, num_boxes = self.sess.run( 58 | self.output_ops, feed_dict={self.input_image: image} 59 | ) 60 | num_boxes = num_boxes[0] 61 | boxes = boxes[0][:num_boxes] 62 | scores = scores[0][:num_boxes] 63 | 64 | to_keep = scores > score_threshold 65 | boxes = boxes[to_keep] 66 | scores = scores[to_keep] 67 | 68 | scaler = np.array([h, w, h, w], dtype='float32') 69 | boxes = boxes * scaler 70 | 71 | return boxes, scores 72 | -------------------------------------------------------------------------------- /inference/landmark_detector.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class KeypointDetector: 6 | def __init__(self, model_path, gpu_memory_fraction=0.25, visible_device_list='0'): 7 | """ 8 | Arguments: 9 | model_path: a string, path to a pb file. 10 | gpu_memory_fraction: a float number. 11 | visible_device_list: a string. 12 | """ 13 | with tf.gfile.GFile(model_path, 'rb') as f: 14 | graph_def = tf.GraphDef() 15 | graph_def.ParseFromString(f.read()) 16 | 17 | graph = tf.Graph() 18 | with graph.as_default(): 19 | tf.import_graph_def(graph_def, name='import') 20 | 21 | self.input_image = graph.get_tensor_by_name('import/images:0') 22 | self.output = graph.get_tensor_by_name('import/landmarks:0') 23 | 24 | gpu_options = tf.GPUOptions( 25 | per_process_gpu_memory_fraction=gpu_memory_fraction, 26 | visible_device_list=visible_device_list 27 | ) 28 | config_proto = tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False) 29 | self.sess = tf.Session(graph=graph, config=config_proto) 30 | 31 | def __call__(self, images): 32 | """ 33 | Arguments: 34 | images: a numpy uint8 array with shape [b, 64, 64, 3], 35 | that represents a batch of RGB images. 36 | Returns: 37 | a float numpy array of shape [b, 5, 2]. 38 | 39 | Note that points coordinates are in the order: (y, x). 40 | Also coordinates are relative to the image (in the [0, 1] range). 41 | """ 42 | landmarks = self.sess.run(self.output, feed_dict={self.input_image: images}) 43 | return landmarks 44 | -------------------------------------------------------------------------------- /inference/the_office.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TropComplique/wing-loss/d7335610d26cf805bf5a20ae0d70df5de85d1521/inference/the_office.jpg -------------------------------------------------------------------------------- /input_pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .input_pipeline import Pipeline 2 | -------------------------------------------------------------------------------- /input_pipeline/augmentations.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import cv2 3 | import math 4 | 5 | 6 | """ 7 | `image` is assumed to be a float tensor with shape [height, width, 3], 8 | it is a RGB image with pixel values in range [0, 1]. 9 | `box` is a float tensor with shape [4]. 10 | `landmarks` is a float tensor with shape [num_landmarks, 2]. 
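`box` is in the order [ymin, xmin, ymax, xmax] and `landmarks` are (y, x) pairs;
all coordinates are normalized to the [0, 1] range (the same convention as in input_pipeline.py).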
11 | """ 12 | 13 | 14 | def random_rotation(image, box, landmarks, max_angle=10): 15 | with tf.name_scope('random_rotation'): 16 | # get a random angle 17 | max_angle_radians = max_angle*(math.pi/180.0) 18 | theta = tf.random_uniform( 19 | [], minval=-max_angle_radians, 20 | maxval=max_angle_radians, dtype=tf.float32 21 | ) 22 | 23 | # find the center of the image 24 | image_height = tf.to_float(tf.shape(image)[0]) 25 | image_width = tf.to_float(tf.shape(image)[1]) 26 | scaler = tf.stack([image_height, image_width], axis=0) 27 | center = tf.reshape(0.5*scaler, [1, 2]) 28 | 29 | rotation = tf.stack([ 30 | tf.cos(theta), tf.sin(theta), 31 | -tf.sin(theta), tf.cos(theta) 32 | ], axis=0) 33 | rotation_matrix = tf.reshape(rotation, [2, 2]) 34 | 35 | inverse_rotation = tf.stack([ 36 | tf.cos(theta), -tf.sin(theta), 37 | tf.sin(theta), tf.cos(theta) 38 | ], axis=0) 39 | inverse_rotation_matrix = tf.reshape(inverse_rotation, [2, 2]) 40 | 41 | # now i want to rotate the image and annotations around the image center, 42 | # note: landmark and box coordinates are (y, x) not (x, y) 43 | 44 | # rotate box 45 | ymin, xmin, ymax, xmax = tf.unstack(box, axis=0) 46 | h, w = ymax - ymin, xmax - xmin 47 | box = tf.stack([ 48 | ymin, xmin, ymin, xmax, 49 | ymax, xmax, ymax, xmin 50 | ], axis=0) # four corners 51 | box = tf.matmul(tf.reshape(box, [4, 2])*scaler - center, rotation_matrix) + center 52 | y, x = tf.unstack(box/scaler, axis=1) 53 | ymin, ymax = tf.reduce_min(y), tf.reduce_max(y) 54 | xmin, xmax = tf.reduce_min(x), tf.reduce_max(x) 55 | box = tf.stack([ymin, xmin, ymax, xmax], axis=0) 56 | 57 | # rotate landmarks 58 | landmarks = tf.matmul(landmarks*scaler - center, rotation_matrix) + center 59 | landmarks = landmarks/scaler 60 | 61 | # rotate image 62 | translate = center - tf.matmul(center, inverse_rotation_matrix) 63 | translate_y, translate_x = tf.unstack(tf.squeeze(translate, axis=0), axis=0) 64 | transform = tf.stack([ 65 | tf.cos(theta), -tf.sin(theta), translate_x, 66 | tf.sin(theta), tf.cos(theta), translate_y, 67 | 0.0, 0.0 68 | ]) 69 | image = tf.contrib.image.transform(image, transform, interpolation='BILINEAR') 70 | 71 | return image, box, landmarks 72 | 73 | 74 | def random_box_jitter(box, landmarks, ratio=0.05): 75 | """Randomly jitter bounding box. 76 | 77 | Arguments: 78 | box: a float tensor with shape [4]. 79 | landmarks: a float tensor with shape [num_landmarks, 2]. 80 | ratio: a float number. 81 | The ratio of the box width and height that the corners can jitter. 82 | For example if the width is 100 pixels and ratio is 0.05, 83 | the corners can jitter up to 5 pixels in the x direction. 84 | Returns: 85 | a float tensor with shape [4]. 
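        The jittered box is constrained so that all landmarks remain inside it
        (assuming they were inside the original box).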
86 | """ 87 | with tf.name_scope('random_box_jitter'): 88 | 89 | # get the tight box around all landmarks 90 | y, x = tf.unstack(landmarks, axis=1) 91 | ymin_tight, ymax_tight = tf.reduce_min(y), tf.reduce_max(y) 92 | xmin_tight, xmax_tight = tf.reduce_min(x), tf.reduce_max(x) 93 | # we want to keep landmarks inside the new distorted box 94 | 95 | ymin, xmin, ymax, xmax = tf.unstack(box, axis=0) 96 | box_height, box_width = ymax - ymin, xmax - xmin 97 | 98 | # it is assumed that initially 99 | # all landmarks were inside the box 100 | new_ymin = tf.random_uniform( 101 | [], minval=ymin - box_height * ratio, 102 | maxval=tf.minimum(ymin_tight, ymin + box_height * ratio), 103 | dtype=tf.float32 104 | ) 105 | new_xmin = tf.random_uniform( 106 | [], minval=xmin - box_width * ratio, 107 | maxval=tf.minimum(xmin_tight, xmin + box_width * ratio), 108 | dtype=tf.float32 109 | ) 110 | new_ymax = tf.random_uniform( 111 | [], minval=tf.maximum(ymax_tight, ymax - box_height * ratio), 112 | maxval=ymax + box_height * ratio, 113 | dtype=tf.float32 114 | ) 115 | new_xmax = tf.random_uniform( 116 | [], minval=tf.maximum(xmax_tight, xmax - box_width * ratio), 117 | maxval=xmax + box_width * ratio, 118 | dtype=tf.float32 119 | ) 120 | distorted_box = tf.stack([new_ymin, new_xmin, new_ymax, new_xmax], axis=0) 121 | return distorted_box 122 | 123 | 124 | def random_gaussian_blur(image, probability=0.3, kernel_size=3): 125 | h, w, _ = image.shape.as_list() 126 | 127 | def blur(image): 128 | image = (image*255.0).astype('uint8') 129 | image = cv2.blur(image, (kernel_size, kernel_size)) 130 | return (image/255.0).astype('float32') 131 | 132 | with tf.name_scope('random_gaussian_blur'): 133 | do_it = tf.less(tf.random_uniform([]), probability) 134 | image = tf.cond( 135 | do_it, 136 | lambda: tf.py_func(blur, [image], tf.float32, stateful=False), 137 | lambda: image 138 | ) 139 | image.set_shape([h, w, 3]) # without this shape information is lost 140 | return image 141 | 142 | 143 | def random_color_manipulations(image, probability=0.5, grayscale_probability=0.1): 144 | 145 | def manipulate(image): 146 | br_delta = tf.random_uniform([], -32.0/255.0, 32.0/255.0) 147 | cb_factor = tf.random_uniform([], -0.1, 0.1) 148 | cr_factor = tf.random_uniform([], -0.1, 0.1) 149 | channels = tf.split(axis=2, num_or_size_splits=3, value=image) 150 | red_offset = 1.402 * cr_factor + br_delta 151 | green_offset = -0.344136 * cb_factor - 0.714136 * cr_factor + br_delta 152 | blue_offset = 1.772 * cb_factor + br_delta 153 | channels[0] += red_offset 154 | channels[1] += green_offset 155 | channels[2] += blue_offset 156 | image = tf.concat(axis=2, values=channels) 157 | image = tf.clip_by_value(image, 0.0, 1.0) 158 | return image 159 | 160 | def to_grayscale(image): 161 | image = tf.image.rgb_to_grayscale(image) 162 | image = tf.image.grayscale_to_rgb(image) 163 | return image 164 | 165 | with tf.name_scope('random_color_manipulations'): 166 | do_it = tf.less(tf.random_uniform([]), probability) 167 | image = tf.cond(do_it, lambda: manipulate(image), lambda: image) 168 | 169 | with tf.name_scope('to_grayscale'): 170 | make_gray = tf.less(tf.random_uniform([]), grayscale_probability) 171 | image = tf.cond(make_gray, lambda: to_grayscale(image), lambda: image) 172 | 173 | return image 174 | 175 | 176 | def random_flip_left_right(image, landmarks): 177 | 178 | def flip(image, landmarks): 179 | flipped_image = tf.image.flip_left_right(image) 180 | y, x = tf.unstack(landmarks, axis=1) 181 | flipped_x = tf.subtract(1.0, x) 182 | 
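        # coordinates are normalized to [0, 1], so mirroring the image maps x to 1 - x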
flipped_landmarks = tf.stack([y, flipped_x], axis=1) 183 | 184 | # landmarks order: left_eye, right_eye, nose, left_mouth, right_mouth. 185 | # so, when we flip the image we need to flip some of the landmarks 186 | correct_order = tf.constant([1, 0, 2, 4, 3], dtype=tf.int32) 187 | flipped_landmarks = tf.gather(flipped_landmarks, correct_order) 188 | 189 | return flipped_image, flipped_landmarks 190 | 191 | with tf.name_scope('random_flip_left_right'): 192 | do_it = tf.less(tf.random_uniform([]), 0.5) 193 | image, landmarks = tf.cond(do_it, lambda: flip(image, landmarks), lambda: (image, landmarks)) 194 | return image, landmarks 195 | 196 | 197 | def random_pixel_value_scale(image, minval=0.9, maxval=1.1, probability=0.5): 198 | """This function scales each pixel independently of the other ones. 199 | 200 | Arguments: 201 | image: a float tensor with shape [height, width, 3], 202 | an image with pixel values varying between [0, 1]. 203 | minval: a float number, lower ratio of scaling pixel values. 204 | maxval: a float number, upper ratio of scaling pixel values. 205 | probability: a float number. 206 | Returns: 207 | a float tensor with shape [height, width, 3]. 208 | """ 209 | def random_value_scale(image): 210 | color_coefficient = tf.random_uniform( 211 | tf.shape(image), minval=minval, 212 | maxval=maxval, dtype=tf.float32 213 | ) 214 | image = tf.multiply(image, color_coefficient) 215 | image = tf.clip_by_value(image, 0.0, 1.0) 216 | return image 217 | 218 | with tf.name_scope('random_pixel_value_scale'): 219 | do_it = tf.less(tf.random_uniform([]), probability) 220 | image = tf.cond(do_it, lambda: random_value_scale(image), lambda: image) 221 | return image 222 | -------------------------------------------------------------------------------- /input_pipeline/input_pipeline.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from .augmentations import random_color_manipulations,\ 3 | random_flip_left_right, random_pixel_value_scale, \ 4 | random_gaussian_blur, random_rotation, random_box_jitter 5 | 6 | 7 | SHUFFLE_BUFFER_SIZE = 20000 8 | NUM_THREADS = 8 9 | RESIZE_METHOD = tf.image.ResizeMethod.BILINEAR 10 | 11 | 12 | class Pipeline: 13 | def __init__(self, filenames, batch_size, image_size, num_landmarks, 14 | repeat=False, shuffle=False, augmentation=False): 15 | """ 16 | Arguments: 17 | filenames: a list of strings, paths to tfrecords files. 18 | batch_size: an integer. 19 | image_size: a list with two integers [width, height], 20 | images of this size will be in a batch 21 | num_landmarks: an integer. 22 | repeat: a boolean, whether repeat indefinitely. 23 | shuffle: whether to shuffle the dataset. 24 | augmentation: whether to do data augmentation. 
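
        A minimal usage sketch (the shard path here is just an illustration):
            pipeline = Pipeline(
                ['train_shards/shard-0000.tfrecords'], batch_size=16,
                image_size=(64, 64), num_landmarks=5,
                repeat=True, shuffle=True, augmentation=True
            )
            dataset = pipeline.dataset  # a tf.data.Dataset of (images, landmarks) batches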
25 | """ 26 | self.image_width, self.image_height = image_size 27 | self.augmentation = augmentation 28 | self.batch_size = batch_size 29 | 30 | assert num_landmarks == 5 31 | self.num_landmarks = num_landmarks 32 | 33 | def get_num_samples(filename): 34 | return sum(1 for _ in tf.python_io.tf_record_iterator(filename)) 35 | 36 | num_examples = 0 37 | for filename in filenames: 38 | num_examples_in_file = get_num_samples(filename) 39 | assert num_examples_in_file > 0 40 | num_examples += num_examples_in_file 41 | self.num_examples = num_examples 42 | assert self.num_examples > 0 43 | 44 | dataset = tf.data.Dataset.from_tensor_slices(filenames) 45 | num_shards = len(filenames) 46 | 47 | if shuffle: 48 | dataset = dataset.shuffle(buffer_size=num_shards) 49 | dataset = dataset.flat_map(tf.data.TFRecordDataset) 50 | dataset = dataset.prefetch(buffer_size=batch_size) 51 | 52 | if shuffle: 53 | dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE) 54 | dataset = dataset.repeat(None if repeat else 1) 55 | dataset = dataset.map(self._parse_and_preprocess, num_parallel_calls=NUM_THREADS) 56 | 57 | dataset = dataset.batch(batch_size) 58 | dataset = dataset.prefetch(buffer_size=1) 59 | 60 | self.dataset = dataset 61 | 62 | def _parse_and_preprocess(self, example_proto): 63 | """What this function does: 64 | 1. Parses one record from a tfrecords file and decodes it. 65 | 2. (optionally) Augments it. 66 | 67 | Returns: 68 | image: a float tensor with shape [image_height, image_width, 3], 69 | an RGB image with pixel values in the range [0, 1]. 70 | landmarks: a float tensor with shape [num_landmarks, 2]. 71 | """ 72 | features = { 73 | 'image': tf.FixedLenFeature([], tf.string), 74 | 'ymin': tf.FixedLenFeature([], tf.float32), 75 | 'xmin': tf.FixedLenFeature([], tf.float32), 76 | 'ymax': tf.FixedLenFeature([], tf.float32), 77 | 'xmax': tf.FixedLenFeature([], tf.float32), 78 | 'landmarks': tf.FixedLenFeature([2 * self.num_landmarks], tf.float32) 79 | } 80 | parsed_features = tf.parse_single_example(example_proto, features) 81 | 82 | # get image 83 | image = tf.image.decode_jpeg(parsed_features['image'], channels=3) 84 | image = tf.image.convert_image_dtype(image, tf.float32) 85 | # now pixel values are scaled to [0, 1] range 86 | 87 | # get face box, it must be in from-zero-to-one format 88 | box = tf.stack([ 89 | parsed_features['ymin'], parsed_features['xmin'], 90 | parsed_features['ymax'], parsed_features['xmax'] 91 | ], axis=0) 92 | box = tf.clip_by_value(tf.to_float(box), clip_value_min=0.0, clip_value_max=1.0) 93 | 94 | # get facial landmarks, they must be in from-zero-to-one format 95 | landmarks = tf.to_float(parsed_features['landmarks']) 96 | landmarks = tf.reshape(landmarks, [self.num_landmarks, 2]) 97 | landmarks = tf.clip_by_value(landmarks, clip_value_min=0.0, clip_value_max=1.0) 98 | # it assumed that landmarks are inside the bounding box (or on the edges) 99 | 100 | if self.augmentation: 101 | image, landmarks = self._augmentation_fn(image, box, landmarks) 102 | else: 103 | image, landmarks = crop(image, landmarks, box) 104 | image = tf.image.resize_images( 105 | image, [self.image_height, self.image_width], 106 | method=RESIZE_METHOD 107 | ) 108 | 109 | return image, landmarks 110 | 111 | def _augmentation_fn(self, image, box, landmarks): 112 | # there are a lot of hyperparameters here, 113 | # you will need to tune them all, haha 114 | image, box, landmarks = random_rotation(image, box, landmarks, max_angle=5) 115 | box = random_box_jitter(box, landmarks, ratio=0.025) 116 | image, 
landmarks = crop(image, landmarks, box) 117 | image = tf.image.resize_images( 118 | image, [self.image_height, self.image_width], 119 | method=RESIZE_METHOD 120 | ) 121 | image = random_color_manipulations(image, probability=0.1, grayscale_probability=0.01) 122 | image = random_pixel_value_scale(image, minval=0.85, maxval=1.15, probability=0.1) 123 | image = random_gaussian_blur(image, probability=0.1, kernel_size=4) 124 | image, landmarks = random_flip_left_right(image, landmarks) 125 | return image, landmarks 126 | 127 | 128 | def crop(image, landmarks, box): 129 | """ 130 | Crops the image to the box. 131 | It also adds some margin. 132 | Finally, it transforms coordinates of the landmarks. 133 | """ 134 | image_h = tf.to_float(tf.shape(image)[0]) 135 | image_w = tf.to_float(tf.shape(image)[1]) 136 | scaler = tf.stack([image_h, image_w, image_h, image_w], axis=0) 137 | box = box * scaler 138 | ymin, xmin, ymax, xmax = tf.unstack(box, axis=0) 139 | 140 | h, w = ymax - ymin, xmax - xmin 141 | margin_y, margin_x = h / 6.0, w / 6.0 # 6.0 here is a hyperparameter 142 | 143 | ymin, xmin = ymin - 0.5 * margin_y, xmin - 0.5 * margin_x 144 | ymax, xmax = ymax + 0.5 * margin_y, xmax + 0.5 * margin_x 145 | ymin = tf.clip_by_value(ymin, 0.0, image_h) 146 | xmin = tf.clip_by_value(xmin, 0.0, image_w) 147 | ymax = tf.clip_by_value(ymax, 0.0, image_h) 148 | xmax = tf.clip_by_value(xmax, 0.0, image_w) 149 | 150 | # for some reason box width or height sometimes becomes zero, 151 | # but it happens very very rarely 152 | h, w = tf.to_int32(ymax - ymin), tf.to_int32(xmax - xmin) 153 | box_is_okay = tf.greater(h * w, 0) 154 | 155 | def do_it(image, landmarks): 156 | image = tf.image.crop_to_bounding_box( 157 | image, tf.to_int32(ymin), tf.to_int32(xmin), h, w 158 | ) 159 | # translate coordinates of the landmarks 160 | shift = tf.stack([ymin/(ymax - ymin), xmin/(xmax - xmin)], axis=0) 161 | scaler = tf.stack([image_h/(ymax - ymin), image_w/(xmax - xmin)], axis=0) 162 | landmarks = (landmarks * scaler) - shift 163 | return image, landmarks 164 | 165 | image, landmarks = tf.cond( 166 | box_is_okay, 167 | lambda: do_it(image, landmarks), 168 | lambda: (image, landmarks) 169 | ) 170 | 171 | landmarks = tf.clip_by_value(landmarks, 0.0, 1.0) 172 | return image, landmarks 173 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import math 3 | 4 | 5 | def wing_loss(landmarks, labels, w=10.0, epsilon=2.0): 6 | """ 7 | Arguments: 8 | landmarks, labels: float tensors with shape [batch_size, num_landmarks, 2]. 9 | w, epsilon: a float numbers. 10 | Returns: 11 | a float tensor with shape []. 12 | """ 13 | with tf.name_scope('wing_loss'): 14 | x = landmarks - labels 15 | c = w * (1.0 - math.log(1.0 + w/epsilon)) 16 | absolute_x = tf.abs(x) 17 | losses = tf.where( 18 | tf.greater(w, absolute_x), 19 | w * tf.log(1.0 + absolute_x/epsilon), 20 | absolute_x - c 21 | ) 22 | loss = tf.reduce_mean(tf.reduce_sum(losses, axis=[1, 2]), axis=0) 23 | return loss 24 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | """ 5 | For evaluation during the training I use NME (normalized mean error). 
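Roughly, for each face:
    NME = mean over the landmarks of ||predicted - ground truth|| / max(inter-ocular distance, 1),
where the distances are measured in pixels (labels and predictions are rescaled in model.py before evaluation).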
6 | You can find more about it here: 7 | https://arxiv.org/abs/1506.03799 (Pose-Invariant 3D Face Alignment) 8 | Note that my version here is slightly different, 9 | so you cannot compare its value with results in papers. 10 | 11 | It is assumed that num_landmarks = 5 12 | and that they are in the following order: 13 | [[lefteye_y lefteye_x] 14 | [righteye_y righteye_x] 15 | [nose_y nose_x] 16 | [leftmouth_y leftmouth_x] 17 | [rightmouth_y rightmouth_x]] 18 | """ 19 | 20 | 21 | def nme_metric_ops(labels, landmarks): 22 | """ 23 | Arguments: 24 | labels, landmarks: float tensors with shape [batch_size, num_landmarks, 2]. 25 | Returns: 26 | two ops, as in the tf.metrics API. 27 | """ 28 | norms = tf.norm(labels - landmarks, axis=2) # shape [batch_size, num_landmarks] 29 | mean_norm = tf.reduce_mean(norms, axis=1) # shape [batch_size] 30 | eye_distance = tf.norm(labels[:, 0, :] - labels[:, 1, :], axis=1) # shape [batch_size] 31 | 32 | values = mean_norm/tf.maximum(eye_distance, 1.0) 33 | mean, update_op = tf.metrics.mean(values) 34 | return mean, update_op 35 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from network import network 3 | from loss import wing_loss 4 | from metrics import nme_metric_ops 5 | 6 | 7 | MOVING_AVERAGE_DECAY = 0.995 8 | 9 | 10 | def model_fn(features, labels, mode, params): 11 | """ 12 | This is a function for creating a tensorflow computational graph. 13 | The function is in the format required by tf.estimator. 14 | """ 15 | 16 | # features are just a tensor of RGB images 17 | 18 | is_training = mode == tf.estimator.ModeKeys.TRAIN 19 | landmarks = network(features, is_training, num_landmarks=params['num_landmarks']) 20 | # landmarks are normalized to [0, 1] range 21 | 22 | if mode == tf.estimator.ModeKeys.PREDICT: 23 | predictions = {'landmarks': landmarks} 24 | export_outputs = tf.estimator.export.PredictOutput({ 25 | name: tf.identity(tensor, name) 26 | for name, tensor in predictions.items() 27 | }) 28 | return tf.estimator.EstimatorSpec( 29 | mode, predictions=predictions, 30 | export_outputs={'outputs': export_outputs} 31 | ) 32 | 33 | with tf.name_scope('rescale'): 34 | w, h = params['image_size'] 35 | scaler = tf.constant([h, w], dtype=tf.float32) 36 | labels = labels * scaler 37 | landmarks = landmarks * scaler 38 | 39 | loss = wing_loss(landmarks, labels, w=params['w'], epsilon=params['epsilon']) 40 | tf.losses.add_loss(loss) 41 | tf.summary.scalar('just_wing_loss', loss) 42 | 43 | # add L2 regularization 44 | with tf.name_scope('weight_decay'): 45 | add_weight_decay(params['weight_decay']) 46 | regularization_loss = tf.losses.get_regularization_loss() 47 | tf.summary.scalar('regularization_loss', regularization_loss) 48 | 49 | total_loss = tf.losses.get_total_loss(add_regularization_losses=True) 50 | 51 | if mode == tf.estimator.ModeKeys.EVAL: 52 | eval_metric_ops = { 53 | 'validation_mae': tf.metrics.mean_absolute_error(labels, landmarks), 54 | 'normalized_mean_error': nme_metric_ops(labels, landmarks) 55 | } 56 | return tf.estimator.EstimatorSpec( 57 | mode, loss=total_loss, 58 | eval_metric_ops=eval_metric_ops 59 | ) 60 | 61 | assert mode == tf.estimator.ModeKeys.TRAIN 62 | 63 | with tf.variable_scope('learning_rate'): 64 | global_step = tf.train.get_global_step() 65 | learning_rate = tf.train.cosine_decay( 66 | params['initial_lr'], global_step, 67 | decay_steps=params['num_steps'] 68 | ) 69 |
tf.summary.scalar('learning_rate', learning_rate) 70 | 71 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 72 | with tf.control_dependencies(update_ops), tf.variable_scope('optimizer'): 73 | optimizer = tf.train.AdamOptimizer(learning_rate) 74 | grads_and_vars = optimizer.compute_gradients(total_loss) 75 | train_op = optimizer.apply_gradients(grads_and_vars, global_step) 76 | 77 | for g, v in grads_and_vars: 78 | tf.summary.histogram(v.name[:-2] + '_hist', v) 79 | tf.summary.histogram(v.name[:-2] + '_grad_hist', g) 80 | 81 | with tf.control_dependencies([train_op]), tf.name_scope('ema'): 82 | ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY, num_updates=global_step) 83 | train_op = ema.apply(tf.trainable_variables()) 84 | 85 | with tf.name_scope('evaluation_ops'): 86 | mae = tf.reduce_mean(tf.abs(labels - landmarks), axis=[0, 1, 2]) 87 | 88 | tf.summary.scalar('train_mae', mae) 89 | return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op) 90 | 91 | 92 | def add_weight_decay(weight_decay): 93 | """Add L2 regularization to all (or some) trainable kernel weights.""" 94 | weight_decay = tf.constant( 95 | weight_decay, tf.float32, 96 | [], 'weight_decay' 97 | ) 98 | trainable_vars = tf.trainable_variables() 99 | kernels = [v for v in trainable_vars if 'weights' in v.name] 100 | for K in kernels: 101 | x = tf.multiply(weight_decay, tf.nn.l2_loss(K)) 102 | tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, x) 103 | 104 | 105 | class RestoreMovingAverageHook(tf.train.SessionRunHook): 106 | def __init__(self, model_dir): 107 | super(RestoreMovingAverageHook, self).__init__() 108 | self.model_dir = model_dir 109 | 110 | def begin(self): 111 | ema = tf.train.ExponentialMovingAverage(decay=MOVING_AVERAGE_DECAY) 112 | variables_to_restore = ema.variables_to_restore() 113 | self.load_ema = tf.contrib.framework.assign_from_checkpoint_fn( 114 | tf.train.latest_checkpoint(self.model_dir), variables_to_restore 115 | ) 116 | 117 | def after_create_session(self, sess, coord): 118 | tf.logging.info('Loading EMA weights...') 119 | self.load_ema(sess) 120 | -------------------------------------------------------------------------------- /network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.slim as slim 3 | 4 | 5 | BATCH_NORM_MOMENTUM = 0.91 6 | BATCH_NORM_EPSILON = 1e-3 7 | 8 | 9 | def network(images, is_training, num_landmarks): 10 | """ 11 | Arguments: 12 | images: a float tensor with shape [batch_size, height, width, 3], 13 | a batch of RGB images with pixel values in the range [0, 1]. 14 | is_training: a boolean. 15 | num_landmarks: an integer. 16 | Returns: 17 | a float tensor with shape [batch_size, num_landmarks, 2].
18 | """ 19 | 20 | def batch_norm(x): 21 | x = tf.layers.batch_normalization( 22 | x, axis=3, center=True, scale=True, 23 | momentum=BATCH_NORM_MOMENTUM, 24 | epsilon=BATCH_NORM_EPSILON, 25 | training=is_training, fused=True, 26 | name='batch_norm' 27 | ) 28 | return x 29 | 30 | with tf.name_scope('standardize_input'): 31 | x = (2.0 * images) - 1.0 32 | 33 | with tf.variable_scope('network'): 34 | params = { 35 | 'padding': 'SAME', 36 | 'activation_fn': tf.nn.relu, 37 | 'normalizer_fn': batch_norm, 38 | 'data_format': 'NHWC' 39 | } 40 | with slim.arg_scope([slim.conv2d], **params): 41 | with slim.arg_scope([slim.max_pool2d], stride=2, padding='SAME', data_format='NHWC'): 42 | 43 | num_filters = [32, 64, 128, 256, 512] 44 | for i, f in enumerate(num_filters, 1): 45 | x = slim.conv2d(x, f, (3, 3), stride=1, scope='conv%d' % i) 46 | x = slim.max_pool2d(x, (2, 2), scope='pool%d' % i) 47 | 48 | x = flatten(x) 49 | x = slim.fully_connected( 50 | x, 1024, activation_fn=tf.nn.relu, 51 | normalizer_fn=None, scope='fc1' 52 | ) 53 | x = slim.fully_connected( 54 | x, 2 * num_landmarks, activation_fn=None, 55 | normalizer_fn=None, scope='fc2', 56 | biases_initializer=tf.constant_initializer(0.5), 57 | ) 58 | batch_size = tf.shape(x)[0] 59 | x = tf.reshape(x, [batch_size, num_landmarks, 2]) 60 | return x 61 | 62 | 63 | def flatten(x): 64 | with tf.name_scope('flatten'): 65 | batch_size = tf.shape(x)[0] 66 | height, width, channels = x.shape.as_list()[1:] 67 | x = tf.reshape(x, [batch_size, channels * height * width]) 68 | return x 69 | 70 | 71 | def prelu(x): 72 | """It is not used here.""" 73 | with tf.variable_scope('prelu'): 74 | in_channels = x.shape[3].value 75 | alpha = tf.get_variable( 76 | 'alpha', [in_channels], 77 | initializer=tf.constant_initializer(0.1), 78 | dtype=tf.float32 79 | ) 80 | return tf.nn.relu(x) - alpha * tf.nn.relu(-x) 81 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import json 3 | import os 4 | from model import model_fn, RestoreMovingAverageHook 5 | from input_pipeline import Pipeline 6 | tf.logging.set_verbosity('INFO') 7 | 8 | 9 | CONFIG = 'config.json' 10 | GPU_TO_USE = '0' 11 | params = json.load(open(CONFIG)) 12 | 13 | 14 | def get_input_fn(is_training=True): 15 | 16 | image_size = params['image_size'] 17 | data_dir = params['train_dataset'] if is_training else params['val_dataset'] 18 | batch_size = params['batch_size'] 19 | num_landmarks = params['num_landmarks'] 20 | 21 | filenames = os.listdir(data_dir) 22 | filenames = [n for n in filenames if n.endswith('.tfrecords')] 23 | filenames = sorted(filenames) 24 | filenames = [os.path.join(data_dir, n) for n in filenames] 25 | 26 | def input_fn(): 27 | with tf.device('/cpu:0'), tf.name_scope('input_pipeline'): 28 | pipeline = Pipeline( 29 | filenames, batch_size=batch_size, image_size=image_size, num_landmarks=num_landmarks, 30 | repeat=is_training, shuffle=is_training, augmentation=is_training, 31 | ) 32 | return pipeline.dataset 33 | return input_fn 34 | 35 | 36 | config = tf.ConfigProto(allow_soft_placement=True) 37 | config.gpu_options.visible_device_list = GPU_TO_USE 38 | 39 | run_config = tf.estimator.RunConfig() 40 | run_config = run_config.replace( 41 | model_dir=params['model_dir'], 42 | session_config=config, 43 | save_summary_steps=2000, 44 | save_checkpoints_secs=1800, 45 | log_step_count_steps=1000 46 | ) 47 | 48 | train_input_fn = 
get_input_fn(is_training=True) 49 | val_input_fn = get_input_fn(is_training=False) 50 | 51 | estimator = tf.estimator.Estimator(model_fn, params=params, config=run_config) 52 | train_spec = tf.estimator.TrainSpec( 53 | train_input_fn, max_steps=params['num_steps'] 54 | ) 55 | eval_spec = tf.estimator.EvalSpec( 56 | val_input_fn, steps=None, hooks=[RestoreMovingAverageHook(params['model_dir'])], 57 | start_delay_secs=3600, throttle_secs=3600 58 | ) 59 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) 60 | --------------------------------------------------------------------------------
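A quick sanity check of the piecewise formula in loss.py can be done outside of TensorFlow. The sketch below is my own illustration and not a file in this repository (the name wing_loss_np is made up); it evaluates the same expression element-wise with NumPy and shows how, compared with plain L1, small errors are amplified while large errors still grow only linearly:

import numpy as np

def wing_loss_np(x, w=10.0, epsilon=2.0):
    # same piecewise definition as in loss.py:
    # log region for |x| < w, shifted L1 region otherwise
    c = w * (1.0 - np.log(1.0 + w / epsilon))
    absolute_x = np.abs(x)
    return np.where(absolute_x < w, w * np.log(1.0 + absolute_x / epsilon), absolute_x - c)

errors = np.array([0.1, 1.0, 5.0, 20.0])
print(wing_loss_np(errors))  # approximately [0.49, 4.05, 12.53, 27.92]
print(np.abs(errors))        # plain L1 for comparison: [0.1, 1.0, 5.0, 20.0]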