├── README.md
└── prepare.py


/README.md:
--------------------------------------------------------------------------------
1 | Scripts to prepare OXFORD VGG Face dataset
2 | ====================
3 | 
4 | Things I used to download and prepare the dataset for experiments:
5 |  * [aria2](https://aria2.github.io/) download utility. You can download all prepared links from [here](https://yadi.sk/d/c8gR5nIEqZteV). To download the dataset, simply type `aria2c -i vgg-faces-aria.txt --deferred-input -j 60`. It took approximately 1.5 days and 100GB of space.
6 |  * Python 2.7.11 with `pillow` module for image processing. You can crop all downloaded images with `python2 prepare.py` command.
7 | 
8 | Check dataset's [page](http://www.robots.ox.ac.uk/~vgg/data/vgg_face/) for more information.
9 | 


--------------------------------------------------------------------------------
/prepare.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python2
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from __future__ import print_function, division
 5 | 
 6 | import os
 7 | import shutil
 8 | import imghdr
 9 | from PIL import Image
10 | 
11 | 
def clean_subdir(save_path, cur_path):
    """Salvage one file from a directory chain, then delete the chain.

    Starting at ``cur_path``, the first listed subdirectory is followed at
    every level until a directory without subdirectories is reached. If that
    directory lists any files, the first one is handed to ``clean_file``
    together with ``save_path``. Every directory visited on the way down is
    then removed, deepest first.

    Returns None.
    """
    visited = []
    current = cur_path
    while True:
        visited.append(current)
        _, dirnames, filenames = next(os.walk(current))
        if not dirnames:
            break
        # Only the first subdirectory is ever explored.
        current = os.path.join(current, dirnames[0])

    # ``filenames`` belongs to the deepest directory reached above.
    if filenames:
        clean_file(save_path, os.path.join(current, filenames[0]))

    for directory in reversed(visited):
        shutil.rmtree(directory)
21 | 
22 | 
def clean_file(save_path, cur_path):
    """Keep ``cur_path`` only if it is a recognised image.

    The file type is detected with ``imghdr.what``. A recognised file is
    moved to ``save_path`` plus a dot and the detected type name; anything
    else is deleted. Both paths are printed first.

    Returns None.
    """
    print(cur_path, save_path)
    image_kind = imghdr.what(cur_path)
    if image_kind is not None:
        destination = save_path + '.' + image_kind
        shutil.move(cur_path, destination)
    else:
        # Not a recognised image: drop it.
        os.remove(cur_path)
32 | 
33 | 
def get_clean_name(name):
    """Return the part of ``name`` that precedes its first ``'.'``.

    A name that contains no dot is returned unchanged (previously this
    raised ``ValueError``), so entries without an extension no longer abort
    the caller.
    """
    return name.partition('.')[0]
36 | 
37 | 
def walk_and_clean(root_dir='./'):
    """Normalise the contents of every directory directly under ``root_dir``.

    For each such directory, in this order:

    1. every subdirectory is renamed to its clean name (``get_clean_name``);
    2. every renamed subdirectory is passed to ``clean_subdir`` with the
       same path as both the save path and the directory to clean;
    3. every file that was listed *before* steps 1-2 is passed to
       ``clean_file`` with its clean name as the save path.

    Explicit loops are used instead of ``map`` so the work is actually
    performed on Python 3, where ``map`` is lazy.

    Returns None.
    """
    names = next(os.walk(root_dir))[1]
    print(names)
    for name in names:
        cur_path = os.path.join(root_dir, name)
        # Snapshot taken once: files produced by clean_subdir below are not
        # part of ``files`` and are therefore not processed again.
        _, subdirs, files = next(os.walk(cur_path))

        clean_subdirs = [get_clean_name(subdir) for subdir in subdirs]
        for clean, original in zip(clean_subdirs, subdirs):
            os.rename(os.path.join(cur_path, original),
                      os.path.join(cur_path, clean))
        for clean in clean_subdirs:
            target = os.path.join(cur_path, clean)
            clean_subdir(target, target)

        clean_files = [get_clean_name(filename) for filename in files]
        for clean, original in zip(clean_files, files):
            clean_file(os.path.join(cur_path, clean),
                       os.path.join(cur_path, original))
49 | 
50 | 
def map_names(filenames):
    """Index file names by the integer value of their clean name.

    Returns a dict mapping ``int(get_clean_name(name))`` to ``name`` for
    every entry of ``filenames``; when two names yield the same integer the
    later one wins.
    """
    return {int(get_clean_name(filename)): filename for filename in filenames}
56 | 
57 | 
def crop_from_file(filename, images_dir='./', save_dir='./cropped'):
    """Crop one person's images using the boxes listed in an annotation file.

    The person name is the base name of ``filename`` without its extension.
    Images are read from ``images_dir/<person>`` and each crop is written
    under the same file name to ``save_dir/<person>``, which is created if
    it does not exist.

    Only lines that split into exactly 10 space-separated fields are used.
    Field 0 is parsed as the integer image index; fields 3-6 are parsed as
    floats, truncated to ints and passed to ``Image.crop`` as the box. Lines
    whose index has no matching image file are skipped.

    Returns None.
    """
    # os.path handles base names without a dot, which the former
    # rfind-based slicing turned into an empty or truncated name.
    person = os.path.splitext(os.path.basename(filename))[0]
    print(person)
    person_images = os.path.join(images_dir, person)
    person_crops = os.path.join(save_dir, person)
    if not os.path.exists(person_crops):
        os.makedirs(person_crops)
    img_map = map_names(next(os.walk(person_images))[2])
    with open(filename, 'r') as f:
        for line in f:
            # A real list (not a lazy ``map`` object) so that ``len`` and
            # slicing also work on Python 3.
            arguments = [field.strip() for field in line.split(' ')]
            if len(arguments) != 10:
                continue
            img_idx = int(arguments[0])
            # NOTE(review): the box is taken from fields 3..6; confirm this
            # matches the column layout of the annotation files.
            bbox = tuple(int(float(value)) for value in arguments[3:7])
            if img_idx not in img_map:
                continue
            img_name = img_map[img_idx]
            # Context manager closes the source image file after each crop.
            with Image.open(os.path.join(person_images, img_name)) as img:
                img.crop(bbox).save(os.path.join(person_crops, img_name))
    return None
76 | 
77 | 
def walk_and_crop(instr_dir='./', images_dir='./images', save_dir='./cropped'):
    """Run ``crop_from_file`` for every file directly inside ``instr_dir``.

    ``images_dir`` and ``save_dir`` are forwarded unchanged. A plain loop is
    used instead of ``map`` so the calls are actually made on Python 3,
    where ``map`` is lazy.

    Returns None.
    """
    for instr_name in next(os.walk(instr_dir))[2]:
        crop_from_file(os.path.join(instr_dir, instr_name), images_dir, save_dir)
    return None
81 | 
82 | 
if __name__ == '__main__':
    # Step 1: normalise the downloaded files under ./out.
    walk_and_clean('./out')
    # Guarded so that re-running the script does not fail when ./cropped
    # already exists from a previous run.
    if not os.path.exists('./cropped'):
        os.makedirs('./cropped')
    # Step 2: crop using the annotation files in ./files.
    walk_and_crop('./files', './out', './cropped')
87 | 


--------------------------------------------------------------------------------