├── png └── 0.png ├── README.md └── make_HQ_images.py /png/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mazzzystar/make-CelebA-HQ/HEAD/png/0.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # make-CelebA-HQ 2 | Python script to download and create the celebA-HQ dataset. 3 | 4 | To get the celebA-HQ dataset, you need to 5 | 6 | a) download the celebA dataset. [Google Drive](https://drive.google.com/drive/folders/0B7EVK8r0v71pWEZsZE9oNnFzTm8) or [Baidu Drive](https://pan.baidu.com/s/1eSNpdRG#list/path=%2F). 7 | 8 | b) download celebA-HQ dataset. [Google Drive](https://drive.google.com/drive/folders/0B4qLcYyJmiz0TXY1NG02bzZVRGs) 9 | 10 | c) do some processing to get the HQ images `make_HQ_images.py`. 11 | 12 | 13 | # Usage 14 | 1.Clone the repository 15 | ``` 16 | git clone https://github.com/mazzzystar/make-CelebA-HQ.git 17 | cd make-CelebA-HQ 18 | ``` 19 | 20 | 2.Install necessary packages 21 | * Install miniconda https://conda.io/miniconda.html 22 | * Create a new environement 23 | ``` 24 | conda create -n celebaHQ python=3 25 | source activate celebaHQ 26 | ``` 27 | * Install the packages 28 | ``` 29 | conda install jpeg tqdm requests pillow urllib3 numpy cryptography scipy 30 | pip install opencv-python==3.4.0.12 cryptography==2.1.4 31 | ``` 32 | * Install 7zip (On Ubuntu) 33 | ``` 34 | sudo apt-get install p7zip-full 35 | ``` 36 | 37 | 3.Download and Unzip CelebA & CelebA-HQ 38 | 39 | Downloading CelebA: [Large-scale CelebFaces Attributes (CelebA) Dataset](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) 40 | 41 | Downloading CelebA-HQ. Google Drive: https://drive.google.com/drive/folders/0B4qLcYyJmiz0TXY1NG02bzZVRGs 42 | ```bash 43 | # there are 14 file inside. (img_celeba.7z.001, ... img_celeba.7z.014) 44 | cd CelebA/Img/img_celeba.7z 45 | # merge .7z files into a single file. 46 | cat img_celeba.7z.0** > img_celeba.7z 47 | # unzip the .7z file 48 | 7z x img_celeba.7z 49 | # move the unzipped folder 'img_celeba' under 'Img/' path 50 | mv img_celeba ../ 51 | 52 | 53 | cd CelebA-HQ/ 54 | # unzip the zips of deltas00000.zip, deltas00001.zip, .. into one file 55 | unzip '*.zip' -d combined 56 | # move the "image_list.txt" to current github repo folder. 57 | ``` 58 | 59 | 4.Make sure your datasets/landmark/scripts under current Github repo folder. 60 | ``` 61 | make-CelebA-HQ---CelebA 62 | | |___Anno 63 | | | |___list_landmarks_celeba.txt 64 | | | |___... 65 | | | 66 | | |___Img 67 | | |___ img_celeba/ 68 | | 69 | |_CelebA-HQ 70 | | | 71 | | |_Combined/ 72 | | 73 | |-make_HQ_images.py 74 | |-image_list.txt 75 | |_README.md 76 | 77 | ``` 78 | 79 | 5.Run the scripts 80 | ``` 81 | python make_HQ_images.py ./ 82 | 83 | ``` 84 | where `./` is the directory where you wish the data to be saved. this script will take a few hours to run depending on your internet connection and your CPU power. The final HQ images will be saved as `.npy` files in the `./celebA-HQ/combined/` folder. 85 | 86 | 6.Sample of loading image 87 | ```python 88 | import numpy as np 89 | from PIL import Image 90 | 91 | img = np.load('imgHQ15858.npy') 92 | img = img.reshape(3, 1024, 1024) 93 | print(img.shape) 94 | img = np.transpose(img, (1, 2, 0)) 95 | print(img.shape) 96 | 97 | im = Image.fromarray(img) 98 | im.show() 99 | ``` 100 | ![](png/0.png) 101 | # Sources 102 | This code is inspired by these files 103 | * https://github.com/nperraud/download-celebA-HQ 104 | 105 | # Note 106 | The code above use `jpeg=8d` version of `md5` for checking hash value, however I could get the expected value for `jpeg=9b` version, so I simply ignore the md5 checking. Tell me if you think somewhere wrong. 107 | -------------------------------------------------------------------------------- /make_HQ_images.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import PIL 4 | import hashlib 5 | import numpy as np 6 | from PIL import Image 7 | import cryptography.hazmat.primitives.hashes 8 | import cryptography.hazmat.backends 9 | import cryptography.hazmat.primitives.kdf.pbkdf2 10 | import cryptography.fernet 11 | import base64 12 | import bz2 13 | import scipy 14 | from scipy import ndimage 15 | import multiprocessing as mp 16 | import argparse 17 | 18 | # This code is inspired by 19 | # https://github.com/nperraud/download-celebA-HQ/blob/master/make_HQ_images.py 20 | parser = argparse.ArgumentParser(description='Download celebA-HQ helper') 21 | parser.add_argument('path', type=str) 22 | 23 | args = parser.parse_args() 24 | dirpath = args.path 25 | 26 | dataset_dir = os.path.join(dirpath, 'CelebA') 27 | expected_images = 202599 28 | delta_dir = os.path.join(dirpath, 'CelebA-HQ/combined') 29 | 30 | celeba_dir = os.path.join(dataset_dir, 'Img/img_celeba') 31 | print('Loading CelebA from {}'.format(celeba_dir)) 32 | if len(glob.glob(os.path.join(celeba_dir, '*.jpg'))) != expected_images: 33 | raise ValueError('Expected to find {} images'.format(expected_images)) 34 | 35 | with open(os.path.join(dataset_dir, 'Anno', 'list_landmarks_celeba.txt'), 'rt') as file: 36 | landmarks = [[float(value) for value in line.split()[1:]] for line in file.readlines()[2:]] 37 | landmarks = np.float32(landmarks).reshape(-1, 5, 2) 38 | 39 | print('Loading CelebA-HQ deltas from {}'.format(delta_dir)) 40 | expected_dat = 30000 41 | 42 | if len(glob.glob(os.path.join(delta_dir, '*.dat'))) != expected_dat: 43 | raise ValueError('Expected to find {} dat files'.format(expected_dat)) 44 | 45 | with open(os.path.join('image_list.txt'), 'rt') as file: 46 | lines = [line.split() for line in file] 47 | fields = dict() 48 | for idx, field in enumerate(lines[0]): 49 | typef = int if field.endswith('idx') else str 50 | fields[field] = [typef(line[idx]) for line in lines[1:]] 51 | 52 | indices = np.array(fields['idx']) 53 | 54 | """ 55 | # Must use pillow version 3.1.1 for everything to work correctly. 56 | if getattr(PIL, 'PILLOW_VERSION', '') != '3.1.1': 57 | raise ValueError('create_celebahq requires pillow version 3.1.1') # conda install pillow=3.1.1 58 | """ 59 | 60 | # Must use libjpeg version 8d for everything to work correctly. 61 | """ 62 | img = np.array(PIL.Image.open(os.path.join(celeba_dir, '000001.jpg'))) 63 | md5 = hashlib.md5() 64 | md5.update(img.tobytes()) 65 | if md5.hexdigest() != '9cad8178d6cb0196b36f7b34bc5eb6d3': 66 | raise ValueError('create_celebahq requires libjpeg version 8d') # conda install jpeg=8d 67 | """ 68 | 69 | def rot90(v): 70 | return np.array([-v[1], v[0]]) 71 | 72 | 73 | def process_func(idx): 74 | # Load original image. 75 | orig_idx = fields['orig_idx'][idx] 76 | orig_file = fields['orig_file'][idx] 77 | orig_path = os.path.join(celeba_dir, orig_file) 78 | img = PIL.Image.open(orig_path) 79 | # Choose oriented crop rectangle. 80 | lm = landmarks[orig_idx] 81 | eye_avg = (lm[0] + lm[1]) * 0.5 + 0.5 82 | mouth_avg = (lm[3] + lm[4]) * 0.5 + 0.5 83 | eye_to_eye = lm[1] - lm[0] 84 | eye_to_mouth = mouth_avg - eye_avg 85 | x = eye_to_eye - rot90(eye_to_mouth) 86 | x /= np.hypot(*x) 87 | x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8) 88 | y = rot90(x) 89 | c = eye_avg + eye_to_mouth * 0.1 90 | quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) 91 | zoom = 1024 / (np.hypot(*x) * 2) 92 | # Shrink. 93 | shrink = int(np.floor(0.5 / zoom)) 94 | if shrink > 1: 95 | size = (int(np.round(float(img.size[0]) / shrink)), int(np.round(float(img.size[1]) / shrink))) 96 | img = img.resize(size, PIL.Image.ANTIALIAS) 97 | quad /= shrink 98 | zoom *= shrink 99 | # Crop. 100 | border = max(int(np.round(1024 * 0.1 / zoom)), 3) 101 | crop = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1])))) 102 | crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]), min(crop[3] + border, img.size[1])) 103 | if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]: 104 | img = img.crop(crop) 105 | quad -= crop[0:2] 106 | # Simulate super-resolution. 107 | superres = int(np.exp2(np.ceil(np.log2(zoom)))) 108 | if superres > 1: 109 | img = img.resize((img.size[0] * superres, img.size[1] * superres), PIL.Image.ANTIALIAS) 110 | quad *= superres 111 | zoom /= superres 112 | # Pad. 113 | pad = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1])))) 114 | pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0), max(pad[3] - img.size[1] + border, 0)) 115 | if max(pad) > border - 4: 116 | pad = np.maximum(pad, int(np.round(1024 * 0.3 / zoom))) 117 | img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect') 118 | h, w, _ = img.shape 119 | y, x, _ = np.mgrid[:h, :w, :1] 120 | mask = 1.0 - np.minimum(np.minimum(np.float32(x) / pad[0], np.float32(y) / pad[1]), np.minimum(np.float32(w-1-x) / pad[2], np.float32(h-1-y) / pad[3])) 121 | blur = 1024 * 0.02 / zoom 122 | img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) 123 | img += (np.median(img, axis=(0,1)) - img) * np.clip(mask, 0.0, 1.0) 124 | img = PIL.Image.fromarray(np.uint8(np.clip(np.round(img), 0, 255)), 'RGB') 125 | quad += pad[0:2] 126 | # Transform. 127 | img = img.transform((4096, 4096), PIL.Image.QUAD, (quad + 0.5).flatten(), PIL.Image.BILINEAR) 128 | img = img.resize((1024, 1024), PIL.Image.ANTIALIAS) 129 | img = np.asarray(img).transpose(2, 0, 1) 130 | # Verify MD5. 131 | """ 132 | md5 = hashlib.md5() 133 | md5.update(img.tobytes()) 134 | assert md5.hexdigest() == fields['proc_md5'][idx] 135 | """ 136 | # # Load delta image and original JPG. 137 | # with zipfile.ZipFile(os.path.join(delta_dir, 'deltas%05d.zip' % (idx - idx % 1000)), 'r') as zip: 138 | # delta_bytes = zip.read('delta%05d.dat' % idx) 139 | with open(os.path.join(delta_dir,'delta%05d.dat' % idx), 'rb') as file: 140 | delta_bytes = file.read() 141 | with open(orig_path, 'rb') as file: 142 | orig_bytes = file.read() 143 | # Decrypt delta image, using original JPG data as decryption key. 144 | algorithm = cryptography.hazmat.primitives.hashes.SHA256() 145 | backend = cryptography.hazmat.backends.default_backend() 146 | salt = bytes(orig_file, 'ascii') 147 | kdf = cryptography.hazmat.primitives.kdf.pbkdf2.PBKDF2HMAC(algorithm=algorithm, length=32, salt=salt, iterations=100000, backend=backend) 148 | key = base64.urlsafe_b64encode(kdf.derive(orig_bytes)) 149 | delta = np.frombuffer(bz2.decompress(cryptography.fernet.Fernet(key).decrypt(delta_bytes)), dtype=np.uint8).reshape(3, 1024, 1024) 150 | # Apply delta image. 151 | img = img + delta 152 | # Verify MD5. 153 | """ 154 | md5 = hashlib.md5() 155 | md5.update(img.tobytes()) 156 | assert md5.hexdigest() == fields['final_md5'][idx] 157 | """ 158 | return img 159 | 160 | 161 | def do_the_work(img_num): 162 | print('Create image number {}'.format(img_num)) 163 | img = process_func(img_num) 164 | np.save(os.path.join(delta_dir, 'imgHQ%05d' % img_num), [img]) 165 | 166 | # for img_num in range(expected_dat): 167 | # do_the_work(img_num) 168 | 169 | num_workers = mp.cpu_count() - 1 170 | print('Starting a pool with {} workers'.format(num_workers)) 171 | with mp.Pool(processes=num_workers) as pool: 172 | pool.map(do_the_work, list(range(expected_dat))) 173 | if len(glob.glob(os.path.join(delta_dir, '*.npy'))) != 30000: 174 | raise ValueError('Expected to find {} npy files\n Something went wrong!'.format(30000)) 175 | # Remove the dat files 176 | for filepath in glob.glob(os.path.join(delta_dir, '*.dat')): 177 | os.remove(filepath) 178 | print('All done! Congratulations!') --------------------------------------------------------------------------------