├── png
    └── 0.png
├── README.md
└── make_HQ_images.py


/png/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mazzzystar/make-CelebA-HQ/HEAD/png/0.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # make-CelebA-HQ
  2 | Python script to download and create the celebA-HQ dataset.
  3 | 
  4 | To get the celebA-HQ dataset, you need to 
  5 | 
  6 |  a) download the celebA dataset. [Google Drive](https://drive.google.com/drive/folders/0B7EVK8r0v71pWEZsZE9oNnFzTm8) or [Baidu Drive](https://pan.baidu.com/s/1eSNpdRG#list/path=%2F).
  7 |  
  8 |  b) download celebA-HQ dataset. [Google Drive](https://drive.google.com/drive/folders/0B4qLcYyJmiz0TXY1NG02bzZVRGs)
  9 |  
 10 |  c) do some processing to get the HQ images `make_HQ_images.py`.
 11 | 
 12 | 
 13 | # Usage
 14 | 1.Clone the repository
 15 | ```
 16 | git clone https://github.com/mazzzystar/make-CelebA-HQ.git
 17 | cd make-CelebA-HQ
 18 | ```
 19 | 
 20 | 2.Install necessary packages
 21 |  * Install miniconda https://conda.io/miniconda.html
 22 |  * Create a new environement
 23 |  ```
 24 |  conda create -n celebaHQ python=3
 25 |  source activate celebaHQ
 26 |  ```
 27 |  * Install the packages
 28 |  ```
 29 |  conda install jpeg tqdm requests pillow urllib3 numpy cryptography scipy
 30 |  pip install opencv-python==3.4.0.12 cryptography==2.1.4
 31 |  ```
 32 |  * Install 7zip (On Ubuntu)
 33 |  ```
 34 |  sudo apt-get install p7zip-full
 35 |  ```
 36 | 
 37 | 3.Download and Unzip CelebA & CelebA-HQ
 38 | 
 39 | Downloading CelebA: [Large-scale CelebFaces Attributes (CelebA) Dataset](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html)
 40 | 
 41 | Downloading CelebA-HQ. Google Drive: https://drive.google.com/drive/folders/0B4qLcYyJmiz0TXY1NG02bzZVRGs
 42 | ```bash
 43 | # there are 14 file inside. (img_celeba.7z.001, ... img_celeba.7z.014)
 44 | cd CelebA/Img/img_celeba.7z  
 45 | # merge .7z files into a single file.
 46 | cat img_celeba.7z.0** > img_celeba.7z
 47 | # unzip the .7z file
 48 | 7z x img_celeba.7z
 49 | # move the unzipped folder 'img_celeba' under 'Img/' path
 50 | mv img_celeba ../
 51 | 
 52 | 
 53 | cd CelebA-HQ/
 54 | # unzip the zips of deltas00000.zip, deltas00001.zip, .. into one file
 55 | unzip '*.zip' -d combined
 56 | # move the "image_list.txt" to current github repo folder.
 57 | ```
 58 | 
 59 | 4.Make sure your datasets/landmark/scripts under current Github repo folder.
 60 | ```
 61 | make-CelebA-HQ---CelebA
 62 |                |    |___Anno
 63 |                |    |     |___list_landmarks_celeba.txt
 64 |                |    |     |___...
 65 |                |    |     
 66 |                |    |___Img
 67 |                |         |___ img_celeba/
 68 |                |
 69 |                |_CelebA-HQ
 70 |                |    |
 71 |                |    |_Combined/
 72 |                |  
 73 |                |-make_HQ_images.py
 74 |                |-image_list.txt  
 75 |                |_README.md 
 76 |   
 77 | ```
 78 | 
 79 | 5.Run the scripts
 80 | ```
 81 | python make_HQ_images.py ./
 82 | 
 83 | ```
 84 | where `./` is the directory where you wish the data to be saved. this script will take a few hours to run depending on your internet connection and your CPU power. The final HQ images will be saved as `.npy` files in the `./celebA-HQ/combined/` folder.
 85 | 
 86 | 6.Sample of loading image
 87 | ```python
 88 | import numpy as np
 89 | from PIL import Image
 90 | 
 91 | img = np.load('imgHQ15858.npy')
 92 | img = img.reshape(3, 1024, 1024)
 93 | print(img.shape)
 94 | img = np.transpose(img, (1, 2, 0))
 95 | print(img.shape)
 96 | 
 97 | im = Image.fromarray(img)
 98 | im.show()
 99 | ```
100 | ![](png/0.png)
101 | # Sources
102 | This code is inspired by these files
103 | * https://github.com/nperraud/download-celebA-HQ
104 | 
105 | # Note
106 | The code above use `jpeg=8d` version of `md5` for checking hash value, however I could get the expected value for `jpeg=9b` version, so I simply ignore the md5 checking. Tell me if you think somewhere wrong.
107 | 


--------------------------------------------------------------------------------
/make_HQ_images.py:
--------------------------------------------------------------------------------
  1 | import glob
  2 | import os
  3 | import PIL
  4 | import hashlib
  5 | import numpy as np
  6 | from PIL import Image
  7 | import cryptography.hazmat.primitives.hashes
  8 | import cryptography.hazmat.backends
  9 | import cryptography.hazmat.primitives.kdf.pbkdf2
 10 | import cryptography.fernet
 11 | import base64
 12 | import bz2
 13 | import scipy
 14 | from scipy import ndimage
 15 | import multiprocessing as mp
 16 | import argparse
 17 | 
 18 | # This code is inspired by
 19 | # https://github.com/nperraud/download-celebA-HQ/blob/master/make_HQ_images.py
 20 | parser = argparse.ArgumentParser(description='Download celebA-HQ helper')
 21 | parser.add_argument('path', type=str)
 22 | 
 23 | args = parser.parse_args()
 24 | dirpath = args.path
 25 | 
 26 | dataset_dir = os.path.join(dirpath, 'CelebA')
 27 | expected_images = 202599
 28 | delta_dir = os.path.join(dirpath, 'CelebA-HQ/combined')
 29 | 
 30 | celeba_dir = os.path.join(dataset_dir, 'Img/img_celeba')
 31 | print('Loading CelebA from {}'.format(celeba_dir))
 32 | if len(glob.glob(os.path.join(celeba_dir, '*.jpg'))) != expected_images:
 33 |     raise ValueError('Expected to find {} images'.format(expected_images))
 34 | 
 35 | with open(os.path.join(dataset_dir, 'Anno', 'list_landmarks_celeba.txt'), 'rt') as file:
 36 |     landmarks = [[float(value) for value in line.split()[1:]] for line in file.readlines()[2:]]
 37 |     landmarks = np.float32(landmarks).reshape(-1, 5, 2)
 38 | 
 39 | print('Loading CelebA-HQ deltas from {}'.format(delta_dir))
 40 | expected_dat = 30000
 41 | 
 42 | if len(glob.glob(os.path.join(delta_dir, '*.dat'))) != expected_dat:
 43 |     raise ValueError('Expected to find {} dat files'.format(expected_dat))
 44 | 
 45 | with open(os.path.join('image_list.txt'), 'rt') as file:
 46 |     lines = [line.split() for line in file]
 47 |     fields = dict()
 48 |     for idx, field in enumerate(lines[0]):
 49 |         typef = int if field.endswith('idx') else str
 50 |         fields[field] = [typef(line[idx]) for line in lines[1:]]
 51 | 
 52 | indices = np.array(fields['idx'])
 53 | 
 54 | """
 55 | # Must use pillow version 3.1.1 for everything to work correctly.
 56 | if getattr(PIL, 'PILLOW_VERSION', '') != '3.1.1':
 57 |     raise ValueError('create_celebahq requires pillow version 3.1.1') # conda install pillow=3.1.1
 58 | """
 59 | 
 60 | # Must use libjpeg version 8d for everything to work correctly.
 61 | """
 62 | img = np.array(PIL.Image.open(os.path.join(celeba_dir, '000001.jpg')))
 63 | md5 = hashlib.md5()
 64 | md5.update(img.tobytes())
 65 | if md5.hexdigest() != '9cad8178d6cb0196b36f7b34bc5eb6d3':
 66 |     raise ValueError('create_celebahq requires libjpeg version 8d') # conda install jpeg=8d
 67 | """
 68 | 
 69 | def rot90(v):
 70 |     return np.array([-v[1], v[0]])
 71 | 
 72 | 
 73 | def process_func(idx):
 74 |     # Load original image.
 75 |     orig_idx = fields['orig_idx'][idx]
 76 |     orig_file = fields['orig_file'][idx]
 77 |     orig_path = os.path.join(celeba_dir, orig_file)
 78 |     img = PIL.Image.open(orig_path)
 79 |     # Choose oriented crop rectangle.
 80 |     lm = landmarks[orig_idx]
 81 |     eye_avg = (lm[0] + lm[1]) * 0.5 + 0.5
 82 |     mouth_avg = (lm[3] + lm[4]) * 0.5 + 0.5
 83 |     eye_to_eye = lm[1] - lm[0]
 84 |     eye_to_mouth = mouth_avg - eye_avg
 85 |     x = eye_to_eye - rot90(eye_to_mouth)
 86 |     x /= np.hypot(*x)
 87 |     x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
 88 |     y = rot90(x)
 89 |     c = eye_avg + eye_to_mouth * 0.1
 90 |     quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
 91 |     zoom = 1024 / (np.hypot(*x) * 2)
 92 |     # Shrink.
 93 |     shrink = int(np.floor(0.5 / zoom))
 94 |     if shrink > 1:
 95 |         size = (int(np.round(float(img.size[0]) / shrink)), int(np.round(float(img.size[1]) / shrink)))
 96 |         img = img.resize(size, PIL.Image.ANTIALIAS)
 97 |         quad /= shrink
 98 |         zoom *= shrink
 99 |     # Crop.
100 |     border = max(int(np.round(1024 * 0.1 / zoom)), 3)
101 |     crop = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
102 |     crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]), min(crop[3] + border, img.size[1]))
103 |     if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
104 |         img = img.crop(crop)
105 |         quad -= crop[0:2]
106 |     # Simulate super-resolution.
107 |     superres = int(np.exp2(np.ceil(np.log2(zoom))))
108 |     if superres > 1:
109 |         img = img.resize((img.size[0] * superres, img.size[1] * superres), PIL.Image.ANTIALIAS)
110 |         quad *= superres
111 |         zoom /= superres
112 |     # Pad.
113 |     pad = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
114 |     pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0), max(pad[3] - img.size[1] + border, 0))
115 |     if max(pad) > border - 4:
116 |         pad = np.maximum(pad, int(np.round(1024 * 0.3 / zoom)))
117 |         img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
118 |         h, w, _ = img.shape
119 |         y, x, _ = np.mgrid[:h, :w, :1]
120 |         mask = 1.0 - np.minimum(np.minimum(np.float32(x) / pad[0], np.float32(y) / pad[1]), np.minimum(np.float32(w-1-x) / pad[2], np.float32(h-1-y) / pad[3]))
121 |         blur = 1024 * 0.02 / zoom
122 |         img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
123 |         img += (np.median(img, axis=(0,1)) - img) * np.clip(mask, 0.0, 1.0)
124 |         img = PIL.Image.fromarray(np.uint8(np.clip(np.round(img), 0, 255)), 'RGB')
125 |         quad += pad[0:2]
126 |     # Transform.
127 |     img = img.transform((4096, 4096), PIL.Image.QUAD, (quad + 0.5).flatten(), PIL.Image.BILINEAR)
128 |     img = img.resize((1024, 1024), PIL.Image.ANTIALIAS)
129 |     img = np.asarray(img).transpose(2, 0, 1)
130 |     # Verify MD5.
131 |     """
132 |     md5 = hashlib.md5()
133 |     md5.update(img.tobytes())
134 |     assert md5.hexdigest() == fields['proc_md5'][idx]
135 |     """
136 |     # # Load delta image and original JPG.
137 |     # with zipfile.ZipFile(os.path.join(delta_dir, 'deltas%05d.zip' % (idx - idx % 1000)), 'r') as zip:
138 |     #     delta_bytes = zip.read('delta%05d.dat' % idx)
139 |     with open(os.path.join(delta_dir,'delta%05d.dat' % idx), 'rb') as file:
140 |         delta_bytes = file.read()
141 |     with open(orig_path, 'rb') as file:
142 |         orig_bytes = file.read()
143 |     # Decrypt delta image, using original JPG data as decryption key.
144 |     algorithm = cryptography.hazmat.primitives.hashes.SHA256()
145 |     backend = cryptography.hazmat.backends.default_backend()
146 |     salt = bytes(orig_file, 'ascii')
147 |     kdf = cryptography.hazmat.primitives.kdf.pbkdf2.PBKDF2HMAC(algorithm=algorithm, length=32, salt=salt, iterations=100000, backend=backend)
148 |     key = base64.urlsafe_b64encode(kdf.derive(orig_bytes))
149 |     delta = np.frombuffer(bz2.decompress(cryptography.fernet.Fernet(key).decrypt(delta_bytes)), dtype=np.uint8).reshape(3, 1024, 1024)
150 |     # Apply delta image.
151 |     img = img + delta
152 |     # Verify MD5.
153 |     """
154 |     md5 = hashlib.md5()
155 |     md5.update(img.tobytes())
156 |     assert md5.hexdigest() == fields['final_md5'][idx]
157 |     """
158 |     return img
159 | 
160 | 
161 | def do_the_work(img_num):
162 |     print('Create image number {}'.format(img_num))
163 |     img = process_func(img_num)
164 |     np.save(os.path.join(delta_dir, 'imgHQ%05d' % img_num), [img])
165 | 
166 | # for img_num in range(expected_dat):
167 | #     do_the_work(img_num)
168 | 
169 | num_workers = mp.cpu_count() - 1
170 | print('Starting a pool with {} workers'.format(num_workers))
171 | with mp.Pool(processes=num_workers) as pool:
172 |     pool.map(do_the_work, list(range(expected_dat)))
173 | if len(glob.glob(os.path.join(delta_dir, '*.npy'))) != 30000:
174 |     raise ValueError('Expected to find {} npy files\n Something went wrong!'.format(30000))
175 | # Remove the dat files
176 | for filepath in glob.glob(os.path.join(delta_dir, '*.dat')):
177 |     os.remove(filepath)
178 | print('All done! Congratulations!')


--------------------------------------------------------------------------------