├── .gitignore
├── LICENSE
├── README.md
├── dataloader.py
└── preprocess_sequential.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Ignore notebook checkpoints
*/.ipynb_checkpoints/

# ignore experimental logs
experiments/*/output/*
experiments/*/*/output/*
experiments/mnist.pkl.gz

# ignore intermediate data files
experiments/t10k*
experiments/train*

# ignore paper compiling output
paper/out

# ignore all notebooks
notebooks

# ignore eggs
.eggs

# ignore any checkpoints or data
checkpoint
data

# ignore pkls
*.pkl

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Gavin Gray

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Install
-------
Requirements:

* [Tensorpack][]: clone and `pip install -e .`
* [LMDB][]: `pip install lmdb`
* [OpenCV][]: `pip install opencv-python`
* [Protobuf][]: `conda install protobuf`
* [Prctl][]: clone, `sudo apt-get install build-essential libcap-dev` and `python setup.py build`


[tensorpack]: https://github.com/ppwwyyxx/tensorpack
[lmdb]: https://lmdb.readthedocs.io/en/release/
[opencv]: https://pypi.python.org/pypi/opencv-python
[Protobuf]: https://github.com/google/protobuf
[Prctl]: https://github.com/seveas/python-prctl


`Tensorpack` versions > 0.9 are currently NOT supported.
Note that some prebuilt `opencv` packages are much slower than others.
Remember to check with [this script](https://github.com/tensorpack/benchmarks/blob/master/ImageNet/benchmark-opencv-resize.py) and make sure it prints < 1s.

### Preprocessing

To start, set the environment variable `IMAGENET` to the path of the ILSVRC2012
dataset. `TENSORPACK_DATASET` should also be set (for tensorpack).

```sh
export IMAGENET='/mnt/work/data/raw-data/'
python preprocess_sequential.py
```

### Usage

```python
train_loader = LMDBLoader('train', batch_size=args.batch_size, num_workers=32, shuffle=True, cuda=True)
valid_loader = LMDBLoader('val', batch_size=args.batch_size, num_workers=32, shuffle=False, cuda=True)
```
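
A minimal end-to-end training-loop sketch (not from this repo) showing the loader in use; it
assumes `torchvision` is available, and `resnet18` here is just a stand-in for your own network:

```python
import torch
import torch.nn.functional as F
import torchvision.models as models

model = models.resnet18().cuda()  # any network; resnet18 is a placeholder
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

train_loader = LMDBLoader('train', batch_size=256, num_workers=32, shuffle=True, cuda=True)
for x, y in train_loader:  # x: float NCHW tensor in roughly [-1, 1); y: LongTensor on GPU
    optimizer.zero_grad()
    loss = F.cross_entropy(model(x), y)
    loss.backward()
    optimizer.step()
```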
### TODO
- [ ] Image Normalization
- [ ] Support HDF5 format
- [ ] Tensorpack version > 0.9

### Disclaimer

Code mainly from [sequential-imagenet-dataloader](https://github.com/BayesWatch/sequential-imagenet-dataloader), and [Tensorpack](https://github.com/tensorpack/tensorpack) examples.

### Reference

* [Data loader takes a lot of time for every nth iteration](https://discuss.pytorch.org/t/data-loader-takes-a-lot-of-time-for-every-nth-iteration/10831)
* [First batch of Imagenet training is slow with sequential loading](https://discuss.pytorch.org/t/first-batch-of-imagenet-training-is-slow-with-sequential-loading/11464)
* [How to prefetch data when processing with GPU?](https://discuss.pytorch.org/t/how-to-prefetch-data-when-processing-with-gpu/548)
* [How to speed up the data loader](https://discuss.pytorch.org/t/how-to-speed-up-the-data-loader/13740)
* [Fast data loader for Imagenet](https://discuss.pytorch.org/t/fast-data-loader-for-imagenet/988/14)

--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
# dataloader respecting the PyTorch conventions, but using tensorpack to load and process
# includes typical augmentations for ImageNet training

import collections
import os
import re

import cv2
import torch

import numpy as np
import tensorpack.dataflow as td
from tensorpack import imgaug

# default_collate below is adapted from torch.utils.data.dataloader; these two
# module-level names stand in for the ones PyTorch defines internally
string_classes = (str, bytes)
_use_shared_memory = False  # PyTorch sets this to True inside worker processes

#####################################################################################################
# copied from: https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/imagenet_utils.py #
#####################################################################################################
class GoogleNetResize(imgaug.ImageAugmentor):
    """
    crop 8%~100% of the original image
    See `Going Deeper with Convolutions` by Google.
    """
    def __init__(self, crop_area_fraction=0.08,
                 aspect_ratio_low=0.75, aspect_ratio_high=1.333,
                 target_shape=224):
        self._init(locals())

    def _augment(self, img, _):
        h, w = img.shape[:2]
        area = h * w
        for _ in range(10):
            targetArea = self.rng.uniform(self.crop_area_fraction, 1.0) * area
            aspectR = self.rng.uniform(self.aspect_ratio_low, self.aspect_ratio_high)
            ww = int(np.sqrt(targetArea * aspectR) + 0.5)
            hh = int(np.sqrt(targetArea / aspectR) + 0.5)
            if self.rng.uniform() < 0.5:
                ww, hh = hh, ww
            if hh <= h and ww <= w:
                x1 = 0 if w == ww else self.rng.randint(0, w - ww)
                y1 = 0 if h == hh else self.rng.randint(0, h - hh)
                out = img[y1:y1 + hh, x1:x1 + ww]
                out = cv2.resize(out, (self.target_shape, self.target_shape), interpolation=cv2.INTER_CUBIC)
                return out
        # fall back to a deterministic center crop if no valid random crop was found
        out = imgaug.ResizeShortestEdge(self.target_shape, interp=cv2.INTER_CUBIC).augment(img)
        out = imgaug.CenterCrop(self.target_shape).augment(out)
        return out
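
# A minimal sketch (not part of the original module) of applying the augmentor
# to a single decoded image; 'some_image.jpg' is a hypothetical path:
#
#   aug = GoogleNetResize()
#   aug.reset_state()                   # (re-)seed the augmentor's RNG
#   img = cv2.imread('some_image.jpg')  # BGR uint8, HWC
#   crop = aug.augment(img)             # random crop resized to 224x224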

def fbresnet_augmentor(isTrain):
    """
    Augmentor used in fb.resnet.torch, for BGR images in range [0,255].
    """
    if isTrain:
        augmentors = [
            GoogleNetResize(),
            imgaug.RandomOrderAug(
                [imgaug.BrightnessScale((0.6, 1.4), clip=False),
                 imgaug.Contrast((0.6, 1.4), clip=False),
                 imgaug.Saturation(0.4, rgb=False),
                 # rgb-bgr conversion for the constants copied from fb.resnet.torch
                 imgaug.Lighting(0.1,
                                 eigval=np.asarray(
                                     [0.2175, 0.0188, 0.0045][::-1]) * 255.0,
                                 eigvec=np.array(
                                     [[-0.5675, 0.7192, 0.4009],
                                      [-0.5808, -0.0045, -0.8140],
                                      [-0.5836, -0.6948, 0.4203]],
                                     dtype='float32')[::-1, ::-1]
                                 )]),
            imgaug.Flip(horiz=True),
        ]
    else:
        augmentors = [
            imgaug.ResizeShortestEdge(256, cv2.INTER_CUBIC),
            imgaug.CenterCrop((224, 224)),
        ]
    return augmentors
#####################################################################################################
#####################################################################################################


numpy_type_map = {
    'float64': torch.DoubleTensor,
    'float32': torch.FloatTensor,
    'float16': torch.HalfTensor,
    'int64': torch.LongTensor,
    'int32': torch.IntTensor,
    'int16': torch.ShortTensor,
    'int8': torch.CharTensor,
    'uint8': torch.ByteTensor,
}


def default_collate(batch):
    """Puts each data field into a tensor with outer dimension batch size.

    Adapted from torch.utils.data.dataloader.default_collate.
    """
    error_msg = "batch must contain tensors, numbers, dicts or lists; found {}"
    elem_type = type(batch[0])
    if torch.is_tensor(batch[0]):
        out = None
        if _use_shared_memory:
            # If we're in a background process, concatenate directly into a
            # shared memory tensor to avoid an extra copy
            numel = sum([x.numel() for x in batch])
            storage = batch[0].storage()._new_shared(numel)
            out = batch[0].new(storage)
        return torch.stack(batch, 0, out=out)
    elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
            and elem_type.__name__ != 'string_':
        elem = batch[0]
        if elem_type.__name__ == 'ndarray':
            # array of string classes and object
            if re.search('[SaUO]', elem.dtype.str) is not None:
                raise TypeError(error_msg.format(elem.dtype))

            return torch.stack([torch.from_numpy(b) for b in batch], 0)
        if elem.shape == ():  # scalars
            py_type = float if elem.dtype.name.startswith('float') else int
            return numpy_type_map[elem.dtype.name](list(map(py_type, batch)))
    elif isinstance(batch[0], int):
        return torch.LongTensor(batch)
    elif isinstance(batch[0], float):
        return torch.DoubleTensor(batch)
    elif isinstance(batch[0], string_classes):
        return batch
    elif isinstance(batch[0], collections.Mapping):
        return {key: default_collate([d[key] for d in batch]) for key in batch[0]}
    elif isinstance(batch[0], collections.Sequence):
        transposed = zip(*batch)
        return [default_collate(samples) for samples in transposed]

    raise TypeError((error_msg.format(type(batch[0]))))
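
# Illustrative sketch (commented out, not in the original module) of what
# default_collate returns for the kinds of batches this loader produces:
#
#   imgs = [np.zeros((224, 224, 3), dtype='uint8')] * 4
#   default_collate(imgs)          # -> ByteTensor of shape (4, 224, 224, 3)
#   default_collate([0, 1, 2, 3])  # -> LongTensor([0, 1, 2, 3])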

class LMDBLoader(object):

    def __init__(self, mode, batch_size=256, shuffle=False, num_workers=25, cache=50000,
                 collate_fn=default_collate, drop_last=False, cuda=False):
        # enumerate standard imagenet augmentors
        imagenet_augmentors = fbresnet_augmentor(mode == 'train')

        # load the lmdb if we can find it
        lmdb_loc = os.path.join(os.environ['IMAGENET'], 'ILSVRC-%s.lmdb' % mode)
        ds = td.LMDBData(lmdb_loc, shuffle=shuffle)
        # ds = td.LocallyShuffleData(ds, cache)
        ds = td.PrefetchData(ds, 5000, 1)
        ds = td.LMDBDataPoint(ds)
        ds = td.MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
        ds = td.AugmentImageComponent(ds, imagenet_augmentors)
        ds = td.PrefetchDataZMQ(ds, num_workers)
        self.ds = td.BatchData(ds, batch_size)
        self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.cuda = cuda
        #self.drop_last = drop_last

    def __iter__(self):
        for x, y in self.ds.get_data():
            if self.cuda:
                # images come out as uint8, which are faster to copy onto the gpu
                x = torch.ByteTensor(x).cuda()
                y = torch.IntTensor(y).cuda()
                # but once they're on the gpu, we'll need them as floats
                yield uint8_to_float(x), y.long()
            else:
                yield uint8_to_float(torch.ByteTensor(x)), torch.IntTensor(y).long()

    def __len__(self):
        return self.ds.size()


def uint8_to_float(x):
    x = x.permute(0, 3, 1, 2)  # pytorch is (n, c, h, w)
    return x.float() / 128. - 1.  # scale [0, 255] to roughly [-1, 1)


if __name__ == '__main__':
    from tqdm import tqdm
    dl = LMDBLoader('train', cuda=True)
    for x in tqdm(dl, total=len(dl)):
        pass

--------------------------------------------------------------------------------
/preprocess_sequential.py:
--------------------------------------------------------------------------------
# loads imagenet and writes it into one massive binary file

import os
import numpy as np
from tensorpack.dataflow import *

if __name__ == '__main__':
    class BinaryILSVRC12(dataset.ILSVRC12Files):
        def get_data(self):
            # newer tensorpack renamed get_data() to __iter__(), so iterate via
            # __iter__ to get the (filename, label) pairs from the parent class
            for fname, label in super(BinaryILSVRC12, self).__iter__():
                with open(fname, 'rb') as f:
                    jpeg = f.read()
                # store the raw jpeg bytes; decoding happens later in the dataloader
                jpeg = np.asarray(bytearray(jpeg), dtype='uint8')
                yield [jpeg, label]
    imagenet_path = os.environ['IMAGENET']

    for name in ['train', 'val']:
        ds0 = BinaryILSVRC12(imagenet_path, name)
        ds1 = PrefetchDataZMQ(ds0, nr_proc=1)
        dftools.dump_dataflow_to_lmdb(ds1, os.path.join(imagenet_path, 'ILSVRC-%s.lmdb' % name))
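    # Sketch (not in the original script) of verifying the resulting LMDBs by
    # reading them back and counting datapoints; assumes the loop above has run:
    #
    #   for name in ['train', 'val']:
    #       ds = LMDBData(os.path.join(imagenet_path, 'ILSVRC-%s.lmdb' % name), shuffle=False)
    #       ds.reset_state()
    #       print(name, ds.size())  # expect 1281167 for train, 50000 for val
--------------------------------------------------------------------------------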