├── README.md
├── lazy_loader.py
└── test.py

/README.md:
--------------------------------------------------------------------------------
# Lazy Loader

A data loader designed specifically for huge datasets.

## Usage

First split the whole dataset into multiple files stored on disk, using as many files as possible; Lazy Loader can then load them. A rough sketch of one way to do the splitting appears at the end of this section. Here is a usage example.

```python
from lazy_loader import LazyLoader

loader = LazyLoader(
    filenames = ['file1', 'file2', 'file3'], # names of the split data files
    stream_loader = stream_loader,     # your stream loader implementation
    stream_parser = stream_parser,     # your stream parser implementation
    batch_generator = batch_generator, # your batch generator implementation
    down_sample_rate = 16,             # keep each item with probability 1/N; larger values spread the data better
    num_workers = 4,                   # number of loader processes; more workers load data faster
    buffer_size = 32 * 1024,           # shuffle buffer size; larger buffers give better randomness but need more memory
    batch_size = 256,                  # batch size
    flag = flag                        # flag used to send the stop signal
)

for _ in range(10000):
    batch = next(loader) # keep fetching the next batch

# When the program is done, send the stop signal.
flag.set_stop_flag()
try:
    batch = next(loader)
except StopIteration:
    print("stop...")

```

The interfaces to implement for ```StreamLoader```, ```StreamParser``` and ```BatchGenerator``` are shown below.

```python
class StreamLoader:
    def __init__(self):
        pass

    def func(self, filename):

        # Takes a filename, loads the file, converts it into a stream and
        # returns it. If the file does not exist, it must return None.

        return stream


class StreamParser:
    def __init__(self):
        pass

    def func(self, stream):

        # Takes a file stream, parses one piece of data from it and returns
        # it. If the stream is exhausted and no more data can be parsed, it
        # must return None.

        return data


class BatchGenerator:
    def __init__(self):
        pass

    def func(self, data_list):

        # Takes a list of data items and returns them assembled into a batch.

        return batch
```

See ```test.py``` for a complete example.
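
The splitting step itself is not part of the library; how you shard your data is up to you. As a rough illustration only (the function and file names below are made up for this sketch), the following splits one large text file, with one record per line as in ```test.py```, into many smaller files whose names can then be passed to ```LazyLoader```.

```python
import os

def split_into_shards(input_file, out_dir, num_shards=128):
    # Split one large text file (one record per line) into num_shards smaller
    # files, distributing the records round-robin so every shard gets a
    # similar amount of data.
    os.makedirs(out_dir, exist_ok=True)
    shards = [open(os.path.join(out_dir, "shard_{}.txt".format(i)), "w")
              for i in range(num_shards)]
    with open(input_file, "r") as f:
        for idx, line in enumerate(f):
            shards[idx % num_shards].write(line)
    for s in shards:
        s.close()
    return [os.path.join(out_dir, "shard_{}.txt".format(i)) for i in range(num_shards)]

filenames = split_into_shards("big_dataset.txt", "shards")
```
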
## License

There is no license; feel free to include it in your own projects.

## TODO

* Support LMDB?
--------------------------------------------------------------------------------
/lazy_loader.py:
--------------------------------------------------------------------------------
import multiprocessing as mp
import threading
import random

class ShuffleBuffer:
    def __init__(self, buf_size):
        self._buf = list()
        self._max_buf_size = buf_size
        assert self._max_buf_size >= 1, "Buffer size must be greater than zero."

    def insert_item_and_pop(self, item):
        size = len(self._buf)

        if size > 4:
            i = random.randint(0, size-1)

            # Swap the new item with a random element of the buffer. This
            # single Fisher-Yates style step keeps the buffer shuffled cheaply.
            self._buf[i], item = item, self._buf[i]

        if size < self._max_buf_size:
            self._buf.append(item)
            return None
        return item

class DataLoader:
    def __init__(self, filenames, data_writer, down_sample_rate, stream_loader, stream_parser):
        self.done = filenames
        self.tasks = list()

        self.parser = stream_parser
        self.loader = stream_loader
        self.writer = data_writer
        self.stream = None

        # Randomly down-sample the input data. This helps improve the spread
        # of the data in the shuffle buffer.
        self.rate = down_sample_rate

        assert len(filenames) != 0, "There must be at least one filename."

    def _open_new_stream(self):
        if len(self.tasks) == 0:
            self.tasks, self.done = self.done, self.tasks
            random.shuffle(self.tasks)

        filename = self.tasks.pop()
        self.done.append(filename)

        return self.loader.func(filename)

    def next(self):
        while True:
            if self.stream is None:
                self.stream = self._open_new_stream()

            data = self.parser.func(self.stream)

            if data is None:
                # The stream has ended. Open a new stream next time.
                self.stream = None
                continue

            if self.rate > 1:
                # Apply the down-sampling: keep each item with probability 1/rate.
                if random.randint(0, self.rate-1) != 0:
                    continue

            self.writer.send(data)
            break

class LoaderConfig:
    def __init__(self):
        self.filenames = list()
        self.stream_loader = None
        self.stream_parser = None
        self.batch_generator = None
        self.down_sample_rate = 16
        self.num_workers = 0
        self.buffer_size = 0
        self.batch_size = 0
        self.flag = None

    def valid(self):
        if len(self.filenames) <= 0 or \
           self.stream_loader is None or \
           self.stream_parser is None or \
           self.batch_generator is None or \
           self.num_workers <= 0 or \
           self.buffer_size <= 0 or \
           self.batch_size <= 0:
            return False
        return True

class LoaderFlag:
    NONE = 0
    STOP = 1

    def __init__(self):
        self.flag = mp.Value("i", self.NONE)

    def is_stop(self):
        with self.flag.get_lock():
            v = self.flag.value
        return v == self.STOP

    def reset_flag(self):
        with self.flag.get_lock():
            self.flag.value = self.NONE

    def set_stop_flag(self):
        with self.flag.get_lock():
            self.flag.value = self.STOP

def _load_from_files(config, data_writer):
    # Load the data from disk. Prefer a heavy stream parser over a heavy batch
    # generator, because N worker processes execute the parser function while
    # only a single thread executes the generator function.

    loader = DataLoader(
        filenames = config.filenames,
        data_writer = data_writer,
        down_sample_rate = config.down_sample_rate,
        stream_loader = config.stream_loader,
        stream_parser = config.stream_parser
    )

    while True:
        if config.flag.is_stop():
            data_writer.close()
            break
        loader.next()

def _gather_batch(config, data_readers, batch_writer):
    shuf_buff = ShuffleBuffer(config.buffer_size)
    batch_gen = config.batch_generator

    stop = False
    while not stop:
        # Fill the buffer until it is full.
        for r in data_readers:
            try:
                item = r.recv()
                outs = shuf_buff.insert_item_and_pop(item)
                if outs is not None:
                    stop = True
            except:
                # The writer side of the pipe was closed.
                return

    # Now start preparing batches. Filling the buffer first significantly
    # improves the loader performance.
    while True:
        if config.flag.is_stop():
            batch_writer.close()
            break

        data_list = list()

        while len(data_list) < config.batch_size:
            for r in data_readers:
                try:
                    item = r.recv()
                    outs = shuf_buff.insert_item_and_pop(item)
                    if outs is not None:
                        data_list.append(outs)
                    if len(data_list) >= config.batch_size:
                        # Stop once the batch is full.
                        break
                except:
                    # The writer side of the pipe was closed.
                    return
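
        # data_list now holds a full batch of shuffled items. Note that
        # batch_gen.func runs only in this single gather thread, so heavy
        # per-item work belongs in the stream parser, which every worker
        # process runs (see the note in _load_from_files).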
        # Send the batch.
        batch = batch_gen.func(data_list)
        try:
            batch_writer.send(batch)
        except:
            # The reader side of the pipe was closed.
            return

def LazyLoader(*args, **kwargs):
    config = LoaderConfig()

    config.filenames = kwargs.get("filenames", list())
    config.stream_loader = kwargs.get("stream_loader", None)
    config.stream_parser = kwargs.get("stream_parser", None)
    config.batch_generator = kwargs.get("batch_generator", None)
    config.down_sample_rate = kwargs.get("down_sample_rate", 0)
    config.num_workers = kwargs.get("num_workers", 0)
    config.buffer_size = kwargs.get("buffer_size", 0)
    config.batch_size = kwargs.get("batch_size", 0)
    config.flag = kwargs.get("flag", LoaderFlag())

    if not config.valid():
        print("Config is invalid. Please check your settings.")
        return None

    proc_list = list()
    data_readers = list()
    batch_reader, batch_writer = mp.Pipe(duplex=False)

    for _ in range(config.num_workers):
        # One process uses one pipe.
        data_reader, data_writer = mp.Pipe(duplex=False)
        data_readers.append(data_reader)

        # Create one worker process.
        p = mp.Process(
            target=_load_from_files,
            args=(config, data_writer),
            daemon=True
        )
        p.start()
        proc_list.append(p)
        data_writer.close()

    t = threading.Thread(
        target=_gather_batch,
        args=(config, data_readers, batch_writer),
        daemon=True
    )
    t.start()

    # Do not close batch_writer here because the gather thread and the main
    # thread share the same writer.
    # batch_writer.close()

    while True:
        if config.flag.is_stop():
            for idx, p in enumerate(proc_list):
                while p.is_alive():
                    try:
                        # Drain the pipe so the worker can finish its send().
                        _ = data_readers[idx].recv()
                        continue
                    except:
                        pass
                data_readers[idx].close()
            batch_reader.close()
            batch_writer.close()
            t.join()
            return
        try:
            batch = batch_reader.recv()
            yield batch
        except:
            return
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import os, random, glob, io, time
from lazy_loader import LazyLoader, LoaderFlag

def gen_dummy_data():
    dirname = "dummy-data"
    if not os.path.isdir(dirname):
        os.mkdir(dirname)

    print("write the sample data into the {} directory...\n".format(dirname))

    for i in range(10):
        filename = os.path.join(dirname, "data_{}.txt".format(i+1))
        if not os.path.isfile(filename):
            with open(filename, 'w') as f:
                for j in range(1024):
                    random_list = list()
                    for k in range(2):
                        random_list.append(random.randint(0, 99999))
                    for r in random_list:
                        f.write("{} ".format(r))
                    f.write("\n")

def gather_filenames():
    def gather_recursive_files(root):
        l = list()
        for name in glob.glob(os.path.join(root, "*")):
            if os.path.isdir(name):
                l.extend(gather_recursive_files(name))
            else:
                l.append(name)
        return l

    return gather_recursive_files("dummy-data")

class StreamLoader:
    def __init__(self):
        pass

    def func(self, filename):
        stream = None
        if os.path.isfile(filename):
            with open(filename, 'r') as f:
                stream = io.StringIO(f.read())
        return stream
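
# Note that DataLoader passes whatever StreamLoader.func returns straight to
# StreamParser.func, so the parser must also handle a stream of None (for
# example, a missing file) and simply return None in that case.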
class StreamParser:
    def __init__(self):
        pass

    def func(self, stream):
        if stream is None:
            return None

        line = stream.readline()
        if len(line) == 0:
            return None

        data = list()
        vals = line.split()
        for v in vals:
            data.append(int(v))
        return data

class BatchGenerator:
    def __init__(self):
        pass

    def func(self, data_list):
        x = list()
        y = list()

        for data in data_list:
            x.append(data[0])
            y.append(data[1])

        batch = x, y
        return batch

if __name__ == "__main__":
    # Create sample data.
    gen_dummy_data()

    # Implement the loader, parser and generator.
    sl = StreamLoader()
    sp = StreamParser()
    bg = BatchGenerator()
    flag = LoaderFlag()

    # Create the lazy loader.
    loader = LazyLoader(
        filenames = gather_filenames(),
        stream_loader = sl,
        stream_parser = sp,
        batch_generator = bg,
        down_sample_rate = 16,
        num_workers = 1,
        buffer_size = 512,
        batch_size = 32,
        flag = flag
    )

    # The first next() waits for the shuffle buffer to fill.
    batch = next(loader)

    # Gather some batches.
    for _ in range(10):
        batch = next(loader)
        print(batch)
    flag.set_stop_flag()
    try:
        batch = next(loader)
    except StopIteration:
        print("stop...")
--------------------------------------------------------------------------------