├── MANIFEST.in
├── README.txt
├── pymapreduce
│   └── __init__.py
└── setup.py

/MANIFEST.in:
--------------------------------------------------------------------------------
include *.txt
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------

Py MapReduce
============

Py MapReduce is a simple single-machine implementation of MapReduce in Python, built on the multiprocessing module.
It can be used, for instance, to quickly parallelize file-processing tasks, e.g. performing an operation on each line of a large file.
Simple operations (regexp matching, etc.) are hard to multithread in Python because of the Global Interpreter Lock (http://wiki.python.org/moin/GlobalInterpreterLock); multiprocessing sidesteps the GIL by using separate processes.

Sample job (Word Count)
-----------------------

    class WC(Job):
        "Sample word count parallel implementation"
        lc = 0
        wc = 0
        bc = 0

        def __init__(self, f):
            self.file = f

        def reduce_start(self):
            self.lc = 0
            self.wc = 0
            self.bc = 0

        def enumerate(self):
            return enumerate(open(self.file))

        def map(self, item, cb):
            (pos, line) = item
            cb((pos, (1, len(line.split()), len(line))))

        def reduce_value(self, r):
            (lc, wc, bc) = r
            self.lc = self.lc + lc
            self.wc = self.wc + wc
            self.bc = self.bc + bc

        def reduce_stop(self):
            return (self.lc, self.wc, self.bc)
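
Usage
-----

Pick a runner and pass it a job instance; run() returns whatever the job's reduce_stop() returns. A minimal sketch mirroring the self-test at the bottom of pymapreduce/__init__.py (the file name here is just a placeholder):

    from pymapreduce import WC, DiskBasedRunner

    runner = DiskBasedRunner()
    (lc, wc, bc) = runner.run(WC("some_large_file.txt"))
    print lc, wc, bc

The module can also be run directly; it then runs the WC job on each file given on the command line, once per available runner:

    python pymapreduce/__init__.py some_large_file.txt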
--------------------------------------------------------------------------------
/pymapreduce/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import multiprocessing, sys, cPickle, os, datetime
from multiprocessing.queues import SimpleQueue
from tempfile import NamedTemporaryFile, TemporaryFile
import heapq, itertools

class Job(object):
    """
    Base class for jobs.

    job.enumerate() should return an iterator over items.

    For each item,
        job.map(item, cb) is called;
        cb should be called once per key-value pair the item produces.

    All key-value pairs are sorted, then for each key
        job.reduce_key_start(key) is called,
        job.reduce_value(value) is called once per value with that key,
        job.reduce_key_stop(key) is called.
    """

    def map(self, item, cb):
        cb(item)

    def reduce_start(self):
        pass

    def reduce_key_start(self, key):
        pass

    def reduce_key_stop(self, key):
        pass

    def reduce_value(self, r):
        pass

    def reduce_stop(self):
        pass

class WC(Job):
    "Sample word count parallel implementation"
    lc = 0
    wc = 0
    bc = 0

    def __init__(self, f):
        self.file = f

    def reduce_start(self):
        self.lc = 0
        self.wc = 0
        self.bc = 0

    def enumerate(self):
        return enumerate(open(self.file))

    def map(self, item, cb):
        (pos, line) = item
        cb((pos, (1, len(line.split()), len(line))))

    def reduce_value(self, r):
        (lc, wc, bc) = r
        self.lc = self.lc + lc
        self.wc = self.wc + wc
        self.bc = self.bc + bc

    def reduce_stop(self):
        return (self.lc, self.wc, self.bc)

def debug_print(s):
    print >> sys.stderr, "[%s] (pid %u) %s" % (datetime.datetime.now().strftime('%H:%M:%S'), os.getpid(), s)

class BaseRunner(object):
    STOP_MSG = "##STOP_MSG##"

    def __init__(self):
        self.debug = False

    def reduce_loop(self, item_iterator):
        # Feed a sorted (key, value) stream to the job, bracketing each run of
        # identical keys with reduce_key_start/reduce_key_stop.
        job = self.job
        job.reduce_start()
        pkey = None
        for (key, val) in item_iterator:
            if pkey is None or pkey != key:
                if pkey is not None:
                    job.reduce_key_stop(pkey)
                job.reduce_key_start(key)
                pkey = key
            job.reduce_value(val)
        if pkey is not None:
            job.reduce_key_stop(pkey)
        return job.reduce_stop()

class SingleThreadRunner(BaseRunner):
    """
    Runner that executes a job in a single thread of a single process.
    """
    def run(self, job):
        self.job = job
        buf = []
        for elt in job.enumerate():
            job.map(elt, buf.append)
        buf.sort()
        return self.reduce_loop(buf)

class BaseMultiprocessingRunner(BaseRunner):
    def __init__(self):
        super(BaseMultiprocessingRunner, self).__init__()
        self.numprocs = max(multiprocessing.cpu_count() - 1, 1)
        self.map_input_queue = SimpleQueue()
        self.map_output_queue = SimpleQueue()

    def run_map(self):
        for item in iter(self.map_input_queue.get, self.STOP_MSG):
            self.job.map(item, self.map_output_queue.put)
        self.map_output_queue.put(self.STOP_MSG)
        if self.debug:
            debug_print("Output: STOP sent")

    def run_enumerate(self):
        for inp in self.job.enumerate():
            self.map_input_queue.put(inp)
        for work in range(self.numprocs):
            self.map_input_queue.put(self.STOP_MSG)
        if self.debug:
            debug_print("Input: STOP sent")

    def run(self, job):
        self.job = job
        # Process that reads the input
        self.enumeration_process = multiprocessing.Process(target=self.run_enumerate, args=())

        self.mappers = [multiprocessing.Process(target=self.run_map, args=())
                        for i in range(self.numprocs)]

        self.enumeration_process.start()
        for mapper in self.mappers:
            mapper.start()
        r = self.run_reduce()
        self.enumeration_process.join()
        for mapper in self.mappers:
            mapper.join()
        return r
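# Data flow shared by the multiprocessing runners (a sketch for orientation,
# using the queue and method names defined above):
#
#   run_enumerate() --item--> map_input_queue --item--> run_map() (x numprocs)
#   run_map() --(key, value) pair or spill-file name--> map_output_queue
#   run_reduce() drains map_output_queue until it has seen numprocs STOP_MSGs
#
# Each subclass below supplies a run_reduce() with a different strategy for
# collecting and ordering the mappers' output.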
class DiskBasedRunner(BaseMultiprocessingRunner):
    def __init__(self, map_buffer_size=10000, reduce_max_files=10):
        super(DiskBasedRunner, self).__init__()
        self.item_buffer = []
        self.map_buffer_size = map_buffer_size
        self.reduce_max_files = reduce_max_files
        self.map_opened_files = []

    def run_map(self):
        self.item_buffer = []
        for item in iter(self.map_input_queue.get, self.STOP_MSG):
            self.job.map(item, self.item_buffer.append)
            if len(self.item_buffer) > self.map_buffer_size:
                self.map_buffer_clear()
        self.map_buffer_clear()
        self.map_output_queue.put(self.STOP_MSG)
        if self.debug:
            debug_print("Map done")

    def map_buffer_clear(self):
        # Nothing to spill (e.g. the final flush of an empty buffer)
        if not self.item_buffer:
            return
        self.item_buffer.sort()
        # Keep the file object alive: a NamedTemporaryFile is deleted as soon
        # as it is closed, and the reducer still has to read it.
        f = NamedTemporaryFile()
        if self.debug:
            debug_print('Temp file %s' % f.name)
        for item in self.item_buffer:
            cPickle.dump(item, f, cPickle.HIGHEST_PROTOCOL)
        f.flush()
        self.map_opened_files.append(f)
        self.map_output_queue.put(f.name)
        del self.item_buffer[:]

    def get_next_file(self):
        while self.stopped_received < self.numprocs:
            filename = self.map_output_queue.get()
            if filename == self.STOP_MSG:
                self.stopped_received = self.stopped_received + 1
                if self.debug:
                    debug_print("Reducer received complete output from %u mappers" % self.stopped_received)
                continue
            else:
                if self.debug:
                    debug_print('Reading %s' % filename)
                yield open(filename, 'rb')
        if self.debug:
            debug_print('All files from mappers received')

    def iter_on_file(self, stream):
        try:
            while True:
                yield cPickle.load(stream)
        except EOFError:
            stream.close()
            # Named spill files are deleted once fully consumed; anonymous
            # TemporaryFile streams have no on-disk path to remove.
            name = getattr(stream, "name", None)
            if isinstance(name, str) and os.path.exists(name):
                os.remove(name)

    def run_reduce(self):
        self.stopped_received = 0
        self.merged_files = []
        merged_iterator = None
        while True:
            # Merge at most reduce_max_files spill files per pass until the
            # mappers' output is exhausted.
            get_next = self.get_next_file()
            files = itertools.islice(get_next, self.reduce_max_files)
            all_files = [f for f in files]
            iterables = [self.iter_on_file(f) for f in all_files]
            merged_iterator = heapq.merge(*iterables)
            if self.stopped_received < self.numprocs:
                if self.debug:
                    debug_print("Performing intermediate merge on %u files" % len(iterables))
                f = TemporaryFile()
                self.merged_files.append(f)
                for m in merged_iterator:
                    cPickle.dump(m, f, cPickle.HIGHEST_PROTOCOL)
                f.flush()
                f.seek(0)
            else:
                break
        if len(self.merged_files) > 0:
            if self.debug:
                debug_print("Final merge")
            # Final merge if required
            merged_iterator = heapq.merge(*([self.iter_on_file(stream) for stream in self.merged_files] + [merged_iterator]))
        if self.debug:
            debug_print("Reduce loop")
        return self.reduce_loop(merged_iterator)
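# DiskBasedRunner is essentially an external merge sort: each mapper writes
# sorted pickle spill files, and heapq.merge combines the already-sorted
# streams lazily. An illustrative snippet (not part of the library):
#
#   >>> import heapq
#   >>> list(heapq.merge([(0, 'a'), (2, 'c')], [(1, 'b')]))
#   [(0, 'a'), (1, 'b'), (2, 'c')]
#
# so reduce_loop() sees the pairs grouped by key without the whole data set
# ever being held in memory at once.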
class SedLikeJobRunner(BaseMultiprocessingRunner):
    """
    Runner optimized for jobs that output key-value pairs of the form
    (i, value), where the keys i are consecutive integers starting at 0.
    """
    def run_reduce(self):
        cur = 0
        pending = {}
        self.job.reduce_start()
        for mappers in range(self.numprocs):
            for msg in iter(self.map_output_queue.get, self.STOP_MSG):
                (i, val) = msg
                # Verify rows arrive in order; park out-of-order ones in pending
                if i != cur:
                    pending[i] = val
                else:
                    self.job.reduce_key_start(i)
                    self.job.reduce_value(val)
                    self.job.reduce_key_stop(i)
                    cur += 1
                    while cur in pending:
                        self.job.reduce_key_start(cur)
                        self.job.reduce_value(pending[cur])
                        self.job.reduce_key_stop(cur)
                        del pending[cur]
                        cur += 1
            if self.debug:
                debug_print("Mapper done %u" % mappers)
        return self.job.reduce_stop()

class WCLikeJobRunner(BaseMultiprocessingRunner):
    """
    Runner optimized for jobs that always output the same key and only
    perform a global reduce over all values.
    """
    def run_reduce(self):
        self.job.reduce_start()
        for mappers in range(self.numprocs):
            for msg in iter(self.map_output_queue.get, self.STOP_MSG):
                (key, val) = msg
                self.job.reduce_value(val)
        return self.job.reduce_stop()

class RambasedRunner(BaseMultiprocessingRunner):
    """
    Runner that collects all mapper output in memory, sorts it, and then
    runs the standard reduce loop over it.
    """
    def run_reduce(self):
        buf = []
        for mappers in range(self.numprocs):
            for msg in iter(self.map_output_queue.get, self.STOP_MSG):
                buf.append(msg)
        buf.sort()
        # reduce_loop() calls job.reduce_start() itself, so it is not
        # duplicated here.
        return self.reduce_loop(buf)

if __name__ == "__main__":
    runners = []
    runners.append(SingleThreadRunner())
    runners.append(RambasedRunner())
    runners.append(WCLikeJobRunner())
    runners.append(SedLikeJobRunner())
    runners.append(DiskBasedRunner())
    for runner in runners:
        runner.debug = True
        for argv in sys.argv[1:]:
            (lc, wc, bc) = runner.run(WC(argv))
            print "(%s)\t%u\t%u\t%u\t%s" % (runner.__class__.__name__, lc, wc, bc, argv)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from distutils.core import setup

try:
    from distutils.command.build_py import build_py_2to3 as build_py
except ImportError:
    from distutils.command.build_py import build_py

setup(
    name='pymapreduce',
    version='0.2',
    url='https://github.com/fdouetteau/PyMapReduce',
    description='Simple MapReduce in Python',
    author='Florian Douetteau',
    author_email='florian@douetteau.net',
    license='MIT',
    platforms='any',
    packages=['pymapreduce'],
    py_modules=[],
    requires=[],
    classifiers=[
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
    cmdclass={'build_py': build_py}
)
--------------------------------------------------------------------------------