├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── setup.py └── taskmap ├── __init__.py ├── logparse.py ├── taskmap.py ├── taskmap_test.py └── tgraph.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | .static_storage/ 56 | .media/ 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | - "3.6" 5 | 6 | # command to install dependencies 7 | install: 8 | - pip install --editable ./ 9 | - pip install pytest 10 | 11 | # command to run tests 12 | script: 13 | - py.test 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Noam Finkelstein 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![Build Status](https://travis-ci.org/n-s-f/taskmap.svg?branch=master)](https://travis-ci.org/n-s-f/taskmap)

# taskmap

This library facilitates keeping track of dependencies between python functions,
and running them asynchronously and/or in parallel.

## Overview

There are many libraries in python that help run pipelines in parallel while
keeping track of dependencies between tasks, notably
[joblib](https://pythonhosted.org/joblib/) and
[luigi](https://github.com/spotify/luigi).

taskmap provides a way to easily handle coroutines in task pipelines. Many kinds
of pipelines will have a significant number of tasks that are IO bound. Running
these tasks in parallel will still leave lots of unused processing time.

That's where taskmap comes in. It's designed to help you get the most out of a
single machine. You specify the dependency graph for your tasks (which are just
native python functions or coroutines), and optionally which ones are IO bound.
The tasks can then be run asynchronously and in parallel, making sure that no core
sits unused while there are tasks it could be running.

Because taskmap keeps track of the dependency graph, it is easy to rerun only
failed tasks. It's also possible to change the functions corresponding to tasks
and rerun only those changed tasks and their children. You can then cache your
results, so that later you can pick up where you left off.

## Installation

```
pip install taskmap
```

## Quick Start

This example demonstrates the major use case of the taskmap library.

```.py
import taskmap
import asyncio
import time

# simulates io waits with asyncio.sleep
async def io_bound_a(): await asyncio.sleep(1); return 'io_a'
async def io_bound_b(x): await asyncio.sleep(1); return x + ' io_b'

# simulates cpu usage with time.sleep
async def cpu_bound_a(x): time.sleep(1); return x + ' cpu_a'
async def cpu_bound_b(): time.sleep(1); return 'cpu_b'

def test_async_parallel_demo():
    # given
    funcs = {
        'io_bound_a': io_bound_a,
        'io_bound_b': io_bound_b,
        'cpu_bound_a': cpu_bound_a,
        'cpu_bound_b': cpu_bound_b,
    }

    dependencies = {
        'io_bound_a': [],
        'io_bound_b': ['cpu_bound_b'],
        'cpu_bound_a': ['io_bound_a'],
        'cpu_bound_b': [],
    }

    io_bound = ['io_bound_a', 'io_bound_b']
    graph = taskmap.create_graph(funcs, dependencies, io_bound=io_bound)

    # when
    graph = taskmap.run_parallel_async(graph, nprocs=2)

    # then
    assert graph.results['io_bound_a'] == 'io_a'
    assert graph.results['io_bound_b'] == 'cpu_b io_b'
    assert graph.results['cpu_bound_a'] == 'io_a cpu_a'
    assert graph.results['cpu_bound_b'] == 'cpu_b'
```

More examples can be found in the tests.
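If all of your tasks are plain functions, the same graph API runs them
synchronously as well. A minimal sketch, adapted from the tests:

```.py
import taskmap

def a(): return 5
def b(x): return x + 10
def c(x, y): return x + y + 20

funcs = {'a': a, 'b': b, 'c': c}
dependencies = {'c': ['a', 'b'], 'b': ['a'], 'a': []}

graph = taskmap.create_graph(funcs, dependencies)
graph = taskmap.run(graph)

# each task receives the results of its dependencies, in order
assert graph.results == {'a': 5, 'b': 15, 'c': 40}
```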
## API

### Creating and Running the Graph

#### create_graph(funcs, dependencies, io_bound=None, done=None, results=None, name='taskmap', logging_config=None)

Creates the dependency graph.

`dependencies`: a dictionary that maps task names to a list of dependencies. The
results of those dependencies will be fed into the function in the order in
which they appear. Tasks that return `None` will not have their results fed into
the tasks that depend on them.

`funcs`: a dictionary that maps the names of the tasks to functions. Each
function should accept the same number of arguments as it has dependencies that
return non-`None` values.

`io_bound`: a list of the names of the tasks that are io bound. These will be
picked up first, so that the cpu bound tasks can be executed while waiting on
results from e.g. network or database calls.

`done`: a list of the names of tasks that are already done. These tasks will not
be run if any of the `run*(graph)` functions are called with this graph. This is
a way to run only part of a dependency graph without changing the code that
creates the `dependencies` or `funcs` arguments.

`results`: a dictionary mapping the names of tasks to their results. This is
useful if the tasks listed in the `done` argument have results that their
children need passed to them.

`name`: the name of the graph. It is used to name the graph's loggers and any
log file they write.

`logging_config`: a dictionary with the optional keys `'write'`, which specifies
whether the log is written to disk (default false), and `'level'`, which sets
the logging severity (default `logging.DEBUG`). If two graphs are created with
the same name, only the logging config from the first is used.

This function will throw for a dependency dictionary with cyclic dependencies,
or if there are functions that are depended on but are not present as keys in
the dependencies dictionary.

Note that for coroutines, `functools.partial` will not work. If you need to
create partial functions to use as tasks, you can use `partial` from the `paco`
library.
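Together, `done` and `results` let a graph pick up where a previous run left
off. A minimal sketch, adapted from the tests:

```.py
import taskmap

def a(): return 5
def b(x): return x + 10

funcs = {'a': a, 'b': b}
dependencies = {'b': ['a'], 'a': []}

# 'a' is already done; its cached result is fed to 'b' instead of rerunning it
graph = taskmap.create_graph(funcs, dependencies, done=['a'], results={'a': 5})
graph = taskmap.run(graph)

assert graph.results['b'] == 15
```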
#### taskmap.run_parallel_async(graph, nprocs=None, sleep=0.2, raise_errors=False)

Runs the graph asynchronously across multiple cores. All tasks must be python
coroutines. This can be used when tasks are bottlenecked by both io and cpu.

`sleep` determines how long each process waits between checks to see if a new
task has become available.

`nprocs` is how many processes are used in parallel. Defaults to half of the
available cores.

All of the `run*` functions also accept a `raise_errors` flag. When it is true,
a task failure raises an exception in the caller instead of only being recorded
in the graph's results.

#### taskmap.run_async(graph, sleep=0.2, raise_errors=False)

Runs all coroutines on a single core. This can be used if all tasks are
bottlenecked by io.

#### taskmap.run_parallel(graph, nprocs=None, sleep=0.2, raise_errors=False)

The tasks must be normal python functions; they are run in parallel but not
asynchronously. This can be used if all tasks are cpu bottlenecked. `nprocs`
defaults to one less than the number of available cores.

#### taskmap.run(graph, raise_errors=False)

All tasks must be normal python functions and are run synchronously in a single
process.

### Handling Failed Tasks

#### taskmap.reset_failed_tasks(graph)

taskmap marks tasks that throw an exception as 'failed'. This function allows
you to rebuild a graph to only run the tasks that have failed and their
children. A common pattern is:

```.py
result_graph = taskmap.run_parallel_async(graph)
# failures abound

new_graph = taskmap.reset_failed_tasks(result_graph)

# make a fix (e.g. make sure DB is available)
new_result_graph = taskmap.run_parallel_async(new_graph)
```

#### taskmap.reset_tasks(graph, tasks)

This function allows you to rebuild a graph to only run a subset of tasks, and
their children. This is useful if you change some of the tasks in the `funcs`
dict and want to rerun those tasks and the tasks that depend on their outcomes.
This can be because there was a bug in the task, or simply because you want to
alter the behavior.

```.py
result_graph = taskmap.run_parallel_async(graph)

# change the function corresponding to some task name
result_graph.funcs['some_func'] = new_task

new_graph = taskmap.reset_tasks(result_graph, ['some_func'])
new_result_graph = taskmap.run_parallel_async(new_graph)
```

### Manipulating the Graph

#### taskmap.mark_as_done(graph, tasks)

Marks the given tasks as done, so that subsequent `run*` calls will skip them.

#### taskmap.mark_as_done_except(graph, tasks)

Marks every task other than the given ones as done, so that only the given
tasks will be run.

### Parsing Logs

#### taskmap.in\_progress(path\_to\_log)

Parses a taskmap log file and returns a dictionary with two sets of task names:
`'in_progress'` (started but not finished) and `'queued'` (queued but not yet
started).
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(
    name='taskmap',
    version='0.0.7',
    description='Dependency graph with parallel asynchronous task runner',
    url='https://github.com/n-s-f/taskmap',
    packages=find_packages(exclude=['contrib', 'docs', 'tests']),
    install_requires=['multiprocess', 'multiprocessing-logging'],
)
--------------------------------------------------------------------------------
/taskmap/__init__.py:
--------------------------------------------------------------------------------
from .taskmap import (run_task, run_task_async, run, run_parallel, run_async,
                      run_parallel_async)

from .tgraph import (create_graph, get_ready_tasks, mark_as_done,
                     mark_as_done_except, get_all_children, reset_tasks,
                     reset_failed_tasks)

from .logparse import in_progress
--------------------------------------------------------------------------------
/taskmap/logparse.py:
--------------------------------------------------------------------------------
def in_progress(path_to_log):
    with open(path_to_log, 'r') as f:
        log = f.readlines()

    queued = []
    started = []
    finished = []
    for line in log:
        words = line.strip().split(' ')

        if 'starting' in words:
            started.append(words[-1])
        elif 'finished' in words:
            finished.append(words[-1])
        elif 'queueing' in words:
            queued.append(words[-1])

    return {
        'in_progress': set(started) - set(finished),
        'queued': set(queued) - set(started),
    }
--------------------------------------------------------------------------------
/taskmap/taskmap.py:
--------------------------------------------------------------------------------
from taskmap import tgraph

import os
import time
import asyncio
import logging
import traceback
import multiprocess as mp


def log(graph):
    return logging.getLogger('{}-worker'.format(graph.name))


def mlog(graph):
    return logging.getLogger('{}-manager'.format(graph.name))


def run_task(graph, task, raise_errors=False):
    graph = tgraph.mark_as_in_progress(graph, task)
    args = get_task_args(graph, task)
    log(graph).info('pid {}: starting task {}'.format(os.getpid(),
task)) 23 | 24 | try: 25 | result = graph.funcs[task](*args) 26 | return task_success(graph, task, result) 27 | 28 | except Exception as error: 29 | graph = task_error(graph, task, error) 30 | if raise_errors: 31 | raise 32 | return graph 33 | 34 | 35 | async def run_task_async(graph, task, raise_errors=False): 36 | graph = tgraph.mark_as_in_progress(graph, task) 37 | args = get_task_args(graph, task) 38 | log(graph).info('pid {}: starting task {}'.format(os.getpid(), task)) 39 | 40 | try: 41 | result = await asyncio.coroutine(graph.funcs[task])(*args) 42 | return task_success(graph, task, result) 43 | 44 | except Exception as error: 45 | graph = task_error(graph, task, error) 46 | if raise_errors: 47 | raise 48 | return graph 49 | 50 | 51 | def task_success(graph, task, result): 52 | log(graph).info('pid {}: finished task {}'.format(os.getpid(), task)) 53 | graph.results[task] = result 54 | return tgraph.mark_as_done(graph, task) 55 | 56 | 57 | def task_error(graph, task, error): 58 | tb = traceback.format_exc() 59 | msg = 'pid {}: failed task {}: stack {}'.format(os.getpid(), task, tb) 60 | log(graph).exception(msg, {'exc_info': error}) 61 | graph.results[task] = error 62 | graph = tgraph.mark_as_done(graph, task) 63 | return mark_children_as_incomplete(graph, task) 64 | 65 | 66 | def run(graph, raise_errors=False): 67 | while not tgraph.all_done(graph): 68 | ready = tgraph.get_ready_tasks(graph) 69 | for task in ready: 70 | log(graph).info('pid {}: claiming task {}'.format(os.getpid(), task)) 71 | graph = run_task(graph, task, raise_errors) 72 | return graph 73 | 74 | 75 | def run_parallel(graph, nprocs=None, sleep=0.2, raise_errors=False): 76 | nprocs = nprocs or mp.cpu_count() - 1 77 | with mp.Manager() as manager: 78 | graph = tgraph.create_parallel_compatible_graph(graph, manager) 79 | with mp.Pool(nprocs) as pool: 80 | 81 | exception_q = mp.Queue(10) 82 | 83 | def error_callback(exception): 84 | exception_q.put_nowait(exception) 85 | pool.terminate() 86 | 87 | while not tgraph.all_done(graph): 88 | for task in tgraph.get_ready_tasks(graph, reverse=False): 89 | graph = tgraph.mark_as_in_progress(graph, task) 90 | mlog(graph).info( 91 | 'pid {}: assigning task {}'.format(os.getpid(), task)) 92 | pool.apply_async( 93 | run_task, args=(graph, task, raise_errors), 94 | error_callback=error_callback 95 | ) 96 | time.sleep(sleep) 97 | 98 | if not exception_q.empty(): 99 | raise exception_q.get() 100 | 101 | return tgraph.recover_values_from_manager(graph) 102 | 103 | 104 | def exception_handler(loop, context): 105 | # workaround for the fact that asyncio will not let you stop on exceptions 106 | # for tasks added to the loop after it has already started running 107 | loop.stop() 108 | 109 | 110 | def run_async(graph, sleep=0.2, coro=None, raise_errors=False): 111 | ioq = asyncio.Queue(len(graph.funcs.keys())) 112 | cpuq = asyncio.Queue(len(graph.funcs.keys())) 113 | loop = asyncio.new_event_loop() 114 | loop.set_exception_handler(exception_handler) 115 | coros = asyncio.gather( 116 | queue_loader(graph, ioq, cpuq, sleep), 117 | scheduler(graph, sleep, ioq, cpuq, loop, raise_errors), 118 | loop=loop) 119 | 120 | try: 121 | loop.run_until_complete(coros) 122 | except Exception as error: 123 | raise RuntimeError('An async task has failed. 
Please check your logs') 124 | finally: 125 | loop.close() 126 | 127 | return graph 128 | 129 | 130 | def run_parallel_async(graph, nprocs=None, sleep=0.2, raise_errors=False): 131 | if nprocs == 1: 132 | return run_async(graph, sleep=sleep, raise_errors=raise_errors) 133 | 134 | nprocs = nprocs or mp.cpu_count() // 2 135 | 136 | with mp.Manager() as manager: 137 | graph = tgraph.create_parallel_compatible_graph(graph, manager) 138 | 139 | ioq = mp.Queue(len(graph.funcs.keys())) 140 | cpuq = mp.Queue(len(graph.funcs.keys())) 141 | 142 | procs = [mp.Process(target=run_scheduler, 143 | args=(graph, sleep, ioq, cpuq, raise_errors)) 144 | for _ in range(nprocs)] 145 | for proc in procs: 146 | proc.start() 147 | 148 | while not tgraph.all_done(graph): 149 | for task in tgraph.get_ready_tasks(graph): 150 | graph = tgraph.mark_as_in_progress(graph, task) 151 | mlog(graph).info( 152 | 'pid {}: queueing task {}'.format(os.getpid(), task)) 153 | if task in graph.io_bound: 154 | ioq.put(task) 155 | else: 156 | cpuq.put(task) 157 | 158 | time.sleep(sleep) 159 | 160 | if raise_errors and sum(not p.exitcode for p in procs): 161 | raise RuntimeError('An async task has failed. Please check your logs') 162 | 163 | return tgraph.recover_values_from_manager(graph) 164 | 165 | 166 | def run_scheduler(graph, sleep, ioq, cpuq, raise_errors=False): 167 | loop = asyncio.new_event_loop() 168 | loop.set_exception_handler(exception_handler) 169 | try: 170 | loop.run_until_complete( 171 | scheduler(graph, sleep, ioq, cpuq, loop, raise_errors)) 172 | except Exception as error: 173 | raise RuntimeError('An async task has failed. Please check your logs') 174 | finally: 175 | loop.close() 176 | 177 | 178 | # TODO: scheduler can be improved 179 | async def scheduler(graph, sleep, ioq, cpuq, loop, raise_errors): 180 | while not tgraph.all_done(graph): 181 | try: 182 | task = ioq.get_nowait() 183 | log(graph).info( 184 | 'pid {}: dequeueing task {}'.format(os.getpid(), task)) 185 | asyncio.ensure_future( 186 | run_task_async(graph, task, raise_errors), loop=loop) 187 | except Exception: 188 | try: 189 | task = cpuq.get_nowait() 190 | log(graph).info( 191 | 'pid {}: dequeueing task {}'.format(os.getpid(), task)) 192 | asyncio.ensure_future( 193 | run_task_async(graph, task, raise_errors), loop=loop) 194 | # don't put two cpu intensive tasks on the same core without waiting 195 | await asyncio.sleep(sleep) 196 | except Exception: 197 | await asyncio.sleep(sleep) 198 | 199 | 200 | async def queue_loader(graph, ioq, cpuq, sleep): 201 | while not tgraph.all_done(graph): 202 | for task in tgraph.get_ready_tasks(graph): 203 | graph = tgraph.mark_as_in_progress(graph, task) 204 | log(graph).info( 205 | 'pid {}: queueing task {}'.format(os.getpid(), task)) 206 | 207 | if task in graph.io_bound: 208 | await ioq.put(task) 209 | else: 210 | await cpuq.put(task) 211 | 212 | await asyncio.sleep(sleep) 213 | 214 | 215 | def mark_children_as_incomplete(graph, task): 216 | children = tgraph.get_all_children(graph, task) 217 | 218 | if not children: 219 | return graph 220 | 221 | log(graph).info('pid {}: marking children {} of failed task {}'.format( 222 | os.getpid(), children, task)) 223 | 224 | msg = 'Ancestor task {} failed; task not run'.format(task) 225 | for child in children: 226 | graph.results[child] = msg 227 | tgraph.mark_as_done(graph, child) 228 | return graph 229 | 230 | 231 | def get_task_args(graph, task): 232 | return [ 233 | graph.results.get(dep) for dep in graph.dependencies[task] 234 | if graph.results.get(dep) is not 
None 235 | ] 236 | -------------------------------------------------------------------------------- /taskmap/taskmap_test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | import taskmap 4 | import pytest 5 | import time 6 | import os 7 | 8 | # disable logging during tests 9 | logging.disable(logging.CRITICAL) 10 | 11 | 12 | def a(): 13 | return 5 14 | 15 | 16 | def b(x): 17 | return x + 10 18 | 19 | 20 | def c(x, y): 21 | return x + y + 20 22 | 23 | 24 | def test_graph_ready(): 25 | # given 26 | dependencies = { 27 | 'a': {'b', 'c'}, 28 | 'b': {'c'}, 29 | 'c': set(), 30 | } 31 | 32 | funcs = { 33 | 'a': a, 34 | 'b': b, 35 | 'c': c, 36 | } 37 | 38 | graph = taskmap.create_graph(funcs, dependencies) 39 | 40 | # when 41 | results = taskmap.get_ready_tasks(graph) 42 | 43 | # then 44 | assert results == ['c'] 45 | 46 | 47 | def test_graph_ordered_ready(): 48 | # given 49 | dependencies = {'a': set(), 'b': set()} 50 | funcs = {'a': a, 'b': b} 51 | io_bound = ['a'] 52 | graph = taskmap.create_graph(funcs, dependencies, io_bound=io_bound) 53 | 54 | # when 55 | results = taskmap.get_ready_tasks(graph) 56 | 57 | # then 58 | assert results == ['a', 'b'] 59 | 60 | 61 | def test_tasks_can_be_marked_done(): 62 | # given 63 | funcs = {'a': a, 'b': b} 64 | dependencies = {'a': ['b'], 'b': []} 65 | 66 | # when 67 | graph = taskmap.create_graph(funcs, dependencies, done=['b']) 68 | 69 | # then 70 | assert taskmap.get_ready_tasks(graph) == ['a'] 71 | 72 | 73 | def test_cached_results_are_used(): 74 | # given 75 | funcs = {'a': a, 'b': b} 76 | dependencies = {'b': ['a'], 'a': []} 77 | results = {'a': 5} 78 | 79 | graph = taskmap.create_graph( 80 | funcs, dependencies, done=['a'], results=results) 81 | 82 | # when 83 | graph = taskmap.run(graph) 84 | 85 | # then 86 | assert graph.results['b'] == 15 87 | 88 | 89 | def test_graph_ready_after_task_completed(): 90 | # given 91 | dependencies = { 92 | 'a': {'b', 'c'}, 93 | 'b': {'c'}, 94 | 'c': set(), 95 | } 96 | 97 | funcs = { 98 | 'a': a, 99 | 'b': b, 100 | 'c': c, 101 | } 102 | 103 | graph = taskmap.create_graph(funcs, dependencies) 104 | ready = taskmap.get_ready_tasks(graph) 105 | 106 | # when 107 | for func in ready: 108 | taskmap.mark_as_done(graph, func) 109 | 110 | results = taskmap.get_ready_tasks(graph) 111 | 112 | # then 113 | assert results == ['b'] 114 | 115 | 116 | def test_mark_as_done_except(): 117 | # given 118 | dependencies = { 119 | 'a': {'b', 'c'}, 120 | 'b': {'c'}, 121 | 'c': set(), 122 | } 123 | 124 | funcs = { 125 | 'a': a, 126 | 'b': b, 127 | 'c': c, 128 | } 129 | 130 | graph = taskmap.create_graph(funcs, dependencies) 131 | graph = taskmap.mark_as_done_except(graph, ['c']) 132 | 133 | results = taskmap.get_ready_tasks(graph) 134 | 135 | # then 136 | assert results == ['c'] 137 | 138 | 139 | def test_cyclic_dependency(): 140 | # given 141 | dependencies = { 142 | 'a': {'b'}, 143 | 'b': {'c'}, 144 | 'c': {'a'}, 145 | } 146 | 147 | funcs = { 148 | 'a': a, 149 | 'b': b, 150 | 'c': c, 151 | } 152 | 153 | # then 154 | with pytest.raises(ValueError): 155 | 156 | # when 157 | taskmap.create_graph(funcs, dependencies) 158 | 159 | 160 | def test_absent_tasks(): 161 | # given 162 | dependencies = { 163 | 'a': {'b', 'c'}, 164 | } 165 | 166 | funcs = { 167 | 'a': a, 168 | 'b': b, 169 | 'c': c, 170 | } 171 | 172 | # then 173 | with pytest.raises(ValueError): 174 | 175 | # when 176 | taskmap.create_graph(funcs, dependencies) 177 | 178 | 179 | def test_all_names_are_funcs(): 
180 | # given 181 | dependencies = {'d': ['a'], 'a': []} 182 | 183 | funcs = {'a': a, 'b': b, 'c': c} 184 | 185 | # then 186 | with pytest.raises(ValueError): 187 | 188 | # when 189 | taskmap.create_graph(funcs, dependencies) 190 | 191 | 192 | def test_logging_no_write(): 193 | # given 194 | dependencies = {'a': []} 195 | funcs = {'a': a} 196 | logging_config = {'write': False} 197 | 198 | # when 199 | taskmap.create_graph(funcs, dependencies, name='name', logging_config=logging_config) 200 | 201 | 202 | def test_logging_filename_change(): 203 | # given 204 | dependencies = {'a': []} 205 | funcs = {'a': a} 206 | name = 'test-taskmap-name' 207 | graph = taskmap.create_graph(funcs, dependencies, name=name, 208 | logging_config={'write': True}) 209 | 210 | # when 211 | graph = taskmap.run(graph) 212 | 213 | # then 214 | assert any(name in f for f in os.listdir('./')) 215 | 216 | 217 | def test_default_logging_severity_level(): 218 | dependencies = {'a': []} 219 | funcs = {'a': a} 220 | name = 'test-taskmap-default-level' 221 | taskmap.create_graph(funcs, dependencies, name=name) 222 | 223 | manager_logger_name = '{}-manager'.format(name) 224 | worker_logger_name = '{}-worker'.format(name) 225 | 226 | assert logging.getLogger(manager_logger_name).level == logging.DEBUG 227 | assert logging.getLogger(worker_logger_name).level == logging.DEBUG 228 | 229 | 230 | def test_explicit_logging_severity_level(): 231 | dependencies = {'a': []} 232 | funcs = {'a': a} 233 | name = 'test-taskmap-explicit-level' 234 | taskmap.create_graph(funcs, dependencies, name=name, 235 | logging_config={'level': logging.ERROR}) 236 | 237 | manager_logger_name = '{}-manager'.format(name) 238 | worker_logger_name = '{}-worker'.format(name) 239 | 240 | assert logging.getLogger(manager_logger_name).level == logging.ERROR 241 | assert logging.getLogger(worker_logger_name).level == logging.ERROR 242 | 243 | 244 | def test_run_pass_args(): 245 | # given 246 | dependencies = { 247 | 'c': ['a', 'b'], 248 | 'b': ['a'], 249 | 'a': [], 250 | } 251 | 252 | funcs = { 253 | 'a': a, 254 | 'b': b, 255 | 'c': c, 256 | } 257 | 258 | graph = taskmap.create_graph(funcs, dependencies) 259 | 260 | # when 261 | graph = taskmap.run(graph) 262 | 263 | # then 264 | assert graph.results == {'a': 5, 'b': 15, 'c': 40} 265 | 266 | 267 | error = RuntimeError('some error') 268 | 269 | 270 | def d(): 271 | raise error 272 | 273 | 274 | def test_sync_error_handling(): 275 | # given 276 | dependencies = { 277 | 'c': ['d'], 278 | 'd': [], 279 | } 280 | 281 | funcs = { 282 | 'd': d, 283 | 'c': c, 284 | } 285 | 286 | # when 287 | graph = taskmap.create_graph(funcs, dependencies) 288 | 289 | result = taskmap.run(graph) 290 | result_parallel = taskmap.run_parallel(graph, nprocs=2, sleep=.001) 291 | 292 | # then 293 | expected = { 294 | 'd': error, 295 | 'c': 'Ancestor task d failed; task not run', 296 | } 297 | assert result.results['c'] == expected['c'] 298 | assert result.results['d'].__class__ == expected['d'].__class__ 299 | assert result.results['d'].args == expected['d'].args 300 | 301 | assert result_parallel.results['c'] == expected['c'] 302 | assert result_parallel.results['d'].__class__ == expected['d'].__class__ 303 | assert result_parallel.results['d'].args == expected['d'].args 304 | 305 | 306 | def test_sync_error_raise_errors(): 307 | # given 308 | dependencies = {'c': ['d'], 'd': []} 309 | funcs = {'d': d, 'c': c} 310 | 311 | # when 312 | graph = taskmap.create_graph(funcs, dependencies) 313 | with pytest.raises(RuntimeError, match='some 
error'):
        taskmap.run(graph, raise_errors=True)

    graph = taskmap.create_graph(funcs, dependencies)
    with pytest.raises(RuntimeError, match='some error'):
        taskmap.run_parallel(graph, raise_errors=True)


async def control():
    return 5


async def e():
    raise error


async def g(er):
    return er


def test_async_error_handling():
    # given
    dependencies = {
        'g': ['e'],
        'e': [],
        'control': [],
    }

    funcs = {
        'e': e,
        'g': g,
        'control': control,
    }

    # when
    graph = taskmap.create_graph(funcs.copy(), dependencies.copy())
    graph = taskmap.run_async(graph, sleep=.001)

    graph_parallel = taskmap.create_graph(funcs.copy(), dependencies.copy())
    graph_parallel = taskmap.run_parallel_async(graph_parallel, nprocs=2, sleep=.001)

    # then
    expected = {
        'e': error,
        'control': 5,
        'g': 'Ancestor task e failed; task not run',
    }

    assert graph.results['g'] == expected['g']
    assert graph.results['e'].__class__ == expected['e'].__class__
    assert graph.results['e'].args == expected['e'].args
    assert graph.results['control'] == 5

    assert graph_parallel.results['g'] == expected['g']
    assert graph_parallel.results['e'].__class__ == expected['e'].__class__
    assert graph_parallel.results['e'].args == expected['e'].args
    assert graph_parallel.results['control'] == 5


def test_async_error_raise_errors():
    # given
    funcs = {'e': e, 'control': control}
    dependencies = {'e': [], 'control': []}

    # when
    graph = taskmap.create_graph(funcs, dependencies)
    with pytest.raises(RuntimeError, match='check your logs'):
        taskmap.run_async(graph, raise_errors=True)

    graph = taskmap.create_graph(funcs, dependencies)
    with pytest.raises(RuntimeError, match='check your logs'):
        taskmap.run_parallel_async(graph, raise_errors=True)


def test_rebuilding_graph_from_failure():
    # given
    dependencies = {
        'c': ['e'],
        'e': [],
        'w': [],
    }

    funcs = {
        'e': e,
        'c': c,
        'w': w,
    }

    graph = taskmap.create_graph(funcs.copy(), dependencies.copy())
    graph = taskmap.run_parallel_async(graph, nprocs=2, sleep=.001)

    # when
    new_graph = taskmap.reset_failed_tasks(graph)

    # then
    assert new_graph.done == ['w']


def test_get_all_children():
    # given
    dependencies = {
        'd': ['a'],
        'c': ['b'],
        'b': ['a'],
        'a': [],
    }

    funcs = {
        'a': a,
        'b': b,
        'c': c,
        'd': d,
    }

    graph = taskmap.create_graph(funcs, dependencies)

    # when
    a_children = taskmap.get_all_children(graph, 'a')
    b_children = taskmap.get_all_children(graph, 'b')
    c_children = taskmap.get_all_children(graph, 'c')

    # then
    assert a_children == {'b', 'c', 'd'}
    assert b_children == {'c'}
    assert c_children == set()


def long_task():
    time.sleep(.02)
    return 5


def test_run_parallel():
    # given
    dependencies = {
        'c': ['long_task', 'b'],
        'b': ['long_task'],
        'long_task': [],
    }

    funcs = {
        'long_task': long_task,
        'b': b,
        'c': c,
    }

    graph = taskmap.create_graph(funcs, dependencies)
    # when
    graph = taskmap.run_parallel(graph, nprocs=2, sleep=.001)

    # then
    assert graph.results == {'long_task': 5, 'b': 15, 'c': 40}


async def ab(x):
    return x + 10


async def ac(x, y):
    return x + y + 20


async def along_task():
    await asyncio.sleep(.02)
    return 5


def test_run_async():
    # given
    dependencies = {
        'ac': ['along_task', 'ab'],
        'ab': ['along_task'],
        'along_task': [],
    }

    funcs = {
        'along_task': along_task,
        'ab': ab,
        'ac': ac,
    }

    graph = taskmap.create_graph(funcs, dependencies)

    # when
    graph = taskmap.run_async(graph, sleep=0.001)

    # then
    assert graph.results == {'along_task': 5, 'ab': 15, 'ac': 40}


def test_run_parallel_async():
    # given
    dependencies = {
        'ac': ['along_task', 'ab'],
        'ab': ['along_task'],
        'along_task': [],
    }

    funcs = {
        'along_task': along_task,
        'ab': ab,
        'ac': ac,
    }

    graph = taskmap.create_graph(funcs, dependencies)

    # when
    graph = taskmap.run_parallel_async(graph, nprocs=2, sleep=.001)

    # then
    assert graph.results == {'along_task': 5, 'ab': 15, 'ac': 40}


async def x():
    await asyncio.sleep(.4)
    return 5


async def y():
    await asyncio.sleep(.4)
    return 5


def test_async_speed():
    # given
    funcs = {'x': x, 'y': y}
    dependencies = {'x': [], 'y': []}
    graph = taskmap.create_graph(funcs, dependencies)

    # when
    start = time.time()
    taskmap.run_async(graph, sleep=0.001)
    end = time.time()

    # then
    assert end - start < .8


def v():
    time.sleep(.4)
    return 5


def u():
    time.sleep(.4)
    return 5


def test_parallel_speed():
    # given
    funcs = {'x': u, 'y': v}
    dependencies = {'x': [], 'y': []}
    graph = taskmap.create_graph(funcs, dependencies)

    # when
    start = time.time()
    taskmap.run_parallel(graph, nprocs=2, sleep=.001)
    end = time.time()

    # then
    assert end - start < .8


async def r():
    await asyncio.sleep(.4)


async def t():
    await asyncio.sleep(.4)


async def w():
    time.sleep(.4)


async def p():
    time.sleep(.4)


def test_async_parallel_speed():
    # given
    funcs = {'r': r, 't': t, 'w': w, 'p': p}
    dependencies = {'r': [], 't': [], 'w': [], 'p': []}
    graph = taskmap.create_graph(funcs, dependencies, io_bound=['r', 't'])

    # when
    start = time.time()
    taskmap.run_parallel_async(graph, nprocs=2, sleep=.0001)
    end = time.time()

    # then
    assert end - start < .8


async def io_bound_a(): await asyncio.sleep(.4); return 'io_a'
async def io_bound_b(x): await asyncio.sleep(.4); return x + ' io_b'
async def cpu_bound_a(x): time.sleep(.4); return x + ' cpu_a'
async def cpu_bound_b(): time.sleep(.4); return 'cpu_b'


def test_async_parallel_demo():
    # given
    funcs = {
        'io_bound_a': io_bound_a,
        'io_bound_b': io_bound_b,
        'cpu_bound_a': cpu_bound_a,
        'cpu_bound_b': cpu_bound_b,
    }

    dependencies = {
        'io_bound_a': [],
        'io_bound_b': ['cpu_bound_b'],
        'cpu_bound_a': ['io_bound_a'],
        'cpu_bound_b': [],
    }

    io_bound = ['io_bound_a', 'io_bound_b']
    graph = taskmap.create_graph(funcs, dependencies, io_bound=io_bound)

    # when
    start = time.time()
    graph = taskmap.run_parallel_async(graph, nprocs=2, sleep=.001)
    end = time.time()

    # then
    assert end - start < 1.2
    assert graph.results['io_bound_a'] == 'io_a'
    assert graph.results['io_bound_b'] == 'cpu_b io_b'
    assert graph.results['cpu_bound_a'] == 'io_a cpu_a'
    assert graph.results['cpu_bound_b'] == 'cpu_b'
--------------------------------------------------------------------------------
/taskmap/tgraph.py:
--------------------------------------------------------------------------------
import logging
import datetime as dt
import multiprocessing_logging as mplogging

from itertools import chain
from operator import contains
from functools import partial
from collections import namedtuple

Graph = namedtuple('graph', [
    'funcs', 'dependencies', 'done', 'results', 'in_progress', 'lock',
    'io_bound', 'name'
])


def reset_failed_tasks(graph):
    """
    create a new graph based on the outcomes of a previous run.
    if there were errors, only the failed tasks and their children will
    be marked as not done, so only they will be rerun. otherwise the new
    graph will have nothing left to run
    """
    failed_tasks = set([
        task for task, res in graph.results.items()
        if isinstance(res, Exception)
    ])

    return reset_tasks(graph, failed_tasks)


def reset_tasks(graph, tasks):
    children = set(chain(*[get_all_children(graph, task) for task in tasks]))
    rerun = children | set(tasks)

    for task in rerun:
        if task in graph.done:
            graph.results[task] = None
            graph.done.remove(task)

    return graph


def create_graph(funcs, dependencies, io_bound=None, done=None, results=None,
                 name='taskmap', logging_config=None):
    """
    logging_config is expected to be a dictionary. the keys can be 'name',
    which names the loggers to be used, 'write', which specifies whether
    the log is written to disk, and 'level', which sets the logging
    severity. Note if two graphs with the same name are
    created, only the logging config from the first will be used.
49 | """ 50 | defaults = {'name': name, 'write': False} 51 | setup_loggers({**defaults, **(logging_config or {})}) 52 | 53 | dependencies = {task: list(deps) for task, deps in dependencies.items()} 54 | io_bound = io_bound or [] 55 | done = done or [] 56 | results = results or {} 57 | 58 | check_all_tasks_present(dependencies) 59 | check_cyclic_dependency(dependencies) 60 | check_all_keys_are_funcs(funcs, dependencies) 61 | 62 | return Graph( 63 | funcs=funcs, 64 | dependencies=dependencies, 65 | in_progress=[], 66 | done=list(done), 67 | results=results, 68 | lock=0, 69 | io_bound=io_bound, 70 | name=name 71 | ) 72 | 73 | 74 | def check_cyclic_dependency(dependencies): 75 | ancestry = dict() 76 | 77 | for task, parents in dependencies.items(): 78 | already_seen = set() 79 | ancestry[task] = set() 80 | 81 | while parents: 82 | if task in parents: 83 | raise ValueError('Cyclic dependency: task %s' % task) 84 | 85 | already_seen.update(parents) 86 | ancestry[task].update(parents) 87 | 88 | new_parents = set() 89 | for parent in parents: 90 | new_parents.update(ancestry.get(parent, dependencies[parent])) 91 | 92 | parents = new_parents - already_seen 93 | 94 | 95 | def check_all_tasks_present(deps): 96 | absent_tasks = set(chain(*deps.values())) - set(deps.keys()) 97 | 98 | if absent_tasks: 99 | msg = ' '.join([ 100 | 'Tasks {} are depended upon, but are not present as', 101 | 'keys in dependencies dictionary.' 102 | ]) 103 | raise ValueError(msg.format(absent_tasks)) 104 | 105 | 106 | def check_all_keys_are_funcs(funcs, dependencies): 107 | vacuous_names = set(dependencies.keys()) - set(funcs.keys()) 108 | if vacuous_names: 109 | msg = ' '.join([ 110 | 'Tasks {} are listed in the dependencies dict, but do', 111 | 'not correspond to functions in the funcs dict.' 
112 | ]) 113 | raise ValueError(msg.format(vacuous_names)) 114 | 115 | 116 | def get_all_children(graph, task): 117 | all_children = set() 118 | new_children = {k for k, v in graph.dependencies.items() if task in v} 119 | while new_children: 120 | all_children.update(new_children) 121 | new_children = { 122 | k 123 | for child in new_children for k, v in graph.dependencies.items() 124 | if child in v 125 | } 126 | new_children = new_children - all_children 127 | 128 | return all_children 129 | 130 | 131 | def get_ready_tasks(graph, reverse=True): 132 | done = set(graph.done) or set() 133 | in_progress = graph.in_progress or set() 134 | ready = set() 135 | for task, deps in graph.dependencies.items(): 136 | if not set(deps) - done: 137 | ready.add(task) 138 | ready = list(ready - done - set(in_progress)) 139 | key = partial(contains, graph.io_bound) 140 | return sorted(ready, key=key, reverse=reverse) 141 | 142 | 143 | def mark_as_done_except(graph, task): 144 | if type(task) == str: 145 | task = [task] 146 | 147 | all_tasks = graph.dependencies.keys() 148 | 149 | for t in set(all_tasks) - set(graph.done) - set(task): 150 | graph.done.append(t) 151 | 152 | return graph 153 | 154 | 155 | def mark_as_done(graph, task): 156 | if type(task) == str: 157 | task = [task] 158 | 159 | for t in set(task) - set(graph.done): 160 | graph.done.append(t) 161 | 162 | return graph 163 | 164 | 165 | def mark_as_in_progress(graph, task): 166 | graph.in_progress.append(task) 167 | return graph 168 | 169 | 170 | def all_done(graph): 171 | return set(graph.done) == set(graph.dependencies.keys()) 172 | 173 | 174 | def setup_loggers(config): 175 | name = config.get('name', 'taskmap') 176 | level = config.get('level', logging.DEBUG) 177 | 178 | if logging.getLogger('{}-manager'.format(name)).handlers: 179 | # we've already configured these loggers 180 | return 181 | 182 | mlogger = logging.getLogger('{}-manager'.format(name)) 183 | mlogger.setLevel(level) 184 | 185 | logger = logging.getLogger('{}-worker'.format(name)) 186 | logger.setLevel(level) 187 | 188 | formatter = logging.Formatter( 189 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 190 | 191 | ch = logging.StreamHandler() 192 | ch.setFormatter(formatter) 193 | ch.setLevel(level) 194 | logger.addHandler(ch) 195 | mlogger.addHandler(ch) 196 | 197 | if config.get('write', True): 198 | now = dt.datetime.now() 199 | logname_frmt = '{}{}.log'.format(name, now.strftime('%m-%d-%Y:%H.%M.%S')) 200 | fh = logging.FileHandler(logname_frmt) 201 | fh.setLevel(level) 202 | fh.setFormatter(formatter) 203 | logger.addHandler(fh) 204 | mlogger.addHandler(fh) 205 | 206 | mplogging.install_mp_handler(logger) 207 | 208 | 209 | def create_parallel_compatible_graph(graph, manager): 210 | return Graph( 211 | funcs=manager.dict(graph.funcs), 212 | dependencies=manager.dict(graph.dependencies), 213 | done=manager.list(graph.done), 214 | results=manager.dict(graph.results), 215 | in_progress=manager.list(), 216 | lock=manager.Value(int, 0), 217 | io_bound=manager.list(graph.io_bound), 218 | name=graph.name) 219 | 220 | 221 | def recover_values_from_manager(graph): 222 | return Graph( 223 | lock=0, 224 | in_progress=[], 225 | done=list(graph.done), 226 | funcs=dict(graph.funcs), 227 | results=dict(graph.results), 228 | io_bound=list(graph.io_bound), 229 | dependencies=dict(graph.dependencies), 230 | name=graph.name) 231 | --------------------------------------------------------------------------------