├── MANIFEST.in ├── taskgraph ├── __init__.py └── Task.py ├── requirements.txt ├── bitbucket-pipelines.yml ├── tox.ini ├── pyproject.toml ├── .github └── workflows │ └── pythonapp.yml ├── setup.py ├── LICENSE.txt ├── README.rst ├── HISTORY.rst └── tests └── test_task.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # This file defines extra files needed for the source distribution. 2 | # Setup.py requires these two files to exist, so we add them here. 3 | 4 | include README.rst HISTORY.rst LICENSE.txt 5 | -------------------------------------------------------------------------------- /taskgraph/__init__.py: -------------------------------------------------------------------------------- 1 | """TaskGraph init module.""" 2 | 3 | from .Task import TaskGraph 4 | from .Task import Task 5 | from .Task import _TASKGRAPH_DATABASE_FILENAME 6 | from .Task import __version__ 7 | 8 | __all__ = ['__version__', 'TaskGraph', 'Task', '_TASKGRAPH_DATABASE_FILENAME'] 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # requirements.txt 2 | # -------------------- 3 | # This file records the packages and requirements needed in order for 4 | # taskgraph to work as expected. 5 | 6 | retrying>=1.3.0 7 | importlib_metadata # technically only required on python < 3.8; easier to install with conda across all versions 8 | -------------------------------------------------------------------------------- /bitbucket-pipelines.yml: -------------------------------------------------------------------------------- 1 | pipelines: 2 | default: 3 | - parallel: 4 | - step: 5 | name: Tests on python3.6 6 | image: python:3.6-stretch 7 | caches: 8 | - pip 9 | script: 10 | - pip install tox 11 | - tox -e py36-base,py36-psutil 12 | - step: 13 | name: Tests on python3.7 14 | image: python:3.7-stretch 15 | caches: 16 | - pip 17 | script: 18 | - pip install tox 19 | - tox -e py37-base,py37-psutil 20 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = {py37,py38,py39,py310,py311}-{base,psutil} 3 | 4 | [gh-actions] 5 | # Allows us to use tox configuration to manage our tests, but still run on 6 | # github actions in the GHA matrix job matrix with GHA-managed python. 7 | # Requires tox-gh-actions package to run. 8 | python = 9 | 3.6: py36 10 | 3.7: py37 11 | 3.8: py38 12 | 3.9: py39 13 | 3.10: py310 14 | 3.11: py311 15 | 3.12: py312 16 | 17 | [testenv] 18 | commands = 19 | pytest --log-level=DEBUG \ 20 | --cov=taskgraph \ 21 | --cov-report=term \ 22 | --cov-report=xml \ 23 | --cov-report=html \ 24 | --junitxml={toxinidir}/testresults.xml {toxinidir}/tests 25 | changedir= 26 | {envtmpdir} 27 | 28 | # If tox-conda is installed (https://github.com/tox-dev/tox-conda), 29 | # use conda-forge python builds for the environments. 30 | conda_channels= 31 | conda-forge 32 | 33 | # Only install psutil to the environments where we're testing psutil. 
34 | # "psutil: psutil" is an example of tox's generative environment definition 35 | # and will match all environments containing the string "psutil" 36 | deps = 37 | setuptools_scm 38 | pytest 39 | pytest-cov 40 | rstcheck 41 | psutil: psutil 42 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "taskgraph" 3 | description = "Parallel task graph framework" 4 | readme = "README.rst" 5 | requires-python = ">=3.6" 6 | license = {file = "LICENSE.txt"} 7 | maintainers = [ 8 | {name = "Natural Capital Project Software Team"} 9 | ] 10 | keywords = ["parallel", "multiprocessing", "distributed", "computing"] 11 | classifiers = [ 12 | "Intended Audience :: Developers", 13 | "Topic :: System :: Distributed Computing", 14 | "Development Status :: 5 - Production/Stable", 15 | "Natural Language :: English", 16 | "Operating System :: MacOS :: MacOS X", 17 | "Operating System :: Microsoft", 18 | "Operating System :: POSIX", 19 | "Programming Language :: Python :: 3.8", 20 | "Programming Language :: Python :: 3.9", 21 | "Programming Language :: Python :: 3.10", 22 | "Programming Language :: Python :: 3.11", 23 | "Programming Language :: Python :: 3.12", 24 | "License :: OSI Approved :: BSD License" 25 | ] 26 | # the version is provided dynamically by setuptools_scm 27 | # `dependencies` and `optional-dependencies` are provided by setuptools 28 | # using the corresponding setup args `install_requires` and `extras_require` 29 | dynamic = ["version", "dependencies", "optional-dependencies"] 30 | 31 | [build-system] 32 | requires = [ 33 | 'wheel', 'setuptools_scm>=8.0' 34 | ] 35 | build-backend = "setuptools.build_meta" 36 | 37 | [tool.setuptools_scm] 38 | version_scheme = "post-release" 39 | local_scheme = "node-and-date" 40 | -------------------------------------------------------------------------------- /.github/workflows/pythonapp.yml: -------------------------------------------------------------------------------- 1 | name: Test TaskGraph 2 | on: 3 | push: 4 | branches: 5 | - "**" 6 | pull_request: 7 | branches: 8 | - "**" 9 | jobs: 10 | Test: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: [3.8, 3.9, "3.10", "3.11", "3.12"] 16 | os: [ubuntu-latest, windows-latest, macos-latest] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | with: 21 | # Fetch all history (it's a small repo) for scm-based versioning 22 | fetch-depth: 0 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | # See this comment about the importlib_metadata constraint: 33 | # https://github.com/python/importlib_metadata/issues/406#issuecomment-1264666048 34 | pip install tox tox-gh-actions flake8 "importlib_metadata<5" rstcheck 35 | 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings 41 | flake8 . 
--count --exit-zero --max-line-length=80 --statistics 42 | 43 | - name: Run tests 44 | run: | 45 | tox 46 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """taskgraph setup.py.""" 2 | from setuptools import setup 3 | 4 | _REQUIREMENTS = [ 5 | x for x in open('requirements.txt').read().split('\n') 6 | if not x.startswith('#') and len(x) > 0] 7 | 8 | LONG_DESCRIPTION = '%s\n\n%s' % ( 9 | open('README.rst').read(), 10 | open('HISTORY.rst').read()) 11 | 12 | setup( 13 | name='taskgraph', 14 | use_scm_version={'version_scheme': 'post-release', 15 | 'local_scheme': 'node-and-date'}, 16 | setup_requires=['setuptools_scm'], 17 | description='Parallel task graph framework.', 18 | long_description=LONG_DESCRIPTION, 19 | url='https://github.com/natcap/taskgraph', 20 | packages=['taskgraph'], 21 | license='BSD', 22 | keywords='parallel multiprocessing distributed computing', 23 | install_requires=_REQUIREMENTS, 24 | extras_require={ 25 | 'niced_processes': ['psutil'], 26 | }, 27 | classifiers=[ 28 | 'Intended Audience :: Developers', 29 | 'Topic :: System :: Distributed Computing', 30 | 'Development Status :: 5 - Production/Stable', 31 | 'Natural Language :: English', 32 | 'Operating System :: MacOS :: MacOS X', 33 | 'Operating System :: Microsoft', 34 | 'Operating System :: POSIX', 35 | 'Programming Language :: Python :: 3.8', 36 | 'Programming Language :: Python :: 3.9', 37 | 'Programming Language :: Python :: 3.10', 38 | 'Programming Language :: Python :: 3.11', 39 | 'Programming Language :: Python :: 3.12', 40 | 41 | 'License :: OSI Approved :: BSD License' 42 | ]) 43 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | In this license, "Natural Capital Project" is defined as the parties of 2 | Stanford University, The Nature Conservancy, World Wildlife Fund Inc., 3 | and University of Minnesota. 4 | 5 | This tool has an open license. All people are invited to use the tool 6 | under the following conditions and terms: 7 | 8 | Copyright (c) 2020, Natural Capital Project 9 | 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are 14 | met: 15 | 16 | * Redistributions of source code must retain the above copyright 17 | notice, this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the 22 | distribution. 23 | 24 | * Neither the name of Natural Capital Project nor the names of 25 | its contributors may be used to endorse or promote products derived 26 | from this software without specific prior written permission. 27 | 28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 29 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 30 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 31 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 32 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 33 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 34 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 35 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 36 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 37 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 38 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 39 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | About TaskGraph 3 | =============== 4 | 5 | ``TaskGraph`` is a library that was developed to help manage complicated 6 | computational software pipelines consisting of long running individual tasks. 7 | Many of these tasks could be executed in parallel, almost all of them wrote 8 | results to disk, and results could often be reused from earlier parts of the 9 | pipeline. TaskGraph manages all of this for you. With it you can schedule 10 | tasks with dependencies, avoid recomputing results that have already been 11 | computed, and allot multiple CPU cores to execute tasks in parallel if 12 | desired. 13 | 14 | TaskGraph Dependencies 15 | ---------------------- 16 | 17 | ``TaskGraph`` is written in pure Python, but if the ``psutil`` package is 18 | installed, the multiprocessing worker processes will be ``nice``\d. 19 | 20 | Example Use 21 | ----------- 22 | 23 | Install ``TaskGraph`` with 24 | 25 | ``pip install taskgraph`` 26 | 27 | Then 28 | 29 | .. code-block:: python 30 | 31 | import os 32 | import pickle 33 | import logging 34 | 35 | import taskgraph 36 | 37 | logging.basicConfig(level=logging.DEBUG) 38 | 39 | def _create_list_on_disk(value, length, target_path): 40 | """Create a list of `length` copies of `value` and pickle it to `target_path`.""" 41 | target_list = [value] * length 42 | pickle.dump(target_list, open(target_path, 'wb')) 43 | 44 | 45 | def _sum_lists_from_disk(list_a_path, list_b_path, target_path): 46 | """Read two lists, add them and save the result.""" 47 | list_a = pickle.load(open(list_a_path, 'rb')) 48 | list_b = pickle.load(open(list_b_path, 'rb')) 49 | target_list = [] 50 | for a, b in zip(list_a, list_b): 51 | target_list.append(a+b) 52 | pickle.dump(target_list, open(target_path, 'wb')) 53 | 54 | # create a taskgraph that uses 4 multiprocessing subprocesses when possible 55 | if __name__ == '__main__': 56 | workspace_dir = 'workspace' 57 | task_graph = taskgraph.TaskGraph(workspace_dir, 4) 58 | target_a_path = os.path.join(workspace_dir, 'a.dat') 59 | target_b_path = os.path.join(workspace_dir, 'b.dat') 60 | result_path = os.path.join(workspace_dir, 'result.dat') 61 | result_2_path = os.path.join(workspace_dir, 'result2.dat') 62 | value_a = 5 63 | value_b = 10 64 | list_len = 10 65 | task_a = task_graph.add_task( 66 | func=_create_list_on_disk, 67 | args=(value_a, list_len, target_a_path), 68 | target_path_list=[target_a_path]) 69 | task_b = task_graph.add_task( 70 | func=_create_list_on_disk, 71 | args=(value_b, list_len, target_b_path), 72 | target_path_list=[target_b_path]) 73 | sum_task = task_graph.add_task( 74 | func=_sum_lists_from_disk, 75 | args=(target_a_path, target_b_path, result_path), 76 | target_path_list=[result_path], 77 | dependent_task_list=[task_a, task_b]) 78 | 79 | task_graph.close() 80 | task_graph.join() 81 | # 
expect that result is a list `list_len` long with `value_a+value_b` in it 83 | result = pickle.load(open(result_path, 'rb')) 84 | 85 | 86 | Caveats 87 | ------- 88 | 89 | * Taskgraph's default method of checking whether a file has changed 90 | (``hash_algorithm='sizetimestamp'``) uses the filesystem's modification 91 | timestamp, interpreted in integer nanoseconds. This check is only as 92 | accurate as the filesystem's timestamp. For example: 93 | 94 | * FAT and FAT32 timestamps have a 2-second modification timestamp resolution 95 | * exFAT has a 10 millisecond timestamp resolution 96 | * NTFS has a 100 nanosecond timestamp resolution 97 | * HFS+ has a 1 second timestamp resolution 98 | * APFS has a 1 nanosecond timestamp resolution 99 | * ext3 has a 1 second timestamp resolution 100 | * ext4 has a 1 nanosecond timestamp resolution 101 | 102 | If you suspect timestamp resolution to be an issue on your filesystem, you 103 | may wish to store your files on a filesystem with more accurate timestamps or 104 | else consider using a different ``hash_algorithm``. 105 | 106 | 107 | Running Tests 108 | ------------- 109 | 110 | Taskgraph includes a ``tox`` configuration for automating builds across 111 | multiple python versions and whether ``psutil`` is installed. To execute all 112 | tests on all platforms, run: 113 | 114 | $ tox 115 | 116 | Alternatively, if you're only trying to run tests on a single configuration 117 | (say, python 3.7 without ``psutil``), you'd run:: 118 | 119 | $ tox -e py37 120 | 121 | Or if you'd like to run the tests for the combination of Python 3.7 with 122 | ``psutil``, you'd run:: 123 | 124 | $ tox -e py37-psutil 125 | 126 | If you don't have multiple python installations already available on your system, 127 | an easy way to accomplish this is to use ``tox-conda`` 128 | (https://github.com/tox-dev/tox-conda) which will use conda environments to manage 129 | the versions of python available:: 130 | 131 | $ pip install tox-conda 132 | $ tox 133 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | ========================= 4 | TaskGraph Release History 5 | ========================= 6 | 7 | .. 8 | Unreleased Changes 9 | ------------------ 10 | 11 | 0.11.2 (2025-05-21) 12 | ------------------- 13 | * Using ``importlib.metadata`` or ``importlib_metadata``, depending on the 14 | python version, to read the version from package metadata. This is in 15 | response to ``pkg_resources`` being deprecated. 16 | (`#100 `_) 17 | 18 | 0.11.1 (2023-10-27) 19 | ------------------- 20 | * Adding ``pyproject.toml`` for our build definitions. 21 | * Python 3.6 has reached end-of-life and is no longer maintained, so it has 22 | been removed from the automated tests. 23 | * Python 3.7 has reached end-of-life and is no longer maintained, so it has 24 | been removed from automated tests. 25 | * Python 3.11 has been released, so ``taskgraph`` is now tested against this 26 | new version of the language. 27 | * Python 3.12 has been released, so ``taskgraph`` is now tested against this 28 | new version of the language. 29 | 30 | 0.11.0 (2021-10-12) 31 | ------------------- 32 | * Testing against python 3.10 in github actions and officially noting support 33 | for 3.10 in ``setup.py``. 34 | * Testing against python 3.9 in github actions and noting support in 35 | ``setup.py``. 
36 | * Fixed an issue where an exception raised during execution would not be 37 | re-raised if the task completed before ``TaskGraph.join()`` was called. Now, 38 | if a task raises an exception, that exception will always be raised when 39 | either ``Task.join()`` or ``TaskGraph.join()`` is called. 40 | * Fixed an issue where tasks with ``hash_algorithm='sizetimestamp'`` would, 41 | under certain conditions, fail to re-execute when they should. This only 42 | occurred when a graph writing the same amount of, but possibly different, 43 | data was executed successively, with less than about 1.5 seconds between 44 | task executions. 45 | * After many years with the Natural Capital Project, Rich Sharp has stepped 46 | down from the Project and as the maintainer of ``taskgraph``. James 47 | Douglass is taking his place, and this change is now reflected in 48 | ``setup.py``. 49 | * Fixes an issue that causes an ``EOFError`` or ``BrokenPipeError`` to occur 50 | when the ``TaskGraph`` terminates. 51 | * Updated the ``taskgraph`` example in the README for the latest API changes 52 | and to clarify the need for ``if __name__ == '__main__':``. 53 | * Fixed an issue that could cause the ``TaskGraph`` object to hang if 54 | duplicate ``Task`` objects were created. 55 | * Fixed an issue that was causing TaskGraph to ignore a changed 56 | ``hash_algorithm`` if the TaskGraph was created on one run, was 57 | deconstructed, then restarted. If the user chose a different hash, TaskGraph 58 | would use the hash that the target file was originally hashed under rather 59 | than the new algorithm. 60 | * Removed ``copy_duplicate_artifact`` and ``hardlink_allowed`` parameters 61 | and functionality from TaskGraph. This is to address a design error: 62 | TaskGraph is not well suited for caching file results to avoid 63 | recomputation. Rather than add additional complexity around the limitations 64 | of this feature, it is being removed to guide a design toward a standalone 65 | cache library if needed. 66 | 67 | 0.10.3 (2021-01-29) 68 | ------------------- 69 | * Fixed issue that could cause combinatorial memory usage leading to poor 70 | runtime or ``MemoryError`` if a dictionary were passed that had thousands 71 | of elements. 72 | * Fixed issue that would cause ``TaskGraph`` to not recognize a directory 73 | that was meant to be ignored and in some cases cause ``Task`` to 74 | unnecessarily reexecute. 75 | 76 | 0.10.2 (2020-12-11) 77 | ------------------- 78 | * Fixed an issue that would raise an exception when `__del__` was 79 | deconstructing a taskgraph object and a thread ``join()`` would cause a 80 | deadlock. 81 | 82 | 0.10.1 (2020-12-11) 83 | ------------------- 84 | * Fixed an issue that would ignore the state of a ``transient_run`` flag if 85 | a previous Task run had run it with that flag set to False. 86 | * Removed the limit on the number of times ``TaskGraph`` will attempt to update 87 | its database; updates are now retried for up to 5 minutes of continuous failures. 88 | This is to address expected issues when many parallel threads may compete for an update. 89 | Relevant information about why the database update fails is logged. 90 | * Fixed an issue where the logging queue would always report an exception 91 | even if the logging thread shut down correctly. 92 | 93 | 0.10.0 (2020-08-25) 94 | ------------------- 95 | * Fixed several race conditions that could cause the ``TaskGraph`` object to 96 | hang on an otherwise ordinary termination.
97 | * Changed logging level to "INFO" on cases where the taskgraph was not 98 | precalculated since it's an expected path of execution in ``TaskGraph``. 99 | * Adding a ``hardlink_allowed`` parameter to ``add_task`` that allows the 100 | attempt to hardlink a file in a case where a ``copy_artifact=True`` may 101 | permit one. This will save on disk space as well as computation time 102 | if large files are not needed to copy. 103 | * Adding a ``store_result`` flag to ``add_task`` that conditionally stores 104 | the ``func`` result in the database for later ``.get``. This was added to 105 | guard against return types that were not picklable and would otherwise 106 | cause an exception when being executed normally. 107 | * Fixed issue that would cause the logger thread to continue reporting status 108 | after all tasks were complete and the graph was closed. 109 | 110 | 0.9.1 (2020-06-04) 111 | ------------------ 112 | * Fixed issue that would cause an infinite loop if a ``TaskGraph`` object were 113 | created with a database from an incompatible previous version. Behavior now 114 | is to log the issue, delete the old database, and create a new compatible 115 | one. 116 | * Fixed issue that would cause some rare infinite loops if ``TaskGraph`` were 117 | to fail due to some kinds of task exceptions. 118 | * Adding open source BSD-3-Clause license. 119 | 120 | 0.9.0 (2020-03-05) 121 | ------------------ 122 | * Updating primary repository URL to GitHub. 123 | * Adding support for Python 3.8. 124 | * Removing the ``EncapsulatedOp`` abstract class. In practice the development 125 | loop that encouraged the use of ``EncapsulatedOp`` is flawed and can lead to 126 | design errors. 127 | * Removing unnecessary internal locks which will improve runtime performance of 128 | processing many small Tasks. 129 | * Refactor to support separate TaskGraph objects that use the same database. 130 | * Removed the ``n_retries`` parameter from ``add_task``. Users are recommended 131 | to handle retries within functions themselves. 132 | * Added a ``hash_target_files`` flag to ``add_task`` that when set to False, 133 | causes TaskGraph to only note the existence of target files after execution 134 | or as part of an evaluation to determine if the Task was precalculated. 135 | This is useful for operations that initialize a file but subsequent runs of 136 | the program modify it such as a new database or a downloaded file. 137 | * Fixed an issue on the monitor execution thread that caused shutdown of a 138 | TaskGraph object to be delayed up to the amount of delay in the monitor 139 | reporting update. 140 | * Added a ``.get()`` function for ``Task`` objects that returns the result of 141 | the respective ``func`` call. This value is cached in the TaskGraph database 142 | and hence can be used to avoid repeated execution. Note the addition of this 143 | function changes the functionality of calling ``add_task`` with no target 144 | path list. In previous versions the Task would execute once per TaskGraph 145 | instance, now successive ``Task`` objects with the same execution signature 146 | will use cached results. 147 | * To support the addition of the ``.get()`` function a ``transient_run`` 148 | parameter is added to ``add_task`` that causes TaskGraph to avoid 149 | recording a completed ``Task`` even if the execution hash would have been 150 | identical to a previously completed run where the target artifacts still 151 | existed. 
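A minimal sketch of the ``store_result`` / ``Task.get()`` pattern described in the 0.10.0 and 0.9.0 entries above. ``_compute_summary`` is a hypothetical user function used only for illustration, and exact call signatures may differ between versions:

.. code-block:: python

    import taskgraph

    def _compute_summary(values):
        """Return a small, picklable summary of ``values``."""
        return {'count': len(values), 'total': sum(values)}

    if __name__ == '__main__':
        # n_workers=-1 runs everything in the main thread.
        graph = taskgraph.TaskGraph('workspace', -1)
        summary_task = graph.add_task(
            func=_compute_summary,
            args=([1, 2, 3],),
            store_result=True,  # cache the return value in the database
            task_name='compute summary')
        graph.close()
        graph.join()
        # ``get()`` returns the cached result; a later run with the same
        # execution signature reuses it rather than re-running the function.
        print(summary_task.get())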
152 | 153 | 0.8.5 (2019-09-11) 154 | ------------------ 155 | * Dropped support for Python 2.7. 156 | * Fixed an issue where paths in ``ignore_paths`` were not getting ignored in 157 | the case of ``copy_duplicate_artifact=True``. 158 | * Fixed an issue where the "percent completed" in the logging monitor would 159 | sometimes exceed 100%. This occurred when a duplicate task was added to 160 | the TaskGraph object. 161 | * Fixed an issue where a relative path set as a target path would always cause 162 | TaskGraph to raise an exception after the task was complete. 163 | * Fixed an issue where kwargs that were unhashable were not considered when 164 | determining if a Task should be re-run. 165 | * Fixed an issue where files with almost identical modified times and sizes 166 | would hash equal in cases even when the filenames were different. 167 | 168 | 0.8.4 (2019-05-23) 169 | ------------------ 170 | * Fixed an exception that occurred when two tasks were constructed that 171 | targeted the same file but one path was relative and the other was absolute. 172 | 173 | 0.8.3 (2019-02-26) 174 | ------------------ 175 | * Fixed an issue that would cause TaskGraph to raise an IOError if an 176 | ``add_task`` call was marked for ``copy_duplicate_artifact`` but the 177 | base target file was missing. 178 | * Fixed an issue that would prevent the source distribution from 179 | installing. 180 | * Taskgraph is now tested against python versions 2.7, 3.6 and 3.7. 181 | 182 | 0.8.2 (2019-01-31) 183 | ------------------ 184 | * Adjusted logging levels so most chatty information is lowered to debug and 185 | oddness in ``__del__`` shutdown are degraded from ``error`` to ``debug`` so 186 | as not to cause alarm. 187 | 188 | 0.8.1 (2019-01-09) 189 | ------------------ 190 | * Fixed an issue that would cause a deadlock if two tasks were added that had 191 | the same function signature except different target paths. 192 | 193 | 0.8.0 (2019-01-07) 194 | ------------------ 195 | * Fixed a race condition that would sometimes cause an exception when multiple 196 | threads attempted to read or write to the completed Task Database. 197 | * Fixed an issue that could cause an exception in ``__del__`` to print to 198 | stderr during Python interpreter shutdown. 199 | * Added a ``hash_algorithm`` parameter to ``add_task`` that is a string of 200 | either 'sizetimestamp' or anything in ``hashlib.algorithms_available``. This 201 | option tells TaskGraph how to fingerprint input and target files to 202 | determine the need for recomputation. 203 | * Added a ``copy_duplicate_artifact`` parameter to ``add_task`` that when True 204 | tells TaskGraph to copy duplicate target results to a new target so long as 205 | all the parameters and base/target files fingerprint to the same value. 206 | This can save significant computation time when use in scenarios where 207 | there are small changes in a workflow, but otherwise significant changes 208 | in filenames. This often occurs when putting timestamps or other suffixes 209 | on files that otherwise have identical content. 210 | 211 | 0.7.2 (2018-11-21) 212 | ------------------ 213 | * TaskGraph now stores all task completion information in a single SQLite 214 | database stored in its cache directory. In previous versions 215 | TaskGraph would write a small text file for each task in a highly branching 216 | directory tree. This structure made removal of those directory trees 217 | computationally difficult. 
218 | * Fixed an issue that would cause TaskGraph to reexecute if the target path 219 | was included in the argument list and that path was not normalized to the 220 | operating system's path style. 221 | * Fixed a deadlock in some cases where Tasks failed while other tasks checked 222 | for pre-execution clauses. 223 | 224 | 0.7.0 (2018-10-22) 225 | ------------------ 226 | * Fixed an issue where very long strings might be interpreted as paths and 227 | Windows crashes because the path is too long. 228 | * Fixed a deadlock issue where a Task might raise an unhandled exception as a 229 | new task was added to the TaskGraph. 230 | * Fixed the occasional ``BrokenPipeError`` that could occur when a Task 231 | encountered an unhandled exception. 232 | * Added an ``n_retries`` parameter to ``add_task`` that lets TaskGraph attempt 233 | to reexecute a failing Task up to ``n_retries`` times before terminating 234 | the TaskGraph. 235 | * Removed the ``delayed_start`` option. 236 | 237 | 0.6.1 (2018-08-14) 238 | ------------------ 239 | * Resolving an issue with duplicate logging being printed to stdout when 240 | ``n_workers > 0``. Logging is now only handled in the process that contains 241 | the TaskGraph instance. 242 | * Updated main logging message to indicate which tasks, by task name, are 243 | currently active and how many tasks are ready to execute but can't because 244 | there is not an open worker. 245 | * Attempted to fix an issue where processes in the process pool were not 246 | terminating on a Linux system by aggressively joining all threads and 247 | processes when possible. 248 | * Fixed an issue that would cause tasks that had been previously calculated to 249 | prematurely trigger children tasks even if the parent tasks of the current 250 | task needed to be reexecuted. 251 | 252 | 0.6.0 (2018-07-24) 253 | ------------------ 254 | * Added a ``delayed_start`` flag to TaskGraph to allow for delayed execution 255 | of taskgraph tasks. If enabled on threaded or multiprocess mode, calls to 256 | ``add_task`` will not execute tasks until the ``join`` method is invoked on 257 | ``taskgraph``. This allows for finer control over execution order when tasks 258 | are passed non-equivalent ``priority`` levels. 259 | * Fixing an issue where a non-JSON serializeable object would cause 260 | ``add_task`` to crash. Now TaskGraph is more tolerant of non-JSON 261 | serializeable objects and will log warnings when parameters cannot be 262 | serialized. 263 | * TaskGraph constructor has an option to report a ongoing logging message 264 | at a set interval. The message reports how many tasks have been committed 265 | and completed. 266 | * Fixed a bug that would cause TaskGraph to needlessly reexecute a task if 267 | the only change was the order of the ``target_path_list`` or 268 | ``dependent_task_list`` variables. 269 | * Fixed a bug that would cause a task to reexecute between runs if input 270 | argument was a file that would be generated by a task that had not yet 271 | executed. 272 | * Made a code change that makes it very likely that tasks will be executed in 273 | priority order if added to a TaskGraph in delayed execution mode. 274 | * Refactored internal TaskGraph scheduling to fix a design error that made it 275 | likely tasks would be needlessly reexecuted. This also simplified TaskGraph 276 | flow control and cause slight performance improvements. 
277 | * Fixed an issue discovered when a ``scipy.sparse`` matrix was passed as an 278 | argument and ``add_task`` crashed on infinite recursion. Type checking of 279 | arguments has been simplified and now iteration only occurs on the Python 280 | ``set``, ``dict``, ``list``, and ``tuple`` types. 281 | * Fixed an issue where the ``TaskGraph`` was not ``join``\ing the worker 282 | process pool on a closed/joined TaskGraph, or when the ``TaskGraph`` object 283 | was being deconstructed. This would occasionally cause a race condition 284 | where the TaskGraph may still have a cache ``.json`` file open. Discovered 285 | through a flaky build test. 286 | * Added functionality to the ``TaskGraph`` object to propagate log messages 287 | from workers back to the parent process. This only applies for cases where 288 | a ``TaskGraph`` instance is started with ``n_workers > 0``. 289 | * Fixed an issue where a function that was passed as an argument would cause 290 | a reexecution on a separate run because the ``__repr__`` of a function 291 | includes its pointer address. 292 | * Adjusted logging levels so that detailed task information is shown on DEBUG 293 | but basic status updates are shown in INFO. 294 | 295 | 0.5.2 (2018-06-20) 296 | ------------------ 297 | * Fixing an issue where a Task would hang on a ``join`` if the number of 298 | workers in TaskGraph was -1 and a call to ``add_task`` had a non-``None`` 299 | passed to ``target_path_list`` and the resulting task was ``\.join``\ed 300 | after a second run of the same program. 301 | 302 | 0.5.1 (2018-06-20) 303 | ------------------ 304 | * Fixing an issue where TaskGraph would hang on a ``join`` if the number of 305 | workers was -1 and a call to ``add_task`` had ``None`` passed to 306 | ``target_path_list``. 307 | 308 | 0.5.0 (2018-05-04) 309 | ------------------ 310 | * Taskgraph now supports python versions 2 and 3 (tested with python 2.7, 311 | 3.6). 312 | * Fixed an issue with ``taskgraph.TaskGraph`` that prevented a multiprocessed 313 | graph from executing on POSIX systems when ``psutil`` was installed. 314 | * Adding matrix-based test automation (python 2.7, python 3.6, with/without 315 | ``psutil``) via ``tox``. 316 | * Updating repository path to ``https://bitbucket.org/natcap/taskgraph``. 317 | 318 | 0.4.0 (2018-04-18) 319 | ------------------ 320 | * Auto-versioning now happens via ``setuptools_scm``, replacing previous calls 321 | to ``natcap.versioner``. 322 | * Added an option to ``TaskGraph`` constructor to allow negative values in the 323 | ``n_workers`` argument to indicate that the entire object should run in the 324 | main thread. A value of 0 will indicate that no multiprocessing will be used 325 | but concurrency will be allowed for non-blocking ``add_task``. 326 | * Added an abstract class ``task.EncapsulatedTaskOp`` that can be used to 327 | instance a class that needs scope in order to be used as an operation passed 328 | to a process. The advantage of using ``EncapsulatedTaskOp`` is that the 329 | ``__name__`` hash used by ``TaskGraph`` to determine if a task is unique is 330 | calculated in the superclass and the subclass need only worry about 331 | implementation of ``__call__``. 332 | * Added a ``priority`` optional scalar argument to ``TaskGraph.add_task`` to 333 | indicate the priority preference of the task to be executed. A higher 334 | priority task whose dependencies are satisfied will be executed before one with 335 | a lower priority.
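To illustrate the ``n_workers`` and ``priority`` behaviors described in this section, a minimal sketch follows. ``_make_report`` and ``_cleanup_scratch`` are hypothetical functions, not part of ``taskgraph``:

.. code-block:: python

    import taskgraph

    def _make_report():
        print('building report')

    def _cleanup_scratch():
        print('cleaning scratch space')

    # n_workers=-1 runs everything in the main thread; 0 keeps
    # ``add_task`` non-blocking without using multiprocessing.
    graph = taskgraph.TaskGraph('priority_workspace', 0)

    # With no dependencies between them, the task with the higher
    # ``priority`` value is preferred when a worker becomes available.
    graph.add_task(func=_make_report, priority=10, task_name='report')
    graph.add_task(func=_cleanup_scratch, priority=1, task_name='cleanup')

    graph.close()
    graph.join()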
336 | 337 | 0.3.0 (2017-11-17) 338 | ------------------ 339 | * Refactor of core scheduler. Old scheduler used asynchronicity to attempt to 340 | test if a Task was complete, occasionally testing all Tasks in potential 341 | work queue per task completion. Scheduler now uses bookkeeping to keep track 342 | of all dependencies and submits tasks for work only when all dependencies 343 | are satisfied. 344 | * TaskGraph and Task ``.join`` methods now have a timeout parameter. 345 | Additionally ``join`` now also returns False if ``join`` terminates because 346 | of a timeout. 347 | * More robust error reporting and shutdown of TaskGraph if any tasks fail 348 | during execution using pure threading or multiprocessing. 349 | 350 | 351 | 0.2.7 (2017-11-09) 352 | ------------------ 353 | * Fixed a critical error from the last hotfix that prevented ``taskgraph`` 354 | from avoiding recomputation of already completed tasks. 355 | 356 | 0.2.6 (2017-11-07) 357 | ------------------ 358 | * Fixed an issue from the previous hotfix that could cause ``taskgraph`` to 359 | exceed the number of available threads if enough tasks were added with long 360 | running dependencies. 361 | * Additional error checking and flow control ensures that a TaskGraph will 362 | catastrophically fail and report useful exception logging a task fails 363 | during runtime. 364 | * Fixed a deadlock issue where a failure on a subtask would occasionally cause 365 | a TaskGraph to hang. 366 | * ``Task.is_complete`` raises a RuntimeError if the task is complete but 367 | failed. 368 | * More efficient handling of topological progression of task execution to 369 | attempt to maximize total possible CPU load. 370 | * Fixing an issue from the last release that caused the test cases to fail. 371 | (Don't use 0.2.5 at all). 372 | 373 | 0.2.5 (2017-10-11) 374 | ------------------ 375 | * Fixed a bug where tasks with satisfied dependencies or no dependencies were 376 | blocked on dependent tasks added to the task graph earlier in the main 377 | thread execution. 378 | * Indicating that ``psutil`` is an optional dependency through the ``setup`` 379 | function. 380 | 381 | 0.2.4 (2017-09-19) 382 | ------------------ 383 | * Empty release. Possible bug with PyPI release, so re-releasing with a 384 | bumped up version. 385 | 386 | 0.2.3 (2017-09-18) 387 | ------------------ 388 | * More robust testing on a chain of tasks that might fail because an ancestor 389 | failed. 390 | 391 | 0.2.2 (2017-08-15) 392 | ------------------ 393 | * Changed how TaskGraph determines of work is complete. Now records target 394 | paths in file token with modified time and file size. When checking if work 395 | is complete, the token is loaded and the target file stats are compared for 396 | each file. 397 | 398 | 0.2.1 (2017-08-11) 399 | ------------------ 400 | * Handling cases where a function might be an object or something else that 401 | can't import source code. 402 | * Using natcap.versioner for versioning. 403 | 404 | 0.2.0 (2017-07-31) 405 | ------------------ 406 | * Fixing an issue where ``types.StringType`` is not the same as 407 | ``types.StringTypes``. 408 | * Redefined ``target`` in ``add_task`` to ``func`` to avoid naming collision 409 | with ``target_path_list`` in the same function. 410 | 411 | 0.1.1 (2017-07-31) 412 | ------------------ 413 | * Fixing a TYPO on ``__version__`` number scheme. 414 | * Importing ``psutil`` if it exists. 415 | 416 | 0.1.0 (2017-07-29) 417 | ------------------ 418 | * Initial release. 
419 | -------------------------------------------------------------------------------- /tests/test_task.py: -------------------------------------------------------------------------------- 1 | """Tests for taskgraph.""" 2 | import hashlib 3 | import logging 4 | import logging.handlers 5 | import multiprocessing 6 | import os 7 | import pathlib 8 | import pickle 9 | import re 10 | import shutil 11 | import sqlite3 12 | import subprocess 13 | import tempfile 14 | import time 15 | import unittest 16 | 17 | import retrying 18 | import taskgraph 19 | 20 | LOGGER = logging.getLogger(__name__) 21 | 22 | N_TEARDOWN_RETRIES = 5 23 | MAX_TRY_WAIT_MS = 500 24 | 25 | 26 | def _return_value_once(value): 27 | """Return the value passed to it only once.""" 28 | if hasattr(_return_value_once, 'executed'): 29 | raise RuntimeError("this function was called twice") 30 | _return_value_once.executed = True 31 | return value 32 | 33 | 34 | def _noop_function(**kwargs): 35 | """Do nothing except allow kwargs to be passed.""" 36 | pass 37 | 38 | 39 | def _long_running_function(delay): 40 | """Wait for ``delay`` seconds.""" 41 | time.sleep(delay) 42 | 43 | 44 | def _create_two_files_on_disk(value, target_a_path, target_b_path): 45 | """Create two files and write ``value`` and append if possible.""" 46 | with open(target_a_path, 'a') as a_file: 47 | a_file.write(value) 48 | 49 | with open(target_b_path, 'a') as b_file: 50 | b_file.write(value) 51 | 52 | 53 | def _merge_and_append_files(base_a_path, base_b_path, target_path): 54 | """Merge two files and append if possible to new file.""" 55 | with open(target_path, 'a') as target_file: 56 | for base_path in [base_a_path, base_b_path]: 57 | with open(base_path, 'r') as base_file: 58 | target_file.write(base_file.read()) 59 | 60 | 61 | def _create_list_on_disk(value, length, target_path=None): 62 | """Create a numpy array on disk filled with value of ``size``.""" 63 | target_list = [value] * length 64 | pickle.dump(target_list, open(target_path, 'wb')) 65 | 66 | 67 | def _call_it(target, *args): 68 | """Invoke ``target`` with ``args``.""" 69 | target(*args) 70 | 71 | 72 | def _append_val(path, *val): 73 | """Append a ``val`` to file at ``path``.""" 74 | with open(path, 'a') as target_file: 75 | for v in val: 76 | target_file.write(str(v)) 77 | 78 | 79 | def _sum_lists_from_disk(list_a_path, list_b_path, target_path): 80 | """Read two lists, add them and save result.""" 81 | list_a = pickle.load(open(list_a_path, 'rb')) 82 | list_b = pickle.load(open(list_b_path, 'rb')) 83 | target_list = [] 84 | for a, b in zip(list_a, list_b): 85 | target_list.append(a+b) 86 | pickle.dump(target_list, open(target_path, 'wb')) 87 | 88 | 89 | def _div_by_zero(): 90 | """Divide by zero to raise an exception.""" 91 | return 1/0 92 | 93 | 94 | def _create_file(target_path, content): 95 | """Create a file with contents.""" 96 | with open(target_path, 'w') as target_file: 97 | target_file.write(content) 98 | 99 | 100 | def _create_file_once(target_path, content): 101 | """Create a file on the first call, raise an exception on the second.""" 102 | if hasattr(_create_file_once, 'executed'): 103 | raise RuntimeError("this function was called twice") 104 | _create_file_once.executed = True 105 | with open(target_path, 'w') as target_file: 106 | target_file.write(content) 107 | 108 | 109 | def _copy_file_once(base_path, target_path): 110 | """Copy base to target on the first call, raise exception on second.""" 111 | if hasattr(_copy_file_once, 'executed'): 112 | raise RuntimeError("this function 
was called twice") 113 | _copy_file_once.executed = True 114 | shutil.copyfile(base_path, target_path) 115 | 116 | 117 | def _copy_two_files_once(base_path, target_a_path, target_b_path): 118 | """Copy base to target a/b on first call, raise exception on second.""" 119 | if hasattr(_copy_two_files_once, 'executed'): 120 | raise RuntimeError("this function was called twice") 121 | _copy_two_files_once.executed = True 122 | shutil.copyfile(base_path, target_a_path) 123 | shutil.copyfile(base_path, target_b_path) 124 | 125 | 126 | def _log_from_another_process(logger_name, log_message): 127 | """Write a log message to a given logger. 128 | 129 | Args: 130 | logger_name (string): The string logger name to which ``log_message`` 131 | will be logged. 132 | log_message (string): The string log message to be logged (at INFO 133 | level) to the logger at ``logger_name``. 134 | 135 | Returns: 136 | ``None`` 137 | 138 | """ 139 | logger = logging.getLogger(logger_name) 140 | logger.info(log_message) 141 | 142 | 143 | class TaskGraphTests(unittest.TestCase): 144 | """Tests for the taskgraph.""" 145 | 146 | def setUp(self): 147 | """Create temp workspace directory.""" 148 | # this lets us delete the workspace after its done no matter the 149 | # the rest result 150 | self.workspace_dir = tempfile.mkdtemp() 151 | 152 | @retrying.retry( 153 | stop_max_attempt_number=N_TEARDOWN_RETRIES, 154 | wait_exponential_multiplier=250, wait_exponential_max=MAX_TRY_WAIT_MS) 155 | def tearDown(self): 156 | """Remove temporary directory.""" 157 | try: 158 | shutil.rmtree(self.workspace_dir) 159 | except Exception: 160 | LOGGER.exception('error when tearing down.') 161 | raise 162 | 163 | def test_version_loaded(self): 164 | """TaskGraph: verify we can load the version.""" 165 | try: 166 | import taskgraph 167 | 168 | # Verifies that there's a version attribute and it has a value. 169 | self.assertTrue(len(taskgraph.__version__) > 0) 170 | except Exception: 171 | self.fail('Could not load the taskgraph version as expected.') 172 | 173 | def test_single_task(self): 174 | """TaskGraph: Test a single task.""" 175 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0, 0.1) 176 | # forcing this one to be unicode since there shouldn't be a problem 177 | # with that at all... 
178 | target_path = u'%s' % os.path.join(self.workspace_dir, '1000.dat') 179 | value = 5 180 | list_len = 1000 181 | _ = task_graph.add_task( 182 | func=_create_list_on_disk, 183 | args=(value, list_len), 184 | kwargs={ 185 | 'target_path': target_path, 186 | }, 187 | target_path_list=[target_path]) 188 | task_graph.close() 189 | task_graph.join() 190 | result = pickle.load(open(target_path, 'rb')) 191 | self.assertEqual(result, [value]*list_len) 192 | 193 | def test_task_hash_source_deleted(self): 194 | """TaskGraph: test if old target deleted when hashing duplicate.""" 195 | target_a_path = os.path.join(self.workspace_dir, 'a.txt') 196 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 197 | task_a = task_graph.add_task( 198 | func=_create_file, 199 | args=(target_a_path, 'test value'), 200 | target_path_list=[target_a_path]) 201 | task_a.join() 202 | target_b_path = os.path.join(self.workspace_dir, 'b.txt') 203 | _ = task_graph.add_task( 204 | func=_create_file, 205 | args=(target_b_path, 'test value'), 206 | target_path_list=[target_b_path]) 207 | task_graph.close() 208 | task_graph.join() 209 | del task_graph 210 | 211 | os.remove(target_a_path) 212 | os.remove(target_b_path) 213 | 214 | target_c_path = os.path.join(self.workspace_dir, 'c.txt') 215 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 216 | _ = task_graph.add_task( 217 | func=_create_file, 218 | args=(target_c_path, 'test value'), 219 | target_path_list=[target_c_path]) 220 | task_graph.close() 221 | task_graph.join() 222 | 223 | with open(target_c_path, 'r') as target_file: 224 | result = target_file.read() 225 | self.assertEqual(result, 'test value') 226 | 227 | def test_task_rel_vs_absolute(self): 228 | """TaskGraph: test that relative path equates to absolute.""" 229 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 230 | 231 | target_a_path = os.path.relpath(os.path.join( 232 | self.workspace_dir, 'a.txt'), start=self.workspace_dir) 233 | target_b_path = os.path.abspath(target_a_path) 234 | 235 | _ = task_graph.add_task( 236 | func=_create_file, 237 | args=(target_a_path, 'test value'), 238 | target_path_list=[target_a_path], 239 | task_name='task a') 240 | 241 | _ = task_graph.add_task( 242 | func=_create_file, 243 | args=(target_b_path, 'test value'), 244 | target_path_list=[target_b_path], 245 | task_name='task b') 246 | 247 | task_graph.close() 248 | task_graph.join() 249 | del task_graph 250 | 251 | with open(target_a_path, 'r') as a_file: 252 | m = hashlib.md5() 253 | m.update(a_file.read().encode('utf-8')) 254 | a_digest = m.digest() 255 | with open(target_b_path, 'r') as b_file: 256 | m = hashlib.md5() 257 | m.update(b_file.read().encode('utf-8')) 258 | b_digest = m.digest() 259 | self.assertEqual(a_digest, b_digest) 260 | 261 | def test_timeout_task(self): 262 | """TaskGraph: Test timeout functionality.""" 263 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 264 | _ = task_graph.add_task( 265 | func=_long_running_function, 266 | args=(5,)) 267 | task_graph.close() 268 | timedout = not task_graph.join(0.5) 269 | # this should timeout since function runs for 5 seconds 270 | self.assertTrue(timedout) 271 | 272 | def test_precomputed_task(self): 273 | """TaskGraph: Test that a task reuses old results.""" 274 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 275 | target_path = os.path.join(self.workspace_dir, '1000.dat') 276 | value = 5 277 | list_len = 1000 278 | _ = task_graph.add_task( 279 | func=_create_list_on_disk, 280 | args=(value, list_len), 281 | kwargs={ 282 | 
'target_path': target_path, 283 | }, 284 | target_path_list=[target_path]) 285 | task_graph.close() 286 | task_graph.join() 287 | result = pickle.load(open(target_path, 'rb')) 288 | self.assertEqual(result, [value]*list_len) 289 | result_m_time = os.path.getmtime(target_path) 290 | del task_graph 291 | 292 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 293 | _ = task_graph.add_task( 294 | func=_create_list_on_disk, 295 | args=(value, list_len), 296 | kwargs={ 297 | 'target_path': target_path, 298 | }, 299 | target_path_list=[target_path]) 300 | task_graph.close() 301 | task_graph.join() 302 | del task_graph 303 | 304 | # taskgraph shouldn't have recomputed the result 305 | second_result_m_time = os.path.getmtime(target_path) 306 | self.assertEqual(result_m_time, second_result_m_time) 307 | 308 | def test_task_chain(self): 309 | """TaskGraph: Test a task chain.""" 310 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 311 | target_a_path = os.path.join(self.workspace_dir, 'a.dat') 312 | target_b_path = os.path.join(self.workspace_dir, 'b.dat') 313 | result_path = os.path.join(self.workspace_dir, 'result.dat') 314 | result_2_path = os.path.join(self.workspace_dir, 'result2.dat') 315 | value_a = 5 316 | value_b = 10 317 | list_len = 10 318 | task_a = task_graph.add_task( 319 | func=_create_list_on_disk, 320 | args=(value_a, list_len), 321 | kwargs={ 322 | 'target_path': target_a_path, 323 | }, 324 | target_path_list=[target_a_path]) 325 | task_b = task_graph.add_task( 326 | func=_create_list_on_disk, 327 | args=(value_b, list_len), 328 | kwargs={ 329 | 'target_path': target_b_path, 330 | }, 331 | target_path_list=[target_b_path]) 332 | sum_task = task_graph.add_task( 333 | func=_sum_lists_from_disk, 334 | args=(target_a_path, target_b_path), 335 | kwargs={ 336 | 'target_path': result_path, 337 | }, 338 | target_path_list=[result_path], 339 | dependent_task_list=[task_a, task_b]) 340 | sum_task.join() 341 | 342 | result = pickle.load(open(result_path, 'rb')) 343 | self.assertEqual(result, [value_a+value_b]*list_len) 344 | 345 | sum_2_task = task_graph.add_task( 346 | func=_sum_lists_from_disk, 347 | args=(target_a_path, result_path, result_2_path), 348 | target_path_list=[result_2_path], 349 | dependent_task_list=[task_a, sum_task]) 350 | sum_2_task.join() 351 | result2 = pickle.load(open(result_2_path, 'rb')) 352 | expected_result = [(value_a*2+value_b)]*list_len 353 | self.assertEqual(result2, expected_result) 354 | 355 | sum_3_task = task_graph.add_task( 356 | func=_sum_lists_from_disk, 357 | args=(target_a_path, result_path, result_2_path), 358 | target_path_list=[result_2_path], 359 | dependent_task_list=[task_a, sum_task]) 360 | task_graph.close() 361 | sum_3_task.join() 362 | result3 = pickle.load(open(result_2_path, 'rb')) 363 | expected_result = [(value_a*2+value_b)]*list_len 364 | self.assertEqual(result3, expected_result) 365 | task_graph.join() 366 | 367 | def test_task_chain_single_thread(self): 368 | """TaskGraph: Test a single threaded task chain.""" 369 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 370 | target_a_path = os.path.join(self.workspace_dir, 'a.dat') 371 | target_b_path = os.path.join(self.workspace_dir, 'b.dat') 372 | result_path = os.path.join(self.workspace_dir, 'result.dat') 373 | result_2_path = os.path.join(self.workspace_dir, 'result2.dat') 374 | value_a = 5 375 | value_b = 10 376 | list_len = 10 377 | task_a = task_graph.add_task( 378 | func=_create_list_on_disk, 379 | args=(value_a, list_len), 380 | kwargs={ 381 | 
'target_path': target_a_path, 382 | }, 383 | target_path_list=[target_a_path], 384 | task_name='task a') 385 | task_b = task_graph.add_task( 386 | func=_create_list_on_disk, 387 | args=(value_b, list_len), 388 | kwargs={ 389 | 'target_path': target_b_path, 390 | }, 391 | target_path_list=[target_b_path], 392 | task_name='task b') 393 | sum_task = task_graph.add_task( 394 | func=_sum_lists_from_disk, 395 | args=(target_a_path, target_b_path), 396 | kwargs={ 397 | 'target_path': result_path, 398 | }, 399 | target_path_list=[result_path], 400 | dependent_task_list=[task_a, task_b], 401 | task_name='task c') 402 | sum_task.join() 403 | 404 | result = pickle.load(open(result_path, 'rb')) 405 | self.assertEqual(result, [value_a+value_b]*list_len) 406 | 407 | sum_2_task = task_graph.add_task( 408 | func=_sum_lists_from_disk, 409 | args=(target_a_path, result_path, result_2_path), 410 | target_path_list=[result_2_path], 411 | dependent_task_list=[task_a, sum_task], 412 | task_name='task sum_2') 413 | sum_2_task.join() 414 | result2 = pickle.load(open(result_2_path, 'rb')) 415 | expected_result = [(value_a*2+value_b)]*list_len 416 | self.assertEqual(result2, expected_result) 417 | 418 | sum_3_task = task_graph.add_task( 419 | func=_sum_lists_from_disk, 420 | args=(target_a_path, result_path, result_2_path), 421 | target_path_list=[result_2_path], 422 | dependent_task_list=[task_a, sum_task], 423 | task_name='task sum_3') 424 | task_graph.close() 425 | sum_3_task.join() 426 | result3 = pickle.load(open(result_2_path, 'rb')) 427 | expected_result = [(value_a*2+value_b)]*list_len 428 | task_graph.join() 429 | task_graph = None 430 | self.assertEqual(result3, expected_result) 431 | 432 | # we should have 4 completed values in the database, 5 total but one 433 | # was a duplicate 434 | database_path = os.path.join( 435 | self.workspace_dir, taskgraph._TASKGRAPH_DATABASE_FILENAME) 436 | conn = sqlite3.connect(database_path) 437 | with conn: 438 | cursor = conn.cursor() 439 | cursor.execute("SELECT * FROM taskgraph_data") 440 | result = cursor.fetchall() 441 | conn.close() 442 | self.assertEqual(len(result), 4) 443 | 444 | def test_task_broken_chain(self): 445 | """TaskGraph: Test a multiprocess chain with exception raised.""" 446 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 4) 447 | target_a_path = os.path.join(self.workspace_dir, 'a.dat') 448 | target_b_path = os.path.join(self.workspace_dir, 'b.dat') 449 | result_path = os.path.join(self.workspace_dir, 'result.dat') 450 | value_a = 5 451 | list_len = 10 452 | task_a = task_graph.add_task( 453 | func=_create_list_on_disk, 454 | args=(value_a, list_len), 455 | kwargs={ 456 | 'target_path': target_a_path, 457 | }, 458 | target_path_list=[target_a_path]) 459 | task_b = task_graph.add_task( 460 | func=_div_by_zero, 461 | dependent_task_list=[task_a]) 462 | _ = task_graph.add_task( 463 | func=_sum_lists_from_disk, 464 | args=(target_a_path, target_b_path), 465 | kwargs={ 466 | 'target_path': result_path, 467 | }, 468 | target_path_list=[result_path], 469 | dependent_task_list=[task_a, task_b]) 470 | task_graph.close() 471 | 472 | with self.assertRaises(ZeroDivisionError): 473 | task_graph.join() 474 | 475 | def test_broken_task(self): 476 | """TaskGraph: Test that a task with an exception won't hang.""" 477 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 478 | 479 | broken_task = task_graph.add_task( 480 | func=_div_by_zero, task_name='test_broken_task') 481 | with self.assertRaises(ZeroDivisionError): 482 | _ = broken_task.join() 483 | 
484 | task_graph.close() 485 | 486 | with self.assertRaises(ZeroDivisionError): 487 | task_graph.join() 488 | 489 | def test_broken_task_chain(self): 490 | """TaskGraph: test dependent tasks fail on ancestor fail.""" 491 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 4) 492 | 493 | target_path = os.path.join(self.workspace_dir, '1000.dat') 494 | value = 5 495 | list_len = 1000 496 | for task_id in range(1): 497 | target_path = os.path.join( 498 | self.workspace_dir, '1000_%d.dat' % task_id) 499 | normal_task = task_graph.add_task( 500 | func=_create_list_on_disk, 501 | args=(value, list_len), 502 | kwargs={'target_path': target_path}, 503 | target_path_list=[target_path], 504 | task_name='create list on disk %d' % task_id) 505 | zero_div_task = task_graph.add_task( 506 | func=_div_by_zero, 507 | dependent_task_list=[normal_task], 508 | task_name='test_broken_task_chain_%d' % task_id) 509 | target_path = os.path.join( 510 | self.workspace_dir, 'after_zerodiv_1000_%d.dat' % task_id) 511 | _ = task_graph.add_task( 512 | func=_create_list_on_disk, 513 | args=(value, list_len), 514 | kwargs={'target_path': target_path}, 515 | dependent_task_list=[zero_div_task], 516 | target_path_list=[target_path], 517 | task_name='create list on disk after zero div%d' % task_id) 518 | 519 | task_graph.close() 520 | with self.assertRaises(ZeroDivisionError): 521 | task_graph.join() 522 | 523 | def test_empty_task(self): 524 | """TaskGraph: Test an empty task.""" 525 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 526 | _ = task_graph.add_task() 527 | task_graph.close() 528 | task_graph.join() 529 | # we shouldn't have anything in the database 530 | database_path = os.path.join( 531 | self.workspace_dir, taskgraph._TASKGRAPH_DATABASE_FILENAME) 532 | 533 | conn = sqlite3.connect(database_path) 534 | with conn: 535 | cursor = conn.cursor() 536 | cursor.executescript("SELECT * FROM taskgraph_data") 537 | result = cursor.fetchall() 538 | conn.close() 539 | self.assertEqual(len(result), 0) 540 | 541 | def test_closed_graph(self): 542 | """TaskGraph: Test adding to an closed task graph fails.""" 543 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 544 | task_graph.close() 545 | target_path = os.path.join(self.workspace_dir, '1000.dat') 546 | value = 5 547 | list_len = 1000 548 | with self.assertRaises(ValueError): 549 | _ = task_graph.add_task( 550 | func=_create_list_on_disk, 551 | args=(value, list_len), 552 | kwargs={'target_path': target_path}, 553 | target_path_list=[target_path]) 554 | task_graph.join() 555 | 556 | def test_single_task_multiprocessing(self): 557 | """TaskGraph: Test a single task with multiprocessing.""" 558 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 559 | target_path = os.path.join(self.workspace_dir, '1000.dat') 560 | value = 5 561 | list_len = 1000 562 | _ = task_graph.add_task( 563 | func=_create_list_on_disk, 564 | args=(value, list_len), 565 | kwargs={ 566 | 'target_path': target_path, 567 | }, 568 | target_path_list=[target_path]) 569 | task_graph.close() 570 | task_graph.join() 571 | result = pickle.load(open(target_path, 'rb')) 572 | self.assertEqual(result, [value]*list_len) 573 | 574 | def test_get_file_stats(self): 575 | """TaskGraph: Test _get_file_stats subroutine.""" 576 | from taskgraph.Task import _get_file_stats 577 | test_dir = os.path.join(self.workspace_dir, 'test_dir') 578 | test_file = os.path.join(test_dir, 'test_file.txt') 579 | os.mkdir(test_dir) 580 | with open(test_file, 'w') as f: 581 | f.write('\n') 582 | nofile = 
os.path.join(self.workspace_dir, 'nofile') 583 | base_value = [ 584 | nofile, test_dir, test_file, 585 | 10, {'a': {'b': test_file}}, {'a': {'b': test_dir, 'foo': 9}}] 586 | ignore_dir_result = list(_get_file_stats( 587 | base_value, 'sizetimestamp', [], True)) 588 | # should get two results if we ignore the directories because there's 589 | # only two files 590 | self.assertEqual(len(ignore_dir_result), 2) 591 | dir_result = list(_get_file_stats( 592 | base_value, 'sizetimestamp', [], False)) 593 | # should get four results if we track directories because of two files 594 | # and two directories 595 | self.assertEqual(len(dir_result), 4) 596 | 597 | result = list(_get_file_stats(nofile, 'sizetimestamp', [], False)) 598 | self.assertEqual(result, []) 599 | 600 | def test_transient_runs(self): 601 | """TaskGraph: ensure that transent tasks reexecute.""" 602 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 603 | target_path = os.path.join(self.workspace_dir, '1000.dat') 604 | value = 5 605 | list_len = 1000 606 | _ = task_graph.add_task( 607 | func=_create_list_on_disk, 608 | args=(value, list_len), 609 | kwargs={ 610 | 'target_path': target_path, 611 | }) 612 | task_graph.close() 613 | task_graph.join() 614 | task_graph = None 615 | 616 | os.remove(target_path) 617 | 618 | task_graph2 = taskgraph.TaskGraph(self.workspace_dir, -1) 619 | _ = task_graph2.add_task( 620 | func=_create_list_on_disk, 621 | args=(value, list_len), 622 | transient_run=True, 623 | kwargs={ 624 | 'target_path': target_path, 625 | }) 626 | 627 | task_graph2.close() 628 | task_graph2.join() 629 | 630 | self.assertTrue( 631 | os.path.exists(target_path), 632 | "Expected file to exist because taskgraph should have re-run.") 633 | 634 | def test_repeat_targeted_runs(self): 635 | """TaskGraph: ensure that repeated runs with targets can join.""" 636 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 637 | target_path = os.path.join(self.workspace_dir, '1000.dat') 638 | value = 5 639 | list_len = 1000 640 | _ = task_graph.add_task( 641 | func=_create_list_on_disk, 642 | args=(value, list_len), 643 | kwargs={ 644 | 'target_path': target_path, 645 | }, 646 | target_path_list=[target_path]) 647 | task_graph.close() 648 | task_graph.join() 649 | task_graph = None 650 | 651 | task_graph2 = taskgraph.TaskGraph(self.workspace_dir, -1) 652 | task = task_graph2.add_task( 653 | func=_create_list_on_disk, 654 | args=(value, list_len), 655 | kwargs={ 656 | 'target_path': target_path, 657 | }, 658 | target_path_list=[target_path]) 659 | self.assertTrue(task.join(1.0), "join failed after 1 second") 660 | task_graph2.close() 661 | task_graph2.join() 662 | 663 | def test_task_equality(self): 664 | """TaskGraph: test correctness of == and != for Tasks.""" 665 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 666 | target_path = os.path.join(self.workspace_dir, '1000.dat') 667 | value = 5 668 | list_len = 1000 669 | task_a = task_graph.add_task( 670 | func=_create_list_on_disk, 671 | args=(value, list_len), 672 | kwargs={'target_path': target_path}, 673 | target_path_list=[target_path]) 674 | task_a_same = task_graph.add_task( 675 | func=_create_list_on_disk, 676 | args=(value, list_len), 677 | kwargs={'target_path': target_path}, 678 | target_path_list=[target_path]) 679 | task_b = task_graph.add_task( 680 | func=_create_list_on_disk, 681 | args=(value+1, list_len), 682 | kwargs={'target_path': target_path}, 683 | target_path_list=[target_path]) 684 | 685 | self.assertTrue(task_a == task_a) 686 | 
self.assertTrue(task_a == task_a_same) 687 | self.assertTrue(task_a != task_b) 688 | 689 | def test_async_logging(self): 690 | """TaskGraph: ensure async logging can execute.""" 691 | task_graph = taskgraph.TaskGraph( 692 | self.workspace_dir, 0, reporting_interval=0.5) 693 | _ = task_graph.add_task( 694 | func=_long_running_function, 695 | args=(1.0,)) 696 | task_graph.close() 697 | task_graph.join() 698 | timedout = not task_graph.join(5) 699 | # this should not timeout since function runs for 1 second 700 | self.assertFalse(timedout, "task timed out") 701 | 702 | def test_scrub(self): 703 | """TaskGraph: ensure scrub is not scrubbing base types.""" 704 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 705 | 706 | target_path = os.path.join(self.workspace_dir, 'a.txt') 707 | first_task = task_graph.add_task( 708 | func=_append_val, 709 | args=(target_path, 1, [1], {'x': 1}), 710 | task_name='first append') 711 | 712 | second_task = task_graph.add_task( 713 | func=_append_val, 714 | args=(target_path, 1, [1], {'x': 2}), 715 | dependent_task_list=[first_task], 716 | task_name='second append') 717 | 718 | _ = task_graph.add_task( 719 | func=_append_val, 720 | args=(target_path, 1, [2], {'x': 1}), 721 | dependent_task_list=[second_task], 722 | task_name='third append') 723 | 724 | task_graph.close() 725 | task_graph.join() 726 | 727 | with open(target_path, 'r') as target_file: 728 | file_value = target_file.read() 729 | self.assertEqual("1[1]{'x': 1}1[1]{'x': 2}1[2]{'x': 1}", file_value) 730 | 731 | def test_target_path_order(self): 732 | """TaskGraph: ensure target path order doesn't matter.""" 733 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 734 | target_a_path = os.path.join(self.workspace_dir, 'a.txt') 735 | target_b_path = os.path.join(self.workspace_dir, 'b.txt') 736 | 737 | task_graph.add_task( 738 | func=_create_two_files_on_disk, 739 | args=("word", target_a_path, target_b_path), 740 | target_path_list=[target_a_path, target_b_path]) 741 | 742 | task_graph.add_task( 743 | func=_create_two_files_on_disk, 744 | args=("word", target_a_path, target_b_path), 745 | target_path_list=[target_b_path, target_a_path]) 746 | 747 | task_graph.close() 748 | task_graph.join() 749 | 750 | with open(target_a_path, 'r') as a_file: 751 | a_value = a_file.read() 752 | 753 | with open(target_b_path, 'r') as b_file: 754 | b_value = b_file.read() 755 | 756 | self.assertEqual(a_value, "word") 757 | self.assertEqual(b_value, "word") 758 | 759 | def test_task_hash_when_ready(self): 760 | """TaskGraph: ensure tasks don't record execution info until ready.""" 761 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 762 | target_a_path = os.path.join(self.workspace_dir, 'a.txt') 763 | target_b_path = os.path.join(self.workspace_dir, 'b.txt') 764 | 765 | create_files_task = task_graph.add_task( 766 | func=_create_two_files_on_disk, 767 | args=("word", target_a_path, target_b_path), 768 | target_path_list=[target_a_path, target_b_path]) 769 | 770 | target_merged_path = os.path.join(self.workspace_dir, 'merged.txt') 771 | task_graph.add_task( 772 | func=_merge_and_append_files, 773 | args=(target_a_path, target_b_path, target_merged_path), 774 | target_path_list=[target_merged_path], 775 | dependent_task_list=[create_files_task]) 776 | 777 | task_graph.join() 778 | 779 | # this second task shouldn't execute because it's a copy of the first 780 | task_graph.add_task( 781 | func=_merge_and_append_files, 782 | args=(target_a_path, target_b_path, target_merged_path), 783 | 
target_path_list=[target_merged_path], 784 | dependent_task_list=[create_files_task]) 785 | 786 | task_graph.close() 787 | task_graph.join() 788 | 789 | with open(target_merged_path, 'r') as target_file: 790 | target_string = target_file.read() 791 | 792 | self.assertEqual(target_string, "wordword") 793 | 794 | def test_multiprocessed_logging(self): 795 | """TaskGraph: ensure tasks can log from multiple processes.""" 796 | logger_name = 'test.task.queuelogger' 797 | log_message = 'This is coming from another process' 798 | logger = logging.getLogger(logger_name) 799 | logger.setLevel(logging.DEBUG) 800 | file_log_path = os.path.join( 801 | self.workspace_dir, 'test_multiprocessed_logging.log') 802 | file_handler = logging.FileHandler(file_log_path) 803 | file_handler.setFormatter( 804 | logging.Formatter(fmt=':%(processName)s:%(message)s:')) 805 | logger.addHandler(file_handler) 806 | 807 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 808 | log_task = task_graph.add_task( 809 | func=_log_from_another_process, 810 | args=(logger_name, log_message)) 811 | log_task.join() 812 | file_handler.flush() 813 | task_graph.close() 814 | task_graph.join() 815 | file_handler.close() 816 | 817 | @retrying.retry(wait_exponential_multiplier=100, 818 | wait_exponential_max=1000, 819 | stop_max_attempt_number=5) 820 | def get_name_and_message(): 821 | with open(file_log_path, 'r') as log_file: 822 | message = log_file.read().rstrip() 823 | print(message) 824 | process_name, logged_message = re.match( 825 | ':([^:]*):([^:]*):', message).groups() 826 | return process_name, logged_message 827 | 828 | process_name, logged_message = get_name_and_message() 829 | self.assertEqual(logged_message, log_message) 830 | self.assertNotEqual( 831 | process_name, multiprocessing.current_process().name) 832 | 833 | def test_repeated_function(self): 834 | """TaskGraph: ensure no reruns if argument is a function.""" 835 | global _append_val 836 | 837 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 838 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 839 | task_graph.add_task( 840 | func=_call_it, 841 | args=(_append_val, target_path, 1), 842 | target_path_list=[target_path], 843 | ignore_path_list=[target_path], 844 | task_name='first _call_it') 845 | task_graph.close() 846 | task_graph.join() 847 | del task_graph 848 | 849 | # this causes the address to change 850 | def _append_val(path, *val): 851 | """Append a ``val`` to file at ``path``.""" 852 | with open(path, 'a') as target_file: 853 | for v in val: 854 | target_file.write(str(v)) 855 | 856 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 857 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 858 | task_graph.add_task( 859 | func=_call_it, 860 | args=(_append_val, target_path, 1), 861 | target_path_list=[target_path], 862 | ignore_path_list=[target_path], 863 | task_name='second _call_it') 864 | task_graph.close() 865 | task_graph.join() 866 | 867 | with open(target_path, 'r') as target_file: 868 | result = target_file.read() 869 | 870 | # the second call shouldn't happen 871 | self.assertEqual(result, '1') 872 | 873 | def test_unix_path_repeated_function(self): 874 | """TaskGraph: ensure no reruns if path is unix style.""" 875 | global _append_val 876 | _append_val = _append_val # flake8 complains if not defined 877 | 878 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 879 | target_dir = self.workspace_dir + '/foo/bar/rad/' 880 | os.makedirs(target_dir) 881 | target_path = target_dir + 
'/testfile.txt' 882 | task_graph.add_task( 883 | func=_call_it, 884 | args=(_append_val, target_path, 1), 885 | target_path_list=[target_path], 886 | task_name='first _call_it') 887 | task_graph.close() 888 | task_graph.join() 889 | del task_graph 890 | 891 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 892 | task_graph.add_task( 893 | func=_call_it, 894 | args=(_append_val, target_path, 1), 895 | target_path_list=[target_path], 896 | task_name='second _call_it') 897 | task_graph.close() 898 | task_graph.join() 899 | 900 | with open(target_path, 'r') as target_file: 901 | result = target_file.read() 902 | 903 | # the second call shouldn't happen 904 | self.assertEqual(result, '1') 905 | 906 | def test_very_long_string(self): 907 | """TaskGraph: ensure that long strings don't case an OSError.""" 908 | from taskgraph.Task import _get_file_stats 909 | 910 | # this is a list with two super long strings to try to trick some 911 | # os function into thinking it's a path. 912 | base_value = [ 913 | 'c:' + r'\\\\\\\\x\\\\\\\\'*2**10 + 'foo', 914 | 'wfeji3223j8923j9' * 2**10] 915 | self.assertEqual( 916 | list(_get_file_stats(base_value, 'sizetimestamp', [], True)), []) 917 | 918 | def test_duplicate_call_changed_target(self): 919 | """TaskGraph: test that duplicate calls copy target path.""" 920 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 921 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 922 | 923 | if hasattr(_create_file_once, 'executed'): 924 | del _create_file_once.executed 925 | 926 | task_graph.add_task( 927 | func=_create_file_once, 928 | args=(target_path, 'test'), 929 | target_path_list=[target_path], 930 | hash_target_files=False, 931 | task_name='first _create_file_once') 932 | 933 | task_graph.close() 934 | task_graph.join() 935 | del task_graph 936 | 937 | with open(target_path, 'a') as target_file: 938 | target_file.write('updated') 939 | 940 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 941 | task_graph.add_task( 942 | func=_create_file_once, 943 | args=(target_path, 'test'), 944 | target_path_list=[target_path], 945 | hash_target_files=False, 946 | task_name='first _create_file_once') 947 | 948 | task_graph.close() 949 | task_graph.join() 950 | del task_graph 951 | 952 | with open(target_path, 'r') as result_file: 953 | result_contents = result_file.read() 954 | self.assertEqual('testupdated', result_contents) 955 | 956 | def test_duplicate_call_modify_timestamp(self): 957 | """TaskGraph: test that duplicate call modified stamp recompute.""" 958 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 959 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 960 | task_graph.add_task( 961 | func=_create_file, 962 | args=(target_path, 'test'), 963 | target_path_list=[target_path], 964 | task_name='first _create_file') 965 | task_graph.close() 966 | task_graph.join() 967 | del task_graph 968 | 969 | with open(target_path, 'w') as target_file: 970 | target_file.write('test2') 971 | with open(target_path, 'r') as target_file: 972 | contents = target_file.read() 973 | self.assertEqual(contents, 'test2') 974 | 975 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 976 | task_graph.add_task( 977 | func=_create_file, 978 | args=(target_path, 'test'), 979 | target_path_list=[target_path], 980 | task_name='second _create_file') 981 | 982 | task_graph.close() 983 | task_graph.join() 984 | 985 | with open(target_path, 'r') as target_file: 986 | contents = target_file.read() 987 | self.assertEqual(contents, 'test') 988 
| 989 | def test_different_target_path_list(self): 990 | """TaskGraph: duplicate calls with different targets should fail.""" 991 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 992 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 993 | task_graph.add_task( 994 | func=_create_list_on_disk, 995 | args=('test', 1, target_path), 996 | target_path_list=[target_path], 997 | task_name='first _create_list_on_disk') 998 | 999 | with self.assertRaises(RuntimeError): 1000 | # make the same call but with different target path list 1001 | task_graph.add_task( 1002 | func=_create_list_on_disk, 1003 | args=('test', 1, target_path), 1004 | target_path_list=[target_path, 'test.txt'], 1005 | task_name='first _create_list_on_disk') 1006 | 1007 | task_graph.close() 1008 | task_graph.join() 1009 | 1010 | def test_terminated_taskgraph(self): 1011 | """TaskGraph: terminated task graph raises exception correctly.""" 1012 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 4) 1013 | _ = task_graph.add_task(func=_div_by_zero) 1014 | with self.assertRaises(ZeroDivisionError): 1015 | task_graph.join() 1016 | 1017 | with self.assertRaises(RuntimeError) as cm: 1018 | _ = task_graph.add_task(func=_div_by_zero) 1019 | expected_message = "add_task when Taskgraph is terminated" 1020 | actual_message = str(cm.exception) 1021 | self.assertTrue(expected_message in actual_message, actual_message) 1022 | 1023 | task_graph.close() 1024 | # try closing twice just to mess with coverage 1025 | task_graph.close() 1026 | 1027 | def test_type_list_error(self): 1028 | """TaskGraph: Task not passed to dependent task list.""" 1029 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1030 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 1031 | with self.assertRaises(ValueError) as cm: 1032 | task_graph.add_task( 1033 | func=_create_list_on_disk, 1034 | args=('test', 1, target_path), 1035 | target_path_list=[target_path], 1036 | dependent_task_list=[target_path], 1037 | task_name='first _create_list_on_disk') 1038 | expected_message = ( 1039 | "Objects passed to dependent task list that are not tasks") 1040 | actual_message = str(cm.exception) 1041 | self.assertTrue(expected_message in actual_message, actual_message) 1042 | 1043 | def test_target_list_error(self): 1044 | """TaskGraph: Path not passed to target list.""" 1045 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1046 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 1047 | with self.assertRaises(ValueError) as cm: 1048 | task_graph.add_task( 1049 | func=_create_list_on_disk, 1050 | args=('test', 1, target_path), 1051 | target_path_list=[1], 1052 | task_name='_create_list_on_disk') 1053 | expected_message = ( 1054 | "Values passed to target_path_list are not strings") 1055 | actual_message = str(cm.exception) 1056 | self.assertTrue(expected_message in actual_message, actual_message) 1057 | 1058 | def test_target_path_missing_file(self): 1059 | """TaskGraph: func runs, but missing target.""" 1060 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1061 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 1062 | not_target_path = os.path.join(self.workspace_dir, 'not_target.txt') 1063 | with self.assertRaises(RuntimeError) as cm: 1064 | task_graph.add_task( 1065 | func=_create_list_on_disk, 1066 | args=('test', 1, target_path), 1067 | target_path_list=[not_target_path], 1068 | task_name='_create_list_on_disk') 1069 | expected_message = "Missing expected target path results" 1070 | 
actual_message = str(cm.exception) 1071 | self.assertTrue(expected_message in actual_message, actual_message) 1072 | 1073 | def test_expected_path_list(self): 1074 | """TaskGraph: test expected path list matches actual path list.""" 1075 | def _create_file(target_path, content): 1076 | with open(target_path, 'w') as target_file: 1077 | target_file.write(content) 1078 | 1079 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1080 | # note it is important this is a relative path that does not 1081 | # contain the drive letter on Windows. 1082 | absolute_target_file_path = os.path.join( 1083 | self.workspace_dir, 'a.txt') 1084 | relative_path = os.path.relpath(absolute_target_file_path, 1085 | start=self.workspace_dir) 1086 | 1087 | _ = task_graph.add_task( 1088 | func=_create_file, 1089 | args=(relative_path, 'test value'), 1090 | target_path_list=[relative_path], 1091 | task_name='create file') 1092 | 1093 | task_graph.close() 1094 | task_graph.join() 1095 | del task_graph 1096 | 1097 | self.assertTrue('Ran without crashing!') 1098 | 1099 | def test_kwargs_hashed(self): 1100 | """TaskGraph: ensure kwargs are considered in determining id hash.""" 1101 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1102 | 1103 | task_a = task_graph.add_task( 1104 | func=_noop_function, 1105 | kwargs={ 1106 | 'content': ['this value: a']}, 1107 | task_name='noop a') 1108 | 1109 | task_b = task_graph.add_task( 1110 | func=_noop_function, 1111 | kwargs={ 1112 | 'content': ['this value b']}, 1113 | task_name='noop b') 1114 | 1115 | task_graph.close() 1116 | task_graph.join() 1117 | del task_graph 1118 | 1119 | self.assertNotEqual( 1120 | task_a._task_id_hash, task_b._task_id_hash, 1121 | "task ids should be different since the kwargs are different") 1122 | 1123 | def test_same_timestamp_and_value(self): 1124 | """TaskGraph: ensure identical files but filename are noticed.""" 1125 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1126 | 1127 | file_a_path = os.path.join(self.workspace_dir, 'file_a.txt') 1128 | file_b_path = os.path.join(self.workspace_dir, 'file_b.txt') 1129 | 1130 | with open(file_a_path, 'w') as file_a: 1131 | file_a.write('a') 1132 | with open(file_b_path, 'w') as file_b: 1133 | file_b.write('a') 1134 | 1135 | os.utime(file_a_path, (0, 0)) 1136 | os.utime(file_b_path, (0, 0)) 1137 | 1138 | task_a = task_graph.add_task( 1139 | func=_noop_function, 1140 | kwargs={ 1141 | 'path': file_a_path}, 1142 | task_name='noop a') 1143 | 1144 | task_b = task_graph.add_task( 1145 | func=_noop_function, 1146 | kwargs={ 1147 | 'path': file_b_path}, 1148 | task_name='noop b') 1149 | 1150 | task_graph.close() 1151 | task_graph.join() 1152 | del task_graph 1153 | 1154 | self.assertNotEqual( 1155 | task_a._task_id_hash, task_b._task_id_hash, 1156 | "task ids should be different since the filenames are different") 1157 | 1158 | def test_different_hash_different_file(self): 1159 | """TaskGraph: ensure identical files but filename are noticed.""" 1160 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1161 | target_file_path = os.path.join(self.workspace_dir, 'target.txt') 1162 | _ = task_graph.add_task( 1163 | func=_create_file, 1164 | args=(target_file_path, 'content'), 1165 | hash_algorithm='exists', 1166 | target_path_list=[target_file_path], 1167 | task_name='create content, hash with exists') 1168 | task_graph.close() 1169 | task_graph.join() 1170 | del task_graph 1171 | 1172 | with open(target_file_path, 'r') as target_file: 1173 | 
self.assertEqual(target_file.read(), 'content') 1174 | with open(target_file_path, 'w') as target_file: 1175 | target_file.write('overwritten') 1176 | 1177 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1178 | _ = task_graph.add_task( 1179 | func=_create_file, 1180 | args=(target_file_path, 'content'), 1181 | hash_algorithm='exists', 1182 | target_path_list=[target_file_path], 1183 | task_name='will not overwrite content, hash with exists') 1184 | task_graph.close() 1185 | task_graph.join() 1186 | del task_graph 1187 | 1188 | with open(target_file_path, 'r') as target_file: 1189 | self.assertEqual(target_file.read(), 'overwritten') 1190 | 1191 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1192 | _ = task_graph.add_task( 1193 | func=_create_file, 1194 | args=(target_file_path, 'content'), 1195 | hash_algorithm='md5', 1196 | target_path_list=[target_file_path], 1197 | task_name='create content again with new hash') 1198 | task_graph.close() 1199 | task_graph.join() 1200 | del task_graph 1201 | 1202 | with open(target_file_path, 'r') as target_file: 1203 | self.assertEqual(target_file.read(), 'content') 1204 | 1205 | def test_return_value_no_record(self): 1206 | """TaskGraph: test ``get`` raises exception if not set to record.""" 1207 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1208 | value_task = task_graph.add_task( 1209 | func=_noop_function, 1210 | store_result=False) 1211 | 1212 | # get wil raise a ValueError because store_result is not True 1213 | with self.assertRaises(ValueError) as cm: 1214 | _ = value_task.get() 1215 | expected_message = 'must set `store_result` to True in `add_task`' 1216 | actual_message = str(cm.exception) 1217 | self.assertTrue(expected_message in actual_message, actual_message) 1218 | 1219 | def test_return_value(self): 1220 | """TaskGraph: test that ``.get`` behavior works as expected.""" 1221 | if hasattr(_return_value_once, 'executed'): 1222 | del _return_value_once.executed 1223 | n_iterations = 3 1224 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0, 0) 1225 | for iteration_id in range(n_iterations): 1226 | transient_run = iteration_id == n_iterations-1 1227 | LOGGER.debug(iteration_id) 1228 | expected_value = 'a good value' 1229 | value_task = task_graph.add_task( 1230 | func=_return_value_once, 1231 | transient_run=transient_run, 1232 | store_result=True, 1233 | args=(expected_value,), 1234 | task_name=f'{expected_value} iter {iteration_id}') 1235 | value = value_task.get() 1236 | self.assertEqual(value, expected_value) 1237 | task_graph.close() 1238 | task_graph.join() 1239 | task_graph = None 1240 | 1241 | # reset run 1242 | del _return_value_once.executed 1243 | for iteration_id in range(n_iterations): 1244 | LOGGER.debug(iteration_id) 1245 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0, 0) 1246 | expected_value = 'transient run' 1247 | if iteration_id == 0: 1248 | value_task = task_graph.add_task( 1249 | func=_return_value_once, 1250 | transient_run=True, 1251 | store_result=True, 1252 | args=(expected_value,), 1253 | task_name='first re-run transient') 1254 | value = value_task.get() 1255 | self.assertEqual(value, expected_value) 1256 | task_graph.close() 1257 | task_graph.join() 1258 | else: 1259 | with self.assertRaises(RuntimeError): 1260 | value_task = task_graph.add_task( 1261 | func=_return_value_once, 1262 | transient_run=True, 1263 | store_result=True, 1264 | args=(expected_value,), 1265 | task_name=f'expected error {iteration_id}') 1266 | 1267 | value = value_task.get() 1268 | 
1269 | with self.assertRaises(RuntimeError): 1270 | task_graph.join() 1271 | 1272 | task_graph = None 1273 | 1274 | def test_malformed_taskgraph_database(self): 1275 | """TaskGraph: Test an empty task.""" 1276 | db_schema_test_list = [ 1277 | ''' 1278 | CREATE TABLE taskgraph_data ( 1279 | bad_name_1 TEXT NOT NULL, 1280 | bad_name_2 BLOB NOT NULL, 1281 | bad_name_3 BLOB NOT NULL); 1282 | ''', 1283 | ''' 1284 | CREATE TABLE taskgraph_data ( 1285 | task_reexecution_hash TEXT NOT NULL, 1286 | target_path_stats BLOB NOT NULL); 1287 | ''', 1288 | ''' 1289 | CREATE TABLE bad_table_name ( 1290 | task_reexecution_hash TEXT NOT NULL, 1291 | target_path_stats BLOB NOT NULL, 1292 | result BLOB NOT NULL, 1293 | PRIMARY KEY (task_reexecution_hash)); 1294 | ''' 1295 | ] 1296 | 1297 | for db_schema in db_schema_test_list: 1298 | database_path = os.path.join( 1299 | self.workspace_dir, taskgraph._TASKGRAPH_DATABASE_FILENAME) 1300 | if os.path.exists(database_path): 1301 | os.remove(database_path) 1302 | connection = sqlite3.connect(database_path) 1303 | cursor = connection.cursor() 1304 | cursor.executescript(db_schema) 1305 | cursor.close() 1306 | connection.commit() 1307 | connection.close() 1308 | 1309 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 1310 | _ = task_graph.add_task() 1311 | task_graph.close() 1312 | task_graph.join() 1313 | del task_graph 1314 | 1315 | expected_column_name_list = [ 1316 | 'task_reexecution_hash', 'target_path_stats', 'result'] 1317 | connection = sqlite3.connect(database_path) 1318 | cursor = connection.cursor() 1319 | cursor.execute('PRAGMA table_info(taskgraph_data)') 1320 | result = list(cursor.fetchall()) 1321 | cursor.close() 1322 | connection.commit() 1323 | connection.close() 1324 | for header_line in result: 1325 | column_name = header_line[1] 1326 | if column_name not in expected_column_name_list: 1327 | raise ValueError( 1328 | f'unexpected column name {column_name} in ' 1329 | 'taskgraph_data ') 1330 | self.assertEqual(len(result), len(expected_column_name_list)) 1331 | 1332 | def test_terminate_log(self): 1333 | """TaskGraph: test that the logger thread terminates on .join.""" 1334 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1, 5.0) 1335 | _ = task_graph.add_task() 1336 | task_graph.join() 1337 | 1338 | # logger should not terminate until after join, give it enough time 1339 | # to have a chance to close, but not so long the test hangs 1340 | task_graph._logging_monitor_thread.join(0.1) 1341 | self.assertTrue(task_graph._logging_monitor_thread.is_alive()) 1342 | task_graph._execution_monitor_thread.join(0.1) 1343 | self.assertTrue(task_graph._execution_monitor_thread.is_alive()) 1344 | 1345 | task_graph.close() 1346 | task_graph.join() 1347 | 1348 | # 5 seconds should be way too much time to expect the thread to join 1349 | task_graph._logging_monitor_thread.join(5) 1350 | self.assertFalse(task_graph._logging_monitor_thread.is_alive()) 1351 | task_graph._execution_monitor_thread.join(5) 1352 | self.assertFalse(task_graph._execution_monitor_thread.is_alive()) 1353 | 1354 | def test_dictionary_arguments(self): 1355 | """TaskGraph: test that large dictionary arguments behave well.""" 1356 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1357 | dict_arg = {} 1358 | x = {None: None} 1359 | for _ in range(10000): 1360 | dict_arg[_] = x 1361 | 1362 | def my_op(dict_arg): 1363 | pass 1364 | task_graph.add_task( 1365 | func=my_op, args=(), kwargs={'dict_arg': dict_arg}) 1366 | task_graph.join() 1367 | self.assertTrue(True, 'no memory 
error so everything is fine') 1368 | 1369 | def test_filter_non_files(self): 1370 | """TaskGraph: test internal filter non-files function.""" 1371 | from taskgraph.Task import _filter_non_files 1372 | from taskgraph.Task import _normalize_path 1373 | 1374 | # Test a passthrough 1375 | test_dict = { 1376 | 0: {'one': 0, 'two': 1, 'three': 2}, 1377 | 1: {'one': 1, 'two': 2, 'three': 3}, 1378 | 2: {'one': 2, 'two': 3, 'three': 4}} 1379 | self.assertEqual( 1380 | test_dict, _filter_non_files(test_dict, [], [], False)) 1381 | 1382 | # Test combination of files, not existing files, and flags in the 1383 | # call 1384 | test_file_a_exists = _normalize_path(os.path.join( 1385 | self.workspace_dir, 'exists_a.txt')) 1386 | pathlib.Path(test_file_a_exists).touch() 1387 | test_file_b_exists = _normalize_path(os.path.join( 1388 | self.workspace_dir, 'exists_b.txt')) 1389 | pathlib.Path(test_file_b_exists).touch() 1390 | test_file_not_a_exists = _normalize_path(os.path.join( 1391 | self.workspace_dir, 'does_not_exist_a.txt')) 1392 | test_file_not_b_exists = _normalize_path(os.path.join( 1393 | self.workspace_dir, 'does_not_exist_b.txt')) 1394 | 1395 | test_dict = { 1396 | 0: {'one': 0, 'two': 1, 'three': 2}, 1397 | 1: {'one': 1, 'two': 2, 'three': 3}, 1398 | 2: {'one': 2, 'two': 3, 'three': 4}, 1399 | 4: {'bar': test_file_not_a_exists}, 1400 | 5: {'foo': test_file_a_exists}, 1401 | 6: test_file_b_exists, 1402 | 7: test_file_not_b_exists, 1403 | 8: _normalize_path(self.workspace_dir)} 1404 | 1405 | expected_result_dict = { 1406 | 0: {'one': 0, 'two': 1, 'three': 2}, 1407 | 1: {'one': 1, 'two': 2, 'three': 3}, 1408 | 2: {'one': 2, 'two': 3, 'three': 4}, 1409 | 4: {'bar': test_file_not_a_exists}, 1410 | 5: {'foo': None}, 1411 | 6: test_file_b_exists, 1412 | 7: None, 1413 | 8: _normalize_path(self.workspace_dir)} 1414 | 1415 | self.assertEqual( 1416 | _filter_non_files( 1417 | test_dict, 1418 | [test_file_b_exists], 1419 | [test_file_not_b_exists], 1420 | True), 1421 | expected_result_dict) 1422 | 1423 | # and test same as above but don't keep directories: 1424 | expected_result_dict[8] = None 1425 | self.assertEqual( 1426 | _filter_non_files( 1427 | test_dict, 1428 | [test_file_b_exists], 1429 | [test_file_not_b_exists], 1430 | False), 1431 | expected_result_dict) 1432 | 1433 | def test_duplicate_task_hang_on_exit(self): 1434 | """TaskGraph: ensure duplicate tasks don't cause taskgraph to hang.""" 1435 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 1436 | target_path = os.path.join(self.workspace_dir, 'target.txt') 1437 | content = 'test' 1438 | for _ in range(10): 1439 | _ = task_graph.add_task( 1440 | func=_create_file, 1441 | args=(target_path, content), 1442 | target_path_list=[target_path], 1443 | task_name='create content') 1444 | task_graph.join() 1445 | task_graph.close() 1446 | 1447 | def test_history_rst_format(self): 1448 | """TaskGraph: ensure HISTORY.rst is correctly formatted.""" 1449 | # ensure there are no errors when checking the history file 1450 | history_filepath = os.path.join( 1451 | os.path.dirname(__file__), '..', 'HISTORY.rst') 1452 | subprocess.check_call(['rstcheck', history_filepath]) 1453 | 1454 | def test_mtime_mismatch(self): 1455 | """TaskGraph: ensure re-run when file mtimes don't match. 1456 | 1457 | This test addresses the issue described under the github issue 1458 | https://github.com/natcap/taskgraph/issues/70. 
1459 | """ 1460 | target_path = os.path.join(self.workspace_dir, 'target.txt') 1461 | 1462 | # SETUP: When we call 3 similar graphs in rapid succession, the file's 1463 | # mtime is not precise enough to detect that the file has actually 1464 | # changed. The specific conditions here are: 1465 | # * The task "test text" has already been computed once 1466 | # * The file written by the first "test text" is replaced by content 1467 | # of the same filesize (thus fooling the size part of sizetimestamp) 1468 | # * The graphs are executed fast enough that _is_precalculated's mtime 1469 | # check via math.isclose() couldn't detect the recalculation. 1470 | for content in ('test text', 'TEST TEXT', 'test text'): 1471 | task_graph = taskgraph.TaskGraph(self.workspace_dir, n_workers=-1) 1472 | _ = task_graph.add_task( 1473 | func=_create_file, 1474 | args=(target_path, content), 1475 | target_path_list=[target_path], 1476 | task_name='create content') 1477 | task_graph.join() 1478 | task_graph.close() 1479 | 1480 | with open(target_path) as target_file: 1481 | self.assertEqual(target_file.read(), content) 1482 | 1483 | 1484 | def Fail(n_tries, result_path): 1485 | """Create a function that fails after ``n_tries``.""" 1486 | def fail_func(): 1487 | fail_func._n_tries -= 1 1488 | if fail_func._n_tries > 0: 1489 | raise ValueError("Fail %d more times", fail_func._n_tries) 1490 | with open(result_path, 'w') as result_file: 1491 | result_file.write("finished!") 1492 | fail_func._n_tries = n_tries 1493 | 1494 | return fail_func 1495 | -------------------------------------------------------------------------------- /taskgraph/Task.py: -------------------------------------------------------------------------------- 1 | """Task graph framework.""" 2 | import collections 3 | import hashlib 4 | import inspect 5 | import logging 6 | import logging.handlers 7 | import multiprocessing 8 | import multiprocessing.pool 9 | import os 10 | import pathlib 11 | import pickle 12 | import pprint 13 | import queue 14 | import sqlite3 15 | import threading 16 | import time 17 | try: 18 | from importlib.metadata import PackageNotFoundError 19 | from importlib.metadata import version 20 | except ImportError: 21 | # importlib.metadata added to stdlib in 3.8 22 | from importlib_metadata import PackageNotFoundError 23 | from importlib_metadata import version 24 | 25 | import retrying 26 | 27 | try: 28 | __version__ = version('taskgraph') 29 | except PackageNotFoundError: 30 | # package is not installed; no metadata available 31 | pass 32 | 33 | 34 | _VALID_PATH_TYPES = (str, pathlib.PurePath) 35 | _TASKGRAPH_DATABASE_FILENAME = 'taskgraph_data.db' 36 | 37 | try: 38 | import psutil 39 | HAS_PSUTIL = True 40 | if psutil.WINDOWS: 41 | # Windows' scheduler doesn't use POSIX niceness. 42 | PROCESS_LOW_PRIORITY = psutil.BELOW_NORMAL_PRIORITY_CLASS 43 | else: 44 | # On POSIX, use system niceness. 45 | # -20 is high priority, 0 is normal priority, 19 is low priority. 46 | # 10 here is an arbitrary selection that's probably nice enough. 
47 | PROCESS_LOW_PRIORITY = 10 48 | except ImportError: 49 | HAS_PSUTIL = False 50 | 51 | LOGGER = logging.getLogger(__name__) 52 | _MAX_TIMEOUT = 5.0 # amount of time to wait for threads to terminate 53 | 54 | 55 | # We want our processing pool to be non-daemonic so that workers could use 56 | # multiprocessing if desired (daemonic processes cannot start new processes) 57 | # the following bit of code to do this was taken from 58 | # https://stackoverflow.com/a/8963618/42897 59 | class NoDaemonProcess(multiprocessing.Process): 60 | """Make 'daemon' attribute always return False.""" 61 | 62 | @property 63 | def daemon(self): 64 | """Return False indicating not a daemon process.""" 65 | return False 66 | 67 | @daemon.setter 68 | def daemon(self, value): 69 | """Do not allow daemon value to be overridden.""" 70 | pass 71 | 72 | 73 | class NoDaemonContext(type(multiprocessing.get_context('spawn'))): 74 | """From https://stackoverflow.com/a/8963618/42897. 75 | 76 | "As the current implementation of multiprocessing [3.7+] has been 77 | extensively refactored to be based on contexts, we need to provide a 78 | NoDaemonContext class that has our NoDaemonProcess as attribute. 79 | [NonDaemonicPool] will then use that context instead of the default 80 | one." "spawn" is chosen as default since that is the default and only 81 | context option for Windows and is the default option for Mac OS as 82 | well since 3.8. 83 | 84 | """ 85 | 86 | Process = NoDaemonProcess 87 | 88 | 89 | class NonDaemonicPool(multiprocessing.pool.Pool): 90 | """NonDaemonic Process Pool.""" 91 | 92 | def __init__(self, *args, **kwargs): 93 | """Invoke super to set the context of Pool class explicitly.""" 94 | kwargs['context'] = NoDaemonContext() 95 | super(NonDaemonicPool, self).__init__(*args, **kwargs) 96 | 97 | 98 | def _null_func(): 99 | """Use when func=None on add_task.""" 100 | return None 101 | 102 | 103 | def _initialize_logging_to_queue(logging_queue): 104 | """Add a synchronized queue to a new process. 105 | 106 | This is intended to be called as an initialization function to 107 | ``multiprocessing.Pool`` to establish logging from a Pool worker to the 108 | main python process via a multiprocessing Queue. 109 | 110 | Args: 111 | logging_queue (multiprocessing.Queue): The queue to use for passing 112 | log records back to the main process. 113 | 114 | Returns: 115 | None 116 | 117 | """ 118 | root_logger = logging.getLogger() 119 | 120 | # By the time this function is called, `root_logger` has a copy of all of 121 | # the logging handlers registered to it within the parent process, which 122 | # leads to duplicate logging in some cases. By removing all of the 123 | # handlers here, we ensure that log messages can only be passed back to the 124 | # parent process by the `logging_queue`, where they will be handled.
125 | for handler in root_logger.handlers[:]: 126 | root_logger.removeHandler(handler) 127 | 128 | root_logger.setLevel(logging.NOTSET) 129 | handler = logging.handlers.QueueHandler(logging_queue) 130 | root_logger.addHandler(handler) 131 | 132 | 133 | def _logging_queue_monitor(logging_queue): 134 | """Monitor ``logging_queue`` for messages and pass them to ``logger``.""" 135 | LOGGER.debug('Starting logging worker') 136 | while True: 137 | record = logging_queue.get() 138 | if record is None: 139 | break 140 | logger = logging.getLogger(record.name) 141 | logger.handle(record) 142 | LOGGER.debug('_logging_queue_monitor shutting down') 143 | 144 | 145 | def _create_taskgraph_table_schema(taskgraph_database_path): 146 | """Create the database if needed, or check compatibility and recreate it. 147 | 148 | Args: 149 | taskgraph_database_path (str): path to an existing database or desired 150 | location of a new database. 151 | 152 | Returns: 153 | None. 154 | 155 | """ 156 | sql_create_projects_table_script = ( 157 | """ 158 | CREATE TABLE taskgraph_data ( 159 | task_reexecution_hash TEXT NOT NULL, 160 | target_path_stats BLOB NOT NULL, 161 | result BLOB NOT NULL, 162 | PRIMARY KEY (task_reexecution_hash) 163 | ); 164 | CREATE TABLE global_variables ( 165 | key TEXT NOT NULL, 166 | value BLOB, 167 | PRIMARY KEY (key) 168 | ); 169 | """) 170 | 171 | table_valid = True 172 | expected_table_column_name_map = { 173 | 'taskgraph_data': [ 174 | 'task_reexecution_hash', 'target_path_stats', 'result'], 175 | 'global_variables': ['key', 'value']} 176 | if os.path.exists(taskgraph_database_path): 177 | try: 178 | # check that the tables exist and the column names are as expected 179 | for expected_table_name in expected_table_column_name_map: 180 | table_result = _execute_sqlite( 181 | ''' 182 | SELECT name 183 | FROM sqlite_master 184 | WHERE type='table' AND name=? 185 | ''', taskgraph_database_path, 186 | argument_list=[expected_table_name], 187 | mode='read_only', execute='execute', fetch='all') 188 | if not table_result: 189 | raise ValueError(f'missing table {expected_table_name}') 190 | 191 | # this query returns a list of results of the form 192 | # [(0, 'task_reexecution_hash', 'TEXT', 1, None, 1), ... ] 193 | # we'll just check that the header names are the same, no 194 | # need to be super aggressive, also need to construct the 195 | # PRAGMA string directly since it doesn't take arguments 196 | table_info_result = _execute_sqlite( 197 | f'PRAGMA table_info({expected_table_name})', 198 | taskgraph_database_path, mode='read_only', 199 | execute='execute', fetch='all') 200 | 201 | expected_column_names = expected_table_column_name_map[ 202 | expected_table_name] 203 | header_count = 0 204 | for header_line in table_info_result: 205 | column_name = header_line[1] 206 | if column_name not in expected_column_names: 207 | raise ValueError( 208 | f'unexpected column {column_name} in table ' 209 | f'{expected_table_name}') 210 | header_count += 1 211 | if header_count < len(expected_column_names): 212 | raise ValueError( 213 | f'found only {header_count} of an expected ' 214 | f'{len(expected_column_names)} columns in table ' 215 | f'{expected_table_name}') 216 | if not table_info_result: 217 | raise ValueError(f'missing table {expected_table_name}') 218 | except Exception: 219 | # catch all "Exception"s because anything that goes wrong while 220 | # checking the database should be considered a bad database and we 221 | # should make a new one.
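# deleting the database file below forces the `if not table_valid:` branch further down to rebuild the schema from scratch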
222 | LOGGER.exception( 223 | f'{taskgraph_database_path} exists, but is incompatible ' 224 | 'somehow. Deleting and making a new one.') 225 | os.remove(taskgraph_database_path) 226 | table_valid = False 227 | else: 228 | # table does not exist 229 | table_valid = False 230 | 231 | if not table_valid: 232 | # create the base table 233 | _execute_sqlite( 234 | sql_create_projects_table_script, taskgraph_database_path, 235 | mode='modify', execute='script') 236 | # set the database version 237 | _execute_sqlite( 238 | ''' 239 | INSERT OR REPLACE INTO global_variables 240 | VALUES ("version", ?) 241 | ''', taskgraph_database_path, mode='modify', 242 | argument_list=(__version__,)) 243 | 244 | 245 | class TaskGraph(object): 246 | """Encapsulates the worker and tasks states for parallel processing.""" 247 | 248 | def __init__( 249 | self, taskgraph_cache_dir_path, n_workers, 250 | reporting_interval=None): 251 | """Create a task graph. 252 | 253 | Creates an object for building task graphs, executing them, 254 | parallelizing independent work notes, and avoiding repeated calls. 255 | 256 | Args: 257 | taskgraph_cache_dir_path (string): path to a directory that 258 | either contains a taskgraph cache from a previous instance or 259 | will create a new one if none exists. 260 | n_workers (int): number of parallel *subprocess* workers to allow 261 | during task graph execution. If set to 0, don't use 262 | subprocesses. If set to <0, use only the main thread for any 263 | execution and scheduling. In the case of the latter, 264 | ``add_task`` will be a blocking call. 265 | reporting_interval (scalar): if not None, report status of task 266 | graph every ``reporting_interval`` seconds. 267 | 268 | """ 269 | try: 270 | os.makedirs(taskgraph_cache_dir_path) 271 | except OSError: 272 | LOGGER.debug( 273 | "%s already exists, no need to make it", 274 | taskgraph_cache_dir_path) 275 | 276 | self._taskgraph_cache_dir_path = taskgraph_cache_dir_path 277 | 278 | # this variable is used to print accurate representation of how many 279 | # tasks have been completed in the logging output. 280 | self._added_task_count = 0 281 | 282 | # use this to keep track of all the tasks added to the graph by their 283 | # task hashes. Used to determine if an identical task has been added 284 | # to the taskgraph during `add_task` 285 | self._task_hash_map = dict() 286 | 287 | # use this to keep track of all the tasks added to the graph by their 288 | # task names. Used to map a unique task name to the task object it 289 | # represents 290 | self._task_name_map = dict() 291 | 292 | # used to remember if task_graph has been closed 293 | self._closed = False 294 | 295 | # keep track if the task graph has been forcibly terminated 296 | self._terminated = False 297 | 298 | # if n_workers > 0 this will be a multiprocessing pool used to execute 299 | # the __call__ functions in Tasks 300 | self._worker_pool = None 301 | 302 | # If n_workers > 0 this will be a threading.Thread used to propagate 303 | # log records from another process into the current process. 304 | self._logging_monitor_thread = None 305 | 306 | # If n_workers > 0, this will be a multiprocessing.Queue used to pass 307 | # log records from the process pool to the parent process. 308 | self._logging_queue = None 309 | 310 | # keeps track of the tasks currently being processed for logging. 
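# each entry is a (task_name, start_time) tuple appended by _task_executor and read back by _execution_monitor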
311 | self._active_task_list = [] 312 | 313 | # keeps track of how many tasks have all their dependencies satisfied 314 | # and are waiting for a worker 315 | self._task_waiting_count = 0 316 | 317 | # this might hold the threads to execute tasks if n_workers >= 0 318 | self._task_executor_thread_list = [] 319 | 320 | # executor threads wait on this event that gets set when new tasks are 321 | # added to the queue. If the queue is empty an executor will clear 322 | # the event to halt other executors 323 | self._executor_ready_event = threading.Event() 324 | 325 | # tasks that have all their dependencies satisfied go in this queue 326 | # and can be executed immediately 327 | self._task_ready_priority_queue = queue.PriorityQueue() 328 | 329 | # maps a list of task names that need to be executed before the key 330 | # task can 331 | self._task_dependent_map = collections.defaultdict(set) 332 | 333 | # maps a list of task names that are dependent to a task 334 | self._dependent_task_map = collections.defaultdict(set) 335 | 336 | # tasks that complete are added to this set 337 | self._completed_task_names = set() 338 | 339 | self._task_database_path = os.path.join( 340 | self._taskgraph_cache_dir_path, _TASKGRAPH_DATABASE_FILENAME) 341 | 342 | # create new table if needed 343 | _create_taskgraph_table_schema(self._task_database_path) 344 | 345 | # check the version of the database and warn if a problem 346 | local_version = _execute_sqlite( 347 | ''' 348 | SELECT value 349 | FROM global_variables 350 | WHERE key=? 351 | ''', self._task_database_path, mode='read_only', 352 | fetch='one', argument_list=['version'])[0] 353 | if local_version != __version__: 354 | LOGGER.warning( 355 | f'the database located at {self._task_database_path} was ' 356 | f'created with TaskGraph version {local_version} but the ' 357 | f'current version is {__version__}') 358 | 359 | # no need to set up schedulers if n_workers is single threaded 360 | self._n_workers = n_workers 361 | if n_workers < 0: 362 | return 363 | 364 | # start concurrent reporting of taskgraph if reporting interval is set 365 | self._reporting_interval = reporting_interval 366 | if reporting_interval is not None: 367 | self._execution_monitor_wait_event = threading.Event() 368 | self._execution_monitor_thread = threading.Thread( 369 | target=self._execution_monitor, 370 | args=(self._execution_monitor_wait_event,), 371 | name='_execution_monitor') 372 | # make it a daemon so we don't have to figure out how to 373 | # close it when execution complete 374 | self._execution_monitor_thread.daemon = True 375 | self._execution_monitor_thread.start() 376 | 377 | # launch executor threads 378 | self._executor_thread_count = max(0, n_workers) 379 | for thread_id in range(max(1, n_workers)): 380 | task_executor_thread = threading.Thread( 381 | target=self._task_executor, 382 | name='task_executor_%s' % thread_id) 383 | # make daemons in case there's a catastrophic error the main 384 | # thread won't hang 385 | task_executor_thread.daemon = True 386 | task_executor_thread.start() 387 | self._task_executor_thread_list.append(task_executor_thread) 388 | 389 | # set up multiprocessing if n_workers > 0 390 | if n_workers > 0: 391 | self._logging_queue = multiprocessing.Queue() 392 | self._worker_pool = NonDaemonicPool( 393 | n_workers, initializer=_initialize_logging_to_queue, 394 | initargs=(self._logging_queue,)) 395 | self._logging_monitor_thread = threading.Thread( 396 | target=_logging_queue_monitor, 397 | args=(self._logging_queue,)) 398 | 399 | 
self._logging_monitor_thread.daemon = True 400 | self._logging_monitor_thread.start() 401 | if HAS_PSUTIL: 402 | parent = psutil.Process() 403 | parent.nice(PROCESS_LOW_PRIORITY) 404 | for child in parent.children(): 405 | try: 406 | child.nice(PROCESS_LOW_PRIORITY) 407 | except psutil.NoSuchProcess: 408 | LOGGER.warning( 409 | "NoSuchProcess exception encountered when trying " 410 | "to nice %s. This might be a bug in `psutil` so " 411 | "it should be okay to ignore.") 412 | 413 | def __del__(self): 414 | """Ensure all threads have been joined for cleanup.""" 415 | self._terminate() 416 | 417 | def _task_executor(self): 418 | """Worker that executes Tasks that have satisfied dependencies.""" 419 | while True: 420 | # this event blocks until the task graph has signaled it wants 421 | # the executors to read the state of the queue or a stop event or 422 | # a timeout exceeded just to protect against a worst case deadlock 423 | self._executor_ready_event.wait(_MAX_TIMEOUT) 424 | # this lock synchronizes changes between the queue and 425 | # executor_ready_event 426 | if self._terminated: 427 | LOGGER.debug( 428 | "taskgraph is terminated, ending %s", 429 | threading.current_thread()) 430 | break 431 | task = None 432 | try: 433 | task = self._task_ready_priority_queue.get_nowait() 434 | self._task_waiting_count -= 1 435 | task_name_time_tuple = (task.task_name, time.time()) 436 | self._active_task_list.append(task_name_time_tuple) 437 | except queue.Empty: 438 | # no tasks are waiting could be because the taskgraph is 439 | # closed or because the queue is just empty. 440 | if (self._closed and len(self._completed_task_names) == 441 | self._added_task_count): 442 | # the graph is closed and there are as many completed tasks 443 | # as there are added tasks, so none left. The executor can 444 | # terminate. 445 | self._executor_thread_count -= 1 446 | if self._executor_thread_count == 0 and self._worker_pool: 447 | # only the last executor should terminate the worker 448 | # pool, because otherwise who knows if it's still 449 | # executing anything 450 | try: 451 | self._worker_pool.close() 452 | self._worker_pool.terminate() 453 | self._worker_pool = None 454 | self._terminate() 455 | except Exception: 456 | # there's the possibility for a race condition here 457 | # where another thread already closed the worker 458 | # pool, so just guard against it 459 | LOGGER.warning('worker pool was already closed') 460 | LOGGER.debug( 461 | "no tasks are pending and taskgraph closed, normally " 462 | "terminating executor %s." % threading.current_thread()) 463 | break 464 | else: 465 | # there's still the possibility for work to be added or 466 | # still work in the pipeline 467 | self._executor_ready_event.clear() 468 | if task is None: 469 | continue 470 | try: 471 | task._call() 472 | task.task_done_executing_event.set() 473 | except Exception as e: 474 | # An error occurred on a call, terminate the taskgraph 475 | task.exception_object = e 476 | LOGGER.exception( 477 | 'A taskgraph _task_executor failed on Task ' 478 | '%s. 
Terminating taskgraph.', task.task_name) 479 | self._terminate() 480 | break 481 | 482 | LOGGER.debug( 483 | "task %s is complete, checking to see if any dependent " 484 | "tasks can be executed now", task.task_name) 485 | self._completed_task_names.add(task.task_name) 486 | self._active_task_list.remove(task_name_time_tuple) 487 | for waiting_task_name in ( 488 | self._task_dependent_map[task.task_name]): 489 | # remove `task` from the set of tasks that 490 | # `waiting_task` was waiting on. 491 | self._dependent_task_map[waiting_task_name].remove( 492 | task.task_name) 493 | # if there aren't any left, we can push `waiting_task` 494 | # to the work queue 495 | if not self._dependent_task_map[waiting_task_name]: 496 | # if we removed the last task we can put it to the 497 | # work queue 498 | LOGGER.debug( 499 | "Task %s is ready for processing, sending to " 500 | "task_ready_priority_queue", 501 | waiting_task_name) 502 | del self._dependent_task_map[waiting_task_name] 503 | self._task_ready_priority_queue.put( 504 | self._task_name_map[waiting_task_name]) 505 | self._task_waiting_count += 1 506 | # indicate to executors there is work to do 507 | self._executor_ready_event.set() 508 | del self._task_dependent_map[task.task_name] 509 | # this extra set ensures that recently emptied map won't get 510 | # ignored by the executor if no work is left to do and the graph is 511 | # closed 512 | self._executor_ready_event.set() 513 | LOGGER.debug("task %s done processing", task.task_name) 514 | LOGGER.debug("task executor shutting down") 515 | 516 | def add_task( 517 | self, func=None, args=None, kwargs=None, task_name=None, 518 | target_path_list=None, ignore_path_list=None, 519 | hash_target_files=True, dependent_task_list=None, 520 | ignore_directories=True, priority=0, 521 | hash_algorithm='sizetimestamp', transient_run=False, 522 | store_result=False): 523 | """Add a task to the task graph. 524 | 525 | Args: 526 | func (callable): target function 527 | args (list): argument list for ``func`` 528 | kwargs (dict): keyword arguments for ``func`` 529 | target_path_list (list): if not None, a list of file paths that 530 | are expected to be output by ``func``. If any of these paths 531 | don't exist, or their timestamp is earlier than an input 532 | arg or work token, func will be executed. 533 | 534 | If ``None``, any identical calls to ``add_task`` will be 535 | skipped for the TaskGraph object. A future TaskGraph object 536 | will re-run an exact call once for its lifetime. The reasoning 537 | is that it is likely the user wishes to run a target-less task 538 | once for the lifetime of a task-graph, but would otherwise not 539 | have a transient result that could be re-used in a future 540 | instantiation of a TaskGraph object. 541 | 542 | task_name (string): if not None, this value is used to identify 543 | the task in logging messages. 544 | ignore_path_list (list): list of file paths that could be in 545 | args/kwargs that should be ignored when considering timestamp 546 | hashes. 547 | hash_target_files (bool): If True, the hash value of the target 548 | files will be recorded to determine if a future run of this 549 | function is precalculated. If False, this function only notes 550 | the existence of the target files before determining if 551 | a function call is precalculated. 552 | dependent_task_list (list): list of ``Task``s that this task must 553 | ``join`` before executing. 
554 | ignore_directories (boolean): if the existence/timestamp of any 555 | directories discovered in args or kwargs is used as part 556 | of the work token hash. 557 | priority (numeric): the priority of a task is considered when 558 | there is more than one task whose dependencies have been 559 | met and are ready for scheduling. Tasks are inserted into the 560 | work queue in order of decreasing priority value 561 | (priority 10 is higher than priority 1). This value can be 562 | positive, negative, and/or floating point. 563 | hash_algorithm (string): either a hash function id that 564 | exists in hashlib.algorithms_available, 'sizetimestamp', 565 | or 'exists'. Any paths to actual files in the arguments will 566 | be digested with this algorithm. If value is 'sizetimestamp' 567 | the digest will only use the normed path, size, and timestamp 568 | of any files found in the arguments. This value is used when 569 | determining whether a task is precalculated or its target 570 | files can be copied to an equivalent task. Note if 571 | ``hash_algorithm`` is 'sizetimestamp' the task will require the 572 | same base path files to determine equality. If it is a 573 | ``hashlib`` algorithm only file contents will be considered. 574 | If the value is 'exists' the only test for file equivalence 575 | will be if it exists on disk (True) or not (False). 576 | transient_run (bool): if True, this Task will be reexecuted 577 | even if it was successfully executed in a previous TaskGraph 578 | instance. If False, this Task will be skipped if it was 579 | executed successfully in a previous TaskGraph instance. One 580 | might wish to set `transient_run` to True on a Task that does 581 | some sort of initialization that's needed every time a 582 | TaskGraph is instantiated. Perhaps to acquire dynamic resources 583 | or authenticate permissions. 584 | store_result (bool): If True, the result of ``func`` will be stored 585 | in the TaskGraph database and retrievable with a call to 586 | ``.get()`` on a ``Task`` object. 587 | 588 | Returns: 589 | Task which was just added to the graph or an existing Task that 590 | has the same signature and has already been added to the 591 | TaskGraph. 592 | 593 | Raises: 594 | ValueError if objects are passed to the dependent task list that 595 | are not Tasks. 596 | ValueError if ``add_task`` is invoked after the ``TaskGraph`` is 597 | closed. 598 | RuntimeError if ``add_task`` is invoked after ``TaskGraph`` has 599 | reached a terminate state. 600 | 601 | """ 602 | try: 603 | if self._terminated: 604 | raise RuntimeError( 605 | "add_task when Taskgraph is terminated.") 606 | if self._closed: 607 | raise ValueError( 608 | "The task graph is closed and cannot accept more " 609 | "tasks.") 610 | self._added_task_count += 1 611 | if args is None: 612 | args = [] 613 | if kwargs is None: 614 | kwargs = {} 615 | if task_name is None: 616 | task_name = 'UNNAMED TASK' 617 | if dependent_task_list is None: 618 | dependent_task_list = [] 619 | if target_path_list is None: 620 | target_path_list = [] 621 | if ignore_path_list is None: 622 | ignore_path_list = [] 623 | if func is None: 624 | func = _null_func 625 | 626 | # this is a pretty common error to accidentally not pass a 627 | # Task to the dependent task list. 
628 | if any(not isinstance(task, Task) 629 | for task in dependent_task_list): 630 | raise ValueError( 631 | "Objects passed to dependent task list that are not " 632 | "tasks: %s", dependent_task_list) 633 | 634 | task_name = '%s (%d)' % (task_name, len(self._task_hash_map)) 635 | new_task = Task( 636 | task_name, func, args, kwargs, target_path_list, 637 | ignore_path_list, hash_target_files, ignore_directories, 638 | transient_run, self._worker_pool, 639 | priority, hash_algorithm, store_result, 640 | self._task_database_path) 641 | 642 | self._task_name_map[new_task.task_name] = new_task 643 | # it may be this task was already created in an earlier call, 644 | # use that object in its place 645 | if new_task in self._task_hash_map: 646 | duplicate_task = self._task_hash_map[new_task] 647 | new_task_target_set = set(new_task._target_path_list) 648 | duplicate_task_target_set = set( 649 | duplicate_task._target_path_list) 650 | if new_task_target_set == duplicate_task_target_set: 651 | LOGGER.warning( 652 | "A duplicate task was submitted: %s original: %s", 653 | new_task, self._task_hash_map[new_task]) 654 | self._added_task_count -= 1 655 | return duplicate_task 656 | disjoint_target_set = ( 657 | new_task_target_set.symmetric_difference( 658 | duplicate_task_target_set)) 659 | if len(disjoint_target_set) == ( 660 | len(new_task_target_set) + 661 | len(duplicate_task_target_set)): 662 | if duplicate_task not in dependent_task_list: 663 | LOGGER.info( 664 | "A task was created that had an identical " 665 | "args signature sans target paths, but a " 666 | "different target_path_list of the same " 667 | "length. To avoid recomputation, dynamically " 668 | "adding previous Task (%s) as a dependent " 669 | "task to this one (%s).", 670 | duplicate_task.task_name, task_name) 671 | dependent_task_list = ( 672 | dependent_task_list + [duplicate_task]) 673 | else: 674 | raise RuntimeError( 675 | "A task was created that has the same arguments " 676 | "as another task, but only partially different " 677 | "expected target paths. This runs the risk of " 678 | "unpredictably overwriting output so treating as " 679 | "a runtime error: submitted task: %s, existing " 680 | "task: %s" % (new_task, duplicate_task)) 681 | self._task_hash_map[new_task] = new_task 682 | if self._n_workers < 0: 683 | # call directly if single threaded 684 | new_task._call() 685 | else: 686 | # determine if task is ready or is dependent on other 687 | # tasks 688 | LOGGER.debug( 689 | "multithreaded: %s sending to new task queue.", 690 | task_name) 691 | outstanding_dep_task_name_list = [ 692 | dep_task.task_name for dep_task in dependent_task_list 693 | if dep_task.task_name 694 | not in self._completed_task_names] 695 | if not outstanding_dep_task_name_list: 696 | LOGGER.debug( 697 | "sending task %s right away", new_task.task_name) 698 | self._task_ready_priority_queue.put(new_task) 699 | self._task_waiting_count += 1 700 | self._executor_ready_event.set() 701 | else: 702 | # there are unresolved tasks that the waiting 703 | # process scheduler has not been notified of. 704 | # Record dependencies. 
705 | for dep_task_name in outstanding_dep_task_name_list: 706 | # record tasks that are dependent on dep_task_name 707 | self._task_dependent_map[dep_task_name].add( 708 | new_task.task_name) 709 | # record tasks that new_task depends on 710 | self._dependent_task_map[new_task.task_name].add( 711 | dep_task_name) 712 | return new_task 713 | 714 | except Exception: 715 | # something went wrong, shut down the taskgraph 716 | LOGGER.exception( 717 | "Something went wrong when adding task %s, " 718 | "terminating taskgraph.", task_name) 719 | self._terminate() 720 | raise 721 | 722 | def _execution_monitor(self, monitor_wait_event): 723 | """Log state of taskgraph every ``self._reporting_interval`` seconds. 724 | 725 | Args: 726 | monitor_wait_event (threading.Event): used to sleep the monitor 727 | for``self._reporting_interval`` seconds, or to wake up to 728 | terminate for shutdown. 729 | 730 | Returns: 731 | None. 732 | 733 | """ 734 | start_time = time.time() 735 | while True: 736 | if self._terminated: 737 | break 738 | active_task_count = len(self._active_task_list) 739 | queue_length = self._task_ready_priority_queue.qsize() 740 | active_task_message = '\n'.join( 741 | ['\t%s: executing for %.2fs' % ( 742 | task_name, time.time() - task_time) 743 | for task_name, task_time in self._active_task_list]) 744 | 745 | completed_tasks = len(self._completed_task_names) 746 | percent_complete = 0.0 747 | if self._added_task_count > 0: 748 | percent_complete = 100.0 * ( 749 | float(completed_tasks) / self._added_task_count) 750 | 751 | LOGGER.info( 752 | "\n\ttaskgraph execution status: tasks added: %d \n" 753 | "\ttasks complete: %d (%.1f%%) \n" 754 | "\ttasks waiting for a free worker: %d (qsize: %d)\n" 755 | "\ttasks executing (%d): graph is %s\n%s", 756 | self._added_task_count, completed_tasks, percent_complete, 757 | self._task_waiting_count, queue_length, active_task_count, 758 | 'closed' if self._closed else 'open', 759 | active_task_message) 760 | 761 | monitor_wait_event.wait( 762 | timeout=self._reporting_interval - ( 763 | (time.time() - start_time)) % self._reporting_interval) 764 | LOGGER.debug("_execution monitor shutting down") 765 | 766 | def join(self, timeout=None): 767 | """Join all threads in the graph. 768 | 769 | Args: 770 | timeout (float): if not none will attempt to join subtasks with 771 | this value. If a subtask times out, the whole function will 772 | timeout. 773 | 774 | Returns: 775 | True if successful join, False if timed out. 776 | 777 | """ 778 | LOGGER.debug("joining taskgraph") 779 | if self._n_workers < 0: 780 | # Join() is meaningless since tasks execute synchronously. 781 | LOGGER.debug( 782 | 'n_workers: %s; join is vacuously true' % self._n_workers) 783 | return True 784 | 785 | try: 786 | LOGGER.debug("attempting to join threads") 787 | timedout = False 788 | for task in self._task_hash_map.values(): 789 | LOGGER.debug("attempting to join task %s", task.task_name) 790 | # task.join() will raise any exception that resulted from the 791 | # task's execution. 
792 | timedout = not task.join(timeout) 793 | LOGGER.debug("task %s was joined", task.task_name) 794 | # if the last task timed out then we want to timeout for all 795 | # of the task graph 796 | if timedout: 797 | LOGGER.info( 798 | "task %s timed out in graph join", task.task_name) 799 | return False 800 | if self._closed: 801 | # Close down the taskgraph; ok if already terminated 802 | self._executor_ready_event.set() 803 | self._terminate() 804 | return True 805 | except Exception: 806 | # If there's an exception on a join it means that a task failed 807 | # to execute correctly. Print a helpful message then terminate the 808 | # taskgraph object. 809 | LOGGER.exception( 810 | "Exception raised when joining task %s. It's possible " 811 | "that this task did not cause the exception, rather another " 812 | "exception terminated the task_graph. Check the log to see " 813 | "if there are other exceptions.", task) 814 | self._terminate() 815 | raise 816 | 817 | def close(self): 818 | """Prevent future tasks from being added to the work queue.""" 819 | LOGGER.debug("Closing taskgraph.") 820 | if self._closed: 821 | return 822 | self._closed = True 823 | # this wakes up all the executors and any that wouldn't otherwise 824 | # have work to do will see there are no tasks left and terminate 825 | self._executor_ready_event.set() 826 | LOGGER.debug("taskgraph closed") 827 | 828 | def _terminate(self): 829 | """Immediately terminate remaining task graph computation.""" 830 | LOGGER.debug( 831 | "Invoking terminate. already terminated? %s", self._terminated) 832 | if self._terminated: 833 | return 834 | try: 835 | # it's possible the global state is not well defined, so just in 836 | # case we'll wrap it all up in a try/except 837 | self._terminated = True 838 | if self._executor_ready_event is not None: 839 | # alert executors to check that _terminated is True 840 | self._executor_ready_event.set() 841 | LOGGER.debug("shutting down workers") 842 | if self._worker_pool is not None: 843 | self._worker_pool.close() 844 | self._worker_pool.terminate() 845 | self._worker_pool = None 846 | 847 | # This will terminate the logging worker 848 | if self._logging_queue is not None: 849 | self._logging_queue.put(None) 850 | 851 | # This will cause all 'join'ed Tasks to join. 852 | if self._n_workers >= 0: 853 | self._executor_ready_event.set() 854 | if self._reporting_interval is not None: 855 | self._execution_monitor_wait_event.set() 856 | for task in self._task_hash_map.values(): 857 | # shortcut to get the tasks to mark as joined 858 | task.task_done_executing_event.set() 859 | 860 | LOGGER.debug('taskgraph terminated') 861 | except Exception: 862 | LOGGER.exception( 863 | 'ignoring an exception that occurred during _terminate') 864 | 865 | 866 | class Task(object): 867 | """Encapsulates work/task state for multiprocessing.""" 868 | 869 | def __init__( 870 | self, task_name, func, args, kwargs, target_path_list, 871 | ignore_path_list, hash_target_files, ignore_directories, 872 | transient_run, worker_pool, priority, hash_algorithm, 873 | store_result, task_database_path): 874 | """Make a Task. 875 | 876 | Args: 877 | task_name (int): unique task id from the task graph. 878 | func (function): a function that takes the argument list 879 | ``args`` 880 | args (tuple): a list of arguments to pass to ``func``. Can be 881 | None. 882 | kwargs (dict): keyword arguments to pass to ``func``. Can be 883 | None. 884 | target_path_list (list): a list of filepaths that this task 885 | should generate. 
886 | ignore_path_list (list): list of file paths that could be in 887 | args/kwargs that should be ignored when considering timestamp 888 | hashes. 889 | hash_target_files (bool): If True, the hash value of the target 890 | files will be recorded to determine if a future run of this 891 | function is precalculated. If False, this function only notes 892 | the existence of the target files before determining if 893 | a function call is precalculated. 894 | ignore_directories (bool): if the existence/timestamp of any 895 | directories discovered in args or kwargs is used as part 896 | of the work token hash. 897 | transient_run (bool): if True a call with an identical execution 898 | hash will be reexecuted on a subsequent instantiation of a 899 | future TaskGraph object. If a duplicate task is submitted 900 | to the same object it will not be re-run in any scenario. 901 | Otherwise if False, subsequent tasks with an identical 902 | execution hash will be skipped. 903 | worker_pool (multiprocessing.Pool): if not None, is a 904 | multiprocessing pool that can be used for ``_call`` execution. 905 | priority (numeric): the priority of a task is considered when 906 | there is more than one task whose dependencies have been 907 | met and are ready for scheduling. Tasks are inserted into the 908 | work queue in order of decreasing priority. This value can be 909 | positive, negative, and/or floating point. 910 | hash_algorithm (string): either a hash function id that 911 | exists in hashlib.algorithms_available, 'sizetimestamp', 912 | or 'exists'. Any paths to actual files in the arguments will 913 | be digested with this algorithm. If value is 'sizetimestamp' 914 | the digest will only use the normed path, size, and timestamp 915 | of any files found in the arguments. If 'exists', a file will 916 | be considered unchanged as long as a file with the same 917 | filename exists on disk. 918 | store_result (bool): If True, the result of ``func`` will be 919 | stored in the TaskGraph database and retrievable with a call 920 | to ``.get()`` on the Task object. 921 | task_database_path (str): path to an SQLite database that has a 922 | table named "taskgraph_data" with the three fields: 923 | task_reexecution_hash TEXT NOT NULL, 924 | target_path_stats BLOB NOT NULL, 925 | result BLOB NOT NULL 926 | If a call is successful its hash is inserted/updated in the 927 | table, the target_path_stats stores the base/target stats 928 | for the target files created by the call and listed in 929 | ``target_path_list``, and the result of ``func`` is stored in 930 | ``result``. 931 | 932 | """ 933 | # it is a common error to accidentally pass a non-string to the 934 | # target path list; this terminates early if so 935 | if any([not (isinstance(path, _VALID_PATH_TYPES)) 936 | for path in target_path_list]): 937 | raise ValueError( 938 | "Values passed to target_path_list are not strings: %s" % 939 | (target_path_list,)) 940 | 941 | # sort the target path list because the order doesn't matter for 942 | # a result, but it would cause a task to be reexecuted if the only 943 | # difference was a different order.
944 | self._target_path_list = sorted([ 945 | _normalize_path(path) for path in target_path_list]) 946 | self.task_name = task_name 947 | self._func = func 948 | self._args = args 949 | self._kwargs = kwargs 950 | self._ignore_path_list = [ 951 | _normalize_path(path) for path in ignore_path_list] 952 | self._hash_target_files = hash_target_files 953 | self._ignore_directories = ignore_directories 954 | self._transient_run = transient_run 955 | self._worker_pool = worker_pool 956 | self._task_database_path = task_database_path 957 | self._hash_algorithm = hash_algorithm 958 | self._store_result = store_result 959 | self.exception_object = None 960 | 961 | # invert the priority since sorting goes smallest to largest and we 962 | # want more positive priority values to be executed first. 963 | self._priority = -priority 964 | 965 | # Used to ensure only one attempt at executing and also a mechanism 966 | # to see when Task is complete. This can be set if a Task finishes 967 | # a _call and there are no more attempts at reexecution. 968 | self.task_done_executing_event = threading.Event() 969 | 970 | # These are used to store and later access the result of the call. 971 | self._result = None 972 | 973 | # Calculate a hash based only on argument inputs. 974 | try: 975 | if not hasattr(Task, 'func_source_map'): 976 | Task.func_source_map = {} 977 | # memoize func source code because it's likely we'll import 978 | # the same func many times and reflection is slow 979 | if self._func not in Task.func_source_map: 980 | Task.func_source_map[self._func] = ( 981 | inspect.getsource(self._func)) 982 | source_code = Task.func_source_map[self._func] 983 | except (IOError, TypeError): 984 | # many reasons for this, for example, frozen Python code won't 985 | # have source code, so just leave blank 986 | source_code = '' 987 | 988 | if not hasattr(self._func, '__name__'): 989 | LOGGER.warning( 990 | "function does not have a __name__ which means it will not " 991 | "be considered when calculating a successive input has " 992 | "been changed with another function without __name__.") 993 | self._func.__name__ = '' 994 | 995 | args_clean = [] 996 | for index, arg in enumerate(self._args): 997 | try: 998 | scrubbed_value = _scrub_task_args(arg, self._target_path_list) 999 | _ = pickle.dumps(scrubbed_value) 1000 | args_clean.append(scrubbed_value) 1001 | except TypeError: 1002 | LOGGER.warning( 1003 | "could not pickle argument at index %d (%s). " 1004 | "Skipping argument which means it will not be considered " 1005 | "when calculating whether inputs have been changed " 1006 | "on a successive run.", index, arg) 1007 | 1008 | kwargs_clean = {} 1009 | # iterate through sorted order so we get the same hash result with the 1010 | # same set of kwargs irrespective of the item dict order. 1011 | for key, arg in sorted(self._kwargs.items()): 1012 | try: 1013 | scrubbed_value = _scrub_task_args(arg, self._target_path_list) 1014 | _ = pickle.dumps(scrubbed_value) 1015 | kwargs_clean[key] = scrubbed_value 1016 | except TypeError: 1017 | LOGGER.warning( 1018 | "could not pickle kw argument %s (%s) scrubbed to %s. 
" 1019 | "Skipping argument which means it will not be considered " 1020 | "when calculating whether inputs have been changed " 1021 | "on a successive run.", key, arg, scrubbed_value) 1022 | 1023 | self._reexecution_info = { 1024 | 'func_name': self._func.__name__, 1025 | 'args_clean': args_clean, 1026 | 'kwargs_clean': kwargs_clean, 1027 | 'source_code_hash': hashlib.sha1( 1028 | source_code.encode('utf-8')).hexdigest(), 1029 | } 1030 | 1031 | argument_hash_string = ':'.join([ 1032 | repr(self._reexecution_info[key]) 1033 | for key in sorted(self._reexecution_info.keys())]) 1034 | 1035 | self._task_id_hash = hashlib.sha1( 1036 | argument_hash_string.encode('utf-8')).hexdigest() 1037 | 1038 | # this will get calculated when ``is_precalculated`` is invoked. 1039 | self._task_reexecution_hash = None 1040 | 1041 | def __eq__(self, other): 1042 | """Two tasks are equal if their hashes are equal.""" 1043 | return ( 1044 | isinstance(self, other.__class__) and 1045 | (self._task_id_hash == other._task_id_hash)) 1046 | 1047 | def __hash__(self): 1048 | """Return the base-16 integer hash of this hash string.""" 1049 | return int(self._task_id_hash, 16) 1050 | 1051 | def __ne__(self, other): 1052 | """Inverse of __eq__.""" 1053 | return not self.__eq__(other) 1054 | 1055 | def __lt__(self, other): 1056 | """Less than based on priority.""" 1057 | return self._priority < other._priority 1058 | 1059 | def __repr__(self): 1060 | """Create a string representation of a Task.""" 1061 | return "Task object %s:\n\n" % (id(self)) + pprint.pformat( 1062 | { 1063 | "task_name": self.task_name, 1064 | "priority": self._priority, 1065 | "ignore_path_list": self._ignore_path_list, 1066 | "ignore_directories": self._ignore_directories, 1067 | "target_path_list": self._target_path_list, 1068 | "task_id_hash": self._task_id_hash, 1069 | "task_reexecution_hash": self._task_reexecution_hash, 1070 | "exception_object": self.exception_object, 1071 | "self._reexecution_info": self._reexecution_info, 1072 | "self._result": self._result, 1073 | }) 1074 | 1075 | def _call(self): 1076 | """Invoke this method to execute task. 1077 | 1078 | Precondition is that the Task dependencies are satisfied. 1079 | 1080 | Sets the ``self.task_done_executing_event`` flag if execution is 1081 | successful. 1082 | 1083 | Raises: 1084 | RuntimeError if any target paths are not generated after the 1085 | function call is complete. 
1086 | 1087 | """ 1088 | LOGGER.debug("_call check if precalculated %s", self.task_name) 1089 | if not self._transient_run and self.is_precalculated(): 1090 | self.task_done_executing_event.set() 1091 | return 1092 | LOGGER.debug("not precalculated %s", self.task_name) 1093 | 1094 | if self._worker_pool is not None: 1095 | result = self._worker_pool.apply_async( 1096 | func=self._func, args=self._args, kwds=self._kwargs) 1097 | # the following blocks and raises an exception if result 1098 | # raised an exception 1099 | LOGGER.debug("apply_async for task %s", self.task_name) 1100 | payload = result.get() 1101 | else: 1102 | LOGGER.debug("direct _func for task %s", self.task_name) 1103 | payload = self._func(*self._args, **self._kwargs) 1104 | if self._store_result: 1105 | self._result = payload 1106 | 1107 | # check that the target paths exist and record stats for later 1108 | if not self._hash_target_files: 1109 | target_hash_algorithm = 'exists' 1110 | else: 1111 | target_hash_algorithm = self._hash_algorithm 1112 | result_target_path_stats = list( 1113 | _get_file_stats( 1114 | self._target_path_list, target_hash_algorithm, [], False)) 1115 | result_target_path_set = set( 1116 | [x[0] for x in result_target_path_stats]) 1117 | target_path_set = set(self._target_path_list) 1118 | if target_path_set != result_target_path_set: 1119 | raise RuntimeError( 1120 | "In Task: %s\nMissing expected target path results.\n" 1121 | "Expected: %s\nObserved: %s\n" % ( 1122 | self.task_name, self._target_path_list, 1123 | result_target_path_set)) 1124 | 1125 | # this step will only record the run if there is an expected 1126 | # target file. Otherwise we infer the result of this call is 1127 | # transient between taskgraph executions and we should expect to 1128 | # run it again. 1129 | if not self._transient_run: 1130 | _execute_sqlite( 1131 | "INSERT OR REPLACE INTO taskgraph_data VALUES (?, ?, ?)", 1132 | self._task_database_path, mode='modify', 1133 | argument_list=( 1134 | self._task_reexecution_hash, 1135 | pickle.dumps(result_target_path_stats), 1136 | pickle.dumps(self._result))) 1137 | self.task_done_executing_event.set() 1138 | LOGGER.debug("successful run on task %s", self.task_name) 1139 | 1140 | def is_precalculated(self): 1141 | """Return true if _call need not be invoked. 1142 | 1143 | If the task has been precalculated it will fetch the return result from 1144 | the previous run. 1145 | 1146 | Returns: 1147 | True if the Task's target paths exist in the same state as the 1148 | last recorded run at the time this function is called. It is 1149 | possible this value could change without running the Task if 1150 | input parameter file stats change. False otherwise. 1151 | 1152 | """ 1153 | # This gets a list of the files and their file stats that can be found 1154 | # in args and kwargs but ignores anything specifically targeted or 1155 | # an expected result. This will allow a task to change its hash in 1156 | # case a different version of a file was passed in. 
1157 | # these are the stats of the files that exist that aren't ignored 1158 | if not self._hash_target_files: 1159 | target_hash_algorithm = 'exists' 1160 | else: 1161 | target_hash_algorithm = self._hash_algorithm 1162 | file_stat_list = list(_get_file_stats( 1163 | [self._args, self._kwargs], 1164 | target_hash_algorithm, 1165 | self._target_path_list+self._ignore_path_list, 1166 | self._ignore_directories)) 1167 | 1168 | other_arguments = _filter_non_files( 1169 | [self._reexecution_info['args_clean'], 1170 | self._reexecution_info['kwargs_clean']], 1171 | self._target_path_list, 1172 | self._ignore_path_list, 1173 | self._ignore_directories) 1174 | 1175 | LOGGER.debug("file_stat_list: %s", file_stat_list) 1176 | LOGGER.debug("other_arguments: %s", other_arguments) 1177 | 1178 | # add the file stat list to the already existing reexecution info 1179 | # dictionary that contains stats that should not change whether 1180 | # files have been created/updated/or not. 1181 | self._reexecution_info['file_stat_list'] = file_stat_list 1182 | self._reexecution_info['other_arguments'] = other_arguments 1183 | 1184 | reexecution_string = '%s:%s:%s:%s:%s' % ( 1185 | self._reexecution_info['func_name'], 1186 | self._reexecution_info['source_code_hash'], 1187 | self._reexecution_info['other_arguments'], 1188 | self._store_result, 1189 | # the x[1] is to only take the digest part of the 'file_stat' 1190 | str([x[1] for x in file_stat_list])) 1191 | 1192 | self._task_reexecution_hash = hashlib.sha1( 1193 | reexecution_string.encode('utf-8')).hexdigest() 1194 | try: 1195 | database_result = _execute_sqlite( 1196 | """SELECT target_path_stats, result from taskgraph_data 1197 | WHERE (task_reexecution_hash == ?)""", 1198 | self._task_database_path, mode='read_only', 1199 | argument_list=(self._task_reexecution_hash,), fetch='one') 1200 | if database_result is None: 1201 | LOGGER.debug( 1202 | "not precalculated, Task hash does not " 1203 | "exist (%s)", self.task_name) 1204 | LOGGER.debug("is_precalculated full task info: %s", self) 1205 | return False 1206 | result_target_path_stats = pickle.loads(database_result[0]) 1207 | mismatched_target_file_list = [] 1208 | for path, hash_string in result_target_path_stats: 1209 | if path not in self._target_path_list: 1210 | mismatched_target_file_list.append( 1211 | 'Recorded path not in target path list %s' % path) 1212 | if not os.path.exists(path): 1213 | mismatched_target_file_list.append( 1214 | 'Path not found: %s' % path) 1215 | continue 1216 | elif target_hash_algorithm == 'exists': 1217 | # this is the case where hash_algorithm == 'exists' but 1218 | # we already know the file exists so we do nothing 1219 | continue 1220 | if target_hash_algorithm == 'sizetimestamp': 1221 | size, modified_time, actual_path = [ 1222 | x for x in hash_string.split('::')] 1223 | if actual_path != path: 1224 | mismatched_target_file_list.append( 1225 | "Path names don't match\n" 1226 | "cached: (%s)\nactual (%s)" % (path, actual_path)) 1227 | 1228 | # Using nanosecond resolution for mtime (instead of the 1229 | # usual float result of os.path.getmtime()) allows us to 1230 | # precisely compare modification time because we're 1231 | # comparing ints: st_mtime_ns always returns an int. 1232 | # 1233 | # Timestamp resolution: the python docs note that "many 1234 | # filesystems do not provide nanosecond precision". 1235 | # This is true (e.g. FAT, FAT32 timestamps are only 1236 | # accurate to within 2 seconds), but the data read from the 1237 | # filesystem will be consistent. 
This lets us know 1238 | # whether the timestamp changed. This also means that, on 1239 | # FAT filesystems, if a file is changed within 2s of its 1240 | # creation time, we might not be able to detect it. This 1241 | # is a weakness of FAT, not taskgraph. 1242 | target_modified_time = os.stat(path).st_mtime_ns 1243 | if not int(modified_time) == target_modified_time: 1244 | mismatched_target_file_list.append( 1245 | "Modified times don't match " 1246 | "cached: (%f) actual: (%f)" % ( 1247 | float(modified_time), target_modified_time)) 1248 | continue 1249 | target_size = os.path.getsize(path) 1250 | if float(size) != target_size: 1251 | mismatched_target_file_list.append( 1252 | "File sizes don't match " 1253 | "cached: (%s) actual: (%s)" % ( 1254 | size, target_size)) 1255 | else: 1256 | target_hash = _hash_file(path, target_hash_algorithm) 1257 | if hash_string != target_hash: 1258 | mismatched_target_file_list.append( 1259 | "File hashes are different. cached: (%s) " 1260 | "actual: (%s)" % (hash_string, target_hash)) 1261 | if mismatched_target_file_list: 1262 | LOGGER.info( 1263 | "not precalculated (%s), Task hash exists, " 1264 | "but there are these mismatches: %s", 1265 | self.task_name, '\n'.join(mismatched_target_file_list)) 1266 | return False 1267 | if self._store_result: 1268 | self._result = pickle.loads(database_result[1]) 1269 | LOGGER.debug("precalculated (%s)" % self) 1270 | return True 1271 | except EOFError: 1272 | LOGGER.exception("not precalculated %s, EOFError", self.task_name) 1273 | return False 1274 | 1275 | def join(self, timeout=None): 1276 | """Block until task is complete, raise exception if runtime failed.""" 1277 | LOGGER.debug( 1278 | "joining %s done executing: %s", self.task_name, 1279 | self.task_done_executing_event) 1280 | successful_wait = self.task_done_executing_event.wait(timeout) 1281 | if self.exception_object: 1282 | raise self.exception_object 1283 | return successful_wait 1284 | 1285 | def get(self, timeout=None): 1286 | """Return the result of the ``func`` once it is ready. 1287 | 1288 | If ``timeout`` is None, this call blocks until the task is complete 1289 | determined by a call to ``.join()``. Otherwise will wait up to 1290 | ``timeout`` seconds before raising a``RuntimeError`` if exceeded. 1291 | 1292 | Args: 1293 | timeout (float): if not None this parameter is a floating point 1294 | number specifying a timeout for the operation in seconds. 1295 | 1296 | Returns: 1297 | value of the result 1298 | 1299 | Raises: 1300 | RuntimeError when ``timeout`` exceeded. 1301 | ValueError if ``store_result`` was set to ``False`` when the task 1302 | was created. 1303 | 1304 | """ 1305 | if not self._store_result: 1306 | raise ValueError( 1307 | 'must set `store_result` to True in `add_task` to invoke this ' 1308 | 'function') 1309 | timeout = not self.join(timeout) 1310 | if timeout: 1311 | raise RuntimeError('call to get timed out') 1312 | return self._result 1313 | 1314 | 1315 | def _get_file_stats( 1316 | base_value, hash_algorithm, ignore_list, 1317 | ignore_directories): 1318 | """Return fingerprints of any filepaths in ``base_value``. 1319 | 1320 | Args: 1321 | base_value: any python value. Any file paths in ``base_value`` 1322 | should be processed with `_normalize_path`. 1323 | hash_algorithm (string): either a hash function id that 1324 | exists in hashlib.algorithms_available, 'exists', or 1325 | 'sizetimestamp'. Any paths to actual files in the arguments will be 1326 | digested with this algorithm. 
If value is 'sizetimestamp' the 1327 | digest will only use the normed path, size, and timestamp of any 1328 | files found in the arguments. This value is used when 1329 | determining whether a task is precalculated or its target 1330 | files can be copied to an equivalent task. Note if 1331 | ``hash_algorithm`` is 'sizetimestamp' the task will require the 1332 | same base path files to determine equality. If it is a 1333 | ``hashlib`` algorithm only file contents will be considered. If 1334 | this value is 'exists' the value of the hash will be 'exists'. 1335 | ignore_list (list): any paths found in this list are not included 1336 | as part of the file stats. All paths in this list should be 1337 | "os.path.norm"ed. 1338 | ignore_directories (boolean): If True directories are not 1339 | considered for filestats. 1340 | 1341 | 1342 | Return: 1343 | list of (path, digest) tuples for any filepaths found in 1344 | base_value or nested in base value that are not otherwise 1345 | ignored by the input parameters where digest is created by 1346 | the hash algorithm specified in ``hash_algorithm``. 1347 | 1348 | """ 1349 | if isinstance(base_value, _VALID_PATH_TYPES): 1350 | try: 1351 | norm_path = _normalize_path(base_value) 1352 | if norm_path not in ignore_list and ( 1353 | not os.path.isdir(norm_path) or 1354 | not ignore_directories) and os.path.exists(norm_path): 1355 | if hash_algorithm == 'exists': 1356 | yield (norm_path, 'exists') 1357 | else: 1358 | yield ( 1359 | norm_path, _hash_file(norm_path, hash_algorithm)) 1360 | except (OSError, ValueError): 1361 | # I ran across a ValueError when one of the os.path functions 1362 | # interpreted the value as a path that was too long. 1363 | # OSErrors could happen if there's coincidentally a directory we 1364 | # can't read or it's not a file or something else out of our 1365 | # control 1366 | LOGGER.exception( 1367 | "base_value couldn't be analyzed somehow '%s'", base_value) 1368 | elif isinstance(base_value, dict): 1369 | for key in base_value.keys(): 1370 | value = base_value[key] 1371 | for stat in _get_file_stats( 1372 | value, hash_algorithm, ignore_list, ignore_directories): 1373 | yield stat 1374 | elif isinstance(base_value, (list, set, tuple)): 1375 | for value in base_value: 1376 | for stat in _get_file_stats( 1377 | value, hash_algorithm, ignore_list, ignore_directories): 1378 | yield stat 1379 | 1380 | 1381 | def _filter_non_files( 1382 | base_value, keep_list, ignore_list, keep_directories): 1383 | """Replace file paths in ``base_value`` with ``None`` unless kept. 1384 | 1385 | Args: 1386 | base_value: any python value that may contain filepaths in any 1387 | nested structure. Any file paths in ``base_value`` should be 1388 | "os.path.norm"ed before this function is called. 1389 | keep_list (list): any paths found in this list are not filtered. 1390 | All paths in this list should be "os.path.norm"ed. 1391 | ignore_list (list): any paths found in this list are filtered. 1392 | keep_directories (boolean): If True directories are not filtered 1393 | out. 1394 | 1395 | Return: 1396 | original ``base_value`` structure with any nested paths pointing to 1397 | existing files or directories set to ``None``, unless the path is in ``keep_list`` (or is a directory and ``keep_directories`` is True).
1398 | 1399 | """ 1400 | if isinstance(base_value, _VALID_PATH_TYPES): 1401 | try: 1402 | norm_path = _normalize_path(base_value) 1403 | if norm_path not in ignore_list and ( 1404 | norm_path in keep_list or (( 1405 | os.path.isdir(norm_path) and keep_directories) or ( 1406 | not os.path.isfile(norm_path) and 1407 | not os.path.isdir(norm_path)))): 1408 | return norm_path 1409 | return None 1410 | except (OSError, ValueError): 1411 | # I ran across a ValueError when one of the os.path functions 1412 | # interpreted the value as a path that was too long. 1413 | # OSErrors could happen if there's coincidentally a directory we 1414 | # can't read or it's not a file or something else out of our 1415 | # control 1416 | LOGGER.exception( 1417 | "base_value couldn't be analyzed somehow '%s'", base_value) 1418 | elif isinstance(base_value, dict): 1419 | return { 1420 | key: _filter_non_files( 1421 | value, keep_list, ignore_list, keep_directories) 1422 | for key, value in base_value.items() 1423 | } 1424 | elif isinstance(base_value, (list, set, tuple)): 1425 | return type(base_value)([ 1426 | _filter_non_files( 1427 | value, keep_list, ignore_list, keep_directories) 1428 | for value in base_value]) 1429 | else: 1430 | return base_value 1431 | 1432 | 1433 | def _scrub_task_args(base_value, target_path_list): 1434 | """Attempt to convert ``base_value`` to canonical values. 1435 | 1436 | Any paths in ``base_value`` are normalized, and any paths that are also 1437 | in ``target_path_list`` are replaced with a placeholder so that two 1438 | calls whose arguments differ only in their target path names will 1439 | hash to the same value. 1440 | 1441 | This function can be called before the Task dependencies are satisfied 1442 | since it doesn't inspect any file stats on disk. 1443 | 1444 | Args: 1445 | base_value: any python value 1446 | target_path_list (list): a list of path strings; any that are found 1447 | in ``base_value`` are replaced with the placeholder 'in_target_path_list'. 1448 | 1449 | Returns: 1450 | base_value with any functions replaced by their name/source string 1451 | and any paths in ``target_path_list`` replaced with the 'in_target_path_list' placeholder.
1452 | 1453 | """ 1454 | if callable(base_value): 1455 | try: 1456 | if not hasattr(Task, 'func_source_map'): 1457 | Task.func_source_map = {} 1458 | # memoize func source code because it's likely we'll import 1459 | # the same func many times and reflection is slow 1460 | if base_value not in Task.func_source_map: 1461 | Task.func_source_map[base_value] = ( 1462 | inspect.getsource(base_value)).replace( 1463 | ' ', '').replace('\t', '') 1464 | source_code = Task.func_source_map[base_value] 1465 | except (IOError, TypeError): 1466 | # many reasons for this, for example, frozen Python code won't 1467 | # have source code, so just leave blank 1468 | source_code = '' 1469 | return '%s:%s' % (base_value.__name__, source_code) 1470 | elif isinstance(base_value, dict): 1471 | result_dict = {} 1472 | for key in base_value.keys(): 1473 | result_dict[key] = _scrub_task_args( 1474 | base_value[key], target_path_list) 1475 | return result_dict 1476 | elif isinstance(base_value, (list, set, tuple)): 1477 | result_list = [] 1478 | for value in base_value: 1479 | result_list.append(_scrub_task_args(value, target_path_list)) 1480 | return type(base_value)(result_list) 1481 | elif isinstance(base_value, _VALID_PATH_TYPES): 1482 | normalized_path = _normalize_path(base_value) 1483 | if normalized_path in target_path_list: 1484 | return 'in_target_path_list' 1485 | else: 1486 | return normalized_path 1487 | else: 1488 | return base_value 1489 | 1490 | 1491 | def _hash_file(file_path, hash_algorithm, buf_size=2**20): 1492 | """Return a hex digest of ``file_path``. 1493 | 1494 | Args: 1495 | file_path (string): path to file to hash. 1496 | hash_algorithm (string): a hash function id that exists in 1497 | hashlib.algorithms_available or 'sizetimestamp'. If function id 1498 | is in hashlib.algorithms_available, the file contents are hashed 1499 | with that function and the fingerprint is returned. If value is 1500 | 'sizetimestamp' the size, nanosecond modification time, and 1501 | normalized path of the file are returned in a string of the form 1502 | '[sizeinbytes]::[lastmodifiedtime_ns]::[normalizedpath]'. 1503 | buf_size (int): number of bytes to read from ``file_path`` at a time 1504 | for digesting. 1505 | 1506 | Returns: 1507 | a hash hex digest computed with hash algorithm ``hash_algorithm`` 1508 | of the binary contents of the file located at ``file_path``.
1509 | 1510 | """ 1511 | if hash_algorithm == 'sizetimestamp': 1512 | norm_path = _normalize_path(file_path) 1513 | return '%d::%i::%s' % ( 1514 | os.path.getsize(norm_path), os.stat(norm_path).st_mtime_ns, 1515 | norm_path) 1516 | hash_func = hashlib.new(hash_algorithm) 1517 | with open(file_path, 'rb') as f: 1518 | binary_data = f.read(buf_size) 1519 | while binary_data: 1520 | hash_func.update(binary_data) 1521 | binary_data = f.read(buf_size) 1522 | return hash_func.hexdigest() 1523 | 1524 | 1525 | def _normalize_path(path): 1526 | """Convert ``path`` into normalized, normcase, absolute filepath.""" 1527 | norm_path = os.path.normpath(path) 1528 | try: 1529 | abs_path = os.path.abspath(norm_path) 1530 | except TypeError: 1531 | # this occurs when encountering VERY long strings that might be 1532 | # interpreted as paths 1533 | LOGGER.warning( 1534 | "failed to abspath %s so returning normalized path instead", norm_path) 1535 | abs_path = norm_path 1536 | return os.path.normcase(abs_path) 1537 | 1538 | 1539 | @retrying.retry( 1540 | wait_exponential_multiplier=500, wait_exponential_max=3200, 1541 | stop_max_attempt_number=100) 1542 | def _execute_sqlite( 1543 | sqlite_command, database_path, argument_list=None, 1544 | mode='read_only', execute='execute', fetch=None): 1545 | """Execute SQLite command and attempt retries on a failure. 1546 | 1547 | Args: 1548 | sqlite_command (str): a well-formatted SQLite command. 1549 | database_path (str): path to the SQLite database to operate on. 1550 | argument_list (list): if ``execute == 'execute'``, this list is passed 1551 | to the internal sqlite3 ``execute`` call. 1552 | mode (str): must be either 'read_only' or 'modify'. 1553 | execute (str): must be either 'execute' or 'script'. 1554 | fetch (str): if not ``None`` can be either 'all' or 'one'. 1555 | If not None the result of a fetch will be returned by this 1556 | function. 1557 | 1558 | Returns: 1559 | result of fetch if ``fetch`` is not None.
1560 | 1561 | """ 1562 | cursor = None 1563 | connection = None 1564 | try: 1565 | if mode == 'read_only': 1566 | ro_uri = r'%s?mode=ro' % pathlib.Path( 1567 | os.path.abspath(database_path)).as_uri() 1568 | LOGGER.debug( 1569 | '%s exists: %s', ro_uri, os.path.exists(os.path.abspath( 1570 | database_path))) 1571 | connection = sqlite3.connect(ro_uri, uri=True) 1572 | elif mode == 'modify': 1573 | connection = sqlite3.connect(database_path) 1574 | else: 1575 | raise ValueError('Unknown mode: %s' % mode) 1576 | 1577 | if execute == 'execute': 1578 | if argument_list is None: 1579 | cursor = connection.execute(sqlite_command) 1580 | else: 1581 | cursor = connection.execute(sqlite_command, argument_list) 1582 | elif execute == 'script': 1583 | cursor = connection.executescript(sqlite_command) 1584 | else: 1585 | raise ValueError('Unknown execute mode: %s' % execute) 1586 | 1587 | result = None 1588 | payload = None 1589 | if fetch == 'all': 1590 | payload = (cursor.fetchall()) 1591 | elif fetch == 'one': 1592 | payload = (cursor.fetchone()) 1593 | elif fetch is not None: 1594 | raise ValueError('Unknown fetch mode: %s' % fetch) 1595 | if payload is not None: 1596 | result = list(payload) 1597 | cursor.close() 1598 | connection.commit() 1599 | connection.close() 1600 | cursor = None 1601 | connection = None 1602 | return result 1603 | except sqlite3.OperationalError: 1604 | LOGGER.warning( 1605 | 'TaskGraph database is locked because another process is using ' 1606 | 'it, waiting for a bit of time to try again') 1607 | raise 1608 | except Exception: 1609 | LOGGER.exception('Exception on _execute_sqlite: %s', sqlite_command) 1610 | raise 1611 | finally: 1612 | if cursor is not None: 1613 | cursor.close() 1614 | if connection is not None: 1615 | connection.commit() 1616 | connection.close() 1617 | --------------------------------------------------------------------------------
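For orientation, here is a minimal usage sketch of the public API documented in taskgraph/Task.py above (``add_task``, ``close``, ``join`` and ``Task.get``). The ``TaskGraph`` constructor arguments shown here (a cache directory for the task database and a worker count) are assumed from the internal attributes referenced in the excerpt (``_task_database_path``, ``_n_workers``) and are not confirmed by it; per the ``add_task`` code above, a negative worker count executes each task synchronously as it is added.

import shutil

import taskgraph


def double(value, target_path):
    """Write ``value * 2`` to ``target_path`` and return it."""
    result = value * 2
    with open(target_path, 'w') as target_file:
        target_file.write(str(result))
    return result


# assumed constructor signature: (cache directory, n_workers); -1 workers
# runs each task in the calling thread as described in ``add_task`` above
graph = taskgraph.TaskGraph('taskgraph_cache', -1)

doubling_task = graph.add_task(
    func=double,
    args=(10, 'doubled.txt'),
    target_path_list=['doubled.txt'],
    store_result=True,  # required for doubling_task.get() below
    task_name='double 10')

copy_task = graph.add_task(
    func=shutil.copyfile,
    args=('doubled.txt', 'doubled_copy.txt'),
    target_path_list=['doubled_copy.txt'],
    dependent_task_list=[doubling_task],  # runs only after doubling_task
    task_name='copy doubled result')

graph.close()  # no more tasks may be added
graph.join()   # blocks until all tasks finish; re-raises task exceptions
print(doubling_task.get())  # 20, retrieved from the stored result

On a second run with the same cache directory, both tasks would be detected as precalculated (their target files and argument hashes are unchanged), so neither function is re-executed and ``get()`` still returns the stored result.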