├── MANIFEST.in ├── taskgraph ├── __init__.py └── Task.py ├── requirements.txt ├── bitbucket-pipelines.yml ├── tox.ini ├── pyproject.toml ├── .github └── workflows │ └── pythonapp.yml ├── setup.py ├── LICENSE.txt ├── README.rst ├── HISTORY.rst └── tests └── test_task.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # This file defines extra files needed for the source distribution. 2 | # Setup.py requires these two files to exist, so we add them here. 3 | 4 | include README.rst HISTORY.rst LICENSE.txt 5 | -------------------------------------------------------------------------------- /taskgraph/__init__.py: -------------------------------------------------------------------------------- 1 | """TaskGraph init module.""" 2 | 3 | from .Task import TaskGraph 4 | from .Task import Task 5 | from .Task import _TASKGRAPH_DATABASE_FILENAME 6 | from .Task import __version__ 7 | 8 | __all__ = ['__version__', 'TaskGraph', 'Task', '_TASKGRAPH_DATABASE_FILENAME'] 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # requirements.txt 2 | # -------------------- 3 | # This file records the packages and requirements needed in order for 4 | # taskgraph to work as expected. 5 | 6 | retrying>=1.3.0 7 | importlib_metadata # technically only required on python < 3.8; easier to install with conda across all versions 8 | -------------------------------------------------------------------------------- /bitbucket-pipelines.yml: -------------------------------------------------------------------------------- 1 | pipelines: 2 | default: 3 | - parallel: 4 | - step: 5 | name: Tests on python3.6 6 | image: python:3.6-stretch 7 | caches: 8 | - pip 9 | script: 10 | - pip install tox 11 | - tox -e py36-base,py36-psutil 12 | - step: 13 | name: Tests on python3.7 14 | image: python:3.7-stretch 15 | caches: 16 | - pip 17 | script: 18 | - pip install tox 19 | - tox -e py37-base,py37-psutil 20 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = {py37,py38,py39,py310,py311}-{base,psutil} 3 | 4 | [gh-actions] 5 | # Allows us to use tox configuration to manage our tests, but still run on 6 | # github actions in the GHA matrix job matrix with GHA-managed python. 7 | # Requires tox-gh-actions package to run. 8 | python = 9 | 3.6: py36 10 | 3.7: py37 11 | 3.8: py38 12 | 3.9: py39 13 | 3.10: py310 14 | 3.11: py311 15 | 3.12: py312 16 | 17 | [testenv] 18 | commands = 19 | pytest --log-level=DEBUG \ 20 | --cov=taskgraph \ 21 | --cov-report=term \ 22 | --cov-report=xml \ 23 | --cov-report=html \ 24 | --junitxml={toxinidir}/testresults.xml {toxinidir}/tests 25 | changedir= 26 | {envtmpdir} 27 | 28 | # If tox-conda is installed (https://github.com/tox-dev/tox-conda), 29 | # use conda-forge python builds for the environments. 30 | conda_channels= 31 | conda-forge 32 | 33 | # Only install psutil to the environments where we're testing psutil. 
34 | # "psutil: psutil" is an example of tox's generative environment definition 35 | # and will match all environments containing the string "psutil" 36 | deps = 37 | setuptools_scm 38 | pytest 39 | pytest-cov 40 | rstcheck 41 | psutil: psutil 42 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "taskgraph" 3 | description = "Parallel task graph framework" 4 | readme = "README.rst" 5 | requires-python = ">=3.6" 6 | license = {file = "LICENSE.txt"} 7 | maintainers = [ 8 | {name = "Natural Capital Project Software Team"} 9 | ] 10 | keywords = ["parallel", "multiprocessing", "distributed", "computing"] 11 | classifiers = [ 12 | "Intended Audience :: Developers", 13 | "Topic :: System :: Distributed Computing", 14 | "Development Status :: 5 - Production/Stable", 15 | "Natural Language :: English", 16 | "Operating System :: MacOS :: MacOS X", 17 | "Operating System :: Microsoft", 18 | "Operating System :: POSIX", 19 | "Programming Language :: Python :: 3.8", 20 | "Programming Language :: Python :: 3.9", 21 | "Programming Language :: Python :: 3.10", 22 | "Programming Language :: Python :: 3.11", 23 | "Programming Language :: Python :: 3.12", 24 | "License :: OSI Approved :: BSD License" 25 | ] 26 | # the version is provided dynamically by setuptools_scm 27 | # `dependencies` and `optional-dependencies` are provided by setuptools 28 | # using the corresponding setup args `install_requires` and `extras_require` 29 | dynamic = ["version", "dependencies", "optional-dependencies"] 30 | 31 | [build-system] 32 | requires = [ 33 | 'wheel', 'setuptools_scm>=8.0' 34 | ] 35 | build-backend = "setuptools.build_meta" 36 | 37 | [tool.setuptools_scm] 38 | version_scheme = "post-release" 39 | local_scheme = "node-and-date" 40 | -------------------------------------------------------------------------------- /.github/workflows/pythonapp.yml: -------------------------------------------------------------------------------- 1 | name: Test TaskGraph 2 | on: 3 | push: 4 | branches: 5 | - "**" 6 | pull_request: 7 | branches: 8 | - "**" 9 | jobs: 10 | Test: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: [3.8, 3.9, "3.10", "3.11", "3.12"] 16 | os: [ubuntu-latest, windows-latest, macos-latest] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | with: 21 | # Fetch all history (it's a small repo) for scm-based versioning 22 | fetch-depth: 0 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | # See this comment about the importlib_metadata constraint: 33 | # https://github.com/python/importlib_metadata/issues/406#issuecomment-1264666048 34 | pip install tox tox-gh-actions flake8 "importlib_metadata<5" rstcheck 35 | 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings 41 | flake8 . 
--count --exit-zero --max-line-length=80 --statistics 42 | 43 | - name: Run tests 44 | run: | 45 | tox 46 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """taskgraph setup.py.""" 2 | from setuptools import setup 3 | 4 | _REQUIREMENTS = [ 5 | x for x in open('requirements.txt').read().split('\n') 6 | if not x.startswith('#') and len(x) > 0] 7 | 8 | LONG_DESCRIPTION = '%s\n\n%s' % ( 9 | open('README.rst').read(), 10 | open('HISTORY.rst').read()) 11 | 12 | setup( 13 | name='taskgraph', 14 | use_scm_version={'version_scheme': 'post-release', 15 | 'local_scheme': 'node-and-date'}, 16 | setup_requires=['setuptools_scm'], 17 | description='Parallel task graph framework.', 18 | long_description=LONG_DESCRIPTION, 19 | url='https://github.com/natcap/taskgraph', 20 | packages=['taskgraph'], 21 | license='BSD', 22 | keywords='parallel multiprocessing distributed computing', 23 | install_requires=_REQUIREMENTS, 24 | extras_require={ 25 | 'niced_processes': ['psutil'], 26 | }, 27 | classifiers=[ 28 | 'Intended Audience :: Developers', 29 | 'Topic :: System :: Distributed Computing', 30 | 'Development Status :: 5 - Production/Stable', 31 | 'Natural Language :: English', 32 | 'Operating System :: MacOS :: MacOS X', 33 | 'Operating System :: Microsoft', 34 | 'Operating System :: POSIX', 35 | 'Programming Language :: Python :: 3.8', 36 | 'Programming Language :: Python :: 3.9', 37 | 'Programming Language :: Python :: 3.10', 38 | 'Programming Language :: Python :: 3.11', 39 | 'Programming Language :: Python :: 3.12', 40 | 41 | 'License :: OSI Approved :: BSD License' 42 | ]) 43 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | In this license, "Natural Capital Project" is defined as the parties of 2 | Stanford University, The Nature Conservancy, World Wildlife Fund Inc., 3 | and University of Minnesota. 4 | 5 | This tool has an open license. All people are invited to use the tool 6 | under the following conditions and terms: 7 | 8 | Copyright (c) 2020, Natural Capital Project 9 | 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are 14 | met: 15 | 16 | * Redistributions of source code must retain the above copyright 17 | notice, this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the 22 | distribution. 23 | 24 | * Neither the name of Natural Capital Project nor the names of 25 | its contributors may be used to endorse or promote products derived 26 | from this software without specific prior written permission. 27 | 28 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 29 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 30 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 31 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 32 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 33 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 34 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 35 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 36 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 37 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 38 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 39 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | About TaskGraph 3 | =============== 4 | 5 | ``TaskGraph`` is a library that was developed to help manage complicated 6 | computational software pipelines consisting of long running individual tasks. 7 | Many of these tasks could be executed in parallel, almost all of them wrote 8 | results to disk, and results could often be reused from earlier parts of the 9 | pipeline. TaskGraph manages all of this for you. With it you can schedule 10 | tasks with dependencies, avoid recomputing results that have already been 11 | computed, and allot multiple CPU cores to execute tasks in parallel if 12 | desired. 13 | 14 | TaskGraph Dependencies 15 | ---------------------- 16 | 17 | ``TaskGraph`` is written in pure Python, but if the ``psutil`` package is 18 | installed, the multiprocessing worker processes will be ``nice``\d. 19 | 20 | Example Use 21 | ----------- 22 | 23 | Install ``TaskGraph`` with 24 | 25 | ``pip install taskgraph`` 26 | 27 | Then 28 | 29 | .. code-block:: python 30 | 31 | import os 32 | import pickle 33 | import logging 34 | 35 | import taskgraph 36 | 37 | logging.basicConfig(level=logging.DEBUG) 38 | 39 | def _create_list_on_disk(value, length, target_path): 40 | """Create a list of `length` copies of `value` and pickle it to `target_path`.""" 41 | target_list = [value] * length 42 | pickle.dump(target_list, open(target_path, 'wb')) 43 | 44 | 45 | def _sum_lists_from_disk(list_a_path, list_b_path, target_path): 46 | """Read two lists, add them and save the result.""" 47 | list_a = pickle.load(open(list_a_path, 'rb')) 48 | list_b = pickle.load(open(list_b_path, 'rb')) 49 | target_list = [] 50 | for a, b in zip(list_a, list_b): 51 | target_list.append(a+b) 52 | pickle.dump(target_list, open(target_path, 'wb')) 53 | 54 | # create a taskgraph that uses 4 multiprocessing subprocesses when possible 55 | if __name__ == '__main__': 56 | workspace_dir = 'workspace' 57 | task_graph = taskgraph.TaskGraph(workspace_dir, 4) 58 | target_a_path = os.path.join(workspace_dir, 'a.dat') 59 | target_b_path = os.path.join(workspace_dir, 'b.dat') 60 | result_path = os.path.join(workspace_dir, 'result.dat') 61 | result_2_path = os.path.join(workspace_dir, 'result2.dat') 62 | value_a = 5 63 | value_b = 10 64 | list_len = 10 65 | task_a = task_graph.add_task( 66 | func=_create_list_on_disk, 67 | args=(value_a, list_len, target_a_path), 68 | target_path_list=[target_a_path]) 69 | task_b = task_graph.add_task( 70 | func=_create_list_on_disk, 71 | args=(value_b, list_len, target_b_path), 72 | target_path_list=[target_b_path]) 73 | sum_task = task_graph.add_task( 74 | func=_sum_lists_from_disk, 75 | args=(target_a_path, target_b_path, result_path), 76 | target_path_list=[result_path], 77 | dependent_task_list=[task_a, task_b]) 78 | 79 | task_graph.close() 80 | task_graph.join() 81 | # 
expect that result is a list `list_len` long with `value_a+value_b` in it 83 | result = pickle.load(open(result_path, 'rb')) 84 | 85 | 86 | Caveats 87 | ------- 88 | 89 | * Taskgraph's default method of checking whether a file has changed 90 | (``hash_algorithm='sizetimestamp'``) uses the filesystem's modification 91 | timestamp, interpreted in integer nanoseconds. This check is only as 92 | accurate as the filesystem's timestamp. For example: 93 | 94 | * FAT and FAT32 timestamps have a 2-second modification timestamp resolution 95 | * exFAT has a 10 millisecond timestamp resolution 96 | * NTFS has a 100 nanosecond timestamp resolution 97 | * HFS+ has a 1 second timestamp resolution 98 | * APFS has a 1 nanosecond timestamp resolution 99 | * ext3 has a 1 second timestamp resolution 100 | * ext4 has a 1 nanosecond timestamp resolution 101 | 102 | If you suspect timestamp resolution to be an issue on your filesystem, you 103 | may wish to store your files on a filesystem with more accurate timestamps or 104 | else consider using a different ``hash_algorithm``. 105 | 106 | 107 | Running Tests 108 | ------------- 109 | 110 | Taskgraph includes a ``tox`` configuration for automating builds across 111 | multiple python versions and whether ``psutil`` is installed. To execute all 112 | tests on all platforms, run: 113 | 114 | $ tox 115 | 116 | Alternatively, if you're only trying to run tests on a single configuration 117 | (say, python 3.7 without ``psutil``), you'd run:: 118 | 119 | $ tox -e py37 120 | 121 | Or if you'd like to run the tests for the combination of Python 3.7 with 122 | ``psutil``, you'd run:: 123 | 124 | $ tox -e py37-psutil 125 | 126 | If you don't have multiple python installations already available on your system, 127 | an easy way to accomplish this is to use ``tox-conda`` 128 | (https://github.com/tox-dev/tox-conda) which will use conda environments to manage 129 | the versions of python available:: 130 | 131 | $ pip install tox-conda 132 | $ tox 133 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | .. :changelog: 2 | 3 | ========================= 4 | TaskGraph Release History 5 | ========================= 6 | 7 | .. 8 | Unreleased Changes 9 | ------------------ 10 | 11 | 0.11.2 (2025-05-21) 12 | ------------------- 13 | * Using ``importlib.metadata`` or ``importlib_metadata``, depending on the 14 | python version, to read the version from package metadata. This is in 15 | response to ``pkg_resources`` being deprecated. 16 | (`#100 `_) 17 | 18 | 0.11.1 (2023-10-27) 19 | ------------------- 20 | * Adding ``pyproject.toml`` for our build definitions. 21 | * Python 3.6 has reached end-of-life and is no longer maintained, so it has 22 | been removed from the automated tests. 23 | * Python 3.7 has reached end-of-life and is no longer maintained, so it has 24 | been removed from automated tests. 25 | * Python 3.11 has been released, so ``taskgraph`` is now tested against this 26 | new version of the language. 27 | * Python 3.12 has been released, so ``taskgraph`` is now tested against this 28 | new version of the language. 29 | 30 | 0.11.0 (2021-10-12) 31 | ------------------- 32 | * Testing against python 3.10 in github actions and officially noting support 33 | for 3.10 in ``setup.py``. 34 | * Testing against python 3.9 in github actions and noting support in 35 | ``setup.py``. 
36 | * Fixed an issue where an exception raised during execution would not be 37 | re-raised if the task completed before ``TaskGraph.join()`` was called. Now, 38 | if a task raises an exception, that exception will always be raised when 39 | either ``Task.join()`` or ``TaskGraph.join()`` is called. 40 | * Fixed an issue where tasks with ``hash_algorithm='sizetimestamp'`` would, 41 | under certain conditions, fail to re-execute when they should. This only 42 | occurred when a graph writing the same amount of, but possibly different, 43 | data was executed successively, with less than about 1.5 seconds between 44 | task executions. 45 | * After many years with the Natural Capital Project, Rich Sharp has stepped 46 | down from the Project and as the maintainer of ``taskgraph``. James 47 | Douglass is taking his place, and this change is now reflected in 48 | ``setup.py``. 49 | * Fixes an issue that causes an ``EOFError`` or ``BrokenPipeError`` to occur 50 | when the ``TaskGraph`` terminates. 51 | * Updated the ``taskgraph`` example in the README for the latest API changes 52 | and to clarify the need for ``if __name__ == '__main__':``. 53 | * Fixed an issue that could cause the ``TaskGraph`` object to hang if 54 | duplicate ``Task`` objects were created. 55 | * Fixed an issue that was causing TaskGraph to ignore a changed 56 | ``hash_algorithm`` if the TaskGraph was created on one run, was 57 | deconstructed, then restarted. If the user chose a different hash, TaskGraph 58 | would use the hash that the target file was originally hashed under rather 59 | than the new algorithm. 60 | * Removed ``copy_duplicate_artifact`` and ``hardlink_allowed`` parameters 61 | and functionality from TaskGraph. This is to address a design error: 62 | TaskGraph is not well suited for caching file results to avoid 63 | recomputation. Rather than add additional complexity around the limitations 64 | of this feature, it is being removed to guide a design toward a standalone 65 | cache library if needed. 66 | 67 | 0.10.3 (2021-01-29) 68 | ------------------- 69 | * Fixed issue that could cause combinatorial memory usage leading to poor 70 | runtime or ``MemoryError`` if a dictionary were passed that had thousands 71 | of elements. 72 | * Fixed issue that would cause ``TaskGraph`` to not recognize a directory 73 | that was meant to be ignored and in some cases cause ``Task`` to 74 | unnecessarily reexecute. 75 | 76 | 0.10.2 (2020-12-11) 77 | ------------------- 78 | * Fixed an issue that would raise an exception when `__del__` was 79 | deconstructing a taskgraph object and a thread ``join()`` would cause a 80 | deadlock. 81 | 82 | 0.10.1 (2020-12-11) 83 | ------------------- 84 | * Fixed an issue that would ignore the state of a ``transient_run`` flag if 85 | a previous Task run had run it with that flag set to False. 86 | * Removed the limit on the number of times ``TaskGraph`` will attempt to update 87 | its database; updates are now retried for up to 5 minutes of continuous failures. 88 | This is to address expected issues when many parallel threads may compete for an update. 89 | Relevant information about why the database update fails is logged. 90 | * Fixed an issue where the logging queue would always report an exception 91 | even if the logging thread shut down correctly. 92 | 93 | 0.10.0 (2020-08-25) 94 | ------------------- 95 | * Fixed several race conditions that could cause the ``TaskGraph`` object to 96 | hang on an otherwise ordinary termination.
97 | * Changed logging level to "INFO" on cases where the taskgraph was not 98 | precalculated since it's an expected path of execution in ``TaskGraph``. 99 | * Adding a ``hardlink_allowed`` parameter to ``add_task`` that allows the 100 | attempt to hardlink a file in a case where a ``copy_artifact=True`` may 101 | permit one. This will save on disk space as well as computation time 102 | if large files are not needed to copy. 103 | * Adding a ``store_result`` flag to ``add_task`` that conditionally stores 104 | the ``func`` result in the database for later ``.get``. This was added to 105 | guard against return types that were not picklable and would otherwise 106 | cause an exception when being executed normally. 107 | * Fixed issue that would cause the logger thread to continue reporting status 108 | after all tasks were complete and the graph was closed. 109 | 110 | 0.9.1 (2020-06-04) 111 | ------------------ 112 | * Fixed issue that would cause an infinite loop if a ``TaskGraph`` object were 113 | created with a database from an incompatible previous version. Behavior now 114 | is to log the issue, delete the old database, and create a new compatible 115 | one. 116 | * Fixed issue that would cause some rare infinite loops if ``TaskGraph`` were 117 | to fail due to some kinds of task exceptions. 118 | * Adding open source BSD-3-Clause license. 119 | 120 | 0.9.0 (2020-03-05) 121 | ------------------ 122 | * Updating primary repository URL to GitHub. 123 | * Adding support for Python 3.8. 124 | * Removing the ``EncapsulatedOp`` abstract class. In practice the development 125 | loop that encouraged the use of ``EncapsulatedOp`` is flawed and can lead to 126 | design errors. 127 | * Removing unnecessary internal locks which will improve runtime performance of 128 | processing many small Tasks. 129 | * Refactor to support separate TaskGraph objects that use the same database. 130 | * Removed the ``n_retries`` parameter from ``add_task``. Users are recommended 131 | to handle retries within functions themselves. 132 | * Added a ``hash_target_files`` flag to ``add_task`` that when set to False, 133 | causes TaskGraph to only note the existence of target files after execution 134 | or as part of an evaluation to determine if the Task was precalculated. 135 | This is useful for operations that initialize a file but subsequent runs of 136 | the program modify it such as a new database or a downloaded file. 137 | * Fixed an issue on the monitor execution thread that caused shutdown of a 138 | TaskGraph object to be delayed up to the amount of delay in the monitor 139 | reporting update. 140 | * Added a ``.get()`` function for ``Task`` objects that returns the result of 141 | the respective ``func`` call. This value is cached in the TaskGraph database 142 | and hence can be used to avoid repeated execution. Note the addition of this 143 | function changes the functionality of calling ``add_task`` with no target 144 | path list. In previous versions the Task would execute once per TaskGraph 145 | instance, now successive ``Task`` objects with the same execution signature 146 | will use cached results. 147 | * To support the addition of the ``.get()`` function a ``transient_run`` 148 | parameter is added to ``add_task`` that causes TaskGraph to avoid 149 | recording a completed ``Task`` even if the execution hash would have been 150 | identical to a previously completed run where the target artifacts still 151 | existed. 
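A minimal sketch of the ``store_result`` / ``Task.get()`` pattern described in the 0.10.0 and 0.9.0 entries above. ``_compute_summary`` is a hypothetical user function used only for illustration, and exact call signatures may differ between versions:

.. code-block:: python

    import taskgraph

    def _compute_summary(values):
        """Return a small, picklable summary of ``values``."""
        return {'count': len(values), 'total': sum(values)}

    if __name__ == '__main__':
        # n_workers=-1 runs everything in the main thread.
        graph = taskgraph.TaskGraph('workspace', -1)
        summary_task = graph.add_task(
            func=_compute_summary,
            args=([1, 2, 3],),
            store_result=True,  # cache the return value in the database
            task_name='compute summary')
        graph.close()
        graph.join()
        # ``get()`` returns the cached result; a later run with the same
        # execution signature reuses it rather than re-running the function.
        print(summary_task.get())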
152 | 153 | 0.8.5 (2019-09-11) 154 | ------------------ 155 | * Dropped support for Python 2.7. 156 | * Fixed an issue where paths in ``ignore_paths`` were not getting ignored in 157 | the case of ``copy_duplicate_artifact=True``. 158 | * Fixed an issue where the "percent completed" in the logging monitor would 159 | sometimes exceed 100%. This occurred when a duplicate task was added to 160 | the TaskGraph object. 161 | * Fixed an issue where a relative path set as a target path would always cause 162 | TaskGraph to raise an exception after the task was complete. 163 | * Fixed an issue where kwargs that were unhashable were not considered when 164 | determining if a Task should be re-run. 165 | * Fixed an issue where files with almost identical modified times and sizes 166 | would hash equal in cases even when the filenames were different. 167 | 168 | 0.8.4 (2019-05-23) 169 | ------------------ 170 | * Fixed an exception that occurred when two tasks were constructed that 171 | targeted the same file but one path was relative and the other was absolute. 172 | 173 | 0.8.3 (2019-02-26) 174 | ------------------ 175 | * Fixed an issue that would cause TaskGraph to raise an IOError if an 176 | ``add_task`` call was marked for ``copy_duplicate_artifact`` but the 177 | base target file was missing. 178 | * Fixed an issue that would prevent the source distribution from 179 | installing. 180 | * Taskgraph is now tested against python versions 2.7, 3.6 and 3.7. 181 | 182 | 0.8.2 (2019-01-31) 183 | ------------------ 184 | * Adjusted logging levels so most chatty information is lowered to debug and 185 | oddness in ``__del__`` shutdown are degraded from ``error`` to ``debug`` so 186 | as not to cause alarm. 187 | 188 | 0.8.1 (2019-01-09) 189 | ------------------ 190 | * Fixed an issue that would cause a deadlock if two tasks were added that had 191 | the same function signature except different target paths. 192 | 193 | 0.8.0 (2019-01-07) 194 | ------------------ 195 | * Fixed a race condition that would sometimes cause an exception when multiple 196 | threads attempted to read or write to the completed Task Database. 197 | * Fixed an issue that could cause an exception in ``__del__`` to print to 198 | stderr during Python interpreter shutdown. 199 | * Added a ``hash_algorithm`` parameter to ``add_task`` that is a string of 200 | either 'sizetimestamp' or anything in ``hashlib.algorithms_available``. This 201 | option tells TaskGraph how to fingerprint input and target files to 202 | determine the need for recomputation. 203 | * Added a ``copy_duplicate_artifact`` parameter to ``add_task`` that when True 204 | tells TaskGraph to copy duplicate target results to a new target so long as 205 | all the parameters and base/target files fingerprint to the same value. 206 | This can save significant computation time when use in scenarios where 207 | there are small changes in a workflow, but otherwise significant changes 208 | in filenames. This often occurs when putting timestamps or other suffixes 209 | on files that otherwise have identical content. 210 | 211 | 0.7.2 (2018-11-21) 212 | ------------------ 213 | * TaskGraph now stores all task completion information in a single SQLite 214 | database stored in its cache directory. In previous versions 215 | TaskGraph would write a small text file for each task in a highly branching 216 | directory tree. This structure made removal of those directory trees 217 | computationally difficult. 
218 | * Fixed an issue that would cause TaskGraph to reexecute if the target path 219 | was included in the argument list and that path was not normalized to the 220 | operating system's path style. 221 | * Fixed a deadlock in some cases where Tasks failed while other tasks checked 222 | for pre-execution clauses. 223 | 224 | 0.7.0 (2018-10-22) 225 | ------------------ 226 | * Fixed an issue where very long strings might be interpreted as paths and 227 | Windows crashes because the path is too long. 228 | * Fixed a deadlock issue where a Task might raise an unhandled exception as a 229 | new task was added to the TaskGraph. 230 | * Fixed the occasional ``BrokenPipeError`` that could occur when a Task 231 | encountered an unhandled exception. 232 | * Added an ``n_retries`` parameter to ``add_task`` that lets TaskGraph attempt 233 | to reexecute a failing Task up to ``n_retries`` times before terminating 234 | the TaskGraph. 235 | * Removed the ``delayed_start`` option. 236 | 237 | 0.6.1 (2018-08-14) 238 | ------------------ 239 | * Resolving an issue with duplicate logging being printed to stdout when 240 | ``n_workers > 0``. Logging is now only handled in the process that contains 241 | the TaskGraph instance. 242 | * Updated main logging message to indicate which tasks, by task name, are 243 | currently active and how many tasks are ready to execute but can't because 244 | there is not an open worker. 245 | * Attempted to fix an issue where processes in the process pool were not 246 | terminating on a Linux system by aggressively joining all threads and 247 | processes when possible. 248 | * Fixed an issue that would cause tasks that had been previously calculated to 249 | prematurely trigger children tasks even if the parent tasks of the current 250 | task needed to be reexecuted. 251 | 252 | 0.6.0 (2018-07-24) 253 | ------------------ 254 | * Added a ``delayed_start`` flag to TaskGraph to allow for delayed execution 255 | of taskgraph tasks. If enabled on threaded or multiprocess mode, calls to 256 | ``add_task`` will not execute tasks until the ``join`` method is invoked on 257 | ``taskgraph``. This allows for finer control over execution order when tasks 258 | are passed non-equivalent ``priority`` levels. 259 | * Fixing an issue where a non-JSON serializeable object would cause 260 | ``add_task`` to crash. Now TaskGraph is more tolerant of non-JSON 261 | serializeable objects and will log warnings when parameters cannot be 262 | serialized. 263 | * TaskGraph constructor has an option to report a ongoing logging message 264 | at a set interval. The message reports how many tasks have been committed 265 | and completed. 266 | * Fixed a bug that would cause TaskGraph to needlessly reexecute a task if 267 | the only change was the order of the ``target_path_list`` or 268 | ``dependent_task_list`` variables. 269 | * Fixed a bug that would cause a task to reexecute between runs if input 270 | argument was a file that would be generated by a task that had not yet 271 | executed. 272 | * Made a code change that makes it very likely that tasks will be executed in 273 | priority order if added to a TaskGraph in delayed execution mode. 274 | * Refactored internal TaskGraph scheduling to fix a design error that made it 275 | likely tasks would be needlessly reexecuted. This also simplified TaskGraph 276 | flow control and cause slight performance improvements. 
277 | * Fixed an issue discovered when a ``scipy.sparse`` matrix was passed as an 278 | argument and ``add_task`` crashed on infinite recursion. Type checking of 279 | arguments has been simplified and now iteration only occurs on the Python 280 | ``set``, ``dict``, ``list``, and ``tuple`` types. 281 | * Fixed an issue where the ``TaskGraph`` was not ``join``\ing the worker 282 | process pool on a closed/joined TaskGraph, or when the ``TaskGraph`` object 283 | was being deconstructed. This would occasionally cause a race condition 284 | where the TaskGraph may still have a cache ``.json`` file open. Discovered 285 | through a flaky build test. 286 | * Added functionality to the ``TaskGraph`` object to propagate log messages 287 | from workers back to the parent process. This only applies for cases where 288 | a ``TaskGraph`` instance is started with ``n_workers > 0``. 289 | * Fixed an issue where a function that was passed as an argument would cause 290 | a reexecution on a separate run because the ``__repr__`` of a function 291 | includes its pointer address. 292 | * Adjusted logging levels so that detailed task information is shown on DEBUG 293 | but basic status updates are shown in INFO. 294 | 295 | 0.5.2 (2018-06-20) 296 | ------------------ 297 | * Fixing an issue where a Task would hang on a ``join`` if the number of 298 | workers in TaskGraph was -1 and a call to ``add_task`` had a non-``None`` 299 | passed to ``target_path_list`` and the resulting task was ``\.join``\ed 300 | after a second run of the same program. 301 | 302 | 0.5.1 (2018-06-20) 303 | ------------------ 304 | * Fixing an issue where TaskGraph would hang on a ``join`` if the number of 305 | workers was -1 and a call to ``add_task`` had ``None`` passed to 306 | ``target_path_list``. 307 | 308 | 0.5.0 (2018-05-04) 309 | ------------------ 310 | * Taskgraph now supports python versions 2 and 3 (tested with python 2.7, 311 | 3.6). 312 | * Fixed an issue with ``taskgraph.TaskGraph`` that prevented a multiprocessed 313 | graph from executing on POSIX systems when ``psutil`` was installed. 314 | * Adding matrix-based test automation (python 2.7, python 3.6, with/without 315 | ``psutil``) via ``tox``. 316 | * Updating repository path to ``https://bitbucket.org/natcap/taskgraph``. 317 | 318 | 0.4.0 (2018-04-18) 319 | ------------------ 320 | * Auto-versioning now happens via ``setuptools_scm``, replacing previous calls 321 | to ``natcap.versioner``. 322 | * Added an option to ``TaskGraph`` constructor to allow negative values in the 323 | ``n_workers`` argument to indicate that the entire object should run in the 324 | main thread. A value of 0 will indicate that no multiprocessing will be used 325 | but concurrency will be allowed for non-blocking ``add_task``. 326 | * Added an abstract class ``task.EncapsulatedTaskOp`` that can be used to 327 | instance a class that needs scope in order to be used as an operation passed 328 | to a process. The advantage of using ``EncapsulatedTaskOp`` is that the 329 | ``__name__`` hash used by ``TaskGraph`` to determine if a task is unique is 330 | calculated in the superclass and the subclass need only worry about 331 | implementation of ``__call__``. 332 | * Added a ``priority`` optional scalar argument to ``TaskGraph.add_task`` to 333 | indicate the priority preference of the task to be executed. A higher 334 | priority task whose dependencies are satisfied will be executed before one with 335 | a lower priority.
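To illustrate the ``n_workers`` and ``priority`` behaviors described in this section, a minimal sketch follows. ``_make_report`` and ``_cleanup_scratch`` are hypothetical functions, not part of ``taskgraph``:

.. code-block:: python

    import taskgraph

    def _make_report():
        print('building report')

    def _cleanup_scratch():
        print('cleaning scratch space')

    # n_workers=-1 runs everything in the main thread; 0 keeps
    # ``add_task`` non-blocking without using multiprocessing.
    graph = taskgraph.TaskGraph('priority_workspace', 0)

    # With no dependencies between them, the task with the higher
    # ``priority`` value is preferred when a worker becomes available.
    graph.add_task(func=_make_report, priority=10, task_name='report')
    graph.add_task(func=_cleanup_scratch, priority=1, task_name='cleanup')

    graph.close()
    graph.join()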
336 | 337 | 0.3.0 (2017-11-17) 338 | ------------------ 339 | * Refactor of core scheduler. Old scheduler used asynchronicity to attempt to 340 | test if a Task was complete, occasionally testing all Tasks in potential 341 | work queue per task completion. Scheduler now uses bookkeeping to keep track 342 | of all dependencies and submits tasks for work only when all dependencies 343 | are satisfied. 344 | * TaskGraph and Task ``.join`` methods now have a timeout parameter. 345 | Additionally ``join`` now also returns False if ``join`` terminates because 346 | of a timeout. 347 | * More robust error reporting and shutdown of TaskGraph if any tasks fail 348 | during execution using pure threading or multiprocessing. 349 | 350 | 351 | 0.2.7 (2017-11-09) 352 | ------------------ 353 | * Fixed a critical error from the last hotfix that prevented ``taskgraph`` 354 | from avoiding recomputation of already completed tasks. 355 | 356 | 0.2.6 (2017-11-07) 357 | ------------------ 358 | * Fixed an issue from the previous hotfix that could cause ``taskgraph`` to 359 | exceed the number of available threads if enough tasks were added with long 360 | running dependencies. 361 | * Additional error checking and flow control ensures that a TaskGraph will 362 | catastrophically fail and report useful exception logging a task fails 363 | during runtime. 364 | * Fixed a deadlock issue where a failure on a subtask would occasionally cause 365 | a TaskGraph to hang. 366 | * ``Task.is_complete`` raises a RuntimeError if the task is complete but 367 | failed. 368 | * More efficient handling of topological progression of task execution to 369 | attempt to maximize total possible CPU load. 370 | * Fixing an issue from the last release that caused the test cases to fail. 371 | (Don't use 0.2.5 at all). 372 | 373 | 0.2.5 (2017-10-11) 374 | ------------------ 375 | * Fixed a bug where tasks with satisfied dependencies or no dependencies were 376 | blocked on dependent tasks added to the task graph earlier in the main 377 | thread execution. 378 | * Indicating that ``psutil`` is an optional dependency through the ``setup`` 379 | function. 380 | 381 | 0.2.4 (2017-09-19) 382 | ------------------ 383 | * Empty release. Possible bug with PyPI release, so re-releasing with a 384 | bumped up version. 385 | 386 | 0.2.3 (2017-09-18) 387 | ------------------ 388 | * More robust testing on a chain of tasks that might fail because an ancestor 389 | failed. 390 | 391 | 0.2.2 (2017-08-15) 392 | ------------------ 393 | * Changed how TaskGraph determines of work is complete. Now records target 394 | paths in file token with modified time and file size. When checking if work 395 | is complete, the token is loaded and the target file stats are compared for 396 | each file. 397 | 398 | 0.2.1 (2017-08-11) 399 | ------------------ 400 | * Handling cases where a function might be an object or something else that 401 | can't import source code. 402 | * Using natcap.versioner for versioning. 403 | 404 | 0.2.0 (2017-07-31) 405 | ------------------ 406 | * Fixing an issue where ``types.StringType`` is not the same as 407 | ``types.StringTypes``. 408 | * Redefined ``target`` in ``add_task`` to ``func`` to avoid naming collision 409 | with ``target_path_list`` in the same function. 410 | 411 | 0.1.1 (2017-07-31) 412 | ------------------ 413 | * Fixing a TYPO on ``__version__`` number scheme. 414 | * Importing ``psutil`` if it exists. 415 | 416 | 0.1.0 (2017-07-29) 417 | ------------------ 418 | * Initial release. 
419 | -------------------------------------------------------------------------------- /tests/test_task.py: -------------------------------------------------------------------------------- 1 | """Tests for taskgraph.""" 2 | import hashlib 3 | import logging 4 | import logging.handlers 5 | import multiprocessing 6 | import os 7 | import pathlib 8 | import pickle 9 | import re 10 | import shutil 11 | import sqlite3 12 | import subprocess 13 | import tempfile 14 | import time 15 | import unittest 16 | 17 | import retrying 18 | import taskgraph 19 | 20 | LOGGER = logging.getLogger(__name__) 21 | 22 | N_TEARDOWN_RETRIES = 5 23 | MAX_TRY_WAIT_MS = 500 24 | 25 | 26 | def _return_value_once(value): 27 | """Return the value passed to it only once.""" 28 | if hasattr(_return_value_once, 'executed'): 29 | raise RuntimeError("this function was called twice") 30 | _return_value_once.executed = True 31 | return value 32 | 33 | 34 | def _noop_function(**kwargs): 35 | """Do nothing except allow kwargs to be passed.""" 36 | pass 37 | 38 | 39 | def _long_running_function(delay): 40 | """Wait for ``delay`` seconds.""" 41 | time.sleep(delay) 42 | 43 | 44 | def _create_two_files_on_disk(value, target_a_path, target_b_path): 45 | """Create two files and write ``value`` and append if possible.""" 46 | with open(target_a_path, 'a') as a_file: 47 | a_file.write(value) 48 | 49 | with open(target_b_path, 'a') as b_file: 50 | b_file.write(value) 51 | 52 | 53 | def _merge_and_append_files(base_a_path, base_b_path, target_path): 54 | """Merge two files and append if possible to new file.""" 55 | with open(target_path, 'a') as target_file: 56 | for base_path in [base_a_path, base_b_path]: 57 | with open(base_path, 'r') as base_file: 58 | target_file.write(base_file.read()) 59 | 60 | 61 | def _create_list_on_disk(value, length, target_path=None): 62 | """Create a numpy array on disk filled with value of ``size``.""" 63 | target_list = [value] * length 64 | pickle.dump(target_list, open(target_path, 'wb')) 65 | 66 | 67 | def _call_it(target, *args): 68 | """Invoke ``target`` with ``args``.""" 69 | target(*args) 70 | 71 | 72 | def _append_val(path, *val): 73 | """Append a ``val`` to file at ``path``.""" 74 | with open(path, 'a') as target_file: 75 | for v in val: 76 | target_file.write(str(v)) 77 | 78 | 79 | def _sum_lists_from_disk(list_a_path, list_b_path, target_path): 80 | """Read two lists, add them and save result.""" 81 | list_a = pickle.load(open(list_a_path, 'rb')) 82 | list_b = pickle.load(open(list_b_path, 'rb')) 83 | target_list = [] 84 | for a, b in zip(list_a, list_b): 85 | target_list.append(a+b) 86 | pickle.dump(target_list, open(target_path, 'wb')) 87 | 88 | 89 | def _div_by_zero(): 90 | """Divide by zero to raise an exception.""" 91 | return 1/0 92 | 93 | 94 | def _create_file(target_path, content): 95 | """Create a file with contents.""" 96 | with open(target_path, 'w') as target_file: 97 | target_file.write(content) 98 | 99 | 100 | def _create_file_once(target_path, content): 101 | """Create a file on the first call, raise an exception on the second.""" 102 | if hasattr(_create_file_once, 'executed'): 103 | raise RuntimeError("this function was called twice") 104 | _create_file_once.executed = True 105 | with open(target_path, 'w') as target_file: 106 | target_file.write(content) 107 | 108 | 109 | def _copy_file_once(base_path, target_path): 110 | """Copy base to target on the first call, raise exception on second.""" 111 | if hasattr(_copy_file_once, 'executed'): 112 | raise RuntimeError("this function 
was called twice") 113 | _copy_file_once.executed = True 114 | shutil.copyfile(base_path, target_path) 115 | 116 | 117 | def _copy_two_files_once(base_path, target_a_path, target_b_path): 118 | """Copy base to target a/b on first call, raise exception on second.""" 119 | if hasattr(_copy_two_files_once, 'executed'): 120 | raise RuntimeError("this function was called twice") 121 | _copy_two_files_once.executed = True 122 | shutil.copyfile(base_path, target_a_path) 123 | shutil.copyfile(base_path, target_b_path) 124 | 125 | 126 | def _log_from_another_process(logger_name, log_message): 127 | """Write a log message to a given logger. 128 | 129 | Args: 130 | logger_name (string): The string logger name to which ``log_message`` 131 | will be logged. 132 | log_message (string): The string log message to be logged (at INFO 133 | level) to the logger at ``logger_name``. 134 | 135 | Returns: 136 | ``None`` 137 | 138 | """ 139 | logger = logging.getLogger(logger_name) 140 | logger.info(log_message) 141 | 142 | 143 | class TaskGraphTests(unittest.TestCase): 144 | """Tests for the taskgraph.""" 145 | 146 | def setUp(self): 147 | """Create temp workspace directory.""" 148 | # this lets us delete the workspace after its done no matter the 149 | # the rest result 150 | self.workspace_dir = tempfile.mkdtemp() 151 | 152 | @retrying.retry( 153 | stop_max_attempt_number=N_TEARDOWN_RETRIES, 154 | wait_exponential_multiplier=250, wait_exponential_max=MAX_TRY_WAIT_MS) 155 | def tearDown(self): 156 | """Remove temporary directory.""" 157 | try: 158 | shutil.rmtree(self.workspace_dir) 159 | except Exception: 160 | LOGGER.exception('error when tearing down.') 161 | raise 162 | 163 | def test_version_loaded(self): 164 | """TaskGraph: verify we can load the version.""" 165 | try: 166 | import taskgraph 167 | 168 | # Verifies that there's a version attribute and it has a value. 169 | self.assertTrue(len(taskgraph.__version__) > 0) 170 | except Exception: 171 | self.fail('Could not load the taskgraph version as expected.') 172 | 173 | def test_single_task(self): 174 | """TaskGraph: Test a single task.""" 175 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0, 0.1) 176 | # forcing this one to be unicode since there shouldn't be a problem 177 | # with that at all... 
178 | target_path = u'%s' % os.path.join(self.workspace_dir, '1000.dat') 179 | value = 5 180 | list_len = 1000 181 | _ = task_graph.add_task( 182 | func=_create_list_on_disk, 183 | args=(value, list_len), 184 | kwargs={ 185 | 'target_path': target_path, 186 | }, 187 | target_path_list=[target_path]) 188 | task_graph.close() 189 | task_graph.join() 190 | result = pickle.load(open(target_path, 'rb')) 191 | self.assertEqual(result, [value]*list_len) 192 | 193 | def test_task_hash_source_deleted(self): 194 | """TaskGraph: test if old target deleted when hashing duplicate.""" 195 | target_a_path = os.path.join(self.workspace_dir, 'a.txt') 196 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 197 | task_a = task_graph.add_task( 198 | func=_create_file, 199 | args=(target_a_path, 'test value'), 200 | target_path_list=[target_a_path]) 201 | task_a.join() 202 | target_b_path = os.path.join(self.workspace_dir, 'b.txt') 203 | _ = task_graph.add_task( 204 | func=_create_file, 205 | args=(target_b_path, 'test value'), 206 | target_path_list=[target_b_path]) 207 | task_graph.close() 208 | task_graph.join() 209 | del task_graph 210 | 211 | os.remove(target_a_path) 212 | os.remove(target_b_path) 213 | 214 | target_c_path = os.path.join(self.workspace_dir, 'c.txt') 215 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 216 | _ = task_graph.add_task( 217 | func=_create_file, 218 | args=(target_c_path, 'test value'), 219 | target_path_list=[target_c_path]) 220 | task_graph.close() 221 | task_graph.join() 222 | 223 | with open(target_c_path, 'r') as target_file: 224 | result = target_file.read() 225 | self.assertEqual(result, 'test value') 226 | 227 | def test_task_rel_vs_absolute(self): 228 | """TaskGraph: test that relative path equates to absolute.""" 229 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 230 | 231 | target_a_path = os.path.relpath(os.path.join( 232 | self.workspace_dir, 'a.txt'), start=self.workspace_dir) 233 | target_b_path = os.path.abspath(target_a_path) 234 | 235 | _ = task_graph.add_task( 236 | func=_create_file, 237 | args=(target_a_path, 'test value'), 238 | target_path_list=[target_a_path], 239 | task_name='task a') 240 | 241 | _ = task_graph.add_task( 242 | func=_create_file, 243 | args=(target_b_path, 'test value'), 244 | target_path_list=[target_b_path], 245 | task_name='task b') 246 | 247 | task_graph.close() 248 | task_graph.join() 249 | del task_graph 250 | 251 | with open(target_a_path, 'r') as a_file: 252 | m = hashlib.md5() 253 | m.update(a_file.read().encode('utf-8')) 254 | a_digest = m.digest() 255 | with open(target_b_path, 'r') as b_file: 256 | m = hashlib.md5() 257 | m.update(b_file.read().encode('utf-8')) 258 | b_digest = m.digest() 259 | self.assertEqual(a_digest, b_digest) 260 | 261 | def test_timeout_task(self): 262 | """TaskGraph: Test timeout functionality.""" 263 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 264 | _ = task_graph.add_task( 265 | func=_long_running_function, 266 | args=(5,)) 267 | task_graph.close() 268 | timedout = not task_graph.join(0.5) 269 | # this should timeout since function runs for 5 seconds 270 | self.assertTrue(timedout) 271 | 272 | def test_precomputed_task(self): 273 | """TaskGraph: Test that a task reuses old results.""" 274 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 275 | target_path = os.path.join(self.workspace_dir, '1000.dat') 276 | value = 5 277 | list_len = 1000 278 | _ = task_graph.add_task( 279 | func=_create_list_on_disk, 280 | args=(value, list_len), 281 | kwargs={ 282 | 
'target_path': target_path, 283 | }, 284 | target_path_list=[target_path]) 285 | task_graph.close() 286 | task_graph.join() 287 | result = pickle.load(open(target_path, 'rb')) 288 | self.assertEqual(result, [value]*list_len) 289 | result_m_time = os.path.getmtime(target_path) 290 | del task_graph 291 | 292 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 293 | _ = task_graph.add_task( 294 | func=_create_list_on_disk, 295 | args=(value, list_len), 296 | kwargs={ 297 | 'target_path': target_path, 298 | }, 299 | target_path_list=[target_path]) 300 | task_graph.close() 301 | task_graph.join() 302 | del task_graph 303 | 304 | # taskgraph shouldn't have recomputed the result 305 | second_result_m_time = os.path.getmtime(target_path) 306 | self.assertEqual(result_m_time, second_result_m_time) 307 | 308 | def test_task_chain(self): 309 | """TaskGraph: Test a task chain.""" 310 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 311 | target_a_path = os.path.join(self.workspace_dir, 'a.dat') 312 | target_b_path = os.path.join(self.workspace_dir, 'b.dat') 313 | result_path = os.path.join(self.workspace_dir, 'result.dat') 314 | result_2_path = os.path.join(self.workspace_dir, 'result2.dat') 315 | value_a = 5 316 | value_b = 10 317 | list_len = 10 318 | task_a = task_graph.add_task( 319 | func=_create_list_on_disk, 320 | args=(value_a, list_len), 321 | kwargs={ 322 | 'target_path': target_a_path, 323 | }, 324 | target_path_list=[target_a_path]) 325 | task_b = task_graph.add_task( 326 | func=_create_list_on_disk, 327 | args=(value_b, list_len), 328 | kwargs={ 329 | 'target_path': target_b_path, 330 | }, 331 | target_path_list=[target_b_path]) 332 | sum_task = task_graph.add_task( 333 | func=_sum_lists_from_disk, 334 | args=(target_a_path, target_b_path), 335 | kwargs={ 336 | 'target_path': result_path, 337 | }, 338 | target_path_list=[result_path], 339 | dependent_task_list=[task_a, task_b]) 340 | sum_task.join() 341 | 342 | result = pickle.load(open(result_path, 'rb')) 343 | self.assertEqual(result, [value_a+value_b]*list_len) 344 | 345 | sum_2_task = task_graph.add_task( 346 | func=_sum_lists_from_disk, 347 | args=(target_a_path, result_path, result_2_path), 348 | target_path_list=[result_2_path], 349 | dependent_task_list=[task_a, sum_task]) 350 | sum_2_task.join() 351 | result2 = pickle.load(open(result_2_path, 'rb')) 352 | expected_result = [(value_a*2+value_b)]*list_len 353 | self.assertEqual(result2, expected_result) 354 | 355 | sum_3_task = task_graph.add_task( 356 | func=_sum_lists_from_disk, 357 | args=(target_a_path, result_path, result_2_path), 358 | target_path_list=[result_2_path], 359 | dependent_task_list=[task_a, sum_task]) 360 | task_graph.close() 361 | sum_3_task.join() 362 | result3 = pickle.load(open(result_2_path, 'rb')) 363 | expected_result = [(value_a*2+value_b)]*list_len 364 | self.assertEqual(result3, expected_result) 365 | task_graph.join() 366 | 367 | def test_task_chain_single_thread(self): 368 | """TaskGraph: Test a single threaded task chain.""" 369 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 370 | target_a_path = os.path.join(self.workspace_dir, 'a.dat') 371 | target_b_path = os.path.join(self.workspace_dir, 'b.dat') 372 | result_path = os.path.join(self.workspace_dir, 'result.dat') 373 | result_2_path = os.path.join(self.workspace_dir, 'result2.dat') 374 | value_a = 5 375 | value_b = 10 376 | list_len = 10 377 | task_a = task_graph.add_task( 378 | func=_create_list_on_disk, 379 | args=(value_a, list_len), 380 | kwargs={ 381 | 
'target_path': target_a_path, 382 | }, 383 | target_path_list=[target_a_path], 384 | task_name='task a') 385 | task_b = task_graph.add_task( 386 | func=_create_list_on_disk, 387 | args=(value_b, list_len), 388 | kwargs={ 389 | 'target_path': target_b_path, 390 | }, 391 | target_path_list=[target_b_path], 392 | task_name='task b') 393 | sum_task = task_graph.add_task( 394 | func=_sum_lists_from_disk, 395 | args=(target_a_path, target_b_path), 396 | kwargs={ 397 | 'target_path': result_path, 398 | }, 399 | target_path_list=[result_path], 400 | dependent_task_list=[task_a, task_b], 401 | task_name='task c') 402 | sum_task.join() 403 | 404 | result = pickle.load(open(result_path, 'rb')) 405 | self.assertEqual(result, [value_a+value_b]*list_len) 406 | 407 | sum_2_task = task_graph.add_task( 408 | func=_sum_lists_from_disk, 409 | args=(target_a_path, result_path, result_2_path), 410 | target_path_list=[result_2_path], 411 | dependent_task_list=[task_a, sum_task], 412 | task_name='task sum_2') 413 | sum_2_task.join() 414 | result2 = pickle.load(open(result_2_path, 'rb')) 415 | expected_result = [(value_a*2+value_b)]*list_len 416 | self.assertEqual(result2, expected_result) 417 | 418 | sum_3_task = task_graph.add_task( 419 | func=_sum_lists_from_disk, 420 | args=(target_a_path, result_path, result_2_path), 421 | target_path_list=[result_2_path], 422 | dependent_task_list=[task_a, sum_task], 423 | task_name='task sum_3') 424 | task_graph.close() 425 | sum_3_task.join() 426 | result3 = pickle.load(open(result_2_path, 'rb')) 427 | expected_result = [(value_a*2+value_b)]*list_len 428 | task_graph.join() 429 | task_graph = None 430 | self.assertEqual(result3, expected_result) 431 | 432 | # we should have 4 completed values in the database, 5 total but one 433 | # was a duplicate 434 | database_path = os.path.join( 435 | self.workspace_dir, taskgraph._TASKGRAPH_DATABASE_FILENAME) 436 | conn = sqlite3.connect(database_path) 437 | with conn: 438 | cursor = conn.cursor() 439 | cursor.execute("SELECT * FROM taskgraph_data") 440 | result = cursor.fetchall() 441 | conn.close() 442 | self.assertEqual(len(result), 4) 443 | 444 | def test_task_broken_chain(self): 445 | """TaskGraph: Test a multiprocess chain with exception raised.""" 446 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 4) 447 | target_a_path = os.path.join(self.workspace_dir, 'a.dat') 448 | target_b_path = os.path.join(self.workspace_dir, 'b.dat') 449 | result_path = os.path.join(self.workspace_dir, 'result.dat') 450 | value_a = 5 451 | list_len = 10 452 | task_a = task_graph.add_task( 453 | func=_create_list_on_disk, 454 | args=(value_a, list_len), 455 | kwargs={ 456 | 'target_path': target_a_path, 457 | }, 458 | target_path_list=[target_a_path]) 459 | task_b = task_graph.add_task( 460 | func=_div_by_zero, 461 | dependent_task_list=[task_a]) 462 | _ = task_graph.add_task( 463 | func=_sum_lists_from_disk, 464 | args=(target_a_path, target_b_path), 465 | kwargs={ 466 | 'target_path': result_path, 467 | }, 468 | target_path_list=[result_path], 469 | dependent_task_list=[task_a, task_b]) 470 | task_graph.close() 471 | 472 | with self.assertRaises(ZeroDivisionError): 473 | task_graph.join() 474 | 475 | def test_broken_task(self): 476 | """TaskGraph: Test that a task with an exception won't hang.""" 477 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 478 | 479 | broken_task = task_graph.add_task( 480 | func=_div_by_zero, task_name='test_broken_task') 481 | with self.assertRaises(ZeroDivisionError): 482 | _ = broken_task.join() 483 | 
484 | task_graph.close() 485 | 486 | with self.assertRaises(ZeroDivisionError): 487 | task_graph.join() 488 | 489 | def test_broken_task_chain(self): 490 | """TaskGraph: test dependent tasks fail on ancestor fail.""" 491 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 4) 492 | 493 | target_path = os.path.join(self.workspace_dir, '1000.dat') 494 | value = 5 495 | list_len = 1000 496 | for task_id in range(1): 497 | target_path = os.path.join( 498 | self.workspace_dir, '1000_%d.dat' % task_id) 499 | normal_task = task_graph.add_task( 500 | func=_create_list_on_disk, 501 | args=(value, list_len), 502 | kwargs={'target_path': target_path}, 503 | target_path_list=[target_path], 504 | task_name='create list on disk %d' % task_id) 505 | zero_div_task = task_graph.add_task( 506 | func=_div_by_zero, 507 | dependent_task_list=[normal_task], 508 | task_name='test_broken_task_chain_%d' % task_id) 509 | target_path = os.path.join( 510 | self.workspace_dir, 'after_zerodiv_1000_%d.dat' % task_id) 511 | _ = task_graph.add_task( 512 | func=_create_list_on_disk, 513 | args=(value, list_len), 514 | kwargs={'target_path': target_path}, 515 | dependent_task_list=[zero_div_task], 516 | target_path_list=[target_path], 517 | task_name='create list on disk after zero div%d' % task_id) 518 | 519 | task_graph.close() 520 | with self.assertRaises(ZeroDivisionError): 521 | task_graph.join() 522 | 523 | def test_empty_task(self): 524 | """TaskGraph: Test an empty task.""" 525 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 526 | _ = task_graph.add_task() 527 | task_graph.close() 528 | task_graph.join() 529 | # we shouldn't have anything in the database 530 | database_path = os.path.join( 531 | self.workspace_dir, taskgraph._TASKGRAPH_DATABASE_FILENAME) 532 | 533 | conn = sqlite3.connect(database_path) 534 | with conn: 535 | cursor = conn.cursor() 536 | cursor.executescript("SELECT * FROM taskgraph_data") 537 | result = cursor.fetchall() 538 | conn.close() 539 | self.assertEqual(len(result), 0) 540 | 541 | def test_closed_graph(self): 542 | """TaskGraph: Test adding to an closed task graph fails.""" 543 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 544 | task_graph.close() 545 | target_path = os.path.join(self.workspace_dir, '1000.dat') 546 | value = 5 547 | list_len = 1000 548 | with self.assertRaises(ValueError): 549 | _ = task_graph.add_task( 550 | func=_create_list_on_disk, 551 | args=(value, list_len), 552 | kwargs={'target_path': target_path}, 553 | target_path_list=[target_path]) 554 | task_graph.join() 555 | 556 | def test_single_task_multiprocessing(self): 557 | """TaskGraph: Test a single task with multiprocessing.""" 558 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 559 | target_path = os.path.join(self.workspace_dir, '1000.dat') 560 | value = 5 561 | list_len = 1000 562 | _ = task_graph.add_task( 563 | func=_create_list_on_disk, 564 | args=(value, list_len), 565 | kwargs={ 566 | 'target_path': target_path, 567 | }, 568 | target_path_list=[target_path]) 569 | task_graph.close() 570 | task_graph.join() 571 | result = pickle.load(open(target_path, 'rb')) 572 | self.assertEqual(result, [value]*list_len) 573 | 574 | def test_get_file_stats(self): 575 | """TaskGraph: Test _get_file_stats subroutine.""" 576 | from taskgraph.Task import _get_file_stats 577 | test_dir = os.path.join(self.workspace_dir, 'test_dir') 578 | test_file = os.path.join(test_dir, 'test_file.txt') 579 | os.mkdir(test_dir) 580 | with open(test_file, 'w') as f: 581 | f.write('\n') 582 | nofile = 
os.path.join(self.workspace_dir, 'nofile') 583 | base_value = [ 584 | nofile, test_dir, test_file, 585 | 10, {'a': {'b': test_file}}, {'a': {'b': test_dir, 'foo': 9}}] 586 | ignore_dir_result = list(_get_file_stats( 587 | base_value, 'sizetimestamp', [], True)) 588 | # should get two results if we ignore the directories because there's 589 | # only two files 590 | self.assertEqual(len(ignore_dir_result), 2) 591 | dir_result = list(_get_file_stats( 592 | base_value, 'sizetimestamp', [], False)) 593 | # should get four results if we track directories because of two files 594 | # and two directories 595 | self.assertEqual(len(dir_result), 4) 596 | 597 | result = list(_get_file_stats(nofile, 'sizetimestamp', [], False)) 598 | self.assertEqual(result, []) 599 | 600 | def test_transient_runs(self): 601 | """TaskGraph: ensure that transent tasks reexecute.""" 602 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 603 | target_path = os.path.join(self.workspace_dir, '1000.dat') 604 | value = 5 605 | list_len = 1000 606 | _ = task_graph.add_task( 607 | func=_create_list_on_disk, 608 | args=(value, list_len), 609 | kwargs={ 610 | 'target_path': target_path, 611 | }) 612 | task_graph.close() 613 | task_graph.join() 614 | task_graph = None 615 | 616 | os.remove(target_path) 617 | 618 | task_graph2 = taskgraph.TaskGraph(self.workspace_dir, -1) 619 | _ = task_graph2.add_task( 620 | func=_create_list_on_disk, 621 | args=(value, list_len), 622 | transient_run=True, 623 | kwargs={ 624 | 'target_path': target_path, 625 | }) 626 | 627 | task_graph2.close() 628 | task_graph2.join() 629 | 630 | self.assertTrue( 631 | os.path.exists(target_path), 632 | "Expected file to exist because taskgraph should have re-run.") 633 | 634 | def test_repeat_targeted_runs(self): 635 | """TaskGraph: ensure that repeated runs with targets can join.""" 636 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 637 | target_path = os.path.join(self.workspace_dir, '1000.dat') 638 | value = 5 639 | list_len = 1000 640 | _ = task_graph.add_task( 641 | func=_create_list_on_disk, 642 | args=(value, list_len), 643 | kwargs={ 644 | 'target_path': target_path, 645 | }, 646 | target_path_list=[target_path]) 647 | task_graph.close() 648 | task_graph.join() 649 | task_graph = None 650 | 651 | task_graph2 = taskgraph.TaskGraph(self.workspace_dir, -1) 652 | task = task_graph2.add_task( 653 | func=_create_list_on_disk, 654 | args=(value, list_len), 655 | kwargs={ 656 | 'target_path': target_path, 657 | }, 658 | target_path_list=[target_path]) 659 | self.assertTrue(task.join(1.0), "join failed after 1 second") 660 | task_graph2.close() 661 | task_graph2.join() 662 | 663 | def test_task_equality(self): 664 | """TaskGraph: test correctness of == and != for Tasks.""" 665 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 666 | target_path = os.path.join(self.workspace_dir, '1000.dat') 667 | value = 5 668 | list_len = 1000 669 | task_a = task_graph.add_task( 670 | func=_create_list_on_disk, 671 | args=(value, list_len), 672 | kwargs={'target_path': target_path}, 673 | target_path_list=[target_path]) 674 | task_a_same = task_graph.add_task( 675 | func=_create_list_on_disk, 676 | args=(value, list_len), 677 | kwargs={'target_path': target_path}, 678 | target_path_list=[target_path]) 679 | task_b = task_graph.add_task( 680 | func=_create_list_on_disk, 681 | args=(value+1, list_len), 682 | kwargs={'target_path': target_path}, 683 | target_path_list=[target_path]) 684 | 685 | self.assertTrue(task_a == task_a) 686 | 
self.assertTrue(task_a == task_a_same) 687 | self.assertTrue(task_a != task_b) 688 | 689 | def test_async_logging(self): 690 | """TaskGraph: ensure async logging can execute.""" 691 | task_graph = taskgraph.TaskGraph( 692 | self.workspace_dir, 0, reporting_interval=0.5) 693 | _ = task_graph.add_task( 694 | func=_long_running_function, 695 | args=(1.0,)) 696 | task_graph.close() 697 | task_graph.join() 698 | timedout = not task_graph.join(5) 699 | # this should not timeout since function runs for 1 second 700 | self.assertFalse(timedout, "task timed out") 701 | 702 | def test_scrub(self): 703 | """TaskGraph: ensure scrub is not scrubbing base types.""" 704 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 705 | 706 | target_path = os.path.join(self.workspace_dir, 'a.txt') 707 | first_task = task_graph.add_task( 708 | func=_append_val, 709 | args=(target_path, 1, [1], {'x': 1}), 710 | task_name='first append') 711 | 712 | second_task = task_graph.add_task( 713 | func=_append_val, 714 | args=(target_path, 1, [1], {'x': 2}), 715 | dependent_task_list=[first_task], 716 | task_name='second append') 717 | 718 | _ = task_graph.add_task( 719 | func=_append_val, 720 | args=(target_path, 1, [2], {'x': 1}), 721 | dependent_task_list=[second_task], 722 | task_name='third append') 723 | 724 | task_graph.close() 725 | task_graph.join() 726 | 727 | with open(target_path, 'r') as target_file: 728 | file_value = target_file.read() 729 | self.assertEqual("1[1]{'x': 1}1[1]{'x': 2}1[2]{'x': 1}", file_value) 730 | 731 | def test_target_path_order(self): 732 | """TaskGraph: ensure target path order doesn't matter.""" 733 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 734 | target_a_path = os.path.join(self.workspace_dir, 'a.txt') 735 | target_b_path = os.path.join(self.workspace_dir, 'b.txt') 736 | 737 | task_graph.add_task( 738 | func=_create_two_files_on_disk, 739 | args=("word", target_a_path, target_b_path), 740 | target_path_list=[target_a_path, target_b_path]) 741 | 742 | task_graph.add_task( 743 | func=_create_two_files_on_disk, 744 | args=("word", target_a_path, target_b_path), 745 | target_path_list=[target_b_path, target_a_path]) 746 | 747 | task_graph.close() 748 | task_graph.join() 749 | 750 | with open(target_a_path, 'r') as a_file: 751 | a_value = a_file.read() 752 | 753 | with open(target_b_path, 'r') as b_file: 754 | b_value = b_file.read() 755 | 756 | self.assertEqual(a_value, "word") 757 | self.assertEqual(b_value, "word") 758 | 759 | def test_task_hash_when_ready(self): 760 | """TaskGraph: ensure tasks don't record execution info until ready.""" 761 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 762 | target_a_path = os.path.join(self.workspace_dir, 'a.txt') 763 | target_b_path = os.path.join(self.workspace_dir, 'b.txt') 764 | 765 | create_files_task = task_graph.add_task( 766 | func=_create_two_files_on_disk, 767 | args=("word", target_a_path, target_b_path), 768 | target_path_list=[target_a_path, target_b_path]) 769 | 770 | target_merged_path = os.path.join(self.workspace_dir, 'merged.txt') 771 | task_graph.add_task( 772 | func=_merge_and_append_files, 773 | args=(target_a_path, target_b_path, target_merged_path), 774 | target_path_list=[target_merged_path], 775 | dependent_task_list=[create_files_task]) 776 | 777 | task_graph.join() 778 | 779 | # this second task shouldn't execute because it's a copy of the first 780 | task_graph.add_task( 781 | func=_merge_and_append_files, 782 | args=(target_a_path, target_b_path, target_merged_path), 783 | 
target_path_list=[target_merged_path], 784 | dependent_task_list=[create_files_task]) 785 | 786 | task_graph.close() 787 | task_graph.join() 788 | 789 | with open(target_merged_path, 'r') as target_file: 790 | target_string = target_file.read() 791 | 792 | self.assertEqual(target_string, "wordword") 793 | 794 | def test_multiprocessed_logging(self): 795 | """TaskGraph: ensure tasks can log from multiple processes.""" 796 | logger_name = 'test.task.queuelogger' 797 | log_message = 'This is coming from another process' 798 | logger = logging.getLogger(logger_name) 799 | logger.setLevel(logging.DEBUG) 800 | file_log_path = os.path.join( 801 | self.workspace_dir, 'test_multiprocessed_logging.log') 802 | file_handler = logging.FileHandler(file_log_path) 803 | file_handler.setFormatter( 804 | logging.Formatter(fmt=':%(processName)s:%(message)s:')) 805 | logger.addHandler(file_handler) 806 | 807 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 808 | log_task = task_graph.add_task( 809 | func=_log_from_another_process, 810 | args=(logger_name, log_message)) 811 | log_task.join() 812 | file_handler.flush() 813 | task_graph.close() 814 | task_graph.join() 815 | file_handler.close() 816 | 817 | @retrying.retry(wait_exponential_multiplier=100, 818 | wait_exponential_max=1000, 819 | stop_max_attempt_number=5) 820 | def get_name_and_message(): 821 | with open(file_log_path, 'r') as log_file: 822 | message = log_file.read().rstrip() 823 | print(message) 824 | process_name, logged_message = re.match( 825 | ':([^:]*):([^:]*):', message).groups() 826 | return process_name, logged_message 827 | 828 | process_name, logged_message = get_name_and_message() 829 | self.assertEqual(logged_message, log_message) 830 | self.assertNotEqual( 831 | process_name, multiprocessing.current_process().name) 832 | 833 | def test_repeated_function(self): 834 | """TaskGraph: ensure no reruns if argument is a function.""" 835 | global _append_val 836 | 837 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 838 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 839 | task_graph.add_task( 840 | func=_call_it, 841 | args=(_append_val, target_path, 1), 842 | target_path_list=[target_path], 843 | ignore_path_list=[target_path], 844 | task_name='first _call_it') 845 | task_graph.close() 846 | task_graph.join() 847 | del task_graph 848 | 849 | # this causes the address to change 850 | def _append_val(path, *val): 851 | """Append a ``val`` to file at ``path``.""" 852 | with open(path, 'a') as target_file: 853 | for v in val: 854 | target_file.write(str(v)) 855 | 856 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 857 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 858 | task_graph.add_task( 859 | func=_call_it, 860 | args=(_append_val, target_path, 1), 861 | target_path_list=[target_path], 862 | ignore_path_list=[target_path], 863 | task_name='second _call_it') 864 | task_graph.close() 865 | task_graph.join() 866 | 867 | with open(target_path, 'r') as target_file: 868 | result = target_file.read() 869 | 870 | # the second call shouldn't happen 871 | self.assertEqual(result, '1') 872 | 873 | def test_unix_path_repeated_function(self): 874 | """TaskGraph: ensure no reruns if path is unix style.""" 875 | global _append_val 876 | _append_val = _append_val # flake8 complains if not defined 877 | 878 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 879 | target_dir = self.workspace_dir + '/foo/bar/rad/' 880 | os.makedirs(target_dir) 881 | target_path = target_dir + 
'/testfile.txt' 882 | task_graph.add_task( 883 | func=_call_it, 884 | args=(_append_val, target_path, 1), 885 | target_path_list=[target_path], 886 | task_name='first _call_it') 887 | task_graph.close() 888 | task_graph.join() 889 | del task_graph 890 | 891 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 892 | task_graph.add_task( 893 | func=_call_it, 894 | args=(_append_val, target_path, 1), 895 | target_path_list=[target_path], 896 | task_name='second _call_it') 897 | task_graph.close() 898 | task_graph.join() 899 | 900 | with open(target_path, 'r') as target_file: 901 | result = target_file.read() 902 | 903 | # the second call shouldn't happen 904 | self.assertEqual(result, '1') 905 | 906 | def test_very_long_string(self): 907 | """TaskGraph: ensure that long strings don't case an OSError.""" 908 | from taskgraph.Task import _get_file_stats 909 | 910 | # this is a list with two super long strings to try to trick some 911 | # os function into thinking it's a path. 912 | base_value = [ 913 | 'c:' + r'\\\\\\\\x\\\\\\\\'*2**10 + 'foo', 914 | 'wfeji3223j8923j9' * 2**10] 915 | self.assertEqual( 916 | list(_get_file_stats(base_value, 'sizetimestamp', [], True)), []) 917 | 918 | def test_duplicate_call_changed_target(self): 919 | """TaskGraph: test that duplicate calls copy target path.""" 920 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 921 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 922 | 923 | if hasattr(_create_file_once, 'executed'): 924 | del _create_file_once.executed 925 | 926 | task_graph.add_task( 927 | func=_create_file_once, 928 | args=(target_path, 'test'), 929 | target_path_list=[target_path], 930 | hash_target_files=False, 931 | task_name='first _create_file_once') 932 | 933 | task_graph.close() 934 | task_graph.join() 935 | del task_graph 936 | 937 | with open(target_path, 'a') as target_file: 938 | target_file.write('updated') 939 | 940 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 941 | task_graph.add_task( 942 | func=_create_file_once, 943 | args=(target_path, 'test'), 944 | target_path_list=[target_path], 945 | hash_target_files=False, 946 | task_name='first _create_file_once') 947 | 948 | task_graph.close() 949 | task_graph.join() 950 | del task_graph 951 | 952 | with open(target_path, 'r') as result_file: 953 | result_contents = result_file.read() 954 | self.assertEqual('testupdated', result_contents) 955 | 956 | def test_duplicate_call_modify_timestamp(self): 957 | """TaskGraph: test that duplicate call modified stamp recompute.""" 958 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 959 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 960 | task_graph.add_task( 961 | func=_create_file, 962 | args=(target_path, 'test'), 963 | target_path_list=[target_path], 964 | task_name='first _create_file') 965 | task_graph.close() 966 | task_graph.join() 967 | del task_graph 968 | 969 | with open(target_path, 'w') as target_file: 970 | target_file.write('test2') 971 | with open(target_path, 'r') as target_file: 972 | contents = target_file.read() 973 | self.assertEqual(contents, 'test2') 974 | 975 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 976 | task_graph.add_task( 977 | func=_create_file, 978 | args=(target_path, 'test'), 979 | target_path_list=[target_path], 980 | task_name='second _create_file') 981 | 982 | task_graph.close() 983 | task_graph.join() 984 | 985 | with open(target_path, 'r') as target_file: 986 | contents = target_file.read() 987 | self.assertEqual(contents, 'test') 988 
| 989 | def test_different_target_path_list(self): 990 | """TaskGraph: duplicate calls with different targets should fail.""" 991 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 992 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 993 | task_graph.add_task( 994 | func=_create_list_on_disk, 995 | args=('test', 1, target_path), 996 | target_path_list=[target_path], 997 | task_name='first _create_list_on_disk') 998 | 999 | with self.assertRaises(RuntimeError): 1000 | # make the same call but with different target path list 1001 | task_graph.add_task( 1002 | func=_create_list_on_disk, 1003 | args=('test', 1, target_path), 1004 | target_path_list=[target_path, 'test.txt'], 1005 | task_name='first _create_list_on_disk') 1006 | 1007 | task_graph.close() 1008 | task_graph.join() 1009 | 1010 | def test_terminated_taskgraph(self): 1011 | """TaskGraph: terminated task graph raises exception correctly.""" 1012 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 4) 1013 | _ = task_graph.add_task(func=_div_by_zero) 1014 | with self.assertRaises(ZeroDivisionError): 1015 | task_graph.join() 1016 | 1017 | with self.assertRaises(RuntimeError) as cm: 1018 | _ = task_graph.add_task(func=_div_by_zero) 1019 | expected_message = "add_task when Taskgraph is terminated" 1020 | actual_message = str(cm.exception) 1021 | self.assertTrue(expected_message in actual_message, actual_message) 1022 | 1023 | task_graph.close() 1024 | # try closing twice just to mess with coverage 1025 | task_graph.close() 1026 | 1027 | def test_type_list_error(self): 1028 | """TaskGraph: Task not passed to dependent task list.""" 1029 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1030 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 1031 | with self.assertRaises(ValueError) as cm: 1032 | task_graph.add_task( 1033 | func=_create_list_on_disk, 1034 | args=('test', 1, target_path), 1035 | target_path_list=[target_path], 1036 | dependent_task_list=[target_path], 1037 | task_name='first _create_list_on_disk') 1038 | expected_message = ( 1039 | "Objects passed to dependent task list that are not tasks") 1040 | actual_message = str(cm.exception) 1041 | self.assertTrue(expected_message in actual_message, actual_message) 1042 | 1043 | def test_target_list_error(self): 1044 | """TaskGraph: Path not passed to target list.""" 1045 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1046 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 1047 | with self.assertRaises(ValueError) as cm: 1048 | task_graph.add_task( 1049 | func=_create_list_on_disk, 1050 | args=('test', 1, target_path), 1051 | target_path_list=[1], 1052 | task_name='_create_list_on_disk') 1053 | expected_message = ( 1054 | "Values passed to target_path_list are not strings") 1055 | actual_message = str(cm.exception) 1056 | self.assertTrue(expected_message in actual_message, actual_message) 1057 | 1058 | def test_target_path_missing_file(self): 1059 | """TaskGraph: func runs, but missing target.""" 1060 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1061 | target_path = os.path.join(self.workspace_dir, 'testfile.txt') 1062 | not_target_path = os.path.join(self.workspace_dir, 'not_target.txt') 1063 | with self.assertRaises(RuntimeError) as cm: 1064 | task_graph.add_task( 1065 | func=_create_list_on_disk, 1066 | args=('test', 1, target_path), 1067 | target_path_list=[not_target_path], 1068 | task_name='_create_list_on_disk') 1069 | expected_message = "Missing expected target path results" 1070 | 
actual_message = str(cm.exception) 1071 | self.assertTrue(expected_message in actual_message, actual_message) 1072 | 1073 | def test_expected_path_list(self): 1074 | """TaskGraph: test expected path list matches actual path list.""" 1075 | def _create_file(target_path, content): 1076 | with open(target_path, 'w') as target_file: 1077 | target_file.write(content) 1078 | 1079 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1080 | # note it is important this is a relative path that does not 1081 | # contain the drive letter on Windows. 1082 | absolute_target_file_path = os.path.join( 1083 | self.workspace_dir, 'a.txt') 1084 | relative_path = os.path.relpath(absolute_target_file_path, 1085 | start=self.workspace_dir) 1086 | 1087 | _ = task_graph.add_task( 1088 | func=_create_file, 1089 | args=(relative_path, 'test value'), 1090 | target_path_list=[relative_path], 1091 | task_name='create file') 1092 | 1093 | task_graph.close() 1094 | task_graph.join() 1095 | del task_graph 1096 | 1097 | self.assertTrue('Ran without crashing!') 1098 | 1099 | def test_kwargs_hashed(self): 1100 | """TaskGraph: ensure kwargs are considered in determining id hash.""" 1101 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1102 | 1103 | task_a = task_graph.add_task( 1104 | func=_noop_function, 1105 | kwargs={ 1106 | 'content': ['this value: a']}, 1107 | task_name='noop a') 1108 | 1109 | task_b = task_graph.add_task( 1110 | func=_noop_function, 1111 | kwargs={ 1112 | 'content': ['this value b']}, 1113 | task_name='noop b') 1114 | 1115 | task_graph.close() 1116 | task_graph.join() 1117 | del task_graph 1118 | 1119 | self.assertNotEqual( 1120 | task_a._task_id_hash, task_b._task_id_hash, 1121 | "task ids should be different since the kwargs are different") 1122 | 1123 | def test_same_timestamp_and_value(self): 1124 | """TaskGraph: ensure identical files but filename are noticed.""" 1125 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1126 | 1127 | file_a_path = os.path.join(self.workspace_dir, 'file_a.txt') 1128 | file_b_path = os.path.join(self.workspace_dir, 'file_b.txt') 1129 | 1130 | with open(file_a_path, 'w') as file_a: 1131 | file_a.write('a') 1132 | with open(file_b_path, 'w') as file_b: 1133 | file_b.write('a') 1134 | 1135 | os.utime(file_a_path, (0, 0)) 1136 | os.utime(file_b_path, (0, 0)) 1137 | 1138 | task_a = task_graph.add_task( 1139 | func=_noop_function, 1140 | kwargs={ 1141 | 'path': file_a_path}, 1142 | task_name='noop a') 1143 | 1144 | task_b = task_graph.add_task( 1145 | func=_noop_function, 1146 | kwargs={ 1147 | 'path': file_b_path}, 1148 | task_name='noop b') 1149 | 1150 | task_graph.close() 1151 | task_graph.join() 1152 | del task_graph 1153 | 1154 | self.assertNotEqual( 1155 | task_a._task_id_hash, task_b._task_id_hash, 1156 | "task ids should be different since the filenames are different") 1157 | 1158 | def test_different_hash_different_file(self): 1159 | """TaskGraph: ensure identical files but filename are noticed.""" 1160 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1161 | target_file_path = os.path.join(self.workspace_dir, 'target.txt') 1162 | _ = task_graph.add_task( 1163 | func=_create_file, 1164 | args=(target_file_path, 'content'), 1165 | hash_algorithm='exists', 1166 | target_path_list=[target_file_path], 1167 | task_name='create content, hash with exists') 1168 | task_graph.close() 1169 | task_graph.join() 1170 | del task_graph 1171 | 1172 | with open(target_file_path, 'r') as target_file: 1173 | 
self.assertEqual(target_file.read(), 'content') 1174 | with open(target_file_path, 'w') as target_file: 1175 | target_file.write('overwritten') 1176 | 1177 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1178 | _ = task_graph.add_task( 1179 | func=_create_file, 1180 | args=(target_file_path, 'content'), 1181 | hash_algorithm='exists', 1182 | target_path_list=[target_file_path], 1183 | task_name='will not overwrite content, hash with exists') 1184 | task_graph.close() 1185 | task_graph.join() 1186 | del task_graph 1187 | 1188 | with open(target_file_path, 'r') as target_file: 1189 | self.assertEqual(target_file.read(), 'overwritten') 1190 | 1191 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0) 1192 | _ = task_graph.add_task( 1193 | func=_create_file, 1194 | args=(target_file_path, 'content'), 1195 | hash_algorithm='md5', 1196 | target_path_list=[target_file_path], 1197 | task_name='create content again with new hash') 1198 | task_graph.close() 1199 | task_graph.join() 1200 | del task_graph 1201 | 1202 | with open(target_file_path, 'r') as target_file: 1203 | self.assertEqual(target_file.read(), 'content') 1204 | 1205 | def test_return_value_no_record(self): 1206 | """TaskGraph: test ``get`` raises exception if not set to record.""" 1207 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1208 | value_task = task_graph.add_task( 1209 | func=_noop_function, 1210 | store_result=False) 1211 | 1212 | # get wil raise a ValueError because store_result is not True 1213 | with self.assertRaises(ValueError) as cm: 1214 | _ = value_task.get() 1215 | expected_message = 'must set `store_result` to True in `add_task`' 1216 | actual_message = str(cm.exception) 1217 | self.assertTrue(expected_message in actual_message, actual_message) 1218 | 1219 | def test_return_value(self): 1220 | """TaskGraph: test that ``.get`` behavior works as expected.""" 1221 | if hasattr(_return_value_once, 'executed'): 1222 | del _return_value_once.executed 1223 | n_iterations = 3 1224 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0, 0) 1225 | for iteration_id in range(n_iterations): 1226 | transient_run = iteration_id == n_iterations-1 1227 | LOGGER.debug(iteration_id) 1228 | expected_value = 'a good value' 1229 | value_task = task_graph.add_task( 1230 | func=_return_value_once, 1231 | transient_run=transient_run, 1232 | store_result=True, 1233 | args=(expected_value,), 1234 | task_name=f'{expected_value} iter {iteration_id}') 1235 | value = value_task.get() 1236 | self.assertEqual(value, expected_value) 1237 | task_graph.close() 1238 | task_graph.join() 1239 | task_graph = None 1240 | 1241 | # reset run 1242 | del _return_value_once.executed 1243 | for iteration_id in range(n_iterations): 1244 | LOGGER.debug(iteration_id) 1245 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0, 0) 1246 | expected_value = 'transient run' 1247 | if iteration_id == 0: 1248 | value_task = task_graph.add_task( 1249 | func=_return_value_once, 1250 | transient_run=True, 1251 | store_result=True, 1252 | args=(expected_value,), 1253 | task_name='first re-run transient') 1254 | value = value_task.get() 1255 | self.assertEqual(value, expected_value) 1256 | task_graph.close() 1257 | task_graph.join() 1258 | else: 1259 | with self.assertRaises(RuntimeError): 1260 | value_task = task_graph.add_task( 1261 | func=_return_value_once, 1262 | transient_run=True, 1263 | store_result=True, 1264 | args=(expected_value,), 1265 | task_name=f'expected error {iteration_id}') 1266 | 1267 | value = value_task.get() 1268 | 
1269 | with self.assertRaises(RuntimeError): 1270 | task_graph.join() 1271 | 1272 | task_graph = None 1273 | 1274 | def test_malformed_taskgraph_database(self): 1275 | """TaskGraph: Test an empty task.""" 1276 | db_schema_test_list = [ 1277 | ''' 1278 | CREATE TABLE taskgraph_data ( 1279 | bad_name_1 TEXT NOT NULL, 1280 | bad_name_2 BLOB NOT NULL, 1281 | bad_name_3 BLOB NOT NULL); 1282 | ''', 1283 | ''' 1284 | CREATE TABLE taskgraph_data ( 1285 | task_reexecution_hash TEXT NOT NULL, 1286 | target_path_stats BLOB NOT NULL); 1287 | ''', 1288 | ''' 1289 | CREATE TABLE bad_table_name ( 1290 | task_reexecution_hash TEXT NOT NULL, 1291 | target_path_stats BLOB NOT NULL, 1292 | result BLOB NOT NULL, 1293 | PRIMARY KEY (task_reexecution_hash)); 1294 | ''' 1295 | ] 1296 | 1297 | for db_schema in db_schema_test_list: 1298 | database_path = os.path.join( 1299 | self.workspace_dir, taskgraph._TASKGRAPH_DATABASE_FILENAME) 1300 | if os.path.exists(database_path): 1301 | os.remove(database_path) 1302 | connection = sqlite3.connect(database_path) 1303 | cursor = connection.cursor() 1304 | cursor.executescript(db_schema) 1305 | cursor.close() 1306 | connection.commit() 1307 | connection.close() 1308 | 1309 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 0) 1310 | _ = task_graph.add_task() 1311 | task_graph.close() 1312 | task_graph.join() 1313 | del task_graph 1314 | 1315 | expected_column_name_list = [ 1316 | 'task_reexecution_hash', 'target_path_stats', 'result'] 1317 | connection = sqlite3.connect(database_path) 1318 | cursor = connection.cursor() 1319 | cursor.execute('PRAGMA table_info(taskgraph_data)') 1320 | result = list(cursor.fetchall()) 1321 | cursor.close() 1322 | connection.commit() 1323 | connection.close() 1324 | for header_line in result: 1325 | column_name = header_line[1] 1326 | if column_name not in expected_column_name_list: 1327 | raise ValueError( 1328 | f'unexpected column name {column_name} in ' 1329 | 'taskgraph_data ') 1330 | self.assertEqual(len(result), len(expected_column_name_list)) 1331 | 1332 | def test_terminate_log(self): 1333 | """TaskGraph: test that the logger thread terminates on .join.""" 1334 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1, 5.0) 1335 | _ = task_graph.add_task() 1336 | task_graph.join() 1337 | 1338 | # logger should not terminate until after join, give it enough time 1339 | # to have a chance to close, but not so long the test hangs 1340 | task_graph._logging_monitor_thread.join(0.1) 1341 | self.assertTrue(task_graph._logging_monitor_thread.is_alive()) 1342 | task_graph._execution_monitor_thread.join(0.1) 1343 | self.assertTrue(task_graph._execution_monitor_thread.is_alive()) 1344 | 1345 | task_graph.close() 1346 | task_graph.join() 1347 | 1348 | # 5 seconds should be way too much time to expect the thread to join 1349 | task_graph._logging_monitor_thread.join(5) 1350 | self.assertFalse(task_graph._logging_monitor_thread.is_alive()) 1351 | task_graph._execution_monitor_thread.join(5) 1352 | self.assertFalse(task_graph._execution_monitor_thread.is_alive()) 1353 | 1354 | def test_dictionary_arguments(self): 1355 | """TaskGraph: test that large dictionary arguments behave well.""" 1356 | task_graph = taskgraph.TaskGraph(self.workspace_dir, -1) 1357 | dict_arg = {} 1358 | x = {None: None} 1359 | for _ in range(10000): 1360 | dict_arg[_] = x 1361 | 1362 | def my_op(dict_arg): 1363 | pass 1364 | task_graph.add_task( 1365 | func=my_op, args=(), kwargs={'dict_arg': dict_arg}) 1366 | task_graph.join() 1367 | self.assertTrue(True, 'no memory 
error so everything is fine') 1368 | 1369 | def test_filter_non_files(self): 1370 | """TaskGraph: test internal filter non-files function.""" 1371 | from taskgraph.Task import _filter_non_files 1372 | from taskgraph.Task import _normalize_path 1373 | 1374 | # Test a passthrough 1375 | test_dict = { 1376 | 0: {'one': 0, 'two': 1, 'three': 2}, 1377 | 1: {'one': 1, 'two': 2, 'three': 3}, 1378 | 2: {'one': 2, 'two': 3, 'three': 4}} 1379 | self.assertEqual( 1380 | test_dict, _filter_non_files(test_dict, [], [], False)) 1381 | 1382 | # Test combination of files, not existing files, and flags in the 1383 | # call 1384 | test_file_a_exists = _normalize_path(os.path.join( 1385 | self.workspace_dir, 'exists_a.txt')) 1386 | pathlib.Path(test_file_a_exists).touch() 1387 | test_file_b_exists = _normalize_path(os.path.join( 1388 | self.workspace_dir, 'exists_b.txt')) 1389 | pathlib.Path(test_file_b_exists).touch() 1390 | test_file_not_a_exists = _normalize_path(os.path.join( 1391 | self.workspace_dir, 'does_not_exist_a.txt')) 1392 | test_file_not_b_exists = _normalize_path(os.path.join( 1393 | self.workspace_dir, 'does_not_exist_b.txt')) 1394 | 1395 | test_dict = { 1396 | 0: {'one': 0, 'two': 1, 'three': 2}, 1397 | 1: {'one': 1, 'two': 2, 'three': 3}, 1398 | 2: {'one': 2, 'two': 3, 'three': 4}, 1399 | 4: {'bar': test_file_not_a_exists}, 1400 | 5: {'foo': test_file_a_exists}, 1401 | 6: test_file_b_exists, 1402 | 7: test_file_not_b_exists, 1403 | 8: _normalize_path(self.workspace_dir)} 1404 | 1405 | expected_result_dict = { 1406 | 0: {'one': 0, 'two': 1, 'three': 2}, 1407 | 1: {'one': 1, 'two': 2, 'three': 3}, 1408 | 2: {'one': 2, 'two': 3, 'three': 4}, 1409 | 4: {'bar': test_file_not_a_exists}, 1410 | 5: {'foo': None}, 1411 | 6: test_file_b_exists, 1412 | 7: None, 1413 | 8: _normalize_path(self.workspace_dir)} 1414 | 1415 | self.assertEqual( 1416 | _filter_non_files( 1417 | test_dict, 1418 | [test_file_b_exists], 1419 | [test_file_not_b_exists], 1420 | True), 1421 | expected_result_dict) 1422 | 1423 | # and test same as above but don't keep directories: 1424 | expected_result_dict[8] = None 1425 | self.assertEqual( 1426 | _filter_non_files( 1427 | test_dict, 1428 | [test_file_b_exists], 1429 | [test_file_not_b_exists], 1430 | False), 1431 | expected_result_dict) 1432 | 1433 | def test_duplicate_task_hang_on_exit(self): 1434 | """TaskGraph: ensure duplicate tasks don't cause taskgraph to hang.""" 1435 | task_graph = taskgraph.TaskGraph(self.workspace_dir, 1) 1436 | target_path = os.path.join(self.workspace_dir, 'target.txt') 1437 | content = 'test' 1438 | for _ in range(10): 1439 | _ = task_graph.add_task( 1440 | func=_create_file, 1441 | args=(target_path, content), 1442 | target_path_list=[target_path], 1443 | task_name='create content') 1444 | task_graph.join() 1445 | task_graph.close() 1446 | 1447 | def test_history_rst_format(self): 1448 | """TaskGraph: ensure HISTORY.rst is correctly formatted.""" 1449 | # ensure there are no errors when checking the history file 1450 | history_filepath = os.path.join( 1451 | os.path.dirname(__file__), '..', 'HISTORY.rst') 1452 | subprocess.check_call(['rstcheck', history_filepath]) 1453 | 1454 | def test_mtime_mismatch(self): 1455 | """TaskGraph: ensure re-run when file mtimes don't match. 1456 | 1457 | This test addresses the issue described under the github issue 1458 | https://github.com/natcap/taskgraph/issues/70. 
1459 | """ 1460 | target_path = os.path.join(self.workspace_dir, 'target.txt') 1461 | 1462 | # SETUP: When we call 3 similar graphs in rapid succession, the file's 1463 | # mtime is not precise enough to detect that the file has actually 1464 | # changed. The specific conditions here are: 1465 | # * The task "test text" has already been computed once 1466 | # * The file written by the first "test text" is replaced by content 1467 | # of the same filesize (thus fooling the size part of sizetimestamp) 1468 | # * The graphs are executed fast enough that _is_precalculated's mtime 1469 | # check via math.isclose() couldn't detect the recalculation. 1470 | for content in ('test text', 'TEST TEXT', 'test text'): 1471 | task_graph = taskgraph.TaskGraph(self.workspace_dir, n_workers=-1) 1472 | _ = task_graph.add_task( 1473 | func=_create_file, 1474 | args=(target_path, content), 1475 | target_path_list=[target_path], 1476 | task_name='create content') 1477 | task_graph.join() 1478 | task_graph.close() 1479 | 1480 | with open(target_path) as target_file: 1481 | self.assertEqual(target_file.read(), content) 1482 | 1483 | 1484 | def Fail(n_tries, result_path): 1485 | """Create a function that fails after ``n_tries``.""" 1486 | def fail_func(): 1487 | fail_func._n_tries -= 1 1488 | if fail_func._n_tries > 0: 1489 | raise ValueError("Fail %d more times", fail_func._n_tries) 1490 | with open(result_path, 'w') as result_file: 1491 | result_file.write("finished!") 1492 | fail_func._n_tries = n_tries 1493 | 1494 | return fail_func 1495 | -------------------------------------------------------------------------------- /taskgraph/Task.py: -------------------------------------------------------------------------------- 1 | """Task graph framework.""" 2 | import collections 3 | import hashlib 4 | import inspect 5 | import logging 6 | import logging.handlers 7 | import multiprocessing 8 | import multiprocessing.pool 9 | import os 10 | import pathlib 11 | import pickle 12 | import pprint 13 | import queue 14 | import sqlite3 15 | import threading 16 | import time 17 | try: 18 | from importlib.metadata import PackageNotFoundError 19 | from importlib.metadata import version 20 | except ImportError: 21 | # importlib.metadata added to stdlib in 3.8 22 | from importlib_metadata import PackageNotFoundError 23 | from importlib_metadata import version 24 | 25 | import retrying 26 | 27 | try: 28 | __version__ = version('taskgraph') 29 | except PackageNotFoundError: 30 | # package is not installed; no metadata available 31 | pass 32 | 33 | 34 | _VALID_PATH_TYPES = (str, pathlib.PurePath) 35 | _TASKGRAPH_DATABASE_FILENAME = 'taskgraph_data.db' 36 | 37 | try: 38 | import psutil 39 | HAS_PSUTIL = True 40 | if psutil.WINDOWS: 41 | # Windows' scheduler doesn't use POSIX niceness. 42 | PROCESS_LOW_PRIORITY = psutil.BELOW_NORMAL_PRIORITY_CLASS 43 | else: 44 | # On POSIX, use system niceness. 45 | # -20 is high priority, 0 is normal priority, 19 is low priority. 46 | # 10 here is an arbitrary selection that's probably nice enough. 
47 | PROCESS_LOW_PRIORITY = 10 48 | except ImportError: 49 | HAS_PSUTIL = False 50 | 51 | LOGGER = logging.getLogger(__name__) 52 | _MAX_TIMEOUT = 5.0 # amount of time to wait for threads to terminate 53 | 54 | 55 | # We want our processing pool to be non-daemonic so that workers could use 56 | # multiprocessing if desired (daemonic processes cannot start new processes) 57 | # the following bit of code to do this was taken from 58 | # https://stackoverflow.com/a/8963618/42897 59 | class NoDaemonProcess(multiprocessing.Process): 60 | """Make 'daemon' attribute always return False.""" 61 | 62 | @property 63 | def daemon(self): 64 | """Return False indicating not a daemon process.""" 65 | return False 66 | 67 | @daemon.setter 68 | def daemon(self, value): 69 | """Do not allow daemon value to be overridden.""" 70 | pass 71 | 72 | 73 | class NoDaemonContext(type(multiprocessing.get_context('spawn'))): 74 | """From https://stackoverflow.com/a/8963618/42897. 75 | 76 | "As the current implementation of multiprocessing [3.7+] has been 77 | extensively refactored to be based on contexts, we need to provide a 78 | NoDaemonContext class that has our NoDaemonProcess as attribute. 79 | [NonDaemonicPool] will then use that context instead of the default 80 | one." "spawn" is chosen as default since that is the default and only 81 | context option for Windows and is the default option for Mac OS as 82 | well since 3.8. 83 | 84 | """ 85 | 86 | Process = NoDaemonProcess 87 | 88 | 89 | class NonDaemonicPool(multiprocessing.pool.Pool): 90 | """NonDaemonic Process Pool.""" 91 | 92 | def __init__(self, *args, **kwargs): 93 | """Invoke super to set the context of Pool class explicitly.""" 94 | kwargs['context'] = NoDaemonContext() 95 | super(NonDaemonicPool, self).__init__(*args, **kwargs) 96 | 97 | 98 | def _null_func(): 99 | """Use when func=None on add_task.""" 100 | return None 101 | 102 | 103 | def _initialize_logging_to_queue(logging_queue): 104 | """Add a synchronized queue to a new process. 105 | 106 | This is intended to be called as an initialization function to 107 | ``multiprocessing.Pool`` to establish logging from a Pool worker to the 108 | main python process via a multiprocessing Queue. 109 | 110 | Args: 111 | logging_queue (multiprocessing.Queue): The queue to use for passing 112 | log records back to the main process. 113 | 114 | Returns: 115 | None 116 | 117 | """ 118 | root_logger = logging.getLogger() 119 | 120 | # By the time this function is called, `root_logger` has a copy of all of 121 | # the logging handlers registered to it within the parent process, which 122 | # leads to duplicate logging in some cases. By removing all of the 123 | # handlers here, we ensure that log messages can only be passed back to the 124 | # parent process by the `logging_queue`, where they will be handled.
125 | for handler in root_logger.handlers[:]: 126 | root_logger.removeHandler(handler) 127 | 128 | root_logger.setLevel(logging.NOTSET) 129 | handler = logging.handlers.QueueHandler(logging_queue) 130 | root_logger.addHandler(handler) 131 | 132 | 133 | def _logging_queue_monitor(logging_queue): 134 | """Monitor ``logging_queue`` for messages and pass them to ``logger``.""" 135 | LOGGER.debug('Starting logging worker') 136 | while True: 137 | record = logging_queue.get() 138 | if record is None: 139 | break 140 | logger = logging.getLogger(record.name) 141 | logger.handle(record) 142 | LOGGER.debug('_logging_queue_monitor shutting down') 143 | 144 | 145 | def _create_taskgraph_table_schema(taskgraph_database_path): 146 | """Create the database if needed, or check compatibility and recreate it. 147 | 148 | Args: 149 | taskgraph_database_path (str): path to an existing database or desired 150 | location of a new database. 151 | 152 | Returns: 153 | None. 154 | 155 | """ 156 | sql_create_projects_table_script = ( 157 | """ 158 | CREATE TABLE taskgraph_data ( 159 | task_reexecution_hash TEXT NOT NULL, 160 | target_path_stats BLOB NOT NULL, 161 | result BLOB NOT NULL, 162 | PRIMARY KEY (task_reexecution_hash) 163 | ); 164 | CREATE TABLE global_variables ( 165 | key TEXT NOT NULL, 166 | value BLOB, 167 | PRIMARY KEY (key) 168 | ); 169 | """) 170 | 171 | table_valid = True 172 | expected_table_column_name_map = { 173 | 'taskgraph_data': [ 174 | 'task_reexecution_hash', 'target_path_stats', 'result'], 175 | 'global_variables': ['key', 'value']} 176 | if os.path.exists(taskgraph_database_path): 177 | try: 178 | # check that the tables exist and the column names are as expected 179 | for expected_table_name in expected_table_column_name_map: 180 | table_result = _execute_sqlite( 181 | ''' 182 | SELECT name 183 | FROM sqlite_master 184 | WHERE type='table' AND name=? 185 | ''', taskgraph_database_path, 186 | argument_list=[expected_table_name], 187 | mode='read_only', execute='execute', fetch='all') 188 | if not table_result: 189 | raise ValueError(f'missing table {expected_table_name}') 190 | 191 | # this query returns a list of results of the form 192 | # [(0, 'task_reexecution_hash', 'TEXT', 1, None, 1), ... ] 193 | # we'll just check that the header names are the same, no 194 | # need to be super aggressive, also need to construct the 195 | # PRAGMA string directly since it doesn't take arguments 196 | table_info_result = _execute_sqlite( 197 | f'PRAGMA table_info({expected_table_name})', 198 | taskgraph_database_path, mode='read_only', 199 | execute='execute', fetch='all') 200 | 201 | expected_column_names = expected_table_column_name_map[ 202 | expected_table_name] 203 | header_count = 0 204 | for header_line in table_info_result: 205 | column_name = header_line[1] 206 | if column_name not in expected_column_names: 207 | raise ValueError( 208 | f'unexpected column {column_name} in table ' 209 | f'{expected_table_name}') 210 | header_count += 1 211 | if header_count < len(expected_column_names): 212 | raise ValueError( 213 | f'found only {header_count} of an expected ' 214 | f'{len(expected_column_names)} columns in table ' 215 | f'{expected_table_name}') 216 | if not table_info_result: 217 | raise ValueError(f'missing table {expected_table_name}') 218 | except Exception: 219 | # catch all "Exception"s because anything that goes wrong while 220 | # checking the database should be considered a bad database and we 221 | # should make a new one.
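# deleting the database file below forces the `if not table_valid:` branch further down to rebuild the schema from scratch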
222 | LOGGER.exception( 223 | f'{taskgraph_database_path} exists, but is incompatible ' 224 | 'somehow. Deleting and making a new one.') 225 | os.remove(taskgraph_database_path) 226 | table_valid = False 227 | else: 228 | # table does not exist 229 | table_valid = False 230 | 231 | if not table_valid: 232 | # create the base table 233 | _execute_sqlite( 234 | sql_create_projects_table_script, taskgraph_database_path, 235 | mode='modify', execute='script') 236 | # set the database version 237 | _execute_sqlite( 238 | ''' 239 | INSERT OR REPLACE INTO global_variables 240 | VALUES ("version", ?) 241 | ''', taskgraph_database_path, mode='modify', 242 | argument_list=(__version__,)) 243 | 244 | 245 | class TaskGraph(object): 246 | """Encapsulates the worker and tasks states for parallel processing.""" 247 | 248 | def __init__( 249 | self, taskgraph_cache_dir_path, n_workers, 250 | reporting_interval=None): 251 | """Create a task graph. 252 | 253 | Creates an object for building task graphs, executing them, 254 | parallelizing independent work notes, and avoiding repeated calls. 255 | 256 | Args: 257 | taskgraph_cache_dir_path (string): path to a directory that 258 | either contains a taskgraph cache from a previous instance or 259 | will create a new one if none exists. 260 | n_workers (int): number of parallel *subprocess* workers to allow 261 | during task graph execution. If set to 0, don't use 262 | subprocesses. If set to <0, use only the main thread for any 263 | execution and scheduling. In the case of the latter, 264 | ``add_task`` will be a blocking call. 265 | reporting_interval (scalar): if not None, report status of task 266 | graph every ``reporting_interval`` seconds. 267 | 268 | """ 269 | try: 270 | os.makedirs(taskgraph_cache_dir_path) 271 | except OSError: 272 | LOGGER.debug( 273 | "%s already exists, no need to make it", 274 | taskgraph_cache_dir_path) 275 | 276 | self._taskgraph_cache_dir_path = taskgraph_cache_dir_path 277 | 278 | # this variable is used to print accurate representation of how many 279 | # tasks have been completed in the logging output. 280 | self._added_task_count = 0 281 | 282 | # use this to keep track of all the tasks added to the graph by their 283 | # task hashes. Used to determine if an identical task has been added 284 | # to the taskgraph during `add_task` 285 | self._task_hash_map = dict() 286 | 287 | # use this to keep track of all the tasks added to the graph by their 288 | # task names. Used to map a unique task name to the task object it 289 | # represents 290 | self._task_name_map = dict() 291 | 292 | # used to remember if task_graph has been closed 293 | self._closed = False 294 | 295 | # keep track if the task graph has been forcibly terminated 296 | self._terminated = False 297 | 298 | # if n_workers > 0 this will be a multiprocessing pool used to execute 299 | # the __call__ functions in Tasks 300 | self._worker_pool = None 301 | 302 | # If n_workers > 0 this will be a threading.Thread used to propagate 303 | # log records from another process into the current process. 304 | self._logging_monitor_thread = None 305 | 306 | # If n_workers > 0, this will be a multiprocessing.Queue used to pass 307 | # log records from the process pool to the parent process. 308 | self._logging_queue = None 309 | 310 | # keeps track of the tasks currently being processed for logging. 
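# each entry is a (task_name, start_time) tuple appended by _task_executor and read back by _execution_monitor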
311 | self._active_task_list = [] 312 | 313 | # keeps track of how many tasks have all their dependencies satisfied 314 | # and are waiting for a worker 315 | self._task_waiting_count = 0 316 | 317 | # this might hold the threads to execute tasks if n_workers >= 0 318 | self._task_executor_thread_list = [] 319 | 320 | # executor threads wait on this event that gets set when new tasks are 321 | # added to the queue. If the queue is empty an executor will clear 322 | # the event to halt other executors 323 | self._executor_ready_event = threading.Event() 324 | 325 | # tasks that have all their dependencies satisfied go in this queue 326 | # and can be executed immediately 327 | self._task_ready_priority_queue = queue.PriorityQueue() 328 | 329 | # maps a list of task names that need to be executed before the key 330 | # task can 331 | self._task_dependent_map = collections.defaultdict(set) 332 | 333 | # maps a list of task names that are dependent to a task 334 | self._dependent_task_map = collections.defaultdict(set) 335 | 336 | # tasks that complete are added to this set 337 | self._completed_task_names = set() 338 | 339 | self._task_database_path = os.path.join( 340 | self._taskgraph_cache_dir_path, _TASKGRAPH_DATABASE_FILENAME) 341 | 342 | # create new table if needed 343 | _create_taskgraph_table_schema(self._task_database_path) 344 | 345 | # check the version of the database and warn if a problem 346 | local_version = _execute_sqlite( 347 | ''' 348 | SELECT value 349 | FROM global_variables 350 | WHERE key=? 351 | ''', self._task_database_path, mode='read_only', 352 | fetch='one', argument_list=['version'])[0] 353 | if local_version != __version__: 354 | LOGGER.warning( 355 | f'the database located at {self._task_database_path} was ' 356 | f'created with TaskGraph version {local_version} but the ' 357 | f'current version is {__version__}') 358 | 359 | # no need to set up schedulers if n_workers is single threaded 360 | self._n_workers = n_workers 361 | if n_workers < 0: 362 | return 363 | 364 | # start concurrent reporting of taskgraph if reporting interval is set 365 | self._reporting_interval = reporting_interval 366 | if reporting_interval is not None: 367 | self._execution_monitor_wait_event = threading.Event() 368 | self._execution_monitor_thread = threading.Thread( 369 | target=self._execution_monitor, 370 | args=(self._execution_monitor_wait_event,), 371 | name='_execution_monitor') 372 | # make it a daemon so we don't have to figure out how to 373 | # close it when execution complete 374 | self._execution_monitor_thread.daemon = True 375 | self._execution_monitor_thread.start() 376 | 377 | # launch executor threads 378 | self._executor_thread_count = max(0, n_workers) 379 | for thread_id in range(max(1, n_workers)): 380 | task_executor_thread = threading.Thread( 381 | target=self._task_executor, 382 | name='task_executor_%s' % thread_id) 383 | # make daemons in case there's a catastrophic error the main 384 | # thread won't hang 385 | task_executor_thread.daemon = True 386 | task_executor_thread.start() 387 | self._task_executor_thread_list.append(task_executor_thread) 388 | 389 | # set up multiprocessing if n_workers > 0 390 | if n_workers > 0: 391 | self._logging_queue = multiprocessing.Queue() 392 | self._worker_pool = NonDaemonicPool( 393 | n_workers, initializer=_initialize_logging_to_queue, 394 | initargs=(self._logging_queue,)) 395 | self._logging_monitor_thread = threading.Thread( 396 | target=_logging_queue_monitor, 397 | args=(self._logging_queue,)) 398 | 399 | 
self._logging_monitor_thread.daemon = True 400 | self._logging_monitor_thread.start() 401 | if HAS_PSUTIL: 402 | parent = psutil.Process() 403 | parent.nice(PROCESS_LOW_PRIORITY) 404 | for child in parent.children(): 405 | try: 406 | child.nice(PROCESS_LOW_PRIORITY) 407 | except psutil.NoSuchProcess: 408 | LOGGER.warning( 409 | "NoSuchProcess exception encountered when trying " 410 | "to nice %s. This might be a bug in `psutil` so " 411 | "it should be okay to ignore.") 412 | 413 | def __del__(self): 414 | """Ensure all threads have been joined for cleanup.""" 415 | self._terminate() 416 | 417 | def _task_executor(self): 418 | """Worker that executes Tasks that have satisfied dependencies.""" 419 | while True: 420 | # this event blocks until the task graph has signaled it wants 421 | # the executors to read the state of the queue or a stop event or 422 | # a timeout exceeded just to protect against a worst case deadlock 423 | self._executor_ready_event.wait(_MAX_TIMEOUT) 424 | # this lock synchronizes changes between the queue and 425 | # executor_ready_event 426 | if self._terminated: 427 | LOGGER.debug( 428 | "taskgraph is terminated, ending %s", 429 | threading.current_thread()) 430 | break 431 | task = None 432 | try: 433 | task = self._task_ready_priority_queue.get_nowait() 434 | self._task_waiting_count -= 1 435 | task_name_time_tuple = (task.task_name, time.time()) 436 | self._active_task_list.append(task_name_time_tuple) 437 | except queue.Empty: 438 | # no tasks are waiting could be because the taskgraph is 439 | # closed or because the queue is just empty. 440 | if (self._closed and len(self._completed_task_names) == 441 | self._added_task_count): 442 | # the graph is closed and there are as many completed tasks 443 | # as there are added tasks, so none left. The executor can 444 | # terminate. 445 | self._executor_thread_count -= 1 446 | if self._executor_thread_count == 0 and self._worker_pool: 447 | # only the last executor should terminate the worker 448 | # pool, because otherwise who knows if it's still 449 | # executing anything 450 | try: 451 | self._worker_pool.close() 452 | self._worker_pool.terminate() 453 | self._worker_pool = None 454 | self._terminate() 455 | except Exception: 456 | # there's the possibility for a race condition here 457 | # where another thread already closed the worker 458 | # pool, so just guard against it 459 | LOGGER.warning('worker pool was already closed') 460 | LOGGER.debug( 461 | "no tasks are pending and taskgraph closed, normally " 462 | "terminating executor %s." % threading.current_thread()) 463 | break 464 | else: 465 | # there's still the possibility for work to be added or 466 | # still work in the pipeline 467 | self._executor_ready_event.clear() 468 | if task is None: 469 | continue 470 | try: 471 | task._call() 472 | task.task_done_executing_event.set() 473 | except Exception as e: 474 | # An error occurred on a call, terminate the taskgraph 475 | task.exception_object = e 476 | LOGGER.exception( 477 | 'A taskgraph _task_executor failed on Task ' 478 | '%s. 
Terminating taskgraph.', task.task_name) 479 | self._terminate() 480 | break 481 | 482 | LOGGER.debug( 483 | "task %s is complete, checking to see if any dependent " 484 | "tasks can be executed now", task.task_name) 485 | self._completed_task_names.add(task.task_name) 486 | self._active_task_list.remove(task_name_time_tuple) 487 | for waiting_task_name in ( 488 | self._task_dependent_map[task.task_name]): 489 | # remove `task` from the set of tasks that 490 | # `waiting_task` was waiting on. 491 | self._dependent_task_map[waiting_task_name].remove( 492 | task.task_name) 493 | # if there aren't any left, we can push `waiting_task` 494 | # to the work queue 495 | if not self._dependent_task_map[waiting_task_name]: 496 | # if we removed the last task we can put it to the 497 | # work queue 498 | LOGGER.debug( 499 | "Task %s is ready for processing, sending to " 500 | "task_ready_priority_queue", 501 | waiting_task_name) 502 | del self._dependent_task_map[waiting_task_name] 503 | self._task_ready_priority_queue.put( 504 | self._task_name_map[waiting_task_name]) 505 | self._task_waiting_count += 1 506 | # indicate to executors there is work to do 507 | self._executor_ready_event.set() 508 | del self._task_dependent_map[task.task_name] 509 | # this extra set ensures that recently emptied map won't get 510 | # ignored by the executor if no work is left to do and the graph is 511 | # closed 512 | self._executor_ready_event.set() 513 | LOGGER.debug("task %s done processing", task.task_name) 514 | LOGGER.debug("task executor shutting down") 515 | 516 | def add_task( 517 | self, func=None, args=None, kwargs=None, task_name=None, 518 | target_path_list=None, ignore_path_list=None, 519 | hash_target_files=True, dependent_task_list=None, 520 | ignore_directories=True, priority=0, 521 | hash_algorithm='sizetimestamp', transient_run=False, 522 | store_result=False): 523 | """Add a task to the task graph. 524 | 525 | Args: 526 | func (callable): target function 527 | args (list): argument list for ``func`` 528 | kwargs (dict): keyword arguments for ``func`` 529 | target_path_list (list): if not None, a list of file paths that 530 | are expected to be output by ``func``. If any of these paths 531 | don't exist, or their timestamp is earlier than an input 532 | arg or work token, func will be executed. 533 | 534 | If ``None``, any identical calls to ``add_task`` will be 535 | skipped for the TaskGraph object. A future TaskGraph object 536 | will re-run an exact call once for its lifetime. The reasoning 537 | is that it is likely the user wishes to run a target-less task 538 | once for the lifetime of a task-graph, but would otherwise not 539 | have a transient result that could be re-used in a future 540 | instantiation of a TaskGraph object. 541 | 542 | task_name (string): if not None, this value is used to identify 543 | the task in logging messages. 544 | ignore_path_list (list): list of file paths that could be in 545 | args/kwargs that should be ignored when considering timestamp 546 | hashes. 547 | hash_target_files (bool): If True, the hash value of the target 548 | files will be recorded to determine if a future run of this 549 | function is precalculated. If False, this function only notes 550 | the existence of the target files before determining if 551 | a function call is precalculated. 552 | dependent_task_list (list): list of ``Task``s that this task must 553 | ``join`` before executing. 
554 | ignore_directories (boolean): if the existence/timestamp of any 555 | directories discovered in args or kwargs is used as part 556 | of the work token hash. 557 | priority (numeric): the priority of a task is considered when 558 | there is more than one task whose dependencies have been 559 | met and are ready for scheduling. Tasks are inserted into the 560 | work queue in order of decreasing priority value 561 | (priority 10 is higher than priority 1). This value can be 562 | positive, negative, and/or floating point. 563 | hash_algorithm (string): either a hash function id that 564 | exists in hashlib.algorithms_available, 'sizetimestamp', 565 | or 'exists'. Any paths to actual files in the arguments will 566 | be digested with this algorithm. If value is 'sizetimestamp' 567 | the digest will only use the normed path, size, and timestamp 568 | of any files found in the arguments. This value is used when 569 | determining whether a task is precalculated or its target 570 | files can be copied to an equivalent task. Note if 571 | ``hash_algorithm`` is 'sizetimestamp' the task will require the 572 | same base path files to determine equality. If it is a 573 | ``hashlib`` algorithm only file contents will be considered. 574 | If the value is 'exists' the only test for file equivalence 575 | will be if it exists on disk (True) or not (False). 576 | transient_run (bool): if True, this Task will be reexecuted 577 | even if it was successfully executed in a previous TaskGraph 578 | instance. If False, this Task will be skipped if it was 579 | executed successfully in a previous TaskGraph instance. One 580 | might wish to set `transient_run` to True on a Task that does 581 | some sort of initialization that's needed every time a 582 | TaskGraph is instantiated. Perhaps to acquire dynamic resources 583 | or authenticate permissions. 584 | store_result (bool): If True, the result of ``func`` will be stored 585 | in the TaskGraph database and retrievable with a call to 586 | ``.get()`` on a ``Task`` object. 587 | 588 | Returns: 589 | Task which was just added to the graph or an existing Task that 590 | has the same signature and has already been added to the 591 | TaskGraph. 592 | 593 | Raises: 594 | ValueError if objects are passed to the dependent task list that 595 | are not Tasks. 596 | ValueError if ``add_task`` is invoked after the ``TaskGraph`` is 597 | closed. 598 | RuntimeError if ``add_task`` is invoked after ``TaskGraph`` has 599 | reached a terminate state. 600 | 601 | """ 602 | try: 603 | if self._terminated: 604 | raise RuntimeError( 605 | "add_task when Taskgraph is terminated.") 606 | if self._closed: 607 | raise ValueError( 608 | "The task graph is closed and cannot accept more " 609 | "tasks.") 610 | self._added_task_count += 1 611 | if args is None: 612 | args = [] 613 | if kwargs is None: 614 | kwargs = {} 615 | if task_name is None: 616 | task_name = 'UNNAMED TASK' 617 | if dependent_task_list is None: 618 | dependent_task_list = [] 619 | if target_path_list is None: 620 | target_path_list = [] 621 | if ignore_path_list is None: 622 | ignore_path_list = [] 623 | if func is None: 624 | func = _null_func 625 | 626 | # this is a pretty common error to accidentally not pass a 627 | # Task to the dependent task list. 
628 | if any(not isinstance(task, Task) 629 | for task in dependent_task_list): 630 | raise ValueError( 631 | "Objects passed to dependent task list that are not " 632 | "tasks: %s", dependent_task_list) 633 | 634 | task_name = '%s (%d)' % (task_name, len(self._task_hash_map)) 635 | new_task = Task( 636 | task_name, func, args, kwargs, target_path_list, 637 | ignore_path_list, hash_target_files, ignore_directories, 638 | transient_run, self._worker_pool, 639 | priority, hash_algorithm, store_result, 640 | self._task_database_path) 641 | 642 | self._task_name_map[new_task.task_name] = new_task 643 | # it may be this task was already created in an earlier call, 644 | # use that object in its place 645 | if new_task in self._task_hash_map: 646 | duplicate_task = self._task_hash_map[new_task] 647 | new_task_target_set = set(new_task._target_path_list) 648 | duplicate_task_target_set = set( 649 | duplicate_task._target_path_list) 650 | if new_task_target_set == duplicate_task_target_set: 651 | LOGGER.warning( 652 | "A duplicate task was submitted: %s original: %s", 653 | new_task, self._task_hash_map[new_task]) 654 | self._added_task_count -= 1 655 | return duplicate_task 656 | disjoint_target_set = ( 657 | new_task_target_set.symmetric_difference( 658 | duplicate_task_target_set)) 659 | if len(disjoint_target_set) == ( 660 | len(new_task_target_set) + 661 | len(duplicate_task_target_set)): 662 | if duplicate_task not in dependent_task_list: 663 | LOGGER.info( 664 | "A task was created that had an identical " 665 | "args signature sans target paths, but a " 666 | "different target_path_list of the same " 667 | "length. To avoid recomputation, dynamically " 668 | "adding previous Task (%s) as a dependent " 669 | "task to this one (%s).", 670 | duplicate_task.task_name, task_name) 671 | dependent_task_list = ( 672 | dependent_task_list + [duplicate_task]) 673 | else: 674 | raise RuntimeError( 675 | "A task was created that has the same arguments " 676 | "as another task, but only partially different " 677 | "expected target paths. This runs the risk of " 678 | "unpredictably overwriting output so treating as " 679 | "a runtime error: submitted task: %s, existing " 680 | "task: %s" % (new_task, duplicate_task)) 681 | self._task_hash_map[new_task] = new_task 682 | if self._n_workers < 0: 683 | # call directly if single threaded 684 | new_task._call() 685 | else: 686 | # determine if task is ready or is dependent on other 687 | # tasks 688 | LOGGER.debug( 689 | "multithreaded: %s sending to new task queue.", 690 | task_name) 691 | outstanding_dep_task_name_list = [ 692 | dep_task.task_name for dep_task in dependent_task_list 693 | if dep_task.task_name 694 | not in self._completed_task_names] 695 | if not outstanding_dep_task_name_list: 696 | LOGGER.debug( 697 | "sending task %s right away", new_task.task_name) 698 | self._task_ready_priority_queue.put(new_task) 699 | self._task_waiting_count += 1 700 | self._executor_ready_event.set() 701 | else: 702 | # there are unresolved tasks that the waiting 703 | # process scheduler has not been notified of. 704 | # Record dependencies. 
705 | for dep_task_name in outstanding_dep_task_name_list: 706 | # record tasks that are dependent on dep_task_name 707 | self._task_dependent_map[dep_task_name].add( 708 | new_task.task_name) 709 | # record tasks that new_task depends on 710 | self._dependent_task_map[new_task.task_name].add( 711 | dep_task_name) 712 | return new_task 713 | 714 | except Exception: 715 | # something went wrong, shut down the taskgraph 716 | LOGGER.exception( 717 | "Something went wrong when adding task %s, " 718 | "terminating taskgraph.", task_name) 719 | self._terminate() 720 | raise 721 | 722 | def _execution_monitor(self, monitor_wait_event): 723 | """Log state of taskgraph every ``self._reporting_interval`` seconds. 724 | 725 | Args: 726 | monitor_wait_event (threading.Event): used to sleep the monitor 727 | for``self._reporting_interval`` seconds, or to wake up to 728 | terminate for shutdown. 729 | 730 | Returns: 731 | None. 732 | 733 | """ 734 | start_time = time.time() 735 | while True: 736 | if self._terminated: 737 | break 738 | active_task_count = len(self._active_task_list) 739 | queue_length = self._task_ready_priority_queue.qsize() 740 | active_task_message = '\n'.join( 741 | ['\t%s: executing for %.2fs' % ( 742 | task_name, time.time() - task_time) 743 | for task_name, task_time in self._active_task_list]) 744 | 745 | completed_tasks = len(self._completed_task_names) 746 | percent_complete = 0.0 747 | if self._added_task_count > 0: 748 | percent_complete = 100.0 * ( 749 | float(completed_tasks) / self._added_task_count) 750 | 751 | LOGGER.info( 752 | "\n\ttaskgraph execution status: tasks added: %d \n" 753 | "\ttasks complete: %d (%.1f%%) \n" 754 | "\ttasks waiting for a free worker: %d (qsize: %d)\n" 755 | "\ttasks executing (%d): graph is %s\n%s", 756 | self._added_task_count, completed_tasks, percent_complete, 757 | self._task_waiting_count, queue_length, active_task_count, 758 | 'closed' if self._closed else 'open', 759 | active_task_message) 760 | 761 | monitor_wait_event.wait( 762 | timeout=self._reporting_interval - ( 763 | (time.time() - start_time)) % self._reporting_interval) 764 | LOGGER.debug("_execution monitor shutting down") 765 | 766 | def join(self, timeout=None): 767 | """Join all threads in the graph. 768 | 769 | Args: 770 | timeout (float): if not none will attempt to join subtasks with 771 | this value. If a subtask times out, the whole function will 772 | timeout. 773 | 774 | Returns: 775 | True if successful join, False if timed out. 776 | 777 | """ 778 | LOGGER.debug("joining taskgraph") 779 | if self._n_workers < 0: 780 | # Join() is meaningless since tasks execute synchronously. 781 | LOGGER.debug( 782 | 'n_workers: %s; join is vacuously true' % self._n_workers) 783 | return True 784 | 785 | try: 786 | LOGGER.debug("attempting to join threads") 787 | timedout = False 788 | for task in self._task_hash_map.values(): 789 | LOGGER.debug("attempting to join task %s", task.task_name) 790 | # task.join() will raise any exception that resulted from the 791 | # task's execution. 
792 | timedout = not task.join(timeout) 793 | LOGGER.debug("task %s was joined", task.task_name) 794 | # if the last task timed out then we want to timeout for all 795 | # of the task graph 796 | if timedout: 797 | LOGGER.info( 798 | "task %s timed out in graph join", task.task_name) 799 | return False 800 | if self._closed: 801 | # Close down the taskgraph; ok if already terminated 802 | self._executor_ready_event.set() 803 | self._terminate() 804 | return True 805 | except Exception: 806 | # If there's an exception on a join it means that a task failed 807 | # to execute correctly. Print a helpful message then terminate the 808 | # taskgraph object. 809 | LOGGER.exception( 810 | "Exception raised when joining task %s. It's possible " 811 | "that this task did not cause the exception, rather another " 812 | "exception terminated the task_graph. Check the log to see " 813 | "if there are other exceptions.", task) 814 | self._terminate() 815 | raise 816 | 817 | def close(self): 818 | """Prevent future tasks from being added to the work queue.""" 819 | LOGGER.debug("Closing taskgraph.") 820 | if self._closed: 821 | return 822 | self._closed = True 823 | # this wakes up all the executors and any that wouldn't otherwise 824 | # have work to do will see there are no tasks left and terminate 825 | self._executor_ready_event.set() 826 | LOGGER.debug("taskgraph closed") 827 | 828 | def _terminate(self): 829 | """Immediately terminate remaining task graph computation.""" 830 | LOGGER.debug( 831 | "Invoking terminate. already terminated? %s", self._terminated) 832 | if self._terminated: 833 | return 834 | try: 835 | # it's possible the global state is not well defined, so just in 836 | # case we'll wrap it all up in a try/except 837 | self._terminated = True 838 | if self._executor_ready_event is not None: 839 | # alert executors to check that _terminated is True 840 | self._executor_ready_event.set() 841 | LOGGER.debug("shutting down workers") 842 | if self._worker_pool is not None: 843 | self._worker_pool.close() 844 | self._worker_pool.terminate() 845 | self._worker_pool = None 846 | 847 | # This will terminate the logging worker 848 | if self._logging_queue is not None: 849 | self._logging_queue.put(None) 850 | 851 | # This will cause all 'join'ed Tasks to join. 852 | if self._n_workers >= 0: 853 | self._executor_ready_event.set() 854 | if self._reporting_interval is not None: 855 | self._execution_monitor_wait_event.set() 856 | for task in self._task_hash_map.values(): 857 | # shortcut to get the tasks to mark as joined 858 | task.task_done_executing_event.set() 859 | 860 | LOGGER.debug('taskgraph terminated') 861 | except Exception: 862 | LOGGER.exception( 863 | 'ignoring an exception that occurred during _terminate') 864 | 865 | 866 | class Task(object): 867 | """Encapsulates work/task state for multiprocessing.""" 868 | 869 | def __init__( 870 | self, task_name, func, args, kwargs, target_path_list, 871 | ignore_path_list, hash_target_files, ignore_directories, 872 | transient_run, worker_pool, priority, hash_algorithm, 873 | store_result, task_database_path): 874 | """Make a Task. 875 | 876 | Args: 877 | task_name (int): unique task id from the task graph. 878 | func (function): a function that takes the argument list 879 | ``args`` 880 | args (tuple): a list of arguments to pass to ``func``. Can be 881 | None. 882 | kwargs (dict): keyword arguments to pass to ``func``. Can be 883 | None. 884 | target_path_list (list): a list of filepaths that this task 885 | should generate. 
886 | ignore_path_list (list): list of file paths that could be in 887 | args/kwargs that should be ignored when considering timestamp 888 | hashes. 889 | hash_target_files (bool): If True, the hash value of the target 890 | files will be recorded to determine if a future run of this 891 | function is precalculated. If False, this function only notes 892 | the existence of the target files before determining if 893 | a function call is precalculated. 894 | ignore_directories (bool): if the existence/timestamp of any 895 | directories discovered in args or kwargs is used as part 896 | of the work token hash. 897 | transient_run (bool): if True a call with an identical execution 898 | hash will be reexecuted on a subsequent instantiation of a 899 | future TaskGraph object. If a duplicate task is submitted 900 | to the same object it will not be re-run in any scenario. 901 | Otherwise if False, subsequent tasks with an identical 902 | execution hash will be skipped. 903 | worker_pool (multiprocessing.Pool): if not None, is a 904 | multiprocessing pool that can be used for ``_call`` execution. 905 | priority (numeric): the priority of a task is considered when 906 | there is more than one task whose dependencies have been 907 | met and are ready for scheduling. Tasks are inserted into the 908 | work queue in order of decreasing priority. This value can be 909 | positive, negative, and/or floating point. 910 | hash_algorithm (string): either a hash function id that 911 | exists in hashlib.algorithms_available, 'sizetimestamp', 912 | or 'exists'. Any paths to actual files in the arguments will 913 | be digested with this algorithm. If value is 'sizetimestamp' 914 | the digest will only use the normed path, size, and timestamp 915 | of any files found in the arguments. If 'exists', a file will 916 | be considered unchanged as long as a file with the same 917 | filename exists on disk. 918 | store_result (bool): If True, the result of ``func`` will be 919 | stored in the TaskGraph database and retrievable with a call 920 | to ``.get()`` on the Task object. 921 | task_database_path (str): path to an SQLite database that has a 922 | table named "taskgraph_data" with the three fields: 923 | task_reexecution_hash TEXT NOT NULL, 924 | target_path_stats BLOB NOT NULL, 925 | result BLOB NOT NULL 926 | If a call is successful its hash is inserted/updated in the 927 | table, the target_path_stats stores the base/target stats 928 | for the target files created by the call and listed in 929 | ``target_path_list``, and the result of ``func`` is stored in 930 | ``result``. 931 | 932 | """ 933 | # it is a common error to accidentally pass a non-string to the 934 | # target path list; this terminates early if so 935 | if any([not (isinstance(path, _VALID_PATH_TYPES)) 936 | for path in target_path_list]): 937 | raise ValueError( 938 | "Values passed to target_path_list are not strings: %s" % 939 | (target_path_list,)) 940 | 941 | # sort the target path list because the order doesn't matter for 942 | # a result, but it would cause a task to be reexecuted if the only 943 | # difference was a different order.
944 | self._target_path_list = sorted([ 945 | _normalize_path(path) for path in target_path_list]) 946 | self.task_name = task_name 947 | self._func = func 948 | self._args = args 949 | self._kwargs = kwargs 950 | self._ignore_path_list = [ 951 | _normalize_path(path) for path in ignore_path_list] 952 | self._hash_target_files = hash_target_files 953 | self._ignore_directories = ignore_directories 954 | self._transient_run = transient_run 955 | self._worker_pool = worker_pool 956 | self._task_database_path = task_database_path 957 | self._hash_algorithm = hash_algorithm 958 | self._store_result = store_result 959 | self.exception_object = None 960 | 961 | # invert the priority since sorting goes smallest to largest and we 962 | # want more positive priority values to be executed first. 963 | self._priority = -priority 964 | 965 | # Used to ensure only one attempt at executing and also a mechanism 966 | # to see when Task is complete. This can be set if a Task finishes 967 | # a _call and there are no more attempts at reexecution. 968 | self.task_done_executing_event = threading.Event() 969 | 970 | # These are used to store and later access the result of the call. 971 | self._result = None 972 | 973 | # Calculate a hash based only on argument inputs. 974 | try: 975 | if not hasattr(Task, 'func_source_map'): 976 | Task.func_source_map = {} 977 | # memoize func source code because it's likely we'll import 978 | # the same func many times and reflection is slow 979 | if self._func not in Task.func_source_map: 980 | Task.func_source_map[self._func] = ( 981 | inspect.getsource(self._func)) 982 | source_code = Task.func_source_map[self._func] 983 | except (IOError, TypeError): 984 | # many reasons for this, for example, frozen Python code won't 985 | # have source code, so just leave blank 986 | source_code = '' 987 | 988 | if not hasattr(self._func, '__name__'): 989 | LOGGER.warning( 990 | "function does not have a __name__ which means it will not " 991 | "be considered when calculating a successive input has " 992 | "been changed with another function without __name__.") 993 | self._func.__name__ = '' 994 | 995 | args_clean = [] 996 | for index, arg in enumerate(self._args): 997 | try: 998 | scrubbed_value = _scrub_task_args(arg, self._target_path_list) 999 | _ = pickle.dumps(scrubbed_value) 1000 | args_clean.append(scrubbed_value) 1001 | except TypeError: 1002 | LOGGER.warning( 1003 | "could not pickle argument at index %d (%s). " 1004 | "Skipping argument which means it will not be considered " 1005 | "when calculating whether inputs have been changed " 1006 | "on a successive run.", index, arg) 1007 | 1008 | kwargs_clean = {} 1009 | # iterate through sorted order so we get the same hash result with the 1010 | # same set of kwargs irrespective of the item dict order. 1011 | for key, arg in sorted(self._kwargs.items()): 1012 | try: 1013 | scrubbed_value = _scrub_task_args(arg, self._target_path_list) 1014 | _ = pickle.dumps(scrubbed_value) 1015 | kwargs_clean[key] = scrubbed_value 1016 | except TypeError: 1017 | LOGGER.warning( 1018 | "could not pickle kw argument %s (%s) scrubbed to %s. 
" 1019 | "Skipping argument which means it will not be considered " 1020 | "when calculating whether inputs have been changed " 1021 | "on a successive run.", key, arg, scrubbed_value) 1022 | 1023 | self._reexecution_info = { 1024 | 'func_name': self._func.__name__, 1025 | 'args_clean': args_clean, 1026 | 'kwargs_clean': kwargs_clean, 1027 | 'source_code_hash': hashlib.sha1( 1028 | source_code.encode('utf-8')).hexdigest(), 1029 | } 1030 | 1031 | argument_hash_string = ':'.join([ 1032 | repr(self._reexecution_info[key]) 1033 | for key in sorted(self._reexecution_info.keys())]) 1034 | 1035 | self._task_id_hash = hashlib.sha1( 1036 | argument_hash_string.encode('utf-8')).hexdigest() 1037 | 1038 | # this will get calculated when ``is_precalculated`` is invoked. 1039 | self._task_reexecution_hash = None 1040 | 1041 | def __eq__(self, other): 1042 | """Two tasks are equal if their hashes are equal.""" 1043 | return ( 1044 | isinstance(self, other.__class__) and 1045 | (self._task_id_hash == other._task_id_hash)) 1046 | 1047 | def __hash__(self): 1048 | """Return the base-16 integer hash of this hash string.""" 1049 | return int(self._task_id_hash, 16) 1050 | 1051 | def __ne__(self, other): 1052 | """Inverse of __eq__.""" 1053 | return not self.__eq__(other) 1054 | 1055 | def __lt__(self, other): 1056 | """Less than based on priority.""" 1057 | return self._priority < other._priority 1058 | 1059 | def __repr__(self): 1060 | """Create a string representation of a Task.""" 1061 | return "Task object %s:\n\n" % (id(self)) + pprint.pformat( 1062 | { 1063 | "task_name": self.task_name, 1064 | "priority": self._priority, 1065 | "ignore_path_list": self._ignore_path_list, 1066 | "ignore_directories": self._ignore_directories, 1067 | "target_path_list": self._target_path_list, 1068 | "task_id_hash": self._task_id_hash, 1069 | "task_reexecution_hash": self._task_reexecution_hash, 1070 | "exception_object": self.exception_object, 1071 | "self._reexecution_info": self._reexecution_info, 1072 | "self._result": self._result, 1073 | }) 1074 | 1075 | def _call(self): 1076 | """Invoke this method to execute task. 1077 | 1078 | Precondition is that the Task dependencies are satisfied. 1079 | 1080 | Sets the ``self.task_done_executing_event`` flag if execution is 1081 | successful. 1082 | 1083 | Raises: 1084 | RuntimeError if any target paths are not generated after the 1085 | function call is complete. 
1086 | 1087 | """ 1088 | LOGGER.debug("_call check if precalculated %s", self.task_name) 1089 | if not self._transient_run and self.is_precalculated(): 1090 | self.task_done_executing_event.set() 1091 | return 1092 | LOGGER.debug("not precalculated %s", self.task_name) 1093 | 1094 | if self._worker_pool is not None: 1095 | result = self._worker_pool.apply_async( 1096 | func=self._func, args=self._args, kwds=self._kwargs) 1097 | # the following blocks and raises an exception if result 1098 | # raised an exception 1099 | LOGGER.debug("apply_async for task %s", self.task_name) 1100 | payload = result.get() 1101 | else: 1102 | LOGGER.debug("direct _func for task %s", self.task_name) 1103 | payload = self._func(*self._args, **self._kwargs) 1104 | if self._store_result: 1105 | self._result = payload 1106 | 1107 | # check that the target paths exist and record stats for later 1108 | if not self._hash_target_files: 1109 | target_hash_algorithm = 'exists' 1110 | else: 1111 | target_hash_algorithm = self._hash_algorithm 1112 | result_target_path_stats = list( 1113 | _get_file_stats( 1114 | self._target_path_list, target_hash_algorithm, [], False)) 1115 | result_target_path_set = set( 1116 | [x[0] for x in result_target_path_stats]) 1117 | target_path_set = set(self._target_path_list) 1118 | if target_path_set != result_target_path_set: 1119 | raise RuntimeError( 1120 | "In Task: %s\nMissing expected target path results.\n" 1121 | "Expected: %s\nObserved: %s\n" % ( 1122 | self.task_name, self._target_path_list, 1123 | result_target_path_set)) 1124 | 1125 | # this step will only record the run if there is an expected 1126 | # target file. Otherwise we infer the result of this call is 1127 | # transient between taskgraph executions and we should expect to 1128 | # run it again. 1129 | if not self._transient_run: 1130 | _execute_sqlite( 1131 | "INSERT OR REPLACE INTO taskgraph_data VALUES (?, ?, ?)", 1132 | self._task_database_path, mode='modify', 1133 | argument_list=( 1134 | self._task_reexecution_hash, 1135 | pickle.dumps(result_target_path_stats), 1136 | pickle.dumps(self._result))) 1137 | self.task_done_executing_event.set() 1138 | LOGGER.debug("successful run on task %s", self.task_name) 1139 | 1140 | def is_precalculated(self): 1141 | """Return true if _call need not be invoked. 1142 | 1143 | If the task has been precalculated it will fetch the return result from 1144 | the previous run. 1145 | 1146 | Returns: 1147 | True if the Task's target paths exist in the same state as the 1148 | last recorded run at the time this function is called. It is 1149 | possible this value could change without running the Task if 1150 | input parameter file stats change. False otherwise. 1151 | 1152 | """ 1153 | # This gets a list of the files and their file stats that can be found 1154 | # in args and kwargs but ignores anything specifically targeted or 1155 | # an expected result. This will allow a task to change its hash in 1156 | # case a different version of a file was passed in. 
1157 | # these are the stats of the files that exist that aren't ignored 1158 | if not self._hash_target_files: 1159 | target_hash_algorithm = 'exists' 1160 | else: 1161 | target_hash_algorithm = self._hash_algorithm 1162 | file_stat_list = list(_get_file_stats( 1163 | [self._args, self._kwargs], 1164 | target_hash_algorithm, 1165 | self._target_path_list+self._ignore_path_list, 1166 | self._ignore_directories)) 1167 | 1168 | other_arguments = _filter_non_files( 1169 | [self._reexecution_info['args_clean'], 1170 | self._reexecution_info['kwargs_clean']], 1171 | self._target_path_list, 1172 | self._ignore_path_list, 1173 | self._ignore_directories) 1174 | 1175 | LOGGER.debug("file_stat_list: %s", file_stat_list) 1176 | LOGGER.debug("other_arguments: %s", other_arguments) 1177 | 1178 | # add the file stat list to the already existing reexecution info 1179 | # dictionary that contains stats that should not change whether 1180 | # files have been created/updated/or not. 1181 | self._reexecution_info['file_stat_list'] = file_stat_list 1182 | self._reexecution_info['other_arguments'] = other_arguments 1183 | 1184 | reexecution_string = '%s:%s:%s:%s:%s' % ( 1185 | self._reexecution_info['func_name'], 1186 | self._reexecution_info['source_code_hash'], 1187 | self._reexecution_info['other_arguments'], 1188 | self._store_result, 1189 | # the x[1] is to only take the digest part of the 'file_stat' 1190 | str([x[1] for x in file_stat_list])) 1191 | 1192 | self._task_reexecution_hash = hashlib.sha1( 1193 | reexecution_string.encode('utf-8')).hexdigest() 1194 | try: 1195 | database_result = _execute_sqlite( 1196 | """SELECT target_path_stats, result from taskgraph_data 1197 | WHERE (task_reexecution_hash == ?)""", 1198 | self._task_database_path, mode='read_only', 1199 | argument_list=(self._task_reexecution_hash,), fetch='one') 1200 | if database_result is None: 1201 | LOGGER.debug( 1202 | "not precalculated, Task hash does not " 1203 | "exist (%s)", self.task_name) 1204 | LOGGER.debug("is_precalculated full task info: %s", self) 1205 | return False 1206 | result_target_path_stats = pickle.loads(database_result[0]) 1207 | mismatched_target_file_list = [] 1208 | for path, hash_string in result_target_path_stats: 1209 | if path not in self._target_path_list: 1210 | mismatched_target_file_list.append( 1211 | 'Recorded path not in target path list %s' % path) 1212 | if not os.path.exists(path): 1213 | mismatched_target_file_list.append( 1214 | 'Path not found: %s' % path) 1215 | continue 1216 | elif target_hash_algorithm == 'exists': 1217 | # this is the case where hash_algorithm == 'exists' but 1218 | # we already know the file exists so we do nothing 1219 | continue 1220 | if target_hash_algorithm == 'sizetimestamp': 1221 | size, modified_time, actual_path = [ 1222 | x for x in hash_string.split('::')] 1223 | if actual_path != path: 1224 | mismatched_target_file_list.append( 1225 | "Path names don't match\n" 1226 | "cached: (%s)\nactual (%s)" % (path, actual_path)) 1227 | 1228 | # Using nanosecond resolution for mtime (instead of the 1229 | # usual float result of os.path.getmtime()) allows us to 1230 | # precisely compare modification time because we're 1231 | # comparing ints: st_mtime_ns always returns an int. 1232 | # 1233 | # Timestamp resolution: the python docs note that "many 1234 | # filesystems do not provide nanosecond precision". 1235 | # This is true (e.g. FAT, FAT32 timestamps are only 1236 | # accurate to within 2 seconds), but the data read from the 1237 | # filesystem will be consistent. 
This lets us know 1238 | # whether the timestamp changed. This also means that, on 1239 | # FAT filesystems, if a file is changed within 2s of its 1240 | # creation time, we might not be able to detect it. This 1241 | # is a weakness of FAT, not taskgraph. 1242 | target_modified_time = os.stat(path).st_mtime_ns 1243 | if not int(modified_time) == target_modified_time: 1244 | mismatched_target_file_list.append( 1245 | "Modified times don't match " 1246 | "cached: (%f) actual: (%f)" % ( 1247 | float(modified_time), target_modified_time)) 1248 | continue 1249 | target_size = os.path.getsize(path) 1250 | if float(size) != target_size: 1251 | mismatched_target_file_list.append( 1252 | "File sizes don't match " 1253 | "cached: (%s) actual: (%s)" % ( 1254 | size, target_size)) 1255 | else: 1256 | target_hash = _hash_file(path, target_hash_algorithm) 1257 | if hash_string != target_hash: 1258 | mismatched_target_file_list.append( 1259 | "File hashes are different. cached: (%s) " 1260 | "actual: (%s)" % (hash_string, target_hash)) 1261 | if mismatched_target_file_list: 1262 | LOGGER.info( 1263 | "not precalculated (%s), Task hash exists, " 1264 | "but there are these mismatches: %s", 1265 | self.task_name, '\n'.join(mismatched_target_file_list)) 1266 | return False 1267 | if self._store_result: 1268 | self._result = pickle.loads(database_result[1]) 1269 | LOGGER.debug("precalculated (%s)" % self) 1270 | return True 1271 | except EOFError: 1272 | LOGGER.exception("not precalculated %s, EOFError", self.task_name) 1273 | return False 1274 | 1275 | def join(self, timeout=None): 1276 | """Block until task is complete, raise exception if runtime failed.""" 1277 | LOGGER.debug( 1278 | "joining %s done executing: %s", self.task_name, 1279 | self.task_done_executing_event) 1280 | successful_wait = self.task_done_executing_event.wait(timeout) 1281 | if self.exception_object: 1282 | raise self.exception_object 1283 | return successful_wait 1284 | 1285 | def get(self, timeout=None): 1286 | """Return the result of the ``func`` once it is ready. 1287 | 1288 | If ``timeout`` is None, this call blocks until the task is complete 1289 | determined by a call to ``.join()``. Otherwise will wait up to 1290 | ``timeout`` seconds before raising a``RuntimeError`` if exceeded. 1291 | 1292 | Args: 1293 | timeout (float): if not None this parameter is a floating point 1294 | number specifying a timeout for the operation in seconds. 1295 | 1296 | Returns: 1297 | value of the result 1298 | 1299 | Raises: 1300 | RuntimeError when ``timeout`` exceeded. 1301 | ValueError if ``store_result`` was set to ``False`` when the task 1302 | was created. 1303 | 1304 | """ 1305 | if not self._store_result: 1306 | raise ValueError( 1307 | 'must set `store_result` to True in `add_task` to invoke this ' 1308 | 'function') 1309 | timeout = not self.join(timeout) 1310 | if timeout: 1311 | raise RuntimeError('call to get timed out') 1312 | return self._result 1313 | 1314 | 1315 | def _get_file_stats( 1316 | base_value, hash_algorithm, ignore_list, 1317 | ignore_directories): 1318 | """Return fingerprints of any filepaths in ``base_value``. 1319 | 1320 | Args: 1321 | base_value: any python value. Any file paths in ``base_value`` 1322 | should be processed with `_normalize_path`. 1323 | hash_algorithm (string): either a hash function id that 1324 | exists in hashlib.algorithms_available, 'exists', or 1325 | 'sizetimestamp'. Any paths to actual files in the arguments will be 1326 | digested with this algorithm. 
If value is 'sizetimestamp' the 1327 | digest will only use the normed path, size, and timestamp of any 1328 | files found in the arguments. This value is used when 1329 | determining whether a task is precalculated or its target 1330 | files can be copied to an equivalent task. Note if 1331 | ``hash_algorithm`` is 'sizetimestamp' the task will require the 1332 | same base path files to determine equality. If it is a 1333 | ``hashlib`` algorithm only file contents will be considered. If 1334 | this value is 'exists' the value of the hash will be 'exists'. 1335 | ignore_list (list): any paths found in this list are not included 1336 | as part of the file stats. All paths in this list should be 1337 | "os.path.norm"ed. 1338 | ignore_directories (boolean): If True directories are not 1339 | considered for filestats. 1340 | 1341 | 1342 | Return: 1343 | list of (path, digest) tuples for any filepaths found in 1344 | base_value or nested in base value that are not otherwise 1345 | ignored by the input parameters where digest is created by 1346 | the hash algorithm specified in ``hash_algorithm``. 1347 | 1348 | """ 1349 | if isinstance(base_value, _VALID_PATH_TYPES): 1350 | try: 1351 | norm_path = _normalize_path(base_value) 1352 | if norm_path not in ignore_list and ( 1353 | not os.path.isdir(norm_path) or 1354 | not ignore_directories) and os.path.exists(norm_path): 1355 | if hash_algorithm == 'exists': 1356 | yield (norm_path, 'exists') 1357 | else: 1358 | yield ( 1359 | norm_path, _hash_file(norm_path, hash_algorithm)) 1360 | except (OSError, ValueError): 1361 | # I ran across a ValueError when one of the os.path functions 1362 | # interpreted the value as a path that was too long. 1363 | # OSErrors could happen if there's coincidentally a directory we 1364 | # can't read or it's not a file or something else out of our 1365 | # control 1366 | LOGGER.exception( 1367 | "base_value couldn't be analyzed somehow '%s'", base_value) 1368 | elif isinstance(base_value, dict): 1369 | for key in base_value.keys(): 1370 | value = base_value[key] 1371 | for stat in _get_file_stats( 1372 | value, hash_algorithm, ignore_list, ignore_directories): 1373 | yield stat 1374 | elif isinstance(base_value, (list, set, tuple)): 1375 | for value in base_value: 1376 | for stat in _get_file_stats( 1377 | value, hash_algorithm, ignore_list, ignore_directories): 1378 | yield stat 1379 | 1380 | 1381 | def _filter_non_files( 1382 | base_value, keep_list, ignore_list, keep_directories): 1383 | """Replace file paths in ``base_value`` with ``None`` unless kept. 1384 | 1385 | Args: 1386 | base_value: any python value that may contain filepaths in any 1387 | nested structure. Any file paths in ``base_value`` should be 1388 | "os.path.norm"ed before this function is called. 1389 | keep_list (list): any paths found in this list are not filtered. 1390 | All paths in this list should be "os.path.norm"ed. 1391 | ignore_list (list): any paths found in this list are filtered. 1392 | keep_directories (boolean): If True directories are not filtered 1393 | out. 1394 | 1395 | Return: 1396 | original ``base_value`` structure with any nested paths pointing to 1397 | existing files or directories set to ``None``, unless the path is in ``keep_list`` (or is a directory and ``keep_directories`` is True).
1398 | 1399 | """ 1400 | if isinstance(base_value, _VALID_PATH_TYPES): 1401 | try: 1402 | norm_path = _normalize_path(base_value) 1403 | if norm_path not in ignore_list and ( 1404 | norm_path in keep_list or (( 1405 | os.path.isdir(norm_path) and keep_directories) or ( 1406 | not os.path.isfile(norm_path) and 1407 | not os.path.isdir(norm_path)))): 1408 | return norm_path 1409 | return None 1410 | except (OSError, ValueError): 1411 | # I ran across a ValueError when one of the os.path functions 1412 | # interpreted the value as a path that was too long. 1413 | # OSErrors could happen if there's coincidentally a directory we 1414 | # can't read or it's not a file or something else out of our 1415 | # control 1416 | LOGGER.exception( 1417 | "base_value couldn't be analyzed somehow '%s'", base_value) 1418 | elif isinstance(base_value, dict): 1419 | return { 1420 | key: _filter_non_files( 1421 | value, keep_list, ignore_list, keep_directories) 1422 | for key, value in base_value.items() 1423 | } 1424 | elif isinstance(base_value, (list, set, tuple)): 1425 | return type(base_value)([ 1426 | _filter_non_files( 1427 | value, keep_list, ignore_list, keep_directories) 1428 | for value in base_value]) 1429 | else: 1430 | return base_value 1431 | 1432 | 1433 | def _scrub_task_args(base_value, target_path_list): 1434 | """Attempt to convert ``base_value`` to canonical values. 1435 | 1436 | Any paths in ``base_value`` are normalized, and any paths that are also 1437 | in ``target_path_list`` are replaced with a placeholder so that two 1438 | calls whose arguments differ only in their target path names will 1439 | hash to the same value. 1440 | 1441 | This function can be called before the Task dependencies are satisfied 1442 | since it doesn't inspect any file stats on disk. 1443 | 1444 | Args: 1445 | base_value: any python value 1446 | target_path_list (list): a list of path strings; any that are found 1447 | in ``base_value`` are replaced with the placeholder 'in_target_path_list'. 1448 | 1449 | Returns: 1450 | base_value with any functions replaced by their name/source string 1451 | and any paths in ``target_path_list`` replaced with the 'in_target_path_list' placeholder.
1452 | 1453 | """ 1454 | if callable(base_value): 1455 | try: 1456 | if not hasattr(Task, 'func_source_map'): 1457 | Task.func_source_map = {} 1458 | # memoize func source code because it's likely we'll import 1459 | # the same func many times and reflection is slow 1460 | if base_value not in Task.func_source_map: 1461 | Task.func_source_map[base_value] = ( 1462 | inspect.getsource(base_value)).replace( 1463 | ' ', '').replace('\t', '') 1464 | source_code = Task.func_source_map[base_value] 1465 | except (IOError, TypeError): 1466 | # many reasons for this, for example, frozen Python code won't 1467 | # have source code, so just leave blank 1468 | source_code = '' 1469 | return '%s:%s' % (base_value.__name__, source_code) 1470 | elif isinstance(base_value, dict): 1471 | result_dict = {} 1472 | for key in base_value.keys(): 1473 | result_dict[key] = _scrub_task_args( 1474 | base_value[key], target_path_list) 1475 | return result_dict 1476 | elif isinstance(base_value, (list, set, tuple)): 1477 | result_list = [] 1478 | for value in base_value: 1479 | result_list.append(_scrub_task_args(value, target_path_list)) 1480 | return type(base_value)(result_list) 1481 | elif isinstance(base_value, _VALID_PATH_TYPES): 1482 | normalized_path = _normalize_path(base_value) 1483 | if normalized_path in target_path_list: 1484 | return 'in_target_path_list' 1485 | else: 1486 | return normalized_path 1487 | else: 1488 | return base_value 1489 | 1490 | 1491 | def _hash_file(file_path, hash_algorithm, buf_size=2**20): 1492 | """Return a hex digest of ``file_path``. 1493 | 1494 | Args: 1495 | file_path (string): path to file to hash. 1496 | hash_algorithm (string): a hash function id that exists in 1497 | hashlib.algorithms_available or 'sizetimestamp'. If function id 1498 | is in hashlib.algorithms_available, the file contents are hashed 1499 | with that function and the fingerprint is returned. If value is 1500 | 'sizetimestamp' the size, nanosecond modification time, and 1501 | normalized path of the file are returned in a string of the form 1502 | '[sizeinbytes]::[lastmodifiedtime_ns]::[normalizedpath]'. 1503 | buf_size (int): number of bytes to read from ``file_path`` at a time 1504 | for digesting. 1505 | 1506 | Returns: 1507 | a hash hex digest computed with hash algorithm ``hash_algorithm`` 1508 | of the binary contents of the file located at ``file_path``.
1509 | 1510 | """ 1511 | if hash_algorithm == 'sizetimestamp': 1512 | norm_path = _normalize_path(file_path) 1513 | return '%d::%i::%s' % ( 1514 | os.path.getsize(norm_path), os.stat(norm_path).st_mtime_ns, 1515 | norm_path) 1516 | hash_func = hashlib.new(hash_algorithm) 1517 | with open(file_path, 'rb') as f: 1518 | binary_data = f.read(buf_size) 1519 | while binary_data: 1520 | hash_func.update(binary_data) 1521 | binary_data = f.read(buf_size) 1522 | return hash_func.hexdigest() 1523 | 1524 | 1525 | def _normalize_path(path): 1526 | """Convert ``path`` into normalized, normcase, absolute filepath.""" 1527 | norm_path = os.path.normpath(path) 1528 | try: 1529 | abs_path = os.path.abspath(norm_path) 1530 | except TypeError: 1531 | # this occurs when encountering VERY long strings that might be 1532 | # interpreted as paths 1533 | LOGGER.warning( 1534 | "failed to abspath %s so returning normalized path instead", norm_path) 1535 | abs_path = norm_path 1536 | return os.path.normcase(abs_path) 1537 | 1538 | 1539 | @retrying.retry( 1540 | wait_exponential_multiplier=500, wait_exponential_max=3200, 1541 | stop_max_attempt_number=100) 1542 | def _execute_sqlite( 1543 | sqlite_command, database_path, argument_list=None, 1544 | mode='read_only', execute='execute', fetch=None): 1545 | """Execute SQLite command and attempt retries on a failure. 1546 | 1547 | Args: 1548 | sqlite_command (str): a well-formatted SQLite command. 1549 | database_path (str): path to the SQLite database to operate on. 1550 | argument_list (list): if ``execute == 'execute'``, this list is passed 1551 | to the internal sqlite3 ``execute`` call. 1552 | mode (str): must be either 'read_only' or 'modify'. 1553 | execute (str): must be either 'execute' or 'script'. 1554 | fetch (str): if not ``None`` can be either 'all' or 'one'. 1555 | If not None the result of a fetch will be returned by this 1556 | function. 1557 | 1558 | Returns: 1559 | result of fetch if ``fetch`` is not None.
1560 | 1561 | """ 1562 | cursor = None 1563 | connection = None 1564 | try: 1565 | if mode == 'read_only': 1566 | ro_uri = r'%s?mode=ro' % pathlib.Path( 1567 | os.path.abspath(database_path)).as_uri() 1568 | LOGGER.debug( 1569 | '%s exists: %s', ro_uri, os.path.exists(os.path.abspath( 1570 | database_path))) 1571 | connection = sqlite3.connect(ro_uri, uri=True) 1572 | elif mode == 'modify': 1573 | connection = sqlite3.connect(database_path) 1574 | else: 1575 | raise ValueError('Unknown mode: %s' % mode) 1576 | 1577 | if execute == 'execute': 1578 | if argument_list is None: 1579 | cursor = connection.execute(sqlite_command) 1580 | else: 1581 | cursor = connection.execute(sqlite_command, argument_list) 1582 | elif execute == 'script': 1583 | cursor = connection.executescript(sqlite_command) 1584 | else: 1585 | raise ValueError('Unknown execute mode: %s' % execute) 1586 | 1587 | result = None 1588 | payload = None 1589 | if fetch == 'all': 1590 | payload = (cursor.fetchall()) 1591 | elif fetch == 'one': 1592 | payload = (cursor.fetchone()) 1593 | elif fetch is not None: 1594 | raise ValueError('Unknown fetch mode: %s' % fetch) 1595 | if payload is not None: 1596 | result = list(payload) 1597 | cursor.close() 1598 | connection.commit() 1599 | connection.close() 1600 | cursor = None 1601 | connection = None 1602 | return result 1603 | except sqlite3.OperationalError: 1604 | LOGGER.warning( 1605 | 'TaskGraph database is locked because another process is using ' 1606 | 'it, waiting for a bit of time to try again') 1607 | raise 1608 | except Exception: 1609 | LOGGER.exception('Exception on _execute_sqlite: %s', sqlite_command) 1610 | raise 1611 | finally: 1612 | if cursor is not None: 1613 | cursor.close() 1614 | if connection is not None: 1615 | connection.commit() 1616 | connection.close() 1617 | --------------------------------------------------------------------------------
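For orientation, here is a minimal usage sketch of the public API documented in taskgraph/Task.py above (``add_task``, ``close``, ``join`` and ``Task.get``). The ``TaskGraph`` constructor arguments shown here (a cache directory for the task database and a worker count) are assumed from the internal attributes referenced in the excerpt (``_task_database_path``, ``_n_workers``) and are not confirmed by it; per the ``add_task`` code above, a negative worker count executes each task synchronously as it is added.

import shutil

import taskgraph


def double(value, target_path):
    """Write ``value * 2`` to ``target_path`` and return it."""
    result = value * 2
    with open(target_path, 'w') as target_file:
        target_file.write(str(result))
    return result


# assumed constructor signature: (cache directory, n_workers); -1 workers
# runs each task in the calling thread as described in ``add_task`` above
graph = taskgraph.TaskGraph('taskgraph_cache', -1)

doubling_task = graph.add_task(
    func=double,
    args=(10, 'doubled.txt'),
    target_path_list=['doubled.txt'],
    store_result=True,  # required for doubling_task.get() below
    task_name='double 10')

copy_task = graph.add_task(
    func=shutil.copyfile,
    args=('doubled.txt', 'doubled_copy.txt'),
    target_path_list=['doubled_copy.txt'],
    dependent_task_list=[doubling_task],  # runs only after doubling_task
    task_name='copy doubled result')

graph.close()  # no more tasks may be added
graph.join()   # blocks until all tasks finish; re-raises task exceptions
print(doubling_task.get())  # 20, retrieved from the stored result

On a second run with the same cache directory, both tasks would be detected as precalculated (their target files and argument hashes are unchanged), so neither function is re-executed and ``get()`` still returns the stored result.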