├── requirements_dev.txt
├── taskqueue_cli
│   ├── __init__.py
│   ├── LICENSE
│   └── taskqueue_cli.py
├── MANIFEST.in
├── requirements.txt
├── tox.ini
├── AUTHORS
├── taskqueue
│   ├── __init__.py
│   ├── paths.py
│   ├── queueables.py
│   ├── scheduler.py
│   ├── lib.py
│   ├── registered_task.py
│   ├── queueablefns.py
│   ├── secrets.py
│   ├── aws_queue_api.py
│   ├── goog_pubsub_api.py
│   ├── threaded_queue.py
│   ├── file_queue_api.py
│   └── taskqueue.py
├── setup.py
├── test
│   ├── pathos_issue.py
│   ├── test_filequeue.py
│   └── test_taskqueue.py
├── setup.cfg
├── .github
│   └── workflows
│       └── python-package.yml
├── LICENSE
├── .gitignore
├── ChangeLog
└── README.md
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | moto[all]
2 | pytest
3 |
--------------------------------------------------------------------------------
/taskqueue_cli/__init__.py:
--------------------------------------------------------------------------------
1 | from .taskqueue_cli import *
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include taskqueue_cli/LICENSE
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3
2 | click
3 | gevent
4 | google-auth>=1.10.0
5 | google-cloud-core>=1.1.0
6 | orjson
7 | numpy>=1.13.1
8 | pathos
9 | pbr
10 | tenacity>=8.0.1
11 | tqdm
12 | requests>2,<3
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27,py36,py37,py38
3 |
4 | [testenv]
5 | platform = darwin
6 | deps =
7 |   -rrequirements.txt
8 |   -rrequirements_dev.txt
9 |
10 | commands =
11 |   python setup.py develop
12 |   python -m pytest -v -x test
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Chris Roat <1053153+chrisroat@users.noreply.github.com>
2 | Forrest Collman
3 | Jingpeng Wu
4 | Nico Kemnitz
5 | William Silversmith
6 | supersergiy
--------------------------------------------------------------------------------
/taskqueue/__init__.py:
--------------------------------------------------------------------------------
1 | from .registered_task import RegisteredTask, MockTask, PrintTask
2 | from .taskqueue import (
3 |   TaskQueue, MockTaskQueue, GreenTaskQueue, LocalTaskQueue,
4 |   multiprocess_upload, QueueEmptyError, totask, UnsupportedProtocolError
5 | )
6 | from .queueablefns import queueable, FunctionTask
7 |
8 | __version__ = '2.13.0'
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | setuptools.setup(
4 |   setup_requires=['pbr'],
5 |   include_package_data=True,
6 |   entry_points={
7 |     "console_scripts": [
8 |       "ptq=taskqueue_cli:main"
9 |     ],
10 |   },
11 |   long_description_content_type="text/markdown",
12 |   pbr=True,
13 |   extras_require={
14 |     "pubsub": ["google-cloud-pubsub"],
15 |   }
16 | )
--------------------------------------------------------------------------------
/test/pathos_issue.py:
--------------------------------------------------------------------------------
1 | from taskqueue import PrintTask
2 |
3 | import copy
4
| 5 | def crt_tasks(a,b): 6 | bounds = 5 7 | 8 | class TaskIterator(): 9 | def __init__(self, x): 10 | self.x = x 11 | def __len__(self): 12 | return b-a 13 | def __getitem__(self, slc): 14 | itr = copy.deepcopy(self) 15 | itr.x = 666 16 | return itr 17 | def __iter__(self): 18 | for i in range(a,b): 19 | yield PrintTask(str(i) + str(self.x)) 20 | 21 | return TaskIterator(bounds) 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = task-queue 3 | url = https://github.com/seung-lab/python-task-queue/ 4 | summary = Multithreaded cloud queue client. 5 | description-file = README.md 6 | author = Ignacio Tartavull, William Silversmith, and others 7 | author-email = ws9@princeton.edu 8 | home-page = https://github.com/seung-lab/python-task-queue/ 9 | license = BSD-3-Clause 10 | classifier = 11 | Intended Audience :: Developers 12 | Development Status :: 5 - Production/Stable 13 | Programming Language :: Python 14 | Programming Language :: Python :: 3 15 | Programming Language :: Python :: 3.9 16 | Programming Language :: Python :: 3.10 17 | Programming Language :: Python :: 3.11 18 | Programming Language :: Python :: 3.12 19 | Programming Language :: Python :: 3.13 20 | Topic :: Utilities 21 | 22 | [global] 23 | setup-hooks = 24 | pbr.hooks.setup_hook 25 | 26 | [files] 27 | packages = 28 | taskqueue 29 | taskqueue_cli 30 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | if [ -f requirements.txt ]; then pip install -r requirements.txt -r requirements_dev.txt; fi 30 | - name: Test with pytest 31 | run: | 32 | python -m pytest -v -x test 33 | -------------------------------------------------------------------------------- /taskqueue/paths.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import os 3 | import posixpath 4 | import re 5 | import sys 6 | 7 | from .lib import yellow, toabs 8 | 9 | ExtractedPath = namedtuple('ExtractedPath', 10 | ('protocol', 'path') 11 | ) 12 | 13 | ALLOWED_PROTOCOLS = [ 'sqs', 'fq', 'mem', 'pubsub'] 14 | 15 | def mkpath(extracted_path): 16 | return extracted_path.protocol + "://" + extracted_path.path 17 | 18 | def get_protocol(cloudpath): 19 | protocol_re = re.compile(r'(?P\w+)://') 20 | match = re.match(protocol_re, cloudpath) 21 | if not match: 22 | return None 23 | return match.group("proto") 24 | 25 | def pop_protocol(cloudpath): 26 | protocol_re = re.compile(r'(\w+)://') 27 | 28 | match = 
re.match(protocol_re, cloudpath) 29 | 30 | if not match: 31 | return ("sqs", cloudpath) 32 | 33 | (protocol,) = match.groups() 34 | cloudpath = re.sub(protocol_re, '', cloudpath, count=1) 35 | 36 | return (protocol, cloudpath) 37 | 38 | def extract_path(cloudpath): 39 | protocol, queue_path = pop_protocol(cloudpath) 40 | if protocol in ('http', 'https'): 41 | if 'sqs' in queue_path and 'amazonaws.com' in queue_path: 42 | protocol = 'sqs' 43 | queue_path = cloudpath 44 | 45 | return ExtractedPath(protocol, queue_path) 46 | 47 | -------------------------------------------------------------------------------- /taskqueue/queueables.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import orjson 4 | 5 | from .queueablefns import totask as qtotask, FunctionTask, FunctionTaskLite, tofunc 6 | from .registered_task import totask as rtotask, RegisteredTask 7 | 8 | def totask(task): 9 | if isinstance(task, (FunctionTask, RegisteredTask)): 10 | return task 11 | 12 | if type(task) is bytes: 13 | task = task.decode('utf8') 14 | if isinstance(task, str): 15 | task = orjson.loads(task) 16 | 17 | ident = -1 18 | if isinstance(task, dict): 19 | ident = task.get('id', -1) 20 | if 'payload' in task: 21 | task = task['payload'] 22 | 23 | if isinstance(task, FunctionTaskLite): 24 | return FunctionTask(*task) 25 | elif isinstance(task, list): 26 | task[3] = ident 27 | return FunctionTask(*task) 28 | elif isinstance(task, dict): 29 | return rtotask(task, ident) 30 | elif isinstance(task, partial) or callable(task): 31 | return qtotask(task, ident) 32 | 33 | raise ValueError("Unable to convert {} into a task object.".format(task)) 34 | 35 | def totaskid(taskid): 36 | if hasattr(taskid, 'id'): 37 | return taskid.id 38 | elif 'id' in taskid: 39 | return taskid['id'] 40 | elif isinstance(taskid, (list, tuple)): 41 | return taskid[3] 42 | return taskid -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017-2020, William Silversmith, Seung Lab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /taskqueue_cli/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017-2020, William Silversmith, Seung Lab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # OS X 104 | .DS_Store 105 | 106 | test.py -------------------------------------------------------------------------------- /taskqueue/scheduler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from tqdm import tqdm 4 | 5 | from .threaded_queue import ThreadedQueue, DEFAULT_THREADS 6 | from .lib import STRING_TYPES 7 | 8 | DEFAULT_THREADS = 20 9 | 10 | def schedule_threaded_jobs( 11 | fns, concurrency=DEFAULT_THREADS, 12 | progress=None, total=None 13 | ): 14 | 15 | if total is None: 16 | try: 17 | total = len(fns) 18 | except TypeError: # generators don't have len 19 | pass 20 | 21 | desc = progress if isinstance(progress, STRING_TYPES) else None 22 | 23 | pbar = tqdm(total=total, desc=desc, disable=(not progress)) 24 | results = [] 25 | 26 | def updatefn(fn): 27 | def realupdatefn(iface): 28 | ct = fn() 29 | pbar.update(ct) 30 | results.append(ct) # cPython list append is thread safe 31 | return realupdatefn 32 | 33 | with ThreadedQueue(n_threads=concurrency) as tq: 34 | for fn in fns: 35 | tq.put(updatefn(fn)) 36 | 37 | return results 38 | 39 | def schedule_green_jobs( 40 | fns, concurrency=DEFAULT_THREADS, 41 | progress=None, total=None 42 | ): 43 | import gevent.pool 44 | 45 | if total is None: 46 | try: 47 | total = len(fns) 48 | except TypeError: # generators don't have len 49 | pass 50 | 51 | desc = progress if isinstance(progress, STRING_TYPES) else None 52 | 53 | pbar = tqdm(total=total, desc=desc, disable=(not progress)) 54 | 55 | results = [] 56 | exceptions = [] 57 | 58 | def add_exception(greenlet): 59 | nonlocal exceptions 60 | try: 61 | greenlet.get() 62 | except Exception as err: 63 | exceptions.append(err) 64 | 65 | def updatefn(fn): 66 | def realupdatefn(): 67 | ct = fn() 68 | pbar.update(ct) 69 | results.append(ct) # cPython list append is thread safe 70 | return realupdatefn 71 | 72 | pool = gevent.pool.Pool(concurrency) 73 | for fn in fns: 74 | greenlet = pool.spawn( updatefn(fn) ) 75 | greenlet.link_exception(add_exception) 76 | 77 | pool.join() 78 | pool.kill() 79 | pbar.close() 80 | 81 | if exceptions: 82 | raise_multiple(exceptions) 83 | 84 | return results 85 | 86 | def schedule_jobs( 87 | fns, concurrency=DEFAULT_THREADS, 88 | progress=None, total=None, green=False 89 | ): 90 | """ 91 | Given a list of functions, execute them concurrently until 92 | all complete. 
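    As a minimal illustrative sketch (hypothetical no-argument callables; each
    one returns the count used to advance the progress bar and collected into
    the result list):

      >>> fns = [ (lambda: 1) for _ in range(4) ]
      >>> schedule_jobs(fns, concurrency=2)
      [1, 1, 1, 1]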
93 | 94 | fns: iterable of functions 95 | concurrency: number of threads 96 | progress: Falsey (no progress), String: Progress + description 97 | total: If fns is a generator, this is the number of items to be generated. 98 | green: If True, use green threads. 99 | 100 | Return: list of results 101 | """ 102 | if concurrency < 0: 103 | raise ValueError("concurrency value cannot be negative: {}".format(concurrency)) 104 | elif concurrency == 0 or total == 1: 105 | return [ fn() for fn in tqdm(fns, disable=(not progress or total == 1), desc=progress) ] 106 | 107 | if green: 108 | return schedule_green_jobs(fns, concurrency, progress, total) 109 | 110 | return schedule_threaded_jobs(fns, concurrency, progress, total) 111 | 112 | 113 | -------------------------------------------------------------------------------- /test/test_filequeue.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | 5 | from six.moves import range 6 | import pytest 7 | 8 | import taskqueue 9 | from taskqueue import RegisteredTask, TaskQueue, MockTask, PrintTask, LocalTaskQueue 10 | from taskqueue.paths import ExtractedPath, mkpath 11 | 12 | FILE_QURL = 'fq:///tmp/removeme/taskqueue/fq' 13 | N = 1000 14 | 15 | def crtq(): 16 | tq = TaskQueue(FILE_QURL) 17 | tq.purge() 18 | tq.rezero() 19 | 20 | tq.insert(( PrintTask(i) for i in range(N) )) 21 | return tq 22 | 23 | def test_release_all(): 24 | tq = crtq() 25 | for _ in range(tq.enqueued): 26 | task = tq.lease(seconds=3600) 27 | 28 | now = int(time.time()) 29 | for fname in os.listdir(tq.api.queue_path): 30 | assert int(fname.split('--')[0]) > now 31 | 32 | tq.release_all() 33 | now = int(time.time()) 34 | for fname in os.listdir(tq.api.queue_path): 35 | assert int(fname.split('--')[0]) <= now 36 | 37 | tq.purge() 38 | 39 | def test_count_completions(): 40 | tq = crtq() 41 | executed = tq.poll(stop_fn=lambda executed: N <= executed) 42 | assert tq.completed == 0 43 | tq = crtq() 44 | tq.poll(stop_fn=lambda executed: N <= executed, tally=True) 45 | assert tq.completed == N 46 | 47 | tq.purge() 48 | 49 | def test_count_insertions(): 50 | tq = crtq() 51 | assert tq.inserted == N 52 | tq.rezero() 53 | assert tq.inserted == 0 54 | 55 | tq.purge() 56 | 57 | def test_count_leases(): 58 | tq = crtq() 59 | assert tq.leased == 0 60 | tq.lease(seconds=10000) 61 | assert tq.leased == 1 62 | tq.lease(seconds=10000) 63 | tq.lease(seconds=10000) 64 | tq.lease(seconds=10000) 65 | assert tq.leased == 4 66 | tq.release_all() 67 | assert tq.leased == 0 68 | 69 | tq.purge() 70 | 71 | def test_renew(): 72 | tq = TaskQueue(FILE_QURL) 73 | tq.purge() 74 | 75 | tq.insert(PrintTask('hello')) 76 | 77 | ts = lambda fname: int(fname.split('--')[0]) 78 | ident = lambda fname: fname.split('--')[1] 79 | 80 | filenames = os.listdir(tq.api.queue_path) 81 | assert len(filenames) == 1 82 | filename = filenames[0] 83 | 84 | assert ts(filename) == 0 85 | identity = ident(filename) 86 | 87 | now = time.time() 88 | tq.renew(filename, 1) 89 | 90 | filenames = os.listdir(tq.api.queue_path) 91 | assert len(filenames) == 1 92 | filename = filenames[0] 93 | 94 | assert ts(filename) >= int(time.time()) + 1 95 | assert ident(filename) == identity 96 | 97 | def test_enumerating_tasks(): 98 | tq = TaskQueue(FILE_QURL) 99 | tq.purge() 100 | 101 | for _ in range(10): 102 | tq.insert(PrintTask('hello')) 103 | tq.insert(PrintTask('world')) 104 | 105 | lst = list(tq.tasks()) 106 | 107 | assert len(lst) == 20 108 | hello = 0 109 | world = 0 110 | 
for task in lst: 111 | hello += int(task.txt == "hello") 112 | world += int(task.txt == "world") 113 | 114 | assert hello == 10 115 | assert world == 10 116 | 117 | def test_is_empty(): 118 | tq = TaskQueue(FILE_QURL) 119 | tq.purge() 120 | 121 | assert tq.is_empty() == True 122 | 123 | tq.insert(PrintTask("hello")) 124 | 125 | assert tq.is_empty() == False 126 | 127 | task = tq.lease() 128 | tq.delete(task) 129 | 130 | assert tq.is_empty() == True 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /taskqueue/lib.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import json 3 | import os.path 4 | import time 5 | import types 6 | import sys 7 | 8 | if sys.version_info < (3,0,0): 9 | STRING_TYPES = (str, unicode) 10 | else: 11 | STRING_TYPES = (str,) 12 | 13 | COLORS = { 14 | 'RESET': "\033[m", 15 | 'YELLOW': "\033[1;93m", 16 | 'RED': '\033[1;91m', 17 | 'GREEN': '\033[1;92m', 18 | } 19 | 20 | def green(text): 21 | return colorize('green', text) 22 | 23 | def yellow(text): 24 | return colorize('yellow', text) 25 | 26 | def red(text): 27 | return colorize('red', text) 28 | 29 | def colorize(color, text): 30 | color = color.upper() 31 | return COLORS[color] + text + COLORS['RESET'] 32 | 33 | def toabs(path): 34 | path = os.path.expanduser(path) 35 | return os.path.abspath(path) 36 | 37 | def nvl(*args): 38 | """Return the leftmost argument that is not None.""" 39 | if len(args) < 2: 40 | raise IndexError("nvl takes at least two arguments.") 41 | for arg in args: 42 | if arg is not None: 43 | return arg 44 | return args[-1] 45 | 46 | def mkdir(path): 47 | path = toabs(path) 48 | 49 | try: 50 | if path != '' and not os.path.exists(path): 51 | os.makedirs(path) 52 | except OSError as e: 53 | if e.errno == 17: # File Exists 54 | time.sleep(0.1) 55 | return mkdir(path) 56 | else: 57 | raise 58 | 59 | return path 60 | 61 | def sip(iterable, block_size): 62 | """Sips a fixed size from the iterable.""" 63 | ct = 0 64 | block = [] 65 | for x in iterable: 66 | ct += 1 67 | block.append(x) 68 | if ct == block_size: 69 | yield block 70 | ct = 0 71 | block = [] 72 | 73 | if len(block) > 0: 74 | yield block 75 | 76 | class NumpyEncoder(json.JSONEncoder): 77 | def default(self, obj): 78 | try: 79 | import numpy as np 80 | except ImportError: 81 | try: 82 | return json.JSONEncoder.default(self, obj) 83 | except TypeError: 84 | if 'numpy' in str(type(obj)): 85 | print(yellow("Type " + str(type(obj)) + " requires a numpy installation to encode. 
Try `pip install numpy`.")) 86 | raise 87 | 88 | if isinstance(obj, np.ndarray): 89 | return obj.tolist() 90 | if isinstance(obj, np.integer): 91 | return int(obj) 92 | if isinstance(obj, np.floating): 93 | return float(obj) 94 | return json.JSONEncoder.default(self, obj) 95 | 96 | def jsonify(obj, **kwargs): 97 | return json.dumps(obj, cls=NumpyEncoder, **kwargs) 98 | 99 | def first(lst): 100 | if isinstance(lst, types.GeneratorType): 101 | return next(lst) 102 | try: 103 | return lst[0] 104 | except TypeError: 105 | return next(iter(lst)) 106 | 107 | def toiter(obj, is_iter=False): 108 | if isinstance(obj, STRING_TYPES) or isinstance(obj, dict): 109 | if is_iter: 110 | return [ obj ], False 111 | return [ obj ] 112 | 113 | try: 114 | iter(obj) 115 | if is_iter: 116 | return obj, True 117 | return obj 118 | except TypeError: 119 | if is_iter: 120 | return [ obj ], False 121 | return [ obj ] 122 | 123 | def duplicates(lst): 124 | dupes = [] 125 | seen = set() 126 | for elem in lst: 127 | if elem in seen: 128 | dupes.append(elem) 129 | seen.add(elem) 130 | return set(dupes) 131 | 132 | def scatter(sequence, n): 133 | """Scatters elements of ``sequence`` into ``n`` blocks. Returns generator.""" 134 | if n < 1: 135 | raise ValueError('n cannot be less than one. Got: ' + str(n)) 136 | sequence = list(sequence) 137 | for i in range(n): 138 | yield sequence[i::n] 139 | -------------------------------------------------------------------------------- /taskqueue/registered_task.py: -------------------------------------------------------------------------------- 1 | import six 2 | from six import with_metaclass 3 | import functools 4 | import inspect 5 | import orjson 6 | import copy 7 | import re 8 | from collections import OrderedDict 9 | from functools import partial 10 | 11 | import numpy as np 12 | 13 | REGISTRY = {} 14 | 15 | def totask(task, ident=-1): 16 | taskobj = deserialize(task) 17 | taskobj._id = ident 18 | return taskobj 19 | 20 | def deserialize(data): 21 | if type(data) is bytes: 22 | data = data.decode('utf8') 23 | 24 | if isinstance(data, six.string_types): 25 | data = orjson.loads(data) 26 | 27 | name = data['class'] 28 | target_class = REGISTRY[name] 29 | del data['class'] 30 | return target_class(**data) 31 | 32 | class Meta(type): 33 | def __new__(meta, name, bases, class_dict): 34 | cls = type.__new__(meta, name, bases, class_dict) 35 | REGISTRY[cls.__name__] = cls 36 | 37 | if hasattr(inspect, 'getfullargspec'): 38 | argspecfn = inspect.getfullargspec 39 | else: 40 | argspecfn = inspect.getargspec 41 | 42 | cls._arg_names = argspecfn(class_dict['__init__'])[0][1:] 43 | return cls 44 | 45 | class RegisteredTask(with_metaclass(Meta)): 46 | def __init__(self, *args, **kwargs): 47 | self._args = OrderedDict(zip(self._arg_names, args)) 48 | self._args.update(kwargs) 49 | 50 | for k,v in six.iteritems(self._args): 51 | self.__dict__[k] = v 52 | 53 | @classmethod 54 | def deserialize(cls, data): 55 | obj = deserialize(data) 56 | assert isinstance(obj, cls) 57 | return obj 58 | 59 | def payload(self): 60 | def denumpy(obj): 61 | if hasattr(obj, 'serialize') and callable(obj.serialize): 62 | return obj.serialize() 63 | 64 | try: 65 | iter(obj) 66 | except TypeError: 67 | return obj 68 | 69 | if isinstance(obj, bytes): 70 | return obj.decode('utf8') 71 | elif isinstance(obj, str): 72 | return obj 73 | 74 | if isinstance(obj, list) or isinstance(obj, tuple): 75 | return [ denumpy(x) for x in obj ] 76 | 77 | for key, val in six.iteritems(obj): 78 | if isinstance(val, np.ndarray): 79 | 
obj[key] = val.tolist() 80 | elif isinstance(val, dict): 81 | obj[key] = denumpy(val) 82 | elif isinstance(val, list): 83 | obj[key] = [ denumpy(x) for x in val ] 84 | elif hasattr(val, 'serialize') and callable(val.serialize): 85 | obj[key] = val.serialize() 86 | 87 | return obj 88 | 89 | argcpy = copy.deepcopy(self._args) 90 | for k,v in six.iteritems(self._args): 91 | argcpy[k] = self.__dict__[k] 92 | argcpy['class'] = self.__class__.__name__ 93 | 94 | return dict(denumpy(argcpy)) 95 | 96 | @property 97 | def id(self): 98 | return self._id 99 | 100 | def __repr__(self): 101 | string = self.__class__.__name__ + "(" 102 | for arg_name, arg_value in six.iteritems(self._args): 103 | if isinstance(arg_value, six.string_types): 104 | string += "{}='{}',".format(arg_name, arg_value) 105 | else: 106 | string += "{}={},".format(arg_name, arg_value) 107 | 108 | # remove the last comma if necessary 109 | if string[-1] == ',': 110 | string = string[:-1] 111 | 112 | return string + ")" 113 | 114 | class PrintTask(RegisteredTask): 115 | def __init__(self, txt=''): 116 | super(PrintTask, self).__init__(txt) 117 | self.txt = txt 118 | 119 | def execute(self): 120 | if self.txt: 121 | print(str(self) + ": " + str(self.txt)) 122 | else: 123 | print(self) 124 | 125 | class MockTask(RegisteredTask): 126 | def __init__(self, **kwargs): 127 | super(MockTask, self).__init__(**kwargs) 128 | def execute(self): 129 | pass -------------------------------------------------------------------------------- /taskqueue_cli/taskqueue_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import importlib 4 | 5 | import click 6 | from tqdm import tqdm 7 | 8 | from taskqueue import TaskQueue, __version__, QueueEmptyError 9 | from taskqueue.lib import toabs 10 | from taskqueue.paths import get_protocol 11 | 12 | def normalize_path(queuepath): 13 | if not get_protocol(queuepath): 14 | return "fq://" + toabs(queuepath) 15 | return queuepath 16 | 17 | @click.group() 18 | @click.version_option(version=__version__) 19 | def main(): 20 | """ 21 | CLI tool for managing python-task-queue queues. 
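  A few illustrative invocations (the queue paths below are hypothetical;
  bare filesystem paths are treated as fq:// file queues):

    ptq status ./my-queue
    ptq cp ./my-queue sqs://my-cloud-queue
    ptq purge ./my-queue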
22 | 23 | https://github.com/seung-lab/python-task-queue 24 | """ 25 | pass 26 | 27 | @main.command() 28 | def license(): 29 | """Prints the license for this library and cli tool.""" 30 | path = os.path.join(os.path.dirname(__file__), 'LICENSE') 31 | with open(path, 'rt') as f: 32 | print(f.read()) 33 | 34 | @main.command() 35 | @click.argument("queuepath") 36 | def rezero(queuepath): 37 | """Reset collected statistics for queue.""" 38 | TaskQueue(normalize_path(queuepath)).rezero() 39 | 40 | @main.command() 41 | @click.argument("queuepath") 42 | def status(queuepath): 43 | """Print vital statistics for queue.""" 44 | tq = TaskQueue(normalize_path(queuepath)) 45 | ins = tq.inserted 46 | enq = tq.enqueued 47 | comp = tq.completed 48 | leased = tq.leased 49 | 50 | if not math.isnan(ins): 51 | print(f"Inserted: {ins}") 52 | 53 | if ins > 0: 54 | print(f"Enqueued: {enq} ({enq / ins * 100:.1f}% left)") 55 | if not math.isnan(comp): 56 | print(f"Completed: {comp} ({comp / ins * 100:.1f}%)") 57 | else: 58 | print(f"Enqueued: {enq} (--% left)") 59 | if not math.isnan(comp): 60 | print(f"Completed: {comp} (--%)") 61 | 62 | if enq > 0: 63 | print(f"Leased: {leased} ({leased / enq * 100:.1f}% of queue)") 64 | else: 65 | print(f"Leased: {leased} (--%) of queue") 66 | 67 | @main.command() 68 | @click.argument("queuepath") 69 | def release(queuepath): 70 | """Release all tasks from their leases.""" 71 | TaskQueue(normalize_path(queuepath)).release_all() 72 | 73 | @main.command() 74 | @click.argument("src") 75 | @click.argument("dest") 76 | @click.option('--load', default=None, help="Load a module to get task definitions.", show_default=True) 77 | def cp(src, dest, load): 78 | """ 79 | Copy the contents of a queue to another 80 | service or location. Do not run this 81 | process while a queue is being worked. 82 | 83 | Currently sqs queues are not copiable, 84 | but you can copy an fq to sqs. The mv 85 | command supports sqs queues. 86 | """ 87 | if load: 88 | importlib.import_module(load) 89 | 90 | src = normalize_path(src) 91 | dest = normalize_path(dest) 92 | 93 | if get_protocol(src) == "sqs": 94 | print("ptq: cp does not support sqs:// as a source.") 95 | return 96 | 97 | tqd = TaskQueue(dest) 98 | tqs = TaskQueue(src) 99 | 100 | tqd.insert(tqs) 101 | 102 | @main.command() 103 | @click.argument("src") 104 | @click.argument("dest") 105 | def mv(src, dest): 106 | """ 107 | Moves the contents of a queue to another 108 | service or location. Do not run this 109 | process while a queue is being worked. 110 | 111 | Moving an sqs queue to a file queue 112 | may result in duplicated tasks. 
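  Example (hypothetical queue locations):

    ptq mv sqs://my-cloud-queue fq:///tmp/my-local-queue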
113 | """ 114 | src = normalize_path(src) 115 | dest = normalize_path(dest) 116 | 117 | tqd = TaskQueue(dest, progress=False) 118 | tqs = TaskQueue(src, progress=False) 119 | 120 | total = tqs.enqueued 121 | with tqdm(total=total, desc="Moving") as pbar: 122 | while True: 123 | try: 124 | tasks = tqs.lease(num_tasks=10, seconds=10) 125 | except QueueEmptyError: 126 | break 127 | 128 | tqd.insert(tasks) 129 | tqs.delete(tasks) 130 | pbar.update(len(tasks)) 131 | 132 | @main.command() 133 | @click.argument("queuepath") 134 | def purge(queuepath): 135 | """Delete all queued messages and zero out queue statistics.""" 136 | queuepath = normalize_path(queuepath) 137 | tq = TaskQueue(queuepath) 138 | tq.purge() 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /taskqueue/queueablefns.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import functools 3 | import inspect 4 | import orjson 5 | import copy 6 | import re 7 | from collections import OrderedDict 8 | from functools import partial 9 | 10 | import numpy as np 11 | 12 | REGISTRY = {} 13 | FunctionTaskLite = namedtuple("FunctionTaskLite", [ "key", "args", "kwargs", "id" ]) 14 | 15 | class UnregisteredFunctionError(BaseException): 16 | pass 17 | 18 | def totask(data, ident=-1): 19 | if isinstance(data, FunctionTask): 20 | return data 21 | if isinstance(data, partial) or callable(data): 22 | return func2task(data, ident) 23 | 24 | if type(data) is bytes: 25 | data = data.decode('utf8') 26 | if isinstance(data, str): 27 | data = orjson.loads(data) 28 | data[3] = ident 29 | return FunctionTask(*data) 30 | 31 | def func2task(fn, ident): 32 | args = [] 33 | kwargs = {} 34 | rawfn = fn 35 | while isinstance(rawfn, partial): 36 | args += rawfn.args 37 | kwargs.update(rawfn.keywords) 38 | rawfn = rawfn.func 39 | 40 | if not argsokay(rawfn, args, kwargs): 41 | raise TypeError("{} didn't get valid arguments. Got: {}, {}. 
Expected: {}".format( 42 | rawfn, args, kwargs, inspect.getfullargspec(rawfn) 43 | )) 44 | 45 | return FunctionTask( 46 | (rawfn.__module__, rawfn.__name__), 47 | args, 48 | kwargs, 49 | ident 50 | ) 51 | 52 | def tofunc(task): 53 | if callable(task): 54 | return task 55 | 56 | fn = REGISTRY.get(tuple(task[0]), None) 57 | if fn is None: 58 | raise UnregisteredFunctionError("{} is not registered as a queuable function.".format(task.key)) 59 | return partial(fn, *task[1], **task[2]) 60 | 61 | class FunctionTask(): 62 | __slots__ = ['key', 'args', 'kwargs', 'id', '_order'] 63 | def __init__(self, key, args, kwargs, id=None): 64 | self.key = tuple(key) 65 | self.args = args 66 | self.kwargs = kwargs 67 | self.id = id 68 | self._order = ('key', 'args', 'kwargs', 'id') 69 | def __getitem__(self, idx): 70 | return getattr(self, self._order[idx]) 71 | def __setitem__(self, idx, value): 72 | setattr(self, self._order[idx], value) 73 | def __iter__(self): 74 | raise TypeError("FunctionTask is not an iterable.") 75 | def payload(self): 76 | return FunctionTaskLite(self.key, self.args, self.kwargs, self.id) 77 | def execute(self, *args, **kwargs): 78 | self(*args, **kwargs) 79 | def tofunc(self): 80 | fn = REGISTRY.get(tuple(self.key), None) 81 | if fn is None: 82 | raise UnregisteredFunctionError("{} is not registered as a queuable function.".format(self.key)) 83 | return partial(fn, *self.args, **self.kwargs) 84 | def __repr__(self): 85 | return "FunctionTask({},{},{},\"{}\")".format(self.key, self.args, self.kwargs, self.id) 86 | def __call__(self): 87 | return self.tofunc()() 88 | 89 | def jsonifyable(obj): 90 | if hasattr(obj, 'serialize') and callable(obj.serialize): 91 | return obj.serialize() 92 | 93 | try: 94 | iter(obj) 95 | except TypeError: 96 | return obj 97 | 98 | if isinstance(obj, bytes): 99 | return obj.decode('utf8') 100 | elif isinstance(obj, str): 101 | return obj 102 | 103 | if isinstance(obj, list) or isinstance(obj, tuple): 104 | return [ jsonifyable(x) for x in obj ] 105 | 106 | for key, val in obj.items(): 107 | if isinstance(val, np.ndarray): 108 | obj[key] = val.tolist() 109 | elif isinstance(val, dict): 110 | obj[key] = jsonifyable(val) 111 | elif isinstance(val, list): 112 | obj[key] = [ jsonifyable(x) for x in val ] 113 | elif hasattr(val, 'serialize') and callable(val.serialize): 114 | obj[key] = val.serialize() 115 | 116 | return obj 117 | 118 | def argsokay(fn, args, kwargs): 119 | sig = inspect.signature(fn) 120 | try: 121 | sig.bind(*args, **kwargs) 122 | except TypeError: 123 | return False 124 | return True 125 | 126 | def queueable(fn): 127 | """Register the input function as queueable and executable via TaskQueue.""" 128 | REGISTRY[(fn.__module__, fn.__name__)] = fn 129 | return fn -------------------------------------------------------------------------------- /taskqueue/secrets.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from collections import defaultdict 4 | import os 5 | import json 6 | 7 | from google.oauth2 import service_account 8 | 9 | from .lib import mkdir, colorize 10 | 11 | HOME = os.path.expanduser('~') 12 | CLOUD_VOLUME_DIR = mkdir(os.path.join(HOME, '.cloudvolume', 'secrets')) 13 | CLOUD_FILES_DIR = mkdir(os.path.join(HOME, '.cloudfiles', 'secrets')) 14 | 15 | AWS_DEFAULT_REGION = 'us-east-1' if 'AWS_DEFAULT_REGION' not in os.environ else os.environ['AWS_DEFAULT_REGION'] 16 | 17 | def secretpath(filepath): 18 | preferred = os.path.join(CLOUD_VOLUME_DIR, 
filepath) 19 | 20 | if os.path.exists(preferred): 21 | return preferred 22 | 23 | backcompat = [ 24 | '/', # original 25 | CLOUD_FILES_DIR, 26 | ] 27 | 28 | backcompat = [ os.path.join(path, filepath) for path in backcompat ] 29 | 30 | for path in backcompat: 31 | if os.path.exists(path): 32 | return path 33 | 34 | return preferred 35 | 36 | def default_google_project_name(): 37 | default_credentials_path = secretpath('google-secret.json') 38 | if os.path.exists(default_credentials_path): 39 | with open(default_credentials_path, 'rt') as f: 40 | return json.loads(f.read())['project_id'] 41 | return None 42 | 43 | PROJECT_NAME = default_google_project_name() 44 | GOOGLE_CREDENTIALS_CACHE = {} 45 | google_credentials_path = secretpath('google-secret.json') 46 | 47 | def google_credentials(bucket = ''): 48 | global PROJECT_NAME 49 | global GOOGLE_CREDENTIALS_CACHE 50 | 51 | if bucket in GOOGLE_CREDENTIALS_CACHE.keys(): 52 | return GOOGLE_CREDENTIALS_CACHE[bucket] 53 | 54 | paths = [ 55 | secretpath('google-secret.json') 56 | ] 57 | 58 | if bucket: 59 | paths = [ secretpath('{}-google-secret.json'.format(bucket)) ] + paths 60 | 61 | google_credentials = None 62 | project_name = PROJECT_NAME 63 | for google_credentials_path in paths: 64 | if os.path.exists(google_credentials_path): 65 | google_credentials = service_account.Credentials \ 66 | .from_service_account_file(google_credentials_path) 67 | 68 | with open(google_credentials_path, 'rt') as f: 69 | project_name = json.loads(f.read())['project_id'] 70 | break 71 | 72 | if google_credentials == None: 73 | print(colorize('yellow', 'Using default Google credentials. There is no ~/.cloudvolume/secrets/google-secret.json set.')) 74 | else: 75 | GOOGLE_CREDENTIALS_CACHE[bucket] = (project_name, google_credentials) 76 | 77 | return project_name, google_credentials 78 | 79 | AWS_CREDENTIALS_CACHE = defaultdict(dict) 80 | aws_credentials_path = secretpath('aws-secret.json') 81 | def aws_credentials(bucket = '', service = 'aws'): 82 | global AWS_CREDENTIALS_CACHE 83 | 84 | if service == 's3': 85 | service = 'aws' 86 | 87 | if bucket in AWS_CREDENTIALS_CACHE.keys(): 88 | return AWS_CREDENTIALS_CACHE[bucket] 89 | 90 | default_file_path = '{}-secret.json'.format(service) 91 | 92 | paths = [ 93 | secretpath(default_file_path) 94 | ] 95 | 96 | if bucket: 97 | paths = [ secretpath('{}-{}-secret.json'.format(bucket, service)) ] + paths 98 | 99 | aws_credentials = {} 100 | aws_credentials_path = secretpath(default_file_path) 101 | for aws_credentials_path in paths: 102 | if os.path.exists(aws_credentials_path): 103 | with open(aws_credentials_path, 'r') as f: 104 | aws_credentials = json.loads(f.read()) 105 | break 106 | 107 | if not aws_credentials: 108 | # did not find any secret json file, will try to find it in environment variables 109 | if 'AWS_ACCESS_KEY_ID' in os.environ and 'AWS_SECRET_ACCESS_KEY' in os.environ: 110 | aws_credentials = { 111 | 'AWS_ACCESS_KEY_ID': os.environ['AWS_ACCESS_KEY_ID'], 112 | 'AWS_SECRET_ACCESS_KEY': os.environ['AWS_SECRET_ACCESS_KEY'], 113 | } 114 | if 'AWS_DEFAULT_REGION' in os.environ: 115 | aws_credentials['AWS_DEFAULT_REGION'] = os.environ['AWS_DEFAULT_REGION'] 116 | 117 | AWS_CREDENTIALS_CACHE[service][bucket] = aws_credentials 118 | return aws_credentials 119 | -------------------------------------------------------------------------------- /taskqueue/aws_queue_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import types 4 | 5 | import boto3 
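# botocore's exception types are used below to tell client-side (HTTP 4xx)
# failures, which are raised as ClientSideError and never retried, apart from
# transient server-side errors that the tenacity-based retry decorator retries.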
6 | import botocore.exceptions 7 | import botocore.errorfactory 8 | 9 | from .lib import toiter, sip, jsonify 10 | from .secrets import aws_credentials 11 | from .secrets import ( 12 | AWS_DEFAULT_REGION 13 | ) 14 | 15 | import tenacity 16 | 17 | AWS_BATCH_SIZE = 10 # send_message_batch's max batch size is 10 18 | 19 | class ClientSideError(Exception): 20 | pass 21 | 22 | retry = tenacity.retry( 23 | reraise=True, 24 | stop=tenacity.stop_after_attempt(4), 25 | wait=tenacity.wait_random_exponential(0.5, 60.0), 26 | retry=tenacity.retry_if_not_exception_type(ClientSideError), 27 | ) 28 | 29 | class AWSTaskQueueAPI(object): 30 | def __init__(self, qurl, region_name=AWS_DEFAULT_REGION, **kwargs): 31 | """ 32 | qurl: either a queue name (e.g. 'pull_queue') or a url 33 | like https://sqs.us-east-1.amazonaws.com/DIGITS/wms-pull-queue 34 | kwargs: Keywords for the underlying boto3.client constructor, other than `service_name`, 35 | `region_name`, `aws_secret_access_key`, or `aws_access_key_id`. 36 | """ 37 | if 'region' in kwargs: 38 | region_name = kwargs.pop('region') 39 | 40 | matches = re.search(r'sqs.([\w\d-]+).amazonaws', qurl) 41 | 42 | if matches is not None: 43 | region_name, = matches.groups() 44 | self.qurl = qurl 45 | else: 46 | self.qurl = None 47 | 48 | credentials = aws_credentials() 49 | 50 | self.sqs = boto3.client('sqs', 51 | aws_secret_access_key=credentials.get('AWS_SECRET_ACCESS_KEY'), 52 | aws_access_key_id=credentials.get('AWS_ACCESS_KEY_ID'), 53 | region_name=region_name, 54 | **kwargs, 55 | ) 56 | 57 | if self.qurl is None: 58 | self.qurl = self._get_qurl(qurl) 59 | 60 | self.batch_size = AWS_BATCH_SIZE 61 | 62 | @retry 63 | def _get_qurl(self, qurl): 64 | try: 65 | return self.sqs.get_queue_url(QueueName=qurl)["QueueUrl"] 66 | except Exception as err: 67 | print(f"Failed to fetch queue URL for: {qurl}") 68 | raise 69 | 70 | @property 71 | def enqueued(self): 72 | status = self.status() 73 | return ( 74 | int(status['ApproximateNumberOfMessages']) 75 | + int(status['ApproximateNumberOfMessagesNotVisible']) 76 | + int(status['ApproximateNumberOfMessagesDelayed']) 77 | ) 78 | 79 | @property 80 | def inserted(self): 81 | return float('NaN') 82 | 83 | @property 84 | def completed(self): 85 | return float('NaN') 86 | 87 | @property 88 | def leased(self): 89 | status = self.status() 90 | return int(status['ApproximateNumberOfMessagesNotVisible']) 91 | 92 | def is_empty(self): 93 | return self.enqueued == 0 94 | 95 | @retry 96 | def status(self): 97 | resp = self.sqs.get_queue_attributes( 98 | QueueUrl=self.qurl, 99 | AttributeNames=[ 100 | 'ApproximateNumberOfMessages', 101 | 'ApproximateNumberOfMessagesNotVisible', 102 | 'ApproximateNumberOfMessagesDelayed' 103 | ] 104 | ) 105 | return resp['Attributes'] 106 | 107 | @retry 108 | def insert(self, tasks, delay_seconds=0): 109 | tasks = toiter(tasks) 110 | 111 | total = 0 112 | # send_message_batch's max batch size is 10 113 | for batch in sip(tasks, self.batch_size): 114 | if len(batch) == 0: 115 | break 116 | 117 | entries = [ { 118 | "Id": str(j), 119 | "MessageBody": jsonify(task), 120 | "DelaySeconds": delay_seconds, 121 | } for j, task in enumerate(batch) 122 | ] 123 | 124 | try: 125 | resp = self.sqs.send_message_batch( 126 | QueueUrl=self.qurl, 127 | Entries=entries, 128 | ) 129 | except botocore.exceptions.ClientError as error: 130 | http_code = error.response['ResponseMetadata']['HTTPStatusCode'] 131 | if 400 <= int(http_code) < 500: 132 | raise ClientSideError(error) 133 | else: 134 | raise error 135 | 136 | total += 
len(entries) 137 | 138 | return total 139 | 140 | def add_insert_count(self, ct): 141 | pass 142 | 143 | def rezero(self): 144 | pass 145 | 146 | @retry 147 | def renew_lease(self, task, seconds): 148 | self.sqs.change_message_visibility( 149 | QueueUrl=self.qurl, 150 | ReceiptHandle=task.id, 151 | VisibilityTimeout=seconds, 152 | ) 153 | 154 | def cancel_lease(self, task): 155 | self.renew_lease(task, 0) 156 | 157 | def release_all(self): 158 | raise NotImplementedError() 159 | 160 | def lease(self, seconds, num_tasks=1, wait_sec=20): 161 | if wait_sec is None: 162 | wait_sec = 20 163 | 164 | resp = self.sqs.receive_message( 165 | QueueUrl=self.qurl, 166 | AttributeNames=[ 167 | 'SentTimestamp' 168 | ], 169 | MaxNumberOfMessages=num_tasks, 170 | MessageAttributeNames=[ 171 | 'All' 172 | ], 173 | VisibilityTimeout=seconds, 174 | WaitTimeSeconds=wait_sec, 175 | ) 176 | 177 | if 'Messages' not in resp: 178 | return [] 179 | 180 | tasks = [] 181 | for msg in resp['Messages']: 182 | task = json.loads(msg['Body']) 183 | task['id'] = msg['ReceiptHandle'] 184 | tasks.append(task) 185 | return tasks 186 | 187 | def delete(self, task): 188 | if type(task) == str: 189 | rhandle = task 190 | else: 191 | try: 192 | rhandle = task._id 193 | except AttributeError: 194 | rhandle = task['id'] 195 | 196 | try: 197 | self.sqs.delete_message( 198 | QueueUrl=self.qurl, 199 | ReceiptHandle=rhandle, 200 | ) 201 | except botocore.exceptions.ClientError as err: 202 | pass 203 | 204 | return 1 205 | 206 | def tally(self): 207 | pass 208 | 209 | def purge(self, native=False): 210 | # can throw: 211 | # except botocore.errorfactory.PurgeQueueInProgress: 212 | if native: 213 | self.sqs.purge_queue(QueueUrl=self.qurl) 214 | return 215 | 216 | while self.enqueued: 217 | # visibility_timeout must be > 0 for delete to work 218 | tasks = self.lease(num_tasks=10, seconds=10) 219 | for task in tasks: 220 | self.delete(task) 221 | 222 | def __iter__(self): 223 | return iter(self.lease(num_tasks=10, seconds=0)) 224 | 225 | 226 | 227 | 228 | 229 | 230 | -------------------------------------------------------------------------------- /taskqueue/goog_pubsub_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import types 4 | 5 | from google.cloud import pubsub_v1 6 | from google.api_core.exceptions import ClientError 7 | from google.pubsub_v1.types import PullRequest 8 | from google.pubsub_v1.types import AcknowledgeRequest 9 | from .secrets import google_credentials 10 | 11 | from .lib import toiter, sip, jsonify 12 | 13 | 14 | import tenacity 15 | 16 | PUBSUB_BATCH_SIZE = 10 # send_message_batch's max batch size is 10 17 | 18 | 19 | class ClientSideError(Exception): 20 | pass 21 | 22 | 23 | retry = tenacity.retry( 24 | reraise=True, 25 | stop=tenacity.stop_after_attempt(4), 26 | wait=tenacity.wait_random_exponential(0.5, 60.0), 27 | retry=tenacity.retry_if_not_exception_type(ClientSideError), 28 | ) 29 | 30 | 31 | class PubSubTaskQueueAPI(object): 32 | def __init__(self, qurl, **kwargs): 33 | """ 34 | qurl: a topic or subscription location 35 | conforms to this format projects/{project_id}/topics/{topic_id}/subscriptions/{subscription_id} 36 | kwargs: Keywords for the underlying boto3.client constructor, other than `service_name`, 37 | `region_name`, `aws_secret_access_key`, or `aws_access_key_id`. 
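    For example (hypothetical project, topic, and subscription ids):

      api = PubSubTaskQueueAPI(
        "projects/my-project/topics/my-topic/subscriptions/my-subscription"
      )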
38 | """ 39 | pattern = r"^projects/(?P[\w\d-]+)/topics/(?P[\w\d-]+)/subscriptions/(?P[\w\d-]+)$" 40 | matches = re.match(pattern, qurl) 41 | if matches is None: 42 | raise ValueError( 43 | "qurl does not conform to the required format (projects/{project_id}/topics/{topic_id}/subscriptions/{subscription_id})" 44 | ) 45 | 46 | matches = re.search(r"projects/([\w\d-]+)/", qurl) 47 | self.project_id = matches.group(1) 48 | 49 | matches = re.search(r"topics/([\w\d-]+)", qurl) 50 | self.topic_id = matches.group(1) 51 | 52 | matches = re.search(r"subscriptions/([\w\d-]+)", qurl) 53 | self.subscription_id = matches.group(1) 54 | 55 | project_name, credentials = google_credentials() 56 | 57 | self.subscriber = pubsub_v1.SubscriberClient(credentials=credentials) 58 | self.publisher = pubsub_v1.PublisherClient(credentials=credentials) 59 | self._topic_path = self.publisher.topic_path(self.project_id, self.topic_id) 60 | self._subscription_path = self.subscriber.subscription_path( 61 | self.project_id, self.subscription_id 62 | ) 63 | 64 | self.batch_size = PUBSUB_BATCH_SIZE 65 | 66 | @property 67 | def enqueued(self): 68 | raise float("Nan") 69 | 70 | @property 71 | def inserted(self): 72 | return float("NaN") 73 | 74 | @property 75 | def completed(self): 76 | return float("NaN") 77 | 78 | @property 79 | def leased(self): 80 | return float("NaN") 81 | 82 | def is_empty(self): 83 | return self.enqueued == 0 84 | 85 | @retry 86 | def insert(self, tasks, delay_seconds=0): 87 | tasks = toiter(tasks) 88 | 89 | def publish_batch(batch): 90 | if not batch: 91 | return 0 92 | 93 | futures = [] 94 | for task in batch: 95 | data = jsonify(task).encode("utf-8") 96 | future = self.publisher.publish(self._topic_path, data) 97 | futures.append(future) 98 | 99 | # Wait for all messages to be published 100 | for future in futures: 101 | try: 102 | # Blocks until the message is published 103 | future.result() 104 | except Exception as e: 105 | raise ClientError(e) 106 | 107 | return len(futures) 108 | 109 | total = 0 110 | 111 | # send_message_batch's max batch size is 10 112 | for batch in sip(tasks, self.batch_size): 113 | if len(batch) == 0: 114 | break 115 | total += publish_batch(batch) 116 | 117 | return total 118 | 119 | def add_insert_count(self, ct): 120 | pass 121 | 122 | def rezero(self): 123 | pass 124 | 125 | @retry 126 | def renew_lease(self, task, seconds): 127 | self.subscriber.modify_ack_deadline( 128 | self._subscription_path, 129 | [task.id], 130 | seconds, 131 | ) 132 | 133 | def cancel_lease(self, task): 134 | self.subscriber.acknowledge(self._subscription_path, [task.id]) 135 | 136 | def release_all(self): 137 | raise NotImplementedError() 138 | 139 | def lease(self, seconds, num_tasks=1, wait_sec=20): 140 | # Pull messages from the subscription 141 | request = PullRequest( 142 | subscription=self._subscription_path, max_messages=num_tasks 143 | ) 144 | response = self.subscriber.pull(request) 145 | 146 | tasks = [] 147 | for received_message in response.received_messages: 148 | # Load the message data as JSON 149 | task = json.loads(received_message.message.data.decode("utf-8")) 150 | # Store the acknowledgement ID in the task 151 | task["id"] = received_message.ack_id 152 | tasks.append(task) 153 | 154 | return tasks 155 | 156 | def delete(self, task): 157 | if isinstance(task, str): 158 | ack_id = task 159 | else: 160 | try: 161 | ack_id = task._id 162 | except AttributeError: 163 | ack_id = task["id"] 164 | request = AcknowledgeRequest( 165 | subscription=self._subscription_path, 
ack_ids=[ack_id] 166 | ) 167 | self.subscriber.acknowledge(request=request) 168 | return 1 169 | 170 | def tally(self): 171 | pass 172 | 173 | def purge(self, native=False): 174 | while True: 175 | # Pull messages from the subscription 176 | response = self.subscriber.pull( 177 | self._subscription_path, max_messages=self.batch_size 178 | ) 179 | 180 | if not response.received_messages: 181 | # No more messages, break the loop 182 | break 183 | 184 | # Acknowledge all received messages 185 | ack_ids = [msg.ack_id for msg in response.received_messages] 186 | request = AcknowledgeRequest( 187 | subscription=self._subscription_path, ack_ids=ack_ids 188 | ) 189 | self.subscriber.acknowledge(request=request) 190 | 191 | def __iter__(self): 192 | return iter(self.lease(num_tasks=10, seconds=0)) 193 | -------------------------------------------------------------------------------- /taskqueue/threaded_queue.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from six.moves import queue as Queue 4 | from six.moves import range 5 | from functools import partial 6 | import threading 7 | import time 8 | 9 | from tqdm import tqdm 10 | 11 | DEFAULT_THREADS = 20 12 | 13 | class ThreadedQueue(object): 14 | """Grant threaded task processing to any derived class.""" 15 | def __init__(self, n_threads, queue_size=0, progress=None): 16 | self._n_threads = n_threads 17 | 18 | self._queue = Queue.Queue(maxsize=queue_size) # 0 = infinite size 19 | self._error_queue = Queue.Queue(maxsize=queue_size) 20 | self._threads = () 21 | self._terminate = threading.Event() 22 | 23 | self._processed_lock = threading.Lock() 24 | self.processed = 0 25 | self._inserted = 0 26 | 27 | self.with_progress = progress 28 | 29 | self.start_threads(n_threads) 30 | 31 | @property 32 | def pending(self): 33 | return self._queue.qsize() 34 | 35 | def put(self, fn): 36 | """ 37 | Enqueue a task function for processing. 38 | 39 | Requires: 40 | fn: a function object that takes one argument 41 | that is the interface associated with each 42 | thread. 43 | 44 | e.g. def download(api): 45 | results.append(api.download()) 46 | 47 | self.put(download) 48 | 49 | Returns: self 50 | """ 51 | self._inserted += 1 52 | self._queue.put(fn, block=True) 53 | return self 54 | 55 | def start_threads(self, n_threads): 56 | """ 57 | Terminate existing threads and create a 58 | new set if the thread number doesn't match 59 | the desired number. 60 | 61 | Required: 62 | n_threads: (int) number of threads to spawn 63 | 64 | Returns: self 65 | """ 66 | if n_threads == len(self._threads): 67 | return self 68 | 69 | # Terminate all previous tasks with the existing 70 | # event object, then create a new one for the next 71 | # generation of threads. The old object will hang 72 | # around in memory until the threads actually terminate 73 | # after another iteration. 
74 | self._terminate.set() 75 | self._terminate = threading.Event() 76 | 77 | threads = [] 78 | 79 | for _ in range(n_threads): 80 | worker = threading.Thread( 81 | target=self._consume_queue, 82 | args=(self._terminate,) 83 | ) 84 | worker.daemon = True 85 | worker.start() 86 | threads.append(worker) 87 | 88 | self._threads = tuple(threads) 89 | return self 90 | 91 | def are_threads_alive(self): 92 | """Returns: boolean indicating if any threads are alive""" 93 | return any(map(lambda t: t.is_alive(), self._threads)) 94 | 95 | def kill_threads(self): 96 | """Kill all threads.""" 97 | self._terminate.set() 98 | while self.are_threads_alive(): 99 | time.sleep(0.001) 100 | self._threads = () 101 | return self 102 | 103 | def _initialize_interface(self): 104 | """ 105 | This is used to initialize the interfaces used in each thread. 106 | You should reimplement it in subclasses. For example, return 107 | an API object, file handle, or network connection. The functions 108 | you pass into the self._queue will get it as the first parameter. 109 | 110 | e.g. an implementation in a subclass. 111 | 112 | def _initialize_interface(self): 113 | return HTTPConnection() 114 | 115 | def other_function(self): 116 | def threaded_file_read(connection): 117 | # do stuff 118 | 119 | self._queue.put(threaded_file_handle) 120 | 121 | Returns: Interface object used in threads 122 | """ 123 | return None 124 | 125 | def _close_interface(self, interface): 126 | """Allows derived classes to clean up after a thread finishes.""" 127 | pass 128 | 129 | def _consume_queue(self, terminate_evt): 130 | """ 131 | This is the main thread function that consumes functions that are 132 | inside the _queue object. To use, execute self._queue(fn), where fn 133 | is a function that performs some kind of network IO or otherwise 134 | benefits from threading and is independent. 135 | 136 | terminate_evt is automatically passed in on thread creation and 137 | is a common event for this generation of threads. The threads 138 | will terminate when the event is set and the queue burns down. 139 | 140 | Returns: void 141 | """ 142 | interface = self._initialize_interface() 143 | 144 | while not terminate_evt.is_set(): 145 | try: 146 | fn = self._queue.get(block=True, timeout=0.01) 147 | except Queue.Empty: 148 | continue # periodically check if the thread is supposed to die 149 | 150 | fn = partial(fn, interface) 151 | 152 | try: 153 | self._consume_queue_execution(fn) 154 | except Exception as err: 155 | self._error_queue.put(err) 156 | 157 | self._close_interface(interface) 158 | 159 | def _consume_queue_execution(self, fn): 160 | """ 161 | The actual task execution in each thread. This 162 | is broken out so that exceptions can be caught 163 | in derived classes and allow them to manipulate 164 | the errant task, e.g. putting it back in the queue 165 | for a retry. 166 | 167 | Every task processed will automatically be marked complete. 168 | 169 | Required: 170 | [0] fn: A curried function that includes the interface 171 | as its first argument. 172 | Returns: void 173 | """ 174 | 175 | # `finally` fires after all success or exceptions 176 | # exceptions are handled in derived classes 177 | # and uncaught ones are caught as a last resort 178 | # in _consume_queue to be raised on the main thread. 
179 | try: 180 | fn() 181 | finally: 182 | with self._processed_lock: 183 | self.processed += 1 184 | self._queue.task_done() 185 | 186 | def _check_errors(self): 187 | try: 188 | err = self._error_queue.get(block=False) 189 | self._error_queue.task_done() 190 | self.kill_threads() 191 | raise err 192 | except Queue.Empty: 193 | pass 194 | 195 | def wait(self, progress=None): 196 | """ 197 | Allow background threads to process until the 198 | task queue is empty. If there are no threads, 199 | in theory the queue should always be empty 200 | as processing happens immediately on the main thread. 201 | 202 | Optional: 203 | progress: (bool or str) show a tqdm progress bar optionally 204 | with a description if a string is provided 205 | 206 | Returns: self (for chaining) 207 | 208 | Raises: The first exception recieved from threads 209 | """ 210 | if not len(self._threads): 211 | return self 212 | 213 | desc = None 214 | if type(progress) is str: 215 | desc = progress 216 | 217 | last = self._inserted 218 | with tqdm(total=self._inserted, disable=(not progress), desc=desc) as pbar: 219 | # Allow queue to consume, but check up on 220 | # progress and errors every tenth of a second 221 | while not self._queue.empty(): 222 | size = self._queue.qsize() 223 | delta = last - size 224 | if delta != 0: # We should crash on negative numbers 225 | pbar.update(delta) 226 | last = size 227 | self._check_errors() 228 | time.sleep(0.1) 229 | 230 | # Wait until all tasks in the queue are 231 | # fully processed. queue.task_done must be 232 | # called for each task. 233 | self._queue.join() 234 | self._check_errors() 235 | 236 | final = self._inserted - last 237 | if final: 238 | pbar.update(final) 239 | 240 | if self._queue.empty(): 241 | self._inserted = 0 242 | 243 | return self 244 | 245 | def __del__(self): 246 | self.wait() # if no threads were set the queue is always empty 247 | self.kill_threads() 248 | 249 | def __enter__(self): 250 | if self.__class__ is ThreadedQueue and self._n_threads == 0: 251 | raise ValueError("Using 0 threads in base class ThreadedQueue with statement will never exit.") 252 | 253 | self.start_threads(self._n_threads) 254 | return self 255 | 256 | def __exit__(self, exception_type, exception_value, traceback): 257 | self.wait(progress=self.with_progress) 258 | self.kill_threads() 259 | -------------------------------------------------------------------------------- /test/test_taskqueue.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import json 3 | import os 4 | import time 5 | 6 | from moto import mock_aws 7 | 8 | from six.moves import range 9 | import pytest 10 | 11 | import taskqueue 12 | from taskqueue import ( 13 | queueable, FunctionTask, RegisteredTask, 14 | TaskQueue, MockTask, PrintTask, LocalTaskQueue, 15 | QueueEmptyError 16 | ) 17 | from taskqueue.paths import ExtractedPath, mkpath 18 | from taskqueue.queueables import totask 19 | from taskqueue.queueablefns import tofunc, UnregisteredFunctionError, func2task 20 | 21 | @pytest.fixture(scope='function') 22 | def aws_credentials(): 23 | """Mocked AWS Credentials for moto.""" 24 | os.environ['AWS_ACCESS_KEY_ID'] = 'testing' 25 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'testing' 26 | os.environ['AWS_SECURITY_TOKEN'] = 'testing' 27 | os.environ['AWS_SESSION_TOKEN'] = 'testing' 28 | os.environ['AWS_DEFAULT_REGION'] = 'us-east-1' 29 | 30 | @pytest.fixture(scope='function') 31 | def sqs(aws_credentials): 32 | with mock_aws(): 33 | import boto3 34 | 
client = boto3.client('sqs') 35 | client.create_queue(QueueName='test-pull-queue') 36 | yield client 37 | 38 | QURLS = { 39 | 'sqs': 'test-pull-queue', 40 | 'fq': '/tmp/removeme/taskqueue/fq', 41 | } 42 | 43 | PROTOCOL = ('fq', 'sqs') 44 | 45 | def getpath(protocol): 46 | global QURLS 47 | qurl = QURLS[protocol] 48 | return mkpath(ExtractedPath(protocol, qurl)) 49 | 50 | class ExecutePrintTask(RegisteredTask): 51 | def __init__(self): 52 | super(ExecutePrintTask, self).__init__() 53 | 54 | def execute(self, wow, wow2): 55 | print(wow + wow2) 56 | return wow + wow2 57 | 58 | @queueable 59 | def printfn(txt): 60 | print(txt) 61 | return 1337 62 | 63 | @queueable 64 | def sumfn(a,b): 65 | return a + b 66 | 67 | def test_task_creation_fns(): 68 | task = partial(printfn, "hello world") 69 | task = totask(task) 70 | 71 | assert task.key == ("test_taskqueue", "printfn") 72 | assert task.args == ["hello world"] 73 | assert task.kwargs == {} 74 | assert task.id == -1 75 | 76 | fn = tofunc(task) 77 | assert fn() == 1337 78 | 79 | fn = partial(partial(sumfn, 1), 2) 80 | assert func2task(fn, -1)() == 3 81 | 82 | fn = partial(partial(sumfn, 1), b=2) 83 | assert func2task(fn, -1)() == 3 84 | 85 | task = partial(printfn, not_a_real_arg="hello world") 86 | try: 87 | task = totask(task) 88 | assert False 89 | except TypeError: 90 | pass 91 | 92 | try: 93 | task = totask(printfn) # not enough args 94 | assert False 95 | except TypeError: 96 | pass 97 | 98 | task = partial(printfn, "hello world", "omg") 99 | try: 100 | task = totask(task) 101 | assert False 102 | except TypeError: 103 | pass 104 | 105 | try: 106 | FunctionTask(("fake", "fake"), [], {}, None)() 107 | assert False, "Should not have been able to call this function." 108 | except UnregisteredFunctionError: 109 | pass 110 | 111 | def test_task_creation_classes(): 112 | task = MockTask(this="is", a=[1, 4, 2], simple={"test": "to", "check": 4}, 113 | serialization=('i', 's', 's', 'u', 'e', 's'), wow=4, with_kwargs=None) 114 | task.wow = 5 115 | payload = task.payload() 116 | payload = json.dumps(payload) 117 | task_deserialized = MockTask.deserialize(payload) 118 | 119 | assert task_deserialized._args == { 120 | "this": "is", "a": [1, 4, 2], 121 | "simple": {"test": "to", "check": 4}, 122 | "serialization": ["i", "s", "s", "u", "e", "s"], 123 | "wow": 5, 124 | "with_kwargs": None 125 | } 126 | 127 | def test_queue_transfer(sqs): 128 | tqsqs = TaskQueue(getpath("sqs")) 129 | tqsqs.purge() 130 | tqfq = TaskQueue(getpath("fq")) 131 | tqfq.purge() 132 | 133 | assert tqsqs.enqueued == 0 134 | 135 | tqfq.insert(( PrintTask() for _ in range(10) )) 136 | tqsqs.insert(tqfq) 137 | 138 | assert tqsqs.enqueued == 10 139 | task = tqsqs.lease() 140 | assert isinstance(task, PrintTask) 141 | 142 | try: 143 | tqfq.insert(tqsqs) 144 | assert False 145 | except taskqueue.UnsupportedProtocolError: 146 | pass 147 | 148 | @pytest.mark.parametrize('protocol', PROTOCOL) 149 | def test_get(sqs, protocol): 150 | path = getpath(protocol) 151 | tq = TaskQueue(path, n_threads=0) 152 | 153 | n_inserts = 5 154 | tq.purge() 155 | tq.insert(( PrintTask() for _ in range(n_inserts) )) 156 | 157 | for i in range(n_inserts): 158 | t = tq.lease() 159 | tq.delete(t) 160 | 161 | def test_lease(sqs): 162 | path = getpath("sqs") 163 | tq = TaskQueue(path, n_threads=0) 164 | 165 | n_inserts = 20 166 | tq.purge() 167 | tq.insert(( PrintTask(str(x)) for x in range(n_inserts) )) 168 | 169 | tasks = tq.lease(num_tasks=10, wait_sec=0) 170 | assert len(tasks) == 10 171 | tq.delete(tasks) 172 | 
173 | tasks = tq.lease(num_tasks=10, wait_sec=0) 174 | assert len(tasks) == 10 175 | tq.delete(tasks) 176 | 177 | try: 178 | tasks = tq.lease(num_tasks=10, wait_sec=0) 179 | assert False 180 | except QueueEmptyError: 181 | pass 182 | 183 | @pytest.mark.parametrize('protocol', PROTOCOL) 184 | def test_single_threaded_insertion(sqs, protocol): 185 | path = getpath(protocol) 186 | tq = TaskQueue(path, n_threads=0) 187 | 188 | tq.purge() 189 | 190 | n_inserts = 5 191 | tq.insert(( PrintTask() for i in range(n_inserts) )) 192 | 193 | assert all(map(lambda x: type(x) == PrintTask, tq.list())) 194 | 195 | tq.purge() 196 | 197 | @pytest.mark.parametrize('protocol', PROTOCOL) 198 | def test_single_threaded_insertion_fns(sqs, protocol): 199 | path = getpath(protocol) 200 | tq = TaskQueue(path, n_threads=0) 201 | 202 | tq.purge() 203 | 204 | n_inserts = 5 205 | tq.insert(( partial(printfn, "hello world " + str(i)) for i in range(n_inserts) )) 206 | 207 | assert all(map(lambda x: isinstance(x, FunctionTask), tq.list())) 208 | 209 | tq.purge() 210 | 211 | @pytest.mark.parametrize('protocol', PROTOCOL) 212 | @pytest.mark.parametrize('green', (True, False)) 213 | @pytest.mark.parametrize('threads', (1, 2, 10, 20, 40)) 214 | def test_multi_threaded_insertion(sqs, protocol, green, threads): 215 | path = getpath(protocol) 216 | 217 | tq = TaskQueue(path, n_threads=threads, green=green) 218 | 219 | n_inserts = 40 220 | tq.purge() 221 | ct = tq.insert(( PrintTask() for i in range(n_inserts))) 222 | tq.purge() 223 | 224 | assert ct == n_inserts 225 | 226 | # def test_multiprocess_upload(): 227 | # global QURL 228 | 229 | # with TaskQueue(QURL) as tq: 230 | # tq.purge() 231 | 232 | # time.sleep(1) 233 | 234 | # num_tasks = 1000 235 | # tasks = [ PrintTask(i) for i in range(num_tasks) ] 236 | 237 | # taskqueue.upload(QURL, tasks, parallel=4) 238 | 239 | # time.sleep(1) 240 | # try: 241 | # assert tq.enqueued == num_tasks 242 | # finally: 243 | # with TaskQueue(QURL) as tq: 244 | # tq.purge() 245 | 246 | 247 | @pytest.mark.parametrize('protocol', PROTOCOL) 248 | def test_400_errors(sqs, protocol): 249 | path = getpath(protocol) 250 | 251 | tq = TaskQueue(path, n_threads=0) 252 | tq.delete('nonexistent') 253 | 254 | def test_local_taskqueue(): 255 | tq = LocalTaskQueue(parallel=True, progress=False) 256 | tasks = ( MockTask(arg=i) for i in range(20000) ) 257 | assert tq.insert(tasks) == 20000 258 | 259 | tq = LocalTaskQueue(parallel=1, progress=False) 260 | tasks = ( (ExecutePrintTask(), [i], { 'wow2': 4 }) for i in range(200) ) 261 | assert tq.insert(tasks) == 200 262 | 263 | tq = LocalTaskQueue(parallel=True, progress=False) 264 | tasks = ( (ExecutePrintTask(), [i], { 'wow2': 4 }) for i in range(200) ) 265 | assert tq.insert(tasks) == 200 266 | 267 | tq = LocalTaskQueue(parallel=True, progress=False) 268 | epts = [ PrintTask(i) for i in range(200) ] 269 | assert tq.insert(epts) == 200 270 | 271 | @pytest.mark.parametrize('protocol', PROTOCOL) 272 | def test_parallel_insert_all(sqs, protocol): 273 | import pathos_issue 274 | 275 | path = getpath(protocol) 276 | tq = TaskQueue(path, green=True) 277 | tq.purge() 278 | 279 | if protocol == 'fq': 280 | tq.rezero() 281 | 282 | tasks = pathos_issue.crt_tasks(5, 20) 283 | amt = tq.insert(tasks, parallel=2) 284 | 285 | assert amt == 15 286 | if protocol == 'fq': 287 | assert tq.inserted == 15 288 | 289 | tq.purge() 290 | 291 | def test_polling(sqs): 292 | N = 100 293 | tasks = [ PrintTask(i) for i in range(N) ] 294 | tq = TaskQueue(getpath('fq'), green=False) 295 | 
tq.purge() 296 | tq.insert(tasks) 297 | 298 | tq.poll( 299 | lease_seconds=1, 300 | verbose=False, 301 | tally=True, 302 | stop_fn=(lambda executed: executed >= 5) 303 | ) 304 | 305 | tq.purge() 306 | tq.insert(tasks) 307 | 308 | tq.poll( 309 | lease_seconds=1, 310 | verbose=False, 311 | tally=True, 312 | stop_fn=(lambda elapsed_time: elapsed_time >= 1) 313 | ) 314 | 315 | 316 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | CHANGES 2 | ======= 3 | 4 | 2.13.0 5 | ------ 6 | 7 | * release(2.13.0): adds --load to cp, adds renew\_lease to sqs 8 | * chore: update ChangeLog 9 | * feat: add ability to load definitions in cp 10 | * feat: implement renew\_lease for sqs 11 | 12 | 2.12.1 13 | ------ 14 | 15 | * release(2.12.1): fixes is\_empty for sqs 16 | * fix: incorrect method declaration for sqs is\_empty 17 | * fix(sqs): add ApproximateNumberOfMessagesDelayed to status and enqueued 18 | * docs: document the JSON int->string issue 19 | 20 | 2.12.0 21 | ------ 22 | 23 | * release(2.12.0): adds retry to SQS except for http 4xx errors 24 | * feat: add retry to SQS (#39) 25 | * docs: mention that tasks need to be imported 26 | 27 | 2.11.0 28 | ------ 29 | 30 | * release(2.11.0): add ptq purge and support ptq status for SQS 31 | * feat: add purge command to cli, add status support to SQS 32 | * docs: update ptq on README 33 | 34 | 2.10.0 35 | ------ 36 | 37 | * release(2.10.0): adds mv command to ptq 38 | * feat(cli): implement mv command 39 | * feat: better error message for cp 40 | * feat(cli): support mv command 41 | * chore: update ChangeLog 42 | * docs: mention that filequeue doesn't need a server 43 | 44 | 2.9.0 45 | ----- 46 | 47 | * release(2.9.0): adds cp command to ptq 48 | * refactor: rename copy to cp 49 | * docs: describe restrictions on copying sqs 50 | * chore: update ChangeLog 51 | * feat(cli): add copy to ptq 52 | 53 | 2.8.7 54 | ----- 55 | 56 | * release(2.8.7): more robust is\_empty for FileQueue 57 | * fix(fq): iter dies if file deleted between scan and read 58 | * fix: no need for is\_empty to actually read the files 59 | 60 | 2.8.6 61 | ----- 62 | 63 | * release(2.8.6): more robust test for queueable function arguments 64 | * fix(queuable): more robust check for correct arguments 65 | * fix: green threads throw exceptions properly (#34) 66 | * chore: update changelog 67 | 68 | 2.8.4 69 | ----- 70 | 71 | * chore: update ChangeLog 72 | * test: add tests for is\_empty 73 | * fix: missing import of "first" from lib (#33) 74 | * docs: update README.md 75 | 76 | 2.8.3 77 | ----- 78 | 79 | * release(2.8.3): upload numpy objects to SQS + fixes segfault on Mac 80 | * fix(gha): install requirements\_dev.txt 81 | * chore: remove travis CI 82 | * Create python-package.yml 83 | * fix: progress bar overestimate (#32) 84 | * fix: accept numpy arrays for JSON conversion on SQS 85 | 86 | 2.8.2 87 | ----- 88 | 89 | * release(2.8.2): smoother parallel upload progress bar + fixes 90 | * feat: smoother parallel upload (#31) 91 | 92 | 2.8.1 93 | ----- 94 | 95 | * release(2.8.1): fix for sqs 96 | * fix: igneous sets tally to True by default which crashes sqs 97 | 98 | 2.8.0 99 | ----- 100 | 101 | * release(2.8.0): ctrl-c twice to immediately exit poll 102 | * feat: ctrl-c twice to immediately exit poll 103 | * docs: fix spelling error 104 | 105 | 2.7.0 106 | ----- 107 | 108 | * release(2.7.0): Queueable functions. 
Fix for multiprocess insert counter 109 | * feat: return number inserted from insert 110 | * fix: resolve race condition in multiprocess insert counter upload 111 | * chore: remove whitespace 112 | * fix: multiprocess insert dying 113 | * docs: add docstring to @queueable 114 | * feat: queueable functions (#30) 115 | 116 | 2.6.0 117 | ----- 118 | 119 | * release(2.6.0): adds number leased to ptq status 120 | * feat(cli): add number of leased tasks to status for filequeue 121 | 122 | 2.5.1 123 | ----- 124 | 125 | * release(2.5.1): fix packaging for CLI 126 | * fix: make sure cli is packaged 127 | * docs: describe new ptq tool 128 | 129 | 2.5.0 130 | ----- 131 | 132 | * release(2.5.0): ptq CLI tool 133 | * refactor: remove commented out code 134 | * feat(cli): add ptq command 135 | 136 | 2.4.0 137 | ----- 138 | 139 | * release(2.4.0): adds elapsed\_time DI argument to stop\_fn 140 | * docs: describe elapsed\_time parameter 141 | * feat: add "elapsed\_time" DI argument to stop\_fn 142 | 143 | 2.3.0 144 | ----- 145 | 146 | * release(2.3.0): queue transfer capability 147 | * feat: allow inserting a FileTaskQueue into an AWS TaskQueue 148 | * feat: add tq.tasks() method 149 | * fix(taskqueue): rare error where incr is referenced before assignment 150 | 151 | 2.2.0 152 | ----- 153 | 154 | * release(2.2.0): updates verbose messages in tq.poll 155 | * feat(tq.poll): add pre-execution message to verbose mode (#27) 156 | * chore: udpate authors and changelog 157 | 158 | 2.1.0 159 | ----- 160 | 161 | * release(2.1.0): adds support for additional SQS client arguments 162 | * Allow taskqueue instantiation with any boto3 client keywords (#26) 163 | * docs: show how to use delete and purge 164 | 165 | 2.0.0 166 | ----- 167 | 168 | * release(2.0.0): major refactor + FileQueue 169 | * docs: more documentation for FileQueue 170 | * chore: update Trove classifiers to exclude py27 and include py38 171 | * BREAKING: drop python 2.7 support (#25) 172 | * test: fix travis 173 | * fix: support py27 with scandir pkg 174 | * feat: FileQueue + big redesign of TaskQueue (#24) 175 | 176 | 1.0.0 177 | ----- 178 | 179 | * release(1.0.0): add before\_fn and after\_fn to poll 180 | * redesign(BREAKING): replace useless log\_fn with before\_fn and after\_fn (#21) 181 | 182 | 0.15.0 183 | ------ 184 | 185 | * release: 0.15.0 186 | * feat: support progress bars for MockTaskQueue 187 | * fix: accomodate change to Markdown detection on pypi 188 | 189 | 0.14.5 190 | ------ 191 | 192 | * release(0.14.5): fix multiprocessing progress bar 193 | * fix: parallel progress bar fills smoothly (#19) 194 | 195 | 0.14.4 196 | ------ 197 | 198 | * release: 0.14.4 199 | * feat(poll): show elapsed time in success message (#18) 200 | * fix: use 20 sec wait time to avoid polling 0s on tasks for AWS 201 | 202 | 0.14.3 203 | ------ 204 | 205 | * chore: bump version to 0.14.3 206 | * fix: LocalTaskQueue.insert\_all behavior matches MockTaskQueue 207 | 208 | 0.14.2 209 | ------ 210 | 211 | * chore: version 0.14.2 212 | * fix: support dynamic classes in multiprocessing (#16) 213 | 214 | 0.14.1 215 | ------ 216 | 217 | * chore: bump version to 0.14.1 218 | * fix: export QueueEmpty 219 | 220 | 0.14.0 221 | ------ 222 | 223 | * chore: bump version to 0.14.0 224 | * feat: automatic multiprocessing (#15) 225 | * fix: incorrect inheritance in QueueEmpty 226 | * fix: issues when using LocalTaskQueue with RegisteredTask objects 227 | 228 | 0.13.0 229 | ------ 230 | 231 | * chore: bump version to 0.13.0 232 | * feat: Batched Uploads and Green Threads (#14) 
233 | 234 | 0.12.3 235 | ------ 236 | 237 | * fix: ensure 'serialize' objects are returned serialized 238 | 239 | 0.12.2 240 | ------ 241 | 242 | * fix: unable to deserialize tasks in LocalTaskQueue b/c missing payload key 243 | 244 | 0.12.1 245 | ------ 246 | 247 | * chore: version 0.12.1 248 | 249 | 0.12.0 250 | ------ 251 | 252 | * chore: version 0.12.0 253 | * feat: multiprocess uploads 254 | * feat: pass arguments to execute for LocalTaskQueue and MockTaskQueue 255 | * feat: handle serialization in more data types in RegisteredTask 256 | 257 | 0.11.1 258 | ------ 259 | 260 | * chore: update version to 0.11.1 261 | * fix: modifications of class attributes would not affect payload 262 | * chore: update changelog 263 | 264 | 0.11.0 265 | ------ 266 | 267 | * chore: version 0.11.0 268 | * feat: block\_until\_empty() which polls every 2 seconds by default 269 | * docs: docstring for poll function 270 | 271 | 0.10.0 272 | ------ 273 | 274 | * feat: automatic serialization for objects with 'serialize' 275 | 276 | 0.9.1 277 | ----- 278 | 279 | * chore: fix \_\_version\_\_ specification 280 | 281 | 0.9.0 282 | ----- 283 | 284 | * feat: add \_\_version\_\_ number 285 | * refactor: remove old google code to remove a dependency 286 | * feat: add "poll" method based on igneous's task\_execution model 287 | * fix: PrintTask initialize arguments 288 | 289 | 0.8.0 290 | ----- 291 | 292 | * fix: actual backwards compatibility with python2 293 | * docs: pip installation instructions were pointing to wrong package 294 | * docs: add PyPI badge 295 | * fix: generalize string types for python2 and 3 296 | * refactor: drop appengine support, base64encoding 297 | * docs: add Travis CI badge to README 298 | * fix: enqueued measures both visible and non-visible messages on aws 299 | * refactor: upgrade to inspect.getfullargspec 300 | * refactor: move queue\_name and queue\_server to first arguments 301 | * feat: allow specification of SQS queues via queue name only 302 | * chore: change tab width to 2 303 | 304 | 0.7.0 305 | ----- 306 | 307 | * test: switching to new credentials 308 | * fix: add back google-api-python-client 309 | * docs: more info on CloudVolume dependency 310 | * docs: added info on CloudVolume dependency 311 | * test: install numpy first for CloudVolume compatibility 312 | * docs: more explicit about how to set up secrets 313 | * docs: made PrintTask example more explicit 314 | * docs: update readme to be more general 315 | * feat: remove broken Google support 316 | 317 | 0.6.0 318 | ----- 319 | 320 | * fix: test\_task\_creation independent of dict order (#10) 321 | * fix: serialization of kwargs (#7) 322 | * docs: how to use in README.md 323 | 324 | 0.5.0 325 | ----- 326 | 327 | * feat: Multiprocess LocalTaskQueue (#6) 328 | * chore: bump requirements 329 | 330 | 0.4.1 331 | ----- 332 | 333 | * fix: updated ptq to use updated cloud-volume semantics 334 | * Update README.md 335 | 336 | 0.4.0 337 | ----- 338 | 339 | * feat: specify lease seconds through LEASE\_SECONDS environemnt var 340 | 341 | 0.3.2 342 | ----- 343 | 344 | * fix: error in printing out bad queue urls 345 | 346 | 0.3.1 347 | ----- 348 | 349 | * fix: forgot to include numpy in RegisteredTask 350 | 351 | 0.3.0 352 | ----- 353 | 354 | * refactor: simplified AppEngineTaskQueueAPI (#4) 355 | 356 | 0.2.1 357 | ----- 358 | 359 | * fix: python3 compatibility with python2 strings 360 | 361 | 0.2.0 362 | ----- 363 | 364 | * feat: appenging queue api now supports tasks/id/:tid 365 | * fix: unassigned variable 366 | * fix: make encoding 
python3 friendly 367 | * feat: switched appengine implementation to requests 368 | 369 | 0.1.7 370 | ----- 371 | 372 | * feat: reviving the appengine taskqueue 373 | 374 | 0.1.6 375 | ----- 376 | 377 | * chore: bumped cloud-volume version 378 | 379 | 0.1.5 380 | ----- 381 | 382 | * feat: python3 compatibility (#3) 383 | 384 | 0.1.4 385 | ----- 386 | 387 | * Wms cloud secrets (#2) 388 | 389 | 0.1.3 390 | ----- 391 | 392 | * fix: secretpath was broken for legacy paths 393 | 394 | 0.1.2 395 | ----- 396 | 397 | * fix: need to strip project\_name in case there's extra whitespace 398 | * feat: backwards compatible secrets 399 | * fix: add queue\_server to MockTaskQueue's constructor 400 | 401 | 0.1.1 402 | ----- 403 | 404 | * fix: restore default project\_name 405 | 406 | 0.1.0 407 | ----- 408 | 409 | * feat: Initial commit 410 | * Initial commit 411 | -------------------------------------------------------------------------------- /taskqueue/file_queue_api.py: -------------------------------------------------------------------------------- 1 | try: 2 | import fcntl 3 | FILEQUEUE_SUPPORTED = True 4 | except ImportError: 5 | FILEQUEUE_SUPPORTED = False 6 | 7 | import functools 8 | import itertools 9 | import json 10 | import math 11 | import operator 12 | import os.path 13 | import random 14 | import re 15 | import shutil 16 | import uuid 17 | import time 18 | 19 | import tenacity 20 | 21 | from .lib import mkdir, jsonify, toiter, STRING_TYPES, sip, toabs, first 22 | 23 | retry = tenacity.retry( 24 | reraise=True, 25 | stop=tenacity.stop_after_attempt(4), 26 | wait=tenacity.wait_random_exponential(0.5, 60.0), 27 | ) 28 | 29 | @retry 30 | def read_file(path, mode='rt', lock=False, block=False): 31 | f = open(path, mode) 32 | if lock: 33 | f = read_lock_file(f) 34 | data = f.read() 35 | f.close() 36 | return data 37 | 38 | @retry 39 | def write_file( 40 | path, file, mode='wt', 41 | fsync=False, lock=False, 42 | block=False 43 | ): 44 | f = open(path, mode) 45 | if lock: 46 | f = write_lock_file(f, block=block) 47 | f.write(file) 48 | if fsync: 49 | f.flush() # from application buffers -> OS buffers 50 | os.fsync(f.fileno()) # OS buffers -> disk 51 | f.close() 52 | 53 | # @retry 54 | # def touch_file(path): 55 | # open(path, 'a').close() 56 | 57 | @retry 58 | def move_file(src_path, dest_path): 59 | os.rename(src_path, dest_path) 60 | 61 | @retry 62 | def write_lock_file(fd, block=False): 63 | """ 64 | Locks are bound to processes. A terminated process unlocks. 65 | Non-blocking, raises OSError if unable to obtain a lock. 66 | 67 | Note that any closing of a file descriptor for the locked file 68 | will release locks for all fds. This means you must open the file 69 | and reuse that FD from start to finish. 70 | """ 71 | 72 | # https://docs.python.org/3/library/fcntl.html 73 | # "On at least some systems, LOCK_EX can only be used if the file 74 | # descriptor refers to a file opened for writing." 75 | # Locks: LOCK_EX (exclusive), LOCK_SH (shared), LOCK_NB (non-blocking) 76 | mode = fcntl.LOCK_EX 77 | if not block: 78 | mode |= fcntl.LOCK_NB 79 | 80 | fcntl.lockf(fd.fileno(), mode) 81 | return fd 82 | 83 | @retry 84 | def read_lock_file(fd, block=False): 85 | """ 86 | Locks are bound to processes. A terminated process unlocks. 87 | Non-blocking, raises OSError if unable to obtain a lock. 88 | 89 | Note that any closing of a file descriptor for the locked file 90 | will release locks for all fds. This means you must open the file 91 | and reuse that FD from start to finish. 
92 | """ 93 | 94 | # https://docs.python.org/3/library/fcntl.html 95 | # "On at least some systems, LOCK_EX can only be used if the file 96 | # descriptor refers to a file opened for writing." 97 | # Locks: LOCK_EX (exclusive), LOCK_SH (shared), LOCK_NB (non-blocking) 98 | mode = fcntl.LOCK_SH 99 | if not block: 100 | mode |= fcntl.LOCK_NB 101 | 102 | fcntl.lockf(fd.fileno(), mode) 103 | return fd 104 | 105 | def unlock_file(fd): 106 | fcntl.lockf(fd.fileno(), fcntl.LOCK_UN) 107 | return fd 108 | 109 | def idfn(task): 110 | if isinstance(task, STRING_TYPES): 111 | ident = task 112 | else: 113 | try: 114 | ident = task.id 115 | except AttributeError: 116 | ident = task['id'] 117 | 118 | if "--" in ident: 119 | ident = ident.split("--")[1] 120 | return os.path.splitext(ident)[0] # removes .json if present 121 | 122 | def get_timestamp(filename): 123 | filename = os.path.basename(filename) 124 | return int(filename.split("--")[0]) 125 | 126 | def set_timestamp(filename, timestamp): 127 | old_timestamp, rest = filename.split('--') 128 | return "{}--{}".format(timestamp, rest) 129 | 130 | def nowfn(): 131 | return int(time.time()) 132 | 133 | class FileQueueAPI(object): 134 | """ 135 | University clusters and supercomputers often cannot access SQS easily 136 | but have access to a common file system. It would be a pain to have t 137 | o set up a RabbitMQ instance or similar process on each cluster we 138 | get access to, so it would be ideal to have a queue system that just 139 | runs off the filesystem. 140 | 141 | We need the following properties from our queue: 142 | 143 | Serverless 144 | Durable - No losses from power outages and process crashes. 145 | Supports Verbs - queue create, queue delete, task create, 146 | time limited task lease, task delete, task lease extend, 147 | and reset tasks leases. 148 | Parallel Safe 149 | Recirculating Tasks - If a process fails, eventually the 150 | task will be picked up by another one. 151 | Supports millions of tasks. 152 | Can be operated by a pipeline technician without help 153 | (or need onerous approvals) from a cluster administrator. 154 | 155 | File Queues in principle fulfill the first two properties as the 156 | server is the filesystem and files do not disappear on power loss 157 | or process crash. On journaling filesystems, the files do not even 158 | become corrupted on power loss in the middle of writing. Filesystems 159 | support millions of files in a single directory, but certain operations 160 | like listing become unusable. Properties 3 through 6 will require 161 | careful design. We anticipate that these queues can be run from 162 | userland and require no special approvals to be used unless the queues 163 | are very large, in which case the entire job will likely need special 164 | approval anyway. 165 | 166 | With respect to the verbs specified, all should be familiar from SQS 167 | with one exception: reset task leases is new and is extremely useful 168 | for resetting a job that has partially run but crashed when the lease 169 | time is very long. 170 | """ 171 | def __init__(self, path): 172 | if not FILEQUEUE_SUPPORTED: 173 | raise NotImplementedError( 174 | "FileQueueAPI is only supported on unix based systems due to a " 175 | "dependency on fcntl interprocess locking." 
176 | ) 177 | 178 | path = toabs(path) 179 | self.path = path 180 | 181 | self.movement_path = mkdir(os.path.join(path, 'movement')) 182 | self.queue_path = mkdir(os.path.join(path, 'queue')) 183 | self.completions_path = os.path.join(path, 'completions') 184 | self.insertions_path = os.path.join(path, 'insertions') 185 | self.batch_size = 10 186 | 187 | @property 188 | def enqueued(self): 189 | return len(self) 190 | 191 | @property 192 | def inserted(self): 193 | try: 194 | return int(read_file(self.insertions_path)) 195 | except FileNotFoundError: 196 | return 0 197 | 198 | @property 199 | def completed(self): 200 | try: 201 | return os.path.getsize(self.completions_path) 202 | except FileNotFoundError: 203 | return 0 204 | 205 | @property 206 | def leased(self): 207 | now = nowfn() 208 | ct = 0 209 | for file in os.scandir(self.queue_path): 210 | ct += int(get_timestamp(file.name) > now) 211 | return ct 212 | 213 | @retry 214 | def insert(self, tasks, delay_seconds=0): 215 | tasks = toiter(tasks) 216 | 217 | timestamp = 0 # immediately available, never assigned 218 | if delay_seconds > 0: 219 | timestamp = nowfn() + delay_seconds # unix timestamp 220 | 221 | total = 0 222 | for task in tasks: 223 | identifier = str(uuid.uuid4()) 224 | filename = "{}--{}.json".format(timestamp, identifier) 225 | task['id'] = identifier 226 | write_file( 227 | os.path.join(self.queue_path, filename), 228 | jsonify(task) 229 | ) 230 | write_file( 231 | os.path.join(self.movement_path, identifier), 232 | filename + "\n" 233 | ) 234 | total += 1 235 | 236 | return total 237 | 238 | def add_insert_count(self, ct): 239 | try: 240 | N = read_file(self.insertions_path) # potential multiprocess race condition 241 | N = int(N) if N != '' else 0 242 | except FileNotFoundError: 243 | N = 0 244 | 245 | N += int(ct) 246 | write_file(self.insertions_path, str(N), fsync=True, lock=True, block=True) 247 | return N 248 | 249 | @retry 250 | def rezero(self): 251 | # no sense acquiring a lock for completions since other writers aren't 252 | write_file(self.completions_path, b'', mode='bw+', fsync=True) 253 | write_file(self.insertions_path, '0', mode='tw+', fsync=True, lock=True, block=True) 254 | 255 | @retry 256 | def renew_lease(self, task, seconds): 257 | ident = idfn(task) 258 | movement_path = os.path.join(self.movement_path, ident) 259 | 260 | fd = read_lock_file(open(movement_path, 'rt')) 261 | contents = fd.read() 262 | fd.close() 263 | 264 | fd = write_lock_file(open(movement_path, 'wt')) 265 | 266 | for filename in reversed(contents.split('\n')): 267 | if filename == '': 268 | continue 269 | 270 | old_path = os.path.join(self.queue_path, filename) 271 | new_filename = set_timestamp(filename, nowfn() + int(seconds)) 272 | new_path = os.path.join(self.queue_path, new_filename) 273 | try: 274 | move_file(old_path, new_path) 275 | except FileNotFoundError: 276 | continue 277 | 278 | try: 279 | fd.write(contents + new_filename + '\n') 280 | except: 281 | move_file(new_path, old_path) 282 | fd.close() 283 | raise 284 | 285 | break 286 | 287 | fd.close() # releases POSIX lock 288 | 289 | def cancel_lease(self, task): 290 | self.renew_lease(task, 0) 291 | 292 | def release_all(self): 293 | """Voids all leases and sets all tasks to available.""" 294 | now = nowfn() 295 | for file in os.scandir(self.queue_path): 296 | if get_timestamp(file.name) < now: 297 | continue 298 | 299 | new_filename = set_timestamp(file.name, now) 300 | move_file( 301 | os.path.join(self.queue_path, file.name), 302 | os.path.join(self.queue_path, 
new_filename) 303 | ) 304 | 305 | movement_path = os.path.join(self.movement_path, idfn(new_filename)) 306 | fd = write_lock_file(open(movement_path, 'at')) 307 | fd.write(new_filename + "\n") 308 | fd.close() 309 | 310 | @retry 311 | def _lease_filename(self, filename, seconds): 312 | new_filename = set_timestamp(filename, nowfn() + int(seconds)) 313 | new_filepath = os.path.join(self.queue_path, new_filename) 314 | movements_filename = idfn(new_filename) 315 | movements_path = os.path.join(self.movement_path, movements_filename) 316 | 317 | fd = write_lock_file(open(movements_path, 'at')) 318 | 319 | try: 320 | move_file( 321 | os.path.join(self.queue_path, filename), 322 | new_filepath 323 | ) 324 | except FileNotFoundError: 325 | fd.close() 326 | return None 327 | 328 | fd.write(os.path.basename(str(new_filepath)) + '\n') 329 | 330 | fd.flush() 331 | fd.close() # unlocks POSIX advisory file lock 332 | 333 | return json.loads(read_file(new_filepath)) 334 | 335 | def lease(self, seconds, num_tasks, wait_sec=None): 336 | if wait_sec is None: 337 | wait_sec = 0 338 | 339 | def fmt(direntry): 340 | filename = direntry.name 341 | timestamp, _ = filename.split('--') 342 | return (int(timestamp), filename) 343 | 344 | now = nowfn() 345 | files = ( fmt(direntry) for direntry in os.scandir(self.queue_path) ) 346 | 347 | leasable_files = [] 348 | 349 | for batch in sip(files, 250): 350 | random.shuffle(batch) 351 | 352 | for timestamp, filename in batch: 353 | if timestamp > now: 354 | continue 355 | leasable_files.append(filename) 356 | if len(leasable_files) >= num_tasks: 357 | break 358 | 359 | if len(leasable_files) >= num_tasks: 360 | break 361 | 362 | leases = [] 363 | for filename in leasable_files: 364 | try: 365 | lessee = self._lease_filename(filename, seconds) 366 | except OSError: 367 | continue 368 | 369 | if lessee is not None: 370 | leases.append(lessee) 371 | 372 | wait_leases = [] 373 | if wait_sec > 0 and len(leases) < num_tasks: 374 | # Add a constant b/c this will cascade into shorter and 375 | # shorter checks as wait_sec shrinks and we don't 376 | # want hundreds of workers to accidently synchronize 377 | sleep_amt = random.random() * (wait_sec + 1) 378 | 379 | # but we still want to guarantee that wait_sec is not 380 | # exceeded. 381 | sleep_amt = min(sleep_amt, wait_sec) 382 | time.sleep(sleep_amt) 383 | wait_leases = self.lease( 384 | seconds, 385 | num_tasks - len(leases), 386 | wait_sec - sleep_amt 387 | ) 388 | 389 | return leases + wait_leases 390 | 391 | @retry 392 | def delete(self, task): 393 | ident = idfn(task) 394 | 395 | movements_file_path = os.path.join(self.movement_path, ident) 396 | try: 397 | fd = read_lock_file(open(movements_file_path, 'rt')) 398 | except FileNotFoundError: 399 | # if it doesn't exist we still want to count this 400 | # as a delete request succeeded b/c its purpose was 401 | # achieved and the progress bar should increment. 
402 | return 1 403 | 404 | filenames = fd.read().split('\n') 405 | fd.close() 406 | 407 | fd = write_lock_file(open(movements_file_path, 'wt')) 408 | 409 | for filename in filenames: 410 | if filename == '': 411 | continue 412 | 413 | try: 414 | os.remove(os.path.join(self.queue_path, filename)) 415 | except FileNotFoundError: 416 | pass 417 | 418 | fd.close() 419 | try: 420 | os.remove(movements_file_path) 421 | except FileNotFoundError: 422 | pass 423 | 424 | return 1 425 | 426 | def tally(self): 427 | with open(self.completions_path, 'ba') as f: 428 | f.write(b'\0') 429 | 430 | def purge(self, native=False): 431 | # native only has meaning for SQS for now 432 | # but we have to accept it as a parameter. 433 | all_files = itertools.chain( 434 | os.scandir(self.queue_path), 435 | os.scandir(self.movement_path) 436 | ) 437 | for file in all_files: 438 | try: 439 | os.remove(file.path) 440 | except FileNotFoundError: 441 | pass 442 | 443 | self.rezero() 444 | 445 | def is_empty(self): 446 | try: 447 | first(os.scandir(self.queue_path)) 448 | return False 449 | except StopIteration: 450 | return True 451 | 452 | def __iter__(self): 453 | def read(path): 454 | with open(path, 'rt') as f: 455 | return f.read() 456 | 457 | for f in os.scandir(self.queue_path): 458 | try: 459 | yield read(f.path) 460 | # It's possible for a task to have been 461 | # deleted in between scanning and reading. 462 | except FileNotFoundError: 463 | continue 464 | 465 | def __len__(self): 466 | return functools.reduce(operator.add, ( 1 for f in os.scandir(self.queue_path) ), 0) 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | -------------------------------------------------------------------------------- /taskqueue/taskqueue.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from functools import partial 3 | import itertools 4 | import json 5 | import math 6 | import os 7 | import platform 8 | import random 9 | import signal 10 | import threading 11 | import time 12 | import traceback 13 | import types 14 | import sys 15 | 16 | import gevent.pool 17 | import multiprocessing as mp 18 | import numpy as np 19 | import pathos.pools 20 | from tqdm import tqdm 21 | 22 | from .threaded_queue import ThreadedQueue 23 | from .lib import yellow, scatter, sip, toiter 24 | 25 | from .aws_queue_api import AWSTaskQueueAPI, AWS_BATCH_SIZE 26 | from .file_queue_api import FileQueueAPI 27 | from .paths import extract_path, mkpath 28 | from .scheduler import schedule_jobs 29 | from .queueables import totask, totaskid 30 | from .queueablefns import FunctionTask 31 | 32 | def totalfn(iterator, total): 33 | if total is not None: 34 | return total 35 | try: 36 | return len(iterator) 37 | except TypeError: 38 | return None 39 | 40 | class UnsupportedProtocolError(BaseException): 41 | pass 42 | 43 | class QueueEmptyError(LookupError): 44 | pass 45 | 46 | LEASE_SECONDS = 300 47 | 48 | class TaskQueue(object): 49 | """ 50 | The standard usage is that a client calls lease to get the next available task, 51 | performs that task, and then calls task.delete on that task before the lease expires. 52 | If the client cannot finish the task before the lease expires, 53 | and has a reasonable chance of completing the task, 54 | it should call task.update before the lease expires. 55 | If the client completes the task after the lease has expired, 56 | it still needs to delete the task. 57 | Tasks should be designed to be idempotent to avoid errors 58 | if multiple clients complete the same task. 
59 | 60 | The kwargs parameter dict should be queue-type specific parameters that are needed. 61 | """ 62 | def __init__( 63 | self, qurl, n_threads=40, 64 | green=False, progress=True, 65 | **kwargs 66 | ): 67 | self.qurl = qurl 68 | self.path = extract_path(qurl) 69 | self.api = self.initialize_api(self.path, **kwargs) 70 | self.n_threads = n_threads 71 | self.green = bool(green) 72 | self.progress = bool(progress) 73 | 74 | if self.green: 75 | self.check_monkey_patch_status() 76 | 77 | @property 78 | def qualified_path(self): 79 | return mkpath(self.path) 80 | 81 | def initialize_api(self, path, **kwargs): 82 | """Creates correct API object for the type of path 83 | 84 | Args: 85 | path: ExtractedPath representing the location of the queue 86 | region_name: The region for cloud-based queues (optional) 87 | kwargs: Keywords to be passed to the underlying queue (optional) 88 | """ 89 | if path.protocol == 'sqs': 90 | return AWSTaskQueueAPI(path.path, **kwargs) 91 | elif path.protocol == 'fq': 92 | return FileQueueAPI(path.path) 93 | elif path.protocol == 'pubsub': 94 | from .goog_pubsub_api import PubSubTaskQueueAPI 95 | return PubSubTaskQueueAPI(path.path, **kwargs) 96 | else: 97 | raise UnsupportedProtocolError('Unsupported protocol ' + str(self.path.protocol)) 98 | 99 | def check_monkey_patch_status(self): 100 | import gevent.monkey 101 | if not gevent.monkey.is_module_patched("socket"): 102 | print(yellow(""" 103 | Green threads require monkey patching the standard library 104 | to use a non-blocking network socket call. 105 | 106 | Please place the following lines at the beginning of your 107 | program. `thread=False` is there because sometimes this 108 | causes hanging in multiprocessing. 109 | 110 | import gevent.monkey 111 | gevent.monkey.patch_all(thread=False) 112 | """)) 113 | 114 | @property 115 | def enqueued(self): 116 | """ 117 | Returns the approximate(!) number of tasks enqueued in the cloud. 118 | WARNING: The number computed by Google is eventually 119 | consistent. It may return impossible numbers that 120 | are small deviations from the number in the queue. 121 | For instance, we've seen 1005 enqueued after 1000 122 | inserts. 123 | 124 | Returns: (int) number of tasks in cloud queue 125 | """ 126 | return self.api.enqueued 127 | 128 | @property 129 | def inserted(self): 130 | return self.api.inserted 131 | 132 | @property 133 | def completed(self): 134 | return self.api.completed 135 | 136 | @property 137 | def leased(self): 138 | return self.api.leased 139 | 140 | def is_empty(self): 141 | return self.api.is_empty() 142 | 143 | # def status(self): 144 | # """ 145 | # Gets information about the TaskQueue 146 | # """ 147 | # return self.api.get(getStats=True) 148 | 149 | def list(self): 150 | """ 151 | Lists all non-deleted Tasks in a TaskQueue, 152 | whether or not they are currently leased, 153 | up to a maximum of 100. 
154 | """ 155 | return [ totask(x) for x in iter(self.api) ] 156 | 157 | def insert( 158 | self, tasks, delay_seconds=0, 159 | total=None, parallel=1, skip_insert_counter=False 160 | ): 161 | """Inserts tasks and returns number inserted.""" 162 | if isinstance(tasks, TaskQueue): 163 | taskgen = tasks.tasks() 164 | if not isinstance(taskgen, TaskQueue): 165 | return self.insert(taskgen, delay_seconds, total, parallel) 166 | else: 167 | raise ValueError(str(tasks) + " would have caused an infinite recursion by returning a TaskQueue object from obj.tasks()") 168 | 169 | tasks = toiter(tasks) 170 | total = totalfn(tasks, total) 171 | 172 | if parallel not in (1, False) and total is not None and total > 1: 173 | return multiprocess_upload(self.__class__, mkpath(self.path), tasks, parallel=parallel, total=total) 174 | 175 | try: 176 | batch_size = self.api.batch_size 177 | except: 178 | batch_size = 1 179 | 180 | bodies = ( 181 | { 182 | "payload": totask(task).payload(), 183 | "queueName": self.path.path, 184 | } 185 | for task in tasks 186 | ) 187 | 188 | def insertfn(batch): 189 | return self.api.insert(batch, delay_seconds) 190 | 191 | cts = schedule_jobs( 192 | fns=( partial(insertfn, batch) for batch in sip(bodies, batch_size) ), 193 | concurrency=self.n_threads, 194 | progress=('Inserting' if self.progress else False), 195 | total=total, 196 | green=self.green, 197 | ) 198 | cts = sum(cts) 199 | 200 | if not skip_insert_counter: 201 | self.api.add_insert_count(cts) 202 | return cts 203 | 204 | def add_insert_count(self, ct): 205 | self.api.add_insert_count(ct) 206 | 207 | def insert_all(self, *args, **kwargs): 208 | """For backwards compatibility.""" 209 | return self.insert(*args, **kwargs) 210 | 211 | def rezero(self): 212 | """Resets statistic counters such as completions and insertions to zero.""" 213 | self.api.rezero() 214 | 215 | def renew(self, task, seconds): 216 | """Update the duration of a task lease.""" 217 | return self.api.renew_lease(task, seconds) 218 | 219 | def cancel(self, task): 220 | return self.api.cancel_lease(task) 221 | 222 | def release_all(self): 223 | return self.api.release_all() 224 | 225 | def lease(self, seconds=600, num_tasks=1, wait_sec=None): 226 | """ 227 | Acquires a lease on the topmost N unowned tasks in the specified queue. 228 | Required query parameters: leaseSecs, numTasks 229 | """ 230 | if num_tasks <= 0: 231 | raise ValueError("num_tasks must be > 0. Got: " + str(num_tasks)) 232 | if seconds < 0: 233 | raise ValueError("lease seconds must be >= 0. 
Got: " + str(seconds)) 234 | 235 | tasks = self.api.lease(seconds, num_tasks, wait_sec) 236 | 237 | if not len(tasks): 238 | raise QueueEmptyError() 239 | 240 | if num_tasks == 1: 241 | return totask(tasks[0]) 242 | else: 243 | return [ totask(task) for task in tasks ] 244 | 245 | def delete(self, task_id, total=None, tally=False): 246 | """Deletes a task from a TaskQueue.""" 247 | task_id = toiter(task_id) 248 | total = totalfn(task_id, total) 249 | 250 | def deltask(tid): 251 | num_deleted = self.api.delete(totaskid(tid)) 252 | if tally: 253 | self.api.tally() 254 | return num_deleted 255 | 256 | schedule_jobs( 257 | fns=( partial(deltask, tid) for tid in task_id ), 258 | concurrency=self.n_threads, 259 | progress=('Deleting' if self.progress else None), 260 | total=total, 261 | green=self.green, 262 | ) 263 | 264 | def purge(self, native=False): 265 | """Deletes all tasks in the queue.""" 266 | try: 267 | return self.api.purge(native) 268 | except AttributeError: 269 | while True: 270 | lst = self.list() 271 | if len(lst) == 0: 272 | break 273 | 274 | for task in lst: 275 | self.delete(task) 276 | self.wait() 277 | return self 278 | 279 | def tasks(self): 280 | """ 281 | Iterate over all tasks. 282 | 283 | Can cause infinite loops on SQS and so is not 284 | supported. You can use the api method directly 285 | if you know what you're doing. 286 | """ 287 | if self.path.protocol == "sqs": 288 | raise UnsupportedProtocolError("SQS could enter an infinite loop from this method.") 289 | 290 | return ( totask(task) for task in iter(self.api) ) 291 | 292 | def poll( 293 | self, lease_seconds=LEASE_SECONDS, 294 | verbose=False, execute_args=[], execute_kwargs={}, 295 | stop_fn=None, backoff_exceptions=[], min_backoff_window=1, 296 | max_backoff_window=120, before_fn=None, after_fn=None, 297 | tally=False 298 | ): 299 | """ 300 | Poll a queue until a stop condition is reached (default forever). Note 301 | that this function is not thread safe as it requires a global variable 302 | to intercept SIGINT. 303 | lease_seconds: each task should be leased for this many seconds 304 | execute_args / execute_kwargs: pass these arguments to task execution 305 | backoff_exceptions: A list of exceptions that instead of causing a crash, 306 | instead cause the polling to back off for an increasing exponential 307 | random window. 308 | min_backoff_window: The minimum sized window (in seconds) to select a 309 | random backoff time. 310 | max_backoff_window: The window doubles each retry. This is the maximum value 311 | in seconds. 312 | stop_fn: A boolean returning function that accepts no parameters. When 313 | it returns True, the task execution loop will terminate. It is evaluated 314 | once after every task. If you provide the arguments `executed` (tasks completed) 315 | `tries` (current attempts at fetching a task), `previous_execution_time` (time in 316 | seconds to run the last task), or `elapsed_time` (time since polling started in 317 | seconds) they will be dependency injected. 318 | before_fn: Pass task pre-execution. 319 | after_fn: Pass task post-execution. 320 | verbose: print out the status of each step 321 | tally: contribute each completed task to a completions counter if supported. 322 | 323 | Return: number of tasks executed 324 | """ 325 | global LOOP 326 | 327 | if not callable(stop_fn) and stop_fn is not None: 328 | raise ValueError("stop_fn must be a callable. 
" + str(stop_fn)) 329 | elif not callable(stop_fn): 330 | stop_fn = lambda: False 331 | 332 | def random_exponential_window_backoff(n): 333 | n = min(n, min_backoff_window) 334 | # 120 sec max b/c on avg a request every ~250msec if 500 containers 335 | # in contention which seems like a quite reasonable volume of traffic 336 | # to handle 337 | high = min(2 ** n, max_backoff_window) 338 | return random.uniform(0, high) 339 | 340 | def printv(*args, **kwargs): 341 | if verbose: 342 | print(*args, **kwargs) 343 | 344 | LOOP = True 345 | 346 | def sigint_handler(signum, frame): 347 | global LOOP 348 | if LOOP: 349 | print("Interrupted. Exiting after this task completes. Press Ctrl-C again to exit now.", flush=True) 350 | LOOP = False 351 | else: 352 | sys.exit() 353 | 354 | prev_sigint_handler = signal.getsignal(signal.SIGINT) 355 | signal.signal(signal.SIGINT, sigint_handler) 356 | 357 | tries = 0 358 | executed = 0 359 | total_elapsed_sec = 0 360 | 361 | backoff = False 362 | backoff_exceptions = tuple(list(backoff_exceptions) + [ QueueEmptyError ]) 363 | 364 | before_fn = before_fn or (lambda x: x) 365 | after_fn = after_fn or (lambda x: x) 366 | 367 | loop_init_time = time.time() 368 | 369 | while LOOP: 370 | total_elapsed_sec = time.time() - loop_init_time 371 | task = 'unknown' # for error message prior to leasing 372 | try: 373 | task = self.lease(seconds=int(lease_seconds)) 374 | tries += 1 375 | before_fn(task) 376 | printv("INFO Running", task, " (id: {})".format(task.id)) 377 | time_start = time.time() 378 | task.execute(*execute_args, **execute_kwargs) 379 | time_delta = time.time() - time_start 380 | executed += 1 381 | printv("INFO Deleting", task.id) 382 | self.delete(task, tally=tally) 383 | printv('INFO', type(task).__name__, task.id , "succesfully executed in {:.2f} sec.".format(time_delta)) 384 | after_fn(task) 385 | tries = 0 386 | except backoff_exceptions: 387 | backoff = True 388 | except Exception as e: 389 | printv('ERROR', task, "raised {}\n {}".format(e , traceback.format_exc())) 390 | raise # this will restart the container in kubernetes 391 | 392 | varnames = stop_fn.__code__.co_varnames 393 | stop_fn_bound = stop_fn 394 | if 'executed' in varnames: 395 | stop_fn_bound = partial(stop_fn_bound, executed=executed) 396 | if 'tries' in varnames: 397 | stop_fn_bound = partial(stop_fn_bound, tries=tries) 398 | if 'previous_execution_time' in varnames: 399 | stop_fn_bound = partial(stop_fn_bound, previous_execution_time=time_delta) 400 | if 'elapsed_time' in varnames: 401 | stop_fn_bound = partial(stop_fn_bound, elapsed_time=total_elapsed_sec) 402 | 403 | if stop_fn_bound(): 404 | break 405 | 406 | if backoff: 407 | time.sleep(random_exponential_window_backoff(tries)) 408 | 409 | backoff = False 410 | 411 | printv("Task execution loop exited.") 412 | signal.signal(signal.SIGINT, prev_sigint_handler) 413 | 414 | return executed 415 | 416 | def block_until_empty(self, interval_sec=2): 417 | while self.enqueued > 0: 418 | time.sleep(interval_sec) 419 | 420 | def __enter__(self): 421 | return self 422 | 423 | def __exit__(self, exception_type, exception_value, traceback): 424 | pass 425 | 426 | class LocalTaskQueue(object): 427 | def __init__(self, parallel=1, queue_name='', queue_server='', progress=True): 428 | if parallel and type(parallel) == bool: 429 | parallel = mp.cpu_count() 430 | 431 | self.parallel = parallel 432 | self.queue = [] 433 | self.progress = progress 434 | 435 | def insert( 436 | self, tasks, 437 | delay_seconds=0, total=None, 438 | parallel=None, 
progress=True 439 | ): 440 | tasks = toiter(tasks) 441 | ct = 0 442 | for task in tasks: 443 | args, kwargs = [], {} 444 | if isinstance(task, tuple): 445 | task, args, kwargs = task 446 | task = totask(task) 447 | task = { 448 | 'payload': task.payload(), 449 | 'id': -1, 450 | } 451 | self.queue.append( (task, args, kwargs) ) 452 | ct += 1 453 | 454 | return ct 455 | 456 | def insert_all(self, *args, **kwargs): 457 | ct = self.insert(*args, **kwargs) 458 | self.execute(self.progress) 459 | return ct 460 | 461 | def add_insert_count(self, ct): 462 | pass 463 | 464 | def poll(self, *args, **kwargs): 465 | pass 466 | 467 | def execute(self, progress=True, parallel=None, total=None): 468 | if parallel is None: 469 | parallel = self.parallel 470 | 471 | total = totalfn(self.queue, total) 472 | 473 | # Don't fork, spawn entirely new processes. This 474 | # avoids accidental deadlocks. 475 | mp.set_start_method("spawn", force=True) 476 | 477 | with tqdm(total=total, desc="Tasks", disable=(not progress)) as pbar: 478 | if self.parallel == 1: 479 | while self.queue: 480 | _task_execute(self.queue.pop(0)) 481 | pbar.update() 482 | else: 483 | with pathos.pools.ProcessPool(self.parallel) as executor: 484 | for _ in executor.imap(_task_execute, self.queue): 485 | pbar.update() 486 | 487 | self.queue = [] 488 | 489 | def __enter__(self): 490 | return self 491 | 492 | def __exit__(self, exception_type, exception_value, traceback): 493 | self.execute() 494 | 495 | class MockTaskQueue(LocalTaskQueue): 496 | pass 497 | 498 | class GreenTaskQueue(TaskQueue): 499 | def __init__(self, *args, **kwargs): 500 | kwargs['green'] = True 501 | super(GreenTaskQueue, self).__init__(*args, **kwargs) 502 | 503 | # Necessary to define here to make the 504 | # function picklable 505 | def _task_execute(task_tuple): 506 | task, args, kwargs = task_tuple 507 | task = totask(task) 508 | task.execute(*args, **kwargs) 509 | 510 | ## Multiprocess Upload 511 | 512 | def soloprocess_upload(QueueClass, queue_name, tasks): 513 | tq = QueueClass(queue_name, progress=False) 514 | return tq.insert(tasks, skip_insert_counter=True) 515 | 516 | error_queue = mp.Queue() 517 | 518 | def multiprocess_upload(QueueClass, queue_name, tasks, parallel=True, total=None): 519 | if parallel is True: 520 | parallel = mp.cpu_count() 521 | elif parallel <= 0: 522 | raise ValueError("Parallel must be a positive number or zero (all cpus). Got: " + str(parallel)) 523 | 524 | if parallel == 1: 525 | return soloprocess_upload(QueueClass, queue_name, tasks) 526 | 527 | def capturing_soloprocess_upload(*args, **kwargs): 528 | try: 529 | return soloprocess_upload(*args, **kwargs) 530 | except Exception as err: 531 | print(err) 532 | error_queue.put(err) 533 | return 0 534 | 535 | uploadfn = partial( 536 | capturing_soloprocess_upload, QueueClass, queue_name 537 | ) 538 | 539 | if isinstance(tasks, types.GeneratorType): 540 | try: 541 | task = next(item for item in tasks if item is not None) 542 | except StopIteration: 543 | return 0 544 | tasks = itertools.chain([task], tasks) 545 | 546 | # This is a hack to get dill to pickle dynamically 547 | # generated classes. This is an important use case 548 | # for when we create iterators with generator __iter__ 549 | # functions on demand. 
550 | 551 | # https://github.com/uqfoundation/dill/issues/56 552 | 553 | # cls_module = task.__class__.__module__ 554 | # task.__class__.__module__ = '__main__' 555 | 556 | total = totalfn(tasks, total) 557 | 558 | block_size = 2000 559 | if total is not None and (total / parallel) < block_size: 560 | if total > 500: 561 | block_size = int(math.ceil(total / parallel)) 562 | 563 | # Fix for MacOS which can segfault due to 564 | # urllib calling libdispatch which is not fork-safe 565 | # https://bugs.python.org/issue30385 566 | no_proxy = os.environ.get("no_proxy", "") 567 | if platform.system().lower() == "darwin": 568 | os.environ["no_proxy"] = "*" 569 | 570 | # Don't fork, spawn entirely new processes. This 571 | # avoids accidental deadlocks. 572 | mp.set_start_method("spawn", force=True) 573 | 574 | ct = 0 575 | with tqdm(desc="Upload", total=total) as pbar: 576 | with pathos.pools.ProcessPool(parallel) as pool: 577 | for num_inserted in pool.imap(uploadfn, sip(tasks, block_size)): 578 | pbar.update(num_inserted) 579 | ct += num_inserted 580 | 581 | QueueClass(queue_name).add_insert_count(ct) 582 | 583 | if platform.system().lower() == "darwin": 584 | os.environ["no_proxy"] = no_proxy 585 | # task.__class__.__module__ = cls_module 586 | 587 | if not error_queue.empty(): 588 | errors = [] 589 | while not error_queue.empty(): 590 | err = error_queue.get() 591 | if err is not StopIteration: 592 | errors.append(err) 593 | if len(errors): 594 | raise Exception(errors) 595 | 596 | return ct 597 | 598 | # c/o https://stackoverflow.com/questions/12826291/raise-two-errors-at-the-same-time 599 | def raise_multiple(errors): 600 | if not errors: 601 | return 602 | try: 603 | raise errors.pop() 604 | finally: 605 | raise_multiple(errors) 606 | 607 | 608 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI version](https://badge.fury.io/py/task-queue.svg)](https://badge.fury.io/py/task-queue) 2 | 3 | # python-task-queue 4 | 5 | A client and system for generating, uploading, leasing, and executing dependency free tasks both locally and in the cloud using AWS SQS or on a single machine or cluster with a common file system using file based queues. Of note, file queue requires no setup or queue service and can be used in a distributed fashion on a network filesystem. 6 | 7 | The task queue uses JSON as a messaging medium, so be aware that e.g. integer dictionary keys can be turned into strings when bound as a parameter. 8 | 9 | ## Installation 10 | 11 | ```bash 12 | pip install numpy # make sure you do this first on a seperate line 13 | pip install task-queue 14 | ``` 15 | 16 | The task queue uses your CloudVolume secrets located in `$HOME/.cloudvolume/secrets/`. When using AWS SQS as your queue backend, you must provide `$HOME/.cloudvolume/secrets/aws-secret.json`. See the [CloudVolume](https://github.com/seung-lab/cloud-volume) repo for additional instructions. 17 | 18 | ## Usage 19 | 20 | As of version 2.7.0, there are two ways to create a queueable task. The new way is simpler and probably preferred. 21 | 22 | *MacOS Only: Note that proxy servers are disabled for parallel operation due to libdispatch being not fork-safe.* 23 | 24 | ### New School: Queueable Functions 25 | 26 | Designate a function as queueable using the `@queueable` decorator. Currently variable positional arguments (`*args`) and variable keyword arguments (`**kwargs`) are not yet supported. 
If a function is not marked with the decorator, it cannot be executed via the queue. 27 | 28 | ```python 29 | from taskqueue import queueable 30 | 31 | @queueable 32 | def print_task(txt): 33 | print(str(txt)) 34 | ``` 35 | 36 | You then create queueable instantiations of these functions by using the standard library [`partial`](https://docs.python.org/3/library/functools.html#functools.partial) function to create a concrete binding. 37 | 38 | ```python 39 | from functools import partial 40 | bound_fn = partial(print_task, txt="hello world") 41 | ``` 42 | 43 | ### Old School: RegisteredTask Subclasses 44 | 45 | Define a class that inherits from taskqueue.RegisteredTask and implements the `execute` method. RegisteredTasks contain logic that will render their attributes into a JSON payload and can be reconstituted into a live class on the other side of a task queue. 46 | 47 | Tasks can be loaded into queues locally or in the cloud and executed later. Here's an example implementation of a trivial `PrintTask`. The attributes of your container class should be simple values that can be easily encoded into JSON such as ints, floats, strings, and numpy arrays. Let the `execute` method download and manipulate heavier data. If you're feeling curious, you can see what JSON a task will turn into by calling `task.payload()`. 48 | 49 | ```python 50 | from taskqueue import RegisteredTask 51 | 52 | class PrintTask(RegisteredTask): 53 | def __init__(self, txt=''): 54 | super(PrintTask, self).__init__(txt) 55 | # attributes passed to super().__init__ are automatically assigned 56 | # use this space to perform additional processing such as: 57 | self.txt = str(txt) 58 | 59 | def execute(self): 60 | if self.txt: 61 | print(str(self) + ": " + self.txt) 62 | else: 63 | print(self) 64 | ``` 65 | 66 | ## Local Usage 67 | 68 | For small jobs, you might want to use one or more processes to execute the tasks. 69 | 70 | ```python 71 | from functools import partial 72 | from taskqueue import LocalTaskQueue 73 | from mylibrary import PrintTask # mylibrary is wherever you defined PrintTask 74 | 75 | tq = LocalTaskQueue(parallel=5) # use 5 processes 76 | 77 | 78 | tasks = ( PrintTask(i) for i in range(2000) ) # OLD SCHOOL 79 | tasks = ( partial(print_task, i) for i in range(2000) ) # NEW SCHOOL 80 | 81 | tq.insert_all(tasks) # performs on-line execution (naming is historical) 82 | 83 | # alternative serial model 84 | tq.insert(tasks) 85 | tq.execute() 86 | 87 | # delete tasks 88 | tq.delete(tasks) 89 | tq.purge() # delete all tasks 90 | ``` 91 | 92 | This will load the queue with 2000 print tasks and then execute them across five processes. 93 | 94 | ## Cloud and Cluster Usage 95 | 96 | 1. Set up an SQS queue and acquire an aws-secret.json that is compatible with CloudVolume. Generate the tasks and insert them into the cloud queue. 97 | 98 | 2. You can alternatively set up a file based queue that has the same time-based leasing property as an SQS queue. 99 | 100 | IMPORTANT: You must import the tasks that will be executed, otherwise the code to execute them has not been loaded.
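For example, a minimal worker entry point might look like the sketch below (polling is described in more detail further down). The module name `mylibrary` mirrors the examples above and is purely illustrative; importing it is what registers the `@queueable` functions and `RegisteredTask` subclasses, so that payloads pulled off the queue can be turned back into live tasks.

```python
# worker.py -- a minimal sketch of a worker process
from taskqueue import TaskQueue
import mylibrary  # noqa: F401  imported for its registration side effect

tq = TaskQueue('sqs://queue-name')
tq.poll(lease_seconds=300, verbose=True)
```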
101 | 102 | ```python 103 | # import gevent.monkey 104 | # gevent.monkey.patch_all(thread=False) 105 | from taskqueue import TaskQueue 106 | from mylibrary import PrintTask # mylibrary is wherever you defined PrintTask 107 | 108 | # region is SQS specific, green means cooperative threading 109 | tq = TaskQueue('sqs://queue-name', region_name="us-east-1", green=False) 110 | tq = TaskQueue('fq:///path/to/queue/directory/') # file queue ('fq') 111 | 112 | # insert accepts any iterable 113 | tq.insert(( PrintTask(i) for i in range(1000) )) # OLD SCHOOL 114 | tq.insert(( partial(print_task, i) for i in range(1000) )) # NEW SCHOOL 115 | tq.enqueued # approximate number of tasks in the queue 116 | 117 | # FileQueue Only: 118 | tq.inserted # total number of tasks inserted 119 | tq.completed # number of tasks completed, requires tally=True with poll 120 | tq.rezero() # reset statistics like inserted and completed 121 | tq.release_all() # set all tasks to available 122 | ``` 123 | 124 | This inserts 1000 PrintTask JSON descriptions into your SQS queue. 125 | 126 | Somewhere else, you'll do the following (probably across multiple workers): 127 | 128 | ```python 129 | from taskqueue import TaskQueue 130 | import MY_MODULE # MY_MODULE contains the definitions of RegisteredTasks 131 | 132 | tq = TaskQueue('sqs://queue-name') 133 | tq.poll( 134 | lease_seconds=int(LEASE_SECONDS), 135 | verbose=True, # print out a success message 136 | tally=True, # count number of tasks completed (fq only!) 137 | ) 138 | ``` 139 | 140 | Poll will check the queue for a new task periodically. If a task is found, the worker will execute it immediately, delete it from the queue, and request another. If no task is found, a random exponential backoff of up to 120 seconds is built in to prevent workers from attempting to DDoS the queue. If a task fails to complete, it will eventually recirculate within the queue, ensuring that all tasks will eventually complete provided they are not fundamentally flawed in some way. 141 | 142 | ## Local Container testing 143 | 144 | If there is an AWS-compatible queue running on a local cluster, e.g. [alpine-sqs](https://hub.docker.com/r/roribio16/alpine-sqs/), the underlying connection client 145 | needs additional parameters. These can be passed into the TaskQueue constructor.
146 | 147 | The following code on a worker will work in local and production contexts: 148 | 149 | ```python 150 | queue = os.environ['SQS_QUEUE'] # for local, set to "default" 151 | region_name = os.environ.get('SQS_REGION_NAME') # set only for prod 152 | endpoint_url = os.environ.get('SQS_ENDPOINT_URL') # set only for local 153 | tqueue = taskqueue.TaskQueue(f'sqs://{queue}', 154 | region_name=region_name, 155 | endpoint_url=endpoint_url) 156 | ``` 157 | 158 | Example docker-compose.yml for local testing: 159 | 160 | ```yaml 161 | version: "3.7" 162 | 163 | services: 164 | worker: 165 | image: yourlab/yourworker:v1 166 | environment: 167 | - SQS_QUEUE=default 168 | - SQS_ENDPOINT_URL=http://local_sqs:9324 169 | depends_on: 170 | - local_sqs 171 | 172 | local_sqs: 173 | image: roribio16/alpine-sqs 174 | ``` 175 | 176 | Example docker-compose.yml for production: 177 | 178 | ```yaml 179 | version: "3.7" 180 | 181 | services: 182 | worker: 183 | image: yourlab/yourworker:v1 184 | environment: 185 | - SQS_QUEUE=my-aws-queue 186 | - SQS_REGION_NAME=us-west-1 187 | ``` 188 | ### Notes on Google PubSub 189 | TaskQueue will try to connect to PubSub using the credentials it finds at `~/.cloudvolume/secrets/google-secret.json`. 190 | 191 | You must first create both a topic and a subscription that is subscribed to that topic. 192 | 193 | ``` 194 | gcloud pubsub topics create TASKQUEUE_TEST_TOPIC 195 | gcloud pubsub subscriptions create TASKQUEUE_TEST_SUBSCRIPTION --topic TASKQUEUE_TEST_TOPIC 196 | ``` 197 | 198 | You can then specify a task queue using this URL format (which we invented to include the project id, topic, and subscription): 199 | 200 | ```python 201 | queue_url = "pubsub://projects/<project_id>/topics/TASKQUEUE_TEST_TOPIC/subscriptions/TASKQUEUE_TEST_SUBSCRIPTION" 202 | 203 | tq = TaskQueue(queue_url) 204 | ``` 205 | Note that Google PubSub doesn't have all the same features as Amazon SQS, including statistics reporting, and so some features do not function properly with this backend. 206 | 207 | Also, the Google PubSub client libraries are not installed by default, so if you want to use this backend, install with the pubsub extra: 208 | 209 | ``` 210 | pip install task-queue[pubsub] 211 | ``` 212 | 213 | 214 | ### Notes on File Queue 215 | 216 | ```python 217 | # FileQueue Specific Features 218 | 219 | tq.inserted # number of inserted tasks 220 | tq.completed # number of completed tasks (counts rerun tasks too) 221 | tq.rezero() # sets tq.inserted and tq.completed to zero. 222 | tq.release_all() # sets all tasks to available 223 | ``` 224 | 225 | FileQueue (`fq://`) is designed to simulate the timed task leasing feature from SQS and exploits a common filesystem to avoid requiring an additional queue server. You can read in detail about its design [on the wiki](https://github.com/seung-lab/python-task-queue/wiki/FileQueue-Design). 226 | 227 | There are a few things FileQueue can do that SQS can't and also some quirks you should be aware of. For one, FileQueue can track the number of task completions (`tq.completed`, `tq.poll(..., tally=True)`), but it does so by appending a byte to a file called `completions` for each completion. The size of the file in bytes is the number of completions. This design is an attempt to avoid problems with locking and race conditions. FileQueue also tracks insertions (`tq.inserted`) in a more typical way in an `insertions` file. Also unlike SQS, FileQueue allows listing all tasks at once.
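To make the byte-counting design concrete, here is a minimal sketch (not part of the library API; in real code just read `tq.completed`) that reads the completions counter straight off the filesystem. The `completions` filename follows the design described above; the exact on-disk layout of a queue directory is an assumption.

```python
import os

def completions_count(queue_path):
    # One byte is appended to the `completions` file per completed task,
    # so the counter value is simply the size of that file in bytes.
    path = os.path.join(queue_path, "completions")
    return os.path.getsize(path) if os.path.exists(path) else 0

print(completions_count("./my-queue"))
```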
228 | 229 | FileQueue also allows releasing all current tasks from their leases, something impossible in SQS. Sometimes a few tasks will die immediately after leasing while holding a long lease, and you'll figure out how to fix them. Instead of starting over or waiting possibly hours, you can make all tasks available again (`tq.release_all()`). 230 | 231 | As FileQueue is based on the filesystem, it can be managed somewhat via the command line. To delete a queue, just `rm -r $QUEUE_PATH`. To reset a counter: `rm $QUEUE_PATH/completions` (e.g.). If you are brave, you could even use the `mv` command to reassign a task's availability. 232 | 233 | We also discovered that FileQueues are amenable to fixing problems on the fly. In one case, we generated a set of tasks that took 4.5 hours of computation time and decided to run those tasks on a different cluster. The 500k tasks each contained a path to the old storage cluster. Using `find`, `xargs`, and `sed` we were able to fix them efficiently. 234 | 235 | #### Bundled `ptq` CLI Tool 236 | 237 | As of 2.5.0, we bundle a command line tool, `ptq`, to make managing running FileQueues easier. 238 | 239 | ```bash 240 | ptq status fq://./my-queue # prints vital statistics 241 | ptq release fq://./my-queue # releases all tasks from their lease 242 | ptq rezero fq://./my-queue # resets statistics to zero 243 | ptq cp fq://./my-queue sqs://my-cloud-queue # copy a queue (no copies of sqs) 244 | ptq mv sqs://my-cloud-queue fq://./my-queue # move a queue (all supported) 245 | ``` 246 | 247 | ## Motivation 248 | 249 | Distributed dependency-free task execution engines (such as [Igneous](https://github.com/seung-lab/igneous/)) often make use of cloud-based queues like Amazon Simple Queue Service (SQS). In the connectomics field we process petascale images, which requires generating hundreds of thousands or millions of cloud tasks per run. In one case, we were processing serial blocks of a large image where each block depended on the previous block's completion. Each block's run required the generation and upload of millions of tasks and the use of thousands of workers. The workers would rapidly drain the task queue and it was important to ensure that it could be fed fast enough to prevent starvation of this enormous cluster. 250 | 251 | There are a few strategies for accomplishing this. One way might be to use a fully featured DAG supporting engine which could generate the next task on demand. However, we were experienced with SQS and had designed our architecture around it. Furthermore, it was, in our experience, robust to thousands of machines knocking on it. This does not discount that there could be better methods out there, but this was convenient for us. 252 | 253 | The two major ways to populate the SQS queue at scale are either a task-generating task, so that a single processor can enlist hundreds or thousands of others, or making the task-generating client fast and memory efficient and using a handful of cores for multiprocessing. Keeping things simple and local allows for greater operational flexibility, and the addition of a drop-in multiprocessing execution engine allows for the omission of cloud services for small jobs. Importantly, improved small scale performance doesn't preclude the later development of metageneration facilities. 254 | 255 | By default, the Python task queue libraries are single threaded and blocking, resulting in upload rates of at most tens of tasks per second.
It is possible to do much better by using threads, multiple processes, and by batching requests. TaskQueue has achieved upload rates of over 3000 tasks per second on a single core, and around 10,000 per second multicore on a single machine. This is sufficient to keep our cluster fed and gives programmers the flexibility to populate queues from their local machine using simple scripts. 256 | 257 | ## How to Achieve High Performance 258 | 259 | Attaining the quoted upload rates is simple but takes a few tricks to tune the queue. By default, TaskQueue will upload hundreds of tasks per second using its threading model. We'll show via progressive examples how to tune your upload script to get many thousands of tasks per second with near zero latency and memory usage. Note that the examples below use `sqs://`, but apply to `fq://` as well. These examples also use the old school style of task instantiation, but you can substitute the new style without consequence. 260 | 261 | ```python 262 | # Listing 1: 10s per second, high memory usage, non-zero latency 263 | 264 | tasks = [ PrintTask(i) for i in range(1000000) ] 265 | tq = TaskQueue('sqs://queue-name') 266 | for task in tasks: 267 | tq.insert(task) 268 | ``` 269 | 270 | This first example shows how you might use the queue in the most naive fashion. The tasks list takes a long time to compute, uses a lot of memory, and then inserts a single task at a time, failing to exploit the threading model in TaskQueue. **Note that this behavior has changed from previous versions, where we endorsed the "with" statement and this form was faster, though still problematic.** 271 | 272 | ```python 273 | # Listing 2: 100-1000s per second, high memory usage, non-zero latency 274 | 275 | tasks = [ PrintTask(i) for i in range(1000000) ] 276 | tq = TaskQueue('sqs://queue-name') 277 | tq.insert(tasks) 278 | ``` 279 | 280 | The listing above allows you to use ordinary iterative programming techniques to achieve an upload rate of hundreds per second without much configuration, a marked improvement over using boto directly. However, the initial generation of a list of tasks uses a lot of memory and introduces a delay while the list is generated. 281 | 282 | This form also takes advantage of SQS batch upload, which allows for submitting 10 tasks at once. As the cost of submitting a task lies mainly in HTTP/1.1 TCP/IP connection overhead, batching 10 requests results in nearly a 10x improvement in performance. However, in this case we've created all the tasks up front again in order to batch them correctly, which results in the same memory and latency issues as in Listing 1. 283 | 284 | ```python 285 | # Listing 3: 100-1000s per second, low memory usage, near-zero latency 286 | 287 | tasks = ( PrintTask(i) for i in range(1000000) ) 288 | tq = TaskQueue('sqs://queue-name') 289 | tq.insert(tasks, total=1000000) # total necessary for progress bars to work 290 | ``` 291 | 292 | In Listing 3, we've started using generators instead of lists. Generators are essentially lazy lists that compute the next element on demand. Defining a generator is fast and takes constant time, so we are able to begin production of new elements nearly instantly. The elements are produced on demand and consumed instantly, resulting in a small constant memory overhead that can typically be measured in kilobytes to megabytes. 293 | 294 | As generators do not support the `len` operator, we manually pass in the number of items to display a progress bar.
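The difference between Listings 2 and 3 is only brackets versus parentheses, but the memory behavior is very different. A quick standalone check, illustrative only and using plain integers rather than tasks:

```python
import sys

as_list = [ i for i in range(1000000) ]   # materializes every element up front
as_gen  = ( i for i in range(1000000) )   # produces elements on demand

print(sys.getsizeof(as_list))  # on the order of 8 MB for the list object alone
print(sys.getsizeof(as_gen))   # a couple hundred bytes, regardless of length
```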
295 | 296 | ```python 297 | # Listing 4: 100s-1000s per second, low memory usage, near-zero latency 298 | 299 | import gevent.monkey 300 | gevent.monkey.patch_all() 301 | from taskqueue import TaskQueue 302 | 303 | tasks = ( PrintTask(i) for i in range(1000000) ) 304 | tq = TaskQueue('sqs://queue-name', green=True) 305 | tq.insert(tasks, total=1000000) # total helps the progress bar 306 | ``` 307 | 308 | In Listing 4, we use the `green=True` argument to use cooperative threads. Under the hood, TaskQueue relies on Python kernel threads to achieve concurrent IO. However, on systems with multiple cores, especially those in a virtualized or NUMA context, the OS will tend to distribute the threads fairly evenly between cores, leading to high context-switching overhead. Ironically, a more powerful multicore system can lead to lower performance. To remedy this issue, we introduce a user-space cooperative threading model (green threads) using gevent (which, depending on your system, uses either libev or libuv for its event loop). 309 | 310 | This can result in a substantial performance increase on some systems. Typically a single core will be fully utilized with extremely low overhead. However, using cooperative threading with networked IO in Python requires monkey patching the standard library (!!). Refusing to patch the standard library will result in single threaded performance. Thus, using GreenTaskQueue can introduce problems into many larger applications (we've seen problems with multiprocessing and ipython). However, often the task upload script can be isolated from the rest of the system and this allows monkey patching to be safely performed. To give users more control over when they wish to accept the risk of monkey patching, it is not performed automatically and a warning will appear with instructions for amending your program. 311 | 312 | ```python 313 | # Listing 5: 1000s-10000 per second, low memory usage, near-zero latency, efficient multiprocessing 314 | 315 | import gevent.monkey 316 | gevent.monkey.patch_all() 317 | from taskqueue import TaskQueue 318 | from concurrent.futures import ProcessPoolExecutor 319 | 320 | def upload(args): 321 | start, end = args 322 | tasks = ( PrintTask(i) for i in range(start, end) ) 323 | tq = TaskQueue('sqs://queue-name', green=True) 324 | tq.insert(tasks, total=(end - start)) 325 | 326 | task_ranges = [ (0, 250000), (250000, 500000), (500000, 750000), (750000, 1000000) ] 327 | with ProcessPoolExecutor(max_workers=4) as pool: 328 | pool.map(upload, task_ranges) 329 | ``` 330 | 331 | In Listing 5, we finally move to multiprocessing to attain the highest speeds. There are three critical pieces of this construction to note. 332 | 333 | First, we do not use the usual `multiprocessing` package and instead use `concurrent.futures.ProcessPoolExecutor`. If a child process dies in `multiprocessing`, the parent process will simply hang (this is by design, unfortunately...). Using this alternative package, at least an exception will be thrown. 334 | 335 | Second, we pass parameters for task generation to the child processes, not tasks. It is not possible to pass generators from parent to child processes in CPython [1]. It is also inefficient to pass tasks directly, as it requires first generating them (as in Listing 1) and then invisibly pickling and unpickling them as they are passed to the child processes. Therefore, we pass only a small number of small picklable objects that are used for constructing a task generator on the other side.
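As a quick illustration of that constraint, attempting to pickle a generator in CPython fails outright (a standalone snippet, not part of the library):

```python
import pickle

gen = (i for i in range(10))
try:
    pickle.dumps(gen)  # generators have no picklable state in CPython
except TypeError as err:
    print(err)  # e.g. "cannot pickle 'generator' object" on CPython 3.8+
```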
336 | 337 | Third, as described in the narrative for Listing 5, the GreenTaskQueue has less context-switching overhead than the ordinary multithreaded TaskQueue. Using GreenTaskQueue will cause each core to run efficiently and independently of the others. At this point, your main bottlenecks will probably be OS/network card related (let us know if they aren't!). Multiprocessing does scale task production, but it's sub-linear in the number of processes. The task upload rate per process will fall with each additional core added, but each core still adds additional throughput up to some inflection point. 338 | 339 | ```python 340 | # Listing 6: Exchanging Generators for Iterators 341 | 342 | import gevent.monkey 343 | gevent.monkey.patch_all() 344 | from taskqueue import TaskQueue 345 | from concurrent.futures import ProcessPoolExecutor 346 | 347 | class PrintTaskIterator(object): 348 | def __init__(self, start, end): 349 | self.start = start 350 | self.end = end 351 | def __len__(self): 352 | return self.end - self.start 353 | def __iter__(self): 354 | for i in range(self.start, self.end): 355 | yield PrintTask(i) 356 | 357 | def upload(tsks): 358 | tq = TaskQueue('sqs://queue-name', green=True) 359 | tq.insert(tsks) 360 | 361 | tasks = [ PrintTaskIterator(0, 100), PrintTaskIterator(100, 200) ] 362 | with ProcessPoolExecutor(max_workers=2) as execute: 363 | execute.map(upload, tasks) 364 | ``` 365 | 366 | If you insist on passing generators to your subprocesses, you can use iterators instead. The construction above allows us to write the generator call up front, pass only a few primitives through the pickling process, and transparently call the generator on the other side. We can even support the `len()` function, which is not available for generators. 367 | 368 | ```python 369 | # Listing 7: Easy Multiprocessing 370 | 371 | import gevent.monkey 372 | gevent.monkey.patch_all(thread=False) 373 | import copy 374 | from taskqueue import TaskQueue 375 | 376 | class PrintTaskIterator(object): 377 | def __init__(self, start, end): 378 | self.start = start 379 | self.end = end 380 | def __getitem__(self, slc): 381 | itr = copy.deepcopy(self) 382 | itr.start = self.start + slc.start 383 | itr.end = self.start + slc.stop 384 | return itr 385 | def __len__(self): 386 | return self.end - self.start 387 | def __iter__(self): 388 | for i in range(self.start, self.end): 389 | yield PrintTask(i) 390 | 391 | tq = TaskQueue('sqs://queue-name', green=True) 392 | tq.insert(PrintTaskIterator(0,200), parallel=2) 393 | ``` 394 | 395 | If you design your iterators such that the slice operator works, TaskQueue can 396 | automatically resection the iterator such that it can be fed to multiple processes. Notably, we don't return `PrintTaskIterator(self.start+slc.start, self.start+slc.stop)` because it triggers an endless recursion during pickling. However, the runtime copy implementation above sidesteps this issue. Internally, `PrintTaskIterator(0,200)` will be turned into `[ PrintTaskIterator(0,100), PrintTaskIterator(100,200) ]`. We also track exceptions raised by child processes in a queue. `gevent.monkey.patch_all(thread=False)` was necessary to avoid multiprocess hanging. 397 | 398 | [1] You can't pass generators in CPython but [you can pass iterators](https://stackoverflow.com/questions/1939015/singleton-python-generator-or-pickle-a-python-generator/1939493#1939493). You can pass generators if you use PyPy or Stackless Python. 399 | 400 | -- 401 | Made with <3.
402 | --------------------------------------------------------------------------------