├── src ├── __init__.py ├── .gitignore └── tests │ ├── test_pypeflow_common.py │ ├── test_pypeflow_data.py │ ├── test_pypeflow_task.py │ └── test_pypeflow_controller.py ├── pwatcher ├── __init__.py ├── mains │ ├── __init__.py │ ├── pwatcher.py │ ├── job_start.sh │ ├── pypeflow_example.py │ ├── query_server.py │ ├── fs_heartbeat.py │ └── network_heartbeat.py └── blocking.py ├── pypeflow ├── mains │ └── __init__.py ├── __init__.py ├── pwatcher_workflow.py ├── sample_tasks.py ├── util.py ├── do_support.py ├── tasks.py ├── io.py └── do_task.py ├── example ├── testdata │ └── .placeholder ├── README.txt ├── test_shutdown.py └── PypeTest.py ├── examples-pwatcher ├── .gitignore ├── ab │ ├── delete.json │ ├── jobs │ │ ├── b │ │ └── c │ ├── query-ab.json │ ├── makefile │ ├── run.json │ └── logging-cfg.json └── README.md ├── doc ├── Example1.png ├── Example2.png ├── modules.rst ├── index.rst ├── pypeflow.rst ├── concurrent_execution.rst ├── introduction.rst ├── rdf_resprentation.rst ├── Makefile ├── examples.rst └── conf.py ├── presentation ├── pipelines.png ├── escher--unbelievable-527581_1024_768.jpg └── pypeFLOW_tutorial.rst ├── .gitignore ├── bamboo-specs ├── .settings │ ├── org.eclipse.m2e.core.prefs │ ├── org.eclipse.core.resources.prefs │ └── org.eclipse.jdt.core.prefs ├── .gitignore ├── src │ ├── test │ │ └── java │ │ │ └── pacbio │ │ │ └── PlanSpecTest.java │ └── main │ │ └── java │ │ └── pacbio │ │ └── PlanSpec.java ├── .hgignore ├── .project ├── .classpath └── pom.xml ├── travis.sh ├── makefile ├── .travis.yml ├── setup.py ├── bamboo_wheel.sh ├── test ├── test_do_task.py └── test_integ.py ├── LICENSE ├── README.rst └── readme.slurm.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pwatcher/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pwatcher/mains/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pypeflow/mains/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/testdata/.placeholder: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | -------------------------------------------------------------------------------- /examples-pwatcher/.gitignore: -------------------------------------------------------------------------------- 1 | pwatched/ 2 | foo/ 3 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/delete.json: -------------------------------------------------------------------------------- 1 | {"which":"infer"} 2 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/jobs/b: -------------------------------------------------------------------------------- 1 | echo 'hi b' 2 | sleep 5 3 | echo 'bye b' 4 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/jobs/c: 
-------------------------------------------------------------------------------- 1 | echo 'hi c' 2 | sleep 100 3 | echo 'bye c' 4 | -------------------------------------------------------------------------------- /doc/Example1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacificBiosciences/pypeFLOW/HEAD/doc/Example1.png -------------------------------------------------------------------------------- /doc/Example2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacificBiosciences/pypeFLOW/HEAD/doc/Example2.png -------------------------------------------------------------------------------- /doc/modules.rst: -------------------------------------------------------------------------------- 1 | pypeflow 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | pypeflow 8 | -------------------------------------------------------------------------------- /presentation/pipelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacificBiosciences/pypeFLOW/HEAD/presentation/pipelines.png -------------------------------------------------------------------------------- /examples-pwatcher/ab/query-ab.json: -------------------------------------------------------------------------------- 1 | { 2 | "which": "list", 3 | "jobids": ["b", "c"], 4 | "~end": {} 5 | } 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /dist/ 3 | /src/pypeflow.egg-info/ 4 | /src/tests/__pycache__/ 5 | /wheelhouse/ 6 | /artifacts/ 7 | /*.xml 8 | .pytest_cache/ 9 | -------------------------------------------------------------------------------- /bamboo-specs/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /presentation/escher--unbelievable-527581_1024_768.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacificBiosciences/pypeFLOW/HEAD/presentation/escher--unbelievable-527581_1024_768.jpg -------------------------------------------------------------------------------- /bamboo-specs/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/test/java=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /pypeflow/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.3.0' # should match setup.py 2 | 3 | try: 4 | import sys, pkg_resources 5 | sys.stderr.write('{}\n'.format(pkg_resources.get_distribution('pypeflow'))) 6 | except Exception: 7 | pass 8 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/makefile: -------------------------------------------------------------------------------- 1 | SHELL:=bash 2 | run: 3 | pwatcher-main run < run.json 4 | pstree -pgsu $${USER} 5 | query: 6 | pwatcher-main query < query-ab.json 7 | delete: 8 | 
pwatcher-main delete <<< $$(echo '{"which":"infer"}') 9 | clean: 10 | rm -rf pwatched *.log 11 | -------------------------------------------------------------------------------- /travis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # -e: fail on error 3 | # -v: show commands 4 | # -x: show expanded commands 5 | set -vex 6 | 7 | #env | sort 8 | sudo mkdir -p /tmp 9 | sudo chmod a+wrx /tmp 10 | python setup.py install 11 | nosetests --with-doctest -v pypeflow/ pwatcher/fs_based.py 12 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/run.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobids": { 3 | "c": { 4 | "cmd": "/bin/bash jobs/c", 5 | "rundir": "./jobs" 6 | }, 7 | "b": { 8 | "cmd": "/bin/bash jobs/b", 9 | "rundir": "./jobs" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /bamboo-specs/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 5 | org.eclipse.jdt.core.compiler.release=disabled 6 | org.eclipse.jdt.core.compiler.source=1.8 7 | -------------------------------------------------------------------------------- /pwatcher/mains/pwatcher.py: -------------------------------------------------------------------------------- 1 | from .. import fs_based 2 | import pdb 3 | import sys 4 | 5 | def main(): 6 | fs_based.main(*sys.argv) # pylint: disable=no-value-for-parameter 7 | 8 | # If run directly, rather than via the 'entry-point', 9 | # then pdb will be used. 10 | if __name__ == "__main__": 11 | #pdb.set_trace() 12 | main() 13 | -------------------------------------------------------------------------------- /pypeflow/pwatcher_workflow.py: -------------------------------------------------------------------------------- 1 | from .simple_pwatcher_bridge import ( 2 | PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase, 3 | makePypeLocalFile, fn, PypeTask) 4 | PypeThreadTaskBase = MyFakePypeThreadTaskBase 5 | 6 | __all__ = [ 7 | 'PypeProcWatcherWorkflow', 'PypeThreadTaskBase', 8 | 'makePypeLocalFile', 'fn', 'PypeTask', 9 | ] 10 | -------------------------------------------------------------------------------- /example/README.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Try "python3 PypeTest.py localshell 1" to run some random dependent shell script jobs. 4 | You can stop the Python script and restart it using "python3 PypeTest.py localshell 0". 
5 | 6 | Other simple examples: 7 | 8 | python3 PypeTest.py internal 1 #the task is done within python functions 9 | 10 | python3 PypeTest.py #other simple test 11 | -------------------------------------------------------------------------------- /bamboo-specs/.gitignore: -------------------------------------------------------------------------------- 1 | .credentials 2 | 3 | ### Maven 4 | target/ 5 | pom.xml.tag 6 | pom.xml.releaseBackup 7 | pom.xml.versionsBackup 8 | pom.xml.next 9 | release.properties 10 | dependency-reduced-pom.xml 11 | buildNumber.properties 12 | 13 | ### Java 14 | # Compiled class file 15 | *.class 16 | 17 | # Log file 18 | *.log 19 | 20 | # Package Files # 21 | *.jar 22 | 23 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 24 | hs_err_pid* 25 | 26 | ### Idea 27 | .idea/ 28 | *.iml 29 | -------------------------------------------------------------------------------- /bamboo-specs/src/test/java/pacbio/PlanSpecTest.java: -------------------------------------------------------------------------------- 1 | package pacbio; 2 | 3 | import com.atlassian.bamboo.specs.api.builders.plan.Plan; 4 | import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException; 5 | import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders; 6 | import org.junit.Test; 7 | 8 | public class PlanSpecTest { 9 | @Test 10 | public void checkYourPlanOffline() { 11 | Plan plan = new PlanSpec().createPlan(); 12 | 13 | EntityPropertiesBuilders.build(plan); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | WHEELHOUSE?=wheelhouse 2 | PIP=pip3 wheel --wheel-dir ${WHEELHOUSE} --find-links ${WHEELHOUSE} 3 | MY_TEST_FLAGS?=-v -s --durations=0 4 | 5 | default: 6 | pylint: 7 | pylint --errors-only pypeflow/ pwatcher/ 8 | pytest: 9 | python3 -c 'import pypeflow; print(pypeflow)' 10 | py.test ${MY_TEST_FLAGS} --junit-xml=nosetests.xml --doctest-modules pypeflow/ pwatcher/ test/ 11 | autopep8: 12 | autopep8 --max-line-length=120 -ir -j0 pypeflow/ pwatcher/ 13 | wheel: 14 | which pip3 15 | ${PIP} --no-deps . 
16 | ls -larth ${WHEELHOUSE} 17 | -------------------------------------------------------------------------------- /bamboo-specs/.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | .credentials 4 | 5 | ### Maven 6 | target/ 7 | pom.xml.tag 8 | pom.xml.releaseBackup 9 | pom.xml.versionsBackup 10 | pom.xml.next 11 | release.properties 12 | dependency-reduced-pom.xml 13 | buildNumber.properties 14 | 15 | ### Java 16 | # Compiled class file 17 | *.class 18 | 19 | # Log file 20 | *.log 21 | 22 | # Package Files # 23 | *.jar 24 | 25 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 26 | hs_err_pid* 27 | 28 | ### Idea 29 | .idea/ 30 | *.iml 31 | -------------------------------------------------------------------------------- /pypeflow/sample_tasks.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import logging 4 | from .tasks import gen_task 5 | from .simple_pwatcher_bridge import ( 6 | PypeLocalFile, makePypeLocalFile, fn, 7 | PypeTask, #Dist, 8 | ) 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | def create_task(i1, o1): 13 | script = """ 14 | cat {input.i1} > {output.o1} 15 | """ 16 | return gen_task( 17 | script=script, 18 | inputs={ 19 | 'i1': i1, 20 | }, 21 | outputs={ 22 | 'o1': o1, 23 | }, 24 | parameters={}, 25 | ) 26 | -------------------------------------------------------------------------------- /pypeflow/util.py: -------------------------------------------------------------------------------- 1 | """Old stuff 2 | Prefer io.py now. 3 | """ 4 | import logging 5 | import os 6 | from .io import (cd, touch, mkdirs, syscall as system) 7 | 8 | LOG = logging.getLogger() 9 | 10 | def run(script_fn): 11 | cwd, basename = os.path.split(script_fn) 12 | with cd(cwd): 13 | system('/bin/bash {}'.format(basename)) 14 | def rmdirs(path): 15 | if os.path.isdir(path): 16 | if len(path) < 20 and 'home' in path: 17 | LOG.error('Refusing to rm {!r} since it might be your homedir.'.format(path)) 18 | return 19 | cmd = 'rm -rf {}'.format(path) 20 | system(cmd) 21 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. PypeFlow documentation master file, created by 2 | sphinx-quickstart on Tue Jan 10 21:13:17 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | pypeFLOW 7 | ==================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | introduction 15 | installation 16 | examples 17 | concurrent_execution 18 | rdf_resprentation 19 | modules 20 | 21 | 22 | Indices and tables 23 | ================== 24 | 25 | * :ref:`genindex` 26 | * :ref:`modindex` 27 | * :ref:`search` 28 | 29 | -------------------------------------------------------------------------------- /bamboo-specs/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | bamboo-specs 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /doc/pypeflow.rst: -------------------------------------------------------------------------------- 1 | pypeflow Package 2 | ================ 3 | 4 | :mod:`common` Module 5 | -------------------- 6 | 7 | .. automodule:: pypeflow.common 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`controller` Module 13 | ------------------------ 14 | 15 | .. automodule:: pypeflow.controller 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | :mod:`data` Module 21 | ------------------ 22 | 23 | .. automodule:: pypeflow.data 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | :mod:`task` Module 29 | ------------------ 30 | 31 | .. automodule:: pypeflow.task 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Build matrix / environment variables are explained on: 2 | # http://about.travis-ci.org/docs/user/build-configuration/ 3 | # This file can be validated on: 4 | # http://lint.travis-ci.org/ 5 | 6 | #before_install: sudo apt-get install -y cmake 7 | # cmake is pre-installed in Travis for both linux and osx 8 | 9 | #before_install: 10 | # - sudo apt-get update -qq 11 | # - sudo apt-get install -qq valgrind 12 | sudo: required 13 | os: 14 | - linux 15 | language: python 16 | #compiler: 17 | # - gcc 18 | # - clang 19 | script: ./travis.sh 20 | #env: 21 | # matrix: 22 | # - SHARED_LIB=ON STATIC_LIB=ON CMAKE_PKG=ON BUILD_TYPE=release VERBOSE_MAKE=false 23 | # - SHARED_LIB=OFF STATIC_LIB=ON CMAKE_PKG=OFF BUILD_TYPE=debug VERBOSE_MAKE=true VERBOSE 24 | notifications: 25 | email: false 26 | -------------------------------------------------------------------------------- /doc/concurrent_execution.rst: -------------------------------------------------------------------------------- 1 | Concurrent Execution 2 | ====================== 3 | 4 | ``PypeThreadTaskBase`` provides the base class for tasks that can 5 | be run concurrently. If a task is built with ``PypeThreadTaskBase``, 6 | it has to be used with ``PypeThreadWorkflow``, and all other tasks 7 | in the workflow should be ``PypeThreadTaskBase`` objects too. We simply 8 | use Python threads for concurrent tasks. Due to the Python GIL, it is 9 | not recommended to implement compute-intensive work as Python-function tasks. 10 | The main purpose of ``PypeThreadTaskBase`` is to build tasks that wrap 11 | shell commands for running locally or through a cluster environment. 
12 | In the future, it should be possible to add multiprocessing-based support 13 | for computation-intensive Python functions as tasks, avoiding the GIL. 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/tests/test_pypeflow_common.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from nose import SkipTest 3 | 4 | class TestPypeObject: 5 | def TestRDFXML(self): 6 | # pype_object = PypeObject(URL, **attributes) 7 | # assert_equal(expected, pype_object.RDFXML()) 8 | raise SkipTest # TODO: implement your test here 9 | 10 | def test___init__(self): 11 | # pype_object = PypeObject(URL, **attributes) 12 | raise SkipTest # TODO: implement your test here 13 | 14 | class TestRunShellCmd: 15 | def test_run_shell_cmd(self): 16 | # assert_equal(expected, runShellCmd(args, **kwargs)) 17 | raise SkipTest # TODO: implement your test here 18 | 19 | class TestRunSgeSyncJob: 20 | def test_run_sge_sync_job(self): 21 | # assert_equal(expected, runSgeSyncJob(args)) 22 | raise SkipTest # TODO: implement your test here 23 | 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, find_packages 2 | import subprocess 3 | 4 | try: 5 | local_version = '+git.{}'.format( 6 | subprocess.check_output('git rev-parse HEAD', shell=True, encoding='utf8')) 7 | except Exception: 8 | local_version = '' 9 | 10 | setup( 11 | name = 'pypeflow', 12 | version='2.3.0' + local_version, # should match __init__.py 13 | author='J. Chin', 14 | author_email='cschin@infoecho.net', 15 | license='LICENSE.txt', 16 | packages=find_packages(), 17 | package_dir = {'':'.'}, 18 | zip_safe = False, 19 | install_requires=[ 20 | 'networkx >=1.9.1', 21 | ], 22 | entry_points = {'console_scripts': [ 23 | 'pwatcher-main=pwatcher.mains.pwatcher:main', 24 | 'pwatcher-pypeflow-example=pwatcher.mains.pypeflow_example:main', 25 | 'heartbeat-wrapper=pwatcher.mains.fs_heartbeat:main', 26 | ], 27 | }, 28 | package_data={'pwatcher.mains': ['*.sh']} 29 | ) 30 | -------------------------------------------------------------------------------- /bamboo_wheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | type module >& /dev/null || . /mnt/software/Modules/current/init/bash 3 | module purge 4 | module load gcc 5 | 6 | set -vex 7 | ls -larth .. 8 | ls -larth 9 | pwd 10 | 11 | export WHEELHOUSE=./wheelhouse 12 | mkdir -p ${WHEELHOUSE} 13 | 14 | # Give everybody read/write access. 15 | umask 0000 16 | 17 | 18 | module load python/3.7.3 19 | make wheel 20 | 21 | # http://bamboo.pacificbiosciences.com:8085/build/admin/edit/defaultBuildArtifact.action?buildKey=SAT-TAGDEPS-JOB1 22 | # For old artifact config: 23 | mkdir -p ./artifacts/gcc-6.4.0/wheelhouse 24 | rsync -av ${WHEELHOUSE}/pypeflow*.whl artifacts/gcc-6.4.0/wheelhouse/ 25 | 26 | 27 | # Select export dir based on Bamboo branch, but only for develop and master. 
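# (Branches other than develop/master fall through the case below unmatched and are not exported to the shared wheelhouse.)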
28 | case "${bamboo_planRepository_branchName}" in 29 | develop|master) 30 | WHEELHOUSE="/mnt/software/p/python/wheelhouse/${bamboo_planRepository_branchName}/" 31 | rsync -av ./wheelhouse/ ${WHEELHOUSE} 32 | ;; 33 | *) 34 | ;; 35 | esac 36 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/logging-cfg.json: -------------------------------------------------------------------------------- 1 | { 2 | "disable_existing_loggers": false, 3 | "filters": {}, 4 | "formatters": { 5 | "format_brief": { 6 | "format": "%(levelname)s: %(message)s" 7 | }, 8 | "format_full": { 9 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 10 | } 11 | }, 12 | "handlers": { 13 | "handler_file_all": { 14 | "class": "logging.FileHandler", 15 | "filename": "workflow.log", 16 | "formatter": "format_full", 17 | "level": "DEBUG", 18 | "mode": "w" 19 | }, 20 | "handler_stream": { 21 | "class": "logging.StreamHandler", 22 | "formatter": "format_brief", 23 | "level": "INFO", 24 | "stream": "ext://sys.stderr" 25 | } 26 | }, 27 | "loggers": { 28 | "": { 29 | "handlers": [ 30 | "handler_file_all", 31 | "handler_stream" 32 | ], 33 | "level": "NOTSET" 34 | } 35 | }, 36 | "root": { 37 | }, 38 | "version": 1 39 | } 40 | -------------------------------------------------------------------------------- /bamboo-specs/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /test/test_do_task.py: -------------------------------------------------------------------------------- 1 | from pypeflow import do_task as M 2 | import pytest 3 | 4 | testdata = [ 5 | # no subs 6 | ({}, {}, {}, 7 | """\ 8 | echo hello 9 | """, 10 | """\ 11 | echo hello 12 | """), 13 | # simple subs (with quoting) 14 | ({'ii': 'II'}, {'oo': 'O O'}, {'pp': 'PP DO NOT QUOTE'}, 15 | """\ 16 | echo {input.ii} 17 | echo {output.oo} 18 | echo {params.pp} 19 | """, 20 | """\ 21 | echo II 22 | echo 'O O' 23 | echo PP DO NOT QUOTE 24 | """), 25 | # input.ALL 26 | ({'ii': 'II', 'ij': 'IJ'}, {'oo': 'OO'}, {'pp': 'PP'}, 27 | """\ 28 | echo {input.ALL} 29 | echo {output.oo} 30 | echo {params.pp} 31 | """, 32 | """\ 33 | echo II IJ 34 | echo OO 35 | echo PP 36 | """), 37 | # input.i* (with quoting) 38 | ({'ii': 'II', 'ij': 'I J', 'ia': 'IA', 'ab': 'AB'}, {'oo': 'OO'}, {'pp': 'PP'}, 39 | """\ 40 | echo {input.i*} 41 | echo {input.ab} 42 | echo {output.oo} 43 | echo {params.pp} 44 | """, 45 | """\ 46 | echo 'I J' IA II 47 | echo AB 48 | echo OO 49 | echo PP 50 | """), 51 | ] 52 | 53 | @pytest.mark.parametrize("args", testdata) 54 | def test_sub(args): 55 | myi, myo, myp, t, expected = args 56 | got = M.sub(t, myi, myo, myp) 57 | assert expected == got 58 | -------------------------------------------------------------------------------- /pwatcher/mains/job_start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: filetype=sh ts=4 sw=4 sts=4 et: 3 | # 4 | # Wait until file exists, then spawn. 5 | 6 | # This is not Python because the start_tmpl from pbsmrtpipe always runs bash. 7 | # But we use the .py extension because we want this installed with our Python 8 | # code, so we do not need to deal with mobs for installation. (But we might 9 | # need to chmod +x.) 
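# In short: the loop below polls until ${PYPEFLOW_JOB_START_SCRIPT} becomes executable (waiting up to ${PYPEFLOW_JOB_START_TIMEOUT} seconds, 60 by default) and then runs it with /bin/bash.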
10 | # 11 | # This can be run via 12 | # 13 | # bash -c pwatcher/mains.job_start.py myprog 60 14 | # 15 | # Note: If anyone replaces this, you must ensure that running this is exactly equivalent 16 | # to running the "executable". In other words, no 'mkdir', no 'cd', etc. That will help 17 | # with debugging. 18 | 19 | set -vex 20 | executable=${PYPEFLOW_JOB_START_SCRIPT} 21 | timeout=${PYPEFLOW_JOB_START_TIMEOUT:-60} # wait 60s by default 22 | 23 | # Wait up to timeout seconds for the executable to become "executable", 24 | # then exec. 25 | #timeleft = int(timeout) 26 | while [[ ! -x "${executable}" ]]; do 27 | if [[ "${timeout}" == "0" ]]; then 28 | echo "timed out waiting for (${executable})" 29 | exit 77 30 | fi 31 | echo "not executable: '${executable}', waiting ${timeout}s" 32 | sleep 1 33 | timeout=$((timeout-1)) 34 | done 35 | 36 | /bin/bash ${executable} 37 | -------------------------------------------------------------------------------- /bamboo-specs/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | 6 | com.atlassian.bamboo 7 | bamboo-specs-parent 8 | 6.7.1 9 | 10 | 11 | 12 | com.pacb 13 | bamboo-specs 14 | 1.0.0-SNAPSHOT 15 | jar 16 | 17 | 18 | 19 | com.atlassian.bamboo 20 | bamboo-specs-api 21 | 22 | 23 | com.atlassian.bamboo 24 | bamboo-specs 25 | 26 | 27 | 28 | 29 | junit 30 | junit 31 | test 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /pypeflow/do_support.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import os 4 | import string 5 | import io 6 | LOG = logging.getLogger(__name__) 7 | BASH = '/bin/bash' 8 | 9 | # This is used by some programs in falcon_kit/mains. 10 | simple_logging_config = """ 11 | [loggers] 12 | keys=root 13 | 14 | [handlers] 15 | keys=stream 16 | 17 | [formatters] 18 | keys=form01,form02 19 | 20 | [logger_root] 21 | level=NOTSET 22 | handlers=stream 23 | 24 | [handler_stream] 25 | class=StreamHandler 26 | level=${FALCON_LOG_LEVEL} 27 | formatter=form01 28 | args=(sys.stderr,) 29 | 30 | [formatter_form01] 31 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s 32 | 33 | [formatter_form02] 34 | format=[%(levelname)s]%(message)s 35 | """ 36 | def setup_simple_logging(FALCON_LOG_LEVEL='DEBUG', **ignored): 37 | cfg = string.Template(simple_logging_config).substitute(FALCON_LOG_LEVEL=FALCON_LOG_LEVEL) 38 | logger_fileobj = io.StringIO(cfg) 39 | defaults = {} 40 | logging.config.fileConfig(logger_fileobj, defaults=defaults, disable_existing_loggers=False) 41 | 42 | def run_bash(script_fn): 43 | # Assume script was written by this program, so we know it is 44 | # available in the filesystem. 45 | # However, we cannot be sure that the execute permission is set, 46 | # so run it as a script. 47 | cmd = '{} -vex {}'.format(BASH, script_fn) 48 | LOG.info('!{}'.format(cmd)) 49 | rc = os.system(cmd) 50 | if rc: 51 | raise Exception('{} <- {!r}'.format(rc, cmd)) 52 | -------------------------------------------------------------------------------- /test/test_integ.py: -------------------------------------------------------------------------------- 1 | from pypeflow.simple_pwatcher_bridge import ( 2 | PypeProcWatcherWorkflow, 3 | PRODUCERS, 4 | ) 5 | from pypeflow import sample_tasks 6 | from pypeflow import util 7 | import os 8 | 9 | def setup_workflow(): 10 | PRODUCERS.clear() # Forget any PypeTasks already defined. 
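    # A sketch of what the configuration below means: 'submit' is the shell template used to launch each task (pypeflow fills in ${CMD}), 'pwatcher_type' selects the blocking process-watcher, and 'njobs' presumably caps how many submissions run at once.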
11 | 12 | job_defaults = { 13 | 'job_type': 'string', 14 | #'submit': 'bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}', 15 | 'submit': 'bash -C ${CMD}', 16 | #'JOB_OPTS': '-pe smp 8 -q bigmem', 17 | 'pwatcher_type': 'blocking', 18 | #'pwatcher_directory': config.get('pwatcher_directory', 'mypwatcher'), 19 | #'use_tmpdir': '/scratch', 20 | 'njobs': 4, 21 | } 22 | wf = PypeProcWatcherWorkflow( 23 | job_defaults=job_defaults, 24 | ) 25 | return wf 26 | 27 | def try_workflow(text, create_task): 28 | """Test the whole workflow. 29 | 'text' is anything. 30 | 'create_tasks' signature: create_task(i1, o1) 31 | """ 32 | wf = setup_workflow() 33 | wf.max_jobs = 2 34 | 35 | i1 = './in/i1' 36 | o1 = './run/dir1/o1.txt' 37 | util.mkdirs('in/') 38 | with open('in/i1', 'w') as ofs: 39 | ofs.write(text) 40 | assert os.path.exists(i1) 41 | assert not os.path.exists(o1) 42 | 43 | task = create_task(i1, o1) 44 | wf.addTask(task) 45 | wf.refreshTargets() 46 | 47 | assert os.path.exists(o1) 48 | assert text == open(o1).read() 49 | 50 | def test_new(tmpdir): 51 | with tmpdir.as_cwd(): 52 | try_workflow('bash-based', sample_tasks.create_task) 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Clear BSD License 2 | 3 | Copyright (c) 2016, Jason Chin 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted (subject to the limitations in the disclaimer 8 | below) provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its contributors may be used 18 | to endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS 22 | LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 26 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 28 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 | OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 32 | DAMAGE. 33 | -------------------------------------------------------------------------------- /examples-pwatcher/README.md: -------------------------------------------------------------------------------- 1 | ## pwatcher 2 | Filesystem-based process-watcher. 3 | 4 | Sometimes, the filesystem is the only reliable way to communicate between 5 | processes on different machines. **pwatcher** will watch for 6 | sentinels and heartbeats. 
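For a quick taste of the input format, a `run` request is just JSON mapping job-ids to commands, e.g. `{"jobids": {"b": {"cmd": "/bin/bash jobs/b", "rundir": "./jobs"}}}` (trimmed from `examples-pwatcher/ab/run.json` in this repo; a full walk-through appears below).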
7 | 8 | Two basic ideas: 9 | 10 | 1. To store sentinel-files in a single directory, in order to reduce 11 | the burden on the filesystem. 12 | 2. To use a background thread to update heartbeat-files periodically, 13 | in order to avoid waiting forever on dead jobs. 14 | 15 | ## API 16 | **pwatcher** can be used as a separate process or as a Python module. 17 | If you use it as a module, you should use the contextmanager in order 18 | to release locks quickly. That way, users can query via the command-line 19 | even while a large job is ongoing. 20 | 21 | There are three commands in the API. 22 | 23 | 1. `run` 24 | 2. `query` 25 | 3. `delete` 26 | 27 | They all can be called from the command-line by supplying the arguments as JSON. 28 | 29 | ### Examples 30 | #### Using **pwatcher** 31 | ``` 32 | pip install -e . 33 | cd examples-pwatcher/ab 34 | pwatcher-main run < run.json 35 | pwatcher-main query < query.json 36 | pwatcher-main delete < delete.json 37 | ls pwatched/ 38 | ``` 39 | #### pypeFLOW example 40 | To run this example, you must first install **pypeFLOW**. 41 | ``` 42 | mkdir foo 43 | cd foo 44 | pypeflow_example 45 | ``` 46 | That should create: 47 | * directory `mytmp` 48 | * for pypeflow outputs 49 | * directory `watched` 50 | * `state.py` 51 | * wrappers 52 | * sentinel-files, touched on exit 53 | * heartbeat-files, usually removed when done 54 | * some basic taskrunners 55 | 56 | ### Plans 57 | The API needs a bit of clean-up, but the basic functionality is there. 58 | I still have to inject the grid-control commands. 59 | 60 | I hope to replace **FALCON**'s `fc_run.py` soon! 61 | -------------------------------------------------------------------------------- /pypeflow/tasks.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import collections 4 | import logging 5 | import os 6 | import pprint 7 | from .simple_pwatcher_bridge import (PypeTask, Dist) 8 | from . import io 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | def task_generic_bash_script(self): 14 | """Generic script task. 15 | The script template should be in 16 | self.bash_template 17 | The template will be substituted by 18 | the content of "self" and of "self.parameters". 19 | (That is a little messy, but good enough for now.) 20 | """ 21 | self_dict = dict() 22 | self_dict.update(self.__dict__) 23 | self_dict.update(self.parameters) 24 | script_unsub = self.bash_template 25 | script = script_unsub.format(**self_dict) 26 | script_fn = 'script.sh' 27 | with open(script_fn, 'w') as ofs: 28 | ofs.write(script) 29 | self.generated_script_fn = script_fn 30 | 31 | 32 | def gen_task(script, inputs, outputs, parameters=None, dist=None): 33 | """ 34 | dist is used in two ways: 35 | 1) in the pwatcher, to control job-distribution 36 | 2) as additional parameters: 37 | - params.pypeflow_nproc 38 | - params.pypeflow_mb 39 | """ 40 | if parameters is None: 41 | parameters = dict() 42 | if dist is None: 43 | dist = Dist() 44 | LOG.debug('gen_task({}\n\tinputs={!r},\n\toutputs={!r})'.format( 45 | script, inputs, outputs)) 46 | parameters = dict(parameters) # copy 47 | parameters['pypeflow_nproc'] = dist.pypeflow_nproc 48 | parameters['pypeflow_mb'] = dist.pypeflow_mb 49 | LOG.debug(' parameters={}'.format( 50 | pprint.pformat(parameters))) 51 | LOG.debug(' dist.job_dict={}'.format( 52 | pprint.pformat(dist.job_dict))) 53 | def validate_dict(mydict): 54 | "Python identifiers are illegal as keys." 
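        # namedtuple() is used purely for validation: it raises ValueError when a key cannot serve as a field name (i.e. is not a valid Python identifier).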
55 | try: 56 | collections.namedtuple('validate', list(mydict.keys())) 57 | except ValueError as exc: 58 | LOG.exception('Bad key name in task definition dict {!r}'.format(mydict)) 59 | raise 60 | validate_dict(inputs) 61 | validate_dict(outputs) 62 | validate_dict(parameters) 63 | make_task = PypeTask( 64 | inputs={k: v for k,v in inputs.items()}, 65 | outputs={k: v for k,v in outputs.items()}, 66 | parameters=parameters, 67 | bash_template=script, 68 | dist=dist, 69 | ) 70 | return make_task(task_generic_bash_script) 71 | -------------------------------------------------------------------------------- /doc/introduction.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Introduction 3 | ============ 4 | 5 | 6 | What is pypeFLOW 7 | ================ 8 | 9 | pypeFLOW is a lightweight and reusable make-/flow-like data-processing 10 | library written in Python. 11 | 12 | Most bioinformatics analysis, or general data analysis, 13 | includes various steps: combining data files, transforming 14 | files between different formats and calculating statistics 15 | with a variety of tools. Ian Holmes has a great summary and 16 | opinions about bioinformatics workflows at 17 | http://biowiki.org/BioinformaticsWorkflows. It is 18 | interesting that such an analysis workflow is really similar to 19 | constructing software without an IDE in general. Using a 20 | "makefile" file for managing a bioinformatics analysis 21 | workflow is actually great for generating reproducible and 22 | reusable analysis procedures. Combined with a proper 23 | version control tool, one will be able to work 24 | with a divergent set of data and tools over a period of time 25 | for a project, especially when there are complicated 26 | dependencies between the data, tools and customized code 27 | for the analysis tasks. 28 | 29 | However, using "make" and "makefile" implies all data 30 | analysis steps are done by some command line tools. If you 31 | have some customized analysis tasks, you will have to write 32 | some scripts and make them into command line tools. In 33 | my personal experience, I find it is convenient to bypass 34 | that burden and to combine those quick and simple steps in a 35 | single script. The only caveat is that if an analyst does 36 | not save the results of any intermediate steps, he or she 37 | has to repeat the computation all over again for every step 38 | from the beginning. This will waste a lot of computation 39 | cycles and personal time. Well, the solution is simple: 40 | just like the traditional software building process, one 41 | has to track the dependencies, analyze them and only 42 | reprocess those parts that are necessary to get the most 43 | up-to-date final results. 
44 | 45 | General Design Principles 46 | ========================= 47 | 48 | - Explicitly modeling data and task dependencies 49 | - Support a declarative programming style within Python while 50 | maintaining what imperative programming does 51 | best 52 | - Utilize an RDF meta-data framework 53 | - Keep it simple if possible 54 | 55 | Features 56 | ======== 57 | 58 | - Scheduling and running of multiple concurrent tasks 59 | - Support for tasks as simple shell scripts (with cluster 60 | job submission in mind) 61 | - Reasonably simple interface for declarative programming 62 | 63 | -------------------------------------------------------------------------------- /example/test_shutdown.py: -------------------------------------------------------------------------------- 1 | 2 | # @author Jason Chin 3 | # 4 | # Copyright (C) 2010 by Jason Chin 5 | # Copyright (C) 2011 by Jason Chin 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 
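# This example registers one PypeTask in a PypeThreadWorkflow. The task loops for roughly a second, checking self.shutdown_event on each iteration; if the event is set it exits without creating its output, otherwise it touches /tmp/test1_output so the workflow can complete.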
24 | 25 | import sys 26 | import os 27 | 28 | 29 | from pypeflow.common import * 30 | from pypeflow.task import PypeThreadTaskBase, PypeTaskBase 31 | from pypeflow.task import PypeTask, PypeShellTask, PypeSGETask, PypeDistributibleTask 32 | from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow, PypeMPWorkflow 33 | from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn 34 | import logging 35 | import time 36 | 37 | logger = logging.getLogger() 38 | #logger.setLevel(logging.INFO) 39 | logger.setLevel(logging.DEBUG) 40 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 41 | ch = logging.StreamHandler() 42 | ch.setLevel(logging.DEBUG) 43 | ch.setFormatter(formatter) 44 | logger.addHandler(ch) 45 | 46 | inputs = {"input": makePypeLocalFile("/tmp/test1_input")} 47 | outputs = {"output": makePypeLocalFile("/tmp/test1_output")} 48 | os.system("touch /tmp/test1_input") 49 | 50 | @PypeTask(inputs = inputs, outputs = outputs, TaskType = PypeThreadTaskBase) 51 | def f(self): 52 | i = 0 53 | while 1: 54 | time.sleep(0.1) 55 | if self.shutdown_event != None and self.shutdown_event.is_set(): 56 | break 57 | if i > 10: 58 | break 59 | i += 1 60 | if self.shutdown_event == None or not self.shutdown_event.is_set(): 61 | os.system("touch %s" % fn(self.output)) 62 | 63 | wf = PypeThreadWorkflow() 64 | wf.addTasks([f]) 65 | wf.refreshTargets() 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | What is pypeFLOW 2 | ================ 3 | 4 | pypeFLOW is a lightweight and reusable make-/flow-like data-processing 5 | library written in Python. 6 | 7 | Most bioinformatics analysis, or general data analysis, 8 | includes various steps: combining data files, transforming 9 | files between different formats and calculating statistics 10 | with a variety of tools. Ian Holmes has a great summary and 11 | opinions about bioinformatics workflows at 12 | http://biowiki.org/BioinformaticsWorkflows. It is 13 | interesting that such an analysis workflow is really similar to 14 | constructing software without an IDE in general. Using a 15 | "makefile" file for managing a bioinformatics analysis 16 | workflow is actually great for generating reproducible and 17 | reusable analysis procedures. Combined with a proper 18 | version control tool, one will be able to work 19 | with a divergent set of data and tools over a period of time 20 | for a project, especially when there are complicated 21 | dependencies between the data, tools and customized code 22 | for the analysis tasks. 23 | 24 | However, using "make" and "makefile" implies all data 25 | analysis steps are done by some command line tools. If you 26 | have some customized analysis tasks, you will have to write 27 | some scripts and make them into command line tools. In 28 | my personal experience, I find it is convenient to bypass 29 | that burden and to combine those quick and simple steps in a 30 | single script. The only caveat is that if an analyst does 31 | not save the results of any intermediate steps, he or she 32 | has to repeat the computation all over again for every step 33 | from the beginning. This will waste a lot of computation 34 | cycles and personal time. 
Well, the solution is simple: 35 | just like the traditional software building process, one 36 | has to track the dependencies, analyze them and only 37 | reprocess those parts that are necessary to get the most 38 | up-to-date final results. 39 | 40 | General Design Principles 41 | ========================= 42 | 43 | - Explicitly modeling data and task dependencies 44 | - Support a declarative programming style within Python while 45 | maintaining what imperative programming does 46 | best 47 | - Utilize an RDF meta-data framework 48 | - Keep it simple if possible 49 | 50 | Features 51 | ======== 52 | 53 | - Scheduling and running of multiple concurrent tasks 54 | - Support for tasks as simple shell scripts (with cluster 55 | job submission in mind) 56 | - Reasonably simple interface for declarative programming 57 | 58 | General Installation 59 | ==================== 60 | 61 | pypeFLOW uses the standard Python setup.py for installation:: 62 | 63 | python setup.py install 64 | 65 | Once installed, brief documentation can be generated by:: 66 | 67 | cd doc 68 | make html 69 | 70 | The generated Sphinx HTML documentation can be viewed by pointing your web browser 71 | to ``_build/html/index.html`` in the ``doc`` directory. 72 | 73 | DISCLAIMER 74 | ---------- 75 | THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THIS SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR APPLICATIONS. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES. 76 | -------------------------------------------------------------------------------- /pwatcher/mains/pypeflow_example.py: -------------------------------------------------------------------------------- 1 | from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase, 2 | makePypeLocalFile, fn, PypeTask) 3 | import json 4 | import logging.config 5 | import os 6 | import sys 7 | 8 | JOB_TYPE = os.environ.get('JOB_TYPE', 'local') 9 | SLEEP_S = os.environ.get('SLEEP_S', '1') 10 | log = logging.getLogger(__name__) 11 | 12 | def spawn(args, check=False): 13 | cmd = args[0] 14 | log.debug('$(%s %s)' %(cmd, repr(args))) 15 | rc = os.spawnv(os.P_WAIT, cmd, args) # spawnvp for PATH lookup 16 | msg = "Call %r returned %d." % (cmd, rc) 17 | if rc: 18 | log.warning(msg) 19 | if check: 20 | raise Exception(msg) 21 | else: 22 | log.debug(msg) 23 | return rc 24 | def system(call, check=False): 25 | log.debug('$(%s)' %repr(call)) 26 | rc = os.system(call) 27 | msg = "Call %r returned %d." 
% (call, rc) 28 | if rc: 29 | log.warning(msg) 30 | if check: 31 | raise Exception(msg) 32 | else: 33 | log.debug(msg) 34 | return rc 35 | def makedirs(d): 36 | if not os.path.isdir(d): 37 | os.makedirs(d) 38 | def taskrun0(self): 39 | template = """ 40 | sleep_s=%(sleep_s)s 41 | ofile=%(ofile)s 42 | 43 | set -vex 44 | echo start0 45 | sleep ${sleep_s} 46 | touch ${ofile} 47 | echo end0 48 | """ 49 | bash = template %dict( 50 | #ifile=fn(self.i0), 51 | ofile=fn(self.f0), 52 | sleep_s=self.parameters['sleep_s'], 53 | ) 54 | log.debug('taskrun0 bash:\n' + bash) 55 | script = 'taskrun0.sh' 56 | with open(script, 'w') as ofs: 57 | ofs.write(bash) 58 | #system("bash {}".format(script), check=True) 59 | #spawn(['/bin/bash', script], check=True) # Beware! Hard to kill procs. 60 | self.generated_script_fn = script 61 | return script 62 | def taskrun1(self): 63 | template = """ 64 | sleep_s=%(sleep_s)s 65 | ifile=%(ifile)s 66 | ofile=%(ofile)s 67 | 68 | set -vex 69 | echo start1 70 | sleep ${sleep_s} 71 | cp -f ${ifile} ${ofile} 72 | echo end1 73 | """ 74 | bash = template %dict( 75 | ifile=fn(self.f0), 76 | ofile=fn(self.f1), 77 | sleep_s=self.parameters['sleep_s'], 78 | ) 79 | log.debug('taskrun1 bash:\n' + bash) 80 | script = 'taskrun1.sh' 81 | with open(script, 'w') as ofs: 82 | ofs.write(bash) 83 | #system("bash {}".format(script), check=True) 84 | self.generated_script_fn = script 85 | return script 86 | 87 | def main(): 88 | lfn = 'logging-cfg.json' 89 | if os.path.exists(lfn): 90 | logging.config.dictConfig(json.load(open(lfn))) 91 | else: 92 | logging.basicConfig() 93 | logging.getLogger().setLevel(logging.NOTSET) 94 | try: 95 | import logging_tree 96 | logging_tree.printout() 97 | except ImportError: 98 | pass 99 | log.debug('DEBUG LOGGING ON') 100 | log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format( 101 | JOB_TYPE, SLEEP_S)) 102 | exitOnFailure=False 103 | concurrent_jobs=2 104 | Workflow = PypeProcWatcherWorkflow 105 | wf = Workflow(job_type=JOB_TYPE) 106 | wf.max_jobs = concurrent_jobs 107 | 108 | par = dict(sleep_s=SLEEP_S) 109 | DIR ='mytmp' 110 | makedirs(DIR) 111 | f0 = makePypeLocalFile('mytmp/f0') 112 | f1 = makePypeLocalFile('mytmp/f1') 113 | make_task = PypeTask( 114 | inputs = {}, 115 | outputs = {'f0': f0}, 116 | parameters = par, 117 | ) 118 | task = make_task(taskrun0) 119 | wf.addTasks([task]) 120 | make_task = PypeTask( 121 | inputs = {'f0': f0}, 122 | outputs = {'f1': f1}, 123 | parameters = par, 124 | ) 125 | task = make_task(taskrun1) 126 | wf.addTasks([task]) 127 | wf.refreshTargets([task]) 128 | #wf.refreshTargets(exitOnFailure=exitOnFailure) 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /pypeflow/io.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import logging 3 | import os 4 | 5 | LOG = logging.getLogger() 6 | 7 | 8 | def mkdirs(*dirnames): 9 | for dirname in dirnames: 10 | if not dirname: 11 | continue # '' => curdir 12 | if not os.path.isdir(dirname): 13 | os.makedirs(dirname) 14 | if len(dirnames) == 1: 15 | LOG.debug('mkdir -p "{}"'.format(dirnames[0])) 16 | 17 | 18 | def syscall(call, nocheck=False): 19 | """Raise Exception in error, unless nocheck==True 20 | """ 21 | LOG.info('$(%s)' % repr(call)) 22 | rc = os.system(call) 23 | msg = 'Call %r returned %d.' 
% (call, rc) 24 | if rc: 25 | LOG.warning(msg) 26 | if not nocheck: 27 | raise Exception(msg) 28 | else: 29 | LOG.debug(msg) 30 | return rc 31 | 32 | 33 | def capture(cmd, nocheck=False): 34 | """Capture output, maybe checking return-code. 35 | Return stdout, fully captured. 36 | Wait for subproc to finish. 37 | Warn if empty. 38 | Raise on non-zero exit-code, unless nocheck. 39 | """ 40 | import subprocess 41 | LOG.info('$ {} >'.format(cmd)) 42 | proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='ascii') 43 | stdout, stderr = proc.communicate() 44 | rc = proc.returncode 45 | if rc: 46 | msg = '{} <- {!r}:\n{}'.format(rc, cmd, stdout) 47 | if nocheck: 48 | LOG.debug(msg) 49 | else: 50 | raise Exception(msg) 51 | assert stderr is None, '{!r} != None'.format(stderr) 52 | output = stdout 53 | if not output: 54 | msg = '{!r} failed to produce any output.'.format(cmd) 55 | LOG.warning(msg) 56 | return output 57 | 58 | 59 | def symlink(src, name, force=True): 60 | if os.path.lexists(name): 61 | os.unlink(name) 62 | os.symlink(src, name) 63 | 64 | 65 | def fix_relative_symlinks(currdir, origdir, recursive=True, relparent='..'): 66 | """ 67 | Fix relative symlinks after cp/rsync, assuming they had 68 | been defined relative to 'origdir'. 69 | If 'recursive', then perform this in all (non-symlinked) sub-dirs also. 70 | Skip relative links that point upward shallower than relparent, and warn. 71 | (Always skip absolute symlinks; we assume those already point to persistent space.) 72 | """ 73 | if recursive: 74 | for dn in os.listdir(currdir): 75 | if not os.path.islink(dn) and os.path.isdir(dn): 76 | fix_relative_symlinks(os.path.join(currdir, dn), os.path.join(origdir, dn), recursive, 77 | os.path.join('..', relparent)) 78 | for fn in os.listdir(currdir): 79 | fn = os.path.join(currdir, fn) 80 | if not os.path.islink(fn): 81 | continue 82 | oldlink = os.readlink(fn) 83 | if os.path.isabs(oldlink): 84 | continue 85 | if not os.path.normpath(oldlink).startswith(relparent): 86 | msg = 'Symlink {}->{} seems to point within the origdir tree. This is unexpected. relparent={}'.format( 87 | fn, oldlink, relparent) 88 | raise Exception(msg) 89 | #LOG.warning(msg) 90 | #continue 91 | newlink = os.path.relpath(os.path.join(origdir, oldlink), currdir) 92 | LOG.debug('Fix symlink to {!r} from {!r}'.format(newlink, oldlink)) 93 | symlink(newlink, fn) 94 | 95 | 96 | def rm(*f): 97 | syscall('rm -f {}'.format(' '.join(f))) 98 | 99 | 100 | def touch(*paths): 101 | msg = 'touch {!r}'.format(paths) 102 | LOG.debug(msg) 103 | for path in paths: 104 | if os.path.exists(path): 105 | os.utime(path, None) 106 | else: 107 | open(path, 'a').close() 108 | 109 | 110 | def filesize(fn): 111 | """In bytes. 112 | Raise if fn does not exist. 
113 | """ 114 | return os.stat(fn).st_size 115 | 116 | 117 | def exists_and_not_empty(fn): 118 | if not os.path.exists(fn): 119 | return False 120 | if 0 == filesize(fn): 121 | LOG.debug('File {} is empty.'.format(fn)) 122 | return False 123 | return True 124 | 125 | 126 | @contextlib.contextmanager 127 | def cd(newdir): 128 | # https://stackoverflow.com/a/24176022 129 | prevdir = os.getcwd() 130 | LOG.info('CD: %r <- %r' % (newdir, prevdir)) 131 | os.chdir(os.path.expanduser(newdir)) 132 | try: 133 | yield 134 | finally: 135 | LOG.info('CD: %r -> %r' % (newdir, prevdir)) 136 | os.chdir(prevdir) 137 | -------------------------------------------------------------------------------- /bamboo-specs/src/main/java/pacbio/PlanSpec.java: -------------------------------------------------------------------------------- 1 | package pacbio; 2 | 3 | import com.atlassian.bamboo.specs.api.BambooSpec; 4 | import com.atlassian.bamboo.specs.api.builders.BambooKey; 5 | import com.atlassian.bamboo.specs.api.builders.BambooOid; 6 | import com.atlassian.bamboo.specs.api.builders.permission.PermissionType; 7 | import com.atlassian.bamboo.specs.api.builders.permission.Permissions; 8 | import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions; 9 | import com.atlassian.bamboo.specs.api.builders.plan.Job; 10 | import com.atlassian.bamboo.specs.api.builders.plan.Plan; 11 | import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier; 12 | import com.atlassian.bamboo.specs.api.builders.plan.Stage; 13 | import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup; 14 | import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement; 15 | import com.atlassian.bamboo.specs.api.builders.plan.configuration.ConcurrentBuilds; 16 | import com.atlassian.bamboo.specs.api.builders.project.Project; 17 | import com.atlassian.bamboo.specs.api.builders.requirement.Requirement; 18 | import com.atlassian.bamboo.specs.builders.task.CheckoutItem; 19 | import com.atlassian.bamboo.specs.builders.task.ScriptTask; 20 | import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask; 21 | import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger; 22 | import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties; 23 | import com.atlassian.bamboo.specs.util.BambooServer; 24 | 25 | @BambooSpec 26 | public class PlanSpec { 27 | 28 | public Plan plan() { 29 | final Plan plan = new Plan(new Project() 30 | 31 | .key(new BambooKey("SAT")) 32 | .name("SMRT Analysis Tools (SAT)"), 33 | "pypeflow3", 34 | new BambooKey("PYPBS")) 35 | .description("Plan created from Bamboo Java Specs, modify http://bitbucket.pacificbiosciences.com:7990/projects/SAT/repos/pypeflow3/browse project to update the plan.") 36 | 37 | .pluginConfigurations(new ConcurrentBuilds() 38 | .useSystemWideDefault(false) 39 | .maximumNumberOfConcurrentBuilds(4)) 40 | .stages(new Stage("Default Stage") 41 | .jobs(new Job("Default Job", 42 | new BambooKey("JOB1")) 43 | .tasks(new VcsCheckoutTask() 44 | .description("Checkout Default Repository") 45 | .checkoutItems(new CheckoutItem().defaultRepository()), 46 | new ScriptTask() 47 | .description("build") 48 | .location(ScriptTaskProperties.Location.FILE) 49 | .fileFromPath("build.sh")) 50 | .requirements(new Requirement("system.os") 51 | .matchValue("linux") 52 | .matchType(Requirement.MatchType.EQUALS)))) 53 | .linkedRepositories("pypeflow3") 54 | 55 | .triggers(new BitbucketServerTrigger()) 56 | .planBranchManagement(new PlanBranchManagement() 57 | 
.createForPullRequest() 58 | .delete(new BranchCleanup() 59 | .whenRemovedFromRepositoryAfterDays(7) 60 | .whenInactiveInRepositoryAfterDays(30)) 61 | .notificationForCommitters()) 62 | .forceStopHungBuilds(); 63 | return plan; 64 | } 65 | 66 | public PlanPermissions planPermission() { 67 | final PlanPermissions planPermission = new PlanPermissions(new PlanIdentifier("SAT", "PYPBS")) 68 | .permissions(new Permissions() 69 | .userPermissions("cdunn", PermissionType.VIEW, PermissionType.BUILD, PermissionType.CLONE, PermissionType.EDIT, PermissionType.ADMIN) 70 | .userPermissions("bli", PermissionType.BUILD, PermissionType.CLONE, PermissionType.ADMIN, PermissionType.VIEW, PermissionType.EDIT)); 71 | return planPermission; 72 | } 73 | 74 | public static void main(String... argv) { 75 | //By default credentials are read from the '.credentials' file. 76 | BambooServer bambooServer = new BambooServer("http://bamboo.pacificbiosciences.com:8085"); 77 | final PlanSpec planSpec = new PlanSpec(); 78 | 79 | final Plan plan = planSpec.plan(); 80 | bambooServer.publish(plan); 81 | 82 | final PlanPermissions planPermission = planSpec.planPermission(); 83 | bambooServer.publish(planPermission); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /pwatcher/mains/query_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Query the heartbeat server from the command line. 3 | As an argument, takes either server:port or the falcon run directory 4 | (if not argument is given, uses the current directory). 5 | """ 6 | import argparse 7 | import collections 8 | import os 9 | import re 10 | import socket 11 | import sys 12 | 13 | STATE_FN = 'state.py' # taken from network_based.py 14 | STATE_DIR = 'mypwatcher' # taken from pwatcher_bridge.py 15 | 16 | # send message delimited with a \0 17 | def socket_send(socket, message): 18 | socket.sendall(b'{}\0'.format(message)) 19 | 20 | # receive all of \0 delimited message 21 | # may discard content past \0, if any, so not safe to call twice on same socket 22 | def socket_read(socket): 23 | buffer = bytearray(b' ' * 256) 24 | nbytes = socket.recv_into(buffer, 256) 25 | if nbytes == 0: # empty message 26 | return 27 | message = '' 28 | while nbytes != 0: 29 | try: # index() raises when it can't find the character 30 | i = buffer[:nbytes].index('\0') 31 | message += str(buffer[:i]) # discard past delimiter 32 | break 33 | except ValueError: # haven't reached end yet 34 | message += str(buffer) 35 | nbytes = socket.recv_into(buffer, 256) 36 | return message 37 | 38 | # get server and port from watcher state file 39 | def use_state(filename): 40 | with open(filename, 'r') as f: 41 | for line in f: 42 | match = re.match(r" 'server': \('([^']+)', (\d+)\)", line) 43 | if match: 44 | return (match.group(1), int(match.group(2))) 45 | print('Error: could not find server info in state file {}'.format(filename)) 46 | 47 | def parse_args(): 48 | parser = argparse.ArgumentParser(description='query falcon network based heartbeat server') 49 | parser.add_argument('-s', '--server', help='') 50 | parser.add_argument('-f', '--file', help='location of pwatcher state file') 51 | parser.add_argument('-d', '--debug', default=False, action='store_const', const=True, help='get server state instead of process list') 52 | parser.add_argument('sf', nargs='?', help='specify server or file') 53 | return parser.parse_args() 54 | 55 | # parse command line argument (if any) to find server info 56 | def 
find_server(args): 57 | i = 0 58 | if args.server: 59 | i += 1 60 | if args.file: 61 | i += 1 62 | if args.sf: 63 | i += 1 64 | if i > 1: 65 | raise Exception('Error: may only specify server once. Try "--help".') 66 | if args.sf: 67 | if os.path.exists(args.sf): 68 | args.file = args.sf 69 | else: 70 | try: 71 | args.sf.index(':') 72 | except ValueError: 73 | print('Error: could not parse argument as file or server:port: {}'.format(args.sf)) 74 | return 75 | args.server = args.sf 76 | if args.server: 77 | try: 78 | i = args.server.index(':') 79 | except ValueError: 80 | print('Error: could not parse argument as server:port: {}'.format(args.server)) 81 | return 82 | server = args.server[:i] 83 | port = int(args.server[i + 1:]) 84 | return (server, port) 85 | if not args.file: 86 | args.file = '.' 87 | if os.path.isfile(args.file): 88 | return use_state(args.file) 89 | elif os.path.isdir(args.file): 90 | if os.path.isfile(os.path.join(args.file, STATE_FN)): 91 | return use_state(os.path.join(args.file, STATE_FN)) 92 | elif os.path.isfile(os.path.join(args.file, STATE_DIR, STATE_FN)): 93 | return use_state(os.path.join(args.file, STATE_DIR, STATE_FN)) 94 | print('Error: could not find state file: {}'.format(args.file)) 95 | 96 | def main(): 97 | args = parse_args() 98 | server = find_server(args) 99 | if not server: 100 | sys.exit(1) 101 | s = socket.socket() 102 | s.connect(server) 103 | 104 | if args.debug: 105 | socket_send(s, 'D') 106 | server_state = socket_read(s) 107 | s.close() 108 | state = eval(server_state) 109 | for jobid, val in state.items(): 110 | print('{}: {} {} {} {}'.format(jobid, val[0], val[1], val[2], val[3])) 111 | else: 112 | socket_send(s, 'L') 113 | jobids = socket_read(s) 114 | s.close() 115 | for jobid in jobids.split(): 116 | s = socket.socket() 117 | s.connect(server) 118 | socket_send(s, 'Q {}'.format(jobid)) 119 | m = socket_read(s) 120 | s.close() 121 | print('{} {}'.format(jobid, m)) 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /pwatcher/mains/fs_heartbeat.py: -------------------------------------------------------------------------------- 1 | """Filesystem heartbeat wrapper 2 | 3 | Perl might be better for efficiency. 4 | But we will use python for now. 5 | 6 | Non-zero status means *this* failed, not the wrapped command. 7 | """ 8 | import argparse 9 | import os 10 | import socket 11 | import sys 12 | import threading 13 | import time 14 | 15 | DESCRIPTION = """ 16 | We wrap a system call to produce both a heartbeat and an exit-sentinel 17 | in the filesystem. 18 | """ 19 | EPILOG = """ 20 | We share stderr/stdout with the command. We log to stderr (for now). 21 | """ 22 | HEARTBEAT_TEMPLATE = '0 {pid} {pgid}\n' 23 | EXIT_TEMPLATE = '{exit_code}' 24 | 25 | class _Formatter(argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): 26 | pass 27 | _FORMATTER_CLASS = _Formatter 28 | 29 | def parse_args(args): 30 | parser = argparse.ArgumentParser( 31 | description=DESCRIPTION, 32 | epilog=EPILOG, 33 | formatter_class=_FORMATTER_CLASS, 34 | ) 35 | parser.add_argument('--rate', 36 | help='Heartbeat rate, in seconds', 37 | type=float, 38 | default=1.0, # TODO: Make this at least 10, maybe 60. 39 | ) 40 | parser.add_argument('--heartbeat-file', 41 | help='Path to heartbeat file. The first line will have the format {!r}. 
The rest are just elapsed time'.format( 42 | HEARTBEAT_TEMPLATE), 43 | required=True, 44 | ) 45 | parser.add_argument('--exit-file', 46 | help='Path to exit sentinel file. At end, it will have the format {!r}'.format( 47 | EXIT_TEMPLATE), 48 | required=True, 49 | ) 50 | parser.add_argument('--directory', 51 | help='Directory in which to run COMMAND.', 52 | default='.', 53 | ) 54 | parser.add_argument('command', 55 | help='System call (to be joined by " "). We will block on this and return its result.', 56 | nargs='+', 57 | #required=True, 58 | ) 59 | return parser.parse_args(args) 60 | 61 | def log(msg): 62 | sys.stderr.write(msg) 63 | sys.stderr.write('\n') 64 | #sys.stdout.flush() # If we use stdout. 65 | 66 | def thread_heartbeat(heartbeat_fn, sleep_s): 67 | with open(heartbeat_fn, 'w') as ofs: 68 | pid = os.getpid() 69 | pgid = os.getpgid(0) 70 | ofs.write(HEARTBEAT_TEMPLATE.format( 71 | **locals())) 72 | elapsed = 0 73 | ctime = 0 74 | while True: 75 | #ctime = time.time() 76 | ofs.write('{elapsed} {ctime}\n'.format( 77 | **locals())) 78 | ofs.flush() 79 | time.sleep(sleep_s) 80 | elapsed += 1 81 | 82 | def start_heartbeat(heartbeat_fn, sleep_s): 83 | hb = threading.Thread(target=thread_heartbeat, args=(heartbeat_fn, sleep_s)) 84 | log('alive? {}'.format( 85 | bool(hb.is_alive()))) 86 | hb.daemon = True 87 | hb.start() 88 | return hb 89 | 90 | def run(args): 91 | os.chdir(args.directory) 92 | heartbeat_fn = os.path.abspath(args.heartbeat_file) 93 | exit_fn = os.path.abspath(args.exit_file) 94 | cwd = os.getcwd() 95 | hostname = socket.getfqdn() 96 | sleep_s = args.rate 97 | log(""" 98 | cwd:{cwd!r} 99 | hostname={hostname} 100 | heartbeat_fn={heartbeat_fn!r} 101 | exit_fn={exit_fn!r} 102 | sleep_s={sleep_s!r}""".format( 103 | **locals())) 104 | if os.path.exists(exit_fn): 105 | os.remove(exit_fn) 106 | if os.path.exists(heartbeat_fn): 107 | os.remove(heartbeat_fn) 108 | #os.system('touch {}'.format(heartbeat_fn)) # This would be over-written anyway. 109 | log("before setpgid: pid={} pgid={}".format(os.getpid(), os.getpgid(0))) 110 | try: 111 | os.setpgid(0, 0) # This allows the entire tree of procs to be killed. 112 | log(" after setpgid: pid={} pgid={}".format( 113 | os.getpid(), os.getpgid(0))) 114 | except OSError as e: 115 | log(' Unable to set pgid. Possibly a grid job? Hopefully there will be no dangling processes when killed: {}'.format( 116 | repr(e))) 117 | 118 | #thread = start_heartbeat(heartbeat_fn, sleep_s) 119 | 120 | #log('alive? {} pid={} pgid={}'.format( 121 | # bool(thread.is_alive()), os.getpid(), os.getpgid(0))) 122 | 123 | call = ' '.join(args.command) 124 | log('In cwd: {}, Blocking call: {!r}'.format( 125 | os.getcwd(), call)) 126 | rc = os.system(call) # Blocking. 127 | 128 | log(' returned: {!r}'.format( 129 | rc)) 130 | 131 | # Do not delete the heartbeat here. The discoverer of the exit-sentinel will do that, 132 | # to avoid a race condition. 133 | #if os.path.exists(heartbeat_fn): 134 | # os.remove(heartbeat_fn) 135 | 136 | exit_tmp_fn = exit_fn + '.tmp' 137 | with open(exit_tmp_fn, 'w') as ofs: 138 | ofs.write(EXIT_TEMPLATE.format( 139 | exit_code=rc)) 140 | os.rename(exit_tmp_fn, exit_fn) # atomic 141 | # sys.exit(rc) # No-one would see this anyway. 
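# Note on the files produced above:
# - thread_heartbeat() (its start_heartbeat() call is currently commented out in run())
#   writes HEARTBEAT_TEMPLATE, i.e. "0 {pid} {pgid}", as the first line of the heartbeat
#   file, then appends "{elapsed} {ctime}" lines at the --rate interval.
# - The exit sentinel is first written to "<exit-file>.tmp" and then renamed into place,
#   so a reader never observes a partially written exit code.
# - The heartbeat file is deliberately left behind here; the watcher that discovers the
#   exit sentinel removes it, to avoid a race condition.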
142 | 143 | def main(): 144 | args = parse_args(sys.argv[1:]) 145 | log(repr(args)) 146 | run(args) 147 | 148 | if __name__ == "__main__": 149 | main() 150 | -------------------------------------------------------------------------------- /doc/rdf_resprentation.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | RDF representation 3 | ================== 4 | 5 | We use an RDF framework to track the relationships between different PypeFLOW objects. 6 | The relationships between objects in a workflow are described by RDF triples. 7 | 8 | Here are two properties shared by all PypeFLOW objects (defined in ``PypeObject``):: 9 | 10 | @property 11 | def _RDFGraph(self): 12 | graph = Graph() 13 | 14 | for k, v in self.__dict__.iteritems(): 15 | if k == "URL": continue 16 | if k[0] == "_": continue 17 | if hasattr(v, "URL"): 18 | graph.add( ( URIRef(self.URL), pypeNS[k], URIRef(v.URL) ) ) 19 | return graph 20 | 21 | 22 | 23 | @property 24 | def RDFXML(self): 25 | 26 | """ 27 | RDF XML representation of everything related to the PypeObject 28 | """ 29 | 30 | return self._RDFGraph.serialize() 31 | 32 | 33 | Most relations used in a workflow are likely to be constructed during declaration. 34 | In ``PypeTask``, the RDF graph is populated as:: 35 | 36 | @property 37 | def _RDFGraph(self): 38 | graph = Graph() 39 | for k, v in self.__dict__.iteritems(): 40 | if k == "URL": continue 41 | if k[0] == "_": continue 42 | if k in ["inputDataObjs", "outputDataObjs", "mutableDataObjs", "parameters"]: 43 | if k == "inputDataObjs": 44 | for ft, f in v.iteritems(): 45 | graph.add( (URIRef(self.URL), pypeNS["prereq"], URIRef(f.URL) ) ) 46 | elif k == "outputDataObjs": 47 | for ft, f in v.iteritems(): 48 | graph.add( (URIRef(f.URL), pypeNS["prereq"], URIRef(self.URL) ) ) 49 | elif k == "mutableDataObjs": 50 | for ft, f in v.iteritems(): 51 | graph.add( (URIRef(self.URL), pypeNS["hasMutable"], URIRef(f.URL) ) ) 52 | elif k == "parameters": 53 | graph.add( (URIRef(self.URL), pypeNS["hasParameters"], Literal(json.dumps(v)) ) ) 54 | 55 | continue 56 | 57 | if k in self.inputDataObjs: 58 | graph.add( ( URIRef(self.URL), pypeNS["inputDataObject"], URIRef(v.URL) ) ) 59 | continue 60 | 61 | if k in self.outputDataObjs: 62 | graph.add( ( URIRef(self.URL), pypeNS["outputDataObject"], URIRef(v.URL) ) ) 63 | continue 64 | 65 | if k in self.mutableDataObjs: 66 | graph.add( ( URIRef(self.URL), pypeNS["mutableDataObject"], URIRef(v.URL) ) ) 67 | continue 68 | 69 | if hasattr(v, "URL"): 70 | graph.add( ( URIRef(self.URL), pypeNS[k], URIRef(v.URL) ) ) 71 | 72 | graph.add( ( URIRef(self.URL), pypeNS["codeMD5digest"], Literal(self._codeMD5digest) ) ) 73 | graph.add( ( URIRef(self.URL), pypeNS["parameterMD5digest"], Literal(self._paramMD5digest) ) ) 74 | 75 | return graph 76 | 77 | Here is the code that adds the statement that the input data objects are the 78 | "prerequisite" objects of the task:: 79 | 80 | if k == "inputDataObjs": 81 | for ft, f in v.iteritems(): 82 | graph.add( (URIRef(self.URL), pypeNS["prereq"], URIRef(f.URL) ) ) 83 | 84 | Similarly, a task is a "prerequisite" object of its output data objects:: 85 | 86 | elif k == "outputDataObjs": 87 | for ft, f in v.iteritems(): 88 | graph.add( (URIRef(f.URL), pypeNS["prereq"], URIRef(self.URL) ) ) 89 | 90 | Typically, an output data object should have only a single prerequisite object.
In the case where 91 | a data object will be modified by multiple tasks, or serves as both input and output at the same 92 | time, one should specify such a data object as a ``mutableDataObject``. 93 | 94 | When a workflow traces the execution order, only the ``prereq`` relation is used. However, 95 | one can use RDF statements to store various attributes of an object. For example, in 96 | the above code, we explicitly specify the input data objects as attributes:: 97 | 98 | if k in self.inputDataObjs: 99 | graph.add( ( URIRef(self.URL), pypeNS["inputDataObject"], URIRef(v.URL) ) ) 100 | 101 | Here is an example of the RDF triples serialized as XML-RDF:: 102 | 103 | 104 | 105 | 106 | 107 | 122d234ed92c29b77c14a2c8b52c0e4c 108 | c1ce51016644b55e38bf089f47875062 109 | 110 | 111 | 112 | 113 | If we would like to group different tasks into a module, we can use an RDF statement such as:: 114 | 115 | 116 | 117 | 118 | 119 | This can be generated by inserting the following statements in Python code:: 120 | 121 | class MyTaskWithModule(PypeTask): 122 | 123 | def assign_module(self, module): 124 | self._in_modules.append(module) 125 | 126 | @property 127 | def _RDFGraph(self): 128 | g = super(MyTaskWithModule, self)._RDFGraph 129 | for m in self._in_modules: 130 | g.add( ( URIRef(self.URL), pypeNS["inModule"], URIRef(m.URL) ) ) 131 | return g 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PypeFlow.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PypeFlow.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PypeFlow" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PypeFlow" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /pwatcher/mains/network_heartbeat.py: -------------------------------------------------------------------------------- 1 | """Network server heartbeat wrapper 2 | 3 | Perl might be better for efficiency. 4 | But we will use python for now. 5 | 6 | Non-zero status means *this* failed, not the wrapped command. 7 | """ 8 | import argparse 9 | import os 10 | import shlex 11 | import socket 12 | import subprocess 13 | import sys 14 | import threading 15 | import time 16 | 17 | DESCRIPTION = """ 18 | We wrap a system call to produce a heartbeat. 19 | """ 20 | EPILOG = """ 21 | We log to the status server, and forward command stdout/stderr as well. 
22 | """ 23 | 24 | class _Formatter(argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): 25 | pass 26 | _FORMATTER_CLASS = _Formatter 27 | 28 | def parse_args(args): 29 | parser = argparse.ArgumentParser( 30 | description=DESCRIPTION, 31 | epilog=EPILOG, 32 | formatter_class=_FORMATTER_CLASS, 33 | ) 34 | parser.add_argument('--rate', 35 | help='Heartbeat rate, in seconds', 36 | type=int, 37 | default=600, 38 | ) 39 | parser.add_argument('--heartbeat-server', 40 | help='Address of the heartbeat server', 41 | required=True, 42 | ) 43 | parser.add_argument('--heartbeat-port', 44 | help='Port of the heartbeat server', 45 | type=int, 46 | required=True, 47 | ) 48 | parser.add_argument('--jobid', 49 | help='Our jobid', 50 | required=True, 51 | ) 52 | parser.add_argument('--exit-dir', 53 | help='Path to emergency exit sentinel directory', 54 | required=True, 55 | ) 56 | parser.add_argument('--directory', 57 | help='Directory in which to run COMMAND.', 58 | default='.', 59 | ) 60 | parser.add_argument('command', 61 | help='System call (to be joined by " "). We will block on this and return its result.', 62 | nargs='+', 63 | #required=True, 64 | ) 65 | return parser.parse_args(args) 66 | 67 | # send message delimited with a \0 68 | def socket_send(socket, message): 69 | socket.sendall(b'{}\0'.format(message)) 70 | 71 | def log(heartbeat_server, jobid, msg): 72 | hsocket = socket.socket() 73 | try: 74 | hsocket.connect(heartbeat_server) 75 | socket_send(hsocket, 's {} {}\n'.format(jobid, msg)) 76 | hsocket.close() 77 | except IOError: # better to miss a line than terminate 78 | pass 79 | 80 | def thread_heartbeat(heartbeat_server, jobid, sleep_s): 81 | pid = os.getpid() 82 | pgid = os.getpgid(0) 83 | hsocket = socket.socket() 84 | try: 85 | hsocket.connect(heartbeat_server) 86 | socket_send(hsocket, 'i {} {} {}'.format(jobid, pid, pgid)) 87 | hsocket.close() 88 | except IOError: # we hope it's a temporary error 89 | pass 90 | while True: 91 | time.sleep(sleep_s) 92 | hsocket = socket.socket() 93 | try: 94 | hsocket.connect(heartbeat_server) 95 | socket_send(hsocket, 'h {}'.format(jobid)) 96 | hsocket.close() 97 | except IOError: # we hope it's a temporary error 98 | pass 99 | 100 | def start_heartbeat(heartbeat_server, jobid, sleep_s): 101 | hb = threading.Thread(target=thread_heartbeat, args=(heartbeat_server, jobid, sleep_s)) 102 | log(heartbeat_server, jobid, 'alive? {}'.format( 103 | bool(hb.is_alive()))) 104 | hb.daemon = True 105 | hb.start() 106 | return hb 107 | 108 | def run(args): 109 | heartbeat_server = (args.heartbeat_server, args.heartbeat_port) 110 | jobid = args.jobid 111 | log(heartbeat_server, jobid, repr(args)) 112 | os.chdir(args.directory) 113 | exit_dir = args.exit_dir 114 | exit_fn = os.path.join(os.path.abspath(exit_dir), jobid) 115 | cwd = os.getcwd() 116 | hostname = socket.getfqdn() 117 | sleep_s = args.rate 118 | log(heartbeat_server, jobid, """ 119 | cwd:{cwd!r} 120 | hostname={hostname} 121 | heartbeat_server={heartbeat_server!r} 122 | jobid={jobid} 123 | exit_dir={exit_dir!r} 124 | sleep_s={sleep_s!r}""".format( 125 | **locals())) 126 | log(heartbeat_server, jobid, "before setpgid: pid={} pgid={}".format(os.getpid(), os.getpgid(0))) 127 | try: 128 | os.setpgid(0, 0) # This allows the entire tree of procs to be killed. 129 | log(heartbeat_server, jobid, " after setpgid: pid={} pgid={}".format( 130 | os.getpid(), os.getpgid(0))) 131 | except OSError as e: 132 | log(heartbeat_server, jobid, ' Unable to set pgid. Possibly a grid job? 
Hopefully there will be no dangling processes when killed: {}'.format( 133 | repr(e))) 134 | 135 | thread = start_heartbeat(heartbeat_server, jobid, sleep_s) 136 | 137 | log(heartbeat_server, jobid, 'alive? {} pid={} pgid={}'.format( 138 | bool(thread.is_alive()), os.getpid(), os.getpgid(0))) 139 | 140 | call = ' '.join(args.command) 141 | log(heartbeat_server, jobid, 'In cwd: {}, Blocking call: {!r}'.format( 142 | os.getcwd(), call)) 143 | sp = subprocess.Popen(shlex.split(call), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 144 | # forward all output to server until job ends, then get exit value 145 | with sp.stdout as f: 146 | for line in iter(f.readline, b''): 147 | # can't use log() for this because it appends a \n 148 | hsocket = socket.socket() 149 | try: 150 | hsocket.connect(heartbeat_server) 151 | socket_send(hsocket, 's {} {}'.format(jobid, line)) 152 | hsocket.close() 153 | except IOError: # better to miss a line than terminate 154 | pass 155 | rc = sp.wait() 156 | 157 | log(heartbeat_server, jobid, ' returned: {!r}'.format( 158 | rc)) 159 | 160 | hsocket = socket.socket() 161 | try: 162 | hsocket.connect(heartbeat_server) 163 | socket_send(hsocket, 'e {} {}'.format(jobid, rc)) 164 | hsocket.close() 165 | except IOError as e: 166 | log(heartbeat_server, jobid, 'could not update heartbeat server with exit status: {} {}: {!r}'.format(jobid, rc, e)) 167 | with open(exit_fn, 'w') as f: 168 | f.write(str(rc)) 169 | # sys.exit(rc) # No-one would see this anyway. 170 | 171 | def main(): 172 | args = parse_args(sys.argv[1:]) 173 | run(args) 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /readme.slurm.md: -------------------------------------------------------------------------------- 1 | ``` 2 | Usage: sbatch [OPTIONS...] executable [args...] 3 | 4 | Parallel run options: 5 | -a, --array=indexes job array index values 6 | -A, --account=name charge job to specified account 7 | --bb= burst buffer specifications 8 | --begin=time defer job until HH:MM MM/DD/YY 9 | -M, --clusters=names Comma separated list of clusters to issue 10 | commands to. Default is current cluster. 11 | Name of 'all' will submit to run on all clusters. 
12 | --comment=name arbitrary comment 13 | --cpu-freq=min[-max[:gov]] requested cpu frequency (and governor) 14 | -c, --cpus-per-task=ncpus number of cpus required per task 15 | -d, --dependency=type:jobid defer job until condition on jobid is satisfied 16 | -D, --workdir=directory set working directory for batch script 17 | -e, --error=err file for batch script's standard error 18 | --export[=names] specify environment variables to export 19 | --export-file=file|fd specify environment variables file or file 20 | descriptor to export 21 | --get-user-env load environment from local cluster 22 | --gid=group_id group ID to run job as (user root only) 23 | --gres=list required generic resources 24 | -H, --hold submit job in held state 25 | --ignore-pbs Ignore #PBS options in the batch script 26 | -i, --input=in file for batch script's standard input 27 | -I, --immediate exit if resources are not immediately available 28 | --jobid=id run under already allocated job 29 | -J, --job-name=jobname name of job 30 | -k, --no-kill do not kill job on node failure 31 | -L, --licenses=names required license, comma separated 32 | -m, --distribution=type distribution method for processes to nodes 33 | (type = block|cyclic|arbitrary) 34 | --mail-type=type notify on state change: BEGIN, END, FAIL or ALL 35 | --mail-user=user who to send email notification for job state 36 | changes 37 | -n, --ntasks=ntasks number of tasks to run 38 | --nice[=value] decrease scheduling priority by value 39 | --no-requeue if set, do not permit the job to be requeued 40 | --ntasks-per-node=n number of tasks to invoke on each node 41 | -N, --nodes=N number of nodes on which to run (N = min[-max]) 42 | -o, --output=out file for batch script's standard output 43 | -O, --overcommit overcommit resources 44 | -p, --partition=partition partition requested 45 | --parsable outputs only the jobid and cluster name (if present), 46 | separated by semicolon, only on successful submission. 
47 | --power=flags power management options 48 | --priority=value set the priority of the job to value 49 | --profile=value enable acct_gather_profile for detailed data 50 | value is all or none or any combination of 51 | energy, lustre, network or task 52 | --propagate[=rlimits] propagate all [or specific list of] rlimits 53 | --qos=qos quality of service 54 | -Q, --quiet quiet mode (suppress informational messages) 55 | --reboot reboot compute nodes before starting job 56 | --requeue if set, permit the job to be requeued 57 | -s, --share share nodes with other jobs 58 | -S, --core-spec=cores count of reserved cores 59 | --sicp If specified, signifies job is to receive 60 | --signal=[B:]num[@time] send signal when time limit within time seconds 61 | --switches=max-switches{@max-time-to-wait} 62 | Optimum switches and max time to wait for optimum 63 | --thread-spec=threads count of reserved threads 64 | -t, --time=minutes time limit 65 | --time-min=minutes minimum time limit (if distinct) 66 | --uid=user_id user ID to run job as (user root only) 67 | -v, --verbose verbose mode (multiple -v's increase verbosity) 68 | --wckey=wckey wckey to run job under 69 | --wrap[=command string] wrap command string in a sh script and submit 70 | 71 | Constraint options: 72 | --contiguous demand a contiguous range of nodes 73 | -C, --constraint=list specify a list of constraints 74 | -F, --nodefile=filename request a specific list of hosts 75 | --mem=MB minimum amount of real memory 76 | --mincpus=n minimum number of logical processors (threads) 77 | per node 78 | --reservation=name allocate resources from named reservation 79 | --tmp=MB minimum amount of temporary disk 80 | -w, --nodelist=hosts... request a specific list of hosts 81 | -x, --exclude=hosts... exclude a specific list of hosts 82 | 83 | Consumable resources related options: 84 | --exclusive[=user] allocate nodes in exclusive mode when 85 | cpu consumable resource is enabled 86 | --mem-per-cpu=MB maximum amount of real memory per allocated 87 | cpu required by the job. 88 | --mem >= --mem-per-cpu if --mem is specified. 89 | 90 | Affinity/Multi-core options: (when the task/affinity plugin is enabled) 91 | -B --extra-node-info=S[:C[:T]] Expands to: 92 | --sockets-per-node=S number of sockets per node to allocate 93 | --cores-per-socket=C number of cores per socket to allocate 94 | --threads-per-core=T number of threads per core to allocate 95 | each field can be 'min' or wildcard '*' 96 | total cpus requested = (N x S x C x T) 97 | 98 | --ntasks-per-core=n number of tasks to invoke on each core 99 | --ntasks-per-socket=n number of tasks to invoke on each socket 100 | 101 | 102 | Help options: 103 | -h, --help show this help message 104 | -u, --usage display brief usage message 105 | 106 | Other options: 107 | -V, --version output version information and exit 108 | ``` 109 | 110 | * https://github.com/PacificBiosciences/FALCON-integrate/issues/53 111 | * http://slurm.schedmd.com/ 112 | * http://slurm.schedmd.com/sbatch.html 113 | -------------------------------------------------------------------------------- /doc/examples.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Usage and Examples 3 | ================== 4 | 5 | pypeFLOW Objects 6 | ================ 7 | 8 | There are three different kinds of pypeFLOW objects: 9 | 10 | 1. Data Object 11 | 2. Task Object 12 | 3. 
Workflow Object 13 | 14 | Data Object 15 | ============ 16 | 17 | The data objects represent the input and output data that are 18 | processed by pypeFLOW task objects. 19 | 20 | Currently, only local file objects are implemented. In the 21 | future, more general data (e.g. a remote file or Amazon S3 22 | object) can be supported. 23 | 24 | Here is how to create an instance of a local file data object:: 25 | 26 | f1 = makePypeLocalFile("filename") 27 | 28 | The ``makePypeLocalFile`` function returns a ``PypeLocalFile`` 29 | object. It does not create the file in the file system. 30 | 31 | Task Object 32 | ============ 33 | 34 | A task object is generally created by the ``@PypeTask`` or 35 | ``@PypeShellTask`` decorator with a task function. You will 36 | need to specify the input files and the output files via the 37 | decorator's arguments. The task function should be declared with 38 | the variable argument lists ``*argv, **kwargv`` as 39 | arguments:: 40 | 41 | @PypeTask(inputDataObjs={"fasta":f1, "ref":f2}, 42 | outputDataObjs={"aln":f3}) 43 | def testTask(*argv, **kwargv): 44 | print("testTask is running") 45 | print("fasta input filename is %s" % testTask.fasta.localFileName) 46 | # do something to create the output file(s) 47 | 48 | The decorator returns a callable object with the same name as 49 | the function. The above example returns an instance of the 50 | ``PypeTaskBase`` class. Within a task function, the input 51 | and output data objects can be retrieved as instance 52 | attributes. For example, within the ``testTask`` function, 53 | ``testTask.fasta`` is the ``f1`` data object. 54 | ``testTask.fasta.localFileName`` will give the local file name 55 | of the file data object. 56 | 57 | Workflow Object 58 | =============== 59 | 60 | The workflow object contains task objects and data objects. 61 | It creates the dependency directed acyclic graph (DAG) 62 | according to the input and output data objects specified for 63 | each task. Circular dependencies will be detected. Output 64 | data files should depend on only a single task. It 65 | is generally not a good idea to write to the same "output 66 | file" from two tasks. 67 | 68 | The general pattern for specifying a workflow is: 69 | 70 | 1. Initialize a workflow object. 71 | 72 | 2. Add data objects and task objects. It is sufficient to add 73 | only the task objects; the data objects that a task object 74 | depends on will be added to the workflow 75 | automatically. 76 | 77 | 3. Specify the data objects to be "refreshed". Namely, 78 | the workflow controller will evaluate the DAG and try to 79 | update the specified data objects if necessary. If a 80 | workflow has been executed and the initial and 81 | intermediate data objects have not changed, then no task 82 | will be executed. Tasks are executed only when 83 | their dependencies are not satisfied. Currently, we use the 84 | data objects' time stamps to determine the dependencies. If 85 | the output files are newer than the input files of a 86 | task, the task will be skipped. 87 | 88 | A minimal sketch of this pattern is shown below; see the following section for a complete example.
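A minimal sketch of the three steps (using the placeholder names ``task1``, ``task2``, and ``final_output`` for objects created as described above) is::

    wf = PypeWorkflow()                # 1. initialize a workflow object
    wf.addTasks([task1, task2])        # 2. add tasks; their data objects are added automatically
    wf.refreshTargets([final_output])  # 3. run whatever tasks are needed to refresh the target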
89 | 90 | Simple Example 91 | ================ 92 | 93 | A simple workflow can look like this:: 94 | 95 | from pypeflow.common import * 96 | from pypeflow.task import PypeThreadTaskBase, PypeTaskBase 97 | from pypeflow.task import PypeTask, PypeShellTask, PypeSGETask, PypeDistributibleTask 98 | from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow 99 | from pypeflow.data import PypeLocalFile, makePypeLocalFile 100 | 101 | def simpleTest(): 102 | 103 | wf = PypeWorkflow() 104 | 105 | # f1 and f2 are the mock input files 106 | f1 = makePypeLocalFile("test.fa") 107 | f2 = makePypeLocalFile("ref.fa") 108 | 109 | # f3 is the object of the expected output of the "testTask" 110 | f3 = makePypeLocalFile("aln.txt", readOnly=False) 111 | 112 | # create the mock files 113 | os.system("touch %s" % f1.localFileName) 114 | os.system("touch %s" % f2.localFileName) 115 | 116 | # the testTask will take f1 (as "testTask.fasta") and f2 (as "testTask.ref") and generate f3 (as "testTask.aln") 117 | @PypeTask(inputDataObjs={"fasta":f1, "ref":f2}, 118 | outputDataObjs={"aln":f3}, 119 | parameters={"a":10}, **{"b":12}) 120 | def testTask(*argv, **kwargv): 121 | print("testTask is running") 122 | for ft, f in testTask.outputDataObjs.items(): 123 | #os.system("touch %s" % f.localFileName) 124 | runShellCmd(["touch", "%s" % f.localFileName]) 125 | runShellCmd(["sleep", "5" ]) 126 | 127 | # testTask2 will take f1 (as "testTask2.fasta") and f3 (as "testTask2.aln") and generate f4 (as "testTask2.aln2") 128 | f4 = makePypeLocalFile("aln2.txt", readOnly=False) 129 | @PypeTask(inputDataObjs={"fasta":f1, "aln":f3}, 130 | outputDataObjs={"aln2":f4}, 131 | parameters={"a":10}, **{"b":12}) 132 | def testTask2(*argv, **kwargv): 133 | print("testTask2 is running") 134 | for ft, f in testTask2.outputDataObjs.items(): 135 | #os.system("touch %s" % f.localFileName) 136 | runShellCmd(["touch", "%s" % f.localFileName]) 137 | 138 | # one can add objects one by one to the workflow 139 | #wf.addObjects([f1,f2,f3,f4]) 140 | #wf.addObjects([testTask, testTask2]) 141 | 142 | # or, one can add the "tasks" into the workflow; the input and output data objects will be added automatically 143 | wf.addTasks([testTask, testTask2]) 144 | 145 | # print out the RDF XML that represents the workflow 146 | print (wf.RDFXML) 147 | # a graphviz dot string for rendering the dependency graph 148 | print (wf.graphvizDot) 149 | 150 | # execute the workflow until f4 is updated 151 | wf.refreshTargets([f4]) 152 | 153 | # mock the case that f1 is updated 154 | print("re-touch f1") 155 | os.system("sleep 1;touch %s;" % f1.localFileName) 156 | wf.refreshTargets([f4]) 157 | 158 | # mock the case that f3 is updated 159 | print("re-touch f3") 160 | os.system("sleep 1;touch %s;" % f3.localFileName) 161 | 162 | The dependency graph is shown below: 163 | 164 | .. image:: Example1.png 165 | :width: 400 px 166 | 167 | In the ``example/`` directory, you can generate a more complicated mock example and execute it 168 | by running the ``PypeTest.py`` script with ``python3 PypeTest.py localshell 1``. 169 | 170 | The dependency graph of the mock workflow looks like: 171 | 172 | ..
image:: Example2.png 173 | :width: 600 px 174 | 175 | 176 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pypeFlow documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Jan 10 21:13:17 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'pypeFlow' 44 | copyright = u'2012, Chen-Shan Chin' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 
84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'nature' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'pypeFlowdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 
177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 'pypeFlow.tex', u'pypeFlow Documentation', 187 | u'Chen-Shan Chin', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 215 | man_pages = [ 216 | ('index', 'pypeflow', u'PypeFlow Documentation', 217 | [u'Chen-Shan Chin'], 1) 218 | ] 219 | 220 | # If true, show URL addresses after external links. 221 | #man_show_urls = False 222 | 223 | 224 | # -- Options for Texinfo output ------------------------------------------------ 225 | 226 | # Grouping the document tree into Texinfo files. List of tuples 227 | # (source start file, target name, title, author, 228 | # dir menu entry, description, category) 229 | texinfo_documents = [ 230 | ('index', 'pypeFlow', u'pypeFlow Documentation', 231 | u'Chen-Shan Chin', 'PypeFlow', 'One line description of project.', 232 | 'Miscellaneous'), 233 | ] 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #texinfo_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #texinfo_domain_indices = True 240 | 241 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 
242 | #texinfo_show_urls = 'footnote' 243 | -------------------------------------------------------------------------------- /src/tests/test_pypeflow_data.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from nose import SkipTest 3 | import tempfile 4 | import pypeflow.data 5 | import pypeflow.task 6 | import os 7 | 8 | PypeLocalFileCollection = pypeflow.data.PypeLocalFileCollection 9 | PypeLocalFile = pypeflow.data.PypeLocalFile 10 | fn = pypeflow.data.fn 11 | 12 | class TestFn: 13 | def test_fn(self): 14 | file = PypeLocalFile("file://localhost/test1") 15 | assert fn(file) == "/test1" 16 | file = PypeLocalFile("file://localhost/test1/") 17 | assert fn(file) == "/test1/" 18 | file = PypeLocalFile("file://localhost/tmp/test1") 19 | assert fn(file) == "/tmp/test1" 20 | file = PypeLocalFile("file://localhost"+ os.path.abspath("./test1")) 21 | assert fn(file) == os.path.abspath("./test1") 22 | 23 | class TestPypeDataObjectBase: #this class can not be tested directly 24 | pass 25 | 26 | class TestPypeLocalFile: 27 | def test___init__(self): 28 | obj = PypeLocalFile("file://localhost/test") 29 | assert fn(obj) == "/test" 30 | obj = PypeLocalFile("file://localhost/test", **{"x":123}) 31 | assert obj.x == 123 32 | 33 | def test_clean(self): 34 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 35 | # assert_equal(expected, pype_local_file.clean()) 36 | raise SkipTest # TODO: implement your test here 37 | 38 | def test_exists(self): 39 | obj = PypeLocalFile("file://localhost/tmp/pypetest/test") 40 | os.system("mkdir -p /tmp/pypetest/; touch /tmp/pypetest/test") 41 | assert obj.exists == True 42 | os.system("rm /tmp/pypetest/test") 43 | assert obj.exists == False 44 | 45 | 46 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 47 | # assert_equal(expected, pype_local_file.exists()) 48 | raise SkipTest # TODO: implement your test here 49 | 50 | def test_path(self): 51 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 52 | # assert_equal(expected, pype_local_file.path()) 53 | raise SkipTest # TODO: implement your test here 54 | 55 | def test_timeStamp(self): 56 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 57 | # assert_equal(expected, pype_local_file.timeStamp()) 58 | raise SkipTest # TODO: implement your test here 59 | 60 | def test_verify(self): 61 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 62 | # assert_equal(expected, pype_local_file.verify()) 63 | raise SkipTest # TODO: implement your test here 64 | 65 | class TestPypeLocalFileColletion: 66 | 67 | def test___init__(self): 68 | files = PypeLocalFileCollection("files://localhost/tmp/pypetest/test1") 69 | assert files.URL == "files://localhost/tmp/pypetest/test1" 70 | assert files.localFileName == None 71 | 72 | def test_addLocalFile(self): 73 | files = PypeLocalFileCollection("files://localhost/tmp/pypetest/test1") 74 | aNewFile = PypeLocalFile("file://localhost/tmp/pypetest/test2") 75 | files.addLocalFile(aNewFile) 76 | assert files.localFileName == files.localFiles[0].localFileName 77 | assert fn(files) == fn(files.localFiles[0]) 78 | 79 | def test_timeStamp(self): 80 | raise SkipTest # TODO: implement your test here 81 | 82 | def exists(self): 83 | raise SkipTest # TODO: implement your test here 84 | 85 | class TestPypeHDF5Dataset: 86 | pass 87 | 88 | class TestPypeLocalCompositeFile: 89 | def test___init__(self): 90 | # pype_local_composite_file = PypeLocalCompositeFile(URL, 
readOnly, **attributes) 91 | raise SkipTest # TODO: implement your test here 92 | 93 | class TestMakePypeLocalFile: 94 | def test_make_pype_local_file(self): 95 | # assert_equal(expected, makePypeLocalFile(aLocalFileName, readOnly, **attributes)) 96 | raise SkipTest # TODO: implement your test here 97 | 98 | class TestPypeSplittableLocalFile: 99 | def test___init__(self): 100 | pype_splittable_local_file =\ 101 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/./test.txt", 102 | nChunk=5) 103 | for i in range(5): 104 | assert pype_splittable_local_file._splittedFiles[i].URL ==\ 105 | 'file://localhost/./%03d_test.txt' % i 106 | 107 | def test_setGatherTask(self): 108 | 109 | for i in range(5): 110 | with open("/tmp/pypetest/%03d_test_fofn.txt" % i, "w") as f: 111 | f.write("file%02d\n" % i) 112 | 113 | pype_splittable_local_file =\ 114 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test_fofn.txt", 115 | nChunk=5) 116 | with open("/tmp/pypetest/gather.sh", "w") as f: 117 | f.write("#!/bin/bash\n") 118 | f.write("if [ -e /tmp/pypetest/test_fofn.txt ]; then rm /tmp/pypetest/test_fofn.txt; fi\n") 119 | f.write("for f in %s;" % " ".join( ["%03d" % i for i in range(5)] )) 120 | f.write('do cat /tmp/pypetest/$f"_test_fofn.txt" >> /tmp/pypetest/test_fofn.txt\n') 121 | f.write("done\n") 122 | 123 | PypeShellTask = pypeflow.task.PypeShellTask 124 | PypeTaskBase = pypeflow.task.PypeTaskBase 125 | pype_splittable_local_file.setGatherTask(PypeShellTask, 126 | PypeTaskBase, 127 | "/tmp/pypetest/gather.sh") 128 | pype_splittable_local_file.getGatherTask()() 129 | 130 | with open("/tmp/pypetest/test_fofn.txt") as f: 131 | i = 0 132 | for l in f: 133 | l = l.strip() 134 | assert l == "file%02d" % i 135 | i += 1 136 | 137 | import os 138 | for i in range(5): 139 | os.system(" rm /tmp/pypetest/%03d_test_fofn.txt" % i) 140 | 141 | def test_setScatterTask(self): 142 | 143 | with open("/tmp/pypetest/test_fofn.txt", "w") as f: 144 | for i in range(5): 145 | f.write("file%02d\n" % i) 146 | 147 | pype_splittable_local_file =\ 148 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test_fofn.txt", 149 | nChunk=5) 150 | 151 | with open("/tmp/pypetest/scatter.sh", "w") as f: 152 | f.write("#!/bin/bash\n") 153 | f.write("for f in %s;" % " ".join( ["%03d" % i for i in range(5)] )) 154 | f.write('do if [ -e /tmp/pypetest/%f"_test_fofn.txt" ]; \ 155 | then rm /tmp/pypetest/$f"_test_fofn.txt"; fi\n') 156 | f.write("done\n") 157 | for i in range(5): 158 | f.write("echo file%02d > /tmp/pypetest/%03d_test_fofn.txt\n" % (i, i)) 159 | 160 | PypeShellTask = pypeflow.task.PypeShellTask 161 | PypeTaskBase = pypeflow.task.PypeTaskBase 162 | pype_splittable_local_file.setScatterTask(PypeShellTask, 163 | PypeTaskBase, 164 | "/tmp/pypetest/scatter.sh") 165 | pype_splittable_local_file.getScatterTask()() 166 | 167 | for i in range(5): 168 | with open("/tmp/pypetest/%03d_test_fofn.txt" % i) as f: 169 | l = f.read().strip() 170 | assert l == "file%02d" % i 171 | 172 | 173 | def test_getGatherTask(self): 174 | pype_splittable_local_file =\ 175 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test_fofn.txt", 176 | nChunk=5) 177 | PypeShellTask = pypeflow.task.PypeShellTask 178 | PypeTaskBase = pypeflow.task.PypeTaskBase 179 | pype_splittable_local_file.setGatherTask(PypeShellTask, PypeTaskBase, "/tmp/pypetest/gather.sh") 180 | assert pype_splittable_local_file.getGatherTask() == pype_splittable_local_file._gatherTask 181 | 
assert pype_splittable_local_file.getScatterTask() == None 182 | 183 | def test_getScatterTask(self): 184 | pype_splittable_local_file =\ 185 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test_fofn.txt", 186 | nChunk=5) 187 | PypeShellTask = pypeflow.task.PypeShellTask 188 | PypeTaskBase = pypeflow.task.PypeTaskBase 189 | pype_splittable_local_file.setScatterTask(PypeShellTask, PypeTaskBase, "/tmp/pypetest/scatter.sh") 190 | #pype_splittable_local_file.getScatterTask() 191 | assert pype_splittable_local_file.getScatterTask() == pype_splittable_local_file._scatterTask 192 | assert pype_splittable_local_file.getGatherTask() == None 193 | 194 | def test_getSplittedFiles(self): 195 | pype_splittable_local_file =\ 196 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test.txt", 197 | nChunk=5) 198 | i = 0 199 | for f in pype_splittable_local_file.getSplittedFiles(): 200 | assert f.URL ==\ 201 | 'file://localhost/tmp/pypetest/%03d_test.txt' % i 202 | i += 1 203 | -------------------------------------------------------------------------------- /pypeflow/do_task.py: -------------------------------------------------------------------------------- 1 | from . import do_support, util 2 | from .io import fix_relative_symlinks 3 | import argparse 4 | import copy 5 | import importlib 6 | import inspect 7 | import json 8 | import logging 9 | import os 10 | import pprint 11 | import re 12 | import string 13 | import sys 14 | import time 15 | from shlex import quote 16 | DONE = 'done' 17 | STATUS = 'status' 18 | TIMEOUT = 30 19 | LOG = logging.getLogger() 20 | DESCRIPTION = """Given a JSON description, call a python-function. 21 | """ 22 | EPILOG = """ 23 | The JSON looks like this: 24 | { 25 | "inputs": {"input-name": "filename"}, 26 | "outputs": {"output-name": "output-filename (relative)"}, 27 | "bash_template_fn": "template.sh", 28 | "parameters": {} 29 | } 30 | 31 | This program will run on the work host, and it will do several things: 32 | - Run in CWD. 33 | - Verify that inputs are available. (Wait til timeout if not.) 34 | - Possibly, cd to tmpdir and create symlinks from inputs. 35 | - Run the python-function. 36 | - Its module must be available (e.g. in PYTHONPATH). 37 | - Pass a kwd-dict of the union of inputs/outputs/parameters. 38 | - Ignore return-value. Expect exceptions. 39 | - Possibly, mv outputs from tmpdir to workdir. 40 | - Write exit-code into STATUS. 41 | - Touch DONE on success. 42 | """ 43 | """ 44 | (Someday, we might also support runnable Python modules, or even executables via execvp().) 45 | 46 | Note: qsub will *not* run this directly. There is a higher layer. 47 | """ 48 | 49 | def get_parser(): 50 | class _Formatter(argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): 51 | pass 52 | parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG, 53 | formatter_class=_Formatter, 54 | ) 55 | parser.add_argument('--timeout', 56 | type=int, default=TIMEOUT, 57 | help='How many seconds to wait for input files (and JSON) to exist. (default: %(default)s') 58 | parser.add_argument('--tmpdir', 59 | help='Root directory to run in. 
(Sub-dir name will be based on CWD.)') 60 | parser.add_argument('json_fn', 61 | help='JSON file, as per epilog.') 62 | return parser 63 | 64 | def wait_for(fn, timeout=None): 65 | if timeout is None: 66 | global TIMEOUT 67 | timeout = copy.copy(TIMEOUT) # just to be clear 68 | try: 69 | _wait_for(fn, timeout) 70 | except BaseException: 71 | LOG.exception('Was waiting for {!r}'.format(fn)) 72 | raise 73 | 74 | def _wait_for(fn, timeout): 75 | LOG.debug('Checking existence of {!r} with timeout={}'.format(fn, timeout)) 76 | dirname = os.path.dirname(fn) 77 | if os.path.exists(dirname): 78 | if not os.access(dirname, os.X_OK): 79 | raise Exception('Cannot x into dir {!r}'.format(dirname)) 80 | while not os.path.exists(fn): 81 | if timeout > 0: 82 | time.sleep(1) 83 | timeout -= 1 84 | else: 85 | raise Exception('Timed out waiting for {!r}'.format(fn)) 86 | assert os.access(fn, os.R_OK), '{!r} not readable'.format(fn) 87 | 88 | def get_func(python_function): 89 | mod_name, func_name = os.path.splitext(python_function) 90 | func_name = func_name[1:] # skip dot 91 | mod = importlib.import_module(mod_name) 92 | func = getattr(mod, func_name) 93 | return func 94 | 95 | class OldTaskRunner(object): 96 | def __init__(self, inputs, outputs, parameters): 97 | for k,v in (list(inputs.items()) + list(outputs.items())): 98 | setattr(self, k, v) 99 | self.parameters = parameters 100 | self.inputs = inputs 101 | self.outputs = outputs 102 | 103 | def run_python_func(func, inputs, outputs, parameters): 104 | if False: 105 | kwds = dict() 106 | kwds.update(inputs) 107 | kwds.update(outputs) 108 | kwds.update(parameters) 109 | func(**kwds) 110 | else: 111 | # old way, for now 112 | cwd = os.getcwd() 113 | parameters['cwd'] = cwd 114 | self = OldTaskRunner(inputs, outputs, parameters) 115 | func(self=self) 116 | script_fn = getattr(self, 'generated_script_fn', None) 117 | if script_fn is not None: 118 | do_support.run_bash(script_fn) 119 | 120 | def run_python(python_function_name, myinputs, myoutputs, parameters): 121 | func = get_func(python_function_name) 122 | try: 123 | run_python_func(func, myinputs, myoutputs, parameters) 124 | except TypeError: 125 | # Report the actual function spec. 126 | LOG.error('For function "{}", {}'.format(python_function_name, inspect.getargspec(func))) 127 | raise 128 | 129 | class Attrs(object): 130 | """This facilitates substitution of values in string. 131 | """ 132 | def __str__(self): 133 | # For this, all values must be strings. 134 | return ' '.join(f for f in self.kwds.values()) 135 | def __getattr__(self, name): 136 | # For this, values can be string, int, float, etc. 137 | if '*' in name: 138 | re_star = re.compile('^' + name.replace('*', '.*') + '$') 139 | result = (v for (k,v) in self.kwds.items() if re_star.search(k)) 140 | elif 'ALL' == name: 141 | result = iter(self.kwds.values()) 142 | else: 143 | result = [str(self.kwds[name])] 144 | return ' '.join(self.quote(v) for v in sorted(result)) 145 | def __init__(self, kwds, quote=quote): 146 | self.kwds = kwds 147 | self.quote = quote 148 | 149 | def sub(bash_template, myinputs, myoutputs, parameters): 150 | # Set substitution dict 151 | var_dict = dict() 152 | valid_parameters = {k:v for k,v in parameters.items() if not k.startswith('_')} 153 | assert 'input' not in parameters 154 | assert 'output' not in parameters 155 | # input/output/params are the main values substituted in the subset of 156 | # snakemake which we support. 
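    # Illustrative (hypothetical) example: with inputs={"fasta": "in.fa"},
    # outputs={"txt": "out.txt"} and parameters={"nproc": 4}, a bash_template such as
    #     cat {input.fasta} > {output.txt}  # uses {params.nproc} cores
    # is rendered below as
    #     cat in.fa > out.txt  # uses 4 cores
    # Input/output values are shell-quoted (shlex.quote); parameter values are not.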
157 | var_dict['input'] = Attrs(myinputs) 158 | var_dict['output'] = Attrs(myoutputs) 159 | var_dict['params'] = Attrs(valid_parameters, quote=lambda x:x) 160 | fmtr = string.Formatter() 161 | return fmtr.vformat(bash_template, [], var_dict) 162 | 163 | def run_bash(bash_template, myinputs, myoutputs, parameters): 164 | # Like snakemake, we use bash "strict mode", but we add -vx. 165 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 166 | prefix = """ 167 | IFS=$'\n\t' 168 | set -vxeuo pipefail 169 | hostname 170 | pwd 171 | date 172 | """ 173 | # Substitute 174 | try: 175 | task_lines = sub(bash_template, myinputs, myoutputs, parameters) 176 | except Exception: 177 | msg = """\ 178 | Failed to substitute var_dict 179 | inputs: {} 180 | outputs: {} 181 | parameters: {} 182 | into bash script: 183 | {} 184 | Possibly you forgot to use "input.foo" "output.bar" "params.fubar" etc. in your script? 185 | """.format(myinputs, myoutputs, parameters, bash_template) 186 | LOG.error(msg) 187 | raise 188 | 189 | postfix = """ 190 | date 191 | """ 192 | # Combine 193 | bash_content = prefix + task_lines + postfix 194 | 195 | # Write user_script.sh 196 | bash_fn = 'user_script.sh' 197 | with open(bash_fn, 'w') as ofs: 198 | ofs.write(bash_content) 199 | cmd = '/bin/bash {}'.format(bash_fn) 200 | util.system(cmd) 201 | 202 | def run_cfg_in_tmpdir(cfg, tmpdir, relpath): 203 | """ 204 | Accept 'inputs', 'outputs', 'parameters' in cfg. 205 | Relativize 'inputs' relative to relpath, unless running in tmpdir. 206 | ('outputs' are always relative to rundir.) 207 | If 'bash_template_fn' in cfg, then substitute and use it. 208 | """ 209 | inputs = cfg['inputs'] 210 | outputs = cfg['outputs'] 211 | parameters = cfg['parameters'] 212 | bash_template_fn = cfg['bash_template_fn'] 213 | for k,v in list(inputs.items()): 214 | if not os.path.isabs(v): 215 | inputs[k] = os.path.normpath(os.path.join(relpath, v)) 216 | if tmpdir: 217 | inputs[k] = os.path.abspath(inputs[k]) 218 | for fn in inputs.values(): 219 | wait_for(fn) 220 | wait_for(bash_template_fn) 221 | bash_template = open(bash_template_fn).read() 222 | myinputs = dict(inputs) 223 | myoutputs = dict(outputs) 224 | finaloutdir = os.getcwd() 225 | if tmpdir: 226 | import getpass 227 | user = getpass.getuser() 228 | pid = os.getpid() 229 | myrundir = '{tmpdir}/{user}/pypetmp/{finaloutdir}'.format(**locals()) 230 | util.rmdirs(myrundir) 231 | util.mkdirs(myrundir) 232 | # TODO(CD): Copy inputs w/ flock. 233 | else: 234 | myrundir = finaloutdir 235 | with util.cd(myrundir): 236 | if tmpdir: 237 | # Check again, in case we have the paths wrong. 238 | for fn in inputs.values(): 239 | wait_for(fn, 0) 240 | # TODO(CD): Write a script in wdir even when running in tmpdir (so we can see it on error). 
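# Run the substituted bash script in myrundir; when tmpdir is set, outputs are
# rsync'ed back to finaloutdir afterwards and relative symlinks are repaired.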
241 | run_bash(bash_template, myinputs, myoutputs, parameters) 242 | if tmpdir: 243 | """ 244 | for k,v in outputs.iteritems(): 245 | cmd = 'mv -f {} {}'.format( 246 | os.path.join(myrundir, v), 247 | os.path.join(finaloutdir, v)) 248 | util.system(cmd) 249 | """ 250 | cmd = 'rsync -av {}/ {}; rm -rf {}'.format(myrundir, finaloutdir, myrundir) 251 | util.system(cmd) 252 | fix_relative_symlinks(finaloutdir, myrundir, recursive=True) 253 | for fn in cfg['outputs'].values(): 254 | wait_for(fn) 255 | 256 | def run(json_fn, timeout, tmpdir): 257 | if isinstance(timeout, int): 258 | global TIMEOUT 259 | TIMEOUT = timeout 260 | wait_for(json_fn) 261 | LOG.debug('Loading JSON from {!r}'.format(json_fn)) 262 | cfg = json.loads(open(json_fn).read()) 263 | LOG.debug(pprint.pformat(cfg)) 264 | rundir = os.path.normpath(os.path.dirname(json_fn)) 265 | with util.cd(rundir): 266 | run_cfg_in_tmpdir(cfg, tmpdir, '.') 267 | 268 | def main(): 269 | parser = get_parser() 270 | parsed_args = parser.parse_args(sys.argv[1:]) 271 | try: 272 | run(**vars(parsed_args)) 273 | except Exception: 274 | LOG.critical('Error in {} with args={!r}'.format(sys.argv[0], pprint.pformat(vars(parsed_args)))) 275 | raise 276 | 277 | if __name__ == "__main__": 278 | do_support.setup_simple_logging(**os.environ) 279 | LOG.debug('Running "{}"'.format(' '.join(sys.argv))) 280 | main() 281 | -------------------------------------------------------------------------------- /example/PypeTest.py: -------------------------------------------------------------------------------- 1 | # @author Jason Chin 2 | 3 | import sys 4 | import os 5 | 6 | 7 | from pypeflow.common import * 8 | from pypeflow.task import PypeThreadTaskBase, PypeTaskBase 9 | from pypeflow.task import PypeTask, PypeShellTask, PypeSGETask, PypeDistributibleTask 10 | from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow, PypeMPWorkflow 11 | from pypeflow.data import PypeLocalFile, makePypeLocalFile 12 | import logging 13 | 14 | logger = logging.getLogger() 15 | #logger.setLevel(logging.INFO) 16 | logger.setLevel(logging.DEBUG) 17 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | ch = logging.StreamHandler() 19 | ch.setLevel(logging.DEBUG) 20 | ch.setFormatter(formatter) 21 | logger.addHandler(ch) 22 | 23 | 24 | def simpleTest(): 25 | 26 | wf = PypeWorkflow() 27 | 28 | # f1 and f2 are the mock input files 29 | f1 = makePypeLocalFile("test.fa") 30 | f2 = makePypeLocalFile("ref.fa") 31 | 32 | # f3 is the object of the expected output of the "testTask" 33 | f3 = makePypeLocalFile("aln.txt", readOnly=False) 34 | 35 | # create the mock files 36 | os.system("touch %s" % f1.localFileName) 37 | os.system("touch %s" % f2.localFileName) 38 | 39 | # the testTask will take f1 (as "testTask.fasta") and f2 (as "testTask.ref") and generate f3 (as "testTask.aln") 40 | @PypeTask(inputDataObjs={"fasta":f1, "ref":f2}, 41 | outputDataObjs={"aln":f3}, 42 | parameters={"a":10}, **{"b":12}) 43 | def testTask(*argv, **kwargv): 44 | print("testTask is running") 45 | print("fasta input filename is %s" % testTask.fasta.localFileName) 46 | for ft, f in testTask.outputDataObjs.items(): 47 | #os.system("touch %s" % f.localFileName) 48 | runShellCmd(["touch", "%s" % f.localFileName]) 49 | runShellCmd(["sleep", "5" ]) 50 | 51 | # the testTask will take f1 (as "testTask.fasta") and f3 (as "testTask.aln") and generate f4 (as "testTask.aln2") 52 | f4 = makePypeLocalFile("aln2.txt", readOnly=False) 53 | @PypeTask(inputDataObjs={"fasta":f1, "aln":f3}, 54 | 
outputDataObjs={"aln2":f4}, 55 | parameters={"a":10}, **{"b":12}) 56 | def testTask2(*argv, **kwargv): 57 | print("testTask2 is running") 58 | for ft, f in testTask2.outputDataObjs.items(): 59 | #os.system("touch %s" % f.localFileName) 60 | runShellCmd(["touch", "%s" % f.localFileName]) 61 | 62 | # one can add objects one by one to the workflow 63 | #wf.addObjects([f1,f2,f3,f4]) 64 | #wf.addObjects([testTask, testTask2]) 65 | 66 | # or, one can add the "tasks" into the workflow, the input and output data objects will be added automatically 67 | wf.addTasks([testTask, testTask2]) 68 | 69 | #print out the RDFXML file that represents the workflow 70 | print (wf.RDFXML) 71 | #a graphviz dot for rendering the dependency graph if one 72 | print (wf.graphvizDot) 73 | 74 | # execute the workflow until f4 is updated 75 | wf.refreshTargets([f4]) 76 | 77 | # mock the case that f1 is updated 78 | print("re-touch f1") 79 | os.system("sleep 1;touch %s;" % f1.localFileName) 80 | wf.refreshTargets([f4]) 81 | 82 | # mock the case that f3 is updated 83 | print("re-touch f3") 84 | os.system("sleep 1;touch %s;" % f3.localFileName) 85 | 86 | def simpleTest2(): 87 | 88 | wf = PypeWorkflow() 89 | 90 | f1 = makePypeLocalFile("test.fa") 91 | f2 = makePypeLocalFile("ref.fa") 92 | f3 = makePypeLocalFile("aln.txt", readOnly=False) 93 | f4 = makePypeLocalFile("aln2.txt", readOnly=False) 94 | 95 | os.system("touch %s" % f1.localFileName) 96 | os.system("touch %s" % f2.localFileName) 97 | 98 | @PypeTask(inputDataObjs={"fasta":f1, "ref":f2}, 99 | outputDataObjs={"aln":f3}, 100 | parameters={"a":10}, **{"b":12}) 101 | def testTask(*argv, **kwargv): 102 | print("testTask is running") 103 | for ft, f in testTask.outputDataObjs.items(): 104 | #os.system("touch %s" % f.localFileName) 105 | runShellCmd(["touch", "%s" % f.localFileName]) 106 | runShellCmd(["sleep", "5" ]) 107 | 108 | @PypeTask(inputDataObjs={"fasta":f1, "aln":f3}, 109 | outputDataObjs={"aln2":f4}, 110 | parameters={"a":10}, **{"b":12}) 111 | def testTask2(*argv, **kwargv): 112 | print("testTask2 is running") 113 | for ft, f in testTask2.outputDataObjs.items(): 114 | #os.system("touch %s" % f.localFileName) 115 | runShellCmd(["touch", "%s" % f.localFileName]) 116 | 117 | #wf.addObjects([f1,f2,f3,f4]) wf.addObjects([testTask, testTask2]) 118 | 119 | wf.addTasks([testTask, testTask2]) 120 | 121 | print (wf.RDFXML) 122 | print (wf.graphvizDot) 123 | 124 | #aGraph = PypeGraph(wf._RDFGraph) print(aGraph.tSort()) 125 | 126 | wf.refreshTargets([f4]) 127 | 128 | print("re-touch f1") 129 | os.system("sleep 1;touch %s;" % f1.localFileName) 130 | wf.refreshTargets([f4]) 131 | 132 | print("re-touch f3") 133 | os.system("sleep 1;touch %s;" % f3.localFileName) 134 | 135 | def testDistributed(runmode, cleanup): 136 | logger.info("test start") 137 | baseDir = "." 
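# Build a randomized multi-layer pipeline under ./testdata: each of 5 layers holds
# a handful (3-7) of tasks whose outputs feed the next layer, exercising the
# dependency tracking and the chosen run mode (internal/localshell/sge/mixed).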
138 | import random 139 | random.seed(1984) 140 | #PypeThreadWorkflow.setNumThreadAllowed(20,20) 141 | #wf = PypeThreadWorkflow() 142 | PypeMPWorkflow.setNumThreadAllowed(20,20) 143 | wf = PypeMPWorkflow() 144 | allTasks = [] 145 | for layer in range(5): 146 | fN = random.randint(3,7) 147 | fin = [None] * fN 148 | fout = [None] * fN 149 | fmut = [None] * fN 150 | for w in range(fN): 151 | fin[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer, w) ) 152 | fout[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer+1, w) ) 153 | fmut[w] = makePypeLocalFile(baseDir + "/testdata/m_testfile_l%d_w%d.dat" % (layer+1, w) ) 154 | #wf.addObjects([fin[w], fout[w], fmut[w]]) 155 | 156 | for w in range(fN): 157 | inputDataObjs = {} 158 | outputDataObjs = {} 159 | mutableDataObjs = {} 160 | for i in range(5): 161 | inputDataObjs["infile%d" % i] = random.choice(fin) 162 | 163 | i = 0 164 | for obj in random.sample(fmut,2): 165 | #mutableDataObjs["outfile%d" % i] = obj 166 | i += 1 167 | outputDataObjs["outfile%d" % i] = fout[w] 168 | 169 | shellCmd = "sleep 1\n" + "\n".join([ "echo %d %d ... >> %s" % (layer, w, of.localFileName) for of in outputDataObjs.values() ]) + "\nsleep 10" 170 | shellCmd += "sleep 1\n" + "\n".join([ "echo %d %d ... >> %s" % (layer, w, of.localFileName) for of in mutableDataObjs.values() ]) + "\nsleep 10" 171 | shellFileName = baseDir + "/testdata/task_l%d_w%d.sh" % (layer, w) 172 | shfile = open(shellFileName, 'w') 173 | print(shellCmd, file=shfile) 174 | shfile.close() 175 | 176 | if runmode == "internal": 177 | def t1(self): 178 | runShellCmd(["sleep", "%d" % random.randint(0,20) ]) 179 | 180 | for of in self.outputDataObjs.values(): 181 | runShellCmd(["touch", of.localFileName]) 182 | 183 | task = PypeTask(inputDataObjs = inputDataObjs, 184 | outputDataObjs = outputDataObjs, 185 | mutableDataObjs = mutableDataObjs, 186 | URL="task://internal/task_l%d_w%d" % (layer, w), 187 | TaskType=PypeThreadTaskBase) ( t1 ) 188 | 189 | elif runmode == "localshell": 190 | task = PypeShellTask(inputDataObjs = inputDataObjs, 191 | outputDataObjs = outputDataObjs, 192 | mutableDataObjs = mutableDataObjs, 193 | URL="task://localshell/task_l%d_w%d" % (layer, w), 194 | TaskType=PypeThreadTaskBase) ( "%s" % shellFileName ) 195 | 196 | elif runmode == "sge": 197 | task = PypeSGETask(inputDataObjs = inputDataObjs, 198 | outputDataObjs = outputDataObjs, 199 | mutableDataObjs = mutableDataObjs, 200 | URL="task://sge/task_l%d_w%d" % (layer, w), 201 | TaskType=PypeThreadTaskBase) ( "%s" % shellFileName ) 202 | 203 | elif runmode == "mixed": 204 | #distributed = random.choice( (False, True) ) 205 | distributed = True if w % 3 == 0 else False 206 | task = PypeDistributibleTask(inputDataObjs = inputDataObjs, 207 | outputDataObjs = outputDataObjs, 208 | mutableDataObjs = mutableDataObjs, 209 | URL="task://sge/task_l%d_w%d" % (layer, w), 210 | distributed=distributed, 211 | TaskType=PypeThreadTaskBase) ( "%s" % shellFileName ) 212 | 213 | wf.addTasks([task]) 214 | allTasks.append(task) 215 | 216 | for URL in wf._pypeObjects: 217 | prereqJobURLs = [str(u) for u in wf._RDFGraph.transitive_objects(URIRef(URL), pypeNS["prereq"]) 218 | if isinstance(wf._pypeObjects[str(u)], PypeLocalFile) and str(u) != URL ] 219 | if len(prereqJobURLs) == 0: 220 | if cleanup == "1": 221 | os.system("echo start > %s" % wf._pypeObjects[URL].localFileName) 222 | pass 223 | wf.refreshTargets(allTasks) 224 | dotFile = open("test.dot","w") 225 | #print >>dotFile, wf.graphvizShortNameDot 226 | 
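# Dump the workflow's graphviz dot, RDF/XML, and (for non-internal run modes)
# makefile representations for later inspection.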
print(wf.graphvizDot, file=dotFile) 227 | dotFile.close() 228 | dotFile = open("test_short_name.dot","w") 229 | print(wf.graphvizShortNameDot, file=dotFile) 230 | dotFile.close() 231 | rdfFile = open("test.rdf","w") 232 | print(wf.RDFXML, file=rdfFile) 233 | rdfFile.close() 234 | if runmode != "internal": 235 | mkFile = open("test.mk","w") 236 | print(wf.makeFileStr, file=mkFile) 237 | mkFile.close() 238 | 239 | if __name__ == "__main__": 240 | try: 241 | testDistributed(sys.argv[1], sys.argv[2]) 242 | except IndexError: 243 | print("try: python3 PypeTest.py localshell 1") 244 | print("running simpleTest()") 245 | simpleTest() 246 | 247 | -------------------------------------------------------------------------------- /src/tests/test_pypeflow_task.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from nose import SkipTest 3 | import pypeflow.task 4 | import pypeflow.data 5 | 6 | class TestPypeTaskBase: 7 | def test___call__(self): 8 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 9 | # assert_equal(expected, pype_task_base.__call__(*argv, **kwargv)) 10 | raise SkipTest # TODO: implement your test here 11 | 12 | def test___init__(self): 13 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 14 | raise SkipTest # TODO: implement your test here 15 | 16 | def test_finalize(self): 17 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 18 | # assert_equal(expected, pype_task_base.finalize()) 19 | raise SkipTest # TODO: implement your test here 20 | 21 | def test_setInputs(self): 22 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 23 | # assert_equal(expected, pype_task_base.setInputs(inputDataObjs)) 24 | raise SkipTest # TODO: implement your test here 25 | 26 | def test_setOutputs(self): 27 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 28 | # assert_equal(expected, pype_task_base.setOutputs(outputDataObjs)) 29 | raise SkipTest # TODO: implement your test here 30 | 31 | def test_setReferenceMD5(self): 32 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 33 | # assert_equal(expected, pype_task_base.setReferenceMD5(md5Str)) 34 | raise SkipTest # TODO: implement your test here 35 | 36 | def test_status(self): 37 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 38 | # assert_equal(expected, pype_task_base.status()) 39 | raise SkipTest # TODO: implement your test here 40 | 41 | class TestPypeThreadTaskBase: 42 | def test___call__(self): 43 | # pype_thread_task_base = PypeThreadTaskBase() 44 | # assert_equal(expected, pype_thread_task_base.__call__(*argv, **kwargv)) 45 | raise SkipTest # TODO: implement your test here 46 | 47 | def test_nSlots(self): 48 | # pype_thread_task_base = PypeThreadTaskBase() 49 | # assert_equal(expected, pype_thread_task_base.nSlots()) 50 | raise SkipTest # TODO: implement your test here 51 | 52 | def test_setMessageQueue(self): 53 | # pype_thread_task_base = PypeThreadTaskBase() 54 | # assert_equal(expected, pype_thread_task_base.setMessageQueue(q)) 55 | raise SkipTest # TODO: implement your test here 56 | 57 | class TestPypeDistributiableTaskBase: 58 | def test___init__(self): 59 | # pype_distributiable_task_base = PypeDistributiableTaskBase(URL, *argv, **kwargv) 60 | raise SkipTest # TODO: implement your test here 61 | 62 | 63 | class TestPypeTask: 64 | def test_pype_task(self): 65 | # assert_equal(expected, PypeTask(*argv, **kwargv)) 66 | raise SkipTest # TODO: implement your test here 67 | 68 | class TestPypeShellTask: 69 | def 
test_pype_shell_task(self): 70 | # assert_equal(expected, PypeShellTask(*argv, **kwargv)) 71 | raise SkipTest # TODO: implement your test here 72 | 73 | class TestPypeSGETask: 74 | def test_pype_sge_task(self): 75 | # assert_equal(expected, PypeSGETask(*argv, **kwargv)) 76 | raise SkipTest # TODO: implement your test here 77 | 78 | class TestPypeDistributibleTask: 79 | def test_pype_distributible_task(self): 80 | # assert_equal(expected, PypeDistributibleTask(*argv, **kwargv)) 81 | raise SkipTest # TODO: implement your test here 82 | 83 | 84 | class TestTimeStampCompare: 85 | def test_time_stamp_compare(self): 86 | # assert_equal(expected, timeStampCompare(inputDataObjs, outputDataObjs, parameters)) 87 | raise SkipTest # TODO: implement your test here 88 | 89 | class TestPypeTaskCollectionBase: 90 | def test___init__(self): 91 | # pype_task_collection_base = PypeTaskCollectionBase(URL, tasks) 92 | raise SkipTest # TODO: implement your test here 93 | 94 | def test_getTasks(self): 95 | # pype_task_collection_base = PypeTaskCollectionBase(URL, tasks) 96 | # assert_equal(expected, pype_task_collection_base.getTasks()) 97 | raise SkipTest # TODO: implement your test here 98 | 99 | class TestPypeTaskCollection: 100 | def test___init__(self): 101 | # pype_task_collection = PypeTaskCollection(URL, tasks) 102 | raise SkipTest # TODO: implement your test here 103 | 104 | def test_addTask(self): 105 | # pype_task_collection = PypeTaskCollection(URL, tasks) 106 | # assert_equal(expected, pype_task_collection.addTask(task)) 107 | raise SkipTest # TODO: implement your test here 108 | 109 | def test_getTasks(self): 110 | # pype_task_collection = PypeTaskCollection(URL, tasks) 111 | # assert_equal(expected, pype_task_collection.getTasks()) 112 | raise SkipTest # TODO: implement your test here 113 | 114 | class TestPypeScatteredTasks: 115 | 116 | def test_pype_scattered_tasks(self): 117 | import os 118 | #os.system("rm -rf /tmp/pypetest/*") 119 | nChunk = 5 120 | 121 | infileObj =\ 122 | pypeflow.data.PypeSplittableLocalFile( 123 | "splittablefile://localhost/tmp/pypetest/test_in_1.txt", 124 | nChunk = nChunk) 125 | 126 | with open(infileObj.localFileName, "w") as f: 127 | for i in range(nChunk): 128 | f.write("file%02d\n" % i) 129 | 130 | def scatter(*argv, **kwargv): 131 | outputObjs = sorted( kwargv["outputDataObjs"].items() ) 132 | nOut = len(outputObjs) 133 | outputObjs = [ (o[0], o[1], open(o[1].localFileName, "w")) for o in outputObjs] 134 | with open(kwargv["inputDataObjs"]["completeFile"].localFileName,"r") as f: 135 | i = 0 136 | for l in f: 137 | outf = outputObjs[i % nOut][2] 138 | outf.write(l) 139 | i += 1 140 | for o in outputObjs: 141 | o[2].close() 142 | 143 | PypeShellTask = pypeflow.task.PypeShellTask 144 | PypeTask = pypeflow.task.PypeTask 145 | PypeTaskBase = pypeflow.task.PypeTaskBase 146 | infileObj.setScatterTask(PypeTask, PypeTaskBase, scatter) 147 | infileObj.getScatterTask()() 148 | 149 | def gather(*argv, **kwargv): 150 | inputObjs = sorted( kwargv["inputDataObjs"].items() ) 151 | with open(kwargv["outputDataObjs"]["completeFile"].localFileName,"w") as outf: 152 | for k, subfile in inputObjs: 153 | f = open(subfile.localFileName) 154 | outf.write(f.read()) 155 | f.close() 156 | 157 | outfileObj =\ 158 | pypeflow.data.PypeSplittableLocalFile( 159 | "splittablefile://localhost/tmp/pypetest/test_out_1.txt", 160 | nChunk = nChunk) 161 | 162 | outfileObj.setGatherTask(PypeTask, PypeTaskBase, gather) 163 | 164 | PypeScatteredTasks = pypeflow.task.PypeScatteredTasks 165 | 166 | 
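# The decorator below expands test_fun into nChunk sub-tasks; test_fun[i]
# reads the i-th scattered input chunk and writes the i-th output chunk.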
@PypeScatteredTasks( inputDataObjs = {"inf":infileObj}, 167 | outputDataObjs = {"outf":outfileObj} ) 168 | def test_fun(*argv, **kwargv): 169 | chunk_id = kwargv["chunk_id"] 170 | self = test_fun[chunk_id] 171 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_1.txt" % chunk_id 172 | with open( self.outf._path , "w") as f: 173 | in_f = open(self.inf.localFileName,"r") 174 | f.write("out:"+in_f.read()) 175 | in_f.close() 176 | return self.inf._path 177 | 178 | assert len(test_fun.getTasks()) == nChunk 179 | for i in range(nChunk): 180 | test_fun[i]() 181 | 182 | outfileObj.getGatherTask()() 183 | 184 | def test_pype_scattered_tasks_2(self): 185 | import os 186 | #os.system("rm -rf /tmp/pypetest/*") 187 | 188 | nChunk = 5 189 | 190 | infileObj =\ 191 | pypeflow.data.PypeSplittableLocalFile( 192 | "splittablefile://localhost/tmp/pypetest/test_in_2.txt", 193 | nChunk = nChunk) 194 | 195 | with open(infileObj.localFileName, "w") as f: 196 | for i in range(nChunk): 197 | f.write("file%02d\n" % i) 198 | 199 | with open("/tmp/pypetest/scatter.sh", "w") as f: 200 | f.write("#!/bin/bash\n") 201 | f.write("for f in %s;" % " ".join( ["%03d" % i for i in range(nChunk)] )) 202 | f.write('do if [ -e /tmp/pypetest/%f"_test_in.txt" ];\ 203 | then rm /tmp/pypetest/$f"_test_in.txt"; fi;\n') 204 | f.write("done\n") 205 | for i in range(nChunk): 206 | f.write("echo file%02d > /tmp/pypetest/%03d_test_in_2.txt\n" % (i, i)) 207 | 208 | PypeShellTask = pypeflow.task.PypeShellTask 209 | PypeTask = pypeflow.task.PypeTask 210 | PypeTaskBase = pypeflow.task.PypeTaskBase 211 | infileObj.setScatterTask(PypeShellTask, PypeTaskBase, "/tmp/pypetest/scatter.sh") 212 | infileObj.getScatterTask()() 213 | 214 | def gather(*argv, **kwargv): 215 | inputObjs = sorted( kwargv["inputDataObjs"].items() ) 216 | with open(kwargv["outputDataObjs"]["completeFile"].localFileName,"w") as outf: 217 | for k, subfile in inputObjs: 218 | f = open(subfile.localFileName) 219 | outf.write("out:"+f.read()) 220 | f.close() 221 | 222 | outfileObj =\ 223 | pypeflow.data.PypeSplittableLocalFile( 224 | "splittablefile://localhost/tmp/pypetest/test_out_2.txt", 225 | nChunk = nChunk) 226 | 227 | outfileObj.setGatherTask(PypeTask, PypeTaskBase, gather) 228 | 229 | PypeScatteredTasks = pypeflow.task.PypeScatteredTasks 230 | 231 | @PypeScatteredTasks( inputDataObjs = {"inf":infileObj}, 232 | outputDataObjs = {"outf":outfileObj}, 233 | comment="xyz") 234 | def test_fun_2(*argv, **kwargv): 235 | assert kwargv["comment"] == "xyz" 236 | chunk_id = kwargv["chunk_id"] 237 | self = test_fun_2[chunk_id] 238 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_2.txt" % chunk_id 239 | with open( self.outf._path , "w") as f: 240 | f.write("file%02d\n" % chunk_id) 241 | return self.inf._path 242 | 243 | assert len(test_fun_2.getTasks()) == nChunk 244 | for i in range(nChunk): 245 | test_fun_2[i]() 246 | 247 | outfileObj.getGatherTask()() 248 | 249 | def test_pype_scattered_tasks_3(self): 250 | import os 251 | #os.system("rm -rf /tmp/pypetest/*") 252 | nChunk = 5 253 | 254 | 255 | infileObj0 =\ 256 | pypeflow.data.PypeLocalFile( 257 | "file://localhost/tmp/pypetest/test_in_0.txt") 258 | with open(infileObj0.localFileName,"w") as f: 259 | f.write("prefix:") 260 | 261 | infileObj =\ 262 | pypeflow.data.PypeSplittableLocalFile( 263 | "splittablefile://localhost/tmp/pypetest/test_in_3.txt", 264 | nChunk = nChunk) 265 | 266 | with open(infileObj.localFileName, "w") as f: 267 | for i in range(nChunk): 268 | f.write("file%02d\n" % i) 269 | 270 | def scatter(*argv, 
**kwargv): 271 | outputObjs = sorted( kwargv["outputDataObjs"].items() ) 272 | nOut = len(outputObjs) 273 | outputObjs = [ (o[0], o[1], open(o[1].localFileName, "w")) for o in outputObjs] 274 | with open(kwargv["inputDataObjs"]["completeFile"].localFileName,"r") as f: 275 | i = 0 276 | for l in f: 277 | outf = outputObjs[i % nOut][2] 278 | outf.write(l) 279 | i += 1 280 | for o in outputObjs: 281 | o[2].close() 282 | 283 | PypeShellTask = pypeflow.task.PypeShellTask 284 | PypeTask = pypeflow.task.PypeTask 285 | PypeTaskBase = pypeflow.task.PypeTaskBase 286 | infileObj.setScatterTask(PypeTask, PypeTaskBase, scatter) 287 | infileObj.getScatterTask()() 288 | 289 | def gather(*argv, **kwargv): 290 | inputObjs = sorted( kwargv["inputDataObjs"].items() ) 291 | with open(kwargv["outputDataObjs"]["completeFile"].localFileName,"w") as outf: 292 | for k, subfile in inputObjs: 293 | f = open(subfile.localFileName) 294 | outf.write(f.read()) 295 | f.close() 296 | 297 | outfileObj3 =\ 298 | pypeflow.data.PypeSplittableLocalFile( 299 | "splittablefile://localhost/tmp/pypetest/test_out_3.txt", 300 | nChunk = nChunk) 301 | 302 | outfileObj3.setGatherTask(PypeTask, PypeTaskBase, gather) 303 | 304 | PypeScatteredTasks = pypeflow.task.PypeScatteredTasks 305 | 306 | @PypeScatteredTasks( inputDataObjs = {"inf":infileObj, "prefix":infileObj0}, 307 | outputDataObjs = {"outf":outfileObj3} ) 308 | def test_fun_3(*argv, **kwargv): 309 | chunk_id = kwargv["chunk_id"] 310 | self = test_fun_3[chunk_id] 311 | 312 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_3.txt" % chunk_id 313 | with open( self.prefix.localFileName, "r") as f: 314 | prefix = f.read() 315 | 316 | with open( self.outf._path, "w") as f: 317 | in_f = open(self.inf.localFileName,"r") 318 | f.write(prefix + in_f.read()) 319 | in_f.close() 320 | return self.inf._path 321 | 322 | assert len(test_fun_3.getTasks()) == nChunk 323 | for i in range(nChunk): 324 | test_fun_3[i]() 325 | 326 | outfileObj3.getGatherTask()() 327 | -------------------------------------------------------------------------------- /presentation/pypeFLOW_tutorial.rst: -------------------------------------------------------------------------------- 1 | 2 | pypeFLOW Tutorial 3 | ================= 4 | 5 | .. image:: escher--unbelievable-527581_1024_768.jpg 6 | :scale: 40% 7 | :align: left 8 | 9 | ----------------- 10 | 11 | What is pypeFLOW? 12 | ----------------- 13 | 14 | What is pypeFLOW? A toolkit to contruct data processing work flow 15 | 16 | Tracking data processing within the Python language 17 | 18 | .. image:: pipelines.png 19 | :scale: 70 % 20 | :align: center 21 | 22 | ----------------- 23 | 24 | Basic Objects 25 | ------------- 26 | 27 | data objects (defined in ``pypeflow.data.*``) 28 | 29 | task objects (defined in ``pypeflow.task.*``) 30 | 31 | workflow objects (defined in ``pypeflow.controller.*``) 32 | 33 | Analogous to Makefile 34 | 35 | .. code-block:: python 36 | 37 | @PypeTask( inputs = {'dep1':dep1, 'dep2':dep2}, 38 | outputs = {'target':target} ) 39 | def do_something_to_get_the_target(self, *argv, **kwargv): 40 | ... 41 | 42 | is equivalent to 43 | 44 | .. code-block:: make 45 | 46 | target: dep1 dep2 47 | do_something_to_get_the_target ... 48 | 49 | * Every PypeObjects is initialized by an URL and uniquely identifiable by it. 50 | 51 | --------------------- 52 | 53 | Data Objects 54 | ------------ 55 | 56 | ``PypeLocalFile`` is an object representing a reference to local file 57 | 58 | .. 
code-block:: python 59 | 60 | f = PypeLocalFile("file://localhost/home/jchin/test/test.txt") 61 | 62 | ``f`` is a local file at ``/home/jchin/test/test.txt`` 63 | 64 | .. code-block:: python 65 | 66 | assert f.URL == "file://localhost/home/jchin/test/test.txt" 67 | assert f.localFileName == "/home/jchin/test/test.txt" 68 | 69 | 70 | ------------------------ 71 | 72 | Basic Task Objects 73 | ------------------ 74 | 75 | `PypeTaskBase`` is the base class representing a `task` that converts some 76 | input files to some output files. 77 | 78 | Such `task` is typically constructed by using a decorator (e.g. ``PypeTask``) 79 | to wrap a function into a ``PypeTaskBase`` objects (or objects of the 80 | subclasses of ``PypeTaskBase``) 81 | 82 | One needs to specify the input and output data objects within the decorator. 83 | The data objects can be referred within the task function that gets wrapped. 84 | 85 | Example: 86 | 87 | .. code-block:: python 88 | 89 | in_file1 = PypeLocalFile("file://localhost/home/jchin/test/test.txt") 90 | 91 | @PypeTask( inputs = {"in_file1": in_file1, "in_file2": in_file2}, 92 | outputs = {"out_file2": out_file2, "out_file2": out_file2} ) 93 | def task(self, *argv, **kwargv): 94 | assert self.in_file1.localFileName == "/home/jchin/test/test.txt" 95 | #do somethings to generate out_file1 and out_file2 96 | 97 | assert task.in_file1 == in_file1 98 | 99 | ------------------------ 100 | 101 | Task Decorator is Actually a Function 102 | ------------------------------------- 103 | 104 | If you don't like Python's decorator, you can generate tasks by calling the 105 | decorator function directly. This is useful to generate a number of tasks 106 | programmatically, e.g., using a loop to generate a number of tasks. 107 | 108 | .. code-block:: python 109 | 110 | tasks = [] 111 | def task_func(self, *argv, **kwargv): 112 | # do something 113 | pass 114 | 115 | for i in range(10): 116 | # task_decorator is a function that takes a function as an input argument 117 | # and it returns a PypeTaskBase object 118 | task_decorator = PypeTask(inputs={"f":inputObjs[i]}, 119 | outputs={"g":outputObjs[i]}, 120 | URL="task://localhost/task%s" % i) 121 | t = task_decorator(task_func) 122 | tasks.append(t) 123 | 124 | ----------------------- 125 | 126 | Different Kind of Task Objects 127 | ------------------------------ 128 | 129 | Different ``*Task`` decorators can wrap different kind of function (or 130 | objects, e.g shell script strings) 131 | 132 | - ``PypeTask``, wrap Python function, run as a Python function 133 | 134 | - ``PypeShellTask``, wrap a string as shell script, run as a Python function 135 | that executes the shell script 136 | 137 | - other decorators for different purposes can be written as needed (e.g. 138 | ``PypeSGETask``) 139 | 140 | One can use ``TaskType`` keyword argument in the decorator to control the 141 | output task types 142 | 143 | - Simple task type: ``PypeTaskBase`` 144 | 145 | - Task type that can be run concurrently within different threads: ``PypeThreadTaskBase`` 146 | 147 | 148 | ----------------------- 149 | 150 | Some Examples About Tasks I 151 | --------------------------- 152 | 153 | .. code-block:: python 154 | 155 | @PypeTask( ..., TaskType = PypeTaskBase) 156 | def simple_py_func(self, *argv, **kwargv): 157 | ... 158 | 159 | @PypeTask( ..., TaskType = PypeThreadTaskBase) 160 | def simple_py_func(self, *argv, **kwargv): 161 | ... 
162 | 163 | t = PypeShellTask( ..., TaskType = PypeTaskBase)("#!/bin/bash; echo I am a task") 164 | 165 | t = PypeShellTask( ..., TaskType = PypeThreadTaskBase)("#!/bin/bash; echo I am a task") 166 | 167 | ----------------------- 168 | 169 | Some Examples About Tasks II 170 | ---------------------------- 171 | 172 | An instance of the ``PythonTaskBase`` class is a "callable" object, namely, 173 | it implements ``__call__`` method. When it gets called, it will check the 174 | dependency of the input and output objects and make a decision whether to 175 | execute the wrapped function. 176 | 177 | .. code-block:: python 178 | 179 | task_decorator = PypeTask(inputs={"f":f}, 180 | outputs={"g":g}) 181 | def task_func(self, *argv, **kwargv): 182 | do_something() 183 | 184 | # calling task_func() will return True and the original task_func is executed 185 | # if f is newer than g 186 | 187 | # assuming g does not exist 188 | task_func() # return True, do_something() is excuted, assuming g is generated 189 | # run it again 190 | task_func() # return False, the original task_func is not called, since g is newer than f 191 | 192 | 193 | 194 | ----------------------- 195 | 196 | Workflow Objects 197 | ---------------- 198 | 199 | A ``PypeWorkflow`` object contains a collection of ``PypeDataObjects`` and 200 | ``PypeTaskBase`` objects. It calculates the dependency graph and executes all 201 | tasks with the correct order. 202 | 203 | * ``PypeWorkflow``: vanilla workflow class, one task at a time 204 | * ``PypeThreadWorkflow``: workflow class that can run tasks concurrently using 205 | Python thread library 206 | * ``PypeMPWorkflow``: workflow class that can run tasks concurrently using Python 207 | multiprocessing library 208 | 209 | ----------------------- 210 | 211 | Workflow Building Pattern 212 | ------------------------- 213 | 214 | Set up a workflow object 215 | 216 | .. code-block:: python 217 | 218 | wf = PypeWorkflow(...) 219 | wf = PypeMPWorkflow(...) 220 | 221 | Set up a task 222 | 223 | - Set up data objects 224 | - Define a ``task_func`` to be wrapped 225 | - Use ``PypeTask`` decorator to create the real ``PypeTaskBase`` object 226 | 227 | Add the task into the workflow (The inputs and outputs will be added automatically) 228 | 229 | Set up more tasks and add them into the workflow (``wf.addTasks([t1,t2,...])``) 230 | 231 | Call ``wf.refreshTargets(target_list)`` to execute the tasks (only task that does not 232 | satisfy the dependency constrain will be execute) 233 | 234 | ----------------------- 235 | 236 | Put It All Together 237 | ------------------- 238 | 239 | `Code Demo `_. 240 | 241 | `Embarrassing Parallelization Workflow `_. 242 | 243 | ------------------------ 244 | 245 | Mutable Data Objects & State Objects 246 | ------------------------------------ 247 | 248 | Issue: 249 | 250 | * Side effect: If a data object (e.g. various gff, cmp.h5 files) is 251 | both input and output, we can not use it to calculate dependency. 252 | * Such file usually has some "internal states" that affect 253 | how tasks should be executed 254 | 255 | Solution 256 | 257 | * Be explicit. 258 | * introduce "mutableDataObjs" for a task indicating those data objects that a 259 | task can modified. If an object is used as "mutableDataObjs", it is not used 260 | for calculating the task dependency. 261 | * The standard "inputs" and "outputs" should be "immutable" objects within the 262 | scope of the workflow. 263 | * Special state objects to keep track the states. 
The state objects are used as 264 | the input objects and/or output objects to control the task dependency (see 265 | `Example `_) 266 | 267 | ------------------------- 268 | 269 | Output Collision Detection 270 | -------------------------- 271 | 272 | The dependency graph as a direct acyclic graph helps to find 273 | independent tasks that can be run concurrently 274 | 275 | However, in the case that multiple tasks write to the same 276 | output file, we need to detect "output collision" and do not 277 | allow tasks that writes to the same to be run concurrently. 278 | 279 | Code snippet finding tasks that can be submitted 280 | 281 | .. code-block:: python 282 | 283 | jobsReadyToBeSubmitted = [] 284 | 285 | for URL, taskObj, tStatus in sortedTaskList: 286 | prereqJobURLs = prereqJobURLMap[URL] 287 | outputCollision = False 288 | 289 | for dataObj in taskObj.outputDataObjs.values() + taskObj.mutableDataObjs.values(): 290 | for fromTaskObjURL, activeDataObjURL in activeDataObjs: 291 | if dataObj.URL == activeDataObjURL and taskObj.URL != fromTaskObjURL: 292 | logger.debug( "output collision detected for data object:"+str(dataObj)) 293 | outputCollision = True 294 | break 295 | 296 | if outputCollision: #the task can not be executed 297 | continue 298 | ... 299 | 300 | 301 | ------------------------- 302 | 303 | Scatter-Gather Pattern 304 | ---------------------- 305 | 306 | Pattern: 307 | 308 | - Start with a file 309 | 310 | - Split it into a number of small files of the same type 311 | 312 | - process them as processing the original file 313 | 314 | - generate some partial results 315 | 316 | - put partial results back into a single file 317 | 318 | Complexity 319 | 320 | - Multiple input files / output files 321 | 322 | - Chaining of scattered tasks 323 | 324 | ------------------------------------ 325 | 326 | Encapsulating Scattered Files 327 | ----------------------------- 328 | 329 | ``PypeSplittableLocalFile``: Represent a PypeData object that has two 330 | different local file representations: 331 | 332 | - the whole file (could be a virtual one) 333 | - the split files 334 | 335 | Such data object can have either a scatter task attached or a gather task 336 | attached. 337 | 338 | - If a scatter task is attached, the task will be inserted to generate the 339 | scattered files. 340 | 341 | - If a gather task is attached, the task will be inserted to generate the 342 | whole file. 343 | 344 | - If neither scatter task nor gather task is specified, then the file is 345 | mostly like intermediate data. Namely, the whole file representation is 346 | not used any place else. 347 | 348 | - One can not specify scatter task and gather task for the same object since it 349 | will create a loop. 350 | 351 | 352 | 353 | 354 | ------------------------------------ 355 | 356 | Generate Scattered Tasks 357 | ------------------------ 358 | 359 | Special decorator to generate a set of "scattered tasks": 360 | 361 | - Explicitly generating a collection of tasks that work on the split files 362 | 363 | - Special task decorators to generate the collection: 364 | 365 | ``PypeScatteredTasks``: a decorator that takes a function as an input and generate 366 | a collection of tasks that does the real work (alias as ``getPypeScatteredTasks`` 367 | to be used as a regular function) 368 | 369 | ``PypeScatteredTasks/getPypeScatteredTasks`` returns a ``PypeTaskCollection`` object 370 | which contains all the sub-tasks / scatter tasks / gather tasks. 
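A minimal sketch of the decorator usage (``per_chunk``, ``inf`` and ``outf`` are
illustrative names; see the demo linked below for a complete, runnable version):

.. code-block:: python

    # "inf" and "outf" are PypeSplittableLocalFile objects; "inf" has a
    # scatter task attached and "outf" has a gather task attached.
    @PypeScatteredTasks( inputDataObjs = {"inf": inf},
                         outputDataObjs = {"outf": outf} )
    def per_chunk(*argv, **kwargv):
        chunk_id = kwargv["chunk_id"]
        self = per_chunk[chunk_id]       # the sub-task for this chunk
        # read self.inf and write self.outf for this one chunk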
371 | 372 | When a ``PypeTaskCollection`` object is added into a workflow, the real sub-tasks are 373 | added automatically. 374 | 375 | `Example / Demo `_ 376 | 377 | ------------------------- 378 | 379 | FOFN Mapper 380 | ----------- 381 | 382 | A special decorator/function that takes a FOFN (file of file names) as the main 383 | input and generate the tasks with the inputs are the files specified in 384 | the FOFN. ( This is different from a "scatter" task which keeps the file 385 | type the same. ) 386 | 387 | .. code-block:: python 388 | 389 | def outTemplate(fn): 390 | return fn + ".out" 391 | 392 | def task(self, *argv, **kwargv): 393 | in_f = self.in_f 394 | out_f = self.out_f 395 | #do something with in_f, and write something to out_f 396 | 397 | tasks = getPypeFOFNMapTasks(FOFNFileName = "./file.fofn", 398 | outTemplateFunc = outTemplate, 399 | TaskType = PypeThreadTaskBase, 400 | parameters = dict(nSlots = 8))( alignTask ) 401 | 402 | for t in tasks:# You can run the tasks in sequential 403 | t() 404 | 405 | wf = PypeThreadWorkflow() # or run them in parallel using thread or multiprocessing 406 | wf.CONCURRENT_THREAD_ALLOWED = nproc 407 | wf.MAX_NUMBER_TASK_SLOT = nproc 408 | wf.addTasks(tasks) 409 | wf.refreshTargets(exitOnFailure=False) 410 | 411 | 412 | --------------------------------- 413 | 414 | Query Workflow Objects 415 | ---------------------- 416 | 417 | Workflows has a canonical RDF representation. One can query the DAG using SPARQ 418 | 419 | For example, give a workflow DAG, what are the workflow inputs and outputs 420 | 421 | .. code-block:: python 422 | 423 | @property 424 | def inputDataObjects(self): 425 | graph = self._RDFGraph 426 | inputObjs = [] 427 | for obj in self.dataObjects: 428 | r = graph.query('SELECT ?o WHERE {<%s> pype:prereq ?o . }' % obj.URL, 429 | initNs=dict(pype=pypeNS)) 430 | if len(r) == 0: 431 | inputObjs.append(obj) 432 | return inputObjs 433 | 434 | 435 | workflow.inputDataObjects # <- the input data objects of the whole workflow 436 | 437 | ---------------------------- 438 | 439 | Update Workflow Objects 440 | ----------------------- 441 | 442 | We can redirect the inputs and outputs to different underlying files using 443 | ``workflow.updateURL()`` 444 | 445 | .. code-block:: python 446 | 447 | def updateURL(self, oldURL, newURL): 448 | obj = self._pypeObjects[oldURL] 449 | obj._updateURL(newURL) 450 | self._pypeObjects[newURL] = obj 451 | del self._pypeObjects[oldURL] 452 | 453 | It is possible to build a workflow structure and set up the real inputs 454 | and outputs later. This is useful to setup the workflow input/output from 455 | command line options and/or an XML configuration file. 456 | 457 | .. code-block:: python 458 | 459 | for o in workflow.inputDataObjects: 460 | if o.URL == "files://virtual/xyz": 461 | realInputFile = os.path.abspath(sys.argv[1]) 462 | o.updateURL("files://localhost%s" % realInputFile) 463 | ... 
464 | 465 | ------------------------- 466 | 467 | Debugging Support 468 | ----------------- 469 | 470 | graphviz dot output 471 | 472 | logging 473 | 474 | test coverage about 70%, 22 tests now 475 | 476 | The whole thing is about 2000 LOC (without counting 477 | testing code.):: 478 | 479 | $wc src/pypeflow/*.py 480 | 481 | 0 0 0 src/pypeflow/__init__.py 482 | 148 539 4428 src/pypeflow/common.py 483 | 744 2603 28166 src/pypeflow/controller.py 484 | 313 1140 11096 src/pypeflow/data.py 485 | 814 2645 28005 src/pypeflow/task.py 486 | 2019 6927 71695 total 487 | 488 | ---------------------------- 489 | 490 | What's Next? 491 | ------------ 492 | 493 | * I will use this PypeFLOW for producing better reproducible 494 | bioinformatics analysis developed with in Python/IPython notebook 495 | 496 | * Some new features: 497 | 498 | - Supporting data object in memory? mmap file? numpy array? 499 | - Remote data objects 500 | - HDF5 data sets as native data objects 501 | - direct python function execution (through IPython parallel or Pyro like RPC call) 502 | 503 | * Similar framework for streaming data processing rather than batch data 504 | processing 505 | 506 | -------------------------------------------------------------------------------- /pwatcher/blocking.py: -------------------------------------------------------------------------------- 1 | """Blocking process-watcher. 2 | 3 | See fs_based.py. Here, delete is a no-op, and run() starts threads, so 4 | the main program needs to wait for threads to finish somehow. 5 | 6 | Typical submission_string: 7 | 8 | qsub -S /bin/bash -sync y -V -q production -N ${JOB_ID} \\\n -o "${STDOUT_FILE}" \\\n -e "${STDERR_FILE}" \\\n -pe smp ${NPROC} -l h_vmem=${MB}M \\\n "${CMD}" 9 | """ 10 | try: 11 | from shlex import quote 12 | except ImportError: 13 | from pipes import quote 14 | import collections 15 | import contextlib 16 | import copy 17 | import glob 18 | import json 19 | import logging 20 | import os 21 | import pprint 22 | import re 23 | import signal 24 | import string 25 | import subprocess 26 | import sys 27 | import threading 28 | import time 29 | import traceback 30 | 31 | log = logging.getLogger(__name__) 32 | 33 | LOCAL_SUBMISSION_STRING = '/bin/bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}' # for job_local override 34 | STATE_FN = 'state.py' 35 | Job = collections.namedtuple('Job', ['jobid', 'cmd', 'rundir', 'options']) 36 | MetaJob = collections.namedtuple('MetaJob', ['job', 'lang_exe']) 37 | lang_python_exe = sys.executable 38 | lang_bash_exe = '/bin/bash' 39 | 40 | @contextlib.contextmanager 41 | def cd(newdir): 42 | prevdir = os.getcwd() 43 | log.debug('CD: %r <- %r' %(newdir, prevdir)) 44 | os.chdir(os.path.expanduser(newdir)) 45 | try: 46 | yield 47 | finally: 48 | log.debug('CD: %r -> %r' %(newdir, prevdir)) 49 | os.chdir(prevdir) 50 | 51 | class MetaJobClass(object): 52 | ext = { 53 | lang_python_exe: '.py', 54 | lang_bash_exe: '.bash', 55 | } 56 | def get_wrapper(self): 57 | # Totally by convention, for now. 58 | return '%s/run-%s%s' %(self.mj.job.rundir, self.mj.job.jobid, self.ext[self.mj.lang_exe]) 59 | def get_sentinel(self): 60 | return 'exit-%s' %self.mj.job.jobid # in watched dir 61 | def get_pid(self): 62 | return self.mj.pid 63 | def kill(self, pid, sig): 64 | stored_pid = self.get_pid() 65 | if not pid: 66 | pid = stored_pid 67 | log.info('Not passed a pid to kill. 
Using stored pid:%s' %pid) 68 | if pid and stored_pid: 69 | if pid != stored_pid: 70 | log.error('pid:%s != stored_pid:%s' %(pid, stored_pid)) 71 | os.kill(pid, sig) 72 | def __init__(self, mj): 73 | self.mj = mj 74 | class State(object): 75 | def notify_threaded(self, jobid): 76 | self.jobids_threaded.add(jobid) 77 | def notify_started(self, jobid): 78 | #state.top['jobids_submitted'].append(jobid) 79 | self.jobids_submitted.add(jobid) 80 | self.jobids_threaded.remove(jobid) 81 | log.debug('Thread notify_started({}).'.format(jobid)) 82 | def notify_exited(self, jobid, rc): 83 | #self.top['jobid2exit'][jobid] = rc 84 | self.jobid2exit[jobid] = rc 85 | self.jobids_submitted.remove(jobid) 86 | log.debug('Thread notify_exited({}->{}).'.format(jobid, rc)) 87 | def set_job(self, jobid, mjob): 88 | # Is this needed? For now, we are not actually saving state, so no. 89 | self.top['jobs'][jobid] = mjob 90 | def update_jobid2status(self, jobid2status): 91 | for jobid in self.jobids_threaded: 92 | status = 'THREADED' 93 | jobid2status[jobid] = status 94 | for jobid in self.jobids_submitted: 95 | status = 'RUNNING' 96 | # but actually it might not have started yet, or it could be dead, since we have blocking qsub calls 97 | jobid2status[jobid] = status 98 | for jobid, rc in self.jobid2exit.items(): 99 | status = 'EXIT {}'.format(rc) 100 | jobid2status[jobid] = status 101 | def get_running_jobids(self): 102 | return list(self.jobids_submitted) 103 | def serialize(self): 104 | return pprint.pformat(self.top) 105 | @staticmethod 106 | def deserialize(directory, content): 107 | state = State(directory) 108 | state.top = eval(content) 109 | state.content_prev = content 110 | return state 111 | @staticmethod 112 | def create(directory): 113 | state = State(directory) 114 | #makedirs(state.get_directory_wrappers()) 115 | #makedirs(state.get_directory_jobs()) 116 | return state 117 | def __init__(self, directory): 118 | self.__directory = os.path.abspath(directory) 119 | self.content_prev = '' 120 | self.top = dict() # for serialization, when we decide we need it 121 | self.top['jobs'] = dict() 122 | #self.top['jobids_submitted'] = list() 123 | #self.top['jobid2exit'] = dict() 124 | self.jobids_threaded = set() 125 | self.jobids_submitted = set() 126 | self.jobid2exit = dict() 127 | 128 | class SafeState(object): 129 | """Synchronized State proxy for accessing any 130 | data which might be modified in a Thread. 131 | """ 132 | def notify_threaded(self, jobid): 133 | with self.lock: 134 | self.state.notify_threaded(jobid) 135 | def notify_started(self, jobid): 136 | with self.lock: 137 | self.state.notify_started(jobid) 138 | def notify_exited(self, jobid, rc): 139 | with self.lock: 140 | self.state.notify_exited(jobid, rc) 141 | def update_jobid2status(self, table): 142 | with self.lock: 143 | return self.state.update_jobid2status(table) 144 | def get_running_jobids(self): 145 | with self.lock: 146 | return self.state.get_running_jobids() 147 | def serialize(self): 148 | with self.lock: 149 | return self.state.serialize() 150 | def __getattr__(self, name): 151 | """For all other methods, just delegate. 152 | """ 153 | return getattr(self.state, name) 154 | def __init__(self, state): 155 | self.state = state 156 | self.lock = threading.Lock() 157 | 158 | def get_state(directory): 159 | """For now, we never write. 160 | """ 161 | state_fn = os.path.join(directory, STATE_FN) 162 | if not os.path.exists(state_fn): 163 | return State.create(directory) 164 | assert False, 'No state directory needed, for now.' 
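# NOTE: with the assert above in place, the code below is effectively dead;
# it is kept for when on-disk state gets re-enabled.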
165 | try: 166 | return State.deserialize(directory, open(state_fn).read()) 167 | except Exception: 168 | log.exception('Failed to read state "%s". Ignoring (and soon over-writing) current state.'%state_fn) 169 | # TODO: Backup previous STATE_FN? 170 | return State(directory) 171 | def State_save(state): 172 | # TODO: RW Locks, maybe for runtime of whole program. 173 | content = state.serialize() 174 | content_prev = state.content_prev 175 | if content == content_prev: 176 | return 177 | fn = state.get_state_fn() 178 | open(fn, 'w').write(content) 179 | log.debug('saved state to %s' %repr(os.path.abspath(fn))) 180 | def Job_get_MetaJob(job, lang_exe=lang_bash_exe): 181 | return MetaJob(job, lang_exe=lang_exe) 182 | def MetaJob_wrap(mjob, state): 183 | """Write wrapper contents to mjob.wrapper. 184 | """ 185 | metajob_rundir = mjob.job.rundir 186 | wdir = metajob_rundir 187 | 188 | bash_template = """#!%(lang_exe)s 189 | cmd="%(cmd)s" 190 | rundir="%(rundir)s" 191 | finish() { 192 | echo "finish code: $?" 193 | } 194 | trap finish 0 195 | #printenv 196 | echo 197 | set -ex 198 | while [ ! -d "$rundir" ]; do sleep 1; done 199 | cd "$rundir" 200 | eval "$cmd" 201 | """ 202 | mji = MetaJobClass(mjob) 203 | wrapper_fn = os.path.join(wdir, mji.get_wrapper()) 204 | command = mjob.job.cmd 205 | 206 | wrapped = bash_template %dict( 207 | lang_exe=mjob.lang_exe, 208 | cmd=command, 209 | rundir=metajob_rundir, 210 | ) 211 | log.debug('Writing wrapper "%s"' %wrapper_fn) 212 | open(wrapper_fn, 'w').write(wrapped) 213 | st = os.stat(wrapper_fn) 214 | os.chmod(wrapper_fn, st.st_mode | 0o111) 215 | 216 | class JobThread(threading.Thread): 217 | def run(self): 218 | """Propagate environment, plus env_extra. 219 | """ 220 | try: 221 | self.notify_start(self.jobname) 222 | log.debug('hello! started Thread {}'.format(threading.current_thread())) 223 | myenv = dict(os.environ) 224 | myenv.update(self.env_extra) 225 | #log.debug('myenv:\n{}'.format(pprint.pformat(myenv))) 226 | log.info("Popen: '{}'".format(self.cmd)) 227 | if not self.cmd: 228 | msg = 'Why is self.cmd empty? {} {} {!r}'.format(self, self.jobname, self.cmd) 229 | raise Exception(msg) 230 | p = subprocess.Popen(self.cmd, env=myenv, shell=True) 231 | log.debug("pid: {}".format(p.pid)) 232 | p.wait() 233 | rc = p.returncode 234 | log.debug("rc: {}".format(rc)) 235 | self.notify_exit(self.jobname, rc) 236 | except: 237 | log.exception('Failed to submit {}: {!r} Setting rc=42.'.format(self.jobname, self.cmd)) 238 | self.notify_exit(self.jobname, 42) 239 | def __init__(self, jobname, cmd, notify_start, notify_exit, env_extra): 240 | super(JobThread, self).__init__() 241 | self.jobname = jobname 242 | self.cmd = cmd 243 | self.notify_start = notify_start 244 | self.notify_exit = notify_exit 245 | self.env_extra = env_extra 246 | 247 | class StringJobSubmitter(object): 248 | """Substitute some variables into self.submission_string. 249 | Use mains/job_start.sh as the top script. That requires 250 | PYPEFLOW_JOB_START_SCRIPT in the environment as the real 251 | script to run. This way, we are guaranteed that the top script exists, 252 | and we can wait for the rest to appear in the filesystem. 253 | """ 254 | def submit(self, jobname, mjob, state): 255 | """Prepare job (based on wrappers) and submit as a new thread. 
256 | """ 257 | state.set_job(jobname, mjob) 258 | jobname = mjob.job.jobid 259 | job_dict = mjob.job.options 260 | #nproc = mjob.job.options['NPROC'] 261 | #mb = mjob.job.options['MB'] 262 | mji = MetaJobClass(mjob) 263 | #script_fn = os.path.join(state.get_directory_wrappers(), mji.get_wrapper()) 264 | script_fn = mji.get_wrapper() 265 | exe = mjob.lang_exe 266 | 267 | state.notify_threaded(jobname) 268 | self.start(jobname, state, exe, script_fn, job_dict) # Can raise 269 | def get_cmd(self, job_name, script_fn, job_dict): 270 | """Vars: 271 | (The old ones.) JOB_ID, STDOUT_FILE, STDERR_FILE, NPROC, MB, CMD 272 | """ 273 | # We wrap in a program that waits for the executable to exist, so 274 | # the filesystem has time to catch up on the remote machine. 275 | # Hopefully, this will allow dependencies to become ready as well. 276 | job_start_fn = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mains/job_start.sh') 277 | mapping = dict() 278 | stdout = script_fn + '.stdout' 279 | stderr = script_fn + '.stderr' 280 | run_dir = os.getcwd() 281 | mapping = dict( 282 | JOB_EXE='/bin/bash', 283 | JOB_NAME=job_name, JOB_ID=job_name, 284 | #JOB_OPTS=JOB_OPTS, 285 | #JOB_QUEUE=job_queue, 286 | JOB_SCRIPT=job_start_fn, CMD=job_start_fn, 287 | JOB_DIR=run_dir, DIR=run_dir, 288 | JOB_STDOUT=stdout, STDOUT_FILE=stdout, 289 | JOB_STDERR=stderr, STDERR_FILE=stderr, 290 | #MB=pypeflow_mb, 291 | #NPROC=pypeflow_nproc, 292 | ) 293 | mapping.update(job_dict) 294 | if 'JOB_OPTS' in mapping: 295 | # a special two-level mapping: ${JOB_OPTS} is substituted first 296 | mapping['JOB_OPTS'] = self.sub(mapping['JOB_OPTS'], mapping) 297 | return self.sub(self.submission_string, mapping) 298 | @staticmethod 299 | def sub(template, mapping): 300 | t = string.Template(template) 301 | try: 302 | return t.substitute(mapping) 303 | except KeyError: 304 | print(repr(mapping)) 305 | msg = 'Template substitution failed:\n template={!r}\n mapping={}'.format( 306 | template, pprint.pformat(mapping)) 307 | log.exception(msg) 308 | raise 309 | def start(self, jobname, state, exe, script_fn, job_dict): 310 | """Run job in thread. 311 | Thread will notify state. 312 | Can raise. 313 | """ 314 | #cmd = script_fn 315 | cmd = self.get_cmd(jobname, script_fn, job_dict) 316 | # job_start.sh relies on PYPEFLOW_* 317 | env_extra = { 318 | "PYPEFLOW_JOB_START_SCRIPT": script_fn, 319 | "PYPEFLOW_JOB_START_TIMEOUT": "60", 320 | } 321 | log.debug('env_extra={}'.format(pprint.pformat(env_extra))) 322 | notify_start = state.notify_started 323 | notify_exit = state.notify_exited 324 | th = JobThread(jobname, cmd, notify_start, notify_exit, env_extra) 325 | #th.setDaemon(True) 326 | th.start() 327 | def __repr__(self): 328 | return 'StringJobSubmitter(%s)' %repr(self.submission_string) 329 | def __init__(self, submission_string): 330 | self.submission_string = submission_string 331 | 332 | def link_rundir(state_rundir, user_rundir): 333 | if user_rundir: 334 | link_fn = os.path.join(user_rundir, 'pwatcher.dir') 335 | if os.path.lexists(link_fn): 336 | os.unlink(link_fn) 337 | os.symlink(os.path.abspath(state_rundir), link_fn) 338 | 339 | def cmd_run(state, jobids, job_type, job_dict): 340 | """ 341 | Wrap them and run them locally, each in the foreground of a thread. 
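Each value in 'jobids' is a dict describing one job; illustratively:
  {"cmd": "...", "rundir": "...", "job_local": 0, "job_dict": {...}}
('cmd' is required; 'rundir' defaults to the dirname of 'cmd'.)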
342 | """ 343 | jobs = dict() 344 | submitted = list() 345 | result = {'submitted': submitted} 346 | if job_type != 'string': 347 | log.debug("NOTE: In blocking pwatcher, job_type={!r}, should be 'string'".format(job_type)) 348 | for jobid, desc in jobids.items(): 349 | assert 'cmd' in desc 350 | cmd = desc['cmd'] 351 | if 'rundir' in desc: 352 | rundir = desc['rundir'] 353 | else: 354 | rundir = os.path.dirname(cmd) 355 | # These are all required now. 356 | #nproc = desc['job_nproc'] 357 | #mb = desc['job_mb'] 358 | local = int(desc['job_local']) 359 | options = copy.deepcopy(desc['job_dict']) #dict(NPROC=nproc, MB=mb, local=local) 360 | options['local'] = local 361 | jobs[jobid] = Job(jobid, cmd, rundir, options) 362 | log.debug('jobs:\n%s' %pprint.pformat(jobs)) 363 | submission_string = job_dict['submit'] 364 | basic_submitter = StringJobSubmitter(submission_string) 365 | local_submitter = StringJobSubmitter(LOCAL_SUBMISSION_STRING) 366 | log.debug('Basic submitter: {!r}'.format(basic_submitter)) 367 | for jobid, job in jobs.items(): 368 | #desc = jobids[jobid] 369 | log.debug(' starting job %s' %pprint.pformat(job)) 370 | mjob = Job_get_MetaJob(job) 371 | MetaJob_wrap(mjob, state) 372 | try: 373 | #link_rundir(state.get_directory_job(jobid), desc.get('rundir')) 374 | if job.options['local']: 375 | submitter = local_submitter 376 | else: 377 | submitter = basic_submitter 378 | if not submission_string: 379 | raise Exception('No "submit" key in job_dict:{!r}.'.format(job_dict)) 380 | submitter.submit(jobid, mjob, state) 381 | submitted.append(jobid) 382 | except Exception: 383 | raise 384 | log.exception('Failed to submit background-job:\n{!r}'.format( 385 | submitter)) 386 | return result 387 | # The caller is responsible for deciding what to do about job-submission failures. Re-try, maybe? 388 | 389 | def system(call, checked=False): 390 | log.info('!{}'.format(call)) 391 | rc = os.system(call) 392 | if checked and rc: 393 | raise Exception('{} <- {!r}'.format(rc, call)) 394 | return rc 395 | 396 | _warned = dict() 397 | def warnonce(hashkey, msg): 398 | if hashkey in _warned: 399 | return 400 | log.warning(msg) 401 | _warned[hashkey] = True 402 | 403 | def cmd_query(state, which, jobids): 404 | """Return the state of named jobids. 405 | If which=='list', then query jobs listed as jobids. 406 | If which=='known', then query all known jobs. 407 | If which=='infer', same as 'known' now. 408 | """ 409 | result = dict() 410 | jobstats = dict() 411 | result['jobids'] = jobstats 412 | if which == 'list': 413 | for jobid in jobids: 414 | jobstats[jobid] = 'UNKNOWN' 415 | state.update_jobid2status(jobstats) 416 | jobids = set(jobids) 417 | if which == 'list': 418 | for jobid in list(jobstats.keys()): 419 | # TODO: This might remove thousands. We should pass jobids along to update_jobid2status(). 420 | if jobid not in jobids: 421 | del jobstats[jobid] 422 | return result 423 | def cmd_delete(state, which, jobids): 424 | """Kill designated jobs, including (hopefully) their 425 | entire process groups. 426 | If which=='list', then kill all jobs listed as jobids. 427 | If which=='known', then kill all known jobs. 428 | If which=='infer', then kill all jobs with heartbeats. 429 | """ 430 | log.error('Noop. We cannot kill blocked threads. Hopefully, everything will die on SIGTERM.') 431 | def makedirs(path): 432 | if not os.path.isdir(path): 433 | os.makedirs(path) 434 | def readjson(ifs): 435 | """Del keys that start with ~. 436 | That lets us have trailing commas on all other lines. 
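For example, {"which": "list", "jobids": ["b"], "~end": {}} parses to
{"which": "list", "jobids": ["b"]}.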
437 | """ 438 | content = ifs.read() 439 | log.debug('content:%s' %repr(content)) 440 | jsonval = json.loads(content) 441 | #pprint.pprint(jsonval) 442 | def striptildes(subd): 443 | if not isinstance(subd, dict): 444 | return 445 | for k,v in list(subd.items()): 446 | if k.startswith('~'): 447 | del subd[k] 448 | else: 449 | striptildes(v) 450 | striptildes(jsonval) 451 | #pprint.pprint(jsonval) 452 | return jsonval 453 | 454 | class ProcessWatcher(object): 455 | def run(self, jobids, job_type, job_defaults_dict): 456 | #import traceback; log.debug(''.join(traceback.format_stack())) 457 | log.debug('run(jobids={}, job_type={}, job_defaults_dict={})'.format( 458 | '<%s>'%len(jobids), job_type, job_defaults_dict)) 459 | return cmd_run(self.state, jobids, job_type, job_defaults_dict) 460 | def query(self, which='list', jobids=[]): 461 | log.debug('query(which={!r}, jobids={})'.format( 462 | which, '<%s>'%len(jobids))) 463 | return cmd_query(self.state, which, jobids) 464 | def delete(self, which='list', jobids=[]): 465 | log.debug('delete(which={!r}, jobids={})'.format( 466 | which, '<%s>'%len(jobids))) 467 | return cmd_delete(self.state, which, jobids) 468 | def __init__(self, state): 469 | # state must be thread-safe 470 | self.state = state 471 | 472 | def get_process_watcher(directory): 473 | state = get_state(directory) 474 | state = SafeState(state) # thread-safe proxy 475 | #log.debug('state =\n%s' %pprint.pformat(state.top)) 476 | return ProcessWatcher(state) 477 | #State_save(state) 478 | 479 | @contextlib.contextmanager 480 | def process_watcher(directory): 481 | """This will (someday) hold a lock, so that 482 | the State can be written safely at the end. 483 | """ 484 | state = get_state(directory) 485 | state = SafeState(state) # thread-safe proxy 486 | #log.debug('state =\n%s' %pprint.pformat(state.top)) 487 | yield ProcessWatcher(state) 488 | #State_save(state) 489 | 490 | def main(prog, cmd, state_dir='mainpwatcher', argsfile=None): 491 | logging.basicConfig() 492 | logging.getLogger().setLevel(logging.NOTSET) 493 | log.warning('logging basically configured') 494 | log.debug('debug mode on') 495 | assert cmd in ['run', 'query', 'delete'] 496 | ifs = sys.stdin if not argsfile else open(argsfile) 497 | argsdict = readjson(ifs) 498 | log.info('argsdict =\n%s' %pprint.pformat(argsdict)) 499 | with process_watcher(state_dir) as watcher: 500 | result = getattr(watcher, cmd)(**argsdict) 501 | if result is not None: 502 | log.info('getattr({!r}, {!r}): {}'.format( 503 | watcher, cmd, pprint.pformat(result))) 504 | log.info('Waiting for running jobs...r') 505 | while watcher.state.get_running_jobids(): 506 | log.info('running: {!s}'.format(watcher.state.get_running_jobids())) 507 | time.sleep(1) 508 | 509 | if __name__ == "__main__": 510 | #import pdb 511 | #pdb.set_trace() 512 | main(*sys.argv) # pylint: disable=no-value-for-parameter 513 | -------------------------------------------------------------------------------- /src/tests/test_pypeflow_controller.py: -------------------------------------------------------------------------------- 1 | from nose import SkipTest 2 | from nose.tools import assert_equal 3 | import pypeflow.task 4 | import pypeflow.data 5 | import pypeflow.controller 6 | 7 | class TestPypeNode: 8 | def test___init__(self): 9 | # pype_node = PypeNode(obj) 10 | raise SkipTest # TODO: implement your test here 11 | 12 | def test_addAnInNode(self): 13 | # pype_node = PypeNode(obj) 14 | # assert_equal(expected, pype_node.addAnInNode(obj)) 15 | raise SkipTest # TODO: 
implement your test here 16 | 17 | def test_addAnOutNode(self): 18 | # pype_node = PypeNode(obj) 19 | # assert_equal(expected, pype_node.addAnOutNode(obj)) 20 | raise SkipTest # TODO: implement your test here 21 | 22 | def test_depth(self): 23 | # pype_node = PypeNode(obj) 24 | # assert_equal(expected, pype_node.depth()) 25 | raise SkipTest # TODO: implement your test here 26 | 27 | def test_inDegree(self): 28 | # pype_node = PypeNode(obj) 29 | # assert_equal(expected, pype_node.inDegree()) 30 | raise SkipTest # TODO: implement your test here 31 | 32 | def test_outDegree(self): 33 | # pype_node = PypeNode(obj) 34 | # assert_equal(expected, pype_node.outDegree()) 35 | raise SkipTest # TODO: implement your test here 36 | 37 | def test_removeAnInNode(self): 38 | # pype_node = PypeNode(obj) 39 | # assert_equal(expected, pype_node.removeAnInNode(obj)) 40 | raise SkipTest # TODO: implement your test here 41 | 42 | def test_removeAnOutNode(self): 43 | # pype_node = PypeNode(obj) 44 | # assert_equal(expected, pype_node.removeAnOutNode(obj)) 45 | raise SkipTest # TODO: implement your test here 46 | 47 | class TestPypeGraph: 48 | def test___getitem__(self): 49 | # pype_graph = PypeGraph(RDFGraph, subGraphNodes) 50 | # assert_equal(expected, pype_graph.__getitem__(url)) 51 | raise SkipTest # TODO: implement your test here 52 | 53 | def test___init__(self): 54 | # pype_graph = PypeGraph(RDFGraph, subGraphNodes) 55 | raise SkipTest # TODO: implement your test here 56 | 57 | def test_tSort(self): 58 | # pype_graph = PypeGraph(RDFGraph, subGraphNodes) 59 | # assert_equal(expected, pype_graph.tSort()) 60 | raise SkipTest # TODO: implement your test here 61 | 62 | class TestPypeWorkflow: 63 | def test___init__(self): 64 | # pype_workflow = PypeWorkflow(URL, **attributes) 65 | raise SkipTest # TODO: implement your test here 66 | 67 | def test_addObject(self): 68 | # pype_workflow = PypeWorkflow(URL, **attributes) 69 | # assert_equal(expected, pype_workflow.addObject(obj)) 70 | raise SkipTest # TODO: implement your test here 71 | 72 | def test_addObjects(self): 73 | # pype_workflow = PypeWorkflow(URL, **attributes) 74 | # assert_equal(expected, pype_workflow.addObjects(objs)) 75 | raise SkipTest # TODO: implement your test here 76 | 77 | def test_addTask(self): 78 | # pype_workflow = PypeWorkflow(URL, **attributes) 79 | # assert_equal(expected, pype_workflow.addTask(taskObj)) 80 | raise SkipTest # TODO: implement your test here 81 | 82 | def test_addTasks(self): 83 | # pype_workflow = PypeWorkflow(URL, **attributes) 84 | # assert_equal(expected, pype_workflow.addTasks(taskObjs)) 85 | raise SkipTest # TODO: implement your test here 86 | 87 | def test_dataObjects(self): 88 | # pype_workflow = PypeWorkflow(URL, **attributes) 89 | # assert_equal(expected, pype_workflow.dataObjects()) 90 | raise SkipTest # TODO: implement your test here 91 | 92 | def test_graphvizDot(self): 93 | # pype_workflow = PypeWorkflow(URL, **attributes) 94 | # assert_equal(expected, pype_workflow.graphvizDot()) 95 | raise SkipTest # TODO: implement your test here 96 | 97 | def test_graphvizShortNameDot(self): 98 | # pype_workflow = PypeWorkflow(URL, **attributes) 99 | # assert_equal(expected, pype_workflow.graphvizShortNameDot()) 100 | raise SkipTest # TODO: implement your test here 101 | 102 | def test_makeFileStr(self): 103 | # pype_workflow = PypeWorkflow(URL, **attributes) 104 | # assert_equal(expected, pype_workflow.makeFileStr()) 105 | raise SkipTest # TODO: implement your test here 106 | 107 | def test_refreshTargets(self): 108 | # 
pype_workflow = PypeWorkflow(URL, **attributes) 109 | # assert_equal(expected, pype_workflow.refreshTargets(objs, callback)) 110 | raise SkipTest # TODO: implement your test here 111 | 112 | def test_removeObjects(self): 113 | # pype_workflow = PypeWorkflow(URL, **attributes) 114 | # assert_equal(expected, pype_workflow.removeObjects(objs)) 115 | raise SkipTest # TODO: implement your test here 116 | 117 | def test_removeTask(self): 118 | # pype_workflow = PypeWorkflow(URL, **attributes) 119 | # assert_equal(expected, pype_workflow.removeTask(taskObj)) 120 | raise SkipTest # TODO: implement your test here 121 | 122 | def test_removeTasks(self): 123 | # pype_workflow = PypeWorkflow(URL, **attributes) 124 | # assert_equal(expected, pype_workflow.removeTasks(taskObjs)) 125 | raise SkipTest # TODO: implement your test here 126 | 127 | def test_setReferenceRDFGraph(self): 128 | # pype_workflow = PypeWorkflow(URL, **attributes) 129 | # assert_equal(expected, pype_workflow.setReferenceRDFGraph(fn)) 130 | raise SkipTest # TODO: implement your test here 131 | 132 | def test_tasks(self): 133 | # pype_workflow = PypeWorkflow(URL, **attributes) 134 | # assert_equal(expected, pype_workflow.tasks()) 135 | raise SkipTest # TODO: implement your test here 136 | 137 | def test_scatterTask(self): 138 | 139 | import os 140 | os.system("rm -rf /tmp/pypetest/*") 141 | nChunk = 3 142 | 143 | infileObj0 =\ 144 | pypeflow.data.PypeLocalFile( 145 | "file://localhost/tmp/pypetest/test_in_0.txt") 146 | with open(infileObj0.localFileName,"w") as f: 147 | f.write("prefix4:") 148 | 149 | infileObj4 =\ 150 | pypeflow.data.PypeSplittableLocalFile( 151 | "splittablefile://localhost/tmp/pypetest/test_in_4.txt", 152 | nChunk = nChunk) 153 | 154 | with open(infileObj4.localFileName, "w") as f: 155 | for i in range(nChunk): 156 | f.write("file%02d\n" % i) 157 | 158 | def scatter(*argv, **kwargv): 159 | outputObjs = sorted( kwargv["outputDataObjs"].items() ) 160 | nOut = len(outputObjs) 161 | outputObjs = [ (o[0], o[1], open(o[1].localFileName, "w")) for o in outputObjs] 162 | with open(kwargv["inputDataObjs"]["completeFile"].localFileName,"r") as f: 163 | i = 0 164 | for l in f: 165 | outf = outputObjs[i % nOut][2] 166 | outf.write(l) 167 | i += 1 168 | for o in outputObjs: 169 | o[2].close() 170 | 171 | PypeShellTask = pypeflow.task.PypeShellTask 172 | PypeTask = pypeflow.task.PypeTask 173 | PypeTaskBase = pypeflow.task.PypeTaskBase 174 | infileObj4.setScatterTask(PypeTask, PypeTaskBase, scatter) 175 | 176 | def gather(*argv, **kwargv): 177 | inputObjs = sorted( kwargv["inputDataObjs"].items() ) 178 | with open(kwargv["outputDataObjs"]["completeFile"].localFileName,"w") as outf: 179 | for k, subfile in inputObjs: 180 | f = open(subfile.localFileName) 181 | outf.write(f.read()) 182 | f.close() 183 | 184 | outfileObj4 =\ 185 | pypeflow.data.PypeSplittableLocalFile( 186 | "splittablefile://localhost/tmp/pypetest/test_out_4.txt", 187 | nChunk = nChunk) 188 | 189 | outfileObj4.setGatherTask(PypeTask, PypeTaskBase, gather) 190 | 191 | PypeScatteredTasks = pypeflow.task.PypeScatteredTasks 192 | 193 | @PypeScatteredTasks( inputDataObjs = {"inf":infileObj4, "prefix":infileObj0}, 194 | outputDataObjs = {"outf":outfileObj4}, 195 | URL="tasks://test_fun_4") 196 | def test_fun_4(*argv, **kwargv): 197 | chunk_id = kwargv["chunk_id"] 198 | self = test_fun_4[chunk_id] 199 | 200 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_4.txt" % chunk_id 201 | with open( self.prefix.localFileName, "r") as f: 202 | prefix = f.read() 203 | 204 | 
with open( self.outf._path, "w") as f: 205 | in_f = open(self.inf.localFileName,"r") 206 | f.write(prefix + in_f.read()) 207 | in_f.close() 208 | return self.inf._path 209 | 210 | outfileObj5 =\ 211 | pypeflow.data.PypeSplittableLocalFile( 212 | "splittablefile://localhost/tmp/pypetest/test_out_5.txt", 213 | nChunk = nChunk) 214 | outfileObj5.setGatherTask(PypeTask, PypeTaskBase, gather) 215 | 216 | @PypeScatteredTasks( inputDataObjs = {"inf":infileObj4, "prefix":infileObj0}, 217 | outputDataObjs = {"outf":outfileObj5}, 218 | URL="tasks://test_fun_5") 219 | def test_fun_5(*argv, **kwargv): 220 | chunk_id = kwargv["chunk_id"] 221 | self = test_fun_5[chunk_id] 222 | 223 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_4.txt" % chunk_id 224 | with open( self.prefix.localFileName, "r") as f: 225 | prefix = f.read() 226 | 227 | with open( self.outf._path, "w") as f: 228 | in_f = open(self.inf.localFileName,"r") 229 | f.write(prefix +"2:"+ in_f.read()) 230 | in_f.close() 231 | return self.inf._path 232 | assert len(test_fun_4.getTasks()) == nChunk 233 | 234 | wf = pypeflow.controller.PypeWorkflow() 235 | wf.addTasks( [test_fun_4, test_fun_5] ) 236 | print(wf.graphvizDot) 237 | wf.refreshTargets( [outfileObj4, outfileObj5] ) 238 | 239 | class TestPypeThreadWorkflow: 240 | def test___init__(self): 241 | # pype_thread_workflow = PypeThreadWorkflow(URL, **attributes) 242 | raise SkipTest # TODO: implement your test here 243 | 244 | def test_addTasks(self): 245 | # pype_thread_workflow = PypeThreadWorkflow(URL, **attributes) 246 | # assert_equal(expected, pype_thread_workflow.addTasks(taskObjs)) 247 | raise SkipTest # TODO: implement your test here 248 | 249 | def test_refreshTargets(self): 250 | # pype_thread_workflow = PypeThreadWorkflow(URL, **attributes) 251 | # assert_equal(expected, pype_thread_workflow.refreshTargets(objs, callback, updateFreq, exitOnFailure)) 252 | raise SkipTest # TODO: implement your test here 253 | 254 | def test_setNumThreadAllowed(self): 255 | # pype_thread_workflow = PypeThreadWorkflow(URL, **attributes) 256 | # assert_equal(expected, pype_thread_workflow.setNumThreadAllowed(nT, nS)) 257 | raise SkipTest # TODO: implement your test here 258 | 259 | def test_mutableDataObjects(self): 260 | 261 | infileObj =\ 262 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_in.txt") 263 | 264 | outfileObj =\ 265 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out.txt") 266 | 267 | out1 =\ 268 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out1.txt") 269 | 270 | out2 =\ 271 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out2.txt") 272 | 273 | out3 =\ 274 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out3.txt") 275 | 276 | import os 277 | os.system("rm -rf /tmp/pypetest/*") 278 | 279 | with open(infileObj.localFileName,"w") as f: 280 | f.write("test") 281 | 282 | PypeThreadWorkflow = pypeflow.controller.PypeThreadWorkflow 283 | PypeThreadTaskBase = pypeflow.controller.PypeThreadTaskBase 284 | PypeTask = pypeflow.task.PypeTask 285 | wf = PypeThreadWorkflow() 286 | 287 | @PypeTask(mutableDataObjs={"out":outfileObj}, 288 | outputDataObjs={"out1":out1}, 289 | inputDataObjs={"in":infileObj}, 290 | TaskType=PypeThreadTaskBase) 291 | def task1(task): 292 | with open(task.out.localFileName, "a") as f: 293 | print("written by task1", file=f) 294 | with open(task.out1.localFileName, "w") as f: 295 | 
print("written by task1", file=f) 296 | 297 | @PypeTask(mutableDataObjs={"out":outfileObj}, 298 | outputDataObjs={"out2":out2}, 299 | inputDataObjs={"in":infileObj}, 300 | TaskType=PypeThreadTaskBase) 301 | def task2(task): 302 | with open(task.out.localFileName, "a") as f: 303 | print("written by task2", file=f) 304 | with open(task.out2.localFileName, "w") as f: 305 | print("written by task2", file=f) 306 | 307 | @PypeTask(mutableDataObjs={"out":outfileObj}, 308 | outputDataObjs={"out3":out3}, 309 | inputDataObjs={"in":infileObj}, 310 | TaskType=PypeThreadTaskBase) 311 | def task3(task): 312 | with open(task.out.localFileName, "a") as f: 313 | print("written by task3", file=f) 314 | with open(task.out3.localFileName, "w") as f: 315 | print("written by task3", file=f) 316 | 317 | wf = PypeThreadWorkflow() 318 | wf.addTasks([task1, task2, task3]) 319 | 320 | wf.refreshTargets() 321 | 322 | outputSet = set() 323 | outputSet.add("written by task1") 324 | outputSet.add("written by task2") 325 | outputSet.add("written by task3") 326 | 327 | with open(outfileObj.localFileName) as f: 328 | i = 0 329 | for l in f: 330 | l = l.strip() 331 | assert l in outputSet 332 | i += 1 333 | assert_equal(i, 3) 334 | 335 | def test_stateDataObjects(self): 336 | 337 | infileObj =\ 338 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_in.txt") 339 | 340 | outfileObj =\ 341 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out.txt") 342 | 343 | out1 =\ 344 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out1.txt") 345 | 346 | out2 =\ 347 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out2.txt") 348 | 349 | out3 =\ 350 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out3.txt") 351 | 352 | s1 =\ 353 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state1") 354 | 355 | s2 =\ 356 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state2") 357 | 358 | s3 =\ 359 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state3") 360 | 361 | import os 362 | import time 363 | os.system("rm -rf /tmp/pypetest/*") 364 | time.sleep(2) 365 | 366 | with open(infileObj.localFileName,"w") as f: 367 | f.write("test") 368 | 369 | PypeThreadWorkflow = pypeflow.controller.PypeThreadWorkflow 370 | PypeThreadTaskBase = pypeflow.controller.PypeThreadTaskBase 371 | PypeTask = pypeflow.task.PypeTask 372 | wf = PypeThreadWorkflow() 373 | 374 | @PypeTask(mutableDataObjs = {"out":outfileObj}, 375 | outputDataObjs = {"out1":out1, "s1":s1}, 376 | inputDataObjs = {"in":infileObj}, 377 | TaskType=PypeThreadTaskBase) 378 | def task1(task): 379 | with open(task.out.localFileName, "a") as f: 380 | print("written by task1", file=f) 381 | with open(task.s1.localFileName, "w") as f: 382 | print("state set", file=f) 383 | with open(task.out1.localFileName, "w") as f: 384 | print("written by task1", file=f) 385 | 386 | @PypeTask(mutableDataObjs = {"out":outfileObj}, 387 | outputDataObjs = {"out2":out2, "s2":s2}, 388 | inputDataObjs = {"in":infileObj, "s1":s1}, 389 | TaskType=PypeThreadTaskBase) 390 | def task2(task): 391 | with open(task.out.localFileName, "a") as f: 392 | print("written by task2", file=f) 393 | with open(task.s2.localFileName, "w") as f: 394 | print("state set", file=f) 395 | with open(task.out2.localFileName, "w") as f: 396 | print("written by task2", file=f) 397 | 398 | @PypeTask(mutableDataObjs = {"out":outfileObj}, 399 | 
outputDataObjs = {"out3":out3, "s3":s3}, 400 | inputDataObjs = {"in":infileObj, "s2":s2}, 401 | TaskType=PypeThreadTaskBase) 402 | def task3(task): 403 | with open(task.out.localFileName, "a") as f: 404 | print("written by task3", file=f) 405 | with open(task.s3.localFileName, "w") as f: 406 | print("state set", file=f) 407 | with open(task.out3.localFileName, "w") as f: 408 | print("written by task3", file=f) 409 | 410 | wf = PypeThreadWorkflow() 411 | wf.addTasks([task1, task2, task3]) 412 | 413 | wf.refreshTargets() 414 | 415 | with open(outfileObj.localFileName) as f: 416 | i = 0 417 | for l in f: 418 | i += 1 419 | l = l.strip() 420 | assert l == "written by task%d" % i 421 | assert i == 3 422 | 423 | def test_stateDataObjects2(self): 424 | 425 | infileObj =\ 426 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_in.txt") 427 | 428 | outfileObj =\ 429 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out.txt") 430 | 431 | out1 =\ 432 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out1.txt") 433 | 434 | out2 =\ 435 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out2.txt") 436 | 437 | out3 =\ 438 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out3.txt") 439 | 440 | s1 =\ 441 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state1") 442 | 443 | s2 =\ 444 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state2") 445 | 446 | s3 =\ 447 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state3") 448 | 449 | import os 450 | import time 451 | os.system("rm -rf /tmp/pypetest/*") 452 | time.sleep(2) 453 | 454 | with open(infileObj.localFileName,"w") as f: 455 | f.write("test") 456 | 457 | PypeThreadWorkflow = pypeflow.controller.PypeThreadWorkflow 458 | PypeThreadTaskBase = pypeflow.controller.PypeThreadTaskBase 459 | PypeTask = pypeflow.task.PypeTask 460 | wf = PypeThreadWorkflow() 461 | 462 | @PypeTask( outputDataObjs = {"out1":out1, "s1":s1}, 463 | inputDataObjs = {"in":infileObj}, 464 | TaskType=PypeThreadTaskBase) 465 | def task1(task): 466 | with open(task.s1.localFileName, "w") as f: 467 | print("state set", file=f) 468 | with open(task.out1.localFileName, "w") as f: 469 | print("written by task1", file=f) 470 | 471 | @PypeTask(outputDataObjs = {"out2":out2, "s2":s2}, 472 | inputDataObjs = {"in":infileObj, "s1":s1}, 473 | TaskType=PypeThreadTaskBase) 474 | def task2(task): 475 | with open(task.s2.localFileName, "w") as f: 476 | print("state set", file=f) 477 | with open(task.out2.localFileName, "w") as f: 478 | print("written by task2", file=f) 479 | 480 | @PypeTask(outputDataObjs = {"out3":out3, "s3":s3}, 481 | inputDataObjs = {"in":infileObj, "s2":s2}, 482 | TaskType=PypeThreadTaskBase) 483 | def task3(task): 484 | with open(task.s3.localFileName, "w") as f: 485 | print("state set", file=f) 486 | with open(task.out3.localFileName, "w") as f: 487 | print("written by task3", file=f) 488 | 489 | wf = PypeThreadWorkflow() 490 | wf.addTasks([task1, task2, task3]) 491 | 492 | wf.refreshTargets([s3]) 493 | 494 | for i in range(1,4): 495 | with open("/tmp/pypetest/test_for_shared_output_out%d.txt" % i) as f: 496 | l = f.read().strip() 497 | assert l == "written by task%d" % i 498 | --------------------------------------------------------------------------------
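Note on the submission-string templating in pwatcher/blocking.py above: StringJobSubmitter.get_cmd() performs a two-level substitution in which ${JOB_OPTS} is itself expanded against the same mapping before the full submission string is filled in. A minimal sketch of just that mechanism, using plain string.Template; the submission string, paths, and values below are invented for illustration and are not taken from the repository.

import string

def sub(template, mapping):
    # Same idea as StringJobSubmitter.sub(): simple ${NAME} substitution.
    return string.Template(template).substitute(mapping)

# Invented submission string; real ones come from the 'submit' key of the job defaults.
submission_string = 'qsub -N ${JOB_NAME} ${JOB_OPTS} -o ${STDOUT_FILE} -e ${STDERR_FILE} ${CMD}'
mapping = dict(
    JOB_NAME='job-a',
    CMD='/tmp/pypetest/job_start.sh',   # hypothetical wrapper path
    STDOUT_FILE='task.sh.stdout',
    STDERR_FILE='task.sh.stderr',
    NPROC='4',
    JOB_OPTS='-pe smp ${NPROC}',        # itself a template
)
# ${JOB_OPTS} is expanded first, so placeholders inside it are resolved too.
mapping['JOB_OPTS'] = sub(mapping['JOB_OPTS'], mapping)
print(sub(submission_string, mapping))
# qsub -N job-a -pe smp 4 -o task.sh.stdout -e task.sh.stderr /tmp/pypetest/job_start.sh

Expanding JOB_OPTS first lets a site-specific option block refer to per-job values such as NPROC without the outer submission template needing to know about them.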
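Note on readjson() above: keys beginning with '~' are deleted recursively after parsing, so a throw-away '~' entry placed last lets every real entry keep a trailing comma in hand-edited JSON argument files. A small self-contained sketch of that behavior; the JSON text here is invented.

import json
import pprint

text = '''
{
    "which": "known",
    "jobids": [],
    "~end": {}
}
'''

def striptildes(subd):
    # Recursively drop any dict key that starts with '~' (as readjson() does).
    if not isinstance(subd, dict):
        return
    for k, v in list(subd.items()):
        if k.startswith('~'):
            del subd[k]
        else:
            striptildes(v)

val = json.loads(text)
striptildes(val)
pprint.pprint(val)  # {'jobids': [], 'which': 'known'}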
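Note on main() above: it only parses a JSON argument dict and forwards it to the matching ProcessWatcher method, so the blocking watcher can also be driven directly from Python. A hedged sketch, assuming pwatcher.blocking is importable and using an invented script path and submit template; per cmd_run(), each job description needs at least 'cmd', 'job_local', and 'job_dict', and the defaults dict must carry a 'submit' template.

from pwatcher.blocking import process_watcher

# Invented job description; the keys follow what cmd_run() expects.
jobids = {
    'job-a': {
        'cmd': '/tmp/pypetest/run-job-a.sh',   # hypothetical script
        'rundir': '/tmp/pypetest',
        'job_local': 0,                        # non-local -> submit via the 'submit' template
        'job_dict': {},
    },
}
job_defaults_dict = {
    'submit': 'bash ${CMD} > ${STDOUT_FILE} 2> ${STDERR_FILE}',  # invented template
}

with process_watcher('mainpwatcher') as watcher:
    result = watcher.run(jobids, 'string', job_defaults_dict)
    # On success, result should look like {'submitted': ['job-a']}.
    print(result)
    print(watcher.query(which='list', jobids=['job-a']))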