├── src ├── __init__.py ├── .gitignore └── tests │ ├── test_pypeflow_common.py │ ├── test_pypeflow_data.py │ ├── test_pypeflow_task.py │ └── test_pypeflow_controller.py ├── pwatcher ├── __init__.py ├── mains │ ├── __init__.py │ ├── pwatcher.py │ ├── job_start.sh │ ├── pypeflow_example.py │ ├── query_server.py │ ├── fs_heartbeat.py │ └── network_heartbeat.py └── blocking.py ├── pypeflow ├── mains │ └── __init__.py ├── __init__.py ├── pwatcher_workflow.py ├── sample_tasks.py ├── util.py ├── do_support.py ├── tasks.py ├── io.py └── do_task.py ├── example ├── testdata │ └── .placeholder ├── README.txt ├── test_shutdown.py └── PypeTest.py ├── examples-pwatcher ├── .gitignore ├── ab │ ├── delete.json │ ├── jobs │ │ ├── b │ │ └── c │ ├── query-ab.json │ ├── makefile │ ├── run.json │ └── logging-cfg.json └── README.md ├── doc ├── Example1.png ├── Example2.png ├── modules.rst ├── index.rst ├── pypeflow.rst ├── concurrent_execution.rst ├── introduction.rst ├── rdf_resprentation.rst ├── Makefile ├── examples.rst └── conf.py ├── presentation ├── pipelines.png ├── escher--unbelievable-527581_1024_768.jpg └── pypeFLOW_tutorial.rst ├── .gitignore ├── bamboo-specs ├── .settings │ ├── org.eclipse.m2e.core.prefs │ ├── org.eclipse.core.resources.prefs │ └── org.eclipse.jdt.core.prefs ├── .gitignore ├── src │ ├── test │ │ └── java │ │ │ └── pacbio │ │ │ └── PlanSpecTest.java │ └── main │ │ └── java │ │ └── pacbio │ │ └── PlanSpec.java ├── .hgignore ├── .project ├── .classpath └── pom.xml ├── travis.sh ├── makefile ├── .travis.yml ├── setup.py ├── bamboo_wheel.sh ├── test ├── test_do_task.py └── test_integ.py ├── LICENSE ├── README.rst └── readme.slurm.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pwatcher/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pwatcher/mains/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pypeflow/mains/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/testdata/.placeholder: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | -------------------------------------------------------------------------------- /examples-pwatcher/.gitignore: -------------------------------------------------------------------------------- 1 | pwatched/ 2 | foo/ 3 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/delete.json: -------------------------------------------------------------------------------- 1 | {"which":"infer"} 2 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/jobs/b: -------------------------------------------------------------------------------- 1 | echo 'hi b' 2 | sleep 5 3 | echo 'bye b' 4 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/jobs/c: 
-------------------------------------------------------------------------------- 1 | echo 'hi c' 2 | sleep 100 3 | echo 'bye c' 4 | -------------------------------------------------------------------------------- /doc/Example1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacificBiosciences/pypeFLOW/HEAD/doc/Example1.png -------------------------------------------------------------------------------- /doc/Example2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacificBiosciences/pypeFLOW/HEAD/doc/Example2.png -------------------------------------------------------------------------------- /doc/modules.rst: -------------------------------------------------------------------------------- 1 | pypeflow 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | pypeflow 8 | -------------------------------------------------------------------------------- /presentation/pipelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacificBiosciences/pypeFLOW/HEAD/presentation/pipelines.png -------------------------------------------------------------------------------- /examples-pwatcher/ab/query-ab.json: -------------------------------------------------------------------------------- 1 | { 2 | "which": "list", 3 | "jobids": ["b", "c"], 4 | "~end": {} 5 | } 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /build/ 2 | /dist/ 3 | /src/pypeflow.egg-info/ 4 | /src/tests/__pycache__/ 5 | /wheelhouse/ 6 | /artifacts/ 7 | /*.xml 8 | .pytest_cache/ 9 | -------------------------------------------------------------------------------- /bamboo-specs/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /presentation/escher--unbelievable-527581_1024_768.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacificBiosciences/pypeFLOW/HEAD/presentation/escher--unbelievable-527581_1024_768.jpg -------------------------------------------------------------------------------- /bamboo-specs/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/test/java=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /pypeflow/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.3.0' # should match setup.py 2 | 3 | try: 4 | import sys, pkg_resources 5 | sys.stderr.write('{}\n'.format(pkg_resources.get_distribution('pypeflow'))) 6 | except Exception: 7 | pass 8 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/makefile: -------------------------------------------------------------------------------- 1 | SHELL:=bash 2 | run: 3 | pwatcher-main run < run.json 4 | pstree -pgsu $${USER} 5 | query: 6 | pwatcher-main query < query-ab.json 7 | delete: 8 | 
pwatcher-main delete <<< $$(echo '{"which":"infer"}') 9 | clean: 10 | rm -rf pwatched *.log 11 | -------------------------------------------------------------------------------- /travis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # -e: fail on error 3 | # -v: show commands 4 | # -x: show expanded commands 5 | set -vex 6 | 7 | #env | sort 8 | sudo mkdir -p /tmp 9 | sudo chmod a+wrx /tmp 10 | python setup.py install 11 | nosetests --with-doctest -v pypeflow/ pwatcher/fs_based.py 12 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/run.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobids": { 3 | "c": { 4 | "cmd": "/bin/bash jobs/c", 5 | "rundir": "./jobs" 6 | }, 7 | "b": { 8 | "cmd": "/bin/bash jobs/b", 9 | "rundir": "./jobs" 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /bamboo-specs/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 5 | org.eclipse.jdt.core.compiler.release=disabled 6 | org.eclipse.jdt.core.compiler.source=1.8 7 | -------------------------------------------------------------------------------- /pwatcher/mains/pwatcher.py: -------------------------------------------------------------------------------- 1 | from .. import fs_based 2 | import pdb 3 | import sys 4 | 5 | def main(): 6 | fs_based.main(*sys.argv) # pylint: disable=no-value-for-parameter 7 | 8 | # If run directly, rather than via the 'entry-point', 9 | # then pdb will be used. 10 | if __name__ == "__main__": 11 | #pdb.set_trace() 12 | main() 13 | -------------------------------------------------------------------------------- /pypeflow/pwatcher_workflow.py: -------------------------------------------------------------------------------- 1 | from .simple_pwatcher_bridge import ( 2 | PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase, 3 | makePypeLocalFile, fn, PypeTask) 4 | PypeThreadTaskBase = MyFakePypeThreadTaskBase 5 | 6 | __all__ = [ 7 | 'PypeProcWatcherWorkflow', 'PypeThreadTaskBase', 8 | 'makePypeLocalFile', 'fn', 'PypeTask', 9 | ] 10 | -------------------------------------------------------------------------------- /example/README.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Try "python3 PypeTest.py localshell 1" to run some random dependent shell script jobs. 4 | You can stop the Python script and restart it using "python3 PypeTest.py localshell 0". 
5 | 6 | Other simple examples: 7 | 8 | python3 PypeTest.py internal 1 #the task is done within python functions 9 | 10 | python3 PypeTest.py #other simple test 11 | -------------------------------------------------------------------------------- /bamboo-specs/.gitignore: -------------------------------------------------------------------------------- 1 | .credentials 2 | 3 | ### Maven 4 | target/ 5 | pom.xml.tag 6 | pom.xml.releaseBackup 7 | pom.xml.versionsBackup 8 | pom.xml.next 9 | release.properties 10 | dependency-reduced-pom.xml 11 | buildNumber.properties 12 | 13 | ### Java 14 | # Compiled class file 15 | *.class 16 | 17 | # Log file 18 | *.log 19 | 20 | # Package Files # 21 | *.jar 22 | 23 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 24 | hs_err_pid* 25 | 26 | ### Idea 27 | .idea/ 28 | *.iml 29 | -------------------------------------------------------------------------------- /bamboo-specs/src/test/java/pacbio/PlanSpecTest.java: -------------------------------------------------------------------------------- 1 | package pacbio; 2 | 3 | import com.atlassian.bamboo.specs.api.builders.plan.Plan; 4 | import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException; 5 | import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders; 6 | import org.junit.Test; 7 | 8 | public class PlanSpecTest { 9 | @Test 10 | public void checkYourPlanOffline() { 11 | Plan plan = new PlanSpec().createPlan(); 12 | 13 | EntityPropertiesBuilders.build(plan); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | WHEELHOUSE?=wheelhouse 2 | PIP=pip3 wheel --wheel-dir ${WHEELHOUSE} --find-links ${WHEELHOUSE} 3 | MY_TEST_FLAGS?=-v -s --durations=0 4 | 5 | default: 6 | pylint: 7 | pylint --errors-only pypeflow/ pwatcher/ 8 | pytest: 9 | python3 -c 'import pypeflow; print(pypeflow)' 10 | py.test ${MY_TEST_FLAGS} --junit-xml=nosetests.xml --doctest-modules pypeflow/ pwatcher/ test/ 11 | autopep8: 12 | autopep8 --max-line-length=120 -ir -j0 pypeflow/ pwatcher/ 13 | wheel: 14 | which pip3 15 | ${PIP} --no-deps . 
16 | ls -larth ${WHEELHOUSE} 17 | -------------------------------------------------------------------------------- /bamboo-specs/.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | 3 | .credentials 4 | 5 | ### Maven 6 | target/ 7 | pom.xml.tag 8 | pom.xml.releaseBackup 9 | pom.xml.versionsBackup 10 | pom.xml.next 11 | release.properties 12 | dependency-reduced-pom.xml 13 | buildNumber.properties 14 | 15 | ### Java 16 | # Compiled class file 17 | *.class 18 | 19 | # Log file 20 | *.log 21 | 22 | # Package Files # 23 | *.jar 24 | 25 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 26 | hs_err_pid* 27 | 28 | ### Idea 29 | .idea/ 30 | *.iml 31 | -------------------------------------------------------------------------------- /pypeflow/sample_tasks.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import logging 4 | from .tasks import gen_task 5 | from .simple_pwatcher_bridge import ( 6 | PypeLocalFile, makePypeLocalFile, fn, 7 | PypeTask, #Dist, 8 | ) 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | def create_task(i1, o1): 13 | script = """ 14 | cat {input.i1} > {output.o1} 15 | """ 16 | return gen_task( 17 | script=script, 18 | inputs={ 19 | 'i1': i1, 20 | }, 21 | outputs={ 22 | 'o1': o1, 23 | }, 24 | parameters={}, 25 | ) 26 | -------------------------------------------------------------------------------- /pypeflow/util.py: -------------------------------------------------------------------------------- 1 | """Old stuff 2 | Prefer io.py now. 3 | """ 4 | import logging 5 | import os 6 | from .io import (cd, touch, mkdirs, syscall as system) 7 | 8 | LOG = logging.getLogger() 9 | 10 | def run(script_fn): 11 | cwd, basename = os.path.split(script_fn) 12 | with cd(cwd): 13 | system('/bin/bash {}'.format(basename)) 14 | def rmdirs(path): 15 | if os.path.isdir(path): 16 | if len(path) < 20 and 'home' in path: 17 | LOG.error('Refusing to rm {!r} since it might be your homedir.'.format(path)) 18 | return 19 | cmd = 'rm -rf {}'.format(path) 20 | system(cmd) 21 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. PypeFlow documentation master file, created by 2 | sphinx-quickstart on Tue Jan 10 21:13:17 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | pypeFLOW 7 | ==================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | introduction 15 | installation 16 | examples 17 | concurrent_execution 18 | rdf_resprentation 19 | modules 20 | 21 | 22 | Indices and tables 23 | ================== 24 | 25 | * :ref:`genindex` 26 | * :ref:`modindex` 27 | * :ref:`search` 28 | 29 | -------------------------------------------------------------------------------- /bamboo-specs/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | bamboo-specs 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /doc/pypeflow.rst: -------------------------------------------------------------------------------- 1 | pypeflow Package 2 | ================ 3 | 4 | :mod:`common` Module 5 | -------------------- 6 | 7 | .. automodule:: pypeflow.common 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | :mod:`controller` Module 13 | ------------------------ 14 | 15 | .. automodule:: pypeflow.controller 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | :mod:`data` Module 21 | ------------------ 22 | 23 | .. automodule:: pypeflow.data 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | :mod:`task` Module 29 | ------------------ 30 | 31 | .. automodule:: pypeflow.task 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Build matrix / environment variables are explained on: 2 | # http://about.travis-ci.org/docs/user/build-configuration/ 3 | # This file can be validated on: 4 | # http://lint.travis-ci.org/ 5 | 6 | #before_install: sudo apt-get install -y cmake 7 | # cmake is pre-installed in Travis for both linux and osx 8 | 9 | #before_install: 10 | # - sudo apt-get update -qq 11 | # - sudo apt-get install -qq valgrind 12 | sudo: required 13 | os: 14 | - linux 15 | language: python 16 | #compiler: 17 | # - gcc 18 | # - clang 19 | script: ./travis.sh 20 | #env: 21 | # matrix: 22 | # - SHARED_LIB=ON STATIC_LIB=ON CMAKE_PKG=ON BUILD_TYPE=release VERBOSE_MAKE=false 23 | # - SHARED_LIB=OFF STATIC_LIB=ON CMAKE_PKG=OFF BUILD_TYPE=debug VERBOSE_MAKE=true VERBOSE 24 | notifications: 25 | email: false 26 | -------------------------------------------------------------------------------- /doc/concurrent_execution.rst: -------------------------------------------------------------------------------- 1 | Concurrent Execution 2 | ====================== 3 | 4 | ``PypeThreadTaskBase`` provides the base class for tasks that can 5 | be run concurrently. If a task is built with ``PypeThreadTaskBase``, 6 | it has to be used with ``PypeThreadWorkflow``, and all other tasks 7 | in the workflow should be ``PypeThreadTaskBase`` objects too. We simply 8 | use Python threads for concurrent tasks. Due to the Python GIL, it is 9 | not recommended to implement compute-intensive work as Python-function tasks. 10 | The main purpose of ``PypeThreadTaskBase`` is to build tasks that wrap 11 | shell commands for running locally or through a cluster environment. 
12 | In the future, it should be possible to add multiprocessing-based support 13 | for computation-intensive Python functions as tasks, avoiding the GIL. 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/tests/test_pypeflow_common.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from nose import SkipTest 3 | 4 | class TestPypeObject: 5 | def TestRDFXML(self): 6 | # pype_object = PypeObject(URL, **attributes) 7 | # assert_equal(expected, pype_object.RDFXML()) 8 | raise SkipTest # TODO: implement your test here 9 | 10 | def test___init__(self): 11 | # pype_object = PypeObject(URL, **attributes) 12 | raise SkipTest # TODO: implement your test here 13 | 14 | class TestRunShellCmd: 15 | def test_run_shell_cmd(self): 16 | # assert_equal(expected, runShellCmd(args, **kwargs)) 17 | raise SkipTest # TODO: implement your test here 18 | 19 | class TestRunSgeSyncJob: 20 | def test_run_sge_sync_job(self): 21 | # assert_equal(expected, runSgeSyncJob(args)) 22 | raise SkipTest # TODO: implement your test here 23 | 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, find_packages 2 | import subprocess 3 | 4 | try: 5 | local_version = '+git.{}'.format( 6 | subprocess.check_output('git rev-parse HEAD', shell=True, encoding='utf8')) 7 | except Exception: 8 | local_version = '' 9 | 10 | setup( 11 | name = 'pypeflow', 12 | version='2.3.0' + local_version, # should match __init__.py 13 | author='J. Chin', 14 | author_email='cschin@infoecho.net', 15 | license='LICENSE.txt', 16 | packages=find_packages(), 17 | package_dir = {'':'.'}, 18 | zip_safe = False, 19 | install_requires=[ 20 | 'networkx >=1.9.1', 21 | ], 22 | entry_points = {'console_scripts': [ 23 | 'pwatcher-main=pwatcher.mains.pwatcher:main', 24 | 'pwatcher-pypeflow-example=pwatcher.mains.pypeflow_example:main', 25 | 'heartbeat-wrapper=pwatcher.mains.fs_heartbeat:main', 26 | ], 27 | }, 28 | package_data={'pwatcher.mains': ['*.sh']} 29 | ) 30 | -------------------------------------------------------------------------------- /bamboo_wheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | type module >& /dev/null || . /mnt/software/Modules/current/init/bash 3 | module purge 4 | module load gcc 5 | 6 | set -vex 7 | ls -larth .. 8 | ls -larth 9 | pwd 10 | 11 | export WHEELHOUSE=./wheelhouse 12 | mkdir -p ${WHEELHOUSE} 13 | 14 | # Give everybody read/write access. 15 | umask 0000 16 | 17 | 18 | module load python/3.7.3 19 | make wheel 20 | 21 | # http://bamboo.pacificbiosciences.com:8085/build/admin/edit/defaultBuildArtifact.action?buildKey=SAT-TAGDEPS-JOB1 22 | # For old artifact config: 23 | mkdir -p ./artifacts/gcc-6.4.0/wheelhouse 24 | rsync -av ${WHEELHOUSE}/pypeflow*.whl artifacts/gcc-6.4.0/wheelhouse/ 25 | 26 | 27 | # Select export dir based on Bamboo branch, but only for develop and master. 
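# (Branches other than develop/master fall through the case below unmatched and are not exported to the shared wheelhouse.)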
28 | case "${bamboo_planRepository_branchName}" in 29 | develop|master) 30 | WHEELHOUSE="/mnt/software/p/python/wheelhouse/${bamboo_planRepository_branchName}/" 31 | rsync -av ./wheelhouse/ ${WHEELHOUSE} 32 | ;; 33 | *) 34 | ;; 35 | esac 36 | -------------------------------------------------------------------------------- /examples-pwatcher/ab/logging-cfg.json: -------------------------------------------------------------------------------- 1 | { 2 | "disable_existing_loggers": false, 3 | "filters": {}, 4 | "formatters": { 5 | "format_brief": { 6 | "format": "%(levelname)s: %(message)s" 7 | }, 8 | "format_full": { 9 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 10 | } 11 | }, 12 | "handlers": { 13 | "handler_file_all": { 14 | "class": "logging.FileHandler", 15 | "filename": "workflow.log", 16 | "formatter": "format_full", 17 | "level": "DEBUG", 18 | "mode": "w" 19 | }, 20 | "handler_stream": { 21 | "class": "logging.StreamHandler", 22 | "formatter": "format_brief", 23 | "level": "INFO", 24 | "stream": "ext://sys.stderr" 25 | } 26 | }, 27 | "loggers": { 28 | "": { 29 | "handlers": [ 30 | "handler_file_all", 31 | "handler_stream" 32 | ], 33 | "level": "NOTSET" 34 | } 35 | }, 36 | "root": { 37 | }, 38 | "version": 1 39 | } 40 | -------------------------------------------------------------------------------- /bamboo-specs/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /test/test_do_task.py: -------------------------------------------------------------------------------- 1 | from pypeflow import do_task as M 2 | import pytest 3 | 4 | testdata = [ 5 | # no subs 6 | ({}, {}, {}, 7 | """\ 8 | echo hello 9 | """, 10 | """\ 11 | echo hello 12 | """), 13 | # simple subs (with quoting) 14 | ({'ii': 'II'}, {'oo': 'O O'}, {'pp': 'PP DO NOT QUOTE'}, 15 | """\ 16 | echo {input.ii} 17 | echo {output.oo} 18 | echo {params.pp} 19 | """, 20 | """\ 21 | echo II 22 | echo 'O O' 23 | echo PP DO NOT QUOTE 24 | """), 25 | # input.ALL 26 | ({'ii': 'II', 'ij': 'IJ'}, {'oo': 'OO'}, {'pp': 'PP'}, 27 | """\ 28 | echo {input.ALL} 29 | echo {output.oo} 30 | echo {params.pp} 31 | """, 32 | """\ 33 | echo II IJ 34 | echo OO 35 | echo PP 36 | """), 37 | # input.i* (with quoting) 38 | ({'ii': 'II', 'ij': 'I J', 'ia': 'IA', 'ab': 'AB'}, {'oo': 'OO'}, {'pp': 'PP'}, 39 | """\ 40 | echo {input.i*} 41 | echo {input.ab} 42 | echo {output.oo} 43 | echo {params.pp} 44 | """, 45 | """\ 46 | echo 'I J' IA II 47 | echo AB 48 | echo OO 49 | echo PP 50 | """), 51 | ] 52 | 53 | @pytest.mark.parametrize("args", testdata) 54 | def test_sub(args): 55 | myi, myo, myp, t, expected = args 56 | got = M.sub(t, myi, myo, myp) 57 | assert expected == got 58 | -------------------------------------------------------------------------------- /pwatcher/mains/job_start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: filetype=sh ts=4 sw=4 sts=4 et: 3 | # 4 | # Wait until file exists, then spawn. 5 | 6 | # This is not Python because the start_tmpl from pbsmrtpipe always runs bash. 7 | # But we use the .py extension because we want this installed with our Python 8 | # code, so we do not need to deal with mobs for installation. (But we might 9 | # need to chmod +x.) 
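# In short: the loop below polls until ${PYPEFLOW_JOB_START_SCRIPT} becomes executable (waiting up to ${PYPEFLOW_JOB_START_TIMEOUT} seconds, 60 by default) and then runs it with /bin/bash.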
10 | # 11 | # This can be run via 12 | # 13 | # bash -c pwatcher/mains.job_start.py myprog 60 14 | # 15 | # Note: If anyone replaces this, you must ensure that running this is exactly equivalent 16 | # to running the "executable". In other words, no 'mkdir', no 'cd', etc. That will help 17 | # with debugging. 18 | 19 | set -vex 20 | executable=${PYPEFLOW_JOB_START_SCRIPT} 21 | timeout=${PYPEFLOW_JOB_START_TIMEOUT:-60} # wait 60s by default 22 | 23 | # Wait up to timeout seconds for the executable to become "executable", 24 | # then exec. 25 | #timeleft = int(timeout) 26 | while [[ ! -x "${executable}" ]]; do 27 | if [[ "${timeout}" == "0" ]]; then 28 | echo "timed out waiting for (${executable})" 29 | exit 77 30 | fi 31 | echo "not executable: '${executable}', waiting ${timeout}s" 32 | sleep 1 33 | timeout=$((timeout-1)) 34 | done 35 | 36 | /bin/bash ${executable} 37 | -------------------------------------------------------------------------------- /bamboo-specs/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | 6 | com.atlassian.bamboo 7 | bamboo-specs-parent 8 | 6.7.1 9 | 10 | 11 | 12 | com.pacb 13 | bamboo-specs 14 | 1.0.0-SNAPSHOT 15 | jar 16 | 17 | 18 | 19 | com.atlassian.bamboo 20 | bamboo-specs-api 21 | 22 | 23 | com.atlassian.bamboo 24 | bamboo-specs 25 | 26 | 27 | 28 | 29 | junit 30 | junit 31 | test 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /pypeflow/do_support.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import os 4 | import string 5 | import io 6 | LOG = logging.getLogger(__name__) 7 | BASH = '/bin/bash' 8 | 9 | # This is used by some programs in falcon_kit/mains. 10 | simple_logging_config = """ 11 | [loggers] 12 | keys=root 13 | 14 | [handlers] 15 | keys=stream 16 | 17 | [formatters] 18 | keys=form01,form02 19 | 20 | [logger_root] 21 | level=NOTSET 22 | handlers=stream 23 | 24 | [handler_stream] 25 | class=StreamHandler 26 | level=${FALCON_LOG_LEVEL} 27 | formatter=form01 28 | args=(sys.stderr,) 29 | 30 | [formatter_form01] 31 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s 32 | 33 | [formatter_form02] 34 | format=[%(levelname)s]%(message)s 35 | """ 36 | def setup_simple_logging(FALCON_LOG_LEVEL='DEBUG', **ignored): 37 | cfg = string.Template(simple_logging_config).substitute(FALCON_LOG_LEVEL=FALCON_LOG_LEVEL) 38 | logger_fileobj = io.StringIO(cfg) 39 | defaults = {} 40 | logging.config.fileConfig(logger_fileobj, defaults=defaults, disable_existing_loggers=False) 41 | 42 | def run_bash(script_fn): 43 | # Assume script was written by this program, so we know it is 44 | # available in the filesystem. 45 | # However, we cannot be sure that the execute permission is set, 46 | # so run it as a script. 47 | cmd = '{} -vex {}'.format(BASH, script_fn) 48 | LOG.info('!{}'.format(cmd)) 49 | rc = os.system(cmd) 50 | if rc: 51 | raise Exception('{} <- {!r}'.format(rc, cmd)) 52 | -------------------------------------------------------------------------------- /test/test_integ.py: -------------------------------------------------------------------------------- 1 | from pypeflow.simple_pwatcher_bridge import ( 2 | PypeProcWatcherWorkflow, 3 | PRODUCERS, 4 | ) 5 | from pypeflow import sample_tasks 6 | from pypeflow import util 7 | import os 8 | 9 | def setup_workflow(): 10 | PRODUCERS.clear() # Forget any PypeTasks already defined. 
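    # A sketch of what the configuration below means: 'submit' is the shell template used to launch each task (pypeflow fills in ${CMD}), 'pwatcher_type' selects the blocking process-watcher, and 'njobs' presumably caps how many submissions run at once.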
11 | 12 | job_defaults = { 13 | 'job_type': 'string', 14 | #'submit': 'bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}', 15 | 'submit': 'bash -C ${CMD}', 16 | #'JOB_OPTS': '-pe smp 8 -q bigmem', 17 | 'pwatcher_type': 'blocking', 18 | #'pwatcher_directory': config.get('pwatcher_directory', 'mypwatcher'), 19 | #'use_tmpdir': '/scratch', 20 | 'njobs': 4, 21 | } 22 | wf = PypeProcWatcherWorkflow( 23 | job_defaults=job_defaults, 24 | ) 25 | return wf 26 | 27 | def try_workflow(text, create_task): 28 | """Test the whole workflow. 29 | 'text' is anything. 30 | 'create_tasks' signature: create_task(i1, o1) 31 | """ 32 | wf = setup_workflow() 33 | wf.max_jobs = 2 34 | 35 | i1 = './in/i1' 36 | o1 = './run/dir1/o1.txt' 37 | util.mkdirs('in/') 38 | with open('in/i1', 'w') as ofs: 39 | ofs.write(text) 40 | assert os.path.exists(i1) 41 | assert not os.path.exists(o1) 42 | 43 | task = create_task(i1, o1) 44 | wf.addTask(task) 45 | wf.refreshTargets() 46 | 47 | assert os.path.exists(o1) 48 | assert text == open(o1).read() 49 | 50 | def test_new(tmpdir): 51 | with tmpdir.as_cwd(): 52 | try_workflow('bash-based', sample_tasks.create_task) 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The Clear BSD License 2 | 3 | Copyright (c) 2016, Jason Chin 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted (subject to the limitations in the disclaimer 8 | below) provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its contributors may be used 18 | to endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS 22 | LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 24 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 26 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 28 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 | OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 32 | DAMAGE. 33 | -------------------------------------------------------------------------------- /examples-pwatcher/README.md: -------------------------------------------------------------------------------- 1 | ## pwatcher 2 | Filesystem-based process-watcher. 3 | 4 | Sometimes, the filesystem is the only reliable way to communicate between 5 | processes on different machines. **pwatcher** will watch for 6 | sentinels and heartbeats. 
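For a quick taste of the input format, a `run` request is just JSON mapping job-ids to commands, e.g. `{"jobids": {"b": {"cmd": "/bin/bash jobs/b", "rundir": "./jobs"}}}` (trimmed from `examples-pwatcher/ab/run.json` in this repo; a full walk-through appears below).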
7 | 8 | Two basic ideas: 9 | 10 | 1. To store sentinel-files in a single directory, in order to reduce 11 | the burden on the filesystem. 12 | 2. To use a background thread to update heartbeat-files periodically, 13 | in order to avoid waiting forever on dead jobs. 14 | 15 | ## API 16 | **pwatcher** can be used as a separate process or as a Python module. 17 | If you use it as a module, you should use the contextmanager in order 18 | to release locks quickly. That way, users can query via the command-line 19 | even while a large job is ongoing. 20 | 21 | There are three commands in the API. 22 | 23 | 1. `run` 24 | 2. `query` 25 | 3. `delete` 26 | 27 | They all can be called from the command-line by supplying the arguments as JSON. 28 | 29 | ### Examples 30 | #### Using **pwatcher** 31 | ``` 32 | pip install -e . 33 | cd examples-pwatcher/ab 34 | pwatcher-main run < run.json 35 | pwatcher-main query < query.json 36 | pwatcher-main delete < delete.json 37 | ls pwatched/ 38 | ``` 39 | #### pypeFLOW example 40 | To run this example, you must first install **pypeFLOW**. 41 | ``` 42 | mkdir foo 43 | cd foo 44 | pypeflow_example 45 | ``` 46 | That should create: 47 | * directory `mytmp` 48 | * for pypeflow outputs 49 | * directory `watched` 50 | * `state.py` 51 | * wrappers 52 | * sentinel-files, touched on exit 53 | * heartbeat-files, usually removed when done 54 | * some basic taskrunners 55 | 56 | ### Plans 57 | The API needs a bit of clean-up, but the basic functionality is there. 58 | I still have to inject the grid-control commands. 59 | 60 | I hope to replace **FALCON**'s `fc_run.py` soon! 61 | -------------------------------------------------------------------------------- /pypeflow/tasks.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import collections 4 | import logging 5 | import os 6 | import pprint 7 | from .simple_pwatcher_bridge import (PypeTask, Dist) 8 | from . import io 9 | 10 | LOG = logging.getLogger(__name__) 11 | 12 | 13 | def task_generic_bash_script(self): 14 | """Generic script task. 15 | The script template should be in 16 | self.bash_template 17 | The template will be substituted by 18 | the content of "self" and of "self.parameters". 19 | (That is a little messy, but good enough for now.) 20 | """ 21 | self_dict = dict() 22 | self_dict.update(self.__dict__) 23 | self_dict.update(self.parameters) 24 | script_unsub = self.bash_template 25 | script = script_unsub.format(**self_dict) 26 | script_fn = 'script.sh' 27 | with open(script_fn, 'w') as ofs: 28 | ofs.write(script) 29 | self.generated_script_fn = script_fn 30 | 31 | 32 | def gen_task(script, inputs, outputs, parameters=None, dist=None): 33 | """ 34 | dist is used in two ways: 35 | 1) in the pwatcher, to control job-distribution 36 | 2) as additional parameters: 37 | - params.pypeflow_nproc 38 | - params.pypeflow_mb 39 | """ 40 | if parameters is None: 41 | parameters = dict() 42 | if dist is None: 43 | dist = Dist() 44 | LOG.debug('gen_task({}\n\tinputs={!r},\n\toutputs={!r})'.format( 45 | script, inputs, outputs)) 46 | parameters = dict(parameters) # copy 47 | parameters['pypeflow_nproc'] = dist.pypeflow_nproc 48 | parameters['pypeflow_mb'] = dist.pypeflow_mb 49 | LOG.debug(' parameters={}'.format( 50 | pprint.pformat(parameters))) 51 | LOG.debug(' dist.job_dict={}'.format( 52 | pprint.pformat(dist.job_dict))) 53 | def validate_dict(mydict): 54 | "Python identifiers are illegal as keys." 
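        # namedtuple() is used purely for validation: it raises ValueError when a key cannot serve as a field name (i.e. is not a valid Python identifier).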
55 | try: 56 | collections.namedtuple('validate', list(mydict.keys())) 57 | except ValueError as exc: 58 | LOG.exception('Bad key name in task definition dict {!r}'.format(mydict)) 59 | raise 60 | validate_dict(inputs) 61 | validate_dict(outputs) 62 | validate_dict(parameters) 63 | make_task = PypeTask( 64 | inputs={k: v for k,v in inputs.items()}, 65 | outputs={k: v for k,v in outputs.items()}, 66 | parameters=parameters, 67 | bash_template=script, 68 | dist=dist, 69 | ) 70 | return make_task(task_generic_bash_script) 71 | -------------------------------------------------------------------------------- /doc/introduction.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Introduction 3 | ============ 4 | 5 | 6 | What is pypeFLOW 7 | ================ 8 | 9 | pypeFLOW is a lightweight and reusable make-/flow-like data-processing 10 | library written in Python. 11 | 12 | Most bioinformatics analysis, or general data analysis, 13 | includes various steps: combining data files, transforming 14 | files between different formats and calculating statistics 15 | with a variety of tools. Ian Holmes has a great summary and 16 | opinions about bioinformatics workflows at 17 | http://biowiki.org/BioinformaticsWorkflows. It is 18 | interesting that such an analysis workflow is really similar to 19 | constructing software without an IDE in general. Using a 20 | "makefile" file for managing a bioinformatics analysis 21 | workflow is actually great for generating reproducible and 22 | reusable analysis procedures. Combined with a proper 23 | version control tool, one will be able to work 24 | with a divergent set of data and tools over a period of time 25 | for a project, especially when there are complicated 26 | dependencies between the data, tools and customized code 27 | for the analysis tasks. 28 | 29 | However, using "make" and "makefile" implies all data 30 | analysis steps are done by some command line tools. If you 31 | have some customized analysis tasks, you will have to write 32 | some scripts and make them into command line tools. In 33 | my personal experience, I find it is convenient to bypass 34 | that burden and to combine those quick and simple steps in a 35 | single script. The only caveat is that if an analyst does 36 | not save the results of any intermediate steps, he or she 37 | has to repeat the computation all over again for every step 38 | from the beginning. This will waste a lot of computation 39 | cycles and personal time. Well, the solution is simple: 40 | just like the traditional software building process, one 41 | has to track the dependencies, analyze them and only 42 | reprocess those parts that are necessary to get the most 43 | up-to-date final results. 
44 | 45 | General Design Principles 46 | ========================= 47 | 48 | - Explicitly modeling data and task dependencies 49 | - Support a declarative programming style within Python while 50 | maintaining what imperative programming does 51 | best 52 | - Utilize an RDF meta-data framework 53 | - Keep it simple if possible 54 | 55 | Features 56 | ======== 57 | 58 | - Scheduling and running of multiple concurrent tasks 59 | - Support for tasks as simple shell scripts (with cluster 60 | job submission in mind) 61 | - Reasonably simple interface for declarative programming 62 | 63 | -------------------------------------------------------------------------------- /example/test_shutdown.py: -------------------------------------------------------------------------------- 1 | 2 | # @author Jason Chin 3 | # 4 | # Copyright (C) 2010 by Jason Chin 5 | # Copyright (C) 2011 by Jason Chin 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 
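# This example registers one PypeTask in a PypeThreadWorkflow. The task loops for roughly a second, checking self.shutdown_event on each iteration; if the event is set it exits without creating its output, otherwise it touches /tmp/test1_output so the workflow can complete.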
24 | 25 | import sys 26 | import os 27 | 28 | 29 | from pypeflow.common import * 30 | from pypeflow.task import PypeThreadTaskBase, PypeTaskBase 31 | from pypeflow.task import PypeTask, PypeShellTask, PypeSGETask, PypeDistributibleTask 32 | from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow, PypeMPWorkflow 33 | from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn 34 | import logging 35 | import time 36 | 37 | logger = logging.getLogger() 38 | #logger.setLevel(logging.INFO) 39 | logger.setLevel(logging.DEBUG) 40 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 41 | ch = logging.StreamHandler() 42 | ch.setLevel(logging.DEBUG) 43 | ch.setFormatter(formatter) 44 | logger.addHandler(ch) 45 | 46 | inputs = {"input": makePypeLocalFile("/tmp/test1_input")} 47 | outputs = {"output": makePypeLocalFile("/tmp/test1_output")} 48 | os.system("touch /tmp/test1_input") 49 | 50 | @PypeTask(inputs = inputs, outputs = outputs, TaskType = PypeThreadTaskBase) 51 | def f(self): 52 | i = 0 53 | while 1: 54 | time.sleep(0.1) 55 | if self.shutdown_event != None and self.shutdown_event.is_set(): 56 | break 57 | if i > 10: 58 | break 59 | i += 1 60 | if self.shutdown_event == None or not self.shutdown_event.is_set(): 61 | os.system("touch %s" % fn(self.output)) 62 | 63 | wf = PypeThreadWorkflow() 64 | wf.addTasks([f]) 65 | wf.refreshTargets() 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | What is pypeFLOW 2 | ================ 3 | 4 | pypeFLOW is a lightweight and reusable make-/flow-like data-processing 5 | library written in Python. 6 | 7 | Most bioinformatics analysis, or general data analysis, 8 | includes various steps: combining data files, transforming 9 | files between different formats and calculating statistics 10 | with a variety of tools. Ian Holmes has a great summary and 11 | opinions about bioinformatics workflows at 12 | http://biowiki.org/BioinformaticsWorkflows. It is 13 | interesting that such an analysis workflow is really similar to 14 | constructing software without an IDE in general. Using a 15 | "makefile" file for managing a bioinformatics analysis 16 | workflow is actually great for generating reproducible and 17 | reusable analysis procedures. Combined with a proper 18 | version control tool, one will be able to work 19 | with a divergent set of data and tools over a period of time 20 | for a project, especially when there are complicated 21 | dependencies between the data, tools and customized code 22 | for the analysis tasks. 23 | 24 | However, using "make" and "makefile" implies all data 25 | analysis steps are done by some command line tools. If you 26 | have some customized analysis tasks, you will have to write 27 | some scripts and make them into command line tools. In 28 | my personal experience, I find it is convenient to bypass 29 | that burden and to combine those quick and simple steps in a 30 | single script. The only caveat is that if an analyst does 31 | not save the results of any intermediate steps, he or she 32 | has to repeat the computation all over again for every step 33 | from the beginning. This will waste a lot of computation 34 | cycles and personal time. 
Well, the solution is simple: 35 | just like the traditional software building process, one 36 | has to track the dependencies, analyze them and only 37 | reprocess those parts that are necessary to get the most 38 | up-to-date final results. 39 | 40 | General Design Principles 41 | ========================= 42 | 43 | - Explicitly modeling data and task dependencies 44 | - Support a declarative programming style within Python while 45 | maintaining what imperative programming does 46 | best 47 | - Utilize an RDF meta-data framework 48 | - Keep it simple if possible 49 | 50 | Features 51 | ======== 52 | 53 | - Scheduling and running of multiple concurrent tasks 54 | - Support for tasks as simple shell scripts (with cluster 55 | job submission in mind) 56 | - Reasonably simple interface for declarative programming 57 | 58 | General Installation 59 | ==================== 60 | 61 | pypeFLOW uses the standard Python setup.py for installation:: 62 | 63 | python setup.py install 64 | 65 | Once installed, brief documentation can be generated by:: 66 | 67 | cd doc 68 | make html 69 | 70 | The generated Sphinx HTML documentation can be viewed by pointing your web browser 71 | to ``_build/html/index.html`` in the ``doc`` directory. 72 | 73 | DISCLAIMER 74 | ---------- 75 | THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THIS SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR APPLICATIONS. NO ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY OF ANY KIND. ANY REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES. 76 | -------------------------------------------------------------------------------- /pwatcher/mains/pypeflow_example.py: -------------------------------------------------------------------------------- 1 | from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase, 2 | makePypeLocalFile, fn, PypeTask) 3 | import json 4 | import logging.config 5 | import os 6 | import sys 7 | 8 | JOB_TYPE = os.environ.get('JOB_TYPE', 'local') 9 | SLEEP_S = os.environ.get('SLEEP_S', '1') 10 | log = logging.getLogger(__name__) 11 | 12 | def spawn(args, check=False): 13 | cmd = args[0] 14 | log.debug('$(%s %s)' %(cmd, repr(args))) 15 | rc = os.spawnv(os.P_WAIT, cmd, args) # spawnvp for PATH lookup 16 | msg = "Call %r returned %d." % (cmd, rc) 17 | if rc: 18 | log.warning(msg) 19 | if check: 20 | raise Exception(msg) 21 | else: 22 | log.debug(msg) 23 | return rc 24 | def system(call, check=False): 25 | log.debug('$(%s)' %repr(call)) 26 | rc = os.system(call) 27 | msg = "Call %r returned %d." 
% (call, rc) 28 | if rc: 29 | log.warning(msg) 30 | if check: 31 | raise Exception(msg) 32 | else: 33 | log.debug(msg) 34 | return rc 35 | def makedirs(d): 36 | if not os.path.isdir(d): 37 | os.makedirs(d) 38 | def taskrun0(self): 39 | template = """ 40 | sleep_s=%(sleep_s)s 41 | ofile=%(ofile)s 42 | 43 | set -vex 44 | echo start0 45 | sleep ${sleep_s} 46 | touch ${ofile} 47 | echo end0 48 | """ 49 | bash = template %dict( 50 | #ifile=fn(self.i0), 51 | ofile=fn(self.f0), 52 | sleep_s=self.parameters['sleep_s'], 53 | ) 54 | log.debug('taskrun0 bash:\n' + bash) 55 | script = 'taskrun0.sh' 56 | with open(script, 'w') as ofs: 57 | ofs.write(bash) 58 | #system("bash {}".format(script), check=True) 59 | #spawn(['/bin/bash', script], check=True) # Beware! Hard to kill procs. 60 | self.generated_script_fn = script 61 | return script 62 | def taskrun1(self): 63 | template = """ 64 | sleep_s=%(sleep_s)s 65 | ifile=%(ifile)s 66 | ofile=%(ofile)s 67 | 68 | set -vex 69 | echo start1 70 | sleep ${sleep_s} 71 | cp -f ${ifile} ${ofile} 72 | echo end1 73 | """ 74 | bash = template %dict( 75 | ifile=fn(self.f0), 76 | ofile=fn(self.f1), 77 | sleep_s=self.parameters['sleep_s'], 78 | ) 79 | log.debug('taskrun1 bash:\n' + bash) 80 | script = 'taskrun1.sh' 81 | with open(script, 'w') as ofs: 82 | ofs.write(bash) 83 | #system("bash {}".format(script), check=True) 84 | self.generated_script_fn = script 85 | return script 86 | 87 | def main(): 88 | lfn = 'logging-cfg.json' 89 | if os.path.exists(lfn): 90 | logging.config.dictConfig(json.load(open(lfn))) 91 | else: 92 | logging.basicConfig() 93 | logging.getLogger().setLevel(logging.NOTSET) 94 | try: 95 | import logging_tree 96 | logging_tree.printout() 97 | except ImportError: 98 | pass 99 | log.debug('DEBUG LOGGING ON') 100 | log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format( 101 | JOB_TYPE, SLEEP_S)) 102 | exitOnFailure=False 103 | concurrent_jobs=2 104 | Workflow = PypeProcWatcherWorkflow 105 | wf = Workflow(job_type=JOB_TYPE) 106 | wf.max_jobs = concurrent_jobs 107 | 108 | par = dict(sleep_s=SLEEP_S) 109 | DIR ='mytmp' 110 | makedirs(DIR) 111 | f0 = makePypeLocalFile('mytmp/f0') 112 | f1 = makePypeLocalFile('mytmp/f1') 113 | make_task = PypeTask( 114 | inputs = {}, 115 | outputs = {'f0': f0}, 116 | parameters = par, 117 | ) 118 | task = make_task(taskrun0) 119 | wf.addTasks([task]) 120 | make_task = PypeTask( 121 | inputs = {'f0': f0}, 122 | outputs = {'f1': f1}, 123 | parameters = par, 124 | ) 125 | task = make_task(taskrun1) 126 | wf.addTasks([task]) 127 | wf.refreshTargets([task]) 128 | #wf.refreshTargets(exitOnFailure=exitOnFailure) 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /pypeflow/io.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import logging 3 | import os 4 | 5 | LOG = logging.getLogger() 6 | 7 | 8 | def mkdirs(*dirnames): 9 | for dirname in dirnames: 10 | if not dirname: 11 | continue # '' => curdir 12 | if not os.path.isdir(dirname): 13 | os.makedirs(dirname) 14 | if len(dirnames) == 1: 15 | LOG.debug('mkdir -p "{}"'.format(dirnames[0])) 16 | 17 | 18 | def syscall(call, nocheck=False): 19 | """Raise Exception in error, unless nocheck==True 20 | """ 21 | LOG.info('$(%s)' % repr(call)) 22 | rc = os.system(call) 23 | msg = 'Call %r returned %d.' 
% (call, rc) 24 | if rc: 25 | LOG.warning(msg) 26 | if not nocheck: 27 | raise Exception(msg) 28 | else: 29 | LOG.debug(msg) 30 | return rc 31 | 32 | 33 | def capture(cmd, nocheck=False): 34 | """Capture output, maybe checking return-code. 35 | Return stdout, fully captured. 36 | Wait for subproc to finish. 37 | Warn if empty. 38 | Raise on non-zero exit-code, unless nocheck. 39 | """ 40 | import subprocess 41 | LOG.info('$ {} >'.format(cmd)) 42 | proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='ascii') 43 | stdout, stderr = proc.communicate() 44 | rc = proc.returncode 45 | if rc: 46 | msg = '{} <- {!r}:\n{}'.format(rc, cmd, stdout) 47 | if nocheck: 48 | LOG.debug(msg) 49 | else: 50 | raise Exception(msg) 51 | assert stderr is None, '{!r} != None'.format(stderr) 52 | output = stdout 53 | if not output: 54 | msg = '{!r} failed to produce any output.'.format(cmd) 55 | LOG.warning(msg) 56 | return output 57 | 58 | 59 | def symlink(src, name, force=True): 60 | if os.path.lexists(name): 61 | os.unlink(name) 62 | os.symlink(src, name) 63 | 64 | 65 | def fix_relative_symlinks(currdir, origdir, recursive=True, relparent='..'): 66 | """ 67 | Fix relative symlinks after cp/rsync, assuming they had 68 | been defined relative to 'origdir'. 69 | If 'recursive', then perform this in all (non-symlinked) sub-dirs also. 70 | Skip relative links that point upward shallower than relparent, and warn. 71 | (Always skip absolute symlinks; we assume those already point to persistent space.) 72 | """ 73 | if recursive: 74 | for dn in os.listdir(currdir): 75 | if not os.path.islink(dn) and os.path.isdir(dn): 76 | fix_relative_symlinks(os.path.join(currdir, dn), os.path.join(origdir, dn), recursive, 77 | os.path.join('..', relparent)) 78 | for fn in os.listdir(currdir): 79 | fn = os.path.join(currdir, fn) 80 | if not os.path.islink(fn): 81 | continue 82 | oldlink = os.readlink(fn) 83 | if os.path.isabs(oldlink): 84 | continue 85 | if not os.path.normpath(oldlink).startswith(relparent): 86 | msg = 'Symlink {}->{} seems to point within the origdir tree. This is unexpected. relparent={}'.format( 87 | fn, oldlink, relparent) 88 | raise Exception(msg) 89 | #LOG.warning(msg) 90 | #continue 91 | newlink = os.path.relpath(os.path.join(origdir, oldlink), currdir) 92 | LOG.debug('Fix symlink to {!r} from {!r}'.format(newlink, oldlink)) 93 | symlink(newlink, fn) 94 | 95 | 96 | def rm(*f): 97 | syscall('rm -f {}'.format(' '.join(f))) 98 | 99 | 100 | def touch(*paths): 101 | msg = 'touch {!r}'.format(paths) 102 | LOG.debug(msg) 103 | for path in paths: 104 | if os.path.exists(path): 105 | os.utime(path, None) 106 | else: 107 | open(path, 'a').close() 108 | 109 | 110 | def filesize(fn): 111 | """In bytes. 112 | Raise if fn does not exist. 
113 | """ 114 | return os.stat(fn).st_size 115 | 116 | 117 | def exists_and_not_empty(fn): 118 | if not os.path.exists(fn): 119 | return False 120 | if 0 == filesize(fn): 121 | LOG.debug('File {} is empty.'.format(fn)) 122 | return False 123 | return True 124 | 125 | 126 | @contextlib.contextmanager 127 | def cd(newdir): 128 | # https://stackoverflow.com/a/24176022 129 | prevdir = os.getcwd() 130 | LOG.info('CD: %r <- %r' % (newdir, prevdir)) 131 | os.chdir(os.path.expanduser(newdir)) 132 | try: 133 | yield 134 | finally: 135 | LOG.info('CD: %r -> %r' % (newdir, prevdir)) 136 | os.chdir(prevdir) 137 | -------------------------------------------------------------------------------- /bamboo-specs/src/main/java/pacbio/PlanSpec.java: -------------------------------------------------------------------------------- 1 | package pacbio; 2 | 3 | import com.atlassian.bamboo.specs.api.BambooSpec; 4 | import com.atlassian.bamboo.specs.api.builders.BambooKey; 5 | import com.atlassian.bamboo.specs.api.builders.BambooOid; 6 | import com.atlassian.bamboo.specs.api.builders.permission.PermissionType; 7 | import com.atlassian.bamboo.specs.api.builders.permission.Permissions; 8 | import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions; 9 | import com.atlassian.bamboo.specs.api.builders.plan.Job; 10 | import com.atlassian.bamboo.specs.api.builders.plan.Plan; 11 | import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier; 12 | import com.atlassian.bamboo.specs.api.builders.plan.Stage; 13 | import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup; 14 | import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement; 15 | import com.atlassian.bamboo.specs.api.builders.plan.configuration.ConcurrentBuilds; 16 | import com.atlassian.bamboo.specs.api.builders.project.Project; 17 | import com.atlassian.bamboo.specs.api.builders.requirement.Requirement; 18 | import com.atlassian.bamboo.specs.builders.task.CheckoutItem; 19 | import com.atlassian.bamboo.specs.builders.task.ScriptTask; 20 | import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask; 21 | import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger; 22 | import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties; 23 | import com.atlassian.bamboo.specs.util.BambooServer; 24 | 25 | @BambooSpec 26 | public class PlanSpec { 27 | 28 | public Plan plan() { 29 | final Plan plan = new Plan(new Project() 30 | 31 | .key(new BambooKey("SAT")) 32 | .name("SMRT Analysis Tools (SAT)"), 33 | "pypeflow3", 34 | new BambooKey("PYPBS")) 35 | .description("Plan created from Bamboo Java Specs, modify http://bitbucket.pacificbiosciences.com:7990/projects/SAT/repos/pypeflow3/browse project to update the plan.") 36 | 37 | .pluginConfigurations(new ConcurrentBuilds() 38 | .useSystemWideDefault(false) 39 | .maximumNumberOfConcurrentBuilds(4)) 40 | .stages(new Stage("Default Stage") 41 | .jobs(new Job("Default Job", 42 | new BambooKey("JOB1")) 43 | .tasks(new VcsCheckoutTask() 44 | .description("Checkout Default Repository") 45 | .checkoutItems(new CheckoutItem().defaultRepository()), 46 | new ScriptTask() 47 | .description("build") 48 | .location(ScriptTaskProperties.Location.FILE) 49 | .fileFromPath("build.sh")) 50 | .requirements(new Requirement("system.os") 51 | .matchValue("linux") 52 | .matchType(Requirement.MatchType.EQUALS)))) 53 | .linkedRepositories("pypeflow3") 54 | 55 | .triggers(new BitbucketServerTrigger()) 56 | .planBranchManagement(new PlanBranchManagement() 57 | 
.createForPullRequest() 58 | .delete(new BranchCleanup() 59 | .whenRemovedFromRepositoryAfterDays(7) 60 | .whenInactiveInRepositoryAfterDays(30)) 61 | .notificationForCommitters()) 62 | .forceStopHungBuilds(); 63 | return plan; 64 | } 65 | 66 | public PlanPermissions planPermission() { 67 | final PlanPermissions planPermission = new PlanPermissions(new PlanIdentifier("SAT", "PYPBS")) 68 | .permissions(new Permissions() 69 | .userPermissions("cdunn", PermissionType.VIEW, PermissionType.BUILD, PermissionType.CLONE, PermissionType.EDIT, PermissionType.ADMIN) 70 | .userPermissions("bli", PermissionType.BUILD, PermissionType.CLONE, PermissionType.ADMIN, PermissionType.VIEW, PermissionType.EDIT)); 71 | return planPermission; 72 | } 73 | 74 | public static void main(String... argv) { 75 | //By default credentials are read from the '.credentials' file. 76 | BambooServer bambooServer = new BambooServer("http://bamboo.pacificbiosciences.com:8085"); 77 | final PlanSpec planSpec = new PlanSpec(); 78 | 79 | final Plan plan = planSpec.plan(); 80 | bambooServer.publish(plan); 81 | 82 | final PlanPermissions planPermission = planSpec.planPermission(); 83 | bambooServer.publish(planPermission); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /pwatcher/mains/query_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Query the heartbeat server from the command line. 3 | As an argument, takes either server:port or the falcon run directory 4 | (if not argument is given, uses the current directory). 5 | """ 6 | import argparse 7 | import collections 8 | import os 9 | import re 10 | import socket 11 | import sys 12 | 13 | STATE_FN = 'state.py' # taken from network_based.py 14 | STATE_DIR = 'mypwatcher' # taken from pwatcher_bridge.py 15 | 16 | # send message delimited with a \0 17 | def socket_send(socket, message): 18 | socket.sendall(b'{}\0'.format(message)) 19 | 20 | # receive all of \0 delimited message 21 | # may discard content past \0, if any, so not safe to call twice on same socket 22 | def socket_read(socket): 23 | buffer = bytearray(b' ' * 256) 24 | nbytes = socket.recv_into(buffer, 256) 25 | if nbytes == 0: # empty message 26 | return 27 | message = '' 28 | while nbytes != 0: 29 | try: # index() raises when it can't find the character 30 | i = buffer[:nbytes].index('\0') 31 | message += str(buffer[:i]) # discard past delimiter 32 | break 33 | except ValueError: # haven't reached end yet 34 | message += str(buffer) 35 | nbytes = socket.recv_into(buffer, 256) 36 | return message 37 | 38 | # get server and port from watcher state file 39 | def use_state(filename): 40 | with open(filename, 'r') as f: 41 | for line in f: 42 | match = re.match(r" 'server': \('([^']+)', (\d+)\)", line) 43 | if match: 44 | return (match.group(1), int(match.group(2))) 45 | print('Error: could not find server info in state file {}'.format(filename)) 46 | 47 | def parse_args(): 48 | parser = argparse.ArgumentParser(description='query falcon network based heartbeat server') 49 | parser.add_argument('-s', '--server', help='') 50 | parser.add_argument('-f', '--file', help='location of pwatcher state file') 51 | parser.add_argument('-d', '--debug', default=False, action='store_const', const=True, help='get server state instead of process list') 52 | parser.add_argument('sf', nargs='?', help='specify server or file') 53 | return parser.parse_args() 54 | 55 | # parse command line argument (if any) to find server info 56 | def 
find_server(args): 57 | i = 0 58 | if args.server: 59 | i += 1 60 | if args.file: 61 | i += 1 62 | if args.sf: 63 | i += 1 64 | if i > 1: 65 | raise Exception('Error: may only specify server once. Try "--help".') 66 | if args.sf: 67 | if os.path.exists(args.sf): 68 | args.file = args.sf 69 | else: 70 | try: 71 | args.sf.index(':') 72 | except ValueError: 73 | print('Error: could not parse argument as file or server:port: {}'.format(args.sf)) 74 | return 75 | args.server = args.sf 76 | if args.server: 77 | try: 78 | i = args.server.index(':') 79 | except ValueError: 80 | print('Error: could not parse argument as server:port: {}'.format(args.server)) 81 | return 82 | server = args.server[:i] 83 | port = int(args.server[i + 1:]) 84 | return (server, port) 85 | if not args.file: 86 | args.file = '.' 87 | if os.path.isfile(args.file): 88 | return use_state(args.file) 89 | elif os.path.isdir(args.file): 90 | if os.path.isfile(os.path.join(args.file, STATE_FN)): 91 | return use_state(os.path.join(args.file, STATE_FN)) 92 | elif os.path.isfile(os.path.join(args.file, STATE_DIR, STATE_FN)): 93 | return use_state(os.path.join(args.file, STATE_DIR, STATE_FN)) 94 | print('Error: could not find state file: {}'.format(args.file)) 95 | 96 | def main(): 97 | args = parse_args() 98 | server = find_server(args) 99 | if not server: 100 | sys.exit(1) 101 | s = socket.socket() 102 | s.connect(server) 103 | 104 | if args.debug: 105 | socket_send(s, 'D') 106 | server_state = socket_read(s) 107 | s.close() 108 | state = eval(server_state) 109 | for jobid, val in state.items(): 110 | print('{}: {} {} {} {}'.format(jobid, val[0], val[1], val[2], val[3])) 111 | else: 112 | socket_send(s, 'L') 113 | jobids = socket_read(s) 114 | s.close() 115 | for jobid in jobids.split(): 116 | s = socket.socket() 117 | s.connect(server) 118 | socket_send(s, 'Q {}'.format(jobid)) 119 | m = socket_read(s) 120 | s.close() 121 | print('{} {}'.format(jobid, m)) 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /pwatcher/mains/fs_heartbeat.py: -------------------------------------------------------------------------------- 1 | """Filesystem heartbeat wrapper 2 | 3 | Perl might be better for efficiency. 4 | But we will use python for now. 5 | 6 | Non-zero status means *this* failed, not the wrapped command. 7 | """ 8 | import argparse 9 | import os 10 | import socket 11 | import sys 12 | import threading 13 | import time 14 | 15 | DESCRIPTION = """ 16 | We wrap a system call to produce both a heartbeat and an exit-sentinel 17 | in the filesystem. 18 | """ 19 | EPILOG = """ 20 | We share stderr/stdout with the command. We log to stderr (for now). 21 | """ 22 | HEARTBEAT_TEMPLATE = '0 {pid} {pgid}\n' 23 | EXIT_TEMPLATE = '{exit_code}' 24 | 25 | class _Formatter(argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): 26 | pass 27 | _FORMATTER_CLASS = _Formatter 28 | 29 | def parse_args(args): 30 | parser = argparse.ArgumentParser( 31 | description=DESCRIPTION, 32 | epilog=EPILOG, 33 | formatter_class=_FORMATTER_CLASS, 34 | ) 35 | parser.add_argument('--rate', 36 | help='Heartbeat rate, in seconds', 37 | type=float, 38 | default=1.0, # TODO: Make this at least 10, maybe 60. 39 | ) 40 | parser.add_argument('--heartbeat-file', 41 | help='Path to heartbeat file. The first line will have the format {!r}. 
The rest are just elapsed time'.format( 42 | HEARTBEAT_TEMPLATE), 43 | required=True, 44 | ) 45 | parser.add_argument('--exit-file', 46 | help='Path to exit sentinel file. At end, it will have the format {!r}'.format( 47 | EXIT_TEMPLATE), 48 | required=True, 49 | ) 50 | parser.add_argument('--directory', 51 | help='Directory in which to run COMMAND.', 52 | default='.', 53 | ) 54 | parser.add_argument('command', 55 | help='System call (to be joined by " "). We will block on this and return its result.', 56 | nargs='+', 57 | #required=True, 58 | ) 59 | return parser.parse_args(args) 60 | 61 | def log(msg): 62 | sys.stderr.write(msg) 63 | sys.stderr.write('\n') 64 | #sys.stdout.flush() # If we use stdout. 65 | 66 | def thread_heartbeat(heartbeat_fn, sleep_s): 67 | with open(heartbeat_fn, 'w') as ofs: 68 | pid = os.getpid() 69 | pgid = os.getpgid(0) 70 | ofs.write(HEARTBEAT_TEMPLATE.format( 71 | **locals())) 72 | elapsed = 0 73 | ctime = 0 74 | while True: 75 | #ctime = time.time() 76 | ofs.write('{elapsed} {ctime}\n'.format( 77 | **locals())) 78 | ofs.flush() 79 | time.sleep(sleep_s) 80 | elapsed += 1 81 | 82 | def start_heartbeat(heartbeat_fn, sleep_s): 83 | hb = threading.Thread(target=thread_heartbeat, args=(heartbeat_fn, sleep_s)) 84 | log('alive? {}'.format( 85 | bool(hb.is_alive()))) 86 | hb.daemon = True 87 | hb.start() 88 | return hb 89 | 90 | def run(args): 91 | os.chdir(args.directory) 92 | heartbeat_fn = os.path.abspath(args.heartbeat_file) 93 | exit_fn = os.path.abspath(args.exit_file) 94 | cwd = os.getcwd() 95 | hostname = socket.getfqdn() 96 | sleep_s = args.rate 97 | log(""" 98 | cwd:{cwd!r} 99 | hostname={hostname} 100 | heartbeat_fn={heartbeat_fn!r} 101 | exit_fn={exit_fn!r} 102 | sleep_s={sleep_s!r}""".format( 103 | **locals())) 104 | if os.path.exists(exit_fn): 105 | os.remove(exit_fn) 106 | if os.path.exists(heartbeat_fn): 107 | os.remove(heartbeat_fn) 108 | #os.system('touch {}'.format(heartbeat_fn)) # This would be over-written anyway. 109 | log("before setpgid: pid={} pgid={}".format(os.getpid(), os.getpgid(0))) 110 | try: 111 | os.setpgid(0, 0) # This allows the entire tree of procs to be killed. 112 | log(" after setpgid: pid={} pgid={}".format( 113 | os.getpid(), os.getpgid(0))) 114 | except OSError as e: 115 | log(' Unable to set pgid. Possibly a grid job? Hopefully there will be no dangling processes when killed: {}'.format( 116 | repr(e))) 117 | 118 | #thread = start_heartbeat(heartbeat_fn, sleep_s) 119 | 120 | #log('alive? {} pid={} pgid={}'.format( 121 | # bool(thread.is_alive()), os.getpid(), os.getpgid(0))) 122 | 123 | call = ' '.join(args.command) 124 | log('In cwd: {}, Blocking call: {!r}'.format( 125 | os.getcwd(), call)) 126 | rc = os.system(call) # Blocking. 127 | 128 | log(' returned: {!r}'.format( 129 | rc)) 130 | 131 | # Do not delete the heartbeat here. The discoverer of the exit-sentinel will do that, 132 | # to avoid a race condition. 133 | #if os.path.exists(heartbeat_fn): 134 | # os.remove(heartbeat_fn) 135 | 136 | exit_tmp_fn = exit_fn + '.tmp' 137 | with open(exit_tmp_fn, 'w') as ofs: 138 | ofs.write(EXIT_TEMPLATE.format( 139 | exit_code=rc)) 140 | os.rename(exit_tmp_fn, exit_fn) # atomic 141 | # sys.exit(rc) # No-one would see this anyway. 
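# Note on the files produced above:
# - thread_heartbeat() (its start_heartbeat() call is currently commented out in run())
#   writes HEARTBEAT_TEMPLATE, i.e. "0 {pid} {pgid}", as the first line of the heartbeat
#   file, then appends "{elapsed} {ctime}" lines at the --rate interval.
# - The exit sentinel is first written to "<exit-file>.tmp" and then renamed into place,
#   so a reader never observes a partially written exit code.
# - The heartbeat file is deliberately left behind here; the watcher that discovers the
#   exit sentinel removes it, to avoid a race condition.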
142 | 143 | def main(): 144 | args = parse_args(sys.argv[1:]) 145 | log(repr(args)) 146 | run(args) 147 | 148 | if __name__ == "__main__": 149 | main() 150 | -------------------------------------------------------------------------------- /doc/rdf_resprentation.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | RDF representation 3 | ================== 4 | 5 | We use an RDF framework to track the relationships between different PypeFLOW objects. 6 | The relationships between objects in a workflow are described by RDF triples. 7 | 8 | Here are two properties shared by all PypeFLOW objects (defined in ``PypeObject``):: 9 | 10 | @property 11 | def _RDFGraph(self): 12 | graph = Graph() 13 | 14 | for k, v in self.__dict__.iteritems(): 15 | if k == "URL": continue 16 | if k[0] == "_": continue 17 | if hasattr(v, "URL"): 18 | graph.add( ( URIRef(self.URL), pypeNS[k], URIRef(v.URL) ) ) 19 | return graph 20 | 21 | 22 | 23 | @property 24 | def RDFXML(self): 25 | 26 | """ 27 | RDF XML representation of everything related to the PypeObject 28 | """ 29 | 30 | return self._RDFGraph.serialize() 31 | 32 | 33 | Most relations used in a workflow are likely to be constructed during declaration. 34 | In ``PypeTask``, the RDF graph is populated as:: 35 | 36 | @property 37 | def _RDFGraph(self): 38 | graph = Graph() 39 | for k, v in self.__dict__.iteritems(): 40 | if k == "URL": continue 41 | if k[0] == "_": continue 42 | if k in ["inputDataObjs", "outputDataObjs", "mutableDataObjs", "parameters"]: 43 | if k == "inputDataObjs": 44 | for ft, f in v.iteritems(): 45 | graph.add( (URIRef(self.URL), pypeNS["prereq"], URIRef(f.URL) ) ) 46 | elif k == "outputDataObjs": 47 | for ft, f in v.iteritems(): 48 | graph.add( (URIRef(f.URL), pypeNS["prereq"], URIRef(self.URL) ) ) 49 | elif k == "mutableDataObjs": 50 | for ft, f in v.iteritems(): 51 | graph.add( (URIRef(self.URL), pypeNS["hasMutable"], URIRef(f.URL) ) ) 52 | elif k == "parameters": 53 | graph.add( (URIRef(self.URL), pypeNS["hasParameters"], Literal(json.dumps(v)) ) ) 54 | 55 | continue 56 | 57 | if k in self.inputDataObjs: 58 | graph.add( ( URIRef(self.URL), pypeNS["inputDataObject"], URIRef(v.URL) ) ) 59 | continue 60 | 61 | if k in self.outputDataObjs: 62 | graph.add( ( URIRef(self.URL), pypeNS["outputDataObject"], URIRef(v.URL) ) ) 63 | continue 64 | 65 | if k in self.mutableDataObjs: 66 | graph.add( ( URIRef(self.URL), pypeNS["mutableDataObject"], URIRef(v.URL) ) ) 67 | continue 68 | 69 | if hasattr(v, "URL"): 70 | graph.add( ( URIRef(self.URL), pypeNS[k], URIRef(v.URL) ) ) 71 | 72 | graph.add( ( URIRef(self.URL), pypeNS["codeMD5digest"], Literal(self._codeMD5digest) ) ) 73 | graph.add( ( URIRef(self.URL), pypeNS["parameterMD5digest"], Literal(self._paramMD5digest) ) ) 74 | 75 | return graph 76 | 77 | Here is the code that adds the statement that the input data objects are the 78 | "prerequisite" objects of the task:: 79 | 80 | if k == "inputDataObjs": 81 | for ft, f in v.iteritems(): 82 | graph.add( (URIRef(self.URL), pypeNS["prereq"], URIRef(f.URL) ) ) 83 | 84 | Similarly, a task is a "prerequisite" object of its output data objects:: 85 | 86 | elif k == "outputDataObjs": 87 | for ft, f in v.iteritems(): 88 | graph.add( (URIRef(f.URL), pypeNS["prereq"], URIRef(self.URL) ) ) 89 | 90 | Typically, an output data object should have only a single prerequisite object.
In the case where 91 | a data object will be modified by multiple tasks, or serves as both input and output at the same 92 | time, one should specify such a data object as a ``mutableDataObject``. 93 | 94 | When a workflow traces the execution order, only the ``prereq`` relation is used. However, 95 | one can use RDF statements to store various attributes of an object. For example, in 96 | the above code, we explicitly specify the input data objects as attributes:: 97 | 98 | if k in self.inputDataObjs: 99 | graph.add( ( URIRef(self.URL), pypeNS["inputDataObject"], URIRef(v.URL) ) ) 100 | 101 | Here is an example of the RDF triples serialized as XML-RDF:: 102 | 103 | 104 | 105 | 106 | 107 | 122d234ed92c29b77c14a2c8b52c0e4c 108 | c1ce51016644b55e38bf089f47875062 109 | 110 | 111 | 112 | 113 | If we would like to group different tasks into a module, we can use an RDF statement such as:: 114 | 115 | 116 | 117 | 118 | 119 | This can be generated by inserting the following statements in Python code:: 120 | 121 | class MyTaskWithModule(PypeTask): 122 | 123 | def assign_module(self, module): 124 | self._in_modules.append(module) 125 | 126 | @property 127 | def _RDFGraph(self): 128 | g = super(MyTaskWithModule, self)._RDFGraph 129 | for m in self._in_modules: 130 | g.add( ( URIRef(self.URL), pypeNS["inModule"], URIRef(m.URL) ) ) 131 | return g 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PypeFlow.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PypeFlow.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PypeFlow" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PypeFlow" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /pwatcher/mains/network_heartbeat.py: -------------------------------------------------------------------------------- 1 | """Network server heartbeat wrapper 2 | 3 | Perl might be better for efficiency. 4 | But we will use python for now. 5 | 6 | Non-zero status means *this* failed, not the wrapped command. 7 | """ 8 | import argparse 9 | import os 10 | import shlex 11 | import socket 12 | import subprocess 13 | import sys 14 | import threading 15 | import time 16 | 17 | DESCRIPTION = """ 18 | We wrap a system call to produce a heartbeat. 19 | """ 20 | EPILOG = """ 21 | We log to the status server, and forward command stdout/stderr as well. 
22 | """ 23 | 24 | class _Formatter(argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): 25 | pass 26 | _FORMATTER_CLASS = _Formatter 27 | 28 | def parse_args(args): 29 | parser = argparse.ArgumentParser( 30 | description=DESCRIPTION, 31 | epilog=EPILOG, 32 | formatter_class=_FORMATTER_CLASS, 33 | ) 34 | parser.add_argument('--rate', 35 | help='Heartbeat rate, in seconds', 36 | type=int, 37 | default=600, 38 | ) 39 | parser.add_argument('--heartbeat-server', 40 | help='Address of the heartbeat server', 41 | required=True, 42 | ) 43 | parser.add_argument('--heartbeat-port', 44 | help='Port of the heartbeat server', 45 | type=int, 46 | required=True, 47 | ) 48 | parser.add_argument('--jobid', 49 | help='Our jobid', 50 | required=True, 51 | ) 52 | parser.add_argument('--exit-dir', 53 | help='Path to emergency exit sentinel directory', 54 | required=True, 55 | ) 56 | parser.add_argument('--directory', 57 | help='Directory in which to run COMMAND.', 58 | default='.', 59 | ) 60 | parser.add_argument('command', 61 | help='System call (to be joined by " "). We will block on this and return its result.', 62 | nargs='+', 63 | #required=True, 64 | ) 65 | return parser.parse_args(args) 66 | 67 | # send message delimited with a \0 68 | def socket_send(socket, message): 69 | socket.sendall(b'{}\0'.format(message)) 70 | 71 | def log(heartbeat_server, jobid, msg): 72 | hsocket = socket.socket() 73 | try: 74 | hsocket.connect(heartbeat_server) 75 | socket_send(hsocket, 's {} {}\n'.format(jobid, msg)) 76 | hsocket.close() 77 | except IOError: # better to miss a line than terminate 78 | pass 79 | 80 | def thread_heartbeat(heartbeat_server, jobid, sleep_s): 81 | pid = os.getpid() 82 | pgid = os.getpgid(0) 83 | hsocket = socket.socket() 84 | try: 85 | hsocket.connect(heartbeat_server) 86 | socket_send(hsocket, 'i {} {} {}'.format(jobid, pid, pgid)) 87 | hsocket.close() 88 | except IOError: # we hope it's a temporary error 89 | pass 90 | while True: 91 | time.sleep(sleep_s) 92 | hsocket = socket.socket() 93 | try: 94 | hsocket.connect(heartbeat_server) 95 | socket_send(hsocket, 'h {}'.format(jobid)) 96 | hsocket.close() 97 | except IOError: # we hope it's a temporary error 98 | pass 99 | 100 | def start_heartbeat(heartbeat_server, jobid, sleep_s): 101 | hb = threading.Thread(target=thread_heartbeat, args=(heartbeat_server, jobid, sleep_s)) 102 | log(heartbeat_server, jobid, 'alive? {}'.format( 103 | bool(hb.is_alive()))) 104 | hb.daemon = True 105 | hb.start() 106 | return hb 107 | 108 | def run(args): 109 | heartbeat_server = (args.heartbeat_server, args.heartbeat_port) 110 | jobid = args.jobid 111 | log(heartbeat_server, jobid, repr(args)) 112 | os.chdir(args.directory) 113 | exit_dir = args.exit_dir 114 | exit_fn = os.path.join(os.path.abspath(exit_dir), jobid) 115 | cwd = os.getcwd() 116 | hostname = socket.getfqdn() 117 | sleep_s = args.rate 118 | log(heartbeat_server, jobid, """ 119 | cwd:{cwd!r} 120 | hostname={hostname} 121 | heartbeat_server={heartbeat_server!r} 122 | jobid={jobid} 123 | exit_dir={exit_dir!r} 124 | sleep_s={sleep_s!r}""".format( 125 | **locals())) 126 | log(heartbeat_server, jobid, "before setpgid: pid={} pgid={}".format(os.getpid(), os.getpgid(0))) 127 | try: 128 | os.setpgid(0, 0) # This allows the entire tree of procs to be killed. 129 | log(heartbeat_server, jobid, " after setpgid: pid={} pgid={}".format( 130 | os.getpid(), os.getpgid(0))) 131 | except OSError as e: 132 | log(heartbeat_server, jobid, ' Unable to set pgid. Possibly a grid job? 
Hopefully there will be no dangling processes when killed: {}'.format( 133 | repr(e))) 134 | 135 | thread = start_heartbeat(heartbeat_server, jobid, sleep_s) 136 | 137 | log(heartbeat_server, jobid, 'alive? {} pid={} pgid={}'.format( 138 | bool(thread.is_alive()), os.getpid(), os.getpgid(0))) 139 | 140 | call = ' '.join(args.command) 141 | log(heartbeat_server, jobid, 'In cwd: {}, Blocking call: {!r}'.format( 142 | os.getcwd(), call)) 143 | sp = subprocess.Popen(shlex.split(call), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 144 | # forward all output to server until job ends, then get exit value 145 | with sp.stdout as f: 146 | for line in iter(f.readline, b''): 147 | # can't use log() for this because it appends a \n 148 | hsocket = socket.socket() 149 | try: 150 | hsocket.connect(heartbeat_server) 151 | socket_send(hsocket, 's {} {}'.format(jobid, line)) 152 | hsocket.close() 153 | except IOError: # better to miss a line than terminate 154 | pass 155 | rc = sp.wait() 156 | 157 | log(heartbeat_server, jobid, ' returned: {!r}'.format( 158 | rc)) 159 | 160 | hsocket = socket.socket() 161 | try: 162 | hsocket.connect(heartbeat_server) 163 | socket_send(hsocket, 'e {} {}'.format(jobid, rc)) 164 | hsocket.close() 165 | except IOError as e: 166 | log(heartbeat_server, jobid, 'could not update heartbeat server with exit status: {} {}: {!r}'.format(jobid, rc, e)) 167 | with open(exit_fn, 'w') as f: 168 | f.write(str(rc)) 169 | # sys.exit(rc) # No-one would see this anyway. 170 | 171 | def main(): 172 | args = parse_args(sys.argv[1:]) 173 | run(args) 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /readme.slurm.md: -------------------------------------------------------------------------------- 1 | ``` 2 | Usage: sbatch [OPTIONS...] executable [args...] 3 | 4 | Parallel run options: 5 | -a, --array=indexes job array index values 6 | -A, --account=name charge job to specified account 7 | --bb= burst buffer specifications 8 | --begin=time defer job until HH:MM MM/DD/YY 9 | -M, --clusters=names Comma separated list of clusters to issue 10 | commands to. Default is current cluster. 11 | Name of 'all' will submit to run on all clusters. 
12 | --comment=name arbitrary comment 13 | --cpu-freq=min[-max[:gov]] requested cpu frequency (and governor) 14 | -c, --cpus-per-task=ncpus number of cpus required per task 15 | -d, --dependency=type:jobid defer job until condition on jobid is satisfied 16 | -D, --workdir=directory set working directory for batch script 17 | -e, --error=err file for batch script's standard error 18 | --export[=names] specify environment variables to export 19 | --export-file=file|fd specify environment variables file or file 20 | descriptor to export 21 | --get-user-env load environment from local cluster 22 | --gid=group_id group ID to run job as (user root only) 23 | --gres=list required generic resources 24 | -H, --hold submit job in held state 25 | --ignore-pbs Ignore #PBS options in the batch script 26 | -i, --input=in file for batch script's standard input 27 | -I, --immediate exit if resources are not immediately available 28 | --jobid=id run under already allocated job 29 | -J, --job-name=jobname name of job 30 | -k, --no-kill do not kill job on node failure 31 | -L, --licenses=names required license, comma separated 32 | -m, --distribution=type distribution method for processes to nodes 33 | (type = block|cyclic|arbitrary) 34 | --mail-type=type notify on state change: BEGIN, END, FAIL or ALL 35 | --mail-user=user who to send email notification for job state 36 | changes 37 | -n, --ntasks=ntasks number of tasks to run 38 | --nice[=value] decrease scheduling priority by value 39 | --no-requeue if set, do not permit the job to be requeued 40 | --ntasks-per-node=n number of tasks to invoke on each node 41 | -N, --nodes=N number of nodes on which to run (N = min[-max]) 42 | -o, --output=out file for batch script's standard output 43 | -O, --overcommit overcommit resources 44 | -p, --partition=partition partition requested 45 | --parsable outputs only the jobid and cluster name (if present), 46 | separated by semicolon, only on successful submission. 
47 | --power=flags power management options 48 | --priority=value set the priority of the job to value 49 | --profile=value enable acct_gather_profile for detailed data 50 | value is all or none or any combination of 51 | energy, lustre, network or task 52 | --propagate[=rlimits] propagate all [or specific list of] rlimits 53 | --qos=qos quality of service 54 | -Q, --quiet quiet mode (suppress informational messages) 55 | --reboot reboot compute nodes before starting job 56 | --requeue if set, permit the job to be requeued 57 | -s, --share share nodes with other jobs 58 | -S, --core-spec=cores count of reserved cores 59 | --sicp If specified, signifies job is to receive 60 | --signal=[B:]num[@time] send signal when time limit within time seconds 61 | --switches=max-switches{@max-time-to-wait} 62 | Optimum switches and max time to wait for optimum 63 | --thread-spec=threads count of reserved threads 64 | -t, --time=minutes time limit 65 | --time-min=minutes minimum time limit (if distinct) 66 | --uid=user_id user ID to run job as (user root only) 67 | -v, --verbose verbose mode (multiple -v's increase verbosity) 68 | --wckey=wckey wckey to run job under 69 | --wrap[=command string] wrap command string in a sh script and submit 70 | 71 | Constraint options: 72 | --contiguous demand a contiguous range of nodes 73 | -C, --constraint=list specify a list of constraints 74 | -F, --nodefile=filename request a specific list of hosts 75 | --mem=MB minimum amount of real memory 76 | --mincpus=n minimum number of logical processors (threads) 77 | per node 78 | --reservation=name allocate resources from named reservation 79 | --tmp=MB minimum amount of temporary disk 80 | -w, --nodelist=hosts... request a specific list of hosts 81 | -x, --exclude=hosts... exclude a specific list of hosts 82 | 83 | Consumable resources related options: 84 | --exclusive[=user] allocate nodes in exclusive mode when 85 | cpu consumable resource is enabled 86 | --mem-per-cpu=MB maximum amount of real memory per allocated 87 | cpu required by the job. 88 | --mem >= --mem-per-cpu if --mem is specified. 89 | 90 | Affinity/Multi-core options: (when the task/affinity plugin is enabled) 91 | -B --extra-node-info=S[:C[:T]] Expands to: 92 | --sockets-per-node=S number of sockets per node to allocate 93 | --cores-per-socket=C number of cores per socket to allocate 94 | --threads-per-core=T number of threads per core to allocate 95 | each field can be 'min' or wildcard '*' 96 | total cpus requested = (N x S x C x T) 97 | 98 | --ntasks-per-core=n number of tasks to invoke on each core 99 | --ntasks-per-socket=n number of tasks to invoke on each socket 100 | 101 | 102 | Help options: 103 | -h, --help show this help message 104 | -u, --usage display brief usage message 105 | 106 | Other options: 107 | -V, --version output version information and exit 108 | ``` 109 | 110 | * https://github.com/PacificBiosciences/FALCON-integrate/issues/53 111 | * http://slurm.schedmd.com/ 112 | * http://slurm.schedmd.com/sbatch.html 113 | -------------------------------------------------------------------------------- /doc/examples.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Usage and Examples 3 | ================== 4 | 5 | pypeFLOW Objects 6 | ================ 7 | 8 | There are three different kinds of pypeFLOW objects: 9 | 10 | 1. Data Object 11 | 2. Task Object 12 | 3. 
Workflow Object 13 | 14 | Data Object 15 | ============ 16 | 17 | The data objects represent the input and output data that are 18 | processed by pypeFLOW task objects. 19 | 20 | Currently, only local file objects are implemented. In the 21 | future, more general data (e.g. a remote file or Amazon S3 22 | object) can be supported. 23 | 24 | Here is how to create an instance of a local file data object:: 25 | 26 | f1 = makePypeLocalFile("filename") 27 | 28 | The ``makePypeLocalFile`` function returns a ``PypeLocalFile`` 29 | object. It does not create the file in the file system. 30 | 31 | Task Object 32 | ============ 33 | 34 | A task object is generally created by the ``@PypeTask`` or 35 | ``@PypeShellTask`` decorator with a task function. You will 36 | need to specify the input files and the output files via the 37 | decorator's arguments. The task function should be declared with 38 | the variable argument lists ``*argv, **kwargv`` as 39 | arguments:: 40 | 41 | @PypeTask(inputDataObjs={"fasta":f1, "ref":f2}, 42 | outputDataObjs={"aln":f3}) 43 | def testTask(*argv, **kwargv): 44 | print("testTask is running") 45 | print("fasta input filename is %s" % testTask.fasta.localFileName) 46 | # do something to create the output file(s) 47 | 48 | The decorator returns a callable object with the same name as 49 | the function. The above example returns an instance of the 50 | ``PypeTaskBase`` class. Within a task function, the input 51 | and output data objects can be retrieved as instance 52 | attributes. For example, within the ``testTask`` function, 53 | ``testTask.fasta`` is the ``f1`` data object. 54 | ``testTask.fasta.localFileName`` will give the local file name 55 | of the file data object. 56 | 57 | Workflow Object 58 | =============== 59 | 60 | The workflow object contains task objects and data objects. 61 | It creates the dependency directed acyclic graph (DAG) 62 | according to the input and output data objects specified for 63 | each task. Circular dependencies will be detected. Output 64 | data files should depend on only a single task. It 65 | is generally not a good idea to write to the same "output 66 | file" from two tasks. 67 | 68 | The general pattern for specifying a workflow is: 69 | 70 | 1. Initialize a workflow object. 71 | 72 | 2. Add data objects and task objects. It is sufficient to add 73 | only the task objects; the data objects that a task object 74 | depends on will be added to the workflow 75 | automatically. 76 | 77 | 3. Specify the data objects to be "refreshed". Namely, 78 | the workflow controller will evaluate the DAG and try to 79 | update the specified data objects if necessary. If a 80 | workflow has been executed and the initial and 81 | intermediate data objects have not changed, then no task 82 | will be executed. Tasks are executed only when 83 | their dependencies are not satisfied. Currently, we use the 84 | data objects' time stamps to determine the dependencies. If 85 | the output files are newer than the input files of a 86 | task, the task will be skipped. 87 | 88 | A minimal sketch of this pattern is shown below; see the following section for a complete example.
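A minimal sketch of the three steps (using the placeholder names ``task1``, ``task2``, and ``final_output`` for objects created as described above) is::

    wf = PypeWorkflow()                # 1. initialize a workflow object
    wf.addTasks([task1, task2])        # 2. add tasks; their data objects are added automatically
    wf.refreshTargets([final_output])  # 3. run whatever tasks are needed to refresh the target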
89 | 90 | Simple Example 91 | ================ 92 | 93 | A simple workflow can look like this:: 94 | 95 | from pypeflow.common import * 96 | from pypeflow.task import PypeThreadTaskBase, PypeTaskBase 97 | from pypeflow.task import PypeTask, PypeShellTask, PypeSGETask, PypeDistributibleTask 98 | from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow 99 | from pypeflow.data import PypeLocalFile, makePypeLocalFile 100 | 101 | def simpleTest(): 102 | 103 | wf = PypeWorkflow() 104 | 105 | # f1 and f2 are the mock input files 106 | f1 = makePypeLocalFile("test.fa") 107 | f2 = makePypeLocalFile("ref.fa") 108 | 109 | # f3 is the object of the expected output of the "testTask" 110 | f3 = makePypeLocalFile("aln.txt", readOnly=False) 111 | 112 | # create the mock files 113 | os.system("touch %s" % f1.localFileName) 114 | os.system("touch %s" % f2.localFileName) 115 | 116 | # the testTask will take f1 (as "testTask.fasta") and f2 (as "testTask.ref") and generate f3 (as "testTask.aln") 117 | @PypeTask(inputDataObjs={"fasta":f1, "ref":f2}, 118 | outputDataObjs={"aln":f3}, 119 | parameters={"a":10}, **{"b":12}) 120 | def testTask(*argv, **kwargv): 121 | print("testTask is running") 122 | for ft, f in testTask.outputDataObjs.items(): 123 | #os.system("touch %s" % f.localFileName) 124 | runShellCmd(["touch", "%s" % f.localFileName]) 125 | runShellCmd(["sleep", "5" ]) 126 | 127 | # testTask2 will take f1 (as "testTask2.fasta") and f3 (as "testTask2.aln") and generate f4 (as "testTask2.aln2") 128 | f4 = makePypeLocalFile("aln2.txt", readOnly=False) 129 | @PypeTask(inputDataObjs={"fasta":f1, "aln":f3}, 130 | outputDataObjs={"aln2":f4}, 131 | parameters={"a":10}, **{"b":12}) 132 | def testTask2(*argv, **kwargv): 133 | print("testTask2 is running") 134 | for ft, f in testTask2.outputDataObjs.items(): 135 | #os.system("touch %s" % f.localFileName) 136 | runShellCmd(["touch", "%s" % f.localFileName]) 137 | 138 | # one can add objects one by one to the workflow 139 | #wf.addObjects([f1,f2,f3,f4]) 140 | #wf.addObjects([testTask, testTask2]) 141 | 142 | # or, one can add the "tasks" into the workflow; the input and output data objects will be added automatically 143 | wf.addTasks([testTask, testTask2]) 144 | 145 | # print out the RDF XML that represents the workflow 146 | print (wf.RDFXML) 147 | # a graphviz dot string for rendering the dependency graph 148 | print (wf.graphvizDot) 149 | 150 | # execute the workflow until f4 is updated 151 | wf.refreshTargets([f4]) 152 | 153 | # mock the case that f1 is updated 154 | print("re-touch f1") 155 | os.system("sleep 1;touch %s;" % f1.localFileName) 156 | wf.refreshTargets([f4]) 157 | 158 | # mock the case that f3 is updated 159 | print("re-touch f3") 160 | os.system("sleep 1;touch %s;" % f3.localFileName) 161 | 162 | The dependency graph is shown below: 163 | 164 | .. image:: Example1.png 165 | :width: 400 px 166 | 167 | In the ``example/`` directory, you can generate a more complicated mock example and execute it 168 | by running the ``PypeTest.py`` script with ``python3 PypeTest.py localshell 1``. 169 | 170 | The dependency graph of the mock workflow looks like: 171 | 172 | ..
image:: Example2.png 173 | :width: 600 px 174 | 175 | 176 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pypeFlow documentation build configuration file, created by 4 | # sphinx-quickstart on Tue Jan 10 21:13:17 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'pypeFlow' 44 | copyright = u'2012, Chen-Shan Chin' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 
84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'nature' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'pypeFlowdoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | latex_elements = { 173 | # The paper size ('letterpaper' or 'a4paper'). 174 | #'papersize': 'letterpaper', 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 
177 | #'pointsize': '10pt', 178 | 179 | # Additional stuff for the LaTeX preamble. 180 | #'preamble': '', 181 | } 182 | 183 | # Grouping the document tree into LaTeX files. List of tuples 184 | # (source start file, target name, title, author, documentclass [howto/manual]). 185 | latex_documents = [ 186 | ('index', 'pypeFlow.tex', u'pypeFlow Documentation', 187 | u'Chen-Shan Chin', 'manual'), 188 | ] 189 | 190 | # The name of an image file (relative to this directory) to place at the top of 191 | # the title page. 192 | #latex_logo = None 193 | 194 | # For "manual" documents, if this is true, then toplevel headings are parts, 195 | # not chapters. 196 | #latex_use_parts = False 197 | 198 | # If true, show page references after internal links. 199 | #latex_show_pagerefs = False 200 | 201 | # If true, show URL addresses after external links. 202 | #latex_show_urls = False 203 | 204 | # Documents to append as an appendix to all manuals. 205 | #latex_appendices = [] 206 | 207 | # If false, no module index is generated. 208 | #latex_domain_indices = True 209 | 210 | 211 | # -- Options for manual page output -------------------------------------------- 212 | 213 | # One entry per manual page. List of tuples 214 | # (source start file, name, description, authors, manual section). 215 | man_pages = [ 216 | ('index', 'pypeflow', u'PypeFlow Documentation', 217 | [u'Chen-Shan Chin'], 1) 218 | ] 219 | 220 | # If true, show URL addresses after external links. 221 | #man_show_urls = False 222 | 223 | 224 | # -- Options for Texinfo output ------------------------------------------------ 225 | 226 | # Grouping the document tree into Texinfo files. List of tuples 227 | # (source start file, target name, title, author, 228 | # dir menu entry, description, category) 229 | texinfo_documents = [ 230 | ('index', 'pypeFlow', u'pypeFlow Documentation', 231 | u'Chen-Shan Chin', 'PypeFlow', 'One line description of project.', 232 | 'Miscellaneous'), 233 | ] 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #texinfo_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #texinfo_domain_indices = True 240 | 241 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 
242 | #texinfo_show_urls = 'footnote' 243 | -------------------------------------------------------------------------------- /src/tests/test_pypeflow_data.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from nose import SkipTest 3 | import tempfile 4 | import pypeflow.data 5 | import pypeflow.task 6 | import os 7 | 8 | PypeLocalFileCollection = pypeflow.data.PypeLocalFileCollection 9 | PypeLocalFile = pypeflow.data.PypeLocalFile 10 | fn = pypeflow.data.fn 11 | 12 | class TestFn: 13 | def test_fn(self): 14 | file = PypeLocalFile("file://localhost/test1") 15 | assert fn(file) == "/test1" 16 | file = PypeLocalFile("file://localhost/test1/") 17 | assert fn(file) == "/test1/" 18 | file = PypeLocalFile("file://localhost/tmp/test1") 19 | assert fn(file) == "/tmp/test1" 20 | file = PypeLocalFile("file://localhost"+ os.path.abspath("./test1")) 21 | assert fn(file) == os.path.abspath("./test1") 22 | 23 | class TestPypeDataObjectBase: #this class can not be tested directly 24 | pass 25 | 26 | class TestPypeLocalFile: 27 | def test___init__(self): 28 | obj = PypeLocalFile("file://localhost/test") 29 | assert fn(obj) == "/test" 30 | obj = PypeLocalFile("file://localhost/test", **{"x":123}) 31 | assert obj.x == 123 32 | 33 | def test_clean(self): 34 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 35 | # assert_equal(expected, pype_local_file.clean()) 36 | raise SkipTest # TODO: implement your test here 37 | 38 | def test_exists(self): 39 | obj = PypeLocalFile("file://localhost/tmp/pypetest/test") 40 | os.system("mkdir -p /tmp/pypetest/; touch /tmp/pypetest/test") 41 | assert obj.exists == True 42 | os.system("rm /tmp/pypetest/test") 43 | assert obj.exists == False 44 | 45 | 46 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 47 | # assert_equal(expected, pype_local_file.exists()) 48 | raise SkipTest # TODO: implement your test here 49 | 50 | def test_path(self): 51 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 52 | # assert_equal(expected, pype_local_file.path()) 53 | raise SkipTest # TODO: implement your test here 54 | 55 | def test_timeStamp(self): 56 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 57 | # assert_equal(expected, pype_local_file.timeStamp()) 58 | raise SkipTest # TODO: implement your test here 59 | 60 | def test_verify(self): 61 | # pype_local_file = PypeLocalFile(URL, readOnly, **attributes) 62 | # assert_equal(expected, pype_local_file.verify()) 63 | raise SkipTest # TODO: implement your test here 64 | 65 | class TestPypeLocalFileColletion: 66 | 67 | def test___init__(self): 68 | files = PypeLocalFileCollection("files://localhost/tmp/pypetest/test1") 69 | assert files.URL == "files://localhost/tmp/pypetest/test1" 70 | assert files.localFileName == None 71 | 72 | def test_addLocalFile(self): 73 | files = PypeLocalFileCollection("files://localhost/tmp/pypetest/test1") 74 | aNewFile = PypeLocalFile("file://localhost/tmp/pypetest/test2") 75 | files.addLocalFile(aNewFile) 76 | assert files.localFileName == files.localFiles[0].localFileName 77 | assert fn(files) == fn(files.localFiles[0]) 78 | 79 | def test_timeStamp(self): 80 | raise SkipTest # TODO: implement your test here 81 | 82 | def exists(self): 83 | raise SkipTest # TODO: implement your test here 84 | 85 | class TestPypeHDF5Dataset: 86 | pass 87 | 88 | class TestPypeLocalCompositeFile: 89 | def test___init__(self): 90 | # pype_local_composite_file = PypeLocalCompositeFile(URL, 
readOnly, **attributes) 91 | raise SkipTest # TODO: implement your test here 92 | 93 | class TestMakePypeLocalFile: 94 | def test_make_pype_local_file(self): 95 | # assert_equal(expected, makePypeLocalFile(aLocalFileName, readOnly, **attributes)) 96 | raise SkipTest # TODO: implement your test here 97 | 98 | class TestPypeSplittableLocalFile: 99 | def test___init__(self): 100 | pype_splittable_local_file =\ 101 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/./test.txt", 102 | nChunk=5) 103 | for i in range(5): 104 | assert pype_splittable_local_file._splittedFiles[i].URL ==\ 105 | 'file://localhost/./%03d_test.txt' % i 106 | 107 | def test_setGatherTask(self): 108 | 109 | for i in range(5): 110 | with open("/tmp/pypetest/%03d_test_fofn.txt" % i, "w") as f: 111 | f.write("file%02d\n" % i) 112 | 113 | pype_splittable_local_file =\ 114 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test_fofn.txt", 115 | nChunk=5) 116 | with open("/tmp/pypetest/gather.sh", "w") as f: 117 | f.write("#!/bin/bash\n") 118 | f.write("if [ -e /tmp/pypetest/test_fofn.txt ]; then rm /tmp/pypetest/test_fofn.txt; fi\n") 119 | f.write("for f in %s;" % " ".join( ["%03d" % i for i in range(5)] )) 120 | f.write('do cat /tmp/pypetest/$f"_test_fofn.txt" >> /tmp/pypetest/test_fofn.txt\n') 121 | f.write("done\n") 122 | 123 | PypeShellTask = pypeflow.task.PypeShellTask 124 | PypeTaskBase = pypeflow.task.PypeTaskBase 125 | pype_splittable_local_file.setGatherTask(PypeShellTask, 126 | PypeTaskBase, 127 | "/tmp/pypetest/gather.sh") 128 | pype_splittable_local_file.getGatherTask()() 129 | 130 | with open("/tmp/pypetest/test_fofn.txt") as f: 131 | i = 0 132 | for l in f: 133 | l = l.strip() 134 | assert l == "file%02d" % i 135 | i += 1 136 | 137 | import os 138 | for i in range(5): 139 | os.system(" rm /tmp/pypetest/%03d_test_fofn.txt" % i) 140 | 141 | def test_setScatterTask(self): 142 | 143 | with open("/tmp/pypetest/test_fofn.txt", "w") as f: 144 | for i in range(5): 145 | f.write("file%02d\n" % i) 146 | 147 | pype_splittable_local_file =\ 148 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test_fofn.txt", 149 | nChunk=5) 150 | 151 | with open("/tmp/pypetest/scatter.sh", "w") as f: 152 | f.write("#!/bin/bash\n") 153 | f.write("for f in %s;" % " ".join( ["%03d" % i for i in range(5)] )) 154 | f.write('do if [ -e /tmp/pypetest/%f"_test_fofn.txt" ]; \ 155 | then rm /tmp/pypetest/$f"_test_fofn.txt"; fi\n') 156 | f.write("done\n") 157 | for i in range(5): 158 | f.write("echo file%02d > /tmp/pypetest/%03d_test_fofn.txt\n" % (i, i)) 159 | 160 | PypeShellTask = pypeflow.task.PypeShellTask 161 | PypeTaskBase = pypeflow.task.PypeTaskBase 162 | pype_splittable_local_file.setScatterTask(PypeShellTask, 163 | PypeTaskBase, 164 | "/tmp/pypetest/scatter.sh") 165 | pype_splittable_local_file.getScatterTask()() 166 | 167 | for i in range(5): 168 | with open("/tmp/pypetest/%03d_test_fofn.txt" % i) as f: 169 | l = f.read().strip() 170 | assert l == "file%02d" % i 171 | 172 | 173 | def test_getGatherTask(self): 174 | pype_splittable_local_file =\ 175 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test_fofn.txt", 176 | nChunk=5) 177 | PypeShellTask = pypeflow.task.PypeShellTask 178 | PypeTaskBase = pypeflow.task.PypeTaskBase 179 | pype_splittable_local_file.setGatherTask(PypeShellTask, PypeTaskBase, "/tmp/pypetest/gather.sh") 180 | assert pype_splittable_local_file.getGatherTask() == pype_splittable_local_file._gatherTask 181 | 
assert pype_splittable_local_file.getScatterTask() == None 182 | 183 | def test_getScatterTask(self): 184 | pype_splittable_local_file =\ 185 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test_fofn.txt", 186 | nChunk=5) 187 | PypeShellTask = pypeflow.task.PypeShellTask 188 | PypeTaskBase = pypeflow.task.PypeTaskBase 189 | pype_splittable_local_file.setScatterTask(PypeShellTask, PypeTaskBase, "/tmp/pypetest/scatter.sh") 190 | #pype_splittable_local_file.getScatterTask() 191 | assert pype_splittable_local_file.getScatterTask() == pype_splittable_local_file._scatterTask 192 | assert pype_splittable_local_file.getGatherTask() == None 193 | 194 | def test_getSplittedFiles(self): 195 | pype_splittable_local_file =\ 196 | pypeflow.data.PypeSplittableLocalFile("splittablefile://localhost/tmp/pypetest/test.txt", 197 | nChunk=5) 198 | i = 0 199 | for f in pype_splittable_local_file.getSplittedFiles(): 200 | assert f.URL ==\ 201 | 'file://localhost/tmp/pypetest/%03d_test.txt' % i 202 | i += 1 203 | -------------------------------------------------------------------------------- /pypeflow/do_task.py: -------------------------------------------------------------------------------- 1 | from . import do_support, util 2 | from .io import fix_relative_symlinks 3 | import argparse 4 | import copy 5 | import importlib 6 | import inspect 7 | import json 8 | import logging 9 | import os 10 | import pprint 11 | import re 12 | import string 13 | import sys 14 | import time 15 | from shlex import quote 16 | DONE = 'done' 17 | STATUS = 'status' 18 | TIMEOUT = 30 19 | LOG = logging.getLogger() 20 | DESCRIPTION = """Given a JSON description, call a python-function. 21 | """ 22 | EPILOG = """ 23 | The JSON looks like this: 24 | { 25 | "inputs": {"input-name": "filename"}, 26 | "outputs": {"output-name": "output-filename (relative)"}, 27 | "bash_template_fn": "template.sh", 28 | "parameters": {} 29 | } 30 | 31 | This program will run on the work host, and it will do several things: 32 | - Run in CWD. 33 | - Verify that inputs are available. (Wait til timeout if not.) 34 | - Possibly, cd to tmpdir and create symlinks from inputs. 35 | - Run the python-function. 36 | - Its module must be available (e.g. in PYTHONPATH). 37 | - Pass a kwd-dict of the union of inputs/outputs/parameters. 38 | - Ignore return-value. Expect exceptions. 39 | - Possibly, mv outputs from tmpdir to workdir. 40 | - Write exit-code into STATUS. 41 | - Touch DONE on success. 42 | """ 43 | """ 44 | (Someday, we might also support runnable Python modules, or even executables via execvp().) 45 | 46 | Note: qsub will *not* run this directly. There is a higher layer. 47 | """ 48 | 49 | def get_parser(): 50 | class _Formatter(argparse.RawDescriptionHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): 51 | pass 52 | parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG, 53 | formatter_class=_Formatter, 54 | ) 55 | parser.add_argument('--timeout', 56 | type=int, default=TIMEOUT, 57 | help='How many seconds to wait for input files (and JSON) to exist. (default: %(default)s') 58 | parser.add_argument('--tmpdir', 59 | help='Root directory to run in. 
(Sub-dir name will be based on CWD.)') 60 | parser.add_argument('json_fn', 61 | help='JSON file, as per epilog.') 62 | return parser 63 | 64 | def wait_for(fn, timeout=None): 65 | if timeout is None: 66 | global TIMEOUT 67 | timeout = copy.copy(TIMEOUT) # just to be clear 68 | try: 69 | _wait_for(fn, timeout) 70 | except BaseException: 71 | LOG.exception('Was waiting for {!r}'.format(fn)) 72 | raise 73 | 74 | def _wait_for(fn, timeout): 75 | LOG.debug('Checking existence of {!r} with timeout={}'.format(fn, timeout)) 76 | dirname = os.path.dirname(fn) 77 | if os.path.exists(dirname): 78 | if not os.access(dirname, os.X_OK): 79 | raise Exception('Cannot x into dir {!r}'.format(dirname)) 80 | while not os.path.exists(fn): 81 | if timeout > 0: 82 | time.sleep(1) 83 | timeout -= 1 84 | else: 85 | raise Exception('Timed out waiting for {!r}'.format(fn)) 86 | assert os.access(fn, os.R_OK), '{!r} not readable'.format(fn) 87 | 88 | def get_func(python_function): 89 | mod_name, func_name = os.path.splitext(python_function) 90 | func_name = func_name[1:] # skip dot 91 | mod = importlib.import_module(mod_name) 92 | func = getattr(mod, func_name) 93 | return func 94 | 95 | class OldTaskRunner(object): 96 | def __init__(self, inputs, outputs, parameters): 97 | for k,v in (list(inputs.items()) + list(outputs.items())): 98 | setattr(self, k, v) 99 | self.parameters = parameters 100 | self.inputs = inputs 101 | self.outputs = outputs 102 | 103 | def run_python_func(func, inputs, outputs, parameters): 104 | if False: 105 | kwds = dict() 106 | kwds.update(inputs) 107 | kwds.update(outputs) 108 | kwds.update(parameters) 109 | func(**kwds) 110 | else: 111 | # old way, for now 112 | cwd = os.getcwd() 113 | parameters['cwd'] = cwd 114 | self = OldTaskRunner(inputs, outputs, parameters) 115 | func(self=self) 116 | script_fn = getattr(self, 'generated_script_fn', None) 117 | if script_fn is not None: 118 | do_support.run_bash(script_fn) 119 | 120 | def run_python(python_function_name, myinputs, myoutputs, parameters): 121 | func = get_func(python_function_name) 122 | try: 123 | run_python_func(func, myinputs, myoutputs, parameters) 124 | except TypeError: 125 | # Report the actual function spec. 126 | LOG.error('For function "{}", {}'.format(python_function_name, inspect.getargspec(func))) 127 | raise 128 | 129 | class Attrs(object): 130 | """This facilitates substitution of values in string. 131 | """ 132 | def __str__(self): 133 | # For this, all values must be strings. 134 | return ' '.join(f for f in self.kwds.values()) 135 | def __getattr__(self, name): 136 | # For this, values can be string, int, float, etc. 137 | if '*' in name: 138 | re_star = re.compile('^' + name.replace('*', '.*') + '$') 139 | result = (v for (k,v) in self.kwds.items() if re_star.search(k)) 140 | elif 'ALL' == name: 141 | result = iter(self.kwds.values()) 142 | else: 143 | result = [str(self.kwds[name])] 144 | return ' '.join(self.quote(v) for v in sorted(result)) 145 | def __init__(self, kwds, quote=quote): 146 | self.kwds = kwds 147 | self.quote = quote 148 | 149 | def sub(bash_template, myinputs, myoutputs, parameters): 150 | # Set substitution dict 151 | var_dict = dict() 152 | valid_parameters = {k:v for k,v in parameters.items() if not k.startswith('_')} 153 | assert 'input' not in parameters 154 | assert 'output' not in parameters 155 | # input/output/params are the main values substituted in the subset of 156 | # snakemake which we support. 
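    # Illustrative (hypothetical) example: with inputs={"fasta": "in.fa"},
    # outputs={"txt": "out.txt"} and parameters={"nproc": 4}, a bash_template such as
    #     cat {input.fasta} > {output.txt}  # uses {params.nproc} cores
    # is rendered below as
    #     cat in.fa > out.txt  # uses 4 cores
    # Input/output values are shell-quoted (shlex.quote); parameter values are not.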
157 | var_dict['input'] = Attrs(myinputs) 158 | var_dict['output'] = Attrs(myoutputs) 159 | var_dict['params'] = Attrs(valid_parameters, quote=lambda x:x) 160 | fmtr = string.Formatter() 161 | return fmtr.vformat(bash_template, [], var_dict) 162 | 163 | def run_bash(bash_template, myinputs, myoutputs, parameters): 164 | # Like snakemake, we use bash "strict mode", but we add -vx. 165 | # http://redsymbol.net/articles/unofficial-bash-strict-mode/ 166 | prefix = """ 167 | IFS=$'\n\t' 168 | set -vxeuo pipefail 169 | hostname 170 | pwd 171 | date 172 | """ 173 | # Substitute 174 | try: 175 | task_lines = sub(bash_template, myinputs, myoutputs, parameters) 176 | except Exception: 177 | msg = """\ 178 | Failed to substitute var_dict 179 | inputs: {} 180 | outputs: {} 181 | parameters: {} 182 | into bash script: 183 | {} 184 | Possibly you forgot to use "input.foo" "output.bar" "params.fubar" etc. in your script? 185 | """.format(myinputs, myoutputs, parameters, bash_template) 186 | LOG.error(msg) 187 | raise 188 | 189 | postfix = """ 190 | date 191 | """ 192 | # Combine 193 | bash_content = prefix + task_lines + postfix 194 | 195 | # Write user_script.sh 196 | bash_fn = 'user_script.sh' 197 | with open(bash_fn, 'w') as ofs: 198 | ofs.write(bash_content) 199 | cmd = '/bin/bash {}'.format(bash_fn) 200 | util.system(cmd) 201 | 202 | def run_cfg_in_tmpdir(cfg, tmpdir, relpath): 203 | """ 204 | Accept 'inputs', 'outputs', 'parameters' in cfg. 205 | Relativize 'inputs' relative to relpath, unless running in tmpdir. 206 | ('outputs' are always relative to rundir.) 207 | If 'bash_template_fn' in cfg, then substitute and use it. 208 | """ 209 | inputs = cfg['inputs'] 210 | outputs = cfg['outputs'] 211 | parameters = cfg['parameters'] 212 | bash_template_fn = cfg['bash_template_fn'] 213 | for k,v in list(inputs.items()): 214 | if not os.path.isabs(v): 215 | inputs[k] = os.path.normpath(os.path.join(relpath, v)) 216 | if tmpdir: 217 | inputs[k] = os.path.abspath(inputs[k]) 218 | for fn in inputs.values(): 219 | wait_for(fn) 220 | wait_for(bash_template_fn) 221 | bash_template = open(bash_template_fn).read() 222 | myinputs = dict(inputs) 223 | myoutputs = dict(outputs) 224 | finaloutdir = os.getcwd() 225 | if tmpdir: 226 | import getpass 227 | user = getpass.getuser() 228 | pid = os.getpid() 229 | myrundir = '{tmpdir}/{user}/pypetmp/{finaloutdir}'.format(**locals()) 230 | util.rmdirs(myrundir) 231 | util.mkdirs(myrundir) 232 | # TODO(CD): Copy inputs w/ flock. 233 | else: 234 | myrundir = finaloutdir 235 | with util.cd(myrundir): 236 | if tmpdir: 237 | # Check again, in case we have the paths wrong. 238 | for fn in inputs.values(): 239 | wait_for(fn, 0) 240 | # TODO(CD): Write a script in wdir even when running in tmpdir (so we can see it on error). 
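# Run the substituted bash script in myrundir; when tmpdir is set, outputs are
# rsync'ed back to finaloutdir afterwards and relative symlinks are repaired.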
241 | run_bash(bash_template, myinputs, myoutputs, parameters) 242 | if tmpdir: 243 | """ 244 | for k,v in outputs.iteritems(): 245 | cmd = 'mv -f {} {}'.format( 246 | os.path.join(myrundir, v), 247 | os.path.join(finaloutdir, v)) 248 | util.system(cmd) 249 | """ 250 | cmd = 'rsync -av {}/ {}; rm -rf {}'.format(myrundir, finaloutdir, myrundir) 251 | util.system(cmd) 252 | fix_relative_symlinks(finaloutdir, myrundir, recursive=True) 253 | for fn in cfg['outputs'].values(): 254 | wait_for(fn) 255 | 256 | def run(json_fn, timeout, tmpdir): 257 | if isinstance(timeout, int): 258 | global TIMEOUT 259 | TIMEOUT = timeout 260 | wait_for(json_fn) 261 | LOG.debug('Loading JSON from {!r}'.format(json_fn)) 262 | cfg = json.loads(open(json_fn).read()) 263 | LOG.debug(pprint.pformat(cfg)) 264 | rundir = os.path.normpath(os.path.dirname(json_fn)) 265 | with util.cd(rundir): 266 | run_cfg_in_tmpdir(cfg, tmpdir, '.') 267 | 268 | def main(): 269 | parser = get_parser() 270 | parsed_args = parser.parse_args(sys.argv[1:]) 271 | try: 272 | run(**vars(parsed_args)) 273 | except Exception: 274 | LOG.critical('Error in {} with args={!r}'.format(sys.argv[0], pprint.pformat(vars(parsed_args)))) 275 | raise 276 | 277 | if __name__ == "__main__": 278 | do_support.setup_simple_logging(**os.environ) 279 | LOG.debug('Running "{}"'.format(' '.join(sys.argv))) 280 | main() 281 | -------------------------------------------------------------------------------- /example/PypeTest.py: -------------------------------------------------------------------------------- 1 | # @author Jason Chin 2 | 3 | import sys 4 | import os 5 | 6 | 7 | from pypeflow.common import * 8 | from pypeflow.task import PypeThreadTaskBase, PypeTaskBase 9 | from pypeflow.task import PypeTask, PypeShellTask, PypeSGETask, PypeDistributibleTask 10 | from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow, PypeMPWorkflow 11 | from pypeflow.data import PypeLocalFile, makePypeLocalFile 12 | import logging 13 | 14 | logger = logging.getLogger() 15 | #logger.setLevel(logging.INFO) 16 | logger.setLevel(logging.DEBUG) 17 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 18 | ch = logging.StreamHandler() 19 | ch.setLevel(logging.DEBUG) 20 | ch.setFormatter(formatter) 21 | logger.addHandler(ch) 22 | 23 | 24 | def simpleTest(): 25 | 26 | wf = PypeWorkflow() 27 | 28 | # f1 and f2 are the mock input files 29 | f1 = makePypeLocalFile("test.fa") 30 | f2 = makePypeLocalFile("ref.fa") 31 | 32 | # f3 is the object of the expected output of the "testTask" 33 | f3 = makePypeLocalFile("aln.txt", readOnly=False) 34 | 35 | # create the mock files 36 | os.system("touch %s" % f1.localFileName) 37 | os.system("touch %s" % f2.localFileName) 38 | 39 | # the testTask will take f1 (as "testTask.fasta") and f2 (as "testTask.ref") and generate f3 (as "testTask.aln") 40 | @PypeTask(inputDataObjs={"fasta":f1, "ref":f2}, 41 | outputDataObjs={"aln":f3}, 42 | parameters={"a":10}, **{"b":12}) 43 | def testTask(*argv, **kwargv): 44 | print("testTask is running") 45 | print("fasta input filename is %s" % testTask.fasta.localFileName) 46 | for ft, f in testTask.outputDataObjs.items(): 47 | #os.system("touch %s" % f.localFileName) 48 | runShellCmd(["touch", "%s" % f.localFileName]) 49 | runShellCmd(["sleep", "5" ]) 50 | 51 | # the testTask will take f1 (as "testTask.fasta") and f3 (as "testTask.aln") and generate f4 (as "testTask.aln2") 52 | f4 = makePypeLocalFile("aln2.txt", readOnly=False) 53 | @PypeTask(inputDataObjs={"fasta":f1, "aln":f3}, 54 | 
outputDataObjs={"aln2":f4}, 55 | parameters={"a":10}, **{"b":12}) 56 | def testTask2(*argv, **kwargv): 57 | print("testTask2 is running") 58 | for ft, f in testTask2.outputDataObjs.items(): 59 | #os.system("touch %s" % f.localFileName) 60 | runShellCmd(["touch", "%s" % f.localFileName]) 61 | 62 | # one can add objects one by one to the workflow 63 | #wf.addObjects([f1,f2,f3,f4]) 64 | #wf.addObjects([testTask, testTask2]) 65 | 66 | # or, one can add the "tasks" into the workflow, the input and output data objects will be added automatically 67 | wf.addTasks([testTask, testTask2]) 68 | 69 | #print out the RDFXML file that represents the workflow 70 | print (wf.RDFXML) 71 | #a graphviz dot for rendering the dependency graph if one 72 | print (wf.graphvizDot) 73 | 74 | # execute the workflow until f4 is updated 75 | wf.refreshTargets([f4]) 76 | 77 | # mock the case that f1 is updated 78 | print("re-touch f1") 79 | os.system("sleep 1;touch %s;" % f1.localFileName) 80 | wf.refreshTargets([f4]) 81 | 82 | # mock the case that f3 is updated 83 | print("re-touch f3") 84 | os.system("sleep 1;touch %s;" % f3.localFileName) 85 | 86 | def simpleTest2(): 87 | 88 | wf = PypeWorkflow() 89 | 90 | f1 = makePypeLocalFile("test.fa") 91 | f2 = makePypeLocalFile("ref.fa") 92 | f3 = makePypeLocalFile("aln.txt", readOnly=False) 93 | f4 = makePypeLocalFile("aln2.txt", readOnly=False) 94 | 95 | os.system("touch %s" % f1.localFileName) 96 | os.system("touch %s" % f2.localFileName) 97 | 98 | @PypeTask(inputDataObjs={"fasta":f1, "ref":f2}, 99 | outputDataObjs={"aln":f3}, 100 | parameters={"a":10}, **{"b":12}) 101 | def testTask(*argv, **kwargv): 102 | print("testTask is running") 103 | for ft, f in testTask.outputDataObjs.items(): 104 | #os.system("touch %s" % f.localFileName) 105 | runShellCmd(["touch", "%s" % f.localFileName]) 106 | runShellCmd(["sleep", "5" ]) 107 | 108 | @PypeTask(inputDataObjs={"fasta":f1, "aln":f3}, 109 | outputDataObjs={"aln2":f4}, 110 | parameters={"a":10}, **{"b":12}) 111 | def testTask2(*argv, **kwargv): 112 | print("testTask2 is running") 113 | for ft, f in testTask2.outputDataObjs.items(): 114 | #os.system("touch %s" % f.localFileName) 115 | runShellCmd(["touch", "%s" % f.localFileName]) 116 | 117 | #wf.addObjects([f1,f2,f3,f4]) wf.addObjects([testTask, testTask2]) 118 | 119 | wf.addTasks([testTask, testTask2]) 120 | 121 | print (wf.RDFXML) 122 | print (wf.graphvizDot) 123 | 124 | #aGraph = PypeGraph(wf._RDFGraph) print(aGraph.tSort()) 125 | 126 | wf.refreshTargets([f4]) 127 | 128 | print("re-touch f1") 129 | os.system("sleep 1;touch %s;" % f1.localFileName) 130 | wf.refreshTargets([f4]) 131 | 132 | print("re-touch f3") 133 | os.system("sleep 1;touch %s;" % f3.localFileName) 134 | 135 | def testDistributed(runmode, cleanup): 136 | logger.info("test start") 137 | baseDir = "." 
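# Build a randomized multi-layer pipeline under ./testdata: each of 5 layers holds
# a handful (3-7) of tasks whose outputs feed the next layer, exercising the
# dependency tracking and the chosen run mode (internal/localshell/sge/mixed).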
138 | import random 139 | random.seed(1984) 140 | #PypeThreadWorkflow.setNumThreadAllowed(20,20) 141 | #wf = PypeThreadWorkflow() 142 | PypeMPWorkflow.setNumThreadAllowed(20,20) 143 | wf = PypeMPWorkflow() 144 | allTasks = [] 145 | for layer in range(5): 146 | fN = random.randint(3,7) 147 | fin = [None] * fN 148 | fout = [None] * fN 149 | fmut = [None] * fN 150 | for w in range(fN): 151 | fin[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer, w) ) 152 | fout[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer+1, w) ) 153 | fmut[w] = makePypeLocalFile(baseDir + "/testdata/m_testfile_l%d_w%d.dat" % (layer+1, w) ) 154 | #wf.addObjects([fin[w], fout[w], fmut[w]]) 155 | 156 | for w in range(fN): 157 | inputDataObjs = {} 158 | outputDataObjs = {} 159 | mutableDataObjs = {} 160 | for i in range(5): 161 | inputDataObjs["infile%d" % i] = random.choice(fin) 162 | 163 | i = 0 164 | for obj in random.sample(fmut,2): 165 | #mutableDataObjs["outfile%d" % i] = obj 166 | i += 1 167 | outputDataObjs["outfile%d" % i] = fout[w] 168 | 169 | shellCmd = "sleep 1\n" + "\n".join([ "echo %d %d ... >> %s" % (layer, w, of.localFileName) for of in outputDataObjs.values() ]) + "\nsleep 10" 170 | shellCmd += "sleep 1\n" + "\n".join([ "echo %d %d ... >> %s" % (layer, w, of.localFileName) for of in mutableDataObjs.values() ]) + "\nsleep 10" 171 | shellFileName = baseDir + "/testdata/task_l%d_w%d.sh" % (layer, w) 172 | shfile = open(shellFileName, 'w') 173 | print(shellCmd, file=shfile) 174 | shfile.close() 175 | 176 | if runmode == "internal": 177 | def t1(self): 178 | runShellCmd(["sleep", "%d" % random.randint(0,20) ]) 179 | 180 | for of in self.outputDataObjs.values(): 181 | runShellCmd(["touch", of.localFileName]) 182 | 183 | task = PypeTask(inputDataObjs = inputDataObjs, 184 | outputDataObjs = outputDataObjs, 185 | mutableDataObjs = mutableDataObjs, 186 | URL="task://internal/task_l%d_w%d" % (layer, w), 187 | TaskType=PypeThreadTaskBase) ( t1 ) 188 | 189 | elif runmode == "localshell": 190 | task = PypeShellTask(inputDataObjs = inputDataObjs, 191 | outputDataObjs = outputDataObjs, 192 | mutableDataObjs = mutableDataObjs, 193 | URL="task://localshell/task_l%d_w%d" % (layer, w), 194 | TaskType=PypeThreadTaskBase) ( "%s" % shellFileName ) 195 | 196 | elif runmode == "sge": 197 | task = PypeSGETask(inputDataObjs = inputDataObjs, 198 | outputDataObjs = outputDataObjs, 199 | mutableDataObjs = mutableDataObjs, 200 | URL="task://sge/task_l%d_w%d" % (layer, w), 201 | TaskType=PypeThreadTaskBase) ( "%s" % shellFileName ) 202 | 203 | elif runmode == "mixed": 204 | #distributed = random.choice( (False, True) ) 205 | distributed = True if w % 3 == 0 else False 206 | task = PypeDistributibleTask(inputDataObjs = inputDataObjs, 207 | outputDataObjs = outputDataObjs, 208 | mutableDataObjs = mutableDataObjs, 209 | URL="task://sge/task_l%d_w%d" % (layer, w), 210 | distributed=distributed, 211 | TaskType=PypeThreadTaskBase) ( "%s" % shellFileName ) 212 | 213 | wf.addTasks([task]) 214 | allTasks.append(task) 215 | 216 | for URL in wf._pypeObjects: 217 | prereqJobURLs = [str(u) for u in wf._RDFGraph.transitive_objects(URIRef(URL), pypeNS["prereq"]) 218 | if isinstance(wf._pypeObjects[str(u)], PypeLocalFile) and str(u) != URL ] 219 | if len(prereqJobURLs) == 0: 220 | if cleanup == "1": 221 | os.system("echo start > %s" % wf._pypeObjects[URL].localFileName) 222 | pass 223 | wf.refreshTargets(allTasks) 224 | dotFile = open("test.dot","w") 225 | #print >>dotFile, wf.graphvizShortNameDot 226 | 
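# Dump the workflow's graphviz dot, RDF/XML, and (for non-internal run modes)
# makefile representations for later inspection.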
print(wf.graphvizDot, file=dotFile) 227 | dotFile.close() 228 | dotFile = open("test_short_name.dot","w") 229 | print(wf.graphvizShortNameDot, file=dotFile) 230 | dotFile.close() 231 | rdfFile = open("test.rdf","w") 232 | print(wf.RDFXML, file=rdfFile) 233 | rdfFile.close() 234 | if runmode != "internal": 235 | mkFile = open("test.mk","w") 236 | print(wf.makeFileStr, file=mkFile) 237 | mkFile.close() 238 | 239 | if __name__ == "__main__": 240 | try: 241 | testDistributed(sys.argv[1], sys.argv[2]) 242 | except IndexError: 243 | print("try: python3 PypeTest.py localshell 1") 244 | print("running simpleTest()") 245 | simpleTest() 246 | 247 | -------------------------------------------------------------------------------- /src/tests/test_pypeflow_task.py: -------------------------------------------------------------------------------- 1 | from nose.tools import assert_equal 2 | from nose import SkipTest 3 | import pypeflow.task 4 | import pypeflow.data 5 | 6 | class TestPypeTaskBase: 7 | def test___call__(self): 8 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 9 | # assert_equal(expected, pype_task_base.__call__(*argv, **kwargv)) 10 | raise SkipTest # TODO: implement your test here 11 | 12 | def test___init__(self): 13 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 14 | raise SkipTest # TODO: implement your test here 15 | 16 | def test_finalize(self): 17 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 18 | # assert_equal(expected, pype_task_base.finalize()) 19 | raise SkipTest # TODO: implement your test here 20 | 21 | def test_setInputs(self): 22 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 23 | # assert_equal(expected, pype_task_base.setInputs(inputDataObjs)) 24 | raise SkipTest # TODO: implement your test here 25 | 26 | def test_setOutputs(self): 27 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 28 | # assert_equal(expected, pype_task_base.setOutputs(outputDataObjs)) 29 | raise SkipTest # TODO: implement your test here 30 | 31 | def test_setReferenceMD5(self): 32 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 33 | # assert_equal(expected, pype_task_base.setReferenceMD5(md5Str)) 34 | raise SkipTest # TODO: implement your test here 35 | 36 | def test_status(self): 37 | # pype_task_base = PypeTaskBase(URL, *argv, **kwargv) 38 | # assert_equal(expected, pype_task_base.status()) 39 | raise SkipTest # TODO: implement your test here 40 | 41 | class TestPypeThreadTaskBase: 42 | def test___call__(self): 43 | # pype_thread_task_base = PypeThreadTaskBase() 44 | # assert_equal(expected, pype_thread_task_base.__call__(*argv, **kwargv)) 45 | raise SkipTest # TODO: implement your test here 46 | 47 | def test_nSlots(self): 48 | # pype_thread_task_base = PypeThreadTaskBase() 49 | # assert_equal(expected, pype_thread_task_base.nSlots()) 50 | raise SkipTest # TODO: implement your test here 51 | 52 | def test_setMessageQueue(self): 53 | # pype_thread_task_base = PypeThreadTaskBase() 54 | # assert_equal(expected, pype_thread_task_base.setMessageQueue(q)) 55 | raise SkipTest # TODO: implement your test here 56 | 57 | class TestPypeDistributiableTaskBase: 58 | def test___init__(self): 59 | # pype_distributiable_task_base = PypeDistributiableTaskBase(URL, *argv, **kwargv) 60 | raise SkipTest # TODO: implement your test here 61 | 62 | 63 | class TestPypeTask: 64 | def test_pype_task(self): 65 | # assert_equal(expected, PypeTask(*argv, **kwargv)) 66 | raise SkipTest # TODO: implement your test here 67 | 68 | class TestPypeShellTask: 69 | def 
test_pype_shell_task(self): 70 | # assert_equal(expected, PypeShellTask(*argv, **kwargv)) 71 | raise SkipTest # TODO: implement your test here 72 | 73 | class TestPypeSGETask: 74 | def test_pype_sge_task(self): 75 | # assert_equal(expected, PypeSGETask(*argv, **kwargv)) 76 | raise SkipTest # TODO: implement your test here 77 | 78 | class TestPypeDistributibleTask: 79 | def test_pype_distributible_task(self): 80 | # assert_equal(expected, PypeDistributibleTask(*argv, **kwargv)) 81 | raise SkipTest # TODO: implement your test here 82 | 83 | 84 | class TestTimeStampCompare: 85 | def test_time_stamp_compare(self): 86 | # assert_equal(expected, timeStampCompare(inputDataObjs, outputDataObjs, parameters)) 87 | raise SkipTest # TODO: implement your test here 88 | 89 | class TestPypeTaskCollectionBase: 90 | def test___init__(self): 91 | # pype_task_collection_base = PypeTaskCollectionBase(URL, tasks) 92 | raise SkipTest # TODO: implement your test here 93 | 94 | def test_getTasks(self): 95 | # pype_task_collection_base = PypeTaskCollectionBase(URL, tasks) 96 | # assert_equal(expected, pype_task_collection_base.getTasks()) 97 | raise SkipTest # TODO: implement your test here 98 | 99 | class TestPypeTaskCollection: 100 | def test___init__(self): 101 | # pype_task_collection = PypeTaskCollection(URL, tasks) 102 | raise SkipTest # TODO: implement your test here 103 | 104 | def test_addTask(self): 105 | # pype_task_collection = PypeTaskCollection(URL, tasks) 106 | # assert_equal(expected, pype_task_collection.addTask(task)) 107 | raise SkipTest # TODO: implement your test here 108 | 109 | def test_getTasks(self): 110 | # pype_task_collection = PypeTaskCollection(URL, tasks) 111 | # assert_equal(expected, pype_task_collection.getTasks()) 112 | raise SkipTest # TODO: implement your test here 113 | 114 | class TestPypeScatteredTasks: 115 | 116 | def test_pype_scattered_tasks(self): 117 | import os 118 | #os.system("rm -rf /tmp/pypetest/*") 119 | nChunk = 5 120 | 121 | infileObj =\ 122 | pypeflow.data.PypeSplittableLocalFile( 123 | "splittablefile://localhost/tmp/pypetest/test_in_1.txt", 124 | nChunk = nChunk) 125 | 126 | with open(infileObj.localFileName, "w") as f: 127 | for i in range(nChunk): 128 | f.write("file%02d\n" % i) 129 | 130 | def scatter(*argv, **kwargv): 131 | outputObjs = sorted( kwargv["outputDataObjs"].items() ) 132 | nOut = len(outputObjs) 133 | outputObjs = [ (o[0], o[1], open(o[1].localFileName, "w")) for o in outputObjs] 134 | with open(kwargv["inputDataObjs"]["completeFile"].localFileName,"r") as f: 135 | i = 0 136 | for l in f: 137 | outf = outputObjs[i % nOut][2] 138 | outf.write(l) 139 | i += 1 140 | for o in outputObjs: 141 | o[2].close() 142 | 143 | PypeShellTask = pypeflow.task.PypeShellTask 144 | PypeTask = pypeflow.task.PypeTask 145 | PypeTaskBase = pypeflow.task.PypeTaskBase 146 | infileObj.setScatterTask(PypeTask, PypeTaskBase, scatter) 147 | infileObj.getScatterTask()() 148 | 149 | def gather(*argv, **kwargv): 150 | inputObjs = sorted( kwargv["inputDataObjs"].items() ) 151 | with open(kwargv["outputDataObjs"]["completeFile"].localFileName,"w") as outf: 152 | for k, subfile in inputObjs: 153 | f = open(subfile.localFileName) 154 | outf.write(f.read()) 155 | f.close() 156 | 157 | outfileObj =\ 158 | pypeflow.data.PypeSplittableLocalFile( 159 | "splittablefile://localhost/tmp/pypetest/test_out_1.txt", 160 | nChunk = nChunk) 161 | 162 | outfileObj.setGatherTask(PypeTask, PypeTaskBase, gather) 163 | 164 | PypeScatteredTasks = pypeflow.task.PypeScatteredTasks 165 | 166 | 
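# The decorator below expands test_fun into nChunk sub-tasks; test_fun[i]
# reads the i-th scattered input chunk and writes the i-th output chunk.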
@PypeScatteredTasks( inputDataObjs = {"inf":infileObj}, 167 | outputDataObjs = {"outf":outfileObj} ) 168 | def test_fun(*argv, **kwargv): 169 | chunk_id = kwargv["chunk_id"] 170 | self = test_fun[chunk_id] 171 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_1.txt" % chunk_id 172 | with open( self.outf._path , "w") as f: 173 | in_f = open(self.inf.localFileName,"r") 174 | f.write("out:"+in_f.read()) 175 | in_f.close() 176 | return self.inf._path 177 | 178 | assert len(test_fun.getTasks()) == nChunk 179 | for i in range(nChunk): 180 | test_fun[i]() 181 | 182 | outfileObj.getGatherTask()() 183 | 184 | def test_pype_scattered_tasks_2(self): 185 | import os 186 | #os.system("rm -rf /tmp/pypetest/*") 187 | 188 | nChunk = 5 189 | 190 | infileObj =\ 191 | pypeflow.data.PypeSplittableLocalFile( 192 | "splittablefile://localhost/tmp/pypetest/test_in_2.txt", 193 | nChunk = nChunk) 194 | 195 | with open(infileObj.localFileName, "w") as f: 196 | for i in range(nChunk): 197 | f.write("file%02d\n" % i) 198 | 199 | with open("/tmp/pypetest/scatter.sh", "w") as f: 200 | f.write("#!/bin/bash\n") 201 | f.write("for f in %s;" % " ".join( ["%03d" % i for i in range(nChunk)] )) 202 | f.write('do if [ -e /tmp/pypetest/%f"_test_in.txt" ];\ 203 | then rm /tmp/pypetest/$f"_test_in.txt"; fi;\n') 204 | f.write("done\n") 205 | for i in range(nChunk): 206 | f.write("echo file%02d > /tmp/pypetest/%03d_test_in_2.txt\n" % (i, i)) 207 | 208 | PypeShellTask = pypeflow.task.PypeShellTask 209 | PypeTask = pypeflow.task.PypeTask 210 | PypeTaskBase = pypeflow.task.PypeTaskBase 211 | infileObj.setScatterTask(PypeShellTask, PypeTaskBase, "/tmp/pypetest/scatter.sh") 212 | infileObj.getScatterTask()() 213 | 214 | def gather(*argv, **kwargv): 215 | inputObjs = sorted( kwargv["inputDataObjs"].items() ) 216 | with open(kwargv["outputDataObjs"]["completeFile"].localFileName,"w") as outf: 217 | for k, subfile in inputObjs: 218 | f = open(subfile.localFileName) 219 | outf.write("out:"+f.read()) 220 | f.close() 221 | 222 | outfileObj =\ 223 | pypeflow.data.PypeSplittableLocalFile( 224 | "splittablefile://localhost/tmp/pypetest/test_out_2.txt", 225 | nChunk = nChunk) 226 | 227 | outfileObj.setGatherTask(PypeTask, PypeTaskBase, gather) 228 | 229 | PypeScatteredTasks = pypeflow.task.PypeScatteredTasks 230 | 231 | @PypeScatteredTasks( inputDataObjs = {"inf":infileObj}, 232 | outputDataObjs = {"outf":outfileObj}, 233 | comment="xyz") 234 | def test_fun_2(*argv, **kwargv): 235 | assert kwargv["comment"] == "xyz" 236 | chunk_id = kwargv["chunk_id"] 237 | self = test_fun_2[chunk_id] 238 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_2.txt" % chunk_id 239 | with open( self.outf._path , "w") as f: 240 | f.write("file%02d\n" % chunk_id) 241 | return self.inf._path 242 | 243 | assert len(test_fun_2.getTasks()) == nChunk 244 | for i in range(nChunk): 245 | test_fun_2[i]() 246 | 247 | outfileObj.getGatherTask()() 248 | 249 | def test_pype_scattered_tasks_3(self): 250 | import os 251 | #os.system("rm -rf /tmp/pypetest/*") 252 | nChunk = 5 253 | 254 | 255 | infileObj0 =\ 256 | pypeflow.data.PypeLocalFile( 257 | "file://localhost/tmp/pypetest/test_in_0.txt") 258 | with open(infileObj0.localFileName,"w") as f: 259 | f.write("prefix:") 260 | 261 | infileObj =\ 262 | pypeflow.data.PypeSplittableLocalFile( 263 | "splittablefile://localhost/tmp/pypetest/test_in_3.txt", 264 | nChunk = nChunk) 265 | 266 | with open(infileObj.localFileName, "w") as f: 267 | for i in range(nChunk): 268 | f.write("file%02d\n" % i) 269 | 270 | def scatter(*argv, 
**kwargv): 271 | outputObjs = sorted( kwargv["outputDataObjs"].items() ) 272 | nOut = len(outputObjs) 273 | outputObjs = [ (o[0], o[1], open(o[1].localFileName, "w")) for o in outputObjs] 274 | with open(kwargv["inputDataObjs"]["completeFile"].localFileName,"r") as f: 275 | i = 0 276 | for l in f: 277 | outf = outputObjs[i % nOut][2] 278 | outf.write(l) 279 | i += 1 280 | for o in outputObjs: 281 | o[2].close() 282 | 283 | PypeShellTask = pypeflow.task.PypeShellTask 284 | PypeTask = pypeflow.task.PypeTask 285 | PypeTaskBase = pypeflow.task.PypeTaskBase 286 | infileObj.setScatterTask(PypeTask, PypeTaskBase, scatter) 287 | infileObj.getScatterTask()() 288 | 289 | def gather(*argv, **kwargv): 290 | inputObjs = sorted( kwargv["inputDataObjs"].items() ) 291 | with open(kwargv["outputDataObjs"]["completeFile"].localFileName,"w") as outf: 292 | for k, subfile in inputObjs: 293 | f = open(subfile.localFileName) 294 | outf.write(f.read()) 295 | f.close() 296 | 297 | outfileObj3 =\ 298 | pypeflow.data.PypeSplittableLocalFile( 299 | "splittablefile://localhost/tmp/pypetest/test_out_3.txt", 300 | nChunk = nChunk) 301 | 302 | outfileObj3.setGatherTask(PypeTask, PypeTaskBase, gather) 303 | 304 | PypeScatteredTasks = pypeflow.task.PypeScatteredTasks 305 | 306 | @PypeScatteredTasks( inputDataObjs = {"inf":infileObj, "prefix":infileObj0}, 307 | outputDataObjs = {"outf":outfileObj3} ) 308 | def test_fun_3(*argv, **kwargv): 309 | chunk_id = kwargv["chunk_id"] 310 | self = test_fun_3[chunk_id] 311 | 312 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_3.txt" % chunk_id 313 | with open( self.prefix.localFileName, "r") as f: 314 | prefix = f.read() 315 | 316 | with open( self.outf._path, "w") as f: 317 | in_f = open(self.inf.localFileName,"r") 318 | f.write(prefix + in_f.read()) 319 | in_f.close() 320 | return self.inf._path 321 | 322 | assert len(test_fun_3.getTasks()) == nChunk 323 | for i in range(nChunk): 324 | test_fun_3[i]() 325 | 326 | outfileObj3.getGatherTask()() 327 | -------------------------------------------------------------------------------- /presentation/pypeFLOW_tutorial.rst: -------------------------------------------------------------------------------- 1 | 2 | pypeFLOW Tutorial 3 | ================= 4 | 5 | .. image:: escher--unbelievable-527581_1024_768.jpg 6 | :scale: 40% 7 | :align: left 8 | 9 | ----------------- 10 | 11 | What is pypeFLOW? 12 | ----------------- 13 | 14 | What is pypeFLOW? A toolkit to contruct data processing work flow 15 | 16 | Tracking data processing within the Python language 17 | 18 | .. image:: pipelines.png 19 | :scale: 70 % 20 | :align: center 21 | 22 | ----------------- 23 | 24 | Basic Objects 25 | ------------- 26 | 27 | data objects (defined in ``pypeflow.data.*``) 28 | 29 | task objects (defined in ``pypeflow.task.*``) 30 | 31 | workflow objects (defined in ``pypeflow.controller.*``) 32 | 33 | Analogous to Makefile 34 | 35 | .. code-block:: python 36 | 37 | @PypeTask( inputs = {'dep1':dep1, 'dep2':dep2}, 38 | outputs = {'target':target} ) 39 | def do_something_to_get_the_target(self, *argv, **kwargv): 40 | ... 41 | 42 | is equivalent to 43 | 44 | .. code-block:: make 45 | 46 | target: dep1 dep2 47 | do_something_to_get_the_target ... 48 | 49 | * Every PypeObjects is initialized by an URL and uniquely identifiable by it. 50 | 51 | --------------------- 52 | 53 | Data Objects 54 | ------------ 55 | 56 | ``PypeLocalFile`` is an object representing a reference to local file 57 | 58 | .. 
code-block:: python 59 | 60 | f = PypeLocalFile("file://localhost/home/jchin/test/test.txt") 61 | 62 | ``f`` is a local file at ``/home/jchin/test/test.txt`` 63 | 64 | .. code-block:: python 65 | 66 | assert f.URL == "file://localhost/home/jchin/test/test.txt" 67 | assert f.localFileName == "/home/jchin/test/test.txt" 68 | 69 | 70 | ------------------------ 71 | 72 | Basic Task Objects 73 | ------------------ 74 | 75 | `PypeTaskBase`` is the base class representing a `task` that converts some 76 | input files to some output files. 77 | 78 | Such `task` is typically constructed by using a decorator (e.g. ``PypeTask``) 79 | to wrap a function into a ``PypeTaskBase`` objects (or objects of the 80 | subclasses of ``PypeTaskBase``) 81 | 82 | One needs to specify the input and output data objects within the decorator. 83 | The data objects can be referred within the task function that gets wrapped. 84 | 85 | Example: 86 | 87 | .. code-block:: python 88 | 89 | in_file1 = PypeLocalFile("file://localhost/home/jchin/test/test.txt") 90 | 91 | @PypeTask( inputs = {"in_file1": in_file1, "in_file2": in_file2}, 92 | outputs = {"out_file2": out_file2, "out_file2": out_file2} ) 93 | def task(self, *argv, **kwargv): 94 | assert self.in_file1.localFileName == "/home/jchin/test/test.txt" 95 | #do somethings to generate out_file1 and out_file2 96 | 97 | assert task.in_file1 == in_file1 98 | 99 | ------------------------ 100 | 101 | Task Decorator is Actually a Function 102 | ------------------------------------- 103 | 104 | If you don't like Python's decorator, you can generate tasks by calling the 105 | decorator function directly. This is useful to generate a number of tasks 106 | programmatically, e.g., using a loop to generate a number of tasks. 107 | 108 | .. code-block:: python 109 | 110 | tasks = [] 111 | def task_func(self, *argv, **kwargv): 112 | # do something 113 | pass 114 | 115 | for i in range(10): 116 | # task_decorator is a function that takes a function as an input argument 117 | # and it returns a PypeTaskBase object 118 | task_decorator = PypeTask(inputs={"f":inputObjs[i]}, 119 | outputs={"g":outputObjs[i]}, 120 | URL="task://localhost/task%s" % i) 121 | t = task_decorator(task_func) 122 | tasks.append(t) 123 | 124 | ----------------------- 125 | 126 | Different Kind of Task Objects 127 | ------------------------------ 128 | 129 | Different ``*Task`` decorators can wrap different kind of function (or 130 | objects, e.g shell script strings) 131 | 132 | - ``PypeTask``, wrap Python function, run as a Python function 133 | 134 | - ``PypeShellTask``, wrap a string as shell script, run as a Python function 135 | that executes the shell script 136 | 137 | - other decorators for different purposes can be written as needed (e.g. 138 | ``PypeSGETask``) 139 | 140 | One can use ``TaskType`` keyword argument in the decorator to control the 141 | output task types 142 | 143 | - Simple task type: ``PypeTaskBase`` 144 | 145 | - Task type that can be run concurrently within different threads: ``PypeThreadTaskBase`` 146 | 147 | 148 | ----------------------- 149 | 150 | Some Examples About Tasks I 151 | --------------------------- 152 | 153 | .. code-block:: python 154 | 155 | @PypeTask( ..., TaskType = PypeTaskBase) 156 | def simple_py_func(self, *argv, **kwargv): 157 | ... 158 | 159 | @PypeTask( ..., TaskType = PypeThreadTaskBase) 160 | def simple_py_func(self, *argv, **kwargv): 161 | ... 
162 | 163 | t = PypeShellTask( ..., TaskType = PypeTaskBase)("#!/bin/bash; echo I am a task") 164 | 165 | t = PypeShellTask( ..., TaskType = PypeThreadTaskBase)("#!/bin/bash; echo I am a task") 166 | 167 | ----------------------- 168 | 169 | Some Examples About Tasks II 170 | ---------------------------- 171 | 172 | An instance of the ``PythonTaskBase`` class is a "callable" object, namely, 173 | it implements ``__call__`` method. When it gets called, it will check the 174 | dependency of the input and output objects and make a decision whether to 175 | execute the wrapped function. 176 | 177 | .. code-block:: python 178 | 179 | task_decorator = PypeTask(inputs={"f":f}, 180 | outputs={"g":g}) 181 | def task_func(self, *argv, **kwargv): 182 | do_something() 183 | 184 | # calling task_func() will return True and the original task_func is executed 185 | # if f is newer than g 186 | 187 | # assuming g does not exist 188 | task_func() # return True, do_something() is excuted, assuming g is generated 189 | # run it again 190 | task_func() # return False, the original task_func is not called, since g is newer than f 191 | 192 | 193 | 194 | ----------------------- 195 | 196 | Workflow Objects 197 | ---------------- 198 | 199 | A ``PypeWorkflow`` object contains a collection of ``PypeDataObjects`` and 200 | ``PypeTaskBase`` objects. It calculates the dependency graph and executes all 201 | tasks with the correct order. 202 | 203 | * ``PypeWorkflow``: vanilla workflow class, one task at a time 204 | * ``PypeThreadWorkflow``: workflow class that can run tasks concurrently using 205 | Python thread library 206 | * ``PypeMPWorkflow``: workflow class that can run tasks concurrently using Python 207 | multiprocessing library 208 | 209 | ----------------------- 210 | 211 | Workflow Building Pattern 212 | ------------------------- 213 | 214 | Set up a workflow object 215 | 216 | .. code-block:: python 217 | 218 | wf = PypeWorkflow(...) 219 | wf = PypeMPWorkflow(...) 220 | 221 | Set up a task 222 | 223 | - Set up data objects 224 | - Define a ``task_func`` to be wrapped 225 | - Use ``PypeTask`` decorator to create the real ``PypeTaskBase`` object 226 | 227 | Add the task into the workflow (The inputs and outputs will be added automatically) 228 | 229 | Set up more tasks and add them into the workflow (``wf.addTasks([t1,t2,...])``) 230 | 231 | Call ``wf.refreshTargets(target_list)`` to execute the tasks (only task that does not 232 | satisfy the dependency constrain will be execute) 233 | 234 | ----------------------- 235 | 236 | Put It All Together 237 | ------------------- 238 | 239 | `Code Demo `_. 240 | 241 | `Embarrassing Parallelization Workflow `_. 242 | 243 | ------------------------ 244 | 245 | Mutable Data Objects & State Objects 246 | ------------------------------------ 247 | 248 | Issue: 249 | 250 | * Side effect: If a data object (e.g. various gff, cmp.h5 files) is 251 | both input and output, we can not use it to calculate dependency. 252 | * Such file usually has some "internal states" that affect 253 | how tasks should be executed 254 | 255 | Solution 256 | 257 | * Be explicit. 258 | * introduce "mutableDataObjs" for a task indicating those data objects that a 259 | task can modified. If an object is used as "mutableDataObjs", it is not used 260 | for calculating the task dependency. 261 | * The standard "inputs" and "outputs" should be "immutable" objects within the 262 | scope of the workflow. 263 | * Special state objects to keep track the states. 
The state objects are used as 264 | the input objects and/or output objects to control the task dependency (see 265 | `Example `_) 266 | 267 | ------------------------- 268 | 269 | Output Collision Detection 270 | -------------------------- 271 | 272 | The dependency graph as a direct acyclic graph helps to find 273 | independent tasks that can be run concurrently 274 | 275 | However, in the case that multiple tasks write to the same 276 | output file, we need to detect "output collision" and do not 277 | allow tasks that writes to the same to be run concurrently. 278 | 279 | Code snippet finding tasks that can be submitted 280 | 281 | .. code-block:: python 282 | 283 | jobsReadyToBeSubmitted = [] 284 | 285 | for URL, taskObj, tStatus in sortedTaskList: 286 | prereqJobURLs = prereqJobURLMap[URL] 287 | outputCollision = False 288 | 289 | for dataObj in taskObj.outputDataObjs.values() + taskObj.mutableDataObjs.values(): 290 | for fromTaskObjURL, activeDataObjURL in activeDataObjs: 291 | if dataObj.URL == activeDataObjURL and taskObj.URL != fromTaskObjURL: 292 | logger.debug( "output collision detected for data object:"+str(dataObj)) 293 | outputCollision = True 294 | break 295 | 296 | if outputCollision: #the task can not be executed 297 | continue 298 | ... 299 | 300 | 301 | ------------------------- 302 | 303 | Scatter-Gather Pattern 304 | ---------------------- 305 | 306 | Pattern: 307 | 308 | - Start with a file 309 | 310 | - Split it into a number of small files of the same type 311 | 312 | - process them as processing the original file 313 | 314 | - generate some partial results 315 | 316 | - put partial results back into a single file 317 | 318 | Complexity 319 | 320 | - Multiple input files / output files 321 | 322 | - Chaining of scattered tasks 323 | 324 | ------------------------------------ 325 | 326 | Encapsulating Scattered Files 327 | ----------------------------- 328 | 329 | ``PypeSplittableLocalFile``: Represent a PypeData object that has two 330 | different local file representations: 331 | 332 | - the whole file (could be a virtual one) 333 | - the split files 334 | 335 | Such data object can have either a scatter task attached or a gather task 336 | attached. 337 | 338 | - If a scatter task is attached, the task will be inserted to generate the 339 | scattered files. 340 | 341 | - If a gather task is attached, the task will be inserted to generate the 342 | whole file. 343 | 344 | - If neither scatter task nor gather task is specified, then the file is 345 | mostly like intermediate data. Namely, the whole file representation is 346 | not used any place else. 347 | 348 | - One can not specify scatter task and gather task for the same object since it 349 | will create a loop. 350 | 351 | 352 | 353 | 354 | ------------------------------------ 355 | 356 | Generate Scattered Tasks 357 | ------------------------ 358 | 359 | Special decorator to generate a set of "scattered tasks": 360 | 361 | - Explicitly generating a collection of tasks that work on the split files 362 | 363 | - Special task decorators to generate the collection: 364 | 365 | ``PypeScatteredTasks``: a decorator that takes a function as an input and generate 366 | a collection of tasks that does the real work (alias as ``getPypeScatteredTasks`` 367 | to be used as a regular function) 368 | 369 | ``PypeScatteredTasks/getPypeScatteredTasks`` returns a ``PypeTaskCollection`` object 370 | which contains all the sub-tasks / scatter tasks / gather tasks. 
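A minimal sketch of the decorator usage (``per_chunk``, ``inf`` and ``outf`` are
illustrative names; see the demo linked below for a complete, runnable version):

.. code-block:: python

    # "inf" and "outf" are PypeSplittableLocalFile objects; "inf" has a
    # scatter task attached and "outf" has a gather task attached.
    @PypeScatteredTasks( inputDataObjs = {"inf": inf},
                         outputDataObjs = {"outf": outf} )
    def per_chunk(*argv, **kwargv):
        chunk_id = kwargv["chunk_id"]
        self = per_chunk[chunk_id]       # the sub-task for this chunk
        # read self.inf and write self.outf for this one chunk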
371 | 372 | When a ``PypeTaskCollection`` object is added into a workflow, the real sub-tasks are 373 | added automatically. 374 | 375 | `Example / Demo `_ 376 | 377 | ------------------------- 378 | 379 | FOFN Mapper 380 | ----------- 381 | 382 | A special decorator/function that takes a FOFN (file of file names) as the main 383 | input and generate the tasks with the inputs are the files specified in 384 | the FOFN. ( This is different from a "scatter" task which keeps the file 385 | type the same. ) 386 | 387 | .. code-block:: python 388 | 389 | def outTemplate(fn): 390 | return fn + ".out" 391 | 392 | def task(self, *argv, **kwargv): 393 | in_f = self.in_f 394 | out_f = self.out_f 395 | #do something with in_f, and write something to out_f 396 | 397 | tasks = getPypeFOFNMapTasks(FOFNFileName = "./file.fofn", 398 | outTemplateFunc = outTemplate, 399 | TaskType = PypeThreadTaskBase, 400 | parameters = dict(nSlots = 8))( alignTask ) 401 | 402 | for t in tasks:# You can run the tasks in sequential 403 | t() 404 | 405 | wf = PypeThreadWorkflow() # or run them in parallel using thread or multiprocessing 406 | wf.CONCURRENT_THREAD_ALLOWED = nproc 407 | wf.MAX_NUMBER_TASK_SLOT = nproc 408 | wf.addTasks(tasks) 409 | wf.refreshTargets(exitOnFailure=False) 410 | 411 | 412 | --------------------------------- 413 | 414 | Query Workflow Objects 415 | ---------------------- 416 | 417 | Workflows has a canonical RDF representation. One can query the DAG using SPARQ 418 | 419 | For example, give a workflow DAG, what are the workflow inputs and outputs 420 | 421 | .. code-block:: python 422 | 423 | @property 424 | def inputDataObjects(self): 425 | graph = self._RDFGraph 426 | inputObjs = [] 427 | for obj in self.dataObjects: 428 | r = graph.query('SELECT ?o WHERE {<%s> pype:prereq ?o . }' % obj.URL, 429 | initNs=dict(pype=pypeNS)) 430 | if len(r) == 0: 431 | inputObjs.append(obj) 432 | return inputObjs 433 | 434 | 435 | workflow.inputDataObjects # <- the input data objects of the whole workflow 436 | 437 | ---------------------------- 438 | 439 | Update Workflow Objects 440 | ----------------------- 441 | 442 | We can redirect the inputs and outputs to different underlying files using 443 | ``workflow.updateURL()`` 444 | 445 | .. code-block:: python 446 | 447 | def updateURL(self, oldURL, newURL): 448 | obj = self._pypeObjects[oldURL] 449 | obj._updateURL(newURL) 450 | self._pypeObjects[newURL] = obj 451 | del self._pypeObjects[oldURL] 452 | 453 | It is possible to build a workflow structure and set up the real inputs 454 | and outputs later. This is useful to setup the workflow input/output from 455 | command line options and/or an XML configuration file. 456 | 457 | .. code-block:: python 458 | 459 | for o in workflow.inputDataObjects: 460 | if o.URL == "files://virtual/xyz": 461 | realInputFile = os.path.abspath(sys.argv[1]) 462 | o.updateURL("files://localhost%s" % realInputFile) 463 | ... 
464 | 465 | ------------------------- 466 | 467 | Debugging Support 468 | ----------------- 469 | 470 | graphviz dot output 471 | 472 | logging 473 | 474 | test coverage about 70%, 22 tests now 475 | 476 | The whole thing is about 2000 LOC (without counting 477 | testing code.):: 478 | 479 | $wc src/pypeflow/*.py 480 | 481 | 0 0 0 src/pypeflow/__init__.py 482 | 148 539 4428 src/pypeflow/common.py 483 | 744 2603 28166 src/pypeflow/controller.py 484 | 313 1140 11096 src/pypeflow/data.py 485 | 814 2645 28005 src/pypeflow/task.py 486 | 2019 6927 71695 total 487 | 488 | ---------------------------- 489 | 490 | What's Next? 491 | ------------ 492 | 493 | * I will use this PypeFLOW for producing better reproducible 494 | bioinformatics analysis developed with in Python/IPython notebook 495 | 496 | * Some new features: 497 | 498 | - Supporting data object in memory? mmap file? numpy array? 499 | - Remote data objects 500 | - HDF5 data sets as native data objects 501 | - direct python function execution (through IPython parallel or Pyro like RPC call) 502 | 503 | * Similar framework for streaming data processing rather than batch data 504 | processing 505 | 506 | -------------------------------------------------------------------------------- /pwatcher/blocking.py: -------------------------------------------------------------------------------- 1 | """Blocking process-watcher. 2 | 3 | See fs_based.py. Here, delete is a no-op, and run() starts threads, so 4 | the main program needs to wait for threads to finish somehow. 5 | 6 | Typical submission_string: 7 | 8 | qsub -S /bin/bash -sync y -V -q production -N ${JOB_ID} \\\n -o "${STDOUT_FILE}" \\\n -e "${STDERR_FILE}" \\\n -pe smp ${NPROC} -l h_vmem=${MB}M \\\n "${CMD}" 9 | """ 10 | try: 11 | from shlex import quote 12 | except ImportError: 13 | from pipes import quote 14 | import collections 15 | import contextlib 16 | import copy 17 | import glob 18 | import json 19 | import logging 20 | import os 21 | import pprint 22 | import re 23 | import signal 24 | import string 25 | import subprocess 26 | import sys 27 | import threading 28 | import time 29 | import traceback 30 | 31 | log = logging.getLogger(__name__) 32 | 33 | LOCAL_SUBMISSION_STRING = '/bin/bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}' # for job_local override 34 | STATE_FN = 'state.py' 35 | Job = collections.namedtuple('Job', ['jobid', 'cmd', 'rundir', 'options']) 36 | MetaJob = collections.namedtuple('MetaJob', ['job', 'lang_exe']) 37 | lang_python_exe = sys.executable 38 | lang_bash_exe = '/bin/bash' 39 | 40 | @contextlib.contextmanager 41 | def cd(newdir): 42 | prevdir = os.getcwd() 43 | log.debug('CD: %r <- %r' %(newdir, prevdir)) 44 | os.chdir(os.path.expanduser(newdir)) 45 | try: 46 | yield 47 | finally: 48 | log.debug('CD: %r -> %r' %(newdir, prevdir)) 49 | os.chdir(prevdir) 50 | 51 | class MetaJobClass(object): 52 | ext = { 53 | lang_python_exe: '.py', 54 | lang_bash_exe: '.bash', 55 | } 56 | def get_wrapper(self): 57 | # Totally by convention, for now. 58 | return '%s/run-%s%s' %(self.mj.job.rundir, self.mj.job.jobid, self.ext[self.mj.lang_exe]) 59 | def get_sentinel(self): 60 | return 'exit-%s' %self.mj.job.jobid # in watched dir 61 | def get_pid(self): 62 | return self.mj.pid 63 | def kill(self, pid, sig): 64 | stored_pid = self.get_pid() 65 | if not pid: 66 | pid = stored_pid 67 | log.info('Not passed a pid to kill. 
Using stored pid:%s' %pid) 68 | if pid and stored_pid: 69 | if pid != stored_pid: 70 | log.error('pid:%s != stored_pid:%s' %(pid, stored_pid)) 71 | os.kill(pid, sig) 72 | def __init__(self, mj): 73 | self.mj = mj 74 | class State(object): 75 | def notify_threaded(self, jobid): 76 | self.jobids_threaded.add(jobid) 77 | def notify_started(self, jobid): 78 | #state.top['jobids_submitted'].append(jobid) 79 | self.jobids_submitted.add(jobid) 80 | self.jobids_threaded.remove(jobid) 81 | log.debug('Thread notify_started({}).'.format(jobid)) 82 | def notify_exited(self, jobid, rc): 83 | #self.top['jobid2exit'][jobid] = rc 84 | self.jobid2exit[jobid] = rc 85 | self.jobids_submitted.remove(jobid) 86 | log.debug('Thread notify_exited({}->{}).'.format(jobid, rc)) 87 | def set_job(self, jobid, mjob): 88 | # Is this needed? For now, we are not actually saving state, so no. 89 | self.top['jobs'][jobid] = mjob 90 | def update_jobid2status(self, jobid2status): 91 | for jobid in self.jobids_threaded: 92 | status = 'THREADED' 93 | jobid2status[jobid] = status 94 | for jobid in self.jobids_submitted: 95 | status = 'RUNNING' 96 | # but actually it might not have started yet, or it could be dead, since we have blocking qsub calls 97 | jobid2status[jobid] = status 98 | for jobid, rc in self.jobid2exit.items(): 99 | status = 'EXIT {}'.format(rc) 100 | jobid2status[jobid] = status 101 | def get_running_jobids(self): 102 | return list(self.jobids_submitted) 103 | def serialize(self): 104 | return pprint.pformat(self.top) 105 | @staticmethod 106 | def deserialize(directory, content): 107 | state = State(directory) 108 | state.top = eval(content) 109 | state.content_prev = content 110 | return state 111 | @staticmethod 112 | def create(directory): 113 | state = State(directory) 114 | #makedirs(state.get_directory_wrappers()) 115 | #makedirs(state.get_directory_jobs()) 116 | return state 117 | def __init__(self, directory): 118 | self.__directory = os.path.abspath(directory) 119 | self.content_prev = '' 120 | self.top = dict() # for serialization, when we decide we need it 121 | self.top['jobs'] = dict() 122 | #self.top['jobids_submitted'] = list() 123 | #self.top['jobid2exit'] = dict() 124 | self.jobids_threaded = set() 125 | self.jobids_submitted = set() 126 | self.jobid2exit = dict() 127 | 128 | class SafeState(object): 129 | """Synchronized State proxy for accessing any 130 | data which might be modified in a Thread. 131 | """ 132 | def notify_threaded(self, jobid): 133 | with self.lock: 134 | self.state.notify_threaded(jobid) 135 | def notify_started(self, jobid): 136 | with self.lock: 137 | self.state.notify_started(jobid) 138 | def notify_exited(self, jobid, rc): 139 | with self.lock: 140 | self.state.notify_exited(jobid, rc) 141 | def update_jobid2status(self, table): 142 | with self.lock: 143 | return self.state.update_jobid2status(table) 144 | def get_running_jobids(self): 145 | with self.lock: 146 | return self.state.get_running_jobids() 147 | def serialize(self): 148 | with self.lock: 149 | return self.state.serialize() 150 | def __getattr__(self, name): 151 | """For all other methods, just delegate. 152 | """ 153 | return getattr(self.state, name) 154 | def __init__(self, state): 155 | self.state = state 156 | self.lock = threading.Lock() 157 | 158 | def get_state(directory): 159 | """For now, we never write. 160 | """ 161 | state_fn = os.path.join(directory, STATE_FN) 162 | if not os.path.exists(state_fn): 163 | return State.create(directory) 164 | assert False, 'No state directory needed, for now.' 
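# NOTE: with the assert above in place, the code below is effectively dead;
# it is kept for when on-disk state gets re-enabled.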
165 | try: 166 | return State.deserialize(directory, open(state_fn).read()) 167 | except Exception: 168 | log.exception('Failed to read state "%s". Ignoring (and soon over-writing) current state.'%state_fn) 169 | # TODO: Backup previous STATE_FN? 170 | return State(directory) 171 | def State_save(state): 172 | # TODO: RW Locks, maybe for runtime of whole program. 173 | content = state.serialize() 174 | content_prev = state.content_prev 175 | if content == content_prev: 176 | return 177 | fn = state.get_state_fn() 178 | open(fn, 'w').write(content) 179 | log.debug('saved state to %s' %repr(os.path.abspath(fn))) 180 | def Job_get_MetaJob(job, lang_exe=lang_bash_exe): 181 | return MetaJob(job, lang_exe=lang_exe) 182 | def MetaJob_wrap(mjob, state): 183 | """Write wrapper contents to mjob.wrapper. 184 | """ 185 | metajob_rundir = mjob.job.rundir 186 | wdir = metajob_rundir 187 | 188 | bash_template = """#!%(lang_exe)s 189 | cmd="%(cmd)s" 190 | rundir="%(rundir)s" 191 | finish() { 192 | echo "finish code: $?" 193 | } 194 | trap finish 0 195 | #printenv 196 | echo 197 | set -ex 198 | while [ ! -d "$rundir" ]; do sleep 1; done 199 | cd "$rundir" 200 | eval "$cmd" 201 | """ 202 | mji = MetaJobClass(mjob) 203 | wrapper_fn = os.path.join(wdir, mji.get_wrapper()) 204 | command = mjob.job.cmd 205 | 206 | wrapped = bash_template %dict( 207 | lang_exe=mjob.lang_exe, 208 | cmd=command, 209 | rundir=metajob_rundir, 210 | ) 211 | log.debug('Writing wrapper "%s"' %wrapper_fn) 212 | open(wrapper_fn, 'w').write(wrapped) 213 | st = os.stat(wrapper_fn) 214 | os.chmod(wrapper_fn, st.st_mode | 0o111) 215 | 216 | class JobThread(threading.Thread): 217 | def run(self): 218 | """Propagate environment, plus env_extra. 219 | """ 220 | try: 221 | self.notify_start(self.jobname) 222 | log.debug('hello! started Thread {}'.format(threading.current_thread())) 223 | myenv = dict(os.environ) 224 | myenv.update(self.env_extra) 225 | #log.debug('myenv:\n{}'.format(pprint.pformat(myenv))) 226 | log.info("Popen: '{}'".format(self.cmd)) 227 | if not self.cmd: 228 | msg = 'Why is self.cmd empty? {} {} {!r}'.format(self, self.jobname, self.cmd) 229 | raise Exception(msg) 230 | p = subprocess.Popen(self.cmd, env=myenv, shell=True) 231 | log.debug("pid: {}".format(p.pid)) 232 | p.wait() 233 | rc = p.returncode 234 | log.debug("rc: {}".format(rc)) 235 | self.notify_exit(self.jobname, rc) 236 | except: 237 | log.exception('Failed to submit {}: {!r} Setting rc=42.'.format(self.jobname, self.cmd)) 238 | self.notify_exit(self.jobname, 42) 239 | def __init__(self, jobname, cmd, notify_start, notify_exit, env_extra): 240 | super(JobThread, self).__init__() 241 | self.jobname = jobname 242 | self.cmd = cmd 243 | self.notify_start = notify_start 244 | self.notify_exit = notify_exit 245 | self.env_extra = env_extra 246 | 247 | class StringJobSubmitter(object): 248 | """Substitute some variables into self.submission_string. 249 | Use mains/job_start.sh as the top script. That requires 250 | PYPEFLOW_JOB_START_SCRIPT in the environment as the real 251 | script to run. This way, we are guaranteed that the top script exists, 252 | and we can wait for the rest to appear in the filesystem. 253 | """ 254 | def submit(self, jobname, mjob, state): 255 | """Prepare job (based on wrappers) and submit as a new thread. 
256 | """ 257 | state.set_job(jobname, mjob) 258 | jobname = mjob.job.jobid 259 | job_dict = mjob.job.options 260 | #nproc = mjob.job.options['NPROC'] 261 | #mb = mjob.job.options['MB'] 262 | mji = MetaJobClass(mjob) 263 | #script_fn = os.path.join(state.get_directory_wrappers(), mji.get_wrapper()) 264 | script_fn = mji.get_wrapper() 265 | exe = mjob.lang_exe 266 | 267 | state.notify_threaded(jobname) 268 | self.start(jobname, state, exe, script_fn, job_dict) # Can raise 269 | def get_cmd(self, job_name, script_fn, job_dict): 270 | """Vars: 271 | (The old ones.) JOB_ID, STDOUT_FILE, STDERR_FILE, NPROC, MB, CMD 272 | """ 273 | # We wrap in a program that waits for the executable to exist, so 274 | # the filesystem has time to catch up on the remote machine. 275 | # Hopefully, this will allow dependencies to become ready as well. 276 | job_start_fn = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'mains/job_start.sh') 277 | mapping = dict() 278 | stdout = script_fn + '.stdout' 279 | stderr = script_fn + '.stderr' 280 | run_dir = os.getcwd() 281 | mapping = dict( 282 | JOB_EXE='/bin/bash', 283 | JOB_NAME=job_name, JOB_ID=job_name, 284 | #JOB_OPTS=JOB_OPTS, 285 | #JOB_QUEUE=job_queue, 286 | JOB_SCRIPT=job_start_fn, CMD=job_start_fn, 287 | JOB_DIR=run_dir, DIR=run_dir, 288 | JOB_STDOUT=stdout, STDOUT_FILE=stdout, 289 | JOB_STDERR=stderr, STDERR_FILE=stderr, 290 | #MB=pypeflow_mb, 291 | #NPROC=pypeflow_nproc, 292 | ) 293 | mapping.update(job_dict) 294 | if 'JOB_OPTS' in mapping: 295 | # a special two-level mapping: ${JOB_OPTS} is substituted first 296 | mapping['JOB_OPTS'] = self.sub(mapping['JOB_OPTS'], mapping) 297 | return self.sub(self.submission_string, mapping) 298 | @staticmethod 299 | def sub(template, mapping): 300 | t = string.Template(template) 301 | try: 302 | return t.substitute(mapping) 303 | except KeyError: 304 | print(repr(mapping)) 305 | msg = 'Template substitution failed:\n template={!r}\n mapping={}'.format( 306 | template, pprint.pformat(mapping)) 307 | log.exception(msg) 308 | raise 309 | def start(self, jobname, state, exe, script_fn, job_dict): 310 | """Run job in thread. 311 | Thread will notify state. 312 | Can raise. 313 | """ 314 | #cmd = script_fn 315 | cmd = self.get_cmd(jobname, script_fn, job_dict) 316 | # job_start.sh relies on PYPEFLOW_* 317 | env_extra = { 318 | "PYPEFLOW_JOB_START_SCRIPT": script_fn, 319 | "PYPEFLOW_JOB_START_TIMEOUT": "60", 320 | } 321 | log.debug('env_extra={}'.format(pprint.pformat(env_extra))) 322 | notify_start = state.notify_started 323 | notify_exit = state.notify_exited 324 | th = JobThread(jobname, cmd, notify_start, notify_exit, env_extra) 325 | #th.setDaemon(True) 326 | th.start() 327 | def __repr__(self): 328 | return 'StringJobSubmitter(%s)' %repr(self.submission_string) 329 | def __init__(self, submission_string): 330 | self.submission_string = submission_string 331 | 332 | def link_rundir(state_rundir, user_rundir): 333 | if user_rundir: 334 | link_fn = os.path.join(user_rundir, 'pwatcher.dir') 335 | if os.path.lexists(link_fn): 336 | os.unlink(link_fn) 337 | os.symlink(os.path.abspath(state_rundir), link_fn) 338 | 339 | def cmd_run(state, jobids, job_type, job_dict): 340 | """ 341 | Wrap them and run them locally, each in the foreground of a thread. 
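Each value in 'jobids' is a dict describing one job; illustratively:
  {"cmd": "...", "rundir": "...", "job_local": 0, "job_dict": {...}}
('cmd' is required; 'rundir' defaults to the dirname of 'cmd'.)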
342 | """ 343 | jobs = dict() 344 | submitted = list() 345 | result = {'submitted': submitted} 346 | if job_type != 'string': 347 | log.debug("NOTE: In blocking pwatcher, job_type={!r}, should be 'string'".format(job_type)) 348 | for jobid, desc in jobids.items(): 349 | assert 'cmd' in desc 350 | cmd = desc['cmd'] 351 | if 'rundir' in desc: 352 | rundir = desc['rundir'] 353 | else: 354 | rundir = os.path.dirname(cmd) 355 | # These are all required now. 356 | #nproc = desc['job_nproc'] 357 | #mb = desc['job_mb'] 358 | local = int(desc['job_local']) 359 | options = copy.deepcopy(desc['job_dict']) #dict(NPROC=nproc, MB=mb, local=local) 360 | options['local'] = local 361 | jobs[jobid] = Job(jobid, cmd, rundir, options) 362 | log.debug('jobs:\n%s' %pprint.pformat(jobs)) 363 | submission_string = job_dict['submit'] 364 | basic_submitter = StringJobSubmitter(submission_string) 365 | local_submitter = StringJobSubmitter(LOCAL_SUBMISSION_STRING) 366 | log.debug('Basic submitter: {!r}'.format(basic_submitter)) 367 | for jobid, job in jobs.items(): 368 | #desc = jobids[jobid] 369 | log.debug(' starting job %s' %pprint.pformat(job)) 370 | mjob = Job_get_MetaJob(job) 371 | MetaJob_wrap(mjob, state) 372 | try: 373 | #link_rundir(state.get_directory_job(jobid), desc.get('rundir')) 374 | if job.options['local']: 375 | submitter = local_submitter 376 | else: 377 | submitter = basic_submitter 378 | if not submission_string: 379 | raise Exception('No "submit" key in job_dict:{!r}.'.format(job_dict)) 380 | submitter.submit(jobid, mjob, state) 381 | submitted.append(jobid) 382 | except Exception: 383 | raise 384 | log.exception('Failed to submit background-job:\n{!r}'.format( 385 | submitter)) 386 | return result 387 | # The caller is responsible for deciding what to do about job-submission failures. Re-try, maybe? 388 | 389 | def system(call, checked=False): 390 | log.info('!{}'.format(call)) 391 | rc = os.system(call) 392 | if checked and rc: 393 | raise Exception('{} <- {!r}'.format(rc, call)) 394 | return rc 395 | 396 | _warned = dict() 397 | def warnonce(hashkey, msg): 398 | if hashkey in _warned: 399 | return 400 | log.warning(msg) 401 | _warned[hashkey] = True 402 | 403 | def cmd_query(state, which, jobids): 404 | """Return the state of named jobids. 405 | If which=='list', then query jobs listed as jobids. 406 | If which=='known', then query all known jobs. 407 | If which=='infer', same as 'known' now. 408 | """ 409 | result = dict() 410 | jobstats = dict() 411 | result['jobids'] = jobstats 412 | if which == 'list': 413 | for jobid in jobids: 414 | jobstats[jobid] = 'UNKNOWN' 415 | state.update_jobid2status(jobstats) 416 | jobids = set(jobids) 417 | if which == 'list': 418 | for jobid in list(jobstats.keys()): 419 | # TODO: This might remove thousands. We should pass jobids along to update_jobid2status(). 420 | if jobid not in jobids: 421 | del jobstats[jobid] 422 | return result 423 | def cmd_delete(state, which, jobids): 424 | """Kill designated jobs, including (hopefully) their 425 | entire process groups. 426 | If which=='list', then kill all jobs listed as jobids. 427 | If which=='known', then kill all known jobs. 428 | If which=='infer', then kill all jobs with heartbeats. 429 | """ 430 | log.error('Noop. We cannot kill blocked threads. Hopefully, everything will die on SIGTERM.') 431 | def makedirs(path): 432 | if not os.path.isdir(path): 433 | os.makedirs(path) 434 | def readjson(ifs): 435 | """Del keys that start with ~. 436 | That lets us have trailing commas on all other lines. 
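For example, {"which": "list", "jobids": ["b"], "~end": {}} parses to
{"which": "list", "jobids": ["b"]}.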
437 | """ 438 | content = ifs.read() 439 | log.debug('content:%s' %repr(content)) 440 | jsonval = json.loads(content) 441 | #pprint.pprint(jsonval) 442 | def striptildes(subd): 443 | if not isinstance(subd, dict): 444 | return 445 | for k,v in list(subd.items()): 446 | if k.startswith('~'): 447 | del subd[k] 448 | else: 449 | striptildes(v) 450 | striptildes(jsonval) 451 | #pprint.pprint(jsonval) 452 | return jsonval 453 | 454 | class ProcessWatcher(object): 455 | def run(self, jobids, job_type, job_defaults_dict): 456 | #import traceback; log.debug(''.join(traceback.format_stack())) 457 | log.debug('run(jobids={}, job_type={}, job_defaults_dict={})'.format( 458 | '<%s>'%len(jobids), job_type, job_defaults_dict)) 459 | return cmd_run(self.state, jobids, job_type, job_defaults_dict) 460 | def query(self, which='list', jobids=[]): 461 | log.debug('query(which={!r}, jobids={})'.format( 462 | which, '<%s>'%len(jobids))) 463 | return cmd_query(self.state, which, jobids) 464 | def delete(self, which='list', jobids=[]): 465 | log.debug('delete(which={!r}, jobids={})'.format( 466 | which, '<%s>'%len(jobids))) 467 | return cmd_delete(self.state, which, jobids) 468 | def __init__(self, state): 469 | # state must be thread-safe 470 | self.state = state 471 | 472 | def get_process_watcher(directory): 473 | state = get_state(directory) 474 | state = SafeState(state) # thread-safe proxy 475 | #log.debug('state =\n%s' %pprint.pformat(state.top)) 476 | return ProcessWatcher(state) 477 | #State_save(state) 478 | 479 | @contextlib.contextmanager 480 | def process_watcher(directory): 481 | """This will (someday) hold a lock, so that 482 | the State can be written safely at the end. 483 | """ 484 | state = get_state(directory) 485 | state = SafeState(state) # thread-safe proxy 486 | #log.debug('state =\n%s' %pprint.pformat(state.top)) 487 | yield ProcessWatcher(state) 488 | #State_save(state) 489 | 490 | def main(prog, cmd, state_dir='mainpwatcher', argsfile=None): 491 | logging.basicConfig() 492 | logging.getLogger().setLevel(logging.NOTSET) 493 | log.warning('logging basically configured') 494 | log.debug('debug mode on') 495 | assert cmd in ['run', 'query', 'delete'] 496 | ifs = sys.stdin if not argsfile else open(argsfile) 497 | argsdict = readjson(ifs) 498 | log.info('argsdict =\n%s' %pprint.pformat(argsdict)) 499 | with process_watcher(state_dir) as watcher: 500 | result = getattr(watcher, cmd)(**argsdict) 501 | if result is not None: 502 | log.info('getattr({!r}, {!r}): {}'.format( 503 | watcher, cmd, pprint.pformat(result))) 504 | log.info('Waiting for running jobs...r') 505 | while watcher.state.get_running_jobids(): 506 | log.info('running: {!s}'.format(watcher.state.get_running_jobids())) 507 | time.sleep(1) 508 | 509 | if __name__ == "__main__": 510 | #import pdb 511 | #pdb.set_trace() 512 | main(*sys.argv) # pylint: disable=no-value-for-parameter 513 | -------------------------------------------------------------------------------- /src/tests/test_pypeflow_controller.py: -------------------------------------------------------------------------------- 1 | from nose import SkipTest 2 | from nose.tools import assert_equal 3 | import pypeflow.task 4 | import pypeflow.data 5 | import pypeflow.controller 6 | 7 | class TestPypeNode: 8 | def test___init__(self): 9 | # pype_node = PypeNode(obj) 10 | raise SkipTest # TODO: implement your test here 11 | 12 | def test_addAnInNode(self): 13 | # pype_node = PypeNode(obj) 14 | # assert_equal(expected, pype_node.addAnInNode(obj)) 15 | raise SkipTest # TODO: 
implement your test here 16 | 17 | def test_addAnOutNode(self): 18 | # pype_node = PypeNode(obj) 19 | # assert_equal(expected, pype_node.addAnOutNode(obj)) 20 | raise SkipTest # TODO: implement your test here 21 | 22 | def test_depth(self): 23 | # pype_node = PypeNode(obj) 24 | # assert_equal(expected, pype_node.depth()) 25 | raise SkipTest # TODO: implement your test here 26 | 27 | def test_inDegree(self): 28 | # pype_node = PypeNode(obj) 29 | # assert_equal(expected, pype_node.inDegree()) 30 | raise SkipTest # TODO: implement your test here 31 | 32 | def test_outDegree(self): 33 | # pype_node = PypeNode(obj) 34 | # assert_equal(expected, pype_node.outDegree()) 35 | raise SkipTest # TODO: implement your test here 36 | 37 | def test_removeAnInNode(self): 38 | # pype_node = PypeNode(obj) 39 | # assert_equal(expected, pype_node.removeAnInNode(obj)) 40 | raise SkipTest # TODO: implement your test here 41 | 42 | def test_removeAnOutNode(self): 43 | # pype_node = PypeNode(obj) 44 | # assert_equal(expected, pype_node.removeAnOutNode(obj)) 45 | raise SkipTest # TODO: implement your test here 46 | 47 | class TestPypeGraph: 48 | def test___getitem__(self): 49 | # pype_graph = PypeGraph(RDFGraph, subGraphNodes) 50 | # assert_equal(expected, pype_graph.__getitem__(url)) 51 | raise SkipTest # TODO: implement your test here 52 | 53 | def test___init__(self): 54 | # pype_graph = PypeGraph(RDFGraph, subGraphNodes) 55 | raise SkipTest # TODO: implement your test here 56 | 57 | def test_tSort(self): 58 | # pype_graph = PypeGraph(RDFGraph, subGraphNodes) 59 | # assert_equal(expected, pype_graph.tSort()) 60 | raise SkipTest # TODO: implement your test here 61 | 62 | class TestPypeWorkflow: 63 | def test___init__(self): 64 | # pype_workflow = PypeWorkflow(URL, **attributes) 65 | raise SkipTest # TODO: implement your test here 66 | 67 | def test_addObject(self): 68 | # pype_workflow = PypeWorkflow(URL, **attributes) 69 | # assert_equal(expected, pype_workflow.addObject(obj)) 70 | raise SkipTest # TODO: implement your test here 71 | 72 | def test_addObjects(self): 73 | # pype_workflow = PypeWorkflow(URL, **attributes) 74 | # assert_equal(expected, pype_workflow.addObjects(objs)) 75 | raise SkipTest # TODO: implement your test here 76 | 77 | def test_addTask(self): 78 | # pype_workflow = PypeWorkflow(URL, **attributes) 79 | # assert_equal(expected, pype_workflow.addTask(taskObj)) 80 | raise SkipTest # TODO: implement your test here 81 | 82 | def test_addTasks(self): 83 | # pype_workflow = PypeWorkflow(URL, **attributes) 84 | # assert_equal(expected, pype_workflow.addTasks(taskObjs)) 85 | raise SkipTest # TODO: implement your test here 86 | 87 | def test_dataObjects(self): 88 | # pype_workflow = PypeWorkflow(URL, **attributes) 89 | # assert_equal(expected, pype_workflow.dataObjects()) 90 | raise SkipTest # TODO: implement your test here 91 | 92 | def test_graphvizDot(self): 93 | # pype_workflow = PypeWorkflow(URL, **attributes) 94 | # assert_equal(expected, pype_workflow.graphvizDot()) 95 | raise SkipTest # TODO: implement your test here 96 | 97 | def test_graphvizShortNameDot(self): 98 | # pype_workflow = PypeWorkflow(URL, **attributes) 99 | # assert_equal(expected, pype_workflow.graphvizShortNameDot()) 100 | raise SkipTest # TODO: implement your test here 101 | 102 | def test_makeFileStr(self): 103 | # pype_workflow = PypeWorkflow(URL, **attributes) 104 | # assert_equal(expected, pype_workflow.makeFileStr()) 105 | raise SkipTest # TODO: implement your test here 106 | 107 | def test_refreshTargets(self): 108 | # 
pype_workflow = PypeWorkflow(URL, **attributes) 109 | # assert_equal(expected, pype_workflow.refreshTargets(objs, callback)) 110 | raise SkipTest # TODO: implement your test here 111 | 112 | def test_removeObjects(self): 113 | # pype_workflow = PypeWorkflow(URL, **attributes) 114 | # assert_equal(expected, pype_workflow.removeObjects(objs)) 115 | raise SkipTest # TODO: implement your test here 116 | 117 | def test_removeTask(self): 118 | # pype_workflow = PypeWorkflow(URL, **attributes) 119 | # assert_equal(expected, pype_workflow.removeTask(taskObj)) 120 | raise SkipTest # TODO: implement your test here 121 | 122 | def test_removeTasks(self): 123 | # pype_workflow = PypeWorkflow(URL, **attributes) 124 | # assert_equal(expected, pype_workflow.removeTasks(taskObjs)) 125 | raise SkipTest # TODO: implement your test here 126 | 127 | def test_setReferenceRDFGraph(self): 128 | # pype_workflow = PypeWorkflow(URL, **attributes) 129 | # assert_equal(expected, pype_workflow.setReferenceRDFGraph(fn)) 130 | raise SkipTest # TODO: implement your test here 131 | 132 | def test_tasks(self): 133 | # pype_workflow = PypeWorkflow(URL, **attributes) 134 | # assert_equal(expected, pype_workflow.tasks()) 135 | raise SkipTest # TODO: implement your test here 136 | 137 | def test_scatterTask(self): 138 | 139 | import os 140 | os.system("rm -rf /tmp/pypetest/*") 141 | nChunk = 3 142 | 143 | infileObj0 =\ 144 | pypeflow.data.PypeLocalFile( 145 | "file://localhost/tmp/pypetest/test_in_0.txt") 146 | with open(infileObj0.localFileName,"w") as f: 147 | f.write("prefix4:") 148 | 149 | infileObj4 =\ 150 | pypeflow.data.PypeSplittableLocalFile( 151 | "splittablefile://localhost/tmp/pypetest/test_in_4.txt", 152 | nChunk = nChunk) 153 | 154 | with open(infileObj4.localFileName, "w") as f: 155 | for i in range(nChunk): 156 | f.write("file%02d\n" % i) 157 | 158 | def scatter(*argv, **kwargv): 159 | outputObjs = sorted( kwargv["outputDataObjs"].items() ) 160 | nOut = len(outputObjs) 161 | outputObjs = [ (o[0], o[1], open(o[1].localFileName, "w")) for o in outputObjs] 162 | with open(kwargv["inputDataObjs"]["completeFile"].localFileName,"r") as f: 163 | i = 0 164 | for l in f: 165 | outf = outputObjs[i % nOut][2] 166 | outf.write(l) 167 | i += 1 168 | for o in outputObjs: 169 | o[2].close() 170 | 171 | PypeShellTask = pypeflow.task.PypeShellTask 172 | PypeTask = pypeflow.task.PypeTask 173 | PypeTaskBase = pypeflow.task.PypeTaskBase 174 | infileObj4.setScatterTask(PypeTask, PypeTaskBase, scatter) 175 | 176 | def gather(*argv, **kwargv): 177 | inputObjs = sorted( kwargv["inputDataObjs"].items() ) 178 | with open(kwargv["outputDataObjs"]["completeFile"].localFileName,"w") as outf: 179 | for k, subfile in inputObjs: 180 | f = open(subfile.localFileName) 181 | outf.write(f.read()) 182 | f.close() 183 | 184 | outfileObj4 =\ 185 | pypeflow.data.PypeSplittableLocalFile( 186 | "splittablefile://localhost/tmp/pypetest/test_out_4.txt", 187 | nChunk = nChunk) 188 | 189 | outfileObj4.setGatherTask(PypeTask, PypeTaskBase, gather) 190 | 191 | PypeScatteredTasks = pypeflow.task.PypeScatteredTasks 192 | 193 | @PypeScatteredTasks( inputDataObjs = {"inf":infileObj4, "prefix":infileObj0}, 194 | outputDataObjs = {"outf":outfileObj4}, 195 | URL="tasks://test_fun_4") 196 | def test_fun_4(*argv, **kwargv): 197 | chunk_id = kwargv["chunk_id"] 198 | self = test_fun_4[chunk_id] 199 | 200 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_4.txt" % chunk_id 201 | with open( self.prefix.localFileName, "r") as f: 202 | prefix = f.read() 203 | 204 | 
with open( self.outf._path, "w") as f: 205 | in_f = open(self.inf.localFileName,"r") 206 | f.write(prefix + in_f.read()) 207 | in_f.close() 208 | return self.inf._path 209 | 210 | outfileObj5 =\ 211 | pypeflow.data.PypeSplittableLocalFile( 212 | "splittablefile://localhost/tmp/pypetest/test_out_5.txt", 213 | nChunk = nChunk) 214 | outfileObj5.setGatherTask(PypeTask, PypeTaskBase, gather) 215 | 216 | @PypeScatteredTasks( inputDataObjs = {"inf":infileObj4, "prefix":infileObj0}, 217 | outputDataObjs = {"outf":outfileObj5}, 218 | URL="tasks://test_fun_5") 219 | def test_fun_5(*argv, **kwargv): 220 | chunk_id = kwargv["chunk_id"] 221 | self = test_fun_5[chunk_id] 222 | 223 | assert self.inf._path == "/tmp/pypetest/%03d_test_in_4.txt" % chunk_id 224 | with open( self.prefix.localFileName, "r") as f: 225 | prefix = f.read() 226 | 227 | with open( self.outf._path, "w") as f: 228 | in_f = open(self.inf.localFileName,"r") 229 | f.write(prefix +"2:"+ in_f.read()) 230 | in_f.close() 231 | return self.inf._path 232 | assert len(test_fun_4.getTasks()) == nChunk 233 | 234 | wf = pypeflow.controller.PypeWorkflow() 235 | wf.addTasks( [test_fun_4, test_fun_5] ) 236 | print(wf.graphvizDot) 237 | wf.refreshTargets( [outfileObj4, outfileObj5] ) 238 | 239 | class TestPypeThreadWorkflow: 240 | def test___init__(self): 241 | # pype_thread_workflow = PypeThreadWorkflow(URL, **attributes) 242 | raise SkipTest # TODO: implement your test here 243 | 244 | def test_addTasks(self): 245 | # pype_thread_workflow = PypeThreadWorkflow(URL, **attributes) 246 | # assert_equal(expected, pype_thread_workflow.addTasks(taskObjs)) 247 | raise SkipTest # TODO: implement your test here 248 | 249 | def test_refreshTargets(self): 250 | # pype_thread_workflow = PypeThreadWorkflow(URL, **attributes) 251 | # assert_equal(expected, pype_thread_workflow.refreshTargets(objs, callback, updateFreq, exitOnFailure)) 252 | raise SkipTest # TODO: implement your test here 253 | 254 | def test_setNumThreadAllowed(self): 255 | # pype_thread_workflow = PypeThreadWorkflow(URL, **attributes) 256 | # assert_equal(expected, pype_thread_workflow.setNumThreadAllowed(nT, nS)) 257 | raise SkipTest # TODO: implement your test here 258 | 259 | def test_mutableDataObjects(self): 260 | 261 | infileObj =\ 262 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_in.txt") 263 | 264 | outfileObj =\ 265 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out.txt") 266 | 267 | out1 =\ 268 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out1.txt") 269 | 270 | out2 =\ 271 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out2.txt") 272 | 273 | out3 =\ 274 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out3.txt") 275 | 276 | import os 277 | os.system("rm -rf /tmp/pypetest/*") 278 | 279 | with open(infileObj.localFileName,"w") as f: 280 | f.write("test") 281 | 282 | PypeThreadWorkflow = pypeflow.controller.PypeThreadWorkflow 283 | PypeThreadTaskBase = pypeflow.controller.PypeThreadTaskBase 284 | PypeTask = pypeflow.task.PypeTask 285 | wf = PypeThreadWorkflow() 286 | 287 | @PypeTask(mutableDataObjs={"out":outfileObj}, 288 | outputDataObjs={"out1":out1}, 289 | inputDataObjs={"in":infileObj}, 290 | TaskType=PypeThreadTaskBase) 291 | def task1(task): 292 | with open(task.out.localFileName, "a") as f: 293 | print("written by task1", file=f) 294 | with open(task.out1.localFileName, "w") as f: 295 | 
print("written by task1", file=f) 296 | 297 | @PypeTask(mutableDataObjs={"out":outfileObj}, 298 | outputDataObjs={"out2":out2}, 299 | inputDataObjs={"in":infileObj}, 300 | TaskType=PypeThreadTaskBase) 301 | def task2(task): 302 | with open(task.out.localFileName, "a") as f: 303 | print("written by task2", file=f) 304 | with open(task.out2.localFileName, "w") as f: 305 | print("written by task2", file=f) 306 | 307 | @PypeTask(mutableDataObjs={"out":outfileObj}, 308 | outputDataObjs={"out3":out3}, 309 | inputDataObjs={"in":infileObj}, 310 | TaskType=PypeThreadTaskBase) 311 | def task3(task): 312 | with open(task.out.localFileName, "a") as f: 313 | print("written by task3", file=f) 314 | with open(task.out3.localFileName, "w") as f: 315 | print("written by task3", file=f) 316 | 317 | wf = PypeThreadWorkflow() 318 | wf.addTasks([task1, task2, task3]) 319 | 320 | wf.refreshTargets() 321 | 322 | outputSet = set() 323 | outputSet.add("written by task1") 324 | outputSet.add("written by task2") 325 | outputSet.add("written by task3") 326 | 327 | with open(outfileObj.localFileName) as f: 328 | i = 0 329 | for l in f: 330 | l = l.strip() 331 | assert l in outputSet 332 | i += 1 333 | assert_equal(i, 3) 334 | 335 | def test_stateDataObjects(self): 336 | 337 | infileObj =\ 338 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_in.txt") 339 | 340 | outfileObj =\ 341 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out.txt") 342 | 343 | out1 =\ 344 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out1.txt") 345 | 346 | out2 =\ 347 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out2.txt") 348 | 349 | out3 =\ 350 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out3.txt") 351 | 352 | s1 =\ 353 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state1") 354 | 355 | s2 =\ 356 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state2") 357 | 358 | s3 =\ 359 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state3") 360 | 361 | import os 362 | import time 363 | os.system("rm -rf /tmp/pypetest/*") 364 | time.sleep(2) 365 | 366 | with open(infileObj.localFileName,"w") as f: 367 | f.write("test") 368 | 369 | PypeThreadWorkflow = pypeflow.controller.PypeThreadWorkflow 370 | PypeThreadTaskBase = pypeflow.controller.PypeThreadTaskBase 371 | PypeTask = pypeflow.task.PypeTask 372 | wf = PypeThreadWorkflow() 373 | 374 | @PypeTask(mutableDataObjs = {"out":outfileObj}, 375 | outputDataObjs = {"out1":out1, "s1":s1}, 376 | inputDataObjs = {"in":infileObj}, 377 | TaskType=PypeThreadTaskBase) 378 | def task1(task): 379 | with open(task.out.localFileName, "a") as f: 380 | print("written by task1", file=f) 381 | with open(task.s1.localFileName, "w") as f: 382 | print("state set", file=f) 383 | with open(task.out1.localFileName, "w") as f: 384 | print("written by task1", file=f) 385 | 386 | @PypeTask(mutableDataObjs = {"out":outfileObj}, 387 | outputDataObjs = {"out2":out2, "s2":s2}, 388 | inputDataObjs = {"in":infileObj, "s1":s1}, 389 | TaskType=PypeThreadTaskBase) 390 | def task2(task): 391 | with open(task.out.localFileName, "a") as f: 392 | print("written by task2", file=f) 393 | with open(task.s2.localFileName, "w") as f: 394 | print("state set", file=f) 395 | with open(task.out2.localFileName, "w") as f: 396 | print("written by task2", file=f) 397 | 398 | @PypeTask(mutableDataObjs = {"out":outfileObj}, 399 | 
outputDataObjs = {"out3":out3, "s3":s3}, 400 | inputDataObjs = {"in":infileObj, "s2":s2}, 401 | TaskType=PypeThreadTaskBase) 402 | def task3(task): 403 | with open(task.out.localFileName, "a") as f: 404 | print("written by task3", file=f) 405 | with open(task.s3.localFileName, "w") as f: 406 | print("state set", file=f) 407 | with open(task.out3.localFileName, "w") as f: 408 | print("written by task3", file=f) 409 | 410 | wf = PypeThreadWorkflow() 411 | wf.addTasks([task1, task2, task3]) 412 | 413 | wf.refreshTargets() 414 | 415 | with open(outfileObj.localFileName) as f: 416 | i = 0 417 | for l in f: 418 | i += 1 419 | l = l.strip() 420 | assert l == "written by task%d" % i 421 | assert i == 3 422 | 423 | def test_stateDataObjects2(self): 424 | 425 | infileObj =\ 426 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_in.txt") 427 | 428 | outfileObj =\ 429 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out.txt") 430 | 431 | out1 =\ 432 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out1.txt") 433 | 434 | out2 =\ 435 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out2.txt") 436 | 437 | out3 =\ 438 | pypeflow.data.PypeLocalFile("file://localhost/tmp/pypetest/test_for_shared_output_out3.txt") 439 | 440 | s1 =\ 441 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state1") 442 | 443 | s2 =\ 444 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state2") 445 | 446 | s3 =\ 447 | pypeflow.data.PypeLocalFile("state://localhost/tmp/pypetest/.state3") 448 | 449 | import os 450 | import time 451 | os.system("rm -rf /tmp/pypetest/*") 452 | time.sleep(2) 453 | 454 | with open(infileObj.localFileName,"w") as f: 455 | f.write("test") 456 | 457 | PypeThreadWorkflow = pypeflow.controller.PypeThreadWorkflow 458 | PypeThreadTaskBase = pypeflow.controller.PypeThreadTaskBase 459 | PypeTask = pypeflow.task.PypeTask 460 | wf = PypeThreadWorkflow() 461 | 462 | @PypeTask( outputDataObjs = {"out1":out1, "s1":s1}, 463 | inputDataObjs = {"in":infileObj}, 464 | TaskType=PypeThreadTaskBase) 465 | def task1(task): 466 | with open(task.s1.localFileName, "w") as f: 467 | print("state set", file=f) 468 | with open(task.out1.localFileName, "w") as f: 469 | print("written by task1", file=f) 470 | 471 | @PypeTask(outputDataObjs = {"out2":out2, "s2":s2}, 472 | inputDataObjs = {"in":infileObj, "s1":s1}, 473 | TaskType=PypeThreadTaskBase) 474 | def task2(task): 475 | with open(task.s2.localFileName, "w") as f: 476 | print("state set", file=f) 477 | with open(task.out2.localFileName, "w") as f: 478 | print("written by task2", file=f) 479 | 480 | @PypeTask(outputDataObjs = {"out3":out3, "s3":s3}, 481 | inputDataObjs = {"in":infileObj, "s2":s2}, 482 | TaskType=PypeThreadTaskBase) 483 | def task3(task): 484 | with open(task.s3.localFileName, "w") as f: 485 | print("state set", file=f) 486 | with open(task.out3.localFileName, "w") as f: 487 | print("written by task3", file=f) 488 | 489 | wf = PypeThreadWorkflow() 490 | wf.addTasks([task1, task2, task3]) 491 | 492 | wf.refreshTargets([s3]) 493 | 494 | for i in range(1,4): 495 | with open("/tmp/pypetest/test_for_shared_output_out%d.txt" % i) as f: 496 | l = f.read().strip() 497 | assert l == "written by task%d" % i 498 | --------------------------------------------------------------------------------
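Note on the submission-string templating in pwatcher/blocking.py above: StringJobSubmitter.get_cmd() performs a two-level substitution in which ${JOB_OPTS} is itself expanded against the same mapping before the full submission string is filled in. A minimal sketch of just that mechanism, using plain string.Template; the submission string, paths, and values below are invented for illustration and are not taken from the repository.

import string

def sub(template, mapping):
    # Same idea as StringJobSubmitter.sub(): simple ${NAME} substitution.
    return string.Template(template).substitute(mapping)

# Invented submission string; real ones come from the 'submit' key of the job defaults.
submission_string = 'qsub -N ${JOB_NAME} ${JOB_OPTS} -o ${STDOUT_FILE} -e ${STDERR_FILE} ${CMD}'
mapping = dict(
    JOB_NAME='job-a',
    CMD='/tmp/pypetest/job_start.sh',   # hypothetical wrapper path
    STDOUT_FILE='task.sh.stdout',
    STDERR_FILE='task.sh.stderr',
    NPROC='4',
    JOB_OPTS='-pe smp ${NPROC}',        # itself a template
)
# ${JOB_OPTS} is expanded first, so placeholders inside it are resolved too.
mapping['JOB_OPTS'] = sub(mapping['JOB_OPTS'], mapping)
print(sub(submission_string, mapping))
# qsub -N job-a -pe smp 4 -o task.sh.stdout -e task.sh.stderr /tmp/pypetest/job_start.sh

Expanding JOB_OPTS first lets a site-specific option block refer to per-job values such as NPROC without the outer submission template needing to know about them.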
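Note on readjson() above: keys beginning with '~' are deleted recursively after parsing, so a throw-away '~' entry placed last lets every real entry keep a trailing comma in hand-edited JSON argument files. A small self-contained sketch of that behavior; the JSON text here is invented.

import json
import pprint

text = '''
{
    "which": "known",
    "jobids": [],
    "~end": {}
}
'''

def striptildes(subd):
    # Recursively drop any dict key that starts with '~' (as readjson() does).
    if not isinstance(subd, dict):
        return
    for k, v in list(subd.items()):
        if k.startswith('~'):
            del subd[k]
        else:
            striptildes(v)

val = json.loads(text)
striptildes(val)
pprint.pprint(val)  # {'jobids': [], 'which': 'known'}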
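Note on main() above: it only parses a JSON argument dict and forwards it to the matching ProcessWatcher method, so the blocking watcher can also be driven directly from Python. A hedged sketch, assuming pwatcher.blocking is importable and using an invented script path and submit template; per cmd_run(), each job description needs at least 'cmd', 'job_local', and 'job_dict', and the defaults dict must carry a 'submit' template.

from pwatcher.blocking import process_watcher

# Invented job description; the keys follow what cmd_run() expects.
jobids = {
    'job-a': {
        'cmd': '/tmp/pypetest/run-job-a.sh',   # hypothetical script
        'rundir': '/tmp/pypetest',
        'job_local': 0,                        # non-local -> submit via the 'submit' template
        'job_dict': {},
    },
}
job_defaults_dict = {
    'submit': 'bash ${CMD} > ${STDOUT_FILE} 2> ${STDERR_FILE}',  # invented template
}

with process_watcher('mainpwatcher') as watcher:
    result = watcher.run(jobids, 'string', job_defaults_dict)
    # On success, result should look like {'submitted': ['job-a']}.
    print(result)
    print(watcher.query(which='list', jobids=['job-a']))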