├── src ├── test │ ├── resources │ │ ├── input-one.txt │ │ ├── input-two.txt │ │ ├── output-one.txt │ │ ├── test-folder │ │ │ ├── file1.txt │ │ │ └── file2.txt │ │ ├── task-two.yaml │ │ ├── workflowArgs.csv │ │ ├── parallel-graph.yaml │ │ ├── folder-copy.yaml │ │ ├── linear-graph.yaml │ │ ├── gather-graph.yaml │ │ ├── param-sub.yaml │ │ ├── reordered-graph.yaml │ │ ├── task-one.yaml │ │ ├── branching-graph.yaml │ │ ├── workflow.yaml │ │ ├── cwl-graph.yaml │ │ └── complex-graph.yaml │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── genomics │ │ └── dockerflow │ │ ├── args │ │ └── ArgsTableBuilderTest.java │ │ ├── examples │ │ ├── ExampleGraphsITCase.java │ │ ├── ComplexGraph.java │ │ ├── LinearGraph.java │ │ ├── MultiLinearGraph.java │ │ └── ExampleGraphsTest.java │ │ ├── DockerflowITCase.java │ │ ├── TestUtils.java │ │ ├── dataflow │ │ └── DataflowFactoryTest.java │ │ └── DockerflowTest.java └── main │ └── java │ └── com │ └── google │ └── cloud │ └── genomics │ └── dockerflow │ ├── workflow │ ├── GraphItem.java │ ├── WorkflowDefn.java │ └── Workflow.java │ ├── runner │ ├── TaskException.java │ ├── Operation.java │ └── TaskRunner.java │ ├── transform │ ├── BreakFusion.java │ ├── MergeBranches.java │ ├── WaitForOperation.java │ └── DeleteIntermediateFiles.java │ ├── dataflow │ ├── DataflowBuilder.java │ └── DataflowFactory.java │ ├── DockerflowConstants.java │ ├── args │ ├── WorkflowArgs.java │ ├── ArgsBuilder.java │ └── ArgsTableBuilder.java │ ├── util │ ├── HttpUtils.java │ ├── StringUtils.java │ └── FileUtils.java │ └── Dockerflow.java ├── examples ├── hello │ ├── hello-args.yaml │ ├── hello-workflow.yaml │ ├── HelloWorkflow.java │ └── README.md └── gatk │ ├── README.md │ └── gatk-args.yaml ├── bin └── dockerflow ├── .gitignore ├── CONTRIBUTING.md ├── pom.xml ├── README.md └── LICENSE /src/test/resources/input-one.txt: -------------------------------------------------------------------------------- 1 | cat 2 | -------------------------------------------------------------------------------- /src/test/resources/input-two.txt: -------------------------------------------------------------------------------- 1 | dog 2 | -------------------------------------------------------------------------------- /src/test/resources/output-one.txt: -------------------------------------------------------------------------------- 1 | cat 2 | hello 3 | -------------------------------------------------------------------------------- /src/test/resources/test-folder/file1.txt: -------------------------------------------------------------------------------- 1 | cat 2 | hello -------------------------------------------------------------------------------- /src/test/resources/test-folder/file2.txt: -------------------------------------------------------------------------------- 1 | dog 2 | hello -------------------------------------------------------------------------------- /examples/hello/hello-args.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | Hello.message: "Hello, World!" -------------------------------------------------------------------------------- /bin/dockerflow: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | : "${DOCKERFLOW_HOME:?Error: DOCKERFLOW_HOME not set. 
It should be the dockerflow installation folder}" 3 | 4 | java -jar ${DOCKERFLOW_HOME}/target/dockerflow*dependencies.jar "$@" 5 | -------------------------------------------------------------------------------- /examples/hello/hello-workflow.yaml: -------------------------------------------------------------------------------- 1 | defn: 2 | name: HelloWorkflow 3 | steps: 4 | - defn: 5 | name: Hello 6 | inputParameters: 7 | - name: message 8 | docker: 9 | imageName: ubuntu 10 | cmd: echo $message 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | target 3 | 4 | # Mobile Tools for Java (J2ME) 5 | .mtj.tmp/ 6 | 7 | # Package Files # 8 | *.jar 9 | *.war 10 | *.ear 11 | 12 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 13 | hs_err_pid* 14 | 15 | # Eclipse 16 | .DS_Store 17 | -------------------------------------------------------------------------------- /src/test/resources/task-two.yaml: -------------------------------------------------------------------------------- 1 | name: TaskTwo 2 | inputParameters: 3 | - name: inputFile 4 | type: file 5 | - name: message 6 | defaultValue: goodbye 7 | outputParameters: 8 | - name: outputFile 9 | type: file 10 | docker: 11 | imageName: ubuntu 12 | cmd: | 13 | cp ${inputFile} ${outputFile} 14 | echo ${message} >> ${outputFile} -------------------------------------------------------------------------------- /src/test/resources/workflowArgs.csv: -------------------------------------------------------------------------------- 1 | "inputs=TaskOne.inputFile","outputs=TaskOne.outputFile","inputs=TaskTwo.inputFile","outputs=TaskTwo.outputFile","inputs=TaskOne.message","inputs=TaskTwo.message" 2 | "../../input-one.txt","output-one.txt","../TaskOne/output-one.txt","output-two.txt","hello","goodbye" 3 | "../../input-two.txt","output-one.txt","../TaskOne/output-one.txt","output-two.txt","goodbye","hello" 4 | -------------------------------------------------------------------------------- /src/test/resources/parallel-graph.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: ParallelGraph 4 | description: A simple file-parallel graph 5 | args: 6 | inputs: 7 | BASE_DIR: REQUIRED 8 | stepOne.inputFile: | 9 | ${BASE_DIR}/input-one.txt 10 | ${BASE_DIR}/input-two.txt 11 | outputs: 12 | stepOne.outputFile: output-one.txt 13 | steps: 14 | - defn: 15 | name: stepOne 16 | defnFile: task-one.yaml 17 | scatterBy: inputFile 18 | -------------------------------------------------------------------------------- /src/test/resources/folder-copy.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: FolderCopy 4 | description: Test copying input and output folders 5 | steps: 6 | - defn: 7 | name: stepOne 8 | inputParameters: 9 | - name: inputFolder 10 | type: folder 11 | outputParameters: 12 | - name: outputFolder 13 | type: folder 14 | resources: 15 | docker: 16 | imageName: ubuntu:16.04 17 | cmd: cp -rf ${inputFolder}/* ${outputFolder} -------------------------------------------------------------------------------- /src/test/resources/linear-graph.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: LinearGraph 4 | description: A simple linear graph. 
5 | args: 6 | inputs: 7 | BASE_DIR: REQUIRED 8 | stepOne.inputFile: ${BASE_DIR}/input-one.txt 9 | stepTwo.inputFile: ${stepOne.outputFile} 10 | outputs: 11 | stepOne.outputFile: output-one.txt 12 | stepTwo.outputFile: output-two.txt 13 | steps: 14 | - defn: 15 | name: stepOne 16 | defnFile: task-one.yaml 17 | - defn: 18 | name: stepTwo 19 | defnFile: task-two.yaml 20 | -------------------------------------------------------------------------------- /src/test/resources/gather-graph.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: GatherGraph 4 | description: An example of doing a gather step after the first task 5 | args: 6 | inputs: 7 | BASE_DIR: REQUIRED 8 | stepOne.inputFile: ${BASE_DIR}/input-one.txt 9 | stepTwo.inputFile: ${stepOne.outputFile} 10 | outputs: 11 | stepOne.outputFile: output-one.txt 12 | stepTwo.outputFile: output-two.txt 13 | steps: 14 | - defn: 15 | name: stepOne 16 | defnFile: task-one.yaml 17 | gatherBy: inputFile 18 | - defn: 19 | name: stepTwo 20 | defnFile: task-two.yaml 21 | -------------------------------------------------------------------------------- /src/test/resources/param-sub.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: ParameterSubstitution 4 | description: Replace variables with command-line parameters. 5 | args: 6 | inputs: 7 | BASE_DIR: REQUIRED 8 | stepOne.inputFile: ${BASE_DIR}/input-one.txt 9 | stepTwo.inputFile: ${BASE_DIR}/output-one.txt 10 | outputs: 11 | stepOne.outputFile: ${BASE_DIR}/output-one.txt 12 | stepTwo.outputFile: ${BASE_DIR}/output-two.txt 13 | graph: 14 | - stepOne 15 | - stepTwo 16 | steps: 17 | - defn: 18 | name: stepOne 19 | defnFile: task-one.yaml 20 | - defn: 21 | name: stepTwo 22 | defnFile: task-two.yaml 23 | -------------------------------------------------------------------------------- /src/test/resources/reordered-graph.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: ReorderedGraph 4 | description: A simple linear graph, with task steps listed out of order. 
5 | args: 6 | inputs: 7 | BASE_DIR: REQUIRED 8 | stepOne.inputFile: ${BASE_DIR}/reordered/output-two.txt 9 | stepTwo.inputFile: ${BASE_DIR}/input-two.txt 10 | outputs: 11 | stepOne.outputFile: ${BASE_DIR}/reordered/output-one.txt 12 | stepTwo.outputFile: ${BASE_DIR}/reordered/output-two.txt 13 | graph: 14 | - stepTwo 15 | - stepOne 16 | steps: 17 | - defn: 18 | name: stepOne 19 | defnFile: task-one.yaml 20 | - defn: 21 | name: stepTwo 22 | defnFile: task-two.yaml 23 | -------------------------------------------------------------------------------- /src/test/resources/task-one.yaml: -------------------------------------------------------------------------------- 1 | name: TaskOne 2 | description: Copy a file and append a message to the end 3 | inputParameters: 4 | - name: inputFile 5 | type: file 6 | - name: message 7 | defaultValue: hello 8 | outputParameters: 9 | - name: outputFile 10 | type: file 11 | resources: 12 | minimumCpuCores: 1 13 | minimumRamGb: 1 14 | zones: 15 | - us-central1-a 16 | - us-central1-b 17 | - us-central1-c 18 | - us-central1-f 19 | - us-east1-b 20 | - us-east1-c 21 | - us-east1-d 22 | disks: 23 | - name: data 24 | type: PERSISTENT_HDD 25 | sizeGb: 100 26 | mountPoint: /mnt/data 27 | docker: 28 | imageName: ubuntu 29 | cmd: | 30 | cp ${inputFile} ${outputFile} 31 | echo ${message} >> ${outputFile} 32 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/workflow/GraphItem.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.workflow; 17 | 18 | /** A node or edge in a directed acyclic graph. */ 19 | public interface GraphItem {} 20 | -------------------------------------------------------------------------------- /src/test/resources/branching-graph.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: BranchingGraph 4 | description: A task that branches into two separate execution paths, then merges. 
5 | args: 6 | inputs: 7 | BASE_DIR: REQUIRED 8 | stepOne.inputFile: ${BASE_DIR}/input-one.txt 9 | stepTwo.inputFile: ${BASE_DIR}/input-two.txt 10 | stepThree.inputFile: ${BASE_DIR}/branching/output-one.txt 11 | outputs: 12 | stepOne.outputFile: ${BASE_DIR}/branching/output-one.txt 13 | stepTwo.outputFile: ${BASE_DIR}/branching/output-two.txt 14 | stepThree.outputFile: ${BASE_DIR}/branching/output-three.txt 15 | graph: 16 | - BRANCH: 17 | - stepOne 18 | - stepTwo 19 | - stepThree 20 | steps: 21 | - defn: 22 | name: stepOne 23 | defnFile: task-one.yaml 24 | - defn: 25 | name: stepTwo 26 | defnFile: task-one.yaml 27 | - defn: 28 | name: stepThree 29 | defnFile: task-two.yaml 30 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/runner/TaskException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.runner; 17 | 18 | /** 19 | * An exception occurred when running the Docker command. 20 | */ 21 | @SuppressWarnings("serial") 22 | public class TaskException extends RuntimeException { 23 | public TaskException(String msg) { 24 | super(msg); 25 | } 26 | 27 | public TaskException(String msg, Throwable cause) { 28 | super(msg, cause); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/test/resources/workflow.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: LinearGraph 4 | description: A simple linear graph. 
5 | args: 6 | inputs: 7 | BASE_DIR: REQUIRED 8 | steps: 9 | - defn: 10 | name: stepOne 11 | inputParameters: 12 | - name: inputFile 13 | defaultValue: ${BASE_DIR}/input-one.txt 14 | type: file 15 | - name: message 16 | defaultValue: hello 17 | outputParameters: 18 | - name: outputFile 19 | defaultValue: ${BASE_DIR}/linear/output-one.txt 20 | type: file 21 | docker: 22 | imageName: ubuntu 23 | cmd: "cp /mnt/data/in.txt /mnt/data/out.txt; echo ${message} >> /mnt/data/out.txt" 24 | - defn: 25 | name: stepTwo 26 | inputParameters: 27 | - name: inputFile 28 | defaultValue: ${BASE_DIR}/linear/output-one.txt 29 | type: file 30 | - name: message 31 | defaultValue: goodbye 32 | outputParameters: 33 | - name: outputFile 34 | defaultValue: ${BASE_DIR}/linear/output-two.txt 35 | type: file 36 | docker: 37 | imageName: ubuntu 38 | cmd: "cp /mnt/data/in.txt /mnt/data/out.txt; echo ${message} >> /mnt/data/out.txt" 39 | -------------------------------------------------------------------------------- /src/test/resources/cwl-graph.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: CWL_Test 4 | steps: 5 | - defn: 6 | name: SimpleTask 7 | inputParameters: 8 | - name: inputFile 9 | type: file 10 | - name: string 11 | type: string[] 12 | defaultValue: onetwothree 13 | inputBinding: 14 | prefix: "-k " 15 | itemSeparator: " -k " 16 | separate: true 17 | position: 3 18 | - name: bool 19 | type: boolean 20 | inputBinding: 21 | position: 2 22 | prefix: -b 23 | - name: message 24 | defaultValue: ${ 1+ ${size} } 25 | - name: array 26 | type: array 27 | defaultValue: one two 28 | inputBinding: 29 | prefix: "-a " 30 | itemSeparator: " -a " 31 | separate: true 32 | position: 1 33 | - name: file 34 | type: File 35 | inputBinding: 36 | position: 4 37 | prefix: "--file=" 38 | separate: false 39 | outputParameters: 40 | - name: outputFile 41 | type: file 42 | docker: 43 | imageName: ubuntu 44 | cmd: "echo" 45 | args: 46 | inputs: 47 | string: five six seven 48 | file: gs://b/d/test.txt 49 | size: 4 50 | -------------------------------------------------------------------------------- /examples/hello/HelloWorkflow.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import java.io.IOException; 18 | 19 | import com.google.cloud.genomics.dockerflow.task.Task; 20 | import com.google.cloud.genomics.dockerflow.task.TaskBuilder; 21 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 22 | import com.google.cloud.genomics.dockerflow.workflow.WorkflowDefn; 23 | 24 | /** 25 | * A Hello, World example in Java. 
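It defines the same workflow as hello-workflow.yaml: a single task named "Hello" that echoes the input message.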
26 | */ 27 | public class HelloWorkflow implements WorkflowDefn { 28 | 29 | @Override 30 | public Workflow createWorkflow(String[] args) throws IOException { 31 | Task hello = 32 | TaskBuilder.named("Hello").input("message").script("echo $message").build(); 33 | return TaskBuilder.named("HelloWorkflow").steps(hello).args(args).build(); 34 | } 35 | } -------------------------------------------------------------------------------- /examples/hello/README.md: -------------------------------------------------------------------------------- 1 | ### Disclaimer 2 | 3 | This is not an official Google product. 4 | 5 | ## Hello, World example 6 | 7 | This is a "hello, world" example of Dockerflow, showing both YAML and Java versions. 8 | 9 | ## Prerequisites 10 | 11 | 1. Complete the [Dockerflow](/googlegenomics/dockerflow) Getting Started instructions. 12 | 13 | ## Running the example 14 | 15 | There are two ways to run the example: from the YAML file, and from Java. It's entirely up to you 16 | if you prefer to write Java code or YAML. 17 | 18 | ### Running from a YAML definition 19 | 20 | To run the example using the YAML file: 21 | 22 | dockerflow --project=MY-PROJECT \ 23 | --workflow-file=hello-workflow.yaml \ 24 | --args-file=hello-args.yaml \ 25 | --workspace=gs://MY-BUCKET/MY-PATH 26 | 27 | Replace `PATH/TO` with the path to your jar file. 28 | Set `MY-BUCKET` and `MY-PATH` to a bucket and path that you'd like to use to store output 29 | files, working files, and logs. Set `MY-PROJECT` to your cloud project name. 30 | 31 | The output will be located at `gs://MY-BUCKET/MY-PATH/logs/Hello/task-stdout.log`. 32 | 33 | ### Running from Java 34 | 35 | To run the same example using the Java definition rather than YAML, first compile the class: 36 | 37 | javac -cp PATH/TO/dockerflow*dependencies.jar HelloWorkflow.java 38 | 39 | Then run the workflow from the Java class: 40 | 41 | dockerflow --project=MY-PROJECT \ 42 | --workflow-class=HelloWorkflow \ 43 | --args-file=hello-args.yaml \ 44 | --workspace=gs://MY-BUCKET/MY-PATH 45 | 46 | Set `MY-BUCKET` and `MY-PATH` to a bucket and path that you'd like to use to store output 47 | files, working files, and logs. Set `MY-PROJECT` to your cloud project name. 48 | 49 | The output will be located at `gs://MY-BUCKET/MY-PATH/logs/Hello/task-stdout.log`. -------------------------------------------------------------------------------- /src/test/resources/complex-graph.yaml: -------------------------------------------------------------------------------- 1 | version: v1alpha2 2 | defn: 3 | name: ComplexGraph 4 | description: A more complex directed acyclic graph. 
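In the graph below, BRANCH fans out into parallel paths, while a nested list runs its items in sequence.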
5 | graph: 6 | - one 7 | - BRANCH: 8 | - BRANCH: 9 | - - two 10 | - three 11 | - four 12 | - - five 13 | - six 14 | - seven 15 | - eight 16 | steps: 17 | - defn: 18 | name: one 19 | inputParameters: 20 | - name: msg 21 | defaultValue: one 22 | docker: 23 | imageName: ubuntu 24 | cmd: "echo Task=${msg}" 25 | - defn: 26 | name: two 27 | inputParameters: 28 | - name: msg 29 | defaultValue: two 30 | docker: 31 | imageName: ubuntu 32 | cmd: "echo Task=${msg}" 33 | - defn: 34 | name: three 35 | inputParameters: 36 | - name: msg 37 | defaultValue: three 38 | docker: 39 | imageName: ubuntu 40 | cmd: "echo Task=${msg}" 41 | - defn: 42 | name: four 43 | inputParameters: 44 | - name: msg 45 | defaultValue: four 46 | docker: 47 | imageName: ubuntu 48 | cmd: "echo Task=${msg}" 49 | - defn: 50 | name: five 51 | inputParameters: 52 | - name: msg 53 | defaultValue: five 54 | docker: 55 | imageName: ubuntu 56 | cmd: "echo Task=${msg}" 57 | - defn: 58 | name: six 59 | inputParameters: 60 | - name: msg 61 | defaultValue: six 62 | docker: 63 | imageName: ubuntu 64 | cmd: "echo Task=${msg}" 65 | - defn: 66 | name: seven 67 | inputParameters: 68 | - name: msg 69 | defaultValue: seven 70 | docker: 71 | imageName: ubuntu 72 | cmd: "echo Task=${msg}" 73 | - defn: 74 | name: eight 75 | inputParameters: 76 | - name: msg 77 | defaultValue: eight 78 | docker: 79 | imageName: ubuntu 80 | cmd: "echo Task=${msg}" 81 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/args/ArgsTableBuilderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.args; 17 | 18 | import com.google.cloud.genomics.dockerflow.TestUtils; 19 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 20 | 21 | import java.io.IOException; 22 | import java.util.Map; 23 | import static org.junit.Assert.assertTrue; 24 | import org.junit.Test; 25 | import org.slf4j.Logger; 26 | import org.slf4j.LoggerFactory; 27 | 28 | /** 29 | * Test running with multiple input parameter sets. 
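Each data row of workflowArgs.csv supplies the arguments for one run.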
30 | */ 31 | public class ArgsTableBuilderTest { 32 | private static final Logger LOG = LoggerFactory.getLogger(ArgsTableBuilderTest.class); 33 | 34 | @Test 35 | public void testLoadCsv() throws IOException { 36 | Map m = 37 | ArgsTableBuilder.fromFile(TestUtils.RESOURCE_DIR + "/workflowArgs.csv") 38 | .project(TestUtils.TEST_PROJECT) 39 | .preemptible(true) 40 | .build(); 41 | 42 | String json = StringUtils.toJson(m); 43 | LOG.info(json); 44 | 45 | assertTrue("Project ID not set", json.contains("projectId")); 46 | assertTrue("Preemptible setting missing", json.contains("preemptible")); 47 | assertTrue("inputFile path wrong", json.contains("../TaskOne/output-one.txt")); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Want to contribute? Great! Please read this page so your submission can go 4 | through smoothly. 5 | 6 | ## Contributor License Agreement 7 | 8 | Before we can use your code, you must sign the 9 | [Google Individual Contributor License Agreement](https://cla.developers.google.com/about/google-individual) 10 | (CLA), which you can do online. The CLA is necessary mainly because you own the 11 | copyright to your changes, even after your contribution becomes part of our 12 | codebase, so we need your permission to use and distribute your code. We also 13 | need to be sure of various other things — for instance that you'll tell us if 14 | you know that your code infringes on other people's patents. 15 | 16 | Contributions made by corporations are covered by a different agreement than 17 | the one above. If you work for a company that wants to allow you to contribute 18 | your work, then you'll need to sign a 19 | [Software Grant and Corporate Contributor License Agreement](https://cla.developers.google.com/about/google-corporate). 20 | 21 | You don't have to sign the CLA until after you've submitted your code for review 22 | and a member has approved it, but you must do it before we can put your code 23 | into the repository. Before you start working on a larger contribution, you 24 | should get in touch with us first through the issue tracker with your idea so 25 | that we can help out and possibly guide you. Coordinating up front makes it much 26 | easier to avoid frustration later on. 27 | 28 | ## Developer Workflow 29 | 30 | If you would like to add a new feature, cmdlet, or change, first 31 | [create a new Issue](https://github.com/googlegenomics/dockerflow/issues/new). 32 | There we will triage the idea and discuss any design or implementation details. 33 | 34 | Contributors are expected to do their work in a local fork and submit code for 35 | consideration via a GitHub pull request. 36 | 37 | When the pull request process deems the change ready, it will be merged directly 38 | into the tree. Congratulations and thank you! -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/workflow/WorkflowDefn.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.workflow; 17 | 18 | import com.google.cloud.dataflow.sdk.Pipeline; 19 | import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions; 20 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 21 | import com.google.cloud.genomics.dockerflow.dataflow.DataflowBuilder; 22 | 23 | import java.io.IOException; 24 | import java.net.URISyntaxException; 25 | import java.util.Map; 26 | 27 | /** 28 | * A workflow definition. Typically you'll implement only one of the methods: 29 | * createWorkflow if you're going to use the WorkflowBuilder, and createDataflow 30 | * if you're going to construct the Dataflow pipeline using the PTransform 31 | * classes directly. 32 | */ 33 | public interface WorkflowDefn { 34 | 35 | /** 36 | * The workflow defn implementation is responsible for defining the workflow steps and default 37 | * args, and creating a Dataflow pipeline. 38 | * 39 | * @throws URISyntaxException 40 | */ 41 | default Pipeline createDataflow( 42 | Map argsTable, DataflowPipelineOptions pipelineOptions, String[] args) 43 | throws IOException { 44 | return DataflowBuilder.of(createWorkflow(args)) 45 | .createFrom(argsTable) 46 | .pipelineOptions(pipelineOptions) 47 | .build(); 48 | } 49 | 50 | /** 51 | * Create the workflow. 52 | */ 53 | default Workflow createWorkflow(String[] args) throws IOException { 54 | return null; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/transform/BreakFusion.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.transform; 17 | 18 | import com.google.cloud.dataflow.sdk.transforms.Combine; 19 | import com.google.cloud.dataflow.sdk.transforms.DoFn; 20 | import com.google.cloud.dataflow.sdk.transforms.PTransform; 21 | import com.google.cloud.dataflow.sdk.transforms.ParDo; 22 | import com.google.cloud.dataflow.sdk.transforms.SerializableFunction; 23 | import com.google.cloud.dataflow.sdk.transforms.Values; 24 | import com.google.cloud.dataflow.sdk.values.KV; 25 | import com.google.cloud.dataflow.sdk.values.PCollection; 26 | 27 | /** Break Dataflow fusion. 
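Dataflow can fuse adjacent steps into a single stage; keying each element by hash, combining per key, and taking the values forces a shuffle so that downstream steps begin a new stage.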
*/ 28 | @SuppressWarnings("serial") 29 | public class BreakFusion extends PTransform, PCollection> { 30 | 31 | public BreakFusion() {} 32 | 33 | public BreakFusion(String name) { 34 | super(name); 35 | } 36 | 37 | @Override 38 | public PCollection apply(PCollection input) { 39 | return input 40 | .apply(ParDo.named("BreakFusion").of(new DummyMapFn())) 41 | .apply(Combine.perKey(new First())) 42 | .apply(Values.create()); 43 | } 44 | 45 | static class DummyMapFn extends DoFn> { 46 | 47 | @Override 48 | public void processElement(DoFn>.ProcessContext c) throws Exception { 49 | c.output(KV.of(String.valueOf(c.element().hashCode()), c.element())); 50 | } 51 | } 52 | 53 | /** 54 | * Return the first element. Since ordering is not guaranteed, it should be treated as effectively 55 | * a random element. 56 | */ 57 | static class First implements SerializableFunction, T> { 58 | 59 | @Override 60 | public T apply(Iterable input) { 61 | return input.iterator() != null && input.iterator().hasNext() 62 | ? input.iterator().next() 63 | : null; 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/examples/ExampleGraphsITCase.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | package com.google.cloud.genomics.dockerflow.examples; 15 | 16 | import static org.junit.Assert.assertTrue; 17 | 18 | import com.google.cloud.genomics.dockerflow.DockerflowConstants; 19 | import com.google.cloud.genomics.dockerflow.TestUtils; 20 | import com.google.cloud.genomics.dockerflow.util.FileUtils; 21 | import java.io.IOException; 22 | import org.junit.Test; 23 | import org.slf4j.Logger; 24 | import org.slf4j.LoggerFactory; 25 | 26 | /** 27 | * Integration tests. Runs the same tests as the superclass, but remotely with Dataflow's blocking 28 | * runner. 29 | *
To run, you'll need to gsutil cp all of the src/test/resources files to TEST_GCS_PATH. 30 | * 31 | * After running integration tests, clean up by running: 32 | * gsutil -m rm -r TEST_BASE_DIR/*
33 | */ 34 | public class ExampleGraphsITCase extends ExampleGraphsTest { 35 | private static Logger LOG = LoggerFactory.getLogger(ExampleGraphsITCase.class); 36 | 37 | public ExampleGraphsITCase() throws IOException { 38 | utils.baseDir = TestUtils.TEST_GCS_PATH; 39 | utils.runner = DockerflowConstants.BLOCKING_RUNNER; 40 | utils.checkOutput = true; 41 | LOG.info("Running with GCS paths, blocking runner, and output file checks enabled"); 42 | } 43 | 44 | @Test 45 | @Override 46 | public void testTask() throws Exception { 47 | super.testTask(); 48 | } 49 | 50 | @Test 51 | @Override 52 | public void testLinearGraph() throws Exception { 53 | super.testLinearGraph(); 54 | } 55 | 56 | @Test 57 | @Override 58 | public void testMultiLinearGraph() throws Exception { 59 | super.testMultiLinearGraph(); 60 | } 61 | 62 | @Test 63 | @Override 64 | public void testComplexGraph() throws Exception { 65 | super.testComplexGraph(); 66 | } 67 | 68 | @Test 69 | public void testFileExists() throws Exception { 70 | assertTrue(FileUtils.gcsPathExists(utils.baseDir + "/task-one.yaml")); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/examples/ComplexGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.examples; 17 | 18 | import com.google.cloud.genomics.dockerflow.dataflow.DataflowBuilder; 19 | import com.google.cloud.genomics.dockerflow.task.Task; 20 | import com.google.cloud.genomics.dockerflow.task.TaskBuilder; 21 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 22 | import com.google.cloud.genomics.dockerflow.workflow.Workflow.Branch; 23 | import com.google.cloud.genomics.dockerflow.workflow.Workflow.Steps; 24 | import java.io.IOException; 25 | import org.slf4j.Logger; 26 | import org.slf4j.LoggerFactory; 27 | 28 | /** 29 | * Run a complex workflow graph with Docker steps using Dataflow for orchestration. 30 | * 31 | *
Required command-line arguments: 33 | *
34 |  * --project=YOUR_PROJECT_ID
35 |  * --workspace=gs://YOUR_BUCKET/DIR
36 |  * --runner=DATAFLOW_RUNNER_NAME
37 |  * --max-workers=INT (recommended=3)
38 |  *
39 | */ 40 | public class ComplexGraph { 41 | private static final Logger LOG = LoggerFactory.getLogger(ComplexGraph.class); 42 | 43 | public static void main(String[] args) throws IOException { 44 | LOG.info("Defining and running Dataflow pipeline"); 45 | Workflow w = 46 | TaskBuilder.named(ComplexGraph.class.getSimpleName()) 47 | .steps( 48 | Steps.of( 49 | task("one"), 50 | Branch.of( 51 | Branch.of(Steps.of(task("two"), task("three")), task("four")), 52 | Steps.of(task("five"), task("six"), task("seven")), 53 | task("eight")))) 54 | .build(); 55 | DataflowBuilder.of(w).createFrom(args).pipelineOptions(args).build().run(); 56 | } 57 | 58 | public static Task task(String name) throws IOException { 59 | LOG.info("Building Docker task: " + name); 60 | return TaskBuilder.named(name) 61 | .input("name", name) 62 | .docker("ubuntu") 63 | .script("echo Task=${name}") 64 | .build(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/transform/MergeBranches.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.transform; 17 | 18 | import com.google.cloud.dataflow.sdk.transforms.Combine; 19 | import com.google.cloud.dataflow.sdk.transforms.Flatten; 20 | import com.google.cloud.dataflow.sdk.transforms.PTransform; 21 | import com.google.cloud.dataflow.sdk.transforms.SerializableFunction; 22 | import com.google.cloud.dataflow.sdk.values.KV; 23 | import com.google.cloud.dataflow.sdk.values.PCollection; 24 | import com.google.cloud.dataflow.sdk.values.PCollectionList; 25 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 26 | 27 | /** Merge branches in the graph. 
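The branch outputs are flattened into a single collection and combined into one element whose WorkflowArgs gathers the args of every branch.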
*/ 28 | @SuppressWarnings("serial") 29 | public class MergeBranches 30 | extends PTransform< 31 | PCollectionList>, PCollection>> { 32 | 33 | public MergeBranches() { 34 | super(); 35 | } 36 | 37 | public MergeBranches(String name) { 38 | super(name); 39 | } 40 | 41 | @Override 42 | public PCollection> apply( 43 | PCollectionList> input) { 44 | return input 45 | .apply(Flatten.>pCollections()) 46 | .apply(Combine.globally(new Merge())); 47 | } 48 | 49 | private static class Merge 50 | implements SerializableFunction< 51 | Iterable>, KV> { 52 | 53 | @Override 54 | public KV apply(Iterable> input) { 55 | String key = null; 56 | WorkflowArgs retval = null; 57 | 58 | // Merge arguments 59 | for (KV kv : input) { 60 | 61 | // Modify a copy 62 | WorkflowArgs wa = new WorkflowArgs(kv.getValue()); 63 | 64 | // First time, nothing to merge 65 | if (retval == null) { 66 | key = kv.getKey(); 67 | retval = wa; 68 | // Find differences and merge 69 | } else { 70 | retval.gatherArgs(wa); 71 | } 72 | } 73 | return KV.of(key, retval); 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/dataflow/DataflowBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.dataflow; 17 | 18 | import com.google.cloud.dataflow.sdk.Pipeline; 19 | import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions; 20 | import com.google.cloud.genomics.dockerflow.args.ArgsTableBuilder; 21 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 22 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 23 | import java.io.IOException; 24 | import java.util.Map; 25 | 26 | /** 27 | * Builder for workflow graphs. 28 | */ 29 | public class DataflowBuilder { 30 | private Workflow workflow; 31 | private Map workflowArgs; 32 | private DataflowPipelineOptions pipelineOptions; 33 | 34 | /** 35 | * Constructor. 36 | * 37 | * @param name 38 | * @return 39 | * @throws IOException 40 | */ 41 | public static DataflowBuilder of(Workflow w) throws IOException { 42 | return new DataflowBuilder(w); 43 | } 44 | 45 | DataflowBuilder(Workflow w) throws IOException { 46 | workflow = w; 47 | } 48 | 49 | DataflowBuilder() {} 50 | 51 | /** Arguments for a single run. */ 52 | public DataflowBuilder createFrom(WorkflowArgs args) { 53 | workflowArgs = ArgsTableBuilder.of(args).build(); 54 | return this; 55 | } 56 | 57 | /** Arguments for multiple concurrent runs. */ 58 | public DataflowBuilder createFrom(Map args) { 59 | workflowArgs = args; 60 | return this; 61 | } 62 | 63 | /** Arguments from the command line for one or more runs. 
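Parsing is delegated to ArgsTableBuilder.fromArgs, which yields one WorkflowArgs entry per run.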
*/ 64 | public DataflowBuilder createFrom(String[] args) throws IOException { 65 | workflowArgs = ArgsTableBuilder.fromArgs(args).build(); 66 | return this; 67 | } 68 | 69 | public DataflowBuilder pipelineOptions(String[] args) throws IOException { 70 | return pipelineOptions(DataflowFactory.pipelineOptions(args)); 71 | } 72 | 73 | public DataflowBuilder pipelineOptions(DataflowPipelineOptions options) { 74 | pipelineOptions = options; 75 | pipelineOptions.setAppName(workflow.getDefn().getName()); 76 | return this; 77 | } 78 | 79 | public Pipeline build() throws IOException { 80 | return DataflowFactory.dataflow(workflow, workflowArgs, pipelineOptions); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/runner/Operation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.runner; 17 | 18 | import java.io.Serializable; 19 | import java.util.List; 20 | import java.util.Map; 21 | 22 | /** 23 | * A replacement for the autogenerated Pipelines API's Operation object. Reason: it works with 24 | * standard json serializers. 25 | */ 26 | @SuppressWarnings("serial") 27 | public class Operation implements Serializable { 28 | private String name; 29 | private Boolean done; 30 | private Status error; 31 | private Map metadata; 32 | private Map response; 33 | 34 | public String getName() { 35 | return name; 36 | } 37 | 38 | public void setName(String name) { 39 | this.name = name; 40 | } 41 | 42 | public Boolean getDone() { 43 | return done; 44 | } 45 | 46 | public void setDone(Boolean done) { 47 | this.done = done; 48 | } 49 | 50 | public Status getError() { 51 | return error; 52 | } 53 | 54 | public void setError(Status error) { 55 | this.error = error; 56 | } 57 | 58 | public Map getMetadata() { 59 | return metadata; 60 | } 61 | 62 | public void setMetadata(Map metadata) { 63 | this.metadata = metadata; 64 | } 65 | 66 | public Map getResponse() { 67 | return response; 68 | } 69 | 70 | public void setResponse(Map response) { 71 | this.response = response; 72 | } 73 | 74 | /** 75 | * Status. 
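Holds the error code, message, and details reported for a failed operation.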
76 | */ 77 | public class Status implements Serializable { 78 | private Integer code; 79 | private String message; 80 | private List> details; 81 | 82 | public Integer getCode() { 83 | return code; 84 | } 85 | 86 | public void setCode(Integer code) { 87 | this.code = code; 88 | } 89 | 90 | public String getMessage() { 91 | return message; 92 | } 93 | 94 | public void setMessage(String message) { 95 | this.message = message; 96 | } 97 | 98 | public List> getDetails() { 99 | return details; 100 | } 101 | 102 | public void setDetails(List> details) { 103 | this.details = details; 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/DockerflowITCase.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow; 17 | 18 | import org.junit.Test; 19 | import org.slf4j.Logger; 20 | import org.slf4j.LoggerFactory; 21 | 22 | /** 23 | * Integration tests. Runs the same tests as the superclass, but remotely with the 24 | * Dataflow service and Pipelines API. 25 | * 26 | *
To run the tests, gsutil cp all of the src/test/resources files to TEST_GCS_PATH. 27 | * 28 | * After running integration tests, clean up by running: 29 | * gsutil -m rm -r TEST_BASE_DIR/*
30 | */ 31 | public class DockerflowITCase extends DockerflowTest { 32 | private static final Logger LOG = LoggerFactory.getLogger(DockerflowITCase.class); 33 | 34 | public DockerflowITCase() { 35 | utils.baseDir = TestUtils.TEST_GCS_PATH; 36 | utils.runner = DockerflowConstants.BLOCKING_RUNNER; 37 | utils.checkOutput = true; 38 | LOG.info("Running with GCS paths, blocking runner, and output file checks enabled"); 39 | } 40 | 41 | @Test 42 | @Override 43 | public void testSingleTaskNoWait() throws Exception { 44 | utils.runner = DockerflowConstants.DEFAULT_RUNNER; 45 | super.testSingleTaskNoWait(); 46 | utils.runner = DockerflowConstants.BLOCKING_RUNNER; 47 | } 48 | 49 | @Test 50 | @Override 51 | public void testSingleTask() throws Exception { 52 | super.testSingleTask(); 53 | } 54 | 55 | @Test 56 | @Override 57 | public void testParameterSubstitution() throws Exception { 58 | super.testParameterSubstitution(); 59 | } 60 | 61 | @Test 62 | @Override 63 | public void testLinearGraph() throws Exception { 64 | super.testLinearGraph(); 65 | } 66 | 67 | @Test 68 | @Override 69 | public void testScatter() throws Exception { 70 | super.testScatter(); 71 | } 72 | 73 | @Test 74 | @Override 75 | public void testGather() throws Exception { 76 | super.testGather(); 77 | } 78 | 79 | @Test 80 | @Override 81 | public void testReorderedGraph() throws Exception { 82 | super.testReorderedGraph(); 83 | } 84 | 85 | @Test 86 | @Override 87 | public void testBranchingGraph() throws Exception { 88 | super.testBranchingGraph(); 89 | } 90 | 91 | @Test 92 | @Override 93 | public void testComplexGraph() throws Exception { 94 | super.testComplexGraph(); 95 | } 96 | 97 | @Test 98 | @Override 99 | public void testFolderCopy() throws Exception { 100 | super.testFolderCopy(); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | com.google.cloud.genomics.dockerflow 7 | dockerflow 8 | 0.0.1-SNAPSHOT 9 | 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 3.3 16 | 17 | 1.8 18 | 1.8 19 | 20 | 21 | 22 | maven-assembly-plugin 23 | 24 | 25 | package 26 | 27 | single 28 | 29 | 30 | 31 | 32 | 33 | 34 | com.google.cloud.genomics.dockerflow.Dockerflow 35 | 36 | 37 | 38 | jar-with-dependencies 39 | 40 | 41 | 42 | 43 | org.apache.maven.plugins 44 | maven-release-plugin 45 | 2.5.3 46 | 47 | 48 | 49 | 50 | 51 | 52 | org.codehaus.mojo 53 | exec-maven-plugin 54 | 1.4.0 55 | 56 | false 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | com.google.cloud.dataflow 66 | google-cloud-dataflow-java-sdk-all 67 | 1.9.0 68 | 69 | 70 | com.google.code.gson 71 | gson 72 | 2.6.2 73 | 74 | 75 | com.fasterxml.jackson.dataformat 76 | jackson-dataformat-yaml 77 | 2.7.4 78 | 79 | 80 | org.apache.commons 81 | commons-csv 82 | 1.1 83 | 84 | 85 | org.slf4j 86 | slf4j-jdk14 87 | 1.7.7 88 | runtime 89 | 90 | 91 | junit 92 | junit 93 | 4.11 94 | test 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/examples/LinearGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.examples; 17 | 18 | import com.google.cloud.genomics.dockerflow.dataflow.DataflowBuilder; 19 | import com.google.cloud.genomics.dockerflow.task.Task; 20 | import com.google.cloud.genomics.dockerflow.task.TaskBuilder; 21 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 22 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 23 | import java.io.IOException; 24 | import java.util.Map; 25 | import org.slf4j.Logger; 26 | import org.slf4j.LoggerFactory; 27 | 28 | /** 29 | * Run a simple two-step workflow with Docker steps using Dataflow for orchestration. 30 | * 31 | *
Required command-line arguments: 33 | *
34 |  * --project=YOUR_PROJECT_ID
35 |  * --workspace=gs://YOUR_BUCKET/DIR
36 |  * --inputFile=FILE
37 |  * --outputFile=FILE
38 |  * --runner=DATAFLOW_RUNNER_NAME
39 |  *
40 | */ 41 | public class LinearGraph { 42 | private static final Logger LOG = LoggerFactory.getLogger(LinearGraph.class); 43 | 44 | public static void main(String[] args) throws IOException { 45 | 46 | LOG.info("Parsing command-line arguments"); 47 | Map m = StringUtils.parseArgs(args); 48 | String inputFile = m.get("inputFile"); 49 | String outputFile = m.get("outputFile"); 50 | String tmpFile = inputFile + ".tmp"; 51 | 52 | LOG.info("Building Docker tasks"); 53 | Task stepOne = 54 | TaskBuilder.named("TaskOne") 55 | .project(m.get("project")) 56 | .logging(m.get("logging") + "/1/test.log") 57 | .zones(new String[] {"us-*"}) 58 | .inputFile("inputFile", inputFile) 59 | .input("message", "hello") 60 | .outputFile("outputFile", tmpFile) 61 | .docker("ubuntu") 62 | .script("cp ${inputFile} ${outputFile} ; echo ${message} >> ${outputFile}") 63 | .build(); 64 | 65 | Task stepTwo = 66 | TaskBuilder.named("TaskTwo") 67 | .project(m.get("project")) 68 | .logging(m.get("logging") + "/2/test.log") 69 | .zones(new String[] {"us-*"}) 70 | .inputFile("inputFile", tmpFile) 71 | .input("message", "goodbye") 72 | .outputFile("outputFile", outputFile) 73 | .docker("ubuntu") 74 | .script("cp ${inputFile} ${outputFile} ; echo ${message} >> ${outputFile}") 75 | .build(); 76 | 77 | LOG.info("Defining and running Dataflow pipeline"); 78 | Workflow w = 79 | TaskBuilder.named(LinearGraph.class.getSimpleName()).steps(stepOne, stepTwo).build(); 80 | DataflowBuilder.of(w).createFrom(args).pipelineOptions(args).build().run(); 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/transform/WaitForOperation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.transform; 17 | 18 | import com.google.cloud.dataflow.sdk.transforms.DoFn; 19 | import com.google.cloud.dataflow.sdk.values.KV; 20 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 21 | import com.google.cloud.genomics.dockerflow.runner.Operation; 22 | import com.google.cloud.genomics.dockerflow.runner.TaskException; 23 | import com.google.cloud.genomics.dockerflow.runner.TaskRunner; 24 | 25 | import org.slf4j.Logger; 26 | import org.slf4j.LoggerFactory; 27 | 28 | /** Poll for task completion. If testing, return immediately. */ 29 | @SuppressWarnings("serial") 30 | public class WaitForOperation extends DoFn, KV> { 31 | private static final Logger LOG = LoggerFactory.getLogger(WaitForOperation.class); 32 | 33 | public WaitForOperation() { 34 | } 35 | 36 | @Override 37 | public void processElement( 38 | DoFn, KV>.ProcessContext c) throws Exception { 39 | WorkflowArgs wa = new WorkflowArgs(c.element().getValue()); 40 | Operation o = wa.getCurrentOperation(); 41 | 42 | // Task is already done. 
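Emit it unchanged instead of polling again.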
43 | if (o != null && o.getDone()) { 44 | c.output(c.element()); 45 | 46 | // Wait for it 47 | } else { 48 | if (wa.isTesting() != null && wa.isTesting()) { 49 | LOG.info("Running in local/test mode. Not waiting for the operation to complete."); 50 | } else { 51 | LOG.info("Waiting for " + o.getName()); 52 | o = TaskRunner.wait(wa.getCurrentOperation()); 53 | } 54 | wa.setCurrentOperation(o); 55 | 56 | LOG.info("Operation name: " + o.getName() + " completed."); 57 | 58 | // Check for errors and abort if that's the policy 59 | if (o.getError() != null) { 60 | 61 | String msg = o.getError().getMessage(); 62 | if (o.getError().getDetails() != null) { 63 | msg += ". " + o.getError().getDetails(); 64 | } 65 | 66 | // VM was preempted 67 | if (msg.indexOf("stopped unexpectedly") >= 0) { 68 | LOG.info( 69 | "VM was preempted. Task will be retried up to " 70 | + wa.getMaxTries() 71 | + " attempts."); 72 | 73 | // Abort 74 | } else if (wa.getAbortOnError()) { 75 | throw new TaskException("Operation " + o.getName() + " failed. Details: " + msg); 76 | 77 | // Log the error 78 | } else { 79 | LOG.info("Operation " + o.getName() + " failed, but not aborting. Details: " + msg); 80 | } 81 | } 82 | 83 | c.output(KV.of(c.element().getKey(), wa)); 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /examples/gatk/README.md: -------------------------------------------------------------------------------- 1 | ### Disclaimer 2 | 3 | This is not an official Google product. 4 | 5 | ## GATK with DockerFlow 6 | 7 | This example uses Dockerflow to implement a popular bioinformatics workflow, the 8 | [Broad Institute GATK](http://broadinstitute.org/gatk) Best Practices pipeline for human whole 9 | genome sequencing. 10 | 11 | This example shows how to: 12 | 13 | * Define a complex, real-world workflow in [YAML](http://yaml.org) 14 | * Define the same workflow in Java 15 | * Pass parameters to tasks 16 | * Pass an output file from one task as an input to the next 17 | * Scatter by input file or lines of a file 18 | * Gather outputs 19 | * Branch a graph to run steps in parallel 20 | 21 | Take a look at `gatk-workflow.yaml` and `GatkPairedSingleSample.java` for details. 22 | 23 | **In order to run this example, you must agree to the [GATK End User License Agreement](https://software.broadinstitute.org/gatk/download/licensing), including the attribution requirement.** 24 | 25 | ## Prerequisites 26 | 27 | 1. Complete the [Dockerflow](/googlegenomics/dockerflow) Getting Started instructions. 28 | 29 | ## Running the example 30 | 31 | There are two ways to run the example: from the YAML file, and from Java. It's entirely up to you 32 | if you prefer to write Java code or YAML. The Java is a bit more compact. The YAML doesn't need to be 33 | compiled. 34 | 35 | ### Running from a YAML definition 36 | 37 | To run the example using the YAML file: 38 | 39 | dockerflow --project=MY-PROJECT \ 40 | --workflow-file=gatk-workflow.yaml \ 41 | --args-file=gatk-args.yaml \ 42 | --workspace=gs://MY-BUCKET/MY-PATH \ 43 | --preemptible 44 | 45 | Replace `PATH/TO` with the path to your jar file. 46 | Set `MY-BUCKET` and `MY-PATH` to a bucket and path that you'd like to use to store output 47 | files, working files, and logs. Set `MY-PROJECT` to your cloud project name. 48 | 49 | Things to note: the args file contains the paths to the input files in Google Cloud Storage. You 50 | can change the paths to point to your own data. 
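Input entries follow the `TASK.PARAMETER` naming used elsewhere in Dockerflow; as a sketch, with illustrative task and path names:

    inputs:
      SomeStep.inputFile: gs://MY-BUCKET/MY-PATH/sample.bam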
51 | 52 | The `workspace` is the location where your logs and output files will go. 53 | 54 | By setting `--preemptible`, your workflow will run with preemptible VMs, which can save money. 55 | The downside is they're more likely to be terminated and need retries. Fortunately, DockerFlow 56 | automatically retries failed steps for you. 57 | 58 | ### Running from a Java definition 59 | 60 | To run the same example using the Java definition rather than YAML, first compile the class: 61 | 62 | javac -cp PATH/TO/dockerflow*dependencies.jar GatkPairedSingleSample.java 63 | 64 | Then run the workflow from the Java class: 65 | 66 | dockerflow --project=MY-PROJECT \ 67 | --workflow-class=GatkPairedSingleSample \ 68 | --args-file=gatk-params.yaml \ 69 | --workspace=gs://MY-BUCKET/MY-PATH \ 70 | --preemptible 71 | 72 | Set `MY-BUCKET` and `MY-PATH` to a bucket and path that you'd like to use to store output 73 | files, working files, and logs. Set `MY-PROJECT` to your cloud project name. 74 | 75 | ### Troubleshooting 76 | 77 | To see detailed log messages during workflow execution, you can run locally by setting 78 | `--runner=DirectPipelineRunner`. 79 | 80 | To do a dry run, run with the `--test` flag. (Unfortunately, this will fail for GATK at the couple 81 | of places where workflow stages depend upon the output of earlier stages. You can create dummy 82 | files and store them in the expected locations.) 83 | 84 | To resume a workflow where it left off, run with `--resume`. The workflow will start from the 85 | beginning, and if all output files from a task already exist in Google Cloud Storage, the workflow 86 | will skip ahead to the next task. 87 | 88 | ## Next steps 89 | 90 | * Write your own pipeline 91 | * Contribute to DockerFlow 92 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/DockerflowConstants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow; 17 | 18 | import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner; 19 | import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner; 20 | import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner; 21 | 22 | /** Command-line flags and defaults. 
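* These names are matched against the map produced by
* StringUtils.parseArgs; e.g., the PROJECT constant corresponds to the
* --project command-line flag.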
*/ 23 | public interface DockerflowConstants { 24 | String ABORT = "abort"; 25 | String[] ALL_ZONES = { 26 | "asia-east1-a", 27 | "asia-east1-b", 28 | "asia-east1-c", 29 | "europe-west1-b", 30 | "europe-west1-c", 31 | "europe-west1-d", 32 | "us-central1-a", 33 | "us-central1-b", 34 | "us-central1-c", 35 | "us-central1-f", 36 | "us-east1-b", 37 | "us-east1-c", 38 | "us-east1-d", 39 | "us-west1-a", 40 | "us-west1-b" 41 | }; 42 | String API_OPERATIONS = "https://genomics.googleapis.com/v1alpha2/"; 43 | String API_RUN_PIPELINE = "https://genomics.googleapis.com/v1alpha2/pipelines:run"; 44 | String ARGS_FILE = "args-file"; 45 | String BLOCKING_RUNNER = BlockingDataflowPipelineRunner.class.getSimpleName(); 46 | /** The key in the workflow graph definition to indicate a branch in the graph. */ 47 | String BRANCH = "BRANCH"; 48 | String CPU = "cpu"; 49 | String DEFAULT_DISK_NAME = "data"; 50 | String DEFAULT_DISK_SIZE = "500"; 51 | String DEFAULT_DISK_TYPE = "PERSISTENT_HDD"; 52 | String DEFAULT_MACHINE_TYPE = "n1-standard-1"; 53 | int DEFAULT_MAX_TRIES = 3; 54 | String DEFAULT_MOUNT_POINT = "/mnt/data"; 55 | String DEFAULT_RUNNER = DataflowPipelineRunner.class.getSimpleName(); 56 | String DELETE_FILES = "delete-files"; 57 | String DIRECT_RUNNER = DirectPipelineRunner.class.getSimpleName(); 58 | String DISK_SIZE = "disk-size"; 59 | String DOCKER = "docker"; 60 | String DOCKERFLOW_WORKSPACE = "DOCKERFLOW_WORKSPACE"; 61 | String DOCKERFLOW_PROJECT = "DOCKERFLOW_PROJECT"; 62 | String DOCKERFLOW_TEST = "DOCKERFLOW_TEST"; 63 | String DOCKERFLOW_ZONES = "DOCKERFLOW_ZONES"; 64 | String GLOBALS = "globals"; 65 | String HELP = "help"; 66 | String INPUTS = "inputs"; 67 | String INPUT_FILE = "input-file"; 68 | String INPUTS_FROM_FILE = "inputs-from-file"; 69 | String KEEP_ALIVE = "keep-alive"; 70 | String LOGGING = "logging"; 71 | String MACHINE_TYPE = "machine-type"; 72 | String MAX_TRIES = "max-tries"; 73 | String MAX_WORKERS = "max-workers"; 74 | String MEMORY = "memory"; 75 | String NAME = "name"; 76 | String OUTPUTS = "outputs"; 77 | int POLL_INTERVAL = 30; 78 | String PREEMPTIBLE = "preemptible"; 79 | String PREFIX_INPUT = "<"; 80 | String PREFIX_OUTPUT = ">"; 81 | String PROJECT = "project"; 82 | String REQUIRED = "REQUIRED"; 83 | String RESUME = "resume"; 84 | String RUN_ID = "run-id"; 85 | String RUNNER = "runner"; 86 | String SCATTER = "scatter"; 87 | String SCRIPT = "script"; 88 | String SCRIPT_FILE = "script-file"; 89 | String SERVICE_ACCOUNT_NAME = "service-account-name"; 90 | String SERVICE_ACCOUNT_SCOPES = "service-account-scopes"; 91 | String STAGING = "staging"; 92 | String STAGING_LOCATION = "stagingLocation"; 93 | String TASK_FILE = "task-file"; 94 | String TEST = "test"; 95 | String WAIT = "wait"; 96 | String WILDCARD = "*"; 97 | String WORKFLOW_CLASS = "workflow-class"; 98 | String WORKFLOW_FILE = "workflow-file"; 99 | String WORKSPACE = "workspace"; 100 | String ZONES = "zones"; 101 | } 102 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/TestUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow; 17 | 18 | import static org.junit.Assert.assertNotNull; 19 | import static org.junit.Assert.assertTrue; 20 | 21 | import com.google.cloud.genomics.dockerflow.util.FileUtils; 22 | import com.google.cloud.genomics.dockerflow.util.HttpUtils; 23 | import java.io.IOException; 24 | import java.util.concurrent.TimeUnit; 25 | import org.slf4j.Logger; 26 | import org.slf4j.LoggerFactory; 27 | 28 | /** Utilities for testing. */ 29 | public class TestUtils { 30 | private static final Logger LOG = LoggerFactory.getLogger(TestUtils.class); 31 | 32 | // Environment variables 33 | public static final String TEST_PROJECT = System.getenv("TEST_PROJECT"); 34 | public static final String TEST_GCS_PATH = System.getenv("TEST_GCS_PATH"); 35 | public static final String RESOURCE_DIR = "src/test/resources"; 36 | 37 | public String baseDir = RESOURCE_DIR; // Allow local or GCS paths 38 | public String runner = DockerflowConstants.DIRECT_RUNNER; 39 | public boolean checkOutput = false; // because we can't yet on local files 40 | 41 | // Expected results 42 | public static final String OUTPUT_ONE = "cat\nhello"; 43 | public static final String OUTPUT_TWO = "dog\nhello"; 44 | public static final String OUTPUT_ONE_TWO = "cat\nhello\ngoodbye"; 45 | public static final String OUTPUT_TWO_ONE = "dog\ngoodbye\nhello"; 46 | public static final String OUTPUT_ONE_TWO_THREE = "cat\nhello\ngoodbye\nhello"; 47 | 48 | public TestUtils() { 49 | assertNotNull("You must set the TEST_PROJECT environment variable.", TestUtils.TEST_PROJECT); 50 | assertNotNull("You must set the TEST_GCS_PATH environment variable.", TestUtils.TEST_GCS_PATH); 51 | assertTrue("TEST_GCS_PATH must begin with gs:// ", TestUtils.TEST_GCS_PATH.startsWith("gs://")); 52 | assertTrue( 53 | "TEST_GCS_PATH must not end with a trailing slash /", 54 | !TestUtils.TEST_GCS_PATH.endsWith("/")); 55 | 56 | LOG.info("TEST_PROJECT=" + TestUtils.TEST_PROJECT); 57 | } 58 | 59 | /** 60 | * Read file contents with retries, since GCS is eventually consistent, and output files may not 61 | * be visible right away. 
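* Returns null if the file still cannot be read after the final attempt.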
62 | */ 63 | public static String readAll(String path) { 64 | String output = null; 65 | final int maxTries = 1; 66 | int attempt = 0; 67 | do { 68 | DockerflowTest.LOG.info("Reading output from " + path); 69 | try { 70 | output = FileUtils.readAll(path); 71 | } catch (IOException e) { 72 | DockerflowTest.LOG.info("Failed attempt " + attempt + " with error: " + e.getMessage()); 73 | ++attempt; 74 | try { 75 | DockerflowTest.LOG.info("Sleeping for 20 sec"); 76 | TimeUnit.SECONDS.sleep(20); 77 | } catch (InterruptedException i) { 78 | // ignore 79 | } 80 | } 81 | } while (output == null && attempt < maxTries); 82 | return output; 83 | } 84 | 85 | public static void delete(String gcsPath) { 86 | if (!gcsPath.startsWith("gs://")) { 87 | if (gcsPath.startsWith("/")) { 88 | gcsPath = TestUtils.TEST_GCS_PATH + gcsPath; 89 | } else { 90 | gcsPath = TestUtils.TEST_GCS_PATH + "/" + gcsPath; 91 | } 92 | } 93 | try { 94 | HttpUtils.doDelete(gcsPath); 95 | } catch (IOException e) { 96 | DockerflowTest.LOG.info("Failed to delete: " + gcsPath); 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/examples/MultiLinearGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | * use this file except in compliance with the License. You may obtain a copy of 6 | * the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | * License for the specific language governing permissions and limitations under 14 | * the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.examples; 17 | 18 | import com.google.cloud.dataflow.sdk.Pipeline; 19 | import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions; 20 | import com.google.cloud.dataflow.sdk.transforms.Create; 21 | import com.google.cloud.genomics.dockerflow.args.ArgsTableBuilder; 22 | import com.google.cloud.genomics.dockerflow.dataflow.DataflowBuilder; 23 | import com.google.cloud.genomics.dockerflow.dataflow.DataflowFactory; 24 | import com.google.cloud.genomics.dockerflow.task.Task; 25 | import com.google.cloud.genomics.dockerflow.task.TaskBuilder; 26 | import com.google.cloud.genomics.dockerflow.transform.DockerDo; 27 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 28 | import java.io.IOException; 29 | import org.slf4j.Logger; 30 | import org.slf4j.LoggerFactory; 31 | 32 | /** 33 | * Run two instances of a simple workflow, each with different inputs and outputs. All workflow 34 | * steps run in Docker using Dataflow for orchestration. 35 | * 36 | *
<p>Required command-line arguments:
37 | *
38 | * <pre>
39 | * --project=YOUR_PROJECT_ID
40 | * --workspace=gs://YOUR_BUCKET/DIR
41 | * --args-file=CSV_FILE
42 | * --runner=DATAFLOW_RUNNER_NAME
43 | * </pre>
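*
* <p>See src/test/resources/workflowArgs.csv for a sample CSV args file.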
44 | */ 45 | public class MultiLinearGraph { 46 | private static final Logger LOG = LoggerFactory.getLogger(MultiLinearGraph.class); 47 | 48 | /** 49 | * Run the example using the FlowBuilder class to construct the workflow graph and Dataflow 50 | * pipeline automatically. You can use it even for arbitrarily complex directed acyclic graphs. 51 | */ 52 | public static void main(String[] args) throws IOException { 53 | LOG.info("Defining and running Dataflow pipeline"); 54 | Workflow w = 55 | TaskBuilder.named(MultiLinearGraph.class.getSimpleName()) 56 | .steps(taskOne(), taskTwo()) 57 | .build(); 58 | DataflowBuilder.of(w).createFrom(args).pipelineOptions(args).build().run(); 59 | } 60 | 61 | /** 62 | * For simple linear graphs, it's not too hard to generate the Dataflow pipeline yourself. Here's 63 | * the equivalent Dataflow code for this simple example. 64 | */ 65 | public static void manualDataflow(String[] args) throws IOException { 66 | LOG.info("Parsing Dataflow options"); 67 | DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args); 68 | o.setAppName(MultiLinearGraph.class.getSimpleName()); 69 | Pipeline p = Pipeline.create(o); 70 | 71 | p.apply(Create.of(ArgsTableBuilder.fromArgs(args).build())) 72 | .apply(DockerDo.of(taskOne())) 73 | .apply(DockerDo.of(taskTwo())); 74 | p.run(); 75 | } 76 | 77 | public static Task taskOne() throws IOException { 78 | LOG.info("Building Docker task: TaskOne."); 79 | return TaskBuilder.named("TaskOne") 80 | .inputFile("inputFile") 81 | .input("message", "hello") 82 | .outputFile("outputFile") 83 | .docker("ubuntu") 84 | .script("cp ${inputFile} ${outputFile}; echo ${message} >> ${outputFile}") 85 | .build(); 86 | } 87 | 88 | public static Task taskTwo() throws IOException { 89 | LOG.info("Building Docker task: TaskTwo."); 90 | return TaskBuilder.named("TaskTwo") 91 | .inputFile("inputFile") 92 | .input("message", "hello") 93 | .outputFile("outputFile") 94 | .docker("ubuntu") 95 | .script("cp ${inputFile} ${outputFile} ; echo ${message} >> ${outputFile}") 96 | .build(); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/transform/DeleteIntermediateFiles.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.google.cloud.genomics.dockerflow.transform; 17 | 18 | import com.google.cloud.dataflow.sdk.transforms.DoFn; 19 | import com.google.cloud.dataflow.sdk.values.KV; 20 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 21 | import com.google.cloud.genomics.dockerflow.task.Task; 22 | import com.google.cloud.genomics.dockerflow.task.TaskDefn.Param; 23 | import com.google.cloud.genomics.dockerflow.util.HttpUtils; 24 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 25 | import java.io.IOException; 26 | import java.util.HashSet; 27 | import java.util.LinkedHashMap; 28 | import java.util.Map; 29 | import org.slf4j.Logger; 30 | import org.slf4j.LoggerFactory; 31 | 32 | /** Keep only the files named explicitly as outputs of the workflow. */ 33 | @SuppressWarnings("serial") 34 | public class DeleteIntermediateFiles 35 | extends DoFn, KV> { 36 | private static final Logger LOG = LoggerFactory.getLogger(DeleteIntermediateFiles.class); 37 | private Task task; 38 | 39 | public DeleteIntermediateFiles(Task t) { 40 | this.task = t; 41 | } 42 | 43 | @Override 44 | public void processElement( 45 | DoFn, KV>.ProcessContext c) { 46 | LOG.info("Deleting intermediate files"); 47 | 48 | String key = c.element().getKey(); 49 | WorkflowArgs wa = new WorkflowArgs(c.element().getValue()); 50 | LOG.info(StringUtils.toJson(wa)); 51 | 52 | Task t = new Task(task); 53 | t.substitute(wa.getInputs()); 54 | t.substitute(wa.getOutputs()); 55 | 56 | LOG.info("Finding files to keep"); 57 | Map toRetain = new LinkedHashMap(); 58 | if (t.getArgs() != null && t.getArgs().getOutputs() != null) { 59 | toRetain.putAll(t.getArgs().getOutputs()); 60 | } 61 | LOG.info("Files to keep:\n" + StringUtils.toJson(toRetain)); 62 | 63 | LOG.info("Finding intermediate files to delete"); 64 | Map toDelete = new LinkedHashMap(); 65 | if (wa.getOutputs() != null) { 66 | toDelete.putAll(wa.getOutputs()); 67 | } 68 | for (String name : toRetain.keySet()) { 69 | toDelete.remove(name); 70 | } 71 | 72 | for (String pathToKeep : toRetain.values()) { 73 | for (String name : new HashSet(toDelete.keySet())) { 74 | 75 | // Check if the same path has different var names 76 | if (pathToKeep.equals(toDelete.get(name))) { 77 | toDelete.remove(name); 78 | } 79 | } 80 | } 81 | 82 | LOG.info("Files to delete:\n" + StringUtils.toJson(toDelete)); 83 | 84 | LOG.info("Deleting intermediate files"); 85 | for (String name : toDelete.keySet()) { 86 | 87 | LOG.debug("Deleting: " + name + ": " + wa.getOutputs().get(name)); 88 | 89 | // There may be multiple delimited paths 90 | String val = wa.getOutputs().get(name); 91 | String[] paths; 92 | if (val != null && val.trim().length() != 0) { 93 | paths = wa.getOutputs().get(name).split(Param.ARRAY_DELIMITER_REGEX); 94 | } else { 95 | paths = new String[] {val}; 96 | } 97 | 98 | // Delete each 99 | for (String path : paths) { 100 | try { 101 | LOG.debug("Deleting file: " + path); 102 | String result = HttpUtils.doDelete(path); 103 | LOG.debug(result); 104 | 105 | // Remove from args available to downstream steps 106 | wa.getOutputs().remove(name); 107 | 108 | } catch (IOException e) { 109 | LOG.debug("Failed to delete file for output: " + name + ". 
Reason: " + e.getMessage()); 110 | } 111 | } 112 | } 113 | 114 | LOG.info("Finished retaining outputs"); 115 | c.output(KV.of(key, wa)); 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/args/WorkflowArgs.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.args; 17 | 18 | import com.google.cloud.dataflow.sdk.coders.DefaultCoder; 19 | import com.google.cloud.dataflow.sdk.coders.SerializableCoder; 20 | import com.google.cloud.genomics.dockerflow.DockerflowConstants; 21 | import com.google.cloud.genomics.dockerflow.runner.Operation; 22 | 23 | /** 24 | * Arguments for running an entire workflow. Individual parameters will flow down to the subtasks. 25 | */ 26 | @DefaultCoder(SerializableCoder.class) 27 | @SuppressWarnings("serial") 28 | public class WorkflowArgs extends TaskArgs { 29 | private Operation currentOperation; 30 | 31 | // To resolve any relative paths. 32 | private String workspace; 33 | 34 | // When there are multiple concurrent instances of the workflow running 35 | private int runIndex; 36 | 37 | // Add retries, such as for preemption 38 | private int maxTries = DockerflowConstants.DEFAULT_MAX_TRIES; 39 | 40 | private Boolean isTesting; 41 | 42 | // Abort on error; otherwise carry on with the workflow provided 43 | // subsequent steps are fault-tolerant 44 | private boolean abortOnError = true; 45 | 46 | // Delete intermediate files 47 | private boolean deleteIntermediateFiles; 48 | 49 | // If output files exist, attempt to resume from the last completed step. 50 | private boolean resumeFailedRun; 51 | 52 | public WorkflowArgs() { 53 | super(); 54 | } 55 | 56 | /** Copy constructor. 
*/ 57 | public WorkflowArgs(TaskArgs ta) { 58 | super(ta); 59 | 60 | if (ta instanceof WorkflowArgs) { 61 | WorkflowArgs wa = (WorkflowArgs) ta; 62 | currentOperation = wa.currentOperation; 63 | workspace = wa.workspace; 64 | runIndex = wa.runIndex; 65 | maxTries = wa.maxTries; 66 | isTesting = wa.isTesting; 67 | abortOnError = wa.abortOnError; 68 | deleteIntermediateFiles = wa.deleteIntermediateFiles; 69 | resumeFailedRun = wa.resumeFailedRun; 70 | } 71 | } 72 | 73 | @Override 74 | public void applyArgs(TaskArgs args) { 75 | super.applyArgs(args); 76 | 77 | if (args instanceof WorkflowArgs) { 78 | WorkflowArgs wa = (WorkflowArgs) args; 79 | workspace = wa.workspace; 80 | isTesting = wa.isTesting; 81 | abortOnError = wa.abortOnError; 82 | resumeFailedRun = wa.resumeFailedRun; 83 | deleteIntermediateFiles = wa.deleteIntermediateFiles; 84 | } 85 | } 86 | 87 | @Override 88 | public void mergeDefaultArgs(TaskArgs defaultArgs) { 89 | super.mergeDefaultArgs(defaultArgs); 90 | 91 | if (defaultArgs instanceof WorkflowArgs) { 92 | WorkflowArgs wa = (WorkflowArgs) defaultArgs; 93 | 94 | if (workspace == null) { 95 | workspace = wa.getWorkspace(); 96 | } 97 | if (isTesting == null) { 98 | isTesting = wa.isTesting(); 99 | } 100 | } 101 | } 102 | 103 | /** 104 | * The name of the currently running operation. The value is set internally when a Docker task 105 | * starts and is nulled out when the task completes. 106 | */ 107 | public Operation getCurrentOperation() { 108 | return currentOperation; 109 | } 110 | 111 | public void setCurrentOperation(Operation operation) { 112 | this.currentOperation = operation; 113 | } 114 | 115 | public String getWorkspace() { 116 | return workspace; 117 | } 118 | 119 | public void setWorkspace(String workspace) { 120 | this.workspace = workspace; 121 | } 122 | 123 | public int getRunIndex() { 124 | return runIndex; 125 | } 126 | 127 | public void setRunIndex(int index) { 128 | this.runIndex = index; 129 | } 130 | 131 | public int getMaxTries() { 132 | return maxTries; 133 | } 134 | 135 | public void setMaxTries(int tries) { 136 | if (tries < 1) { 137 | throw new IllegalArgumentException("Max tries must be at least one"); 138 | } 139 | this.maxTries = tries; 140 | } 141 | 142 | public Boolean isTesting() { 143 | return isTesting; 144 | } 145 | 146 | public void setTesting(Boolean isTesting) { 147 | this.isTesting = isTesting; 148 | } 149 | 150 | public boolean getAbortOnError() { 151 | return abortOnError; 152 | } 153 | 154 | public void setAbortOnError(boolean abortOnError) { 155 | this.abortOnError = abortOnError; 156 | } 157 | 158 | public boolean getDeleteFiles() { 159 | return deleteIntermediateFiles; 160 | } 161 | 162 | public void setDeleteFiles(boolean deleteFiles) { 163 | this.deleteIntermediateFiles = deleteFiles; 164 | } 165 | 166 | public boolean getResumeFailedRun() { 167 | return resumeFailedRun; 168 | } 169 | 170 | public void setResumeFailedRun(boolean resume) { 171 | this.resumeFailedRun = resume; 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/util/HttpUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.util; 17 | 18 | import com.google.api.client.googleapis.auth.oauth2.GoogleCredential; 19 | import com.google.gson.GsonBuilder; 20 | import java.io.DataOutputStream; 21 | import java.io.IOException; 22 | import java.net.HttpURLConnection; 23 | import java.net.URL; 24 | import org.slf4j.Logger; 25 | import org.slf4j.LoggerFactory; 26 | 27 | /** 28 | * Utilities for REST calls. 29 | */ 30 | public class HttpUtils { 31 | static final Logger LOG = LoggerFactory.getLogger(HttpUtils.class); 32 | 33 | /** 34 | * Do an HTTPS GET by appending the application default token to the URL. 35 | * 36 | * @param url 37 | * @return response body as string 38 | * @throws IOException 39 | */ 40 | public static String doGet(String url) throws IOException { 41 | LOG.debug("Url for GET: " + url); 42 | String authUrl = 43 | url + "?access_token=" + GoogleCredential.getApplicationDefault().getAccessToken(); 44 | 45 | HttpURLConnection con = (HttpURLConnection) new URL(authUrl).openConnection(); 46 | con.setDoInput(true); 47 | con.setRequestMethod("GET"); 48 | 49 | int code = con.getResponseCode(); 50 | LOG.debug("Response code: " + code); 51 | 52 | if (code != HttpURLConnection.HTTP_OK) { 53 | String msg = FileUtils.readAll(con.getErrorStream()); 54 | throw new IOException("HTTP error: " + code + " for url " + url + "" + msg); 55 | } 56 | return FileUtils.readAll(con.getInputStream()); 57 | } 58 | 59 | /** 60 | * Do an HTTPS POST with a JSON object as the request body. Append the application default token 61 | * to the URL for authentication. 62 | * 63 | * @param url 64 | * @param param object to encode as JSON in the request body 65 | * @return response body as string 66 | * @throws IOException 67 | */ 68 | public static String doPost(String url, Object param) throws IOException { 69 | LOG.debug("Url for POST: " + url); 70 | String authUrl = 71 | url + "?access_token=" + GoogleCredential.getApplicationDefault().getAccessToken(); 72 | 73 | HttpURLConnection con = (HttpURLConnection) new URL(authUrl).openConnection(); 74 | con.setDoOutput(true); 75 | con.setDoInput(true); 76 | con.setRequestMethod("POST"); 77 | con.setRequestProperty("Content-Type", "application/json"); 78 | 79 | DataOutputStream out = new DataOutputStream(con.getOutputStream()); 80 | String params = new GsonBuilder().create().toJson(param); 81 | out.writeBytes(params); 82 | out.close(); 83 | LOG.debug(params); 84 | 85 | int code = con.getResponseCode(); 86 | LOG.debug("Response code: " + code); 87 | 88 | if (code != HttpURLConnection.HTTP_OK) { 89 | String msg = FileUtils.readAll(con.getErrorStream()); 90 | throw new IOException("HTTP error: " + code + " for url " + url + "" + msg); 91 | } 92 | return FileUtils.readAll(con.getInputStream()); 93 | } 94 | 95 | /** 96 | * Do an HTTPS DELETE by appending the application default token to the URL. 
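* A gs://bucket/object path is translated to the equivalent
* https://storage.googleapis.com/ URL before the request is sent.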
97 | * 98 | * @param url 99 | * @return response body as string 100 | * @throws IOException 101 | */ 102 | public static String doDelete(String gcsPath) throws IOException { 103 | if (gcsPath == null || !gcsPath.startsWith("gs://")) { 104 | throw new IOException("GCS path must be non-null and start with gs://. Value: " + gcsPath); 105 | } 106 | String url = "https://storage.googleapis.com/" + gcsPath.substring("gs://".length()); 107 | LOG.debug("Url for DELETE: " + url); 108 | 109 | String authUrl = 110 | url + "?access_token=" + GoogleCredential.getApplicationDefault().getAccessToken(); 111 | 112 | HttpURLConnection con = (HttpURLConnection) new URL(authUrl).openConnection(); 113 | con.setDoInput(true); 114 | con.setRequestMethod("DELETE"); 115 | 116 | int code = con.getResponseCode(); 117 | LOG.debug("Response code: " + code); 118 | 119 | if (code != HttpURLConnection.HTTP_OK && code != HttpURLConnection.HTTP_NO_CONTENT) { 120 | String msg = FileUtils.readAll(con.getErrorStream()); 121 | throw new IOException("HTTP error: " + code + " for url " + url + "" + msg); 122 | } 123 | return FileUtils.readAll(con.getInputStream()); 124 | } 125 | 126 | /** 127 | * Do an HTTPS HEAD by appending the application default token to the URL. 128 | * 129 | * @param url 130 | * @return response body as string 131 | * @throws IOException 132 | */ 133 | public static String doHead(String url) throws IOException { 134 | LOG.debug("Url for HEAD: " + url); 135 | String authUrl = 136 | url + "?access_token=" + GoogleCredential.getApplicationDefault().getAccessToken(); 137 | 138 | HttpURLConnection con = (HttpURLConnection) new URL(authUrl).openConnection(); 139 | con.setDoInput(true); 140 | con.setRequestMethod("HEAD"); 141 | 142 | int code = con.getResponseCode(); 143 | LOG.debug("Response code: " + code); 144 | 145 | if (code != HttpURLConnection.HTTP_OK) { 146 | throw new IOException("HTTP error: " + code + " for url "); 147 | } 148 | return FileUtils.readAll(con.getInputStream()); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/dataflow/DataflowFactoryTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.google.cloud.genomics.dockerflow.dataflow; 17 | 18 | import static org.junit.Assert.assertTrue; 19 | 20 | import com.google.cloud.dataflow.sdk.Pipeline; 21 | import com.google.cloud.genomics.dockerflow.DockerflowConstants; 22 | import com.google.cloud.genomics.dockerflow.TestUtils; 23 | import com.google.cloud.genomics.dockerflow.runner.TaskRunner.TaskRequest; 24 | import com.google.cloud.genomics.dockerflow.task.TaskDefn; 25 | import com.google.cloud.genomics.dockerflow.util.FileUtils; 26 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 27 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 28 | import com.google.cloud.genomics.dockerflow.workflow.WorkflowFactory; 29 | 30 | import java.io.IOException; 31 | import java.util.List; 32 | import org.junit.Test; 33 | import org.slf4j.Logger; 34 | import org.slf4j.LoggerFactory; 35 | 36 | /** 37 | * Unit tests for parsing workflows and 38 | */ 39 | public class DataflowFactoryTest implements DockerflowConstants { 40 | private static final Logger LOG = LoggerFactory.getLogger(DataflowFactoryTest.class); 41 | private static TestUtils utils = new TestUtils(); 42 | 43 | @Test 44 | public void testLoadTask() throws IOException { 45 | WorkflowFactory.loadDefn(utils.baseDir + "/task-one.yaml"); 46 | } 47 | 48 | @Test 49 | public void testParseTask() throws IOException { 50 | TaskDefn d = FileUtils.parseFile(utils.baseDir + "/task-one.yaml", TaskDefn.class); 51 | LOG.info("Round trip as: " + StringUtils.toJson(d)); 52 | } 53 | 54 | @Test 55 | public void testParseWorkflowWithParams() throws IOException { 56 | Workflow w = FileUtils.parseFile(utils.baseDir + "/linear-graph.yaml", Workflow.class); 57 | LOG.info("Round trip as: " + StringUtils.toJson(w)); 58 | } 59 | 60 | @Test 61 | public void testParseWorkflow() throws IOException { 62 | Workflow w = FileUtils.parseFile(utils.baseDir + "/workflow.yaml", Workflow.class); 63 | LOG.info("Round trip as: " + StringUtils.toJson(w)); 64 | } 65 | 66 | @Test 67 | public void testLoadWorkflow() throws Exception { 68 | Workflow w = WorkflowFactory.load(utils.baseDir + "/linear-graph.yaml"); 69 | LOG.info("Loaded workflow " + w.getDefn().getName()); 70 | } 71 | 72 | @Test 73 | public void testCreateLinearDataflow() throws Exception { 74 | Workflow w = WorkflowFactory.load(utils.baseDir + "/linear-graph.yaml"); 75 | Pipeline p = DataflowFactory.dataflow(w, null, DataflowFactory.pipelineOptions( 76 | new String[] { 77 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 78 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 79 | "--" + LOGGING + "=" + utils.baseDir + "/dlinear", 80 | "--" + RUNNER + "=" + utils.runner 81 | } 82 | )); 83 | LOG.info("Created dataflow pipeline: " + p); 84 | } 85 | 86 | @Test 87 | public void testCreateBranchingDataflow() throws Exception { 88 | Workflow w = WorkflowFactory.load(utils.baseDir + "/branching-graph.yaml"); 89 | Pipeline p = DataflowFactory.dataflow(w, null, DataflowFactory.pipelineOptions( 90 | new String[] { 91 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 92 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 93 | "--" + LOGGING + "=" + utils.baseDir + "/dlinear", 94 | "--" + RUNNER + "=" + utils.runner 95 | } 96 | )); 97 | LOG.info("Created dataflow pipeline: " + p); 98 | } 99 | 100 | @Test 101 | public void testRequestToJson() throws Exception { 102 | Workflow w = WorkflowFactory.load(utils.baseDir + "/linear-graph.yaml"); 103 | TaskRequest r = new TaskRequest(); 104 | r.setEphemeralPipeline(w.getDefn()); 105 | 
r.setPipelineArgs(w.getArgs()); 106 | 107 | String s = StringUtils.toJson(r); 108 | LOG.info("Serialized: " + s); 109 | 110 | r = StringUtils.fromJson(s, TaskRequest.class); 111 | LOG.info("Deserialized"); 112 | } 113 | 114 | @Test 115 | public void testWorkflowToJson() throws Exception { 116 | Workflow w = WorkflowFactory.load(utils.baseDir + "/linear-graph.yaml"); 117 | 118 | String s = StringUtils.toJson(w); 119 | LOG.info("Serialized: " + s); 120 | 121 | w = StringUtils.fromJson(s, Workflow.class); 122 | LOG.info("Deserialized"); 123 | } 124 | 125 | @Test 126 | public void testZones() throws IOException { 127 | List s = WorkflowFactory.expandZones(new String[] {"us-*"}); 128 | LOG.info(s.toString()); 129 | assertTrue(s.size() > 1); 130 | } 131 | 132 | @Test 133 | public void testJavascript() throws Exception { 134 | String s = StringUtils.evalJavaScript("${= 2 + 3 * 4}"); 135 | LOG.info(s); 136 | assertTrue(s.equals("14")); 137 | } 138 | 139 | @Test 140 | public void testResolveLoggingPaths() { 141 | String s; 142 | 143 | s = FileUtils.logPath("gs://foo/bar", "operations/123"); 144 | LOG.info(s); 145 | assertTrue(s.equals("gs://foo/bar/123.log")); 146 | 147 | s = FileUtils.stdoutPath("gs://foo/bar", "operations/123"); 148 | LOG.info(s); 149 | assertTrue(s.equals("gs://foo/bar/123-stdout.log")); 150 | 151 | s = FileUtils.stderrPath("gs://foo/bar", "operations/123"); 152 | LOG.info(s); 153 | assertTrue(s.equals("gs://foo/bar/123-stderr.log")); 154 | 155 | s = FileUtils.logPath("gs://foo/bar/log.txt", "operations/123"); 156 | LOG.info(s); 157 | assertTrue(s.equals("gs://foo/bar/log.txt")); 158 | 159 | s = FileUtils.stdoutPath("gs://foo/bar/log.txt", "operations/123"); 160 | LOG.info(s); 161 | assertTrue(s.equals("gs://foo/bar/log-stdout.txt")); 162 | 163 | s = FileUtils.stderrPath("gs://foo/bar/log.txt", "operations/123"); 164 | LOG.info(s); 165 | assertTrue(s.equals("gs://foo/bar/log-stderr.txt")); 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/util/StringUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.util; 17 | 18 | import com.fasterxml.jackson.dataformat.yaml.snakeyaml.Yaml; 19 | import com.google.api.client.googleapis.util.Utils; 20 | import com.google.cloud.genomics.dockerflow.DockerflowConstants; 21 | import com.google.gson.GsonBuilder; 22 | import java.io.IOException; 23 | import java.util.HashMap; 24 | import java.util.Map; 25 | import javax.script.ScriptEngineManager; 26 | import javax.script.ScriptException; 27 | 28 | /** 29 | * Utilities for parsing and formatting strings. 30 | */ 31 | public class StringUtils { 32 | 33 | /** 34 | * Parse command-line options of the form --key=value into a map of . 
35 | * For --key without a value, set the value to true. 36 | * 37 | * @param args 38 | * @return 39 | */ 40 | public static Map parseArgs(String[] args) { 41 | Map m = new HashMap(); 42 | if (args != null) { 43 | for (String s : args) { 44 | if (s.indexOf("=") < 0) { 45 | m.put(s.replace("--", ""), Boolean.TRUE.toString()); 46 | } else { 47 | String key = s.substring(0, s.indexOf("=")).replace("--", ""); 48 | String val = s.substring(s.indexOf("=") + 1); 49 | if (m.containsKey(key)) { 50 | val = m.get(key) + "," + val; 51 | } 52 | m.put(key, val); 53 | } 54 | } 55 | } 56 | return m; 57 | } 58 | 59 | /** 60 | * Substitute all global variables of the form $(KEY) in a string. 61 | * 62 | * @param globals 63 | * @param value 64 | * @return 65 | */ 66 | public static String replaceAll(Map globals, String value) { 67 | String retval = value; 68 | 69 | if (value != null && globals != null) { 70 | for (String key : globals.keySet()) { 71 | String var = "${" + key + "}"; 72 | if (value.contains(var) 73 | && globals.get(key) != null) { 74 | retval = retval.replace(var, globals.get(key)); 75 | } 76 | } 77 | } 78 | return retval; 79 | } 80 | 81 | /** Serialize to json. */ 82 | public static String toJson(Object o) { 83 | FileUtils.LOG.debug("Serializing to json: " + (o == null ? null : o.getClass())); 84 | // For non-auto-generated Google Java classes, Gson is required; 85 | // otherwise the serialized string is empty. 86 | return new GsonBuilder().setPrettyPrinting().create().toJson(o); 87 | } 88 | 89 | /** Deserialize from json. */ 90 | public static T fromJson(String s, Class c) throws IOException { 91 | FileUtils.LOG.debug("Deserializing from json to " + c); 92 | T retval; 93 | 94 | // For some reason, this only works for auto-generated Google API 95 | // classes 96 | if (c.toString().startsWith("com.google.api.services.")) { 97 | FileUtils.LOG.debug("Using Google APIs JsonParser"); 98 | retval = Utils.getDefaultJsonFactory().createJsonParser(s).parse(c); 99 | } else { 100 | FileUtils.LOG.debug("Using Gson"); 101 | retval = new GsonBuilder().setLenient().create().fromJson(s, c); 102 | } 103 | return retval; 104 | } 105 | 106 | /** 107 | * Parse parameters of the form key=val,key2=val2. 108 | * 109 | * @param params 110 | * @param fromFile if true, read the file path contents and set as the parameter value 111 | * @return 112 | * @throws IOException 113 | */ 114 | public static Map parseParameters(String params, boolean fromFile) 115 | throws IOException { 116 | Map map = new HashMap(); 117 | 118 | String[] tokens = params.split(","); 119 | 120 | for (String token : tokens) { 121 | if (token.contains("=")) { 122 | String key = token.substring(0, token.lastIndexOf("=")); 123 | String value = token.substring(token.lastIndexOf("=") + 1); 124 | 125 | // Load local files now; variables and GCS files will be loaded lazily 126 | if (fromFile 127 | && value.indexOf("${") < 0 128 | && !value.startsWith("gs://") 129 | && !DockerflowConstants.REQUIRED.equals(value)) { 130 | value = FileUtils.readAll(value); 131 | } 132 | map.put(key, value); 133 | } else { 134 | map.put(token, Boolean.TRUE.toString()); 135 | } 136 | } 137 | return map; 138 | } 139 | 140 | /** 141 | * Evaluate a javascript expression, like "${= 2*3}". 142 | * 143 | * @param js 144 | * @return the results as a string. 
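* @throws ScriptException if the embedded expression cannot be evaluated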
145 | */ 146 | public static String evalJavaScript(String expression) throws ScriptException { 147 | FileUtils.LOG.debug("javascript: " + expression); 148 | 149 | // Remove new lines from arrays, etc 150 | String s = expression.trim().replace("\n", " "); 151 | StringBuilder sb = new StringBuilder(); 152 | 153 | int start = s.indexOf("${=") + 3; 154 | 155 | // Keep text before the js 156 | if (start > 3) { 157 | sb.append(s.substring(0, start - 3)); 158 | } 159 | int end = s.lastIndexOf("}"); 160 | FileUtils.LOG.debug("start=" + start + ", end=" + end); 161 | 162 | String js = s.substring(start, end); 163 | FileUtils.LOG.info("Evaluate js: " + js); 164 | 165 | sb.append( 166 | String.valueOf(new ScriptEngineManager().getEngineByName("JavaScript").eval(js)).trim()); 167 | 168 | // Keep text after the js 169 | if (end < s.length() - 1) { 170 | sb.append(s.substring(end + 1)); 171 | } 172 | 173 | String retval = sb.toString(); 174 | 175 | // If there's more js, evaluate it too 176 | if (StringUtils.isJavaScript(retval)) { 177 | retval = evalJavaScript(retval); 178 | } 179 | 180 | return retval; 181 | } 182 | 183 | /** 184 | * The value looks like "${= javascript_expression }". It must start with a dollar sign and end 185 | * with a curly brace -- ie, JavaScript cannot be embedded within a longer string. 186 | * 187 | * @param js 188 | * @return 189 | */ 190 | public static boolean isJavaScript(String js) { 191 | return js != null 192 | && js.contains("${=") 193 | && js.contains("}") 194 | && js.indexOf("${=") < js.indexOf("}"); 195 | } 196 | 197 | public static String toYaml(Object o) throws IOException { 198 | // Round trip to json to suppress empty collections and null values 199 | String json = toJson(o); 200 | Object generic = fromJson(json, Object.class); 201 | return new Yaml().dump(generic); 202 | } 203 | } 204 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/examples/ExampleGraphsTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.google.cloud.genomics.dockerflow.examples; 17 | 18 | import com.google.cloud.genomics.dockerflow.DockerflowConstants; 19 | import com.google.cloud.genomics.dockerflow.TestUtils; 20 | import com.google.cloud.genomics.dockerflow.args.ArgsBuilder; 21 | import com.google.cloud.genomics.dockerflow.args.TaskArgs; 22 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 23 | import com.google.cloud.genomics.dockerflow.runner.TaskRunner; 24 | import com.google.cloud.genomics.dockerflow.task.Task; 25 | import com.google.cloud.genomics.dockerflow.task.TaskBuilder; 26 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 27 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 28 | import com.google.cloud.genomics.dockerflow.workflow.WorkflowFactory; 29 | 30 | import static org.junit.Assert.assertEquals; 31 | 32 | import java.util.HashMap; 33 | import org.junit.BeforeClass; 34 | import org.junit.Test; 35 | import org.slf4j.Logger; 36 | import org.slf4j.LoggerFactory; 37 | 38 | /** 39 | * Unit tests. 40 | * 41 | *
<p>You need to set environment variables: TEST_PROJECT and TEST_GCS_PATH. Then get application
42 | * default credentials:
43 | *
44 | * <pre>gcloud beta auth application-default login</pre>
45 | */ 46 | public class ExampleGraphsTest implements DockerflowConstants { 47 | private static Logger LOG = LoggerFactory.getLogger(ExampleGraphsITCase.class); 48 | 49 | protected TestUtils utils = new TestUtils(); 50 | 51 | @BeforeClass 52 | public static void setUpBeforeClass() throws Exception {} 53 | 54 | @Test 55 | public void testTask() throws Exception { 56 | Task t = 57 | TaskBuilder.named("TaskOne") 58 | .project(TestUtils.TEST_PROJECT) 59 | .logging(utils.baseDir + "/dtask/test.log") 60 | .inputFile("inputFile", utils.baseDir + "/input-one.txt", "in.txt") 61 | .input("message", "hello") 62 | .outputFile("outputFile", utils.baseDir + "/dtask/output.txt", "out.txt") 63 | .docker("ubuntu") 64 | .script("cp ${inputFile} ${outputFile} ; echo ${message} >> ${outputFile}") 65 | .zones(new String[] {"us-*"}) 66 | .build(); 67 | ((WorkflowArgs) t.getArgs()).setTesting(DockerflowConstants.DIRECT_RUNNER.equals(utils.runner)); 68 | TaskRunner.runTask(t); 69 | } 70 | 71 | @Test 72 | public void testLinearGraph() throws Exception { 73 | utils.baseDir = TestUtils.TEST_GCS_PATH; 74 | LinearGraph.main( 75 | new String[] { 76 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 77 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 78 | "--" + LOGGING + "=" + utils.baseDir + "/dlinear", 79 | "--" + RUNNER + "=" + utils.runner, 80 | "--inputFile=" + utils.baseDir + "/input-one.txt", 81 | "--outputFile=" + utils.baseDir + "/dlinear/output.txt", 82 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner) 83 | }); 84 | if (utils.checkOutput) { 85 | String output = TestUtils.readAll(utils.baseDir + "/dlinear/output.txt"); 86 | LOG.info("\"" + output + "\", length=" + output.length()); 87 | 88 | assertEquals("Output doesn't match expected", TestUtils.OUTPUT_ONE_TWO, output); 89 | } 90 | } 91 | 92 | @Test 93 | public void testMultiLinearGraph() throws Exception { 94 | MultiLinearGraph.main( 95 | new String[] { 96 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 97 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 98 | "--" + LOGGING + "=" + utils.baseDir, 99 | "--" + RUNNER + "=" + utils.runner, 100 | "--" + WORKSPACE + "=" + utils.baseDir, 101 | "--" + ARGS_FILE + "=" + utils.baseDir + "/workflowArgs.csv", 102 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner) 103 | }); 104 | if (utils.checkOutput) { 105 | String output1 = TestUtils.readAll(utils.baseDir + "/1/TaskOne/output-one.txt"); 106 | LOG.info("\"" + output1 + "\", length=" + output1.length()); 107 | 108 | assertEquals("Output doesn't match expected", TestUtils.OUTPUT_ONE, output1); 109 | 110 | String output2 = TestUtils.readAll(utils.baseDir + "/2/TaskTwo/output-two.txt"); 111 | LOG.info("\"" + output2 + "\", length=" + output2.length()); 112 | 113 | assertEquals("Output doesn't match expected", TestUtils.OUTPUT_TWO_ONE, output2); 114 | } 115 | } 116 | 117 | @Test 118 | public void testComplexGraph() throws Exception { 119 | ComplexGraph.main( 120 | new String[] { 121 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 122 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 123 | "--" + LOGGING + "=" + utils.baseDir + "/dcomplex", 124 | "--" + RUNNER + "=" + utils.runner, 125 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner) 126 | }); 127 | if (utils.checkOutput) { 128 | String output = TestUtils.readAll(utils.baseDir + "/dcomplex/test.log"); 129 | LOG.info("\"" + output); 130 | } 131 | } 132 | 133 | @Test 134 | public void testCwlFeatures() throws Exception { 135 | 
Workflow w = WorkflowFactory.load(utils.baseDir + "/cwl-graph.yaml"); 136 | LOG.info("Loaded workflow " + w.getDefn().getName()); 137 | 138 | Task t = w.getSteps().get(0); 139 | t.applyArgs(new TaskArgs(t.getArgs())); 140 | LOG.info(StringUtils.toJson(t)); 141 | 142 | assertEquals( 143 | t.getDefn().getDocker().getCmd(), 144 | "echo -a one -a two -b -k five -k six -k seven --file\u003dgs://b/d/test.txt"); 145 | } 146 | 147 | @SuppressWarnings("serial") 148 | @Test 149 | public void testJs() throws Exception { 150 | String path = 151 | "gs://genomics-public-data/test-data/dna/wgs/hiseqx/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam"; 152 | String js = "${= '${path}'.replace(/.*\\//, '').replace(/.bam/, ''); }"; 153 | 154 | String sub = 155 | StringUtils.replaceAll( 156 | new HashMap() { 157 | { 158 | put("path", path); 159 | } 160 | }, 161 | js); 162 | LOG.info(sub); 163 | 164 | String eval = StringUtils.evalJavaScript(sub); 165 | LOG.info(eval); 166 | } 167 | 168 | @Test 169 | public void testScatterByArray() throws Exception { 170 | Task t = 171 | TaskBuilder.named("ArrayTest") 172 | .inputArray("sequence_group_interval", " -L ") 173 | .scatterBy("sequence_group_interval") 174 | .docker("ubuntu") 175 | .script("echo hello") 176 | .build(); 177 | t.getArgs().setFromFile("sequence_group_interval", true); 178 | LOG.info(StringUtils.toJson(t)); 179 | 180 | WorkflowArgs wa = 181 | ArgsBuilder.of() 182 | .inputFromFile("ArrayTest.sequence_group_interval", utils.baseDir + "/seq-group.tsv") 183 | .workspace(utils.baseDir) 184 | .build(); 185 | LOG.info(StringUtils.toJson(wa)); 186 | 187 | t.applyArgs(wa); 188 | 189 | Object r = TaskRunner.getRequest(t); 190 | 191 | LOG.info(StringUtils.toJson(r)); 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/args/ArgsBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.args; 17 | 18 | import com.google.cloud.genomics.dockerflow.args.TaskArgs.Logging; 19 | import com.google.cloud.genomics.dockerflow.args.TaskArgs.ServiceAccount; 20 | import com.google.cloud.genomics.dockerflow.task.Task; 21 | import com.google.cloud.genomics.dockerflow.task.TaskDefn.Disk; 22 | import com.google.cloud.genomics.dockerflow.task.TaskDefn.Resources; 23 | import com.google.cloud.genomics.dockerflow.util.FileUtils; 24 | import com.google.cloud.genomics.dockerflow.workflow.WorkflowFactory; 25 | 26 | import java.io.IOException; 27 | import java.util.ArrayList; 28 | import java.util.LinkedHashMap; 29 | import java.util.List; 30 | import java.util.Map; 31 | 32 | /** Builder for workflow arguments. 
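*
* <p>A typical usage sketch (all values are illustrative):
*
* <pre>
* WorkflowArgs args = ArgsBuilder.of("my-run-id")
*     .project("MY-PROJECT")
*     .input("TaskOne.inputFile", "gs://MY-BUCKET/input.txt")
*     .output("TaskOne.outputFile", "gs://MY-BUCKET/output.txt")
*     .logging("gs://MY-BUCKET/logs")
*     .workspace("gs://MY-BUCKET/workspace")
*     .build();
* </pre>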
*/ 33 | public class ArgsBuilder { 34 | private WorkflowArgs workflowArgs; 35 | 36 | public static ArgsBuilder of() { 37 | return new ArgsBuilder(null); 38 | } 39 | 40 | /** 41 | * Constructor. 42 | * 43 | * @param clientId a name for the overall workflow instance 44 | * @return 45 | */ 46 | public static ArgsBuilder of(String clientId) { 47 | return new ArgsBuilder(clientId); 48 | } 49 | 50 | /** Load the workflow args from yaml / json from a local or GCS file. */ 51 | public static ArgsBuilder fromFile(String path) throws IOException { 52 | ArgsBuilder b = new ArgsBuilder(null); 53 | b.workflowArgs = FileUtils.parseFile(path, WorkflowArgs.class); 54 | return b; 55 | } 56 | 57 | ArgsBuilder(String clientId) { 58 | workflowArgs = new WorkflowArgs(); 59 | if (clientId != null) { 60 | workflowArgs.setClientId(clientId); // keep any default 61 | } 62 | } 63 | 64 | public static ArgsBuilder fromArgs(String[] args) throws IOException { 65 | return fromArgs(args, null); 66 | } 67 | 68 | public static ArgsBuilder fromArgs(String[] args, String name) throws IOException { 69 | ArgsBuilder b = ArgsBuilder.of(name); 70 | b.workflowArgs = WorkflowFactory.createArgs(args); 71 | return b; 72 | } 73 | 74 | public ArgsBuilder inputs(Map inputs) { 75 | if (workflowArgs.getInputs() == null) { 76 | workflowArgs.setInputs(new LinkedHashMap()); 77 | } 78 | workflowArgs.getInputs().putAll(inputs); 79 | return this; 80 | } 81 | 82 | public ArgsBuilder input(String name, String value) { 83 | workflowArgs.set(name, value); 84 | return this; 85 | } 86 | 87 | public ArgsBuilder inputFromFile(String name, String value) { 88 | workflowArgs.set(name, value); 89 | workflowArgs.setFromFile(name, true); 90 | return this; 91 | } 92 | 93 | public ArgsBuilder outputs(Map outputs) { 94 | if (workflowArgs.getOutputs() == null) { 95 | workflowArgs.setOutputs(new LinkedHashMap()); 96 | } 97 | workflowArgs.getInputs().putAll(outputs); 98 | return this; 99 | } 100 | 101 | public ArgsBuilder output(String name, String value) { 102 | if (workflowArgs.getOutputs() == null) { 103 | workflowArgs.setOutputs(new LinkedHashMap()); 104 | } 105 | workflowArgs.getOutputs().put(name, value); 106 | return this; 107 | } 108 | 109 | public ArgsBuilder cpu(int cores) { 110 | workflowArgs.getResources().setMinimumCpuCores(String.valueOf(cores)); 111 | return this; 112 | } 113 | 114 | public ArgsBuilder memory(String gb) { 115 | if (workflowArgs.getResources() == null) { 116 | workflowArgs.setResources(new Resources()); 117 | } 118 | workflowArgs.getResources().setMinimumRamGb(gb); 119 | return this; 120 | } 121 | 122 | public ArgsBuilder diskSize(String gb) { 123 | if (workflowArgs.getResources() == null) { 124 | workflowArgs.setResources(new Resources()); 125 | } 126 | if (workflowArgs.getResources().getDisks() == null) { 127 | workflowArgs.getResources().setDisks(new ArrayList()); 128 | workflowArgs.getResources().getDisks().add(new Disk()); 129 | } 130 | workflowArgs.getResources().getDisks().get(0).setSizeGb(gb); 131 | return this; 132 | } 133 | 134 | public ArgsBuilder preemptible(boolean b) { 135 | if (workflowArgs.getResources() == null) { 136 | workflowArgs.setResources(new Resources()); 137 | } 138 | workflowArgs.getResources().setPreemptible(b); 139 | return this; 140 | } 141 | 142 | public ArgsBuilder zones(String[] zones) { 143 | if (workflowArgs.getResources() == null) { 144 | workflowArgs.setResources(new Resources()); 145 | } 146 | workflowArgs.getResources().setZones(WorkflowFactory.expandZones(zones)); 147 | return this; 148 | } 
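// (zones accepts wildcards such as new String[] {"us-*"}; they are
// expanded to the matching zone names by WorkflowFactory.expandZones.)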
149 | 150 | public ArgsBuilder project(String id) { 151 | workflowArgs.setProjectId(id); 152 | return this; 153 | } 154 | 155 | /** 156 | * The base logging path. All task logs will be stored in sub-folders. The log files will be named 157 | * task.log, task-stdout.log, and task-stderr.log. 158 | */ 159 | public ArgsBuilder logging(String path) { 160 | return logging(path, false); 161 | } 162 | 163 | /** 164 | * The base logging path. All task logs will be stored in sub-folders, defined as 165 | * ${workflow.element}. The log files will be named [operationId].log, [operationId]-stdout.log, 166 | * and [operationId]-stderr.log. 167 | */ 168 | public ArgsBuilder logging(String path, boolean useOperationName) { 169 | String s = path + "/${" + Task.WORKFLOW_ELEMENT + "}"; 170 | if (!useOperationName) { 171 | s += "/" + Task.TASK_LOG; 172 | } 173 | 174 | workflowArgs.setLogging(new Logging()); 175 | workflowArgs.getLogging().setGcsPath(s); 176 | return this; 177 | } 178 | 179 | /** 180 | * The base path for task output files, if they use relative paths. If so, all outputs will be 181 | * stored in sub-folders, defined as ${workflow.element}. 182 | * 183 | * @param path 184 | * @return 185 | */ 186 | public ArgsBuilder workspace(String path) { 187 | workflowArgs.setWorkspace( 188 | path + (path.endsWith("/") ? "" : "/") + "${" + Task.WORKFLOW_ELEMENT + "}/"); 189 | return this; 190 | } 191 | 192 | public ArgsBuilder clientId(String id) { 193 | workflowArgs.setClientId(id); 194 | return this; 195 | } 196 | 197 | public ArgsBuilder serviceAccountEmail(String email) { 198 | if (workflowArgs.getServiceAccount() == null) { 199 | workflowArgs.setServiceAccount(new ServiceAccount()); 200 | } 201 | workflowArgs.getServiceAccount().setEmail(email); 202 | return this; 203 | } 204 | 205 | public ArgsBuilder serviceAccountScopes(List scopes) { 206 | if (workflowArgs.getServiceAccount() == null) { 207 | workflowArgs.setServiceAccount(new ServiceAccount()); 208 | } 209 | workflowArgs.getServiceAccount().setScopes(scopes); 210 | return this; 211 | } 212 | 213 | public ArgsBuilder keepAlive(String sec) { 214 | workflowArgs.setKeepVmAliveOnFailureDuration(sec); 215 | return this; 216 | } 217 | 218 | public ArgsBuilder testing(Boolean isTesting) { 219 | workflowArgs.setTesting(isTesting); 220 | return this; 221 | } 222 | 223 | public ArgsBuilder maxTries(int i) { 224 | workflowArgs.setMaxTries(i); 225 | return this; 226 | } 227 | 228 | public ArgsBuilder deleteIntermediateFiles(Boolean b) { 229 | workflowArgs.setDeleteFiles(b != null && b); 230 | return this; 231 | } 232 | 233 | public ArgsBuilder resumeFailedRun(Boolean b) { 234 | workflowArgs.setResumeFailedRun(b != null && b); 235 | return this; 236 | } 237 | 238 | public ArgsBuilder abortOnError(Boolean b) { 239 | workflowArgs.setAbortOnError(b != null && b); 240 | return this; 241 | } 242 | 243 | public WorkflowArgs build() { 244 | return workflowArgs; 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/Dockerflow.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow; 17 | 18 | import com.google.api.client.googleapis.auth.oauth2.GoogleCredential; 19 | import com.google.cloud.dataflow.sdk.Pipeline; 20 | import com.google.cloud.dataflow.sdk.PipelineResult; 21 | import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions; 22 | import com.google.cloud.genomics.dockerflow.args.ArgsTableBuilder; 23 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 24 | import com.google.cloud.genomics.dockerflow.dataflow.DataflowBuilder; 25 | import com.google.cloud.genomics.dockerflow.dataflow.DataflowFactory; 26 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 27 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 28 | import com.google.cloud.genomics.dockerflow.workflow.WorkflowDefn; 29 | import com.google.cloud.genomics.dockerflow.workflow.WorkflowFactory; 30 | 31 | import java.io.File; 32 | import java.io.IOException; 33 | import java.net.URL; 34 | import java.net.URLClassLoader; 35 | import java.util.Map; 36 | import org.slf4j.Logger; 37 | import org.slf4j.LoggerFactory; 38 | 39 | /** 40 | * Command-line runner for Dataflow pipelines with shell steps running in Docker. Multi-step 41 | * pipelines are defined in yaml as a static graph. Command-line options can override default 42 | * settings provided in the graph. Individual Docker steps are described in separate yaml files. 43 | */ 44 | public class Dockerflow implements DockerflowConstants { 45 | private static final Logger LOG = LoggerFactory.getLogger(Dockerflow.class); 46 | 47 | /** 48 | * Run with --help for options. 49 | * 50 | * @param args 51 | * @throws IOException 52 | */ 53 | public static void main(String[] args) throws Exception { 54 | Map m = StringUtils.parseArgs(args); 55 | 56 | // Show help and exit 57 | if (m.isEmpty() || m.containsKey(HELP)) { 58 | System.out.println( 59 | "Description:\n" 60 | + " Run a workflow of Docker tasks defined in Java or yaml/json, using Dataflow " 61 | + "for orchestration.\n\n" 62 | + "COMMON OPTIONS:\n" 63 | + "--" + ARGS_FILE + "=PATH\n" 64 | + " Workflow args in yaml/json in GCS or local. Or a csv with one run per " 65 | + "row and param names\n in columns.\n" 66 | + "--" + INPUTS + "=KEY=VAL,KEY2=VAL2\n" 67 | + " Input parameters to the pipeline.\n" 68 | + "--" + OUTPUTS + "=KEY=VAL,KEY2=VAL2\n" 69 | + " Output files from the pipeline.\n" 70 | + "--" + PREEMPTIBLE + "=BOOL\n" 71 | + " Run with preemptible VMs if the pipeline supports it.\n" 72 | + "--" + PROJECT + "=PROJECT_ID\n" 73 | + " REQUIRED. Google Cloud Project name.\n" 74 | + "--" + RESUME + "=BOOL\n" 75 | + " Attempt to resume a failed run. Useful when debugging\n" 76 | + "--" + RUNNER + "=DATAFLOW_RUNNER\n" 77 | + " Default: " + DEFAULT_RUNNER 78 | + ". Use " + DIRECT_RUNNER + " for local testing.\n" 79 | + "--" + TEST + "=BOOL\n" 80 | + " Dry run for testing. 
Docker tasks will not execute.\n" 81 |             + "--" + WORKSPACE + "=PATH\n" 82 |             + "  Base path for input, output, and logging files.\n" 83 |             + "--" + WORKFLOW_CLASS + "=JAVA_CLASS\n" 84 |             + "  A workflow defined in a Java class.\n" 85 |             + "--" + WORKFLOW_FILE + "=PATH\n" 86 |             + "  A workflow defined in yaml/json in GCS or local.\n" 87 |             + "--" + ZONES + "=STRING\n" 88 |             + "  Override zones for VMs. Wildcards like eu* are allowed.\n" 89 |             + "\n" 90 |             + "OTHER OPTIONS\n" 91 |             + "--" + ABORT + "\n" 92 |             + "  Abort if *any* concurrent task fails permanently. Otherwise, continue.\n" 93 |             + "--" + CPU + "=INT\n" 94 |             + "  Override minimum CPU cores.\n" 95 |             + "--" + DISK_SIZE + "=INT\n" 96 |             + "  Override size in Gb for all disks.\n" 97 |             + "--" + GLOBALS + "=KEY=VAL,KEY2=VAL2\n" 98 |             + "  Global parameters to substitute in the args-file.\n" 99 |             + "--" + HELP 100 |             + "\n  Print this message.\n" 101 |             + "--" + INPUTS_FROM_FILE + "=KEY=PATH,KEY2=PATH2\n" 102 |             + "  Load parameter values from local files.\n" 103 |             + "--" + KEEP_ALIVE + "=INT\n" 104 |             + "  Seconds to keep VMs alive after failure to ssh in and debug.\n" 105 |             + "--" + LOGGING + "=PATH\n" 106 |             + "  Base GCS folder where logs will be written.\n" 107 |             + "--" + MACHINE_TYPE + "=STRING\n" 108 |             + "  Dataflow head node GCE instance type. Default: " 109 |             + DEFAULT_MACHINE_TYPE + ".\n" 110 |             + "--" + MAX_TRIES + "=INT\n" 111 |             + "  Maximum preemptible tries. Default: " + DEFAULT_MAX_TRIES + ".\n" 112 |             + "--" + MAX_WORKERS + "=INT\n" 113 |             + "  Tip: set to the max number of parallel branches in the workflow.\n" 114 |             + "--" + MEMORY + "=INT\n" 115 |             + "  Override minimum memory in GB.\n" 116 |             + "--" + RUN_ID + "=STRING\n" 117 |             + "  An id provided by you to label operations to monitor or cancel.\n" 118 |             + "--" + SERVICE_ACCOUNT_NAME + "=EMAIL\n" 119 |             + "  Service account to use rather than the default GCE account.\n" 120 |             + "--" + SERVICE_ACCOUNT_SCOPES + "=VAL,VAL2\n" 121 |             + "  Service account scopes.\n" 122 |             + "--" + STAGING + "=PATH\n" 123 |             + "  Dataflow staging location for jars.\n" 124 |             + "--" + TASK_FILE + "=PATH\n" 125 |             + "  A single task defined in yaml/json in GCS or local.\n" 126 |       ); 127 |       System.out.println("OAuth token: " 128 |           + GoogleCredential.getApplicationDefault().getAccessToken() + "\n"); 129 |       return; 130 |     } 131 |     LOG.info("Local working directory: " + new File(".").getAbsoluteFile()); 132 | 133 |     Map<String, WorkflowArgs> argsTable = ArgsTableBuilder.fromArgs(args).build(); 134 |     DataflowPipelineOptions pipelineOptions = DataflowFactory.pipelineOptions(args); 135 |     Workflow w; 136 |     Pipeline dataflow; 137 | 138 |     if (m.containsKey(WORKFLOW_CLASS)) { 139 |       LOG.info("Creating workflow from Java class " + m.get(WORKFLOW_CLASS)); 140 |       URLClassLoader cl = 141 |           new URLClassLoader(new URL[] {new File(".").getAbsoluteFile().toURI().toURL()}); 142 |       WorkflowDefn d = (WorkflowDefn) cl.loadClass(m.get(WORKFLOW_CLASS)).newInstance(); 143 |       w = d.createWorkflow(args); 144 |       cl.close(); // close only after the workflow is built, in case more classes must load 145 | 146 |     } else if (m.containsKey(WORKFLOW_FILE)) { 147 |       LOG.info("Creating workflow from file " + m.get(WORKFLOW_FILE)); 148 |       w = WorkflowFactory.create(args); 149 | 150 |     } else if (m.containsKey(TASK_FILE)) { 151 |       LOG.info("Creating workflow from task file " + m.get(TASK_FILE)); 152 |       w = WorkflowFactory.create(args); 153 | 154 |     } else { 155 |       throw new IllegalArgumentException( 156 |           "No workflow definition found. 
" 157 | + "Either a workflow-class, workflow-file, or task-file must be provided."); 158 | } 159 | 160 | dataflow = 161 | DataflowBuilder.of(w) 162 | .createFrom(argsTable) 163 | .pipelineOptions(pipelineOptions) 164 | .build(); 165 | 166 | LOG.info( 167 | "Running Dataflow job " 168 | + ((DataflowPipelineOptions) dataflow.getOptions()).getAppName()); 169 | 170 | PipelineResult result = dataflow.run(); 171 | 172 | LOG.info("State: " + result.getState()); 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /examples/gatk/gatk-args.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | zones: 3 | - us-* 4 | inputs: 5 | sample_name: NA12878 6 | flowcell_unmapped_bams: | 7 | gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam 8 | gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam 9 | gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam 10 | final_gvcf_name: NA12878.g.vcf.gz 11 | 12 | agg_small_disk: 200 13 | agg_medium_disk: 300 14 | agg_large_disk: 400 15 | flowcell_small_disk: 200 16 | flowcell_medium_disk: 300 17 | 18 | unmapped_bam_suffix: .bam 19 | 20 | ref_dict: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict 21 | ref_fasta: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta 22 | ref_fasta_index: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai 23 | ref_alt: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt 24 | ref_bwt: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt 25 | ref_sa: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa 26 | ref_amb: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb 27 | ref_ann: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann 28 | ref_pac: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac 29 | 30 | dbSNP_vcf: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf 31 | dbSNP_vcf_index: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx 32 | known_snps_sites_vcf: gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz 33 | known_snps_sites_vcf_index: gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi 34 | known_indels_sites_vcf: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz 35 | known_indels_sites_vcf_index: gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi 36 | 37 | wgs_coverage_interval_list: gs://genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list 38 | scattered_calling_intervals: | 39 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0001_of_50/scattered.interval_list 40 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0002_of_50/scattered.interval_list 41 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0003_of_50/scattered.interval_list 42 | 
gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0004_of_50/scattered.interval_list 43 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0005_of_50/scattered.interval_list 44 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0006_of_50/scattered.interval_list 45 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0007_of_50/scattered.interval_list 46 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0008_of_50/scattered.interval_list 47 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0009_of_50/scattered.interval_list 48 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0010_of_50/scattered.interval_list 49 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0011_of_50/scattered.interval_list 50 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0012_of_50/scattered.interval_list 51 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0013_of_50/scattered.interval_list 52 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0014_of_50/scattered.interval_list 53 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0015_of_50/scattered.interval_list 54 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0016_of_50/scattered.interval_list 55 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0017_of_50/scattered.interval_list 56 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0018_of_50/scattered.interval_list 57 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0019_of_50/scattered.interval_list 58 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0020_of_50/scattered.interval_list 59 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0021_of_50/scattered.interval_list 60 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0022_of_50/scattered.interval_list 61 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0023_of_50/scattered.interval_list 62 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0024_of_50/scattered.interval_list 63 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0025_of_50/scattered.interval_list 64 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0026_of_50/scattered.interval_list 65 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0027_of_50/scattered.interval_list 66 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0028_of_50/scattered.interval_list 67 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0029_of_50/scattered.interval_list 68 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0030_of_50/scattered.interval_list 69 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0031_of_50/scattered.interval_list 70 | 
gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0032_of_50/scattered.interval_list 71 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0033_of_50/scattered.interval_list 72 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0034_of_50/scattered.interval_list 73 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0035_of_50/scattered.interval_list 74 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0036_of_50/scattered.interval_list 75 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0037_of_50/scattered.interval_list 76 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0038_of_50/scattered.interval_list 77 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0039_of_50/scattered.interval_list 78 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0040_of_50/scattered.interval_list 79 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0041_of_50/scattered.interval_list 80 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0042_of_50/scattered.interval_list 81 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0043_of_50/scattered.interval_list 82 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0044_of_50/scattered.interval_list 83 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0045_of_50/scattered.interval_list 84 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0046_of_50/scattered.interval_list 85 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0047_of_50/scattered.interval_list 86 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0048_of_50/scattered.interval_list 87 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0049_of_50/scattered.interval_list 88 | gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0050_of_50/scattered.interval_list 89 | 90 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/runner/TaskRunner.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */ 16 | package com.google.cloud.genomics.dockerflow.runner; 17 | 18 | import com.google.cloud.dataflow.sdk.Pipeline; 19 | import com.google.cloud.dataflow.sdk.PipelineResult; 20 | import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions; 21 | import com.google.cloud.genomics.dockerflow.DockerflowConstants; 22 | import com.google.cloud.genomics.dockerflow.args.TaskArgs; 23 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 24 | import com.google.cloud.genomics.dockerflow.dataflow.DataflowFactory; 25 | import com.google.cloud.genomics.dockerflow.task.Task; 26 | import com.google.cloud.genomics.dockerflow.task.TaskDefn; 27 | import com.google.cloud.genomics.dockerflow.task.TaskDefn.Param; 28 | import com.google.cloud.genomics.dockerflow.util.HttpUtils; 29 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 30 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 31 | import java.io.IOException; 32 | import java.io.Serializable; 33 | import java.util.Map; 34 | import java.util.concurrent.TimeUnit; 35 | import org.slf4j.Logger; 36 | import org.slf4j.LoggerFactory; 37 | 38 | /** 39 |  * Utilities for running workflows (directed acyclic graphs) of Docker steps with Dataflow. 40 |  * Execution of Docker steps happens through the Pipelines API. 41 |  */ 42 | public class TaskRunner implements DockerflowConstants { 43 |   private static final Logger LOG = LoggerFactory.getLogger(TaskRunner.class); 44 | 45 |   /** Run a Docker workflow on Dataflow. */ 46 |   public static void run(Workflow w, Map<String, WorkflowArgs> a, DataflowPipelineOptions o) 47 |       throws IOException { 48 |     LOG.info("Running workflow graph"); 49 |     if (w.getArgs().getProjectId() == null) { 50 |       throw new IllegalArgumentException("Project id is required"); 51 |     } 52 | 53 |     Pipeline p = DataflowFactory.dataflow(w, a, o); 54 | 55 |     LOG.info("Created Dataflow pipeline"); 56 |     LOG.debug(w.toString()); 57 | 58 |     PipelineResult r = p.run(); 59 | 60 |     LOG.info("Dataflow pipeline completed"); 61 |     LOG.info("Result state: " + r.getState()); 62 |   } 63 | 64 |   /** 65 |    * Run a single task. If running in test mode, no API call will be made. 66 |    * 67 |    * @param t the task to run 68 |    * @return the operation status 69 |    * @throws IOException 70 |    */ 71 |   public static Operation runTask(Task t) throws IOException { 72 |     Operation o; 73 |     Object req = getRequest(t); LOG.info("Pipelines API request: " + StringUtils.toJson(req)); // build the request once and reuse it 74 | 75 |     if (t.getArgs() != null 76 |         && t.getArgs() instanceof WorkflowArgs 77 |         && ((WorkflowArgs) t.getArgs()).isTesting() != null 78 |         && ((WorkflowArgs) t.getArgs()).isTesting()) { 79 |       LOG.info("Running in test mode. No API call will be made. Name=" + t.getDefn().getName()); 80 |       o = new Operation(); 81 |       o.setDone(true); 82 |       o.setName("operations/TEST-" + t.hashCode()); 83 |     } else { 84 |       o = callAsyncWebService(req); 85 |     } 86 |     return o; 87 |   } 88 | 
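  // Sketch of a dry run, assuming the task's args is a WorkflowArgs with testing
  // enabled (the variable name "task" is illustrative): no API call is made, and
  // the returned Operation is synthetic and already done:
  //   Operation op = TaskRunner.runTask(task);
  //   // op.getDone() == true, and op.getName() starts with "operations/TEST-"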
89 |   /** 90 |    * Get the web service request object for this task. 91 |    * 92 |    * @param t the enclosing task 93 |    * @return a json-serializable request object 94 |    */ 95 |   public static Object getRequest(Task t) { 96 |     TaskRequest r = new TaskRequest(); 97 | 98 |     // Remove CWL fields from the request: Pipelines API doesn't recognize them 99 |     TaskDefn defn = new TaskDefn(t.getDefn()); 100 |     if (defn.getInputParameters() != null) { 101 |       for (Param p : defn.getInputParameters()) { 102 |         p.setInputBinding(null); // exclude from json 103 |         p.setType(null); 104 |       } 105 |     } 106 |     if (defn.getOutputParameters() != null) { 107 |       for (Param p : defn.getOutputParameters()) { 108 |         p.setInputBinding(null); // exclude from json 109 |         p.setType(null); 110 |       } 111 |     } 112 |     r.setEphemeralPipeline(defn); 113 | 114 |     // Remove other fields not recognized by Pipelines API 115 |     TaskArgs ta = new TaskArgs(t.getArgs()); 116 |     ta.setFromFile(null); 117 |     r.setPipelineArgs(ta); 118 | 119 |     return r; 120 |   } 121 | 122 |   /** 123 |    * A replacement for the autogenerated Pipelines API RunPipelineRequest object; unlike that 124 |    * class, it works with standard json serializers. 125 |    */ 126 |   @SuppressWarnings("serial") 127 |   public static class TaskRequest implements Serializable { 128 |     private String pipelineId; 129 |     private TaskDefn ephemeralPipeline; 130 |     private TaskArgs pipelineArgs; 131 | 132 |     public String getPipelineId() { 133 |       return pipelineId; 134 |     } 135 | 136 |     public void setPipelineId(String pipelineId) { 137 |       this.pipelineId = pipelineId; 138 |     } 139 | 140 |     public TaskDefn getEphemeralPipeline() { 141 |       return ephemeralPipeline; 142 |     } 143 | 144 |     public void setEphemeralPipeline(TaskDefn ephemeralPipeline) { 145 |       this.ephemeralPipeline = ephemeralPipeline; 146 |     } 147 | 148 |     public TaskArgs getPipelineArgs() { 149 |       return pipelineArgs; 150 |     } 151 | 152 |     public void setPipelineArgs(TaskArgs pipelineArgs) { 153 |       this.pipelineArgs = pipelineArgs; 154 |     } 155 |   } 156 | 157 |   /** 158 |    * Submit a web service task asynchronously. 159 |    * 160 |    * @param req the task definition as a Pipelines API request object 161 |    * @return the operation status 162 |    * @throws IOException 163 |    * @throws TaskException if the operation returned errors 164 |    */ 165 |   public static Operation callAsyncWebService(Object req) throws IOException, TaskException { 166 |     LOG.info("Call Pipelines API."); 167 |     String res = HttpUtils.doPost(API_RUN_PIPELINE, req); 168 |     Operation status = StringUtils.fromJson(res, Operation.class); 169 | 170 |     LOG.info("operationId=" + status.getName()); 171 | 172 |     if (status.getError() != null) { 173 |       String msg = "Failed to call web service! " + status.getError().getMessage(); 174 |       if (status.getError().getDetails() != null) { 175 |         msg += "\n" + status.getError().getDetails(); 176 |       } 177 |       LOG.error(msg); 178 |       throw new TaskException(msg); 179 | 180 |     } else if (status.getResponse() != null) { 181 |       LOG.info("Submitted: " + status.getResponse().toString()); 182 |     } else { 183 |       LOG.info("Submitted"); 184 |     } 185 | 186 |     return status; 187 |   } 188 | 189 |   /** 190 |    * Block until the operation is done. 191 |    * 192 |    * @param operationId the operation to poll 193 |    * @return final status 194 |    */ 195 |   public static Operation wait(String operationId) throws IOException { 196 |     Operation o = new Operation(); 197 |     o.setName(operationId); 198 |     o.setDone(false); 199 |     return wait(o); 200 |   } 201 | 202 |   /** 203 |    * Block until the operation is done. 
204 | * 205 | * @param op the operation to poll 206 | * @return final status 207 | */ 208 | public static Operation wait(Operation op) throws IOException { 209 | if (op.getDone()) { 210 | return op; 211 | } 212 | 213 | Operation status = op; 214 | do { 215 | LOG.debug("Sleeping for " + POLL_INTERVAL + " sec"); 216 | try { 217 | TimeUnit.SECONDS.sleep(POLL_INTERVAL); 218 | } catch (InterruptedException e) { 219 | // ignore 220 | } 221 | try { 222 | status = checkStatus(status.getName()); 223 | } catch (IOException e) { 224 | LOG.warn("Error checking operation status: " + e.getMessage()); 225 | } 226 | } while (status.getDone() == null || !status.getDone()); 227 | 228 | LOG.info("Done! " + status.getName()); 229 | return status; 230 | } 231 | 232 | /** 233 | * Check status by calling the operations REST API. 234 | * 235 | * @param operationId the operation to check 236 | * @return the updated status 237 | */ 238 | public static Operation checkStatus(String operationId) throws IOException { 239 | String res = HttpUtils.doGet(API_OPERATIONS + operationId); 240 | Operation status = StringUtils.fromJson(res, Operation.class); 241 | return status; 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/util/FileUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow.util; 17 | 18 | import com.fasterxml.jackson.dataformat.yaml.snakeyaml.Yaml; 19 | import com.google.gson.GsonBuilder; 20 | import com.google.gson.reflect.TypeToken; 21 | import java.io.BufferedReader; 22 | import java.io.File; 23 | import java.io.FileInputStream; 24 | import java.io.IOException; 25 | import java.io.InputStream; 26 | import java.io.InputStreamReader; 27 | import java.net.URI; 28 | import java.net.URISyntaxException; 29 | import java.util.Map; 30 | import org.slf4j.Logger; 31 | import org.slf4j.LoggerFactory; 32 | 33 | /** Utility methods used with the Dataflow Docker example and the Pipelines API runner. */ 34 | public class FileUtils { 35 | static final Logger LOG = LoggerFactory.getLogger(FileUtils.class); 36 | 37 | /** 38 | * Read a GCS file into a string. 39 | * 40 | * @param gcsPath 41 | * @return file contents 42 | * @throws IOException 43 | */ 44 | public static String readFromGcs(String gcsPath) throws IOException { 45 | if (gcsPath == null || !gcsPath.startsWith("gs://")) { 46 | throw new IllegalArgumentException("GCS path must be non-null and start with gs://."); 47 | } 48 | return HttpUtils.doGet(gcsUrl(gcsPath)); 49 | } 50 | 51 | /** Get the HTTP URL for a GCS path. */ 52 | static String gcsUrl(String gcsPath) { 53 | return "https://storage.googleapis.com/" + gcsPath.substring("gs://".length()); 54 | } 55 | 56 | /** Read complete stream into a string. 
*/ 57 |   public static String readAll(InputStream is) throws IOException { 58 |     BufferedReader in = new BufferedReader(new InputStreamReader(is)); 59 |     StringBuilder sb = new StringBuilder(); 60 |     String line; 61 |     while ((line = in.readLine()) != null) { 62 |       if (sb.length() > 0) { 63 |         sb.append("\n"); 64 |       } 65 |       sb.append(line); 66 |     } 67 |     in.close(); 68 |     return sb.toString(); 69 |   } 70 | 71 |   /** 72 |    * Read the contents of a local or GCS path. 73 |    * 74 |    * @param path a local path or a gs:// path 75 |    * @return the file contents 76 |    * @throws IOException 77 |    */ 78 |   public static String readAll(String path) throws IOException { 79 |     String text; 80 |     if (path.startsWith("gs://")) { 81 |       text = readFromGcs(path); 82 |     } else { 83 |       text = readAll(new FileInputStream(path)); 84 |     } 85 |     return text; 86 |   } 87 | 88 |   /** 89 |    * Load a file from GCS or local and parse it from json or yaml. 90 |    * 91 |    * @param <T> the type to parse into 92 |    */ 93 |   @SuppressWarnings("unchecked") 94 |   public static <T> T parseFile(String path, Class<T> c) throws IOException { 95 |     LOG.info("Parse file from path: " + path + " for class " + c); 96 | 97 |     String text = readAll(path); 98 | 99 |     // Ridiculous hack: direct parsing into a real Java object fails with 100 |     // SnakeYaml, Gson and Jackson due to mysterious type incompatibility :( 101 |     Map<String, Object> map; 102 |     if (path.endsWith("yaml") || path.endsWith("yml")) { 103 |       map = (Map<String, Object>) new Yaml().load(text); 104 | 105 |     } else { 106 |       map = 107 |           (Map<String, Object>) 108 |               new GsonBuilder() 109 |                   .setLenient() 110 |                   .create() 111 |                   .fromJson(text, new TypeToken<Map<String, Object>>() {}.getType()); 112 |     } 113 | 114 |     String s = StringUtils.toJson(map); 115 |     return StringUtils.fromJson(s, c); 116 |   } 117 | 118 |   /** 119 |    * Resolve a possibly relative path against a parent path. The parent path *must* end with a 120 |    * "/" if it's a directory; otherwise its last segment is treated as a file name and dropped. 121 |    * 122 |    * @param path a possibly relative path 123 |    * @param parent a file, or a directory ending in "/" 124 |    * @return the resolved path 125 |    * @throws URISyntaxException 126 |    */ 127 |   public static String resolve(String path, String parent) 128 |       throws IOException, URISyntaxException { 129 |     LOG.debug("Resolve " + path + " vs parent " + parent); 130 |     String retval; 131 | 132 |     if (path.startsWith("/") || path.startsWith("gs:/")) { 133 |       retval = path; 134 |     } else if (parent.endsWith("/")) { 135 |       retval = parent + path; 136 |     } else { 137 |       retval = parent.substring(0, parent.lastIndexOf("/") + 1) + path; 138 |     } 139 |     if (retval.startsWith("gs:/")) { 140 |       retval = new URI(retval).normalize().toString(); 141 |     } else { 142 |       retval = new File(retval).getCanonicalPath(); 143 |     } 144 |     return retval; 145 |   } 146 | 147 |   /** 148 |    * Convert a GCS path to a local file path of the form gs/bucket/file. 149 |    * 150 |    * @param gcsPath GCS path or, if testing locally, a path on the local machine 151 |    */ 152 |   public static String localPath(String gcsPath) { 153 |     if (gcsPath == null) { 154 |       return null; 155 |     } 156 |     String localPath = gcsPath.replace("gs://", "gs/"); 157 |     if (localPath.startsWith("/")) { 158 |       localPath = localPath.substring(1); 159 |     } 160 | 161 |     // For wildcards, use the parent folder 162 |     if (localPath.indexOf("*") > 0 || localPath.split("\\s+").length > 1) { 163 |       localPath = localPath.substring(0, localPath.lastIndexOf("/")); 164 |     } 165 |     return localPath; 166 |   } 167 | 
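  // e.g. localPath("gs://my-bucket/data/in.txt") returns "gs/my-bucket/data/in.txt";
  // a wildcard like "gs://my-bucket/data/*.txt" maps to its parent, "gs/my-bucket/data"
  // (bucket and file names here are illustrative).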
168 |   /** 169 |    * The resolved path where the Pipelines API will copy the task log. 170 |    * 171 |    * @param path the GCS path passed to the Pipelines API 172 |    * @param operationName the operation name, or null if unknown 173 |    */ 174 |   public static String logPath(String path, String operationName) { 175 |     return logPath(path, operationName, ""); 176 |   } 177 | 178 |   /** 179 |    * The resolved path where the Pipelines API will copy the contents of stdout. 180 |    * 181 |    * @param path the GCS path passed to the Pipelines API 182 |    * @param operationName the operation name, or null if unknown 183 |    */ 184 |   public static String stdoutPath(String path, String operationName) { 185 |     return logPath(path, operationName, "-stdout"); 186 |   } 187 | 188 |   /** 189 |    * The resolved path where the Pipelines API will copy the contents of stderr. 190 |    * 191 |    * @param path the GCS path passed to the Pipelines API 192 |    * @param operationName the operation name, or null if unknown 193 |    */ 194 |   public static String stderrPath(String path, String operationName) { 195 |     return logPath(path, operationName, "-stderr"); 196 |   } 197 | 198 |   /** 199 |    * The resolved path where the Pipelines API will copy the requested log stream. 200 |    * 201 |    * @param path the GCS path passed to the Pipelines API 202 |    * @param operationName the operation name, or null if unknown 203 |    * @param logName one of the empty string, "-stdout", or "-stderr" 204 |    */ 205 |   private static String logPath(String path, String operationName, String logName) { 206 |     if (path == null) { 207 |       throw new IllegalArgumentException("Path cannot be null"); 208 |     } 209 | 210 |     String retval = path; 211 |     if (path.matches("^(.*\\.\\w*)$")) { 212 |       retval = 213 |           path.substring(0, path.lastIndexOf(".")) 214 |               + logName 215 |               + path.substring(path.lastIndexOf(".")); 216 |     } else { 217 |       if (operationName == null) { 218 |         throw new IllegalArgumentException( 219 |             "operationName cannot be null if logging path is a directory"); 220 |       } 221 |       retval = 222 |           path + "/" + operationName.substring(operationName.indexOf("/") + 1) + logName + ".log"; 223 |     } 224 |     return retval; 225 |   } 226 | 227 |   /** 228 |    * Check if a URI path exists. 229 |    * 230 |    * @param gcsPath GCS path, typically 231 |    * @return true if the path exists 232 |    */ 233 |   public static boolean gcsPathExists(String gcsPath) { 234 |     boolean exists; 235 |     if (gcsPath == null) { 236 |       exists = false; 237 |     } else { 238 |       try { 239 |         HttpUtils.doHead(gcsUrl(gcsPath)); 240 |         exists = true; 241 |       } catch (IOException e) { 242 |         LOG.info(e.getMessage()); 243 |         exists = false; 244 |       } 245 |     } 246 |     return exists; 247 |   } 248 | } 249 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/dataflow/DataflowFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 |  * Copyright 2016 Google. 3 |  * 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); 5 |  * you may not use this file except in compliance with the License. 6 |  * You may obtain a copy of the License at 7 |  * 8 |  *     http://www.apache.org/licenses/LICENSE-2.0 9 |  * 10 |  * Unless required by applicable law or agreed to in writing, software 11 |  * distributed under the License is distributed on an "AS IS" BASIS, 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 |  * See the License for the specific language governing permissions and 14 |  * limitations under the License. 
15 | */ 16 | package com.google.cloud.genomics.dockerflow.dataflow; 17 | 18 | import com.google.cloud.dataflow.sdk.Pipeline; 19 | import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions; 20 | import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory; 21 | import com.google.cloud.dataflow.sdk.runners.BlockingDataflowPipelineRunner; 22 | import com.google.cloud.dataflow.sdk.runners.DataflowPipelineRunner; 23 | import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner; 24 | import com.google.cloud.dataflow.sdk.runners.PipelineRunner; 25 | import com.google.cloud.dataflow.sdk.transforms.Create; 26 | import com.google.cloud.dataflow.sdk.transforms.ParDo; 27 | import com.google.cloud.dataflow.sdk.values.KV; 28 | import com.google.cloud.dataflow.sdk.values.PCollection; 29 | import com.google.cloud.dataflow.sdk.values.PCollectionList; 30 | import com.google.cloud.genomics.dockerflow.DockerflowConstants; 31 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 32 | import com.google.cloud.genomics.dockerflow.transform.DeleteIntermediateFiles; 33 | import com.google.cloud.genomics.dockerflow.transform.DockerDo; 34 | import com.google.cloud.genomics.dockerflow.transform.MergeBranches; 35 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 36 | import com.google.cloud.genomics.dockerflow.workflow.GraphItem; 37 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 38 | import com.google.cloud.genomics.dockerflow.workflow.Workflow.Branch; 39 | import com.google.cloud.genomics.dockerflow.workflow.Workflow.Steps; 40 | import java.io.IOException; 41 | import java.util.HashMap; 42 | import java.util.List; 43 | import java.util.Map; 44 | import org.slf4j.Logger; 45 | import org.slf4j.LoggerFactory; 46 | 47 | /** Factory methods to create Dataflow Docker stuff. */ 48 | public class DataflowFactory implements DockerflowConstants { 49 | static final Logger LOG = LoggerFactory.getLogger(DataflowFactory.class); 50 | 51 | // Counter to disambiguate PTransform names in branched graphs 52 | private static int numMerges = 0; 53 | 54 | /** 55 | * Create Dataflow Pipeline options from the standard command-line options, "--project=", 56 | * "--runner=" and "--stagingLocation=" 57 | * 58 | * @param args 59 | * @return 60 | * @throws IOException 61 | */ 62 | public static DataflowPipelineOptions pipelineOptions(String[] args) throws IOException { 63 | LOG.info("Set up Dataflow options"); 64 | DataflowPipelineOptions o = PipelineOptionsFactory.as(DataflowPipelineOptions.class); 65 | 66 | Map m = StringUtils.parseArgs(args); 67 | o.setProject(m.get(PROJECT)); 68 | if (m.containsKey(STAGING)) { 69 | o.setStagingLocation(m.get(STAGING)); 70 | } else if (m.containsKey(STAGING_LOCATION)) { 71 | o.setStagingLocation(m.get(STAGING_LOCATION)); 72 | } else if (m.containsKey(WORKSPACE)) { 73 | o.setStagingLocation(m.get(WORKSPACE) + "/staging"); 74 | } 75 | o.setRunner(runner(m.get(RUNNER))); 76 | o.setMaxNumWorkers(m.get(MAX_WORKERS) == null ? 
1 : Integer.parseInt(m.get(MAX_WORKERS))); 77 |     if (m.containsKey(MACHINE_TYPE)) { 78 |       o.setWorkerMachineType(m.get(MACHINE_TYPE)); 79 |     } else { 80 |       o.setWorkerMachineType(DEFAULT_MACHINE_TYPE); 81 |     } 82 |     return o; 83 |   } 84 | 85 |   private static Class<? extends PipelineRunner<?>> runner(String name) { 86 |     Class<? extends PipelineRunner<?>> c = DirectPipelineRunner.class; // fallback for unrecognized names 87 | 88 |     if (DEFAULT_RUNNER.equals(name) || name == null) { 89 |       c = DataflowPipelineRunner.class; 90 |     } else if (BLOCKING_RUNNER.equals(name)) { 91 |       c = BlockingDataflowPipelineRunner.class; 92 |     } else if (DIRECT_RUNNER.equals(name)) { 93 |       c = DirectPipelineRunner.class; 94 |     } 95 |     return c; 96 |   } 97 | 98 |   /** 99 |    * Dynamically construct a Dataflow pipeline from the workflow definition. The root PCollection 100 |    * has one element, the root task's name. 101 |    * 102 |    * @param workflow the workflow graph to run 103 |    * @param workflowArgs per-run workflow args, keyed by client id 104 |    * @return the constructed pipeline 105 |    * @throws IOException 106 |    */ 107 |   public static Pipeline dataflow( 108 |       Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o) 109 |       throws IOException { 110 | 111 |     assert (workflow != null); 112 |     assert (o != null); 113 |     assert (workflow.getDefn() != null); 114 | 115 |     // Set defaults 116 |     if (o.getAppName() == null) { 117 |       o.setAppName(workflow.getDefn().getName()); 118 |     } 119 |     if (o.getProject() == null && workflow.getArgs() != null) { 120 |       o.setProject(workflow.getArgs().getProjectId()); 121 |     } 122 |     if (o.getMaxNumWorkers() == 0) { 123 |       o.setMaxNumWorkers(1); 124 |     } 125 |     if (o.getWorkerMachineType() == null) { 126 |       o.setWorkerMachineType(DEFAULT_MACHINE_TYPE); 127 |     } 128 | 129 |     LOG.info("Initializing dataflow pipeline"); 130 |     Pipeline p = Pipeline.create(o); 131 | 132 |     LOG.info("Creating input collection of workflow args"); 133 |     if (workflowArgs == null) { 134 |       workflowArgs = new HashMap<String, WorkflowArgs>(); 135 |     } 136 |     if (workflowArgs.isEmpty()) { 137 |       LOG.info("No workflow args were provided. Using default values."); 138 |       workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs()); 139 |     } else if (workflow.getArgs() != null) { 140 |       LOG.info("Merging default workflow args with instance-specific args"); 141 | 142 |       for (String key : workflowArgs.keySet()) { 143 |         WorkflowArgs instanceArgs = workflowArgs.get(key); 144 |         instanceArgs.mergeDefaultArgs(workflow.getArgs()); 145 |         LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs)); 146 |       } 147 |     } 148 | 149 |     LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName()); 150 |     PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs)); 151 |     input = dataflow(Workflow.Steps.graph(workflow), input); 152 | 153 |     if (workflowArgs.values().iterator().next().getDeleteFiles()) { 154 |       LOG.info("Intermediate files will be deleted"); 155 |       input = 156 |           input.apply( 157 |               ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow))); 158 |     } 159 | 160 |     return p; 161 |   } 162 | 163 |   /** 164 |    * Recursively construct the dataflow pipeline. 
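   * For example, a yaml graph of the form [stepA, {BRANCH: [stepB, stepC]}, stepD] becomes
   * stepA's transform feeding two parallel branches that are merged again before stepD
   * (step names here are illustrative).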
165 |    * 166 |    * @param graphItem a node, edge or branch point 167 |    * @param input the inputs to the graph element 168 |    * @throws IOException 169 |    */ 170 |   private static PCollection<KV<String, WorkflowArgs>> dataflow( 171 |       GraphItem graphItem, PCollection<KV<String, WorkflowArgs>> input) throws IOException { 172 |     PCollection<KV<String, WorkflowArgs>> output = input; 173 | 174 |     // It's a node 175 |     if (graphItem instanceof Workflow) { 176 |       Workflow w = (Workflow) graphItem; 177 | 178 |       LOG.info("Adding task: " + w.getDefn().getName()); 179 |       output = input.apply(DockerDo.of(w)); 180 |     // It's a branch 181 |     } else if (graphItem instanceof Branch) { 182 |       LOG.info("Pipeline splits into branches. Adding branches"); 183 |       output = branches(((Branch) graphItem), input); 184 |     // It's an edge 185 |     } else if (graphItem instanceof Steps) { 186 |       LOG.info("Adding steps"); 187 |       Steps steps = (Steps) graphItem; 188 | 189 |       // For each sequential element, the output of one is the input to 190 |       // the next 191 |       if (steps.getSteps() != null) { 192 |         for (GraphItem item : steps.getSteps()) { 193 |           output = dataflow(item, output); 194 |         } 195 |       } 196 |     } else { 197 |       throw new IllegalStateException("Invalid graph element type: " + graphItem); 198 |     } 199 |     return output; 200 |   } 201 | 202 |   /** 203 |    * The graph splits into parallel branches. Generate a PCollection for each, then merge them 204 |    * together into a flattened PCollectionList, and finally combine globally so that the graph can 205 |    * continue merged. 206 |    * 207 |    *

In order to run the branch tasks in parallel, the Dataflow pipeline must have maxNumWorkers 208 |    * equal to the number of branches. To run Dataflow on a single VM, and support branching graphs nicely, 209 |    * some optimization is needed. 210 |    * 211 |    * @param graphItem the branch point 212 |    * @param input the input collection feeding every branch 213 |    * @return the globally combined, flattened output of all edges; value is a singleton string, 214 |    *     which is randomly chosen from the inputs 215 |    * @throws IOException 216 |    */ 217 |   public static PCollection<KV<String, WorkflowArgs>> branches( 218 |       Branch graphItem, PCollection<KV<String, WorkflowArgs>> input) throws IOException { 219 | 220 |     List<GraphItem> branches = graphItem.getBranches(); 221 |     LOG.info("Branch count: " + branches.size()); 222 | 223 |     PCollectionList<KV<String, WorkflowArgs>> outputs = null; 224 | 225 |     // For each edge, apply a transform to the input collection 226 |     for (GraphItem branch : branches) { 227 |       LOG.info("Adding branch"); 228 |       PCollection<KV<String, WorkflowArgs>> branchOutput = dataflow(branch, input); 229 |       outputs = outputs == null ? PCollectionList.of(branchOutput) : outputs.and(branchOutput); 230 |     } 231 | 232 |     LOG.info("Merging " + outputs.size() + " branches"); 233 |     return outputs.apply(new MergeBranches("MergeBranches" + (++numMerges > 1 ? numMerges : ""))); // pre-increment so each merge gets a unique PTransform name 234 |   } 235 | } 236 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/args/ArgsTableBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 |  * Copyright 2016 Google. 3 |  * 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); 5 |  * you may not use this file except in compliance with the License. 6 |  * You may obtain a copy of the License at 7 |  * 8 |  *     http://www.apache.org/licenses/LICENSE-2.0 9 |  * 10 |  * Unless required by applicable law or agreed to in writing, software 11 |  * distributed under the License is distributed on an "AS IS" BASIS, 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 |  * See the License for the specific language governing permissions and 14 |  * limitations under the License. 15 |  */ 16 | package com.google.cloud.genomics.dockerflow.args; 17 | 18 | import com.google.cloud.genomics.dockerflow.DockerflowConstants; 19 | import com.google.cloud.genomics.dockerflow.args.TaskArgs.Logging; 20 | import com.google.cloud.genomics.dockerflow.task.TaskDefn.Resources; 21 | import com.google.cloud.genomics.dockerflow.util.FileUtils; 22 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 23 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 24 | import java.io.IOException; 25 | import java.util.ArrayList; 26 | import java.util.LinkedHashMap; 27 | import java.util.List; 28 | import java.util.Map; 29 | import org.apache.commons.csv.CSVFormat; 30 | import org.apache.commons.csv.CSVParser; 31 | import org.apache.commons.csv.CSVRecord; 32 | import org.slf4j.Logger; 33 | import org.slf4j.LoggerFactory; 34 | 35 | /** 36 |  * Builder for a table of workflow args to allow bulk processing. 
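 * Each entry of the table is one workflow run: the key becomes the client id, and the value
 * holds that run's arguments.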
37 | */ 38 | public class ArgsTableBuilder implements DockerflowConstants { 39 | private static final Logger LOG = LoggerFactory.getLogger(ArgsTableBuilder.class); 40 | 41 | private Map table; 42 | 43 | public static ArgsTableBuilder named(String name) { 44 | ArgsTableBuilder b = new ArgsTableBuilder(); 45 | b.table = new LinkedHashMap(); 46 | b.table.put(name, new WorkflowArgs()); 47 | return b; 48 | } 49 | 50 | public static ArgsTableBuilder of(Workflow w) throws IOException { 51 | return of((WorkflowArgs) w.getArgs()); 52 | } 53 | 54 | public static ArgsTableBuilder of(WorkflowArgs args) { 55 | ArgsTableBuilder b = new ArgsTableBuilder(); 56 | b.table = new LinkedHashMap(); 57 | b.table.put("args", new WorkflowArgs(args)); 58 | return b; 59 | } 60 | 61 | public static ArgsTableBuilder of(Map args) { 62 | ArgsTableBuilder b = new ArgsTableBuilder(); 63 | b.table = args; 64 | return b; 65 | } 66 | 67 | public static ArgsTableBuilder fromArgs(String[] args) throws IOException { 68 | Map m = StringUtils.parseArgs(args); 69 | ArgsTableBuilder b; 70 | WorkflowArgs wa = ArgsBuilder.fromArgs(args).build(); 71 | 72 | if (m.containsKey(ARGS_FILE)) { 73 | Map globals = null; 74 | if (m.containsKey(GLOBALS)) { 75 | globals = StringUtils.parseParameters(m.get(GLOBALS), false); 76 | } 77 | b = ArgsTableBuilder.fromFile(m.get(ARGS_FILE)).globals(globals); 78 | b.parameters(wa); // apply command-line settings 79 | } else { 80 | b = ArgsTableBuilder.of(wa); 81 | } 82 | return b; 83 | } 84 | 85 | /** 86 | * Load from file. 87 | * 88 | * @param file yaml or json for a single set of args, or csv for multiple. 89 | * @return 90 | * @throws IOException 91 | */ 92 | public static ArgsTableBuilder fromFile(String file) throws IOException { 93 | if (file == null) { 94 | throw new IllegalArgumentException("File cannot be null"); 95 | } 96 | ArgsTableBuilder b = new ArgsTableBuilder(); 97 | 98 | if (file.toLowerCase().endsWith(".csv")) { 99 | b.table = loadCsv(file); 100 | // Parse from yaml/json 101 | } else { 102 | WorkflowArgs wa = FileUtils.parseFile(file, WorkflowArgs.class); 103 | b.table = new LinkedHashMap(); 104 | b.table.put(file, wa); 105 | } 106 | return b; 107 | } 108 | 109 | ArgsTableBuilder() {} 110 | 111 | /** 112 | * Load the workflow arguments from a CSV file. The header of the CSV contains the input or output 113 | * parameter names. Each row contains the workflow args for a single run. To run 100 instances of 114 | * a workflow concurrently, create a CSV with a header row plus 100 rows for each set of 115 | * parameters. 116 | * 117 | *

Columns by default are input parameters, passed as environment variables to the Docker 118 | * script. For file parameters, you can prefix the column header with "<" for input or ">" for 119 | * output. For clarity, you can also prefix the regular input parameters as "<", if you like. 120 | * 121 | *

The column header can also be "logging", which is a reserved name for the logging path; the workspace, memory, disk size, and cpu options are recognized the same way. 122 |    * 123 |    * @param csvFile CSV file (RFC4180) that's local or in GCS 124 |    * @return a map with the key being the clientId 125 |    * @throws IOException 126 |    */ 127 |   static Map<String, WorkflowArgs> loadCsv(String csvFile) throws IOException { 128 |     Map<String, WorkflowArgs> retval = new LinkedHashMap<String, WorkflowArgs>(); 129 | 130 |     String csv = FileUtils.readAll(csvFile); 131 |     CSVParser parser = CSVParser.parse(csv, CSVFormat.RFC4180); 132 | 133 |     // Parse header 134 |     List<String> header = null; 135 | 136 |     int row = 0; 137 | 138 |     // Parse by row 139 |     for (CSVRecord csvRecord : parser) { 140 |       ArgsBuilder args = ArgsBuilder.of(String.valueOf(row)); 141 | 142 |       LOG.debug(StringUtils.toJson(csvRecord)); 143 | 144 |       // Parse header the first time 145 |       if (row == 0) { 146 |         header = new ArrayList<String>(); 147 |         for (String col : csvRecord) { 148 |           header.add(col); 149 |         } 150 |       } else { 151 |         // Set parameter defined in each column 152 |         for (int col = 0; col < header.size(); ++col) { 153 |           String name = header.get(col); 154 |           String val = csvRecord.get(col); 155 | 156 |           if (name.startsWith(PREFIX_INPUT)) { 157 |             name = name.replace(PREFIX_INPUT, ""); 158 |             args.input(name, val); 159 |           } else if (name.startsWith(INPUTS + "=")) { 160 |             name = name.replace(INPUTS + "=", ""); 161 |             args.input(name, val); 162 |           } else if (name.startsWith(INPUTS_FROM_FILE + "=")) { 163 |             name = name.replace(INPUTS_FROM_FILE + "=", ""); 164 |             args.inputFromFile(name, val); 165 |           } else if (name.startsWith(PREFIX_OUTPUT)) { 166 |             name = name.replace(PREFIX_OUTPUT, ""); 167 |             args.output(name, val); 168 |           } else if (name.startsWith(OUTPUTS + "=")) { 169 |             name = name.replace(OUTPUTS + "=", ""); 170 |             args.output(name, val); 171 |           } else if (LOGGING.equals(name)) { 172 |             args.logging(val); 173 |           } else if (WORKSPACE.equals(name)) { 174 |             args.workspace(val); 175 |           } else if (MEMORY.equals(name)) { 176 |             args.memory(val); 177 |           } else if (DISK_SIZE.equals(name)) { 178 |             args.diskSize(val); 179 |           } else if (CPU.equals(name)) { 180 |             args.cpu(Integer.parseInt(val)); 181 |           } else { 182 |             args.input(name, val); 183 |           } 184 |         } 185 |         WorkflowArgs a = args.build(); 186 |         a.setRunIndex(row); 187 |         retval.put(a.getClientId(), a); 188 |       } 189 |       ++row; 190 |     } 191 |     return retval; 192 |   } 193 | 194 |   /** 195 |    * Substitute global variables of the form ${KEY} in all inputs, outputs, and logging paths where 196 |    * they occur. 197 |    * 198 |    * @param globals global key/value substitutions to apply to every run 199 |    */ 200 | 201 |   public ArgsTableBuilder globals(Map<String, String> globals) { 202 |     for (WorkflowArgs a : table.values()) { 203 |       a.substitute(globals); 204 |     } 205 |     return this; 206 |   } 207 | 208 |   /** 209 |    * Substitute a global variable of the form ${KEY} in all inputs, outputs, and logging paths where 210 |    * it occurs. 211 |    * 212 |    * @param key the global variable name 213 |    * @param value the value to substitute 214 |    */ 215 |   public ArgsTableBuilder global(String key, String value) { 216 |     Map<String, String> globals = new LinkedHashMap<String, String>(); 217 |     globals.put(key, value); 218 | 219 |     for (WorkflowArgs a : table.values()) { 220 |       a.substitute(globals); 221 |     } 222 |     return this; 223 |   } 224 | 
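  // Sketch of a CSV that loadCsv accepts (a header plus two runs; task names, bucket,
  // and file paths are illustrative). The "<" and ">" prefixes mark input and output
  // file parameters, as described in the javadoc above:
  //   <TaskOne.inputFile,>TaskOne.outputFile,logging
  //   gs://my-bucket/in1.txt,gs://my-bucket/out1.txt,gs://my-bucket/logs
  //   gs://my-bucket/in2.txt,gs://my-bucket/out2.txt,gs://my-bucket/logs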
225 |   /** 226 |    * Set project, logging, and resource settings on all workflow args. 227 |    * 228 |    * @param values the settings to apply to every run 229 |    * 230 |    */ 231 |   public ArgsTableBuilder parameters(TaskArgs values) { 232 |     for (WorkflowArgs a : table.values()) { 233 |       a.applyArgs(values); 234 |     } 235 |     return this; 236 |   } 237 | 238 |   public ArgsTableBuilder project(String name) { 239 |     TaskArgs args = new TaskArgs(); 240 |     args.setProjectId(name); 241 |     parameters(args); 242 |     return this; 243 |   } 244 | 245 |   public ArgsTableBuilder logging(String path) { 246 |     TaskArgs args = new TaskArgs(); 247 |     args.setLogging(new Logging()); 248 |     args.getLogging().setGcsPath(path); 249 |     parameters(args); 250 |     return this; 251 |   } 252 | 253 |   public ArgsTableBuilder preemptible(boolean b) { 254 |     TaskArgs args = new TaskArgs(); 255 |     args.setResources(new Resources()); 256 |     args.getResources().setPreemptible(b); 257 |     parameters(args); 258 |     return this; 259 |   } 260 | 261 |   public ArgsTableBuilder cores(int minCpuCores) { 262 |     TaskArgs args = new TaskArgs(); 263 |     args.setResources(new Resources()); 264 |     args.getResources().setMinimumCpuCores(String.valueOf(minCpuCores)); 265 |     parameters(args); 266 |     return this; 267 |   } 268 | 269 |   public ArgsTableBuilder memory(double minRamGb) { 270 |     TaskArgs args = new TaskArgs(); 271 |     args.setResources(new Resources()); 272 |     args.getResources().setMinimumRamGb(String.valueOf(minRamGb)); 273 |     parameters(args); 274 |     return this; 275 |   } 276 | 277 |   public ArgsTableBuilder testing(boolean b) { 278 |     WorkflowArgs args = new WorkflowArgs(); 279 |     args.setTesting(b); 280 |     parameters(args); 281 |     return this; 282 |   } 283 | 284 |   public Map<String, WorkflowArgs> build() { 285 |     return table; 286 |   } 287 | } 288 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/genomics/dockerflow/workflow/Workflow.java: -------------------------------------------------------------------------------- 1 | /* 2 |  * Copyright 2016 Google. 3 |  * 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); 5 |  * you may not use this file except in compliance with the License. 6 |  * You may obtain a copy of the License at 7 |  * 8 |  *     http://www.apache.org/licenses/LICENSE-2.0 9 |  * 10 |  * Unless required by applicable law or agreed to in writing, software 11 |  * distributed under the License is distributed on an "AS IS" BASIS, 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 |  * See the License for the specific language governing permissions and 14 |  * limitations under the License. 15 |  */ 16 | package com.google.cloud.genomics.dockerflow.workflow; 17 | 18 | import com.google.cloud.genomics.dockerflow.DockerflowConstants; 19 | import com.google.cloud.genomics.dockerflow.args.TaskArgs; 20 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 21 | import com.google.cloud.genomics.dockerflow.task.Task; 22 | import java.io.Serializable; 23 | import java.util.ArrayList; 24 | import java.util.Arrays; 25 | import java.util.List; 26 | import java.util.Map; 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | 30 | /** 31 |  * A static directed acyclic graph definition that can be executed on Dataflow. It's a 32 |  * generalization of the Pipelines API RunPipelineRequest to support workflow graphs. (It's not a 33 |  * derived class, because RunPipelineRequest is a final class.) 34 |  * 35 |  *

To pass command-line parameter overrides to subtasks, qualify with the task name like 36 | * "task1_name.inputName=value", "task2_name.outputName=value", etc. 37 | */ 38 | @SuppressWarnings("serial") 39 | public class Workflow extends Task { 40 | static final Logger LOG = LoggerFactory.getLogger(Workflow.class); 41 | 42 | private String workflowDefnFile; 43 | private List graph; 44 | private List steps; 45 | private Steps dag; 46 | 47 | public Workflow() {} 48 | 49 | public Workflow(Task t) { 50 | super(t); 51 | if (t.getArgs() != null) { 52 | args = new WorkflowArgs(t.getArgs()); 53 | } 54 | } 55 | 56 | /** Recursively set global variables. Called by {@link #applyArgs(TaskArgs)}. */ 57 | @Override 58 | public void substitute(Map globals) { 59 | super.substitute(globals); 60 | 61 | if (steps != null) { 62 | for (Workflow w : steps) { 63 | w.substitute(globals); 64 | } 65 | } 66 | } 67 | 68 | /** Recursively set parameters in the subtasks. */ 69 | @Override 70 | public void applyArgs(TaskArgs a) { 71 | super.applyArgs(a); 72 | 73 | if (steps != null) { 74 | for (Workflow w : steps) { 75 | w.applyArgs(a); 76 | } 77 | } 78 | } 79 | 80 | public String getWorkflowDefnFile() { 81 | return workflowDefnFile; 82 | } 83 | 84 | public void setWorkflowDefnFile(String file) { 85 | this.workflowDefnFile = file; 86 | } 87 | 88 | public List getSteps() { 89 | return steps; 90 | } 91 | 92 | public void setSteps(List steps) { 93 | this.steps = steps; 94 | } 95 | 96 | /** Set the steps from a graph. */ 97 | public void setSteps(Steps graph) { 98 | dag = graph; 99 | if (graph != null && graph.getSteps() != null) { 100 | if (steps == null) { 101 | steps = new ArrayList(); 102 | } 103 | steps.addAll(steps(graph)); 104 | } 105 | } 106 | 107 | /** Recursively descend the graph and add all tasks. */ 108 | private List steps(GraphItem graphItem) { 109 | List retval = new ArrayList(); 110 | 111 | if (graphItem instanceof Workflow) { 112 | Workflow w = (Workflow) graphItem; 113 | 114 | if (w.getSteps() == null || w.getSteps().isEmpty()) { 115 | retval.add(w); 116 | } else { 117 | retval.addAll(w.getSteps()); 118 | } 119 | } else if (graphItem instanceof Branch) { 120 | Branch b = (Branch) graphItem; 121 | for (GraphItem branch : b.getBranches()) { 122 | retval.addAll(steps(branch)); 123 | } 124 | } else if (graphItem instanceof Steps) { 125 | Steps g = (Steps) graphItem; 126 | for (GraphItem step : g.getSteps()) { 127 | retval.addAll(steps(step)); 128 | } 129 | } 130 | return retval; 131 | } 132 | 133 | /** The graph as yaml/json using task names only. */ 134 | public List getGraph() { 135 | return graph; 136 | } 137 | 138 | /** The graph as yaml/json using task names only. */ 139 | public void setGraph(List graph) { 140 | this.graph = graph; 141 | } 142 | 143 | /** Find the step in the workflow having the desired name. */ 144 | public Workflow step(String name) { 145 | for (Workflow step : steps) { 146 | if (name.equals(step.getDefn().getName())) { 147 | return step; 148 | } 149 | } 150 | throw new IllegalStateException("Workflow step not found: " + name); 151 | } 152 | 153 | /** The directed acyclic graph to be converted into a Dataflow pipeline. */ 154 | public Steps getDAG() { 155 | if (dag == null) { 156 | dag = Steps.of(this); 157 | } 158 | return dag; 159 | } 160 | 161 | /** A branch in a directed acyclic graph. */ 162 | public static class Branch implements GraphItem, Serializable { 163 | private List branches; 164 | 165 | public static GraphItem of(GraphItem... 
graphItems) { 166 | Branch b = new Branch(); 167 | b.setBranches(Arrays.asList(graphItems)); 168 | return b; 169 | } 170 | 171 | public List getBranches() { 172 | return branches; 173 | } 174 | 175 | public void setBranches(List branches) { 176 | this.branches = branches; 177 | } 178 | } 179 | 180 | /** A directed acyclic graph. */ 181 | public static class Steps implements GraphItem, Serializable { 182 | static final Logger LOG = LoggerFactory.getLogger(Steps.class); 183 | 184 | private List steps; 185 | 186 | public static Steps of(GraphItem... graphItems) { 187 | Steps g = new Steps(); 188 | g.steps = Arrays.asList(graphItems); 189 | return g; 190 | } 191 | 192 | public static Steps of(Workflow workflow) { 193 | return graph(workflow); 194 | } 195 | 196 | public List getSteps() { 197 | return steps; 198 | } 199 | 200 | public void setSteps(List steps) { 201 | this.steps = steps; 202 | } 203 | 204 | /** 205 | * Get the directed acyclic graph (DAG) of all workflow objects. The workflow definition file 206 | * contains just the task name strings to keep it human-readable. But that means we now have to 207 | * create an identically shaped graph that actually has the full task definition for each node 208 | * -- and each node could itself expand into a graph. 209 | * 210 | * @param w 211 | * @return a list of sequential steps in a directed acyclic graph 212 | */ 213 | public static Steps graph(Workflow w) { 214 | LOG.info("Creating graph for workflow " + w.getDefn().getName()); 215 | Steps graph = new Steps(); 216 | graph.setSteps(new ArrayList()); 217 | 218 | // It's a singleton pipeline 219 | if (w.getDefn() != null && (w.getSteps() == null || w.getSteps().isEmpty())) { 220 | LOG.info("Add workflow to graph: " + w.getDefn().getName()); 221 | graph.getSteps().add(w); 222 | // It's a DAG defined in code 223 | } else if (w.dag != null) { 224 | return w.getDAG(); 225 | } else { 226 | // No DAG defined; run steps in order 227 | if (w.getGraph() == null || w.getGraph().isEmpty()) { 228 | graph.getSteps().addAll(w.getSteps()); 229 | } 230 | 231 | // DAG is defined in yaml/json 232 | if (w.getGraph() != null) { 233 | for (Object node : w.getGraph()) { 234 | GraphItem subgraph = subgraph(node, w); 235 | graph.getSteps().add(subgraph); 236 | } 237 | } 238 | } 239 | return graph; 240 | } 241 | 242 | /** 243 | * Recursive function that constructs the graph nodes (workflows), branches (maps with exactly 244 | * one key, "BRANCH"), and edges (lists of sequential steps). 245 | * 246 | * @param graphElement an element in the graph definition 247 | * @param workflow 248 | * @return the top-most node in a directed acyclic subgraph 249 | */ 250 | @SuppressWarnings({"rawtypes", "unchecked"}) 251 | static GraphItem subgraph(Object graphElement, Workflow workflow) { 252 | LOG.info("Get subgraph"); 253 | 254 | GraphItem subgraph = null; 255 | 256 | // It's a node. Add a single linear step in sequence 257 | if (graphElement instanceof String) { 258 | LOG.info("Subgraph is a single node named: " + graphElement); 259 | 260 | Workflow step = workflow.step((String) graphElement); 261 | Steps tasks = graph(step); 262 | 263 | // The step might be an individual task or a whole workflow 264 | subgraph = tasks.getSteps().size() == 1 ? 
tasks.getSteps().get(0) : tasks; 265 | // Branch to multiple parallel steps 266 | } else if (isBranch(graphElement)) { 267 | LOG.info("Subgraph is a BRANCH"); 268 | 269 | Branch branch = new Branch(); 270 | 271 | List branches = (List) ((Map) graphElement).get(DockerflowConstants.BRANCH); 272 | LOG.info("Branch count: " + branches.size()); 273 | 274 | List newBranches = new ArrayList(branches.size()); 275 | branch.setBranches(newBranches); 276 | 277 | // Add a subgraph for each branch 278 | for (Object subnode : branches) { 279 | LOG.info("Adding branch"); 280 | newBranches.add(subgraph(subnode, workflow)); 281 | } 282 | subgraph = branch; 283 | // It's an edge. Add a subsequence of steps 284 | } else if (graphElement instanceof List) { 285 | List steps = new ArrayList(); 286 | Steps g = new Steps(); 287 | g.setSteps(steps); 288 | 289 | for (Object step : (List) graphElement) { 290 | steps.add(subgraph(step, workflow)); 291 | } 292 | subgraph = g; 293 | } else { 294 | throw new IllegalStateException( 295 | "Malformed pipeline graph for datapipe " 296 | + workflow.getDefn().getName() 297 | + " at node " 298 | + graphElement); 299 | } 300 | return subgraph; 301 | } 302 | 303 | /** The graph element is a branch point -- ie, it's a map with key "BRANCH" = List. */ 304 | @SuppressWarnings({"unchecked", "rawtypes"}) 305 | static boolean isBranch(Object graphElement) { 306 | return graphElement instanceof Map 307 | && ((Map) graphElement).size() == 1 308 | && (((Map) graphElement).keySet().contains(DockerflowConstants.BRANCH)); 309 | } 310 | } 311 | } 312 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Disclaimer 2 | 3 | This is not an official Google product. 4 | 5 | # Update 6 | 7 | As of 11 Nov 2017, Dockerflow is no longer actively maintained and will not be enhanced with new 8 | features. 9 | 10 | For multi-step batch workflows consisting of Docker tasks, we now recommend running in the cloud 11 | using: 12 | 13 | * [dsub](https://github.com/googlegenomics/dsub), a command-line batch submission tool 14 | 15 | To run multi-step workflows with dsub, you can create a bash or python script with multiple dsub 16 | calls. Execution graphs can be constructed using dsub's 17 | [job control functionality](https://github.com/googlegenomics/dsub/blob/master/docs/job_control.md). 18 | 19 | For any Dockerflow functionality that is not satisfied by dsub, please 20 | [file an issue](https://github.com/googlegenomics/dsub/issues) in the dsub repository. 21 | 22 | # Dockerflow 23 | 24 | Dockerflow makes it easy to run a multi-step workflow of Docker tasks using 25 | [Google Cloud Dataflow](https://cloud.google.com/dataflow) for orchestration. 26 | Docker steps are run using the [Pipelines API](https://cloud.google.com/genomics/v1alpha2/pipelines). 27 | 28 | You can run Dockerflow from a shell on your laptop, and the job will run in 29 | Google Cloud Platform using Dataflow's fully managed service and web UI. 30 | 31 | Dockerflow workflows can be defined in [YAML](http://yaml.org) files, or by writing 32 | Java code. 
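For illustration, here is a minimal sketch of a YAML workflow definition. It follows the
graph conventions implemented in
[Workflow.java](src/main/java/com/google/cloud/genomics/dockerflow/workflow/Workflow.java):
entries in `graph` are task names, and a map with the single key `BRANCH` fans out into
parallel subgraphs. The step names and task files here are hypothetical, not files from
this repository:

    version: v1alpha2
    defn:
      name: ExampleWorkflow
    graph:
    - stepOne
    - BRANCH:
      - stepTwo
      - stepThree
    steps:
    - defn:
        name: stepOne
        defnFile: task-one.yaml
    - defn:
        name: stepTwo
        defnFile: task-two.yaml
    - defn:
        name: stepThree
        defnFile: task-three.yaml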
Examples of workflows defined in YAML can be found in

* [examples](examples)
* [src/test/resources](src/test/resources)

Examples of workflows defined in Java can be found in

* [examples](examples)
* [src/test/java/com/google/cloud/genomics/dockerflow/examples](src/test/java/com/google/cloud/genomics/dockerflow/examples)

You can run a batch of workflows at once by providing a CSV file with one row per
workflow to define the parameters, as in the sketch below.
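A hypothetical CSV for a batch of two runs might look like this. Each column header names
a task input or output to override, and each subsequent row defines one run of the
workflow (this is a sketch: the bucket and file names are placeholders, and the exact
header convention is shown by the CSV in [src/test/resources](src/test/resources)):

    "inputs=TaskOne.inputFile","outputs=TaskOne.outputFile"
    "gs://MY-BUCKET/samples/input-a.txt","gs://MY-BUCKET/results/output-a.txt"
    "gs://MY-BUCKET/samples/input-b.txt","gs://MY-BUCKET/results/output-b.txt"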
## Why Dockerflow?

This project was created as a proof-of-concept that Dataflow can be used
for monitoring and management of directed acyclic graphs of command-line tools.

Dataflow and Docker complement each other nicely:

* Dataflow provides graph optimization, a nice monitoring interface, retries,
and other niceties.
* Docker provides portability of the tools themselves, and there's a large
library of packaged tools already available as Docker images.

While Dockerflow supports a simple YAML workflow definition, a similar approach
could be taken to implement a runner for one of the open standards like
[Common Workflow Language](https://github.com/common-workflow-language/common-workflow-language) or
[Workflow Definition Language](https://github.com/broadinstitute/wdl).

## Table of contents

* [Prerequisites](#prerequisites)
* [Getting started](#getting-started)
* [Docker + Dataflow vs custom scripts](#docker-and-dataflow-vs-custom-scripts)
* [Creating your own workflows](#creating-your-own-workflows)
* [Hello, world](#hello-world)
* [Testing](#testing)
* [What next](#what-next)
* [FAQ and Troubleshooting](#faq-and-troubleshooting)

## Prerequisites

1. Sign up for a Google Cloud Platform account and
[create a project](https://console.cloud.google.com/project?).
2. [Enable the APIs](https://console.cloud.google.com/flows/enableapi?apiid=genomics,dataflow,storage_component,compute_component&redirect=https://console.cloud.google.com)
for Cloud Dataflow, Google Genomics, Compute Engine and Cloud Storage.
3. [Install the Google Cloud SDK](https://cloud.google.com/sdk/) and run

    gcloud init
    gcloud auth login
    gcloud auth application-default login

## Getting started

Run the following steps on your laptop or local workstation:

1. Clone this repository:

    git clone https://github.com/googlegenomics/dockerflow

2. Build it with Maven:

    cd dockerflow
    mvn package -DskipTests

3. Set up the `DOCKERFLOW_HOME` environment variable:

    export DOCKERFLOW_HOME="$(pwd)"
    export PATH="${PATH}":"${DOCKERFLOW_HOME}/bin"
    chmod +x bin/*

4. Run a sample workflow:

    dockerflow --project=MY-PROJECT \
        --workflow-file=src/test/resources/linear-graph.yaml \
        --workspace=gs://MY-BUCKET/MY-PATH \
        --input BASE_DIR=gs://MY-BUCKET/MY-PATH/MY-INPUT-FILE.txt \
        --runner=DirectPipelineRunner

Set `MY-PROJECT` to your cloud project name, and set `MY-BUCKET` and `MY-PATH`
to your cloud bucket and folder. You'll need to have a text file in Cloud Storage
as well, here called `MY-INPUT-FILE.txt`. You can copy one from
[src/test/resources/input-one.txt](src/test/resources/input-one.txt):

    gsutil cp src/test/resources/input-one.txt gs://MY-BUCKET/MY-PATH/input-one.txt

The example runs Dataflow locally with the `DirectPipelineRunner` for
orchestration. It will spin up VMs remotely in Google Cloud to run the
individual tasks in Docker. Execution of the local Dataflow runner will block
until the workflow completes. The DirectPipelineRunner is useful for
debugging, because you'll see all of the log messages output to your shell.

To run in your cloud project and see the Dataflow UI in the Google Cloud
Console, remove the `--runner` option to use the default Dataflow runner.

## Docker and Dataflow vs custom scripts

How is Dataflow better than a shell script?

Dataflow provides:

* **Complex workflow orchestration**: Dataflow supports arbitrary directed
acyclic graphs. The logic of branching, merging, parallelizing, and monitoring is
all handled automatically.
* **Monitoring**:
[Dataflow's monitoring UI](https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf)
shows you what jobs you've run and shows an execution graph with nice details.
* **Debugging**: Dataflow keeps logs at each step, and you can view them directly
in the UI.
* **Task retries**: Dataflow automatically retries failed steps.
Dockerflow adds support for preemptible VMs, rerunning any failures on standard VM
instances.
* **Parallelization**: Dataflow can run 100 tasks on 100 files and
keep track of them all for you, retrying any steps that failed.
* **Optimization**: Dataflow optimizes the execution graph for your workflow.

Docker provides:

* **Portability**: Tools packaged in Docker images can be run
anywhere Docker is supported.
* **A library of pre-packaged tools**: The community has contributed a growing
library of popular tools.

## Creating your own workflows

The Dockerflow command-line tool expects a static workflow graph definition in
YAML, or the name of a Java class that defines the workflow.

If you'd rather define workflows in code, you'll use the Java SDK. See

* [src/test/java/com/google/cloud/genomics/dockerflow/examples](src/test/java/com/google/cloud/genomics/dockerflow/examples)

Everything that can be done in YAML can also be done, more compactly, in Java
code, and Java provides greater flexibility.

The documentation below provides details for defining workflows in YAML. To
create a workflow, you define the tasks and execution graph. You can
define the tasks and execution graph in a single file, or your graph can
reference tasks that are defined in separate YAML files.

The workflow format is recursive: a workflow can contain multiple steps, and
each step can itself be a workflow.

## Hello, world

Dockerflow has lots of features for creating complex, real-world workflows.
The best way to get started with your own workflows is to look at the
[examples](examples).

The [hello, world](examples/hello) example shows the most basic workflow
in both YAML and Java.

All of the advanced features can be seen in the more complex
[GATK](examples/gatk) example. It, too, offers both YAML and Java
versions, and shows pretty much the full range of functionality.

## Testing

Workflows can be tricky to test and debug. Dataflow has a local runner that
makes it easy to fix the obvious bugs before running in your Google Cloud
Platform project.

To test locally, set `--runner=DirectPipelineRunner`. Now Dataflow will run on
your local computer rather than in the cloud. You'll be able to see all of the
log messages.

Two other flags are really useful for testing: `--test=true` and
`--resume=true`.

When you set `test` to true, you'll get a dry run of the pipeline. No calls to
the Pipelines API will be made. Instead, the code will print a log message and
continue. That lets you do a first sanity check before submitting and running on
the cloud, and catch many errors, like mismatched parameters.

When you use the `resume` flag, Dockerflow will try to resume a failed pipeline
run. For example, suppose you're trying to get your 10-step pipeline to work,
and it fails on step 6. You fix your YAML definition file or your Java code,
and now you want to re-run the pipeline. However, it takes 1 hour to run steps
1-5, which is a long time to wait. With `--resume=true`, Dockerflow will check
whether the outputs of each step already exist, and if they do, it will print a
log message and proceed to the next step. That means it takes only seconds to
skip ahead to the failed step and try to rerun it.
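As a sketch (the project, bucket, and file names are placeholders), you might first do a
local dry run, then, after fixing a failed step, rerun with resume:

    # Dry run: log each step without calling the Pipelines API
    dockerflow --project=MY-PROJECT \
        --workflow-file=src/test/resources/linear-graph.yaml \
        --workspace=gs://MY-BUCKET/MY-PATH \
        --input BASE_DIR=gs://MY-BUCKET/MY-PATH/MY-INPUT-FILE.txt \
        --runner=DirectPipelineRunner \
        --test=true

    # Resume a failed run: steps whose outputs already exist are skipped
    dockerflow --project=MY-PROJECT \
        --workflow-file=src/test/resources/linear-graph.yaml \
        --workspace=gs://MY-BUCKET/MY-PATH \
        --input BASE_DIR=gs://MY-BUCKET/MY-PATH/MY-INPUT-FILE.txt \
        --resume=true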
## What next?

* See the YAML examples in the [src/test/resources](src/test/resources) directory.
* See the Java code examples in
  * [examples](examples)
  * [src/test/java/com/google/cloud/genomics/dockerflow/examples](src/test/java/com/google/cloud/genomics/dockerflow/examples)
* Learn about the [Pipelines API](https://cloud.google.com/genomics/v1alpha2/pipelines).
* Read about [Dataflow](https://cloud.google.com/dataflow).
* Write your own workflows!

## FAQ and Troubleshooting

### What if I want to run large batch jobs?

Google Cloud Platform has various quotas that limit how many VMs and IP addresses you can allocate, and how much disk space you can use. Some tips:

* [Check and potentially increase quotas](https://console.cloud.google.com/compute/quotas)
* Consider listing all zones in the region or geography where your Cloud Storage bucket is (e.g., for standard buckets in the EU, use "eu-"; for regional buckets in US central, use "us-central-")
* If quotas are exceeded, the pipeline system will queue jobs until resources become available
* Dockerflow will abort if any job fails; use the `--abort=false` flag for different behavior
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License.
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/genomics/dockerflow/DockerflowTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2016 Google. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.genomics.dockerflow; 17 | 18 | import static org.junit.Assert.assertEquals; 19 | import static org.junit.Assert.assertTrue; 20 | import static org.junit.Assert.fail; 21 | 22 | import com.google.cloud.genomics.dockerflow.args.WorkflowArgs; 23 | import com.google.cloud.genomics.dockerflow.util.FileUtils; 24 | import com.google.cloud.genomics.dockerflow.util.StringUtils; 25 | import com.google.cloud.genomics.dockerflow.workflow.Workflow; 26 | import com.google.cloud.genomics.dockerflow.workflow.WorkflowFactory; 27 | 28 | import java.io.IOException; 29 | import java.util.Map; 30 | 31 | import org.junit.Test; 32 | import org.slf4j.Logger; 33 | import org.slf4j.LoggerFactory; 34 | 35 | /** 36 | * Unit tests. 37 | * 38 | *
You need to set environment variables: TEST_PROJECT and TEST_GCS_PATH. Then get 39 | * application default credentials: 40 | * 41 | *
gcloud beta auth application-default login
42 | */ 43 | public class DockerflowTest implements DockerflowConstants { 44 | static final Logger LOG = LoggerFactory.getLogger(DockerflowTest.class); 45 | 46 | protected TestUtils utils = new TestUtils(); 47 | 48 | @Test 49 | public void testHelp() throws Exception { 50 | Dockerflow.main(new String[] {"--help"}); 51 | } 52 | 53 | @Test 54 | public void testSingleTaskNoWait() throws Exception { 55 | Dockerflow.main( 56 | new String[] { 57 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 58 | "--" + TASK_FILE + "=" + utils.baseDir + "/task-one.yaml", 59 | "--" + LOGGING + "=" + utils.baseDir + "/async/test.log", 60 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 61 | "--" + INPUTS + "=TaskOne.inputFile=" + utils.baseDir + "/input-one.txt", 62 | "--" + OUTPUTS + "=TaskOne.outputFile=" + utils.baseDir + "/async/output-one.txt", 63 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 64 | "--" + RUNNER + "=" + utils.runner 65 | }); 66 | if (utils.checkOutput) { 67 | try { 68 | FileUtils.readAll(utils.baseDir + "/async/output-one.txt"); 69 | fail(); 70 | } catch (IOException e) { 71 | assertTrue(e.getMessage().startsWith("HTTP error: 404")); 72 | } 73 | } 74 | } 75 | 76 | @Test 77 | public void testSingleTask() throws Exception { 78 | Dockerflow.main( 79 | new String[] { 80 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 81 | "--" + TASK_FILE + "=" + utils.baseDir + "/task-one.yaml", 82 | "--" + LOGGING + "=" + utils.baseDir + "/task/test.log", 83 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 84 | "--" + INPUTS + "=TaskOne.inputFile=" + utils.baseDir + "/input-one.txt", 85 | "--" + OUTPUTS + "=TaskOne.outputFile=" + utils.baseDir + "/task/output-one.txt", 86 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 87 | "--" + RUNNER + "=" + utils.runner 88 | }); 89 | if (utils.checkOutput) { 90 | String output = TestUtils.readAll(utils.baseDir + "/task/output-one.txt"); 91 | LOG.info("\"" + output + "\", length=" + output.length()); 92 | 93 | assertEquals("Output doesn't match expected", TestUtils.OUTPUT_ONE, output); 94 | } 95 | } 96 | 97 | @Test 98 | public void testParameterSubstitution() throws Exception { 99 | String[] args = 100 | new String[] { 101 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 102 | "--" + WORKFLOW_FILE + "=" + utils.baseDir + "/param-sub.yaml", 103 | "--" + LOGGING + "=" + utils.baseDir + "/task", 104 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 105 | "--" + INPUTS + "=stepOne.inputFile=" + utils.baseDir + "/input-two.txt," 106 | + "BASE_DIR=" + utils.baseDir + "/task", 107 | "--" + OUTPUTS + "=stepOne.outputFile=" + utils.baseDir + "/task/output-two.txt", 108 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 109 | "--" + RUNNER + "=" + utils.runner 110 | }; 111 | Workflow w = WorkflowFactory.create(args); 112 | WorkflowArgs wa = WorkflowFactory.createArgs(args); 113 | w.setArgs(wa); 114 | w.applyArgs(wa); 115 | 116 | String s = StringUtils.toJson(w); 117 | LOG.info(s); 118 | 119 | assertTrue("Search and replace of globals failed", s.indexOf("${BASE_DIR}") < 0); 120 | } 121 | 122 | @Test 123 | public void testLinearGraph() throws Exception { 124 | Dockerflow.main( 125 | new String[] { 126 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 127 | "--" + WORKFLOW_FILE + "=" + utils.baseDir + "/linear-graph.yaml", 128 | "--" + WORKSPACE + "=" + utils.baseDir + "/linear", 129 | "--" + INPUTS + "=BASE_DIR=" + utils.baseDir, 130 | "--" + TEST + "=" + 
DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 131 | "--" + RUNNER + "=" + utils.runner 132 | }); 133 | if (utils.checkOutput) { 134 | String output = TestUtils.readAll(utils.baseDir + "/linear/stepTwo/output-two.txt"); 135 | LOG.info("\"" + output + "\""); 136 | 137 | assertEquals("Output doesn't match expected", TestUtils.OUTPUT_ONE_TWO, output); 138 | } 139 | } 140 | 141 | @Test 142 | public void testScatter() throws Exception { 143 | Dockerflow.main( 144 | new String[] { 145 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 146 | "--" + WORKFLOW_FILE + "=" + utils.baseDir + "/parallel-graph.yaml", 147 | "--" + LOGGING + "=" + utils.baseDir + "/parallel", 148 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 149 | "--" + WORKSPACE + "=" + utils.baseDir + "/parallel", 150 | "--" + INPUTS + "=BASE_DIR=" + utils.baseDir, 151 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 152 | "--" + RUNNER + "=" + utils.runner 153 | }); 154 | if (utils.checkOutput) { 155 | String output = TestUtils.readAll(utils.baseDir + "/parallel/stepOne/1/output-one.txt"); 156 | LOG.info("\"" + output + "\""); 157 | 158 | if (!TestUtils.OUTPUT_ONE.equals(output) && !TestUtils.OUTPUT_TWO.equals(output)) { 159 | fail("Output doesn't match expected"); 160 | } 161 | } 162 | } 163 | 164 | @Test 165 | public void testGather() throws Exception { 166 | Dockerflow.main( 167 | new String[] { 168 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 169 | "--" + WORKFLOW_FILE + "=" + utils.baseDir + "/gather-graph.yaml", 170 | "--" + LOGGING + "=" + utils.baseDir + "/gather", 171 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 172 | "--" + WORKSPACE + "=" + utils.baseDir + "/gather", 173 | "--" + INPUTS + "=BASE_DIR=" + utils.baseDir, 174 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 175 | "--" + RUNNER + "=" + utils.runner 176 | }); 177 | if (utils.checkOutput) { 178 | String output = TestUtils.readAll(utils.baseDir + "/gather/stepTwo/output-two.txt"); 179 | LOG.info("\"" + output + "\""); 180 | 181 | if (!TestUtils.OUTPUT_ONE_TWO.equals(output)) { 182 | fail("Output doesn't match expected"); 183 | } 184 | } 185 | } 186 | 187 | @Test 188 | public void testReorderedGraph() throws Exception { 189 | Dockerflow.main( 190 | new String[] { 191 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 192 | "--" + WORKFLOW_FILE + "=" + utils.baseDir + "/reordered-graph.yaml", 193 | "--" + LOGGING + "=" + utils.baseDir + "/reordered", 194 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 195 | "--" + WORKSPACE + "=" + utils.baseDir + "/reordered", 196 | "--" + INPUTS + "=BASE_DIR=" + utils.baseDir, 197 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 198 | "--" + RUNNER + "=" + utils.runner 199 | }); 200 | if (utils.checkOutput) { 201 | String output = TestUtils.readAll(utils.baseDir + "/reordered/output-one.txt"); 202 | LOG.info("\"" + output + "\""); 203 | 204 | assertEquals("Output doesn't match expected", TestUtils.OUTPUT_TWO_ONE, output); 205 | } 206 | } 207 | 208 | @Test 209 | public void testBranchingGraph() throws Exception { 210 | Dockerflow.main( 211 | new String[] { 212 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 213 | "--" + WORKFLOW_FILE + "=" + utils.baseDir + "/branching-graph.yaml", 214 | "--" + LOGGING + "=" + utils.baseDir + "/branching", 215 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 216 | "--" + INPUTS + "=BASE_DIR=" + utils.baseDir, 217 | "--" + TEST + "=" + 
DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 218 | "--" + RUNNER + "=" + utils.runner 219 | }); 220 | if (utils.checkOutput) { 221 | String output = TestUtils.readAll(utils.baseDir + "/branching/output-three.txt"); 222 | LOG.info("\"" + output + "\""); 223 | 224 | assertEquals("Output doesn't match expected", TestUtils.OUTPUT_ONE_TWO, output); 225 | } 226 | } 227 | 228 | @Test 229 | public void testComplexGraph() throws Exception { 230 | Dockerflow.main( 231 | new String[] { 232 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 233 | "--" + WORKFLOW_FILE + "=" + utils.baseDir + "/complex-graph.yaml", 234 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 235 | "--" + LOGGING + "=" + utils.baseDir + "/complex", 236 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 237 | "--" + RUNNER + "=" + utils.runner 238 | }); 239 | if (utils.checkOutput) { 240 | String output = TestUtils.readAll(utils.baseDir + "/complex/stepSix/task.log"); 241 | LOG.info("\"" + output); 242 | } 243 | } 244 | 245 | @Test 246 | public void testFolderCopy() throws Exception { 247 | Dockerflow.main( 248 | new String[] { 249 | "--" + PROJECT + "=" + TestUtils.TEST_PROJECT, 250 | "--" + WORKFLOW_FILE + "=" + utils.baseDir + "/folder-copy.yaml", 251 | "--" + WORKSPACE + "=" + utils.baseDir + "/folder-copy", 252 | "--" + TEST + "=" + DockerflowConstants.DIRECT_RUNNER.equals(utils.runner), 253 | "--" + INPUTS + "=stepOne.inputFolder=../../test-folder", 254 | "--" + OUTPUTS + "=stepOne.outputFolder=test-output", 255 | "--" + STAGING + "=" + utils.baseDir + "/dataflow", 256 | "--" + RUNNER + "=" + utils.runner 257 | }); 258 | if (utils.checkOutput) { 259 | String output = TestUtils.readAll(utils.baseDir + "/folder-copy/stepOne/test-output/file1.txt"); 260 | LOG.info("\"" + output + "\""); 261 | 262 | assertEquals("Folder copy failed", TestUtils.OUTPUT_ONE, output); 263 | } 264 | } 265 | 266 | @Test 267 | public void testFileParsing() throws Exception { 268 | Map m = StringUtils.parseParameters("key_1=${val_1},key_2=${val_2}", false); 269 | 270 | assertEquals("Wrong number of keys", 2, m.size()); 271 | } 272 | 273 | @Test 274 | public void testArrayParsing() throws Exception { 275 | Map m = StringUtils.parseParameters("foo[\" sep=val \"]=bar", false); 276 | 277 | assertEquals("Wrong number of keys", 1, m.size()); 278 | assertEquals("Array key parsed incorrectly", "foo[\" sep=val \"]", m.keySet().iterator().next()); 279 | } 280 | } 281 | --------------------------------------------------------------------------------