├── .circleci └── config.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── DETAILS.md ├── LICENSE ├── README.md ├── bin ├── caper ├── run_mysql_server_docker.sh └── run_mysql_server_singularity.sh ├── caper ├── __init__.py ├── __main__.py ├── arg_tool.py ├── backward_compatibility.py ├── caper_args.py ├── caper_backend_conf.py ├── caper_base.py ├── caper_client.py ├── caper_init.py ├── caper_labels.py ├── caper_runner.py ├── caper_wdl_parser.py ├── caper_workflow_opts.py ├── cli.py ├── cli_hpc.py ├── cromwell.py ├── cromwell_backend.py ├── cromwell_metadata.py ├── cromwell_rest_api.py ├── cromwell_workflow_monitor.py ├── dict_tool.py ├── hocon_string.py ├── hpc.py ├── nb_subproc_thread.py ├── resource_analysis.py ├── server_heartbeat.py ├── singularity.py └── wdl_parser.py ├── docs ├── conf_aws.md ├── conf_encode_workshop_2019.md ├── conf_gcp.md └── resource_param.md ├── scripts ├── aws_caper_server │ ├── README.md │ ├── TROUBLESHOOTING.md │ └── create_instance.sh ├── gcp_caper_server │ ├── README.md │ ├── TROUBLESHOOTING.md │ └── create_instance.sh └── resource_monitor │ └── resource_monitor.sh ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── example_wdl.py ├── pytest.ini ├── test_arg_tool.py ├── test_caper_labels.py ├── test_caper_wdl_parser.py ├── test_caper_workflow_opts.py ├── test_cli_run.py ├── test_cli_server_client_gcp.py ├── test_cromwell.py ├── test_cromwell_backend.py ├── test_cromwell_metadata.py ├── test_cromwell_rest_api.py ├── test_dict_tool.py ├── test_hocon_string.py ├── test_nb_subproc_thread.py ├── test_resource_analysis.py ├── test_server_heartbeat.py ├── test_singularity.py └── test_wdl_parser.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2.1 3 | 4 | defaults: 5 | machine: 6 | image: circleci/classic:latest 7 | working_directory: ~/caper 8 | 9 | 10 | machine_defaults: &machine_defaults 11 | machine: 12 | image: ubuntu-2004:202010-01 13 | working_directory: ~/caper 14 | 15 | 16 | update_apt: &update_apt 17 | name: Update apt 18 | command: | 19 | sudo apt-get update -y 20 | sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata 21 | 22 | 23 | install_python3: &install_python3 24 | name: Install python3, pip3, java 25 | command: | 26 | sudo apt-get install -y software-properties-common git wget curl python3 python3-pip default-jre 27 | 28 | 29 | install_singularity: &install_singularity 30 | name: Install Singularity (container) 31 | command: | 32 | sudo apt-get install -y alien squashfs-tools libseccomp-dev 33 | sudo wget https://github.com/sylabs/singularity/releases/download/v3.11.3/singularity-ce-3.11.3-1.el8.x86_64.rpm 34 | sudo alien -d singularity-ce-3.11.3-1.el8.x86_64.rpm 35 | sudo apt-get install -y ./singularity-ce_3.11.3-2_amd64.deb 36 | singularity --version 37 | 38 | 39 | install_py3_packages: &install_py3_packages 40 | name: Install Python packages 41 | command: | 42 | sudo python3 -m pip install --upgrade pip 43 | sudo pip3 install PyYAML --ignore-installed 44 | sudo pip3 install pyOpenSSL pytest requests dateparser filelock autouri miniwdl pyhocon numpy pandas scikit-learn matplotlib six 45 | #sudo pip3 install pyOpenSSL pytest requests dateparser filelock autouri miniwdl pyhocon numpy pandas scikit-learn matplotlib "six>=1.13.0" "PyYAML==3.11" 46 | #sudo pip3 install --upgrade pyasn1-modules 47 | 48 | 49 | install_gcs_lib: &install_gcs_lib 50 | name: Install Google Cloud SDK (gcloud and gsutil) and Python API 
(google-cloud-storage) 51 | command: | 52 | echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list 53 | curl -k https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - 54 | sudo apt-get update && sudo apt-get install google-cloud-sdk -y 55 | sudo pip3 install google-cloud-storage 56 | 57 | 58 | install_aws_lib: &install_aws_lib 59 | name: Install AWS Python API (boto3) and CLI (awscli) 60 | command: | 61 | sudo pip3 install boto3 awscli 62 | 63 | 64 | jobs: 65 | pytest: 66 | <<: *machine_defaults 67 | steps: 68 | - checkout 69 | - run: *update_apt 70 | - run: *install_python3 71 | - run: *install_singularity 72 | - run: *install_py3_packages 73 | - run: *install_gcs_lib 74 | - run: *install_aws_lib 75 | - run: 76 | no_output_timeout: 60m 77 | command: | 78 | cd tests/ 79 | 80 | # service account's key file 81 | echo ${GCLOUD_SERVICE_ACCOUNT_SECRET_JSON} > tmp_key.json 82 | export KEY_FILE="${PWD}/tmp_key.json" 83 | 84 | # run pytest 85 | pytest --ci-prefix ${CIRCLE_WORKFLOW_ID} \ 86 | --gcs-root ${GCS_ROOT} \ 87 | --gcp-service-account-key-json ${KEY_FILE} \ 88 | --gcp-prj ${GOOGLE_PROJECT_ID} \ 89 | --debug-caper \ 90 | -vv -s 91 | 92 | # auth for gsutil 93 | export GOOGLE_APPLICATION_CREDENTIALS=${KEY_FILE} 94 | export GOOGLE_CLOUD_PROJECT=${GOOGLE_PROJECT_ID} 95 | export BOTO_CONFIG=/dev/null 96 | 97 | # clean up 98 | rm -f tmp_key.json 99 | gsutil -m rm -rf ${GCS_ROOT}/caper_out/${CIRCLE_WORKFLOW_ID} || true 100 | 101 | 102 | # Define workflow here 103 | workflows: 104 | version: 2.1 105 | build_workflow: 106 | jobs: 107 | - pytest 108 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # dev 107 | src/test_dev 108 | src/caper_tmp 109 | caper_tmp 110 | src/test_caper_uri/ 111 | 112 | cromwell.out 113 | dev/ 114 | tests/hpc/ 115 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | multi_line_output = 3 3 | include_trailing_comma = True 4 | force_grid_wrap = 0 5 | use_parentheses = True 6 | line_length = 88 7 | known_third_party = WDL,autouri,distutils,humanfriendly,matplotlib,numpy,pandas,pyhocon,pytest,requests,setuptools,sklearn 8 | 9 | [mypy-bin] 10 | ignore_errors = True 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | repos: 3 | - repo: https://github.com/psf/black 4 | rev: 22.3.0 5 | hooks: 6 | - id: black 7 | args: [--skip-string-normalization] 8 | language_version: python3 9 | 10 | - repo: https://github.com/asottile/seed-isort-config 11 | rev: v1.9.2 12 | hooks: 13 | - id: seed-isort-config 14 | 15 | - repo: https://github.com/pre-commit/mirrors-isort 16 | rev: v4.3.21 17 | hooks: 18 | - id: isort 19 | language_version: python3 20 | 21 | - repo: https://github.com/detailyang/pre-commit-shell 22 | rev: v1.0.6 23 | hooks: 24 | - id: shell-lint 25 | args: [--exclude, 'SC1078,SC1079'] 26 | 27 | - repo: https://github.com/pre-commit/pre-commit-hooks 28 | rev: v2.2.3 29 | hooks: 30 | - id: flake8 31 | args: [--ignore, 'E203,E501,W503'] 32 | - id: trailing-whitespace 33 | - id: end-of-file-fixer 34 | - id: debug-statements 35 | - id: check-yaml 36 | 37 | # - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt 38 | # rev: 0.0.10 39 | # hooks: 40 | # - id: yamlfmt 41 | # args: [--mapping, '2', --sequence, '4', --offset, '2'] 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 ENCODE DCC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![CircleCI](https://circleci.com/gh/ENCODE-DCC/caper.svg?style=svg)](https://circleci.com/gh/ENCODE-DCC/caper)
2 |
3 |
4 | ## Introduction
5 |
6 | Caper (Cromwell Assisted Pipeline ExecutoR) is a wrapper Python package for [Cromwell](https://github.com/broadinstitute/cromwell/). Caper wraps Cromwell to run pipelines on multiple platforms like GCP (Google Cloud Platform), AWS (Amazon Web Services) and HPCs like SLURM, SGE, PBS/Torque and LSF. It provides an easier way of running Cromwell server/run modes by automatically composing necessary input files for Cromwell. Caper can run each task in a specified environment (Docker, Singularity or Conda). Also, Caper automatically localizes all files (keeping their directory structure) defined in your input JSON and command line according to the specified backend. For example, if your chosen backend is GCP and files in your input JSON are on S3 buckets (or even URLs) then Caper automatically transfers `s3://` and `http(s)://` files to a specified `gs://` bucket directory. Supported URIs are `s3://`, `gs://`, `http(s)://` and local absolute paths. You can use such URIs both in CLI and in input JSON. Private URIs are also accessible if you authenticate with cloud platform CLIs like `gcloud auth` and `aws configure`, or with `~/.netrc` for URLs.
7 |
8 |
9 | ## Installation for Google Cloud Platform
10 |
11 | See [this](scripts/gcp_caper_server/README.md) for details.
12 |
13 |
14 | ## Installation for AWS
15 |
16 | See [this](scripts/aws_caper_server/README.md) for details.
17 |
18 |
19 | ## Installation for local computers and HPCs
20 |
21 | 1) Make sure that you have Java (>= 11) and Python (>= 3.6) installed on your system, then use `pip` to install Caper.
22 |
23 | ```bash
24 | $ pip install caper
25 | ```
26 |
27 | 2) If you see an error message like `caper: command not found` after installing then add the following line to the bottom of `~/.bashrc` and re-login.
28 |
29 | ```bash
30 | export PATH=$PATH:~/.local/bin
31 | ```
32 |
33 | 3) Choose a backend from the following table and initialize Caper. This will create a default Caper configuration file `~/.caper/default.conf`, which has only the required parameters for each backend. `caper init` will also install Cromwell/Womtool JARs under `~/.caper/`. Downloading those files can take up to 10 minutes. Once they are installed, Caper can work completely offline with local data files.
34 |
35 | **Backend**|**Description**
36 | :--------|:-----
37 | local | local computer without a cluster engine
38 | slurm | SLURM (e.g. Stanford Sherlock and SCG)
39 | sge | Sun GridEngine
40 | pbs | PBS cluster
41 | lsf | LSF cluster
42 |
43 | > **IMPORTANT**: `sherlock` and `scg` backends have been deprecated. Use the `slurm` backend instead and follow the instruction comments in the configuration file.
44 |
45 | ```bash
46 | $ caper init [BACKEND]
47 | ```
48 |
49 | 4) Edit `~/.caper/default.conf` and follow the instructions in there. **CAREFULLY READ THE INSTRUCTIONS AND DO NOT LEAVE IMPORTANT PARAMETERS UNDEFINED, OR CAPER WILL NOT WORK CORRECTLY**
50 |
51 |
52 | ## Docker, Singularity and Conda
53 |
54 | For local backends (`local`, `slurm`, `sge`, `pbs` and `lsf`), you can use `--docker`, `--singularity` or `--conda` to run WDL tasks in a pipeline within one of these environments. For example, `caper run ... --singularity docker://ubuntu:latest` will run each task within a Singularity image built from the docker image `ubuntu:latest`. These parameters can also be used as flags. If used as a flag, Caper will try to find a default docker/singularity/conda environment in the WDL. e.g. all ENCODE pipelines have default docker/singularity images defined within WDL's meta section (under key `caper_docker` or `default_docker`).
55 |
56 | > **IMPORTANT**: Docker/singularity/conda defined in Caper's configuration file or in CLI (`--docker`, `--singularity` and `--conda`) will be overridden by those defined in a WDL task's `runtime`. We provide these parameters to define a default/base environment for a pipeline, not to override a WDL task's `runtime`.
57 |
58 | For Conda users, make sure that you have installed the pipeline's Conda environments before running pipelines. Caper only knows the Conda environment's name. You don't need to activate any Conda environment before running a pipeline since Caper will internally run `conda run -n ENV_NAME TASK_SHELL_SCRIPT` for each task.
59 |
60 | Take a look at the following examples:
61 | ```bash
62 | $ caper run test.wdl --docker # can be used as a flag too, Caper will find a default docker image in WDL if defined
63 | $ caper run test.wdl --singularity docker://ubuntu:latest # define default singularity image in the command line
64 | $ caper hpc submit test.wdl --singularity --leader-job-name test1 # submit to job engine and use singularity defined in WDL
65 | $ caper submit test.wdl --conda your_conda_env_name # running caper server is required
66 | ```
67 |
68 | An environment defined here will be overridden by those defined in a WDL task's `runtime`. Therefore, think of this as a base/default environment for your pipeline. You can define per-task docker/singularity images to override those defined in Caper's command line. For example:
69 | ```wdl
70 | task my_task {
71 |     ...
72 |     runtime {
73 |         docker: "ubuntu:latest"
74 |         singularity: "docker://ubuntu:latest"
75 |     }
76 | }
77 | ```
78 |
79 | For cloud backends (`gcp` and `aws`), Caper will automatically try to find a base docker image defined in your WDL. For other pipelines, define a base docker image in Caper's CLI or directly in each WDL task's `runtime`.
80 |
81 |
82 | ## Running pipelines on HPCs
83 |
84 | Use `--singularity` or `--conda` in CLI to run a pipeline inside a Singularity image or Conda environment. Most HPCs do not allow docker. For example, `caper hpc submit ... --singularity` will submit a Caper process to the job engine as a leader job.
Then Caper's leader job will submit its child jobs to the job engine so that both leader and child jobs can be found with `squeue` or `qstat`.
85 |
86 | Use `caper hpc list` to list all leader jobs. Use `caper hpc abort JOB_ID` to abort a running leader job. **DO NOT DIRECTLY CANCEL A JOB USING A CLUSTER COMMAND LIKE `scancel` OR `qdel`**; otherwise only your leader job will be canceled and its child jobs will keep running.
87 |
88 | Here are some example command lines to submit Caper as a leader job. Make sure that you correctly configured Caper with `caper init` and filled in all parameters in the conf file `~/.caper/default.conf`.
89 |
90 | There is an extra set of parameters `--file-db [METADATA_DB_PATH_FOR_CALL_CACHING]` to use call-caching (restarting workflows by re-using previous outputs). If you want to restart a failed workflow, use the same metadata DB path and the pipeline will start from where it left off. It will actually start over but will reuse (soft-link) previous outputs.
91 |
92 | ```bash
93 | # make a new output directory for a workflow.
94 | $ cd [OUTPUT_DIR]
95 |
96 | # Example with Singularity without using call-caching.
97 | $ caper hpc submit [WDL] -i [INPUT_JSON] --singularity --leader-job-name GOOD_NAME1
98 |
99 | # Example with Conda and using call-caching (restarting a workflow from where it left off)
100 | # Use the same --file-db PATH for the next re-run, then Caper will collect and soft-link previous outputs.
101 | # If you see any DB connection error, then replace it with "--db in-memory", but call-caching will be disabled.
102 | $ caper hpc submit [WDL] -i [INPUT_JSON] --conda --leader-job-name GOOD_NAME2 --file-db [METADATA_DB_PATH]
103 |
104 | # List all leader jobs.
105 | $ caper hpc list
106 |
107 | # Check leader job's STDOUT file to monitor workflow's status.
108 | # Example for SLURM
109 | $ tail -f slurm-[JOB_ID].out
110 |
111 | # Cromwell's log will be written to cromwell.out* in the same directory.
112 | # It will be helpful for monitoring your workflow in detail.
113 | $ ls -l cromwell.out*
114 |
115 | # Abort a leader job (this will cascade-kill all its child jobs).
116 | # If you directly use a job engine's command like scancel or qdel then child jobs will still remain running.
117 | $ caper hpc abort [JOB_ID]
118 | ```
119 |
120 | ## Restarting a pipeline on a local machine (and HPCs)
121 |
122 | Caper uses Cromwell's call-caching to restart a pipeline from where it left off. This database is automatically generated under `local_out_dir` defined in the configuration file `~/.caper/default.conf`. The DB file name simply consists of the WDL's basename and the input JSON file's basename, so you can run the same `caper run` command line in the same working directory to restart a workflow.
123 |
124 | ```bash
125 | # for standalone/client
126 | $ caper run ... --db in-memory
127 |
128 | # for server
129 | $ caper server ... --db in-memory
130 | ```
131 |
132 |
133 | ## DB connection timeout
134 |
135 | If you see any DB connection timeout error, that means you have multiple Caper/Cromwell processes trying to connect to the same file DB. Check any running Cromwell processes with `ps aux | grep cromwell` and close them with `kill PID`. If that does not fix the problem, then use `caper run ... --db in-memory` to disable Cromwell's metadata DB, but you will not be able to use call-caching.
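
For example, a typical recovery sequence looks like the following sketch (bracketed values like `[PID]`, `[WDL]` and `[INPUT_JSON]` are placeholders):

```bash
# find Caper/Cromwell processes that may be holding the same file DB
$ ps aux | grep cromwell

# close them (replace [PID] with a process ID found above)
$ kill [PID]

# if the timeout still occurs, fall back to an in-memory metadata DB
# (this disables call-caching)
$ caper run [WDL] -i [INPUT_JSON] --db in-memory
```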
136 | 137 | 138 | ## Customize resource parameters on HPCs 139 | 140 | If default settings of Caper does not work with your HPC, then see [this document](docs/resource_param.md) to manually customize resource command line (e.g. `sbatch ... [YOUR_CUSTOM_PARAMETER]`) for your chosen backend. 141 | 142 | # DETAILS 143 | 144 | See [details](DETAILS.md). 145 | -------------------------------------------------------------------------------- /bin/caper: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | try: 3 | from caper.cli import main 4 | except ImportError: 5 | import os 6 | import sys 7 | 8 | script_path = os.path.dirname(os.path.realpath(__file__)) 9 | sys.path.append(os.path.join(script_path, "../")) 10 | from caper.cli import main 11 | 12 | if __name__ == "__main__": 13 | main() 14 | -------------------------------------------------------------------------------- /bin/run_mysql_server_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ $# -lt 1 ]; then 5 | echo "Usage: ./run_mysql_server_docker.sh [DB_DIR] [PORT] [CONTAINER_NAME] \ 6 | [MYSQL_USER] [MYSQL_PASSWORD] [MYSQL_DB_NAME]" 7 | echo 8 | echo "Example: run_mysql_server_docker.sh ~/cromwell_data_dir 3307 mysql_cromwell2" 9 | echo 10 | echo "[DB_DIR]: This directory will be mapped to '/var/lib/mysql' inside a container" 11 | echo "[PORT] (optional): MySQL database port for docker host (default: 3306)" 12 | echo "[CONTAINER_NAME] (optional): MySQL container name (default: mysql_cromwell)" 13 | echo "[MYSQL_USER] (optional): MySQL username (default: cromwell)" 14 | echo "[MYSQL_PASSWORD] (optional): MySQL password (default: cromwell)" 15 | echo "[MYSQL_DB_NAME] (optional): MySQL database name. Match it with Caper's --mysql-db-name (default: cromwell)" 16 | echo 17 | exit 1 18 | fi 19 | 20 | # check if DB_DIR exists 21 | if [ ! -d "$1" ]; then 22 | echo "[DB_DIR] ($1) doesn't exists." 23 | exit 1 24 | fi 25 | DB_DIR=$(cd "$(dirname "$1")" && pwd -P)/$(basename "$1") 26 | 27 | # check if PORT is taken 28 | if [ $# -gt 1 ]; then PORT=$2; else PORT=3306; fi 29 | if netstat -tulpn 2>/dev/null | grep LISTEN | grep ":${PORT}" | grep -q ^; then 30 | echo "[PORT] (${PORT}) already taken." 
31 | exit 1 32 | fi 33 | 34 | if [ $# -gt 2 ]; then CONTAINER_NAME=$3; else CONTAINER_NAME=mysql_cromwell; fi 35 | if [ $# -gt 3 ]; then MYSQL_USER=$4; else MYSQL_USER=cromwell; fi 36 | if [ $# -gt 4 ]; then MYSQL_PASSWORD=$5; else MYSQL_PASSWORD=cromwell; fi 37 | if [ $# -gt 5 ]; then MYSQL_DB_NAME=$6; else MYSQL_DB_NAME=cromwell; fi 38 | 39 | INIT_SQL=""" 40 | CREATE USER '${MYSQL_USER}'@'%' IDENTIFIED BY '${MYSQL_PASSWORD}'; 41 | GRANT ALL PRIVILEGES ON ${MYSQL_DB_NAME}.* TO '${MYSQL_USER}'@'%' WITH GRANT OPTION; 42 | """ 43 | RAND_STR=$(date | md5sum | awk '{print $1}') 44 | TMP_INIT_DIR=${HOME}/.run_mysql_server_docker/${RAND_STR} 45 | TMP_INIT_FILE=${TMP_INIT_DIR}/init_cromwell_user.sql 46 | 47 | rm -rf "${TMP_INIT_DIR}" 48 | mkdir -p "${TMP_INIT_DIR}" 49 | echo "${INIT_SQL}" > "${TMP_INIT_FILE}" 50 | 51 | echo "SECURITY WARNING: Your MySQL DB username/password can be exposed in \ 52 | ${TMP_INIT_FILE}" 53 | 54 | chown ${UID} -R "${DB_DIR}" 55 | docker run -d --rm --user ${UID} \ 56 | --name "${CONTAINER_NAME}" \ 57 | -v "${DB_DIR}":/var/lib/mysql \ 58 | -v "${TMP_INIT_DIR}":/docker-entrypoint-initdb.d \ 59 | -e MYSQL_ROOT_PASSWORD="${MYSQL_PASSWORD}" \ 60 | -e MYSQL_DATABASE="${MYSQL_DB_NAME}" \ 61 | --publish "${PORT}":3306 mysql:5.7 62 | 63 | echo "All done." 64 | -------------------------------------------------------------------------------- /bin/run_mysql_server_singularity.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | if [ $# -lt 1 ]; then 5 | echo "Usage: ./run_mysql_server_singularity.sh [DB_DIR] [PORT] [CONTAINER_NAME] \ 6 | [MYSQL_USER] [MYSQL_PASSWORD] [MYSQL_DB_NAME]" 7 | echo 8 | echo "Example: run_mysql_server_singularity.sh ~/cromwell_data_dir 3307" 9 | echo 10 | echo "[DB_DIR]: This directory will be mapped to '/var/lib/mysql' inside a container" 11 | echo "[PORT] (optional): MySQL database port for singularity host (default: 3306)" 12 | echo "[CONTAINER_NAME] (optional): MySQL container name (default: mysql_cromwell)" 13 | echo "[MYSQL_USER] (optional): MySQL username (default: cromwell)" 14 | echo "[MYSQL_PASSWORD] (optional): MySQL password (default: cromwell)" 15 | echo "[MYSQL_DB_NAME] (optional): MySQL database name. Match it with Caper's --mysql-db-name (default: cromwell)" 16 | echo 17 | exit 1 18 | fi 19 | 20 | # check if DB_DIR exists 21 | if [ ! -d "$1" ]; then 22 | echo "[DB_DIR] ($1) doesn't exists." 23 | exit 1 24 | fi 25 | DB_DIR=$(cd "$(dirname "$1")" && pwd -P)/$(basename "$1") 26 | 27 | # check if PORT is taken 28 | if [ $# -gt 1 ]; then PORT=$2; else PORT=3306; fi 29 | if netstat -tulpn 2>/dev/null | grep LISTEN | grep ":${PORT}" | grep -q ^; then 30 | echo "[PORT] (${PORT}) already taken." 
31 | exit 1 32 | fi 33 | 34 | if [ $# -gt 2 ]; then CONTAINER_NAME=$3; else CONTAINER_NAME=mysql_cromwell; fi 35 | if [ $# -gt 3 ]; then MYSQL_USER=$4; else MYSQL_USER=cromwell; fi 36 | if [ $# -gt 4 ]; then MYSQL_PASSWORD=$5; else MYSQL_PASSWORD=cromwell; fi 37 | if [ $# -gt 5 ]; then MYSQL_DB_NAME=$6; else MYSQL_DB_NAME=cromwell; fi 38 | 39 | RAND_STR=$(date | md5sum | awk '{print $1}') 40 | 41 | TMP_MYSQLD=${HOME}/.run_mysql_server_singularity/${RAND_STR}/mysqld 42 | TMP_CNF_FILE=${HOME}/.my.cnf 43 | TMP_ROOT_PW_SQL_FILE=${HOME}/.mysqlrootpw 44 | 45 | mkdir -p "${TMP_MYSQLD}" 46 | 47 | cat > "${TMP_CNF_FILE}" << EOM 48 | [mysqld] 49 | innodb_use_native_aio=0 50 | init-file=${HOME}/.mysqlrootpw 51 | port=${PORT} 52 | 53 | [client] 54 | user=root 55 | password='my-secret-pw' 56 | EOM 57 | 58 | cat > "${TMP_ROOT_PW_SQL_FILE}" << EOM 59 | SET PASSWORD FOR 'root'@'localhost' = PASSWORD('my-secret-pw'); 60 | EOM 61 | 62 | singularity instance start \ 63 | --bind "${HOME}" \ 64 | --bind "${DB_DIR}":/var/lib/mysql \ 65 | --bind "${TMP_MYSQLD}":/var/run/mysqld \ 66 | shub://ISU-HPC/mysql "${CONTAINER_NAME}" 67 | 68 | INIT_SQL="CREATE DATABASE ${MYSQL_DB_NAME}; CREATE USER '${MYSQL_USER}'@'%' IDENTIFIED BY '${MYSQL_PASSWORD}'; 69 | GRANT ALL PRIVILEGES ON ${MYSQL_DB_NAME}.* TO '${MYSQL_USER}'@'%' WITH GRANT OPTION;" 70 | 71 | singularity run instance://${CONTAINER_NAME} 72 | echo "Creating user ${MYSQL_USER}" 73 | sleep 10 74 | 75 | singularity exec instance://${CONTAINER_NAME} mysql -e "${INIT_SQL}" || true 76 | 77 | echo "All done. You can ignore any error messages occurred when creating a user (ERROR 1007)." 78 | -------------------------------------------------------------------------------- /caper/__init__.py: -------------------------------------------------------------------------------- 1 | from .caper_client import CaperClient, CaperClientSubmit 2 | from .caper_runner import CaperRunner 3 | 4 | __all__ = ['CaperClient', 'CaperClientSubmit', 'CaperRunner'] 5 | __version__ = '2.3.2' 6 | -------------------------------------------------------------------------------- /caper/__main__.py: -------------------------------------------------------------------------------- 1 | from . import cli 2 | 3 | if __name__ == '__main__': 4 | cli.main() 5 | -------------------------------------------------------------------------------- /caper/arg_tool.py: -------------------------------------------------------------------------------- 1 | import os 2 | from argparse import ArgumentParser 3 | from configparser import ConfigParser, MissingSectionHeaderError 4 | 5 | from distutils.util import strtobool 6 | 7 | 8 | def read_from_conf( 9 | conf_file, conf_section='defaults', conf_key_map=None, no_strip_quote=False 10 | ): 11 | """Read key/value from conf_section of conf_file. 12 | Hyphens (-) in keys will be replace with underscores (_). 13 | All keys and values are considered as strings. 14 | 15 | Use update_parser_defaults_with_conf (2nd return value) 16 | instead to get result with values converted to correct types. 17 | 18 | Args: 19 | conf_file: 20 | INI-like conf file to find defaults key/value. 21 | conf_section: 22 | Section in conf_file. 23 | If section doesn't exist then make one and set as default. 24 | conf_key_map: 25 | Mapping of keys parsed from conf file. 26 | This is useful if you want to replace an old key name with a new one. 27 | e.g. to make your code backward compatible when you want to 28 | change parameter's name. 
29 | no_strip_quote: 30 | Do not strip single/double quotes from values in conf_file. 31 | 32 | Returns: 33 | Dict of key/value in configuration file. 34 | """ 35 | conf_file = os.path.expanduser(conf_file) 36 | if not os.path.exists(conf_file): 37 | raise FileNotFoundError('conf_file does not exist. f={f}'.format(f=conf_file)) 38 | 39 | config = ConfigParser() 40 | with open(conf_file) as fp: 41 | s = fp.read() 42 | try: 43 | config.read_string(s) 44 | except MissingSectionHeaderError: 45 | section = '[{sect}]\n'.format(sect=conf_section) 46 | config.read_string(section + s) 47 | 48 | d_ = dict(config.items(conf_section)) 49 | result = {} 50 | for k, v in d_.items(): 51 | if not no_strip_quote: 52 | v = v.strip('"\'') 53 | if v: 54 | k_ = k.replace('-', '_') 55 | if conf_key_map and k_ in conf_key_map: 56 | k_ = conf_key_map[k_] 57 | result[k_] = v 58 | 59 | return result 60 | 61 | 62 | def update_parsers_defaults_with_conf( 63 | parsers, 64 | conf_file, 65 | conf_section='defaults', 66 | conf_key_map=None, 67 | no_strip_quote=False, 68 | val_type=None, 69 | val_default=None, 70 | ): 71 | """Updates argparse.ArgumentParser's defaults with key/value pairs 72 | defined in conf_file. Also, returns a dict of key/values defined in 73 | conf_file with correct type for each value. 74 | 75 | Type of each value in conf_file can be guessed from: 76 | - default value of ArgumentParser's argument. 77 | - val_type if it's given. 78 | - val_default if it's given. 79 | Otherwise it is considered as string type since ArgumentParser 80 | does not allow direct access to each argument's type. 81 | Therefore, this function tries to best-guess such type. 82 | 83 | This function does not work recursively with subparsers. 84 | Therefore, call this function with each subparser to update each 85 | subparser's defaults. 86 | 87 | Args: 88 | parsers: 89 | List of argparse.ArgumentParser objects to be updated with 90 | new defaults defined in conf_file. Useful for subparsers. 91 | conf_file: 92 | See read_from_conf() 93 | conf_section: 94 | See read_from_conf() 95 | conf_key_map: 96 | See read_from_conf() 97 | no_strip_quote: 98 | See read_from_conf() 99 | val_type: 100 | {key: value's type} where key is a key in conf_file. 101 | If not defined, var's type can be guessed either from 102 | parser's default value or from val_default. 103 | parser's default will override all otehrs. 104 | val_default: 105 | {key: value's default} where key is a key in conf_file. 106 | Type can be guessed from argument's default value. 107 | 108 | Returns: 109 | Dict of key/value pair parsed from conf_file. 110 | Such value is converted into a correct type which is 111 | guessed from val_type, val_default and arguments' defaults 112 | defined in parsers. 
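    Example:
        A minimal usage sketch. The argument name and conf file below are
        hypothetical and only for illustration:

            import argparse

            parser = argparse.ArgumentParser()
            parser.add_argument('--max-retries', default=1, type=int)

            # Suppose ~/.caper/my.conf contains a single line: max-retries=3
            d = update_parsers_defaults_with_conf(parser, '~/.caper/my.conf')

            # Now d == {'max_retries': 3} (int, guessed from the argument's
            # default) and parser.get_default('max_retries') == 3.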
113 | """ 114 | if isinstance(parsers, ArgumentParser): 115 | parsers = [parsers] 116 | 117 | defaults = read_from_conf( 118 | conf_file=conf_file, 119 | conf_section=conf_section, 120 | conf_key_map=conf_key_map, 121 | no_strip_quote=no_strip_quote, 122 | ) 123 | 124 | if val_default: 125 | for k, v in val_default.items(): 126 | if k not in defaults: 127 | defaults[k] = None 128 | 129 | # used "is not None" for guessed_default to catch boolean false 130 | for k, v in defaults.items(): 131 | if val_default and k in val_default: 132 | guessed_default = val_default[k] 133 | else: 134 | for p in parsers: 135 | guessed_default = p.get_default(k) 136 | if guessed_default: 137 | break 138 | if val_type and k in val_type: 139 | guessed_type = val_type[k] 140 | elif guessed_default is not None: 141 | guessed_type = type(guessed_default) 142 | else: 143 | guessed_type = None 144 | 145 | if v is None and guessed_default is not None: 146 | v = guessed_default 147 | defaults[k] = v 148 | 149 | if guessed_type: 150 | if guessed_type is bool and isinstance(v, str): 151 | defaults[k] = bool(strtobool(v)) 152 | else: 153 | defaults[k] = guessed_type(v) 154 | 155 | # update ArgumentParser's default and then return defaults dict 156 | for p in parsers: 157 | p.set_defaults(**defaults) 158 | return defaults 159 | -------------------------------------------------------------------------------- /caper/backward_compatibility.py: -------------------------------------------------------------------------------- 1 | """Variables and functions for backward_compatibililty 2 | """ 3 | 4 | CAPER_1_0_0_PARAM_KEY_NAME_CHANGE = { 5 | 'out_dir': 'local_out_dir', 6 | 'out_gcs_bucket': 'gcp_out_dir', 7 | 'out_s3_bucket': 'aws_out_dir', 8 | 'tmp_dir': 'local_loc_dir', 9 | 'tmp_gcs_bucket': 'gcp_loc_dir', 10 | 'tmp_s3_bucket': 'aws_loc_dir', 11 | 'ip': 'hostname', 12 | } 13 | 14 | CAPER_1_4_2_PARAM_KEY_NAME_CHANGE = {'auto_update_metadata': 'auto_write_metadata'} 15 | 16 | PARAM_KEY_NAME_CHANGE = { 17 | **CAPER_1_0_0_PARAM_KEY_NAME_CHANGE, 18 | **CAPER_1_4_2_PARAM_KEY_NAME_CHANGE, 19 | } 20 | -------------------------------------------------------------------------------- /caper/caper_base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | 5 | from autouri import GCSURI, S3URI, AbsPath, AutoURI 6 | 7 | from .cromwell_backend import BACKEND_AWS, BACKEND_GCP 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class CaperBase: 13 | ENV_GOOGLE_APPLICATION_CREDENTIALS = 'GOOGLE_APPLICATION_CREDENTIALS' 14 | DEFAULT_LOC_DIR_NAME = '.caper_tmp' 15 | 16 | def __init__( 17 | self, 18 | local_loc_dir=None, 19 | gcp_loc_dir=None, 20 | aws_loc_dir=None, 21 | gcp_service_account_key_json=None, 22 | ): 23 | """Manages work/cache/temp directories for localization on the following 24 | storages: 25 | - Local*: Local path -> local_loc_dir** 26 | - gcp: GCS bucket path -> gcp_loc_dir 27 | - aws: S3 bucket path -> aws_loc_dir 28 | 29 | * Note that it starts with capital L, which is a default backend of Cromwell's 30 | default configuration file (application.conf). 31 | ** /tmp is not recommended. This directory is very important to store 32 | intermediate files used by Cromwell/AutoURI (file transfer/localization). 33 | 34 | Also manages Google Cloud auth (key JSON file) since both Caper client/server 35 | require permission to access to storage. 
36 | 37 | Args: 38 | local_loc_dir: 39 | Local cache directory to store files localized for local backends. 40 | Unlike other two directories. This directory is also used to make a 41 | working directory to store intermediate files to run Cromwell. 42 | e.g. backend.conf and workflow_opts.json. 43 | gcp_loc_dir: 44 | GCS cache directory to store files localized on GCS for gcp backend. 45 | aws_loc_dir: 46 | S3 cache directory to store files localized on S3 for aws backend. 47 | gcp_service_account_key_json: 48 | Google Cloud service account for authentication. 49 | This service account should have enough permission to storage. 50 | """ 51 | if local_loc_dir is None: 52 | local_loc_dir = os.path.join(os.getcwd(), CaperBase.DEFAULT_LOC_DIR_NAME) 53 | 54 | if not AbsPath(local_loc_dir).is_valid: 55 | raise ValueError( 56 | 'local_loc_dir should be a valid local abspath. {f}'.format( 57 | f=local_loc_dir 58 | ) 59 | ) 60 | if gcp_loc_dir and not GCSURI(gcp_loc_dir).is_valid: 61 | raise ValueError( 62 | 'gcp_loc_dir should be a valid GCS path. {f}'.format(f=gcp_loc_dir) 63 | ) 64 | if aws_loc_dir and not S3URI(aws_loc_dir).is_valid: 65 | raise ValueError( 66 | 'aws_loc_dir should be a valid S3 path. {f}'.format(f=aws_loc_dir) 67 | ) 68 | 69 | self._local_loc_dir = local_loc_dir 70 | self._gcp_loc_dir = gcp_loc_dir 71 | self._aws_loc_dir = aws_loc_dir 72 | 73 | self._set_env_gcp_app_credentials(gcp_service_account_key_json) 74 | 75 | def _set_env_gcp_app_credentials( 76 | self, 77 | gcp_service_account_key_json=None, 78 | env_name=ENV_GOOGLE_APPLICATION_CREDENTIALS, 79 | ): 80 | """Initalizes environment for authentication (VM instance/storage). 81 | 82 | Args: 83 | gcp_service_account_key_json: 84 | Secret key JSON file for auth. 85 | This service account should have full permission to storage and 86 | VM instance. 87 | Environment variable GOOGLE_APPLICATION_CREDENTIALS will be 88 | updated with this. 89 | """ 90 | if gcp_service_account_key_json: 91 | gcp_service_account_key_json = os.path.expanduser( 92 | gcp_service_account_key_json 93 | ) 94 | if env_name in os.environ: 95 | auth_file = os.environ[env_name] 96 | if not os.path.samefile(auth_file, gcp_service_account_key_json): 97 | logger.warning( 98 | 'Env var {env} does not match with ' 99 | 'gcp_service_account_key_json. ' 100 | 'Using application default credentials? '.format(env=env_name) 101 | ) 102 | logger.debug( 103 | 'Adding GCP service account key JSON {key} to ' 104 | 'env var {env}'.format(key=gcp_service_account_key_json, env=env_name) 105 | ) 106 | os.environ[env_name] = gcp_service_account_key_json 107 | 108 | def localize_on_backend(self, f, backend, recursive=False, make_md5_file=False): 109 | """Localize a file according to the chosen backend. 110 | Each backend has its corresponding storage. 111 | - gcp -> GCS bucket path (starting with gs://) 112 | - aws -> S3 bucket path (starting with s3://) 113 | - All others (based on local backend) -> local storage 114 | 115 | If contents of input JSON changes due to recursive localization (deepcopy) 116 | then a new temporary file suffixed with backend type will be written on loc_prefix. 117 | For example, /somewhere/test.json -> gs://example-tmp-gcs-bucket/somewhere/test.gcs.json 118 | 119 | loc_prefix will be one of the cache directories according to the backend type 120 | - gcp -> gcp_loc_dir 121 | - aws -> aws_loc_dir 122 | - all others (local) -> local_loc_dir 123 | 124 | Args: 125 | f: 126 | File to be localized. 127 | backend: 128 | Backend to localize file f on. 
129 | recursive: 130 | Recursive localization (deepcopy). 131 | All files (if value is valid path/URI string) in JSON/CSV/TSV 132 | will be localized together with file f. 133 | make_md5_file: 134 | Make .md5 file for localized files. This is for local only since 135 | GCS/S3 bucket paths already include md5 hash information in their metadata. 136 | 137 | Returns: 138 | localized URI. 139 | """ 140 | if backend == BACKEND_GCP: 141 | loc_prefix = self._gcp_loc_dir 142 | elif backend == BACKEND_AWS: 143 | loc_prefix = self._aws_loc_dir 144 | else: 145 | loc_prefix = self._local_loc_dir 146 | 147 | return AutoURI(f).localize_on( 148 | loc_prefix, recursive=recursive, make_md5_file=make_md5_file 149 | ) 150 | 151 | def localize_on_backend_if_modified( 152 | self, f, backend, recursive=False, make_md5_file=False 153 | ): 154 | """Wrapper for localize_on_backend. 155 | 156 | If localized file is not modified due to recursive localization, 157 | then it means that localization for such file was redundant. 158 | So returns the original file instead of a redundantly localized one. 159 | We can check if file is modifed or not by looking at their basename. 160 | Modified localized file has a suffix of the target storage. e.g. .s3. 161 | """ 162 | f_loc = self.localize_on_backend( 163 | f=f, backend=backend, recursive=recursive, make_md5_file=make_md5_file 164 | ) 165 | 166 | if AutoURI(f).basename == AutoURI(f_loc).basename: 167 | return f 168 | return f_loc 169 | 170 | def create_timestamped_work_dir(self, prefix=''): 171 | """Creates/returns a local temporary directory on self._local_work_dir. 172 | 173 | Args: 174 | prefix: 175 | Prefix for timstamped directory. 176 | Directory name will be self._tmpdir / prefix / timestamp. 177 | """ 178 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f') 179 | work_dir = os.path.join(self._local_loc_dir, prefix, timestamp) 180 | os.makedirs(work_dir, exist_ok=True) 181 | logger.info( 182 | 'Creating a timestamped temporary directory. {d}'.format(d=work_dir) 183 | ) 184 | 185 | return work_dir 186 | 187 | def get_loc_dir(self, backend): 188 | """Get localization directory for a backend.""" 189 | if backend == BACKEND_GCP: 190 | return self._gcp_loc_dir 191 | elif backend == BACKEND_AWS: 192 | return self._aws_loc_dir 193 | else: 194 | return self._local_loc_dir 195 | -------------------------------------------------------------------------------- /caper/caper_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from .cromwell import Cromwell 4 | from .cromwell_backend import ( 5 | BACKEND_ALIAS_LOCAL, 6 | BACKEND_AWS, 7 | BACKEND_GCP, 8 | BACKEND_LOCAL, 9 | BACKEND_LSF, 10 | BACKEND_PBS, 11 | BACKEND_SGE, 12 | BACKEND_SLURM, 13 | ) 14 | 15 | CONF_CONTENTS_TMP_DIR = """ 16 | # Local directory for localized files and Cromwell's intermediate files. 17 | # If not defined then Caper will make .caper_tmp/ on CWD or `local-out-dir`. 18 | # /tmp is not recommended since Caper store localized data files here. 19 | local-loc-dir= 20 | """ 21 | 22 | CONF_CONTENTS_COMMON_RESOURCE_PARAM_HELP = """ 23 | # This parameter defines resource parameters for submitting WDL task to job engine. 24 | # It is for HPC backends only (slurm, sge, pbs and lsf). 25 | # It is not recommended to change it unless your cluster has custom resource settings. 
26 | # See https://github.com/ENCODE-DCC/caper/blob/master/docs/resource_param.md for details.""" 27 | 28 | CONF_CONTENTS_SLURM_PARAM = "" 29 | CONF_CONTENTS_SGE_PARAM = """ 30 | # Parallel environment of SGE: 31 | # Find one with `$ qconf -spl` or ask you admin to add one if not exists. 32 | sge-pe= 33 | """ 34 | 35 | CONF_CONTENTS_PBS_PARAM = "" 36 | CONF_CONTENTS_LSF_PARAM = "" 37 | 38 | DEFAULT_CONF_CONTENTS_LOCAL = ( 39 | """backend=local 40 | """ 41 | + CONF_CONTENTS_TMP_DIR 42 | ) 43 | 44 | DEFAULT_CONF_CONTENTS_SLURM = ( 45 | """backend=slurm 46 | 47 | # SLURM partition. DEFINE ONLY IF REQUIRED BY YOUR CLUSTER'S POLICY. 48 | # You must define it for Stanford Sherlock. 49 | slurm-partition= 50 | 51 | # SLURM account. DEFINE ONLY IF REQUIRED BY YOUR CLUSTER'S POLICY. 52 | # You must define it for Stanford SCG. 53 | slurm-account= 54 | """ 55 | + CONF_CONTENTS_TMP_DIR 56 | + CONF_CONTENTS_SLURM_PARAM 57 | ) 58 | 59 | DEFAULT_CONF_CONTENTS_SGE = ( 60 | """backend=sge 61 | """ 62 | + CONF_CONTENTS_TMP_DIR 63 | + CONF_CONTENTS_SGE_PARAM 64 | ) 65 | 66 | DEFAULT_CONF_CONTENTS_PBS = ( 67 | """backend=pbs 68 | """ 69 | + CONF_CONTENTS_TMP_DIR 70 | + CONF_CONTENTS_PBS_PARAM 71 | ) 72 | 73 | DEFAULT_CONF_CONTENTS_LSF = ( 74 | """backend=lsf 75 | """ 76 | + CONF_CONTENTS_TMP_DIR 77 | + CONF_CONTENTS_LSF_PARAM 78 | ) 79 | 80 | DEFAULT_CONF_CONTENTS_AWS = ( 81 | """backend=aws 82 | 83 | # ARN for AWS Batch. 84 | aws-batch-arn= 85 | # AWS region (e.g. us-west-1) 86 | aws-region= 87 | # Output bucket path for AWS. This should start with `s3://`. 88 | aws-out-dir= 89 | 90 | # use this modified cromwell to fix input file localization failures 91 | # (104 Connection reset by peer) 92 | # cromwell uses AWS CLI(aws s3 cp)'s native retry feature which is controlled by 93 | # several environment variables but it doesn't seem to work for some reason 94 | # this is an adhoc fix to make cromwell retry up to 5 times in the bash script level 95 | # https://github.com/ENCODE-DCC/cromwell/commit/d16af26483e0019e14d6f8b158eaf64529f57d98 96 | cromwell=https://storage.googleapis.com/caper-data/cromwell/cromwell-65-d16af26-SNAP.jar 97 | """ 98 | + CONF_CONTENTS_TMP_DIR 99 | ) 100 | 101 | DEFAULT_CONF_CONTENTS_GCP = ( 102 | """backend=gcp 103 | 104 | # Google Cloud Platform Project 105 | gcp-prj= 106 | # Output bucket path for Google Cloud Platform. This should start with `gs://`. 107 | gcp-out-dir= 108 | 109 | # Call-cached outputs will be duplicated by making a copy or reference 110 | # reference: refer to old output file in metadata.json file. 111 | # copy (not recommended): make a copy for a new workflow. 112 | gcp-call-caching-dup-strat= 113 | 114 | # Use Google Cloud Life Sciences API instead of Genomics API (deprecating). 115 | # Make sure to enable Google Cloud Life Sciences API on your Google Cloud Console 116 | use-google-cloud-life-sciences=true 117 | 118 | # gcp-region is required for Life Sciences API only. 119 | # Region is different from zone. Zone is more specific. 120 | # Do not define zone here. Check supported regions: 121 | # https://cloud.google.com/life-sciences/docs/concepts/locations 122 | # e.g. us-central1 123 | gcp-region= 124 | 125 | # Comma-separated zones for Genomics API (deprecating). 126 | # This is ignored if use-google-cloud-life-sciences. 127 | # e.g. us-west1-a,us-west1-b,us-west1-c 128 | gcp-zones= 129 | 130 | # Number of retrials. This parameter also applies to non-OOM failures. 
131 | max-retries=1 132 | """ 133 | + CONF_CONTENTS_TMP_DIR 134 | ) 135 | 136 | 137 | def init_caper_conf(conf_file, backend): 138 | """Initialize conf file for a given backend. 139 | There are two special backend aliases for two Stanford clusters. 140 | These clusters are based on SLURM. 141 | Also, download/install Cromwell/Womtool JARs, whose 142 | default URL and install dir are defined in class Cromwell. 143 | """ 144 | if backend in (BACKEND_LOCAL, BACKEND_ALIAS_LOCAL): 145 | contents = DEFAULT_CONF_CONTENTS_LOCAL 146 | elif backend == BACKEND_SLURM: 147 | contents = DEFAULT_CONF_CONTENTS_SLURM 148 | elif backend == BACKEND_SGE: 149 | contents = DEFAULT_CONF_CONTENTS_SGE 150 | elif backend == BACKEND_PBS: 151 | contents = DEFAULT_CONF_CONTENTS_PBS 152 | elif backend == BACKEND_LSF: 153 | contents = DEFAULT_CONF_CONTENTS_LSF 154 | elif backend in BACKEND_GCP: 155 | contents = DEFAULT_CONF_CONTENTS_GCP 156 | elif backend in BACKEND_AWS: 157 | contents = DEFAULT_CONF_CONTENTS_AWS 158 | else: 159 | raise ValueError('Unsupported backend {p}'.format(p=backend)) 160 | 161 | conf_file = os.path.expanduser(conf_file) 162 | os.makedirs(os.path.dirname(conf_file), exist_ok=True) 163 | 164 | with open(conf_file, 'w') as fp: 165 | fp.write(contents + '\n') 166 | 167 | cromwell = Cromwell() 168 | fp.write( 169 | '{key}={val}\n'.format(key='cromwell', val=cromwell.install_cromwell()) 170 | ) 171 | fp.write('{key}={val}\n'.format(key='womtool', val=cromwell.install_womtool())) 172 | -------------------------------------------------------------------------------- /caper/caper_labels.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pwd 5 | import re 6 | 7 | from autouri import AutoURI 8 | 9 | from .dict_tool import merge_dict 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | RE_ILLEGAL_STR_LABEL_CHRS = r'[\:\?\*]' 15 | SUB_ILLEGAL_STR_LABEL_CHRS = '_' 16 | 17 | 18 | class CaperLabels: 19 | KEY_CAPER_STR_LABEL = 'caper-str-label' 20 | KEY_CAPER_USER = 'caper-user' 21 | KEY_CAPER_BACKEND = 'caper-backend' 22 | BASENAME_LABELS = 'labels.json' 23 | 24 | def create_file( 25 | self, 26 | directory, 27 | backend=None, 28 | custom_labels=None, 29 | str_label=None, 30 | user=None, 31 | basename=BASENAME_LABELS, 32 | ): 33 | """Create labels JSON file. 34 | 35 | Args: 36 | directory: 37 | Directory to create a labels JSON file. 38 | backend: 39 | Backend 40 | custom_labels: 41 | User's labels file to be merged. 42 | str_label: 43 | Caper's string label. 44 | Wildcards ('*' and '?') and ':' are not allowed by default. 45 | These will be replaced with '_' by default. 46 | basename: 47 | Basename of labels file. 48 | """ 49 | template = {} 50 | 51 | if custom_labels: 52 | s = AutoURI(custom_labels).read() 53 | merge_dict(template, json.loads(s)) 54 | 55 | if backend: 56 | template[CaperLabels.KEY_CAPER_BACKEND] = backend 57 | 58 | if str_label: 59 | new_str_label = re.sub( 60 | RE_ILLEGAL_STR_LABEL_CHRS, SUB_ILLEGAL_STR_LABEL_CHRS, str_label 61 | ) 62 | if str_label != new_str_label: 63 | logger.warning( 64 | 'Found illegal characters in str_label matching with {regex}. 
' 65 | 'Replaced with {sub}'.format( 66 | regex=RE_ILLEGAL_STR_LABEL_CHRS, sub=SUB_ILLEGAL_STR_LABEL_CHRS 67 | ) 68 | ) 69 | template[CaperLabels.KEY_CAPER_STR_LABEL] = new_str_label 70 | 71 | template[CaperLabels.KEY_CAPER_USER] = ( 72 | user if user else pwd.getpwuid(os.getuid())[0] 73 | ) 74 | 75 | labels_file = os.path.join(directory, basename) 76 | AutoURI(labels_file).write(json.dumps(template, indent=4)) 77 | 78 | return labels_file 79 | -------------------------------------------------------------------------------- /caper/caper_wdl_parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .wdl_parser import WDLParser 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class CaperWDLParser(WDLParser): 9 | """WDL parser for Caper.""" 10 | 11 | RE_WDL_COMMENT_DOCKER = r'^\s*\#\s*CAPER\s+docker\s(.+)' 12 | RE_WDL_COMMENT_SINGULARITY = r'^\s*\#\s*CAPER\s+singularity\s(.+)' 13 | WDL_WORKFLOW_META_DOCKER_KEYS = ('default_docker', 'caper_docker') 14 | WDL_WORKFLOW_META_SINGULARITY_KEYS = ('default_singularity', 'caper_singularity') 15 | WDL_WORKFLOW_META_CONDA_KEYS = ( 16 | 'default_conda', 17 | 'default_conda_env', 18 | 'caper_conda', 19 | 'caper_conda_env', 20 | ) 21 | 22 | def __init__(self, wdl): 23 | super().__init__(wdl) 24 | 25 | @property 26 | def caper_docker(self): 27 | """Backward compatibility for property name. See property default_docker.""" 28 | return self.default_docker 29 | 30 | @property 31 | def default_docker(self): 32 | """Find a default Docker image in WDL for Caper. 33 | 34 | Backward compatibililty: 35 | Keep using old regex method 36 | if WDL_WORKFLOW_META_DOCKER doesn't exist in workflow's meta 37 | """ 38 | if self.workflow_meta: 39 | for docker_key in CaperWDLParser.WDL_WORKFLOW_META_DOCKER_KEYS: 40 | if docker_key in self.workflow_meta: 41 | return self.workflow_meta[docker_key] 42 | 43 | ret = self._find_val_of_matched_lines(CaperWDLParser.RE_WDL_COMMENT_DOCKER) 44 | if ret: 45 | return ret[0].strip('"\'') 46 | 47 | @property 48 | def caper_singularity(self): 49 | """Backward compatibility for property name. See property default_singularity.""" 50 | return self.default_singularity 51 | 52 | @property 53 | def default_singularity(self): 54 | """Find a default Singularity image in WDL for Caper. 
55 | 56 | Backward compatibililty: 57 | Keep using old regex method 58 | if WDL_WORKFLOW_META_SINGULARITY doesn't exist in workflow's meta 59 | """ 60 | if self.workflow_meta: 61 | for singularity_key in CaperWDLParser.WDL_WORKFLOW_META_SINGULARITY_KEYS: 62 | if singularity_key in self.workflow_meta: 63 | return self.workflow_meta[singularity_key] 64 | 65 | ret = self._find_val_of_matched_lines(CaperWDLParser.RE_WDL_COMMENT_SINGULARITY) 66 | if ret: 67 | return ret[0].strip('"\'') 68 | 69 | @property 70 | def default_conda(self): 71 | """Find a default Conda environment name in WDL for Caper.""" 72 | if self.workflow_meta: 73 | for conda_key in CaperWDLParser.WDL_WORKFLOW_META_CONDA_KEYS: 74 | if conda_key in self.workflow_meta: 75 | return self.workflow_meta[conda_key] 76 | -------------------------------------------------------------------------------- /caper/cli_hpc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | from .hpc import LsfWrapper, PbsWrapper, SgeWrapper, SlurmWrapper 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def make_caper_run_command_for_hpc_submit(): 10 | """Makes `caper run ...` command from `caper hpc submit` command by simply 11 | replacing `caper hpc submit` with `caper run`. 12 | This also escapes double quotes in caper run command. 13 | """ 14 | if sys.argv[1] == 'hpc' and sys.argv[2] == 'submit': 15 | # Replace "caper hpc submit" with "caper run" 16 | new_argv = list(sys.argv) 17 | new_argv.pop(2) 18 | new_argv[1] = 'run' 19 | return new_argv 20 | else: 21 | raise ValueError('Wrong HPC command') 22 | 23 | 24 | def subcmd_hpc(args): 25 | if args.hpc_action == 'submit': 26 | 27 | if args.leader_job_name is None: 28 | raise ValueError( 29 | 'Define --leader-job-name [LEADER_JOB_NAME] in the command line arguments.' 
30 | ) 31 | caper_run_command = make_caper_run_command_for_hpc_submit() 32 | 33 | if args.backend == 'slurm': 34 | stdout = SlurmWrapper( 35 | args.slurm_leader_job_resource_param.split(), 36 | args.slurm_partition, 37 | args.slurm_account, 38 | ).submit(args.leader_job_name, caper_run_command) 39 | 40 | elif args.backend == 'sge': 41 | stdout = SgeWrapper( 42 | args.sge_leader_job_resource_param.split(), args.sge_queue 43 | ).submit(args.leader_job_name, caper_run_command) 44 | 45 | elif args.backend == 'pbs': 46 | stdout = PbsWrapper( 47 | args.pbs_leader_job_resource_param.split(), args.pbs_queue 48 | ).submit(args.leader_job_name, caper_run_command) 49 | 50 | elif args.backend == 'lsf': 51 | stdout = LsfWrapper( 52 | args.lsf_leader_job_resource_param.split(), args.lsf_queue 53 | ).submit(args.leader_job_name, caper_run_command) 54 | 55 | else: 56 | raise ValueError('Unsupported backend {b} for hpc'.format(b=args.backend)) 57 | else: 58 | if args.backend == 'slurm': 59 | hpc_wrapper = SlurmWrapper() 60 | elif args.backend == 'sge': 61 | hpc_wrapper = SgeWrapper() 62 | elif args.backend == 'pbs': 63 | hpc_wrapper = PbsWrapper() 64 | elif args.backend == 'lsf': 65 | hpc_wrapper = LsfWrapper() 66 | else: 67 | raise ValueError('Unsupported backend {b} for hpc'.format(b=args.backend)) 68 | 69 | if args.hpc_action == 'list': 70 | stdout = hpc_wrapper.list() 71 | 72 | elif args.hpc_action == 'abort': 73 | stdout = hpc_wrapper.abort(args.job_ids) 74 | 75 | else: 76 | raise ValueError('Unsupported hpc action {act}'.format(act=args.hpc_action)) 77 | 78 | print(stdout) 79 | -------------------------------------------------------------------------------- /caper/dict_tool.py: -------------------------------------------------------------------------------- 1 | """dictTool: merge/split/flatten/unflatten dict 2 | 3 | Author: 4 | Jin Lee (leepc12@gmail.com) at ENCODE-DCC 5 | """ 6 | 7 | import re 8 | from collections import defaultdict 9 | 10 | try: 11 | from collections.abc import MutableMapping 12 | except AttributeError: 13 | from collections import MutableMapping 14 | 15 | 16 | def merge_dict(a, b): 17 | """Merges b into a recursively. This mutates a and overwrites 18 | items in b on a for conflicts. 19 | 20 | Ref: https://stackoverflow.com/questions/7204805/dictionaries 21 | -of-dictionaries-merge/7205107#7205107 22 | """ 23 | for key in b: 24 | if key in a: 25 | if isinstance(a[key], dict) and isinstance(b[key], dict): 26 | merge_dict(a[key], b[key]) 27 | elif a[key] == b[key]: 28 | pass 29 | else: 30 | a[key] = b[key] 31 | else: 32 | a[key] = b[key] 33 | return a 34 | 35 | 36 | def flatten_dict(d, reducer=None, parent_key=()): 37 | """Flattens dict into single-level-tuple-keyed dict with 38 | {(tuple of keys of parents and self): value} 39 | 40 | Args: 41 | reducer: 42 | Character to join keys in a tuple. 43 | If None, returns with key as a tuple. 
44 | Returns: 45 | dict of { 46 | (key_lvl1, key_lvl2, key_lvl3, ...): value 47 | } 48 | """ 49 | items = [] 50 | for k, v in d.items(): 51 | new_key = parent_key + (k if isinstance(k, tuple) else (k,)) 52 | if isinstance(v, MutableMapping): 53 | items.extend(flatten_dict(v, parent_key=new_key).items()) 54 | else: 55 | items.append((new_key, v)) 56 | if reducer: 57 | return {reducer.join(k): v for k, v in type(d)(items).items()} 58 | else: 59 | return type(d)(items) 60 | 61 | 62 | def recurse_dict_value(d, fnc): 63 | if isinstance(d, dict): 64 | for k, v in d.items(): 65 | recurse_dict_value(v, fnc) 66 | 67 | elif isinstance(d, (list, tuple)): 68 | for v in d: 69 | recurse_dict_value(v, fnc) 70 | else: 71 | fnc(d) 72 | 73 | 74 | def unflatten_dict(d_flat): 75 | """Unflattens single-level-tuple-keyed dict into dict""" 76 | result = type(d_flat)() 77 | for k_tuple, v in d_flat.items(): 78 | d_curr = result 79 | for i, k in enumerate(k_tuple): 80 | if i == len(k_tuple) - 1: 81 | d_curr[k] = v 82 | elif k not in d_curr: 83 | d_curr[k] = type(d_flat)() 84 | d_curr = d_curr[k] 85 | return result 86 | 87 | 88 | def split_dict(d, rules=None): 89 | """Splits dict according to "rule" 90 | 91 | Returns: 92 | List of split dict 93 | 94 | Args: 95 | rule: 96 | A list of tuple (RULE_NAME: REGEX) 97 | 98 | If a key name in an JSON object matches with this REGEX 99 | then ALL objects with the same key will be separated from 100 | the original root JSON object while keeping their hierachy. 101 | RULE_NAME will be added to root of each new JSON object. 102 | 103 | For example, we have a JSON object like the following 104 | [ 105 | { 106 | "flagstat_qc": { 107 | "rep1": { 108 | "read1": 100, 109 | "read2": 200 110 | }, 111 | "rep2": { 112 | "read1": 300, 113 | "read2": 400 114 | } 115 | }, 116 | "etc": { 117 | "samstat_qc": { 118 | "rep1": { 119 | "unmapped": 500, 120 | "mapped": 600 121 | }, 122 | "rep2": { 123 | "unmapped": 700, 124 | "mapped": 800 125 | } 126 | } 127 | }, 128 | "idr_qc": { 129 | "qc_test1" : 900 130 | } 131 | } 132 | ] 133 | with "new_row_rule" = "replicate:^rep\\d+$", this JSON object 134 | will be splitted into three (original, rep1, rep2) JSON object. 
135 | [ 136 | # original 137 | { 138 | "idr_qc": { 139 | "qc_test1" : 900 140 | } 141 | }, 142 | # rep1 143 | { 144 | "replicate": "rep1", 145 | "flagstat_qc": { 146 | "read1": 100, 147 | "read2": 200 148 | }, 149 | "etc": { 150 | "samstat_qc": { 151 | "unmapped": 500, 152 | "mapped": 600 153 | } 154 | } 155 | }, 156 | # rep2 157 | { 158 | "replicate": "rep2", 159 | "flagstat_qc": { 160 | "read1": 300, 161 | "read2": 400 162 | }, 163 | "etc": { 164 | "samstat_qc": { 165 | "unmapped": 700, 166 | "mapped": 800 167 | } 168 | } 169 | }, 170 | ] 171 | """ 172 | if rules is None: 173 | return [d] 174 | if isinstance(rules, tuple): 175 | rules = [rules] 176 | 177 | d_flat = flatten_dict(d) 178 | result = [] 179 | keys_matched_regex = set() 180 | d_each_rule = defaultdict(type(d)) 181 | for rule_name, rule_regex in rules: 182 | for k_tuple, v in d_flat.items(): 183 | new_k_tuple = () 184 | pattern_matched_k = None 185 | for k in k_tuple: 186 | if re.findall(rule_regex, k): 187 | pattern_matched_k = (rule_name, k) 188 | else: 189 | new_k_tuple += (k,) 190 | if pattern_matched_k is not None: 191 | d_each_rule[pattern_matched_k][new_k_tuple] = v 192 | keys_matched_regex.add(k_tuple) 193 | 194 | for (rule_name, k), d_each_matched in d_each_rule.items(): 195 | d_ = unflatten_dict(d_each_matched) 196 | d_[rule_name] = k 197 | result.append(d_) 198 | 199 | d_others = type(d)() 200 | for k_tuple, v in d_flat.items(): 201 | if k_tuple not in keys_matched_regex: 202 | d_others[k_tuple] = v 203 | if d_others: 204 | d_ = unflatten_dict(d_others) 205 | result = [d_] + result 206 | return result 207 | 208 | 209 | def dict_to_dot_str(d, parent_key='digraph D', indent='', base_indent=''): 210 | """Dict will be converted into DOT like the followings: 211 | 1) Value string will not be double-quotted in DOT. 212 | - make sure to escape double-quotes in a string with special characters 213 | (e.g. whitespace, # and ;) 214 | 2) If "value" is None then "key" will be just added to DOT without "=" 215 | 216 | dict: 217 | { "key1": "val1", "key2": "val2", "key3": { "key3_1": "val3_1", }... } 218 | 219 | dot: 220 | digraph D { 221 | key1 = val1; 222 | key2 = val2; 223 | key3 { 224 | key3_1 = val3_1; 225 | ... 226 | } 227 | ... 228 | } 229 | 230 | Example in a Croo output def JSON file: 231 | (note that strings for "label" are double-quote-escaped). 
232 | 233 | dict: 234 | { 235 | "rankdir": "TD", 236 | "start": "[shape=Mdiamond]", 237 | "end": "[shape=Msquare]", 238 | "subgraph cluster_rep1": { 239 | "style": "filled", 240 | "color": "mistyrose", 241 | "label": "\"Replicate 1\"" 242 | }, 243 | "subgraph cluster_rep2": { 244 | "style": "filled", 245 | "color": "azure", 246 | "label": "\"Replicate 2\"" 247 | }, 248 | "a0 -> b0": null, 249 | "c0 -> d0": null 250 | } 251 | 252 | Such dict will be converted into a dot: 253 | 254 | dot: 255 | digraph D { 256 | rankDir = TD; 257 | start = [shape=Mdiamond]; 258 | end = [shape=Msquare]; 259 | subgraph cluster_rep1 { 260 | style = filled; 261 | color = mistyrose; 262 | label = "Replicate 1" 263 | }; 264 | subgraph cluster_rep2 { 265 | style = filled; 266 | color = azure; 267 | label = "Replicate 2" 268 | }; 269 | a0 -> b0; 270 | c0 -> d0; 271 | } 272 | """ 273 | result = '' 274 | if d is None: 275 | return '{}{};\n'.format(base_indent, parent_key) 276 | elif isinstance(d, str): 277 | return '{}{} = {};\n'.format(base_indent, parent_key, d) 278 | elif isinstance(d, dict): 279 | result += base_indent + parent_key + ' {\n' 280 | for k, v in d.items(): 281 | result += dict_to_dot_str( 282 | v, parent_key=k, indent=indent, base_indent=base_indent + indent 283 | ) 284 | result += base_indent + '}\n' 285 | else: 286 | raise ValueError( 287 | 'Unsupported data type: {} ' 288 | '(only str and dict/JSON are allowed).'.format(type(d)) 289 | ) 290 | return result 291 | -------------------------------------------------------------------------------- /caper/hocon_string.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import logging 4 | import re 5 | 6 | from pyhocon import ConfigFactory, HOCONConverter 7 | 8 | from .dict_tool import merge_dict 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | NEW_LINE = '\n' 14 | RE_HOCON_INCLUDE = [ 15 | r'include\s+(?:required|url|file|classpath)\(.*\)', 16 | r'include\s+".*\.(?:conf|hocon)"', 17 | ] 18 | RE_HOCONSTRING_INCLUDE = r'HOCONSTRING_INCLUDE_(?:.*)\s*=\s*"(?:.*)"' 19 | RE_HOCONSTRING_INCLUDE_VALUE = r'HOCONSTRING_INCLUDE_(?:.*)\s*=\s*"(.*)"' 20 | HOCONSTRING_INCLUDE_KEY = 'HOCONSTRING_INCLUDE_{id}' 21 | 22 | 23 | def escape_double_quotes(double_quotes): 24 | return double_quotes.replace('"', '\\"') 25 | 26 | 27 | def unescape_double_quotes(escaped_double_quotes): 28 | return escaped_double_quotes.replace('\\"', '"') 29 | 30 | 31 | def is_valid_include(include): 32 | is_valid_format = False 33 | for regex in RE_HOCON_INCLUDE: 34 | if re.findall(regex, include): 35 | is_valid_format = True 36 | break 37 | 38 | return is_valid_format 39 | 40 | 41 | def get_include_key(include_str): 42 | """Use md5sum hash of the whole include statement string for a key.""" 43 | return hashlib.md5(include_str.encode()).hexdigest() 44 | 45 | 46 | def wrap_includes(hocon_str): 47 | """Convert `include` statement string into key = val format. 48 | Returns '{key} = "{double_quote_escaped_val}"'. 
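Example (an illustrative sketch; the file name is hypothetical):

    wrap_includes('include "backend.conf"')
    # -> 'HOCONSTRING_INCLUDE_<md5 of the include line> = "include \"backend.conf\""'
    # get_contents() later restores the original `include` statement via unwrap_includes().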
49 | """ 50 | for regex in RE_HOCON_INCLUDE: 51 | for include in re.findall(regex, hocon_str): 52 | if '\\"' in include: 53 | continue 54 | 55 | logger.debug('Found include in HOCON: {include}'.format(include=include)) 56 | 57 | hocon_str = hocon_str.replace( 58 | include, 59 | '{key} = "{val}"'.format( 60 | key=HOCONSTRING_INCLUDE_KEY.format(id=get_include_key(include)), 61 | val=escape_double_quotes(include), 62 | ), 63 | ) 64 | return hocon_str 65 | 66 | 67 | def unwrap_includes(key_val_str): 68 | """Convert '{key} = "{val}"" formatted string to the original `include` statement string. 69 | Args: 70 | key: 71 | HOCONSTRING_INCLUDE_KEY with `id` as md5sum hash of the original 72 | `include` statement string. 73 | val: 74 | Double-quote-escaped `include` statement string. 75 | """ 76 | val = re.findall(RE_HOCONSTRING_INCLUDE_VALUE, key_val_str) 77 | if val: 78 | if len(val) > 1: 79 | raise ValueError( 80 | 'Found multiple matches. Wrong include key=val format? {val}'.format( 81 | val=val 82 | ) 83 | ) 84 | return unescape_double_quotes(val[0]) 85 | 86 | 87 | class HOCONString: 88 | def __init__(self, hocon_str): 89 | """Find an `include` statement (VALUE) in HOCON string and then convert it 90 | into a HOCONSTRING_INCLUDE_KEY="VALUE" pair in HOCON. 91 | 92 | Double-quotes will be escaped with double slashes. 93 | Then the VALUE is kept as it is as a value and can be recovered later when 94 | it is converted back to HOCON string. 95 | 96 | This workaround is to skip parsing `include` statements since there is no 97 | information about `classpath` at the parsing time and pyhocon will error out and 98 | will stop parsing. 99 | 100 | e.g. we don't know what's in `classpath` before the backend conf file is 101 | passed to Cromwell. 102 | """ 103 | if not isinstance(hocon_str, str): 104 | raise ValueError('HOCONString() takes str type only.') 105 | 106 | self._hocon_str = wrap_includes(hocon_str) 107 | 108 | def __str__(self): 109 | return self.get_contents() 110 | 111 | @classmethod 112 | def from_dict(cls, d, include=''): 113 | """Create HOCONString from dict. 114 | 115 | Args: 116 | include: 117 | `include` statement to be added to the top of the HOCONString. 118 | """ 119 | hocon = ConfigFactory.from_dict(d) 120 | hocon_str = HOCONConverter.to_hocon(hocon) 121 | 122 | if include: 123 | if not is_valid_include(include): 124 | raise ValueError( 125 | 'Wrong HOCON include format. {include}'.format(include=include) 126 | ) 127 | hocon_str = NEW_LINE.join([include, hocon_str]) 128 | 129 | return cls(hocon_str=hocon_str) 130 | 131 | def to_dict(self, with_include=True): 132 | """Convert HOCON string into dict. 133 | 134 | Args: 135 | with_include: 136 | If True then double-quote-escaped `include` statements will be kept as a plain string 137 | under key HOCONSTRING_INCLUDE_KEY. 138 | Otherwise, `include` statements will be excluded. 139 | """ 140 | if with_include: 141 | hocon_str = self._hocon_str 142 | else: 143 | hocon_str = self.get_contents(with_include=False) 144 | 145 | c = ConfigFactory.parse_string(hocon_str) 146 | j = HOCONConverter.to_json(c) 147 | 148 | return json.loads(j) 149 | 150 | def merge(self, b, update=False): 151 | """Merge self with b and then returns a plain string of merged. 152 | Args: 153 | b: 154 | HOCONString, dict, str to be merged. 155 | b's `include` statement will always be ignored. 156 | update: 157 | If True then replace self with a merged one. 158 | Returns: 159 | String of merged HOCONs. 
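Example (an illustrative sketch; the backend values below are hypothetical):

    base = HOCONString('include required(classpath("application"))\nbackend { default = Local }')
    merged = base.merge({'backend': {'default': 'slurm'}})
    # `merged` is a plain HOCON string; the `include` line survives because it is
    # wrapped internally as a HOCONSTRING_INCLUDE_* key=value pair during parsing.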
160 | """ 161 | if isinstance(b, HOCONString): 162 | d = b.to_dict() 163 | elif isinstance(b, str): 164 | d = HOCONString(b).to_dict() 165 | elif isinstance(b, dict): 166 | d = b 167 | else: 168 | raise TypeError('Unsupported type {t}'.format(t=type(b))) 169 | 170 | self_d = self.to_dict() 171 | merge_dict(self_d, d) 172 | 173 | hocon = ConfigFactory.from_dict(self_d) 174 | 175 | hocon_str = HOCONConverter.to_hocon(hocon) 176 | if update: 177 | self._hocon_str = hocon_str 178 | 179 | return HOCONString(hocon_str).get_contents() 180 | 181 | def get_contents(self, with_include=True): 182 | """Check if `include` statement is stored as a plain string. 183 | If exists, converts it back to HOCON `include` statement. 184 | 185 | Args: 186 | with_include: (renamed/changed from without_include) 187 | If True then recover all includes statements from include key=val form 188 | (RE_HOCONSTRING_INCLUDE). 189 | Otherwise, excludes all `include` statements. 190 | """ 191 | hocon_str = self._hocon_str 192 | 193 | for include_key_val in re.findall(RE_HOCONSTRING_INCLUDE, self._hocon_str): 194 | logger.debug( 195 | 'Found include key in HOCONString: {include_key_val}'.format( 196 | include_key_val=include_key_val 197 | ) 198 | ) 199 | if with_include: 200 | original_include_str = unwrap_includes(include_key_val) 201 | if original_include_str: 202 | hocon_str = hocon_str.replace(include_key_val, original_include_str) 203 | else: 204 | hocon_str = hocon_str.replace(include_key_val, '') 205 | 206 | return hocon_str 207 | -------------------------------------------------------------------------------- /caper/hpc.py: -------------------------------------------------------------------------------- 1 | """Caper's HPC Wrapper based on job engine's CLI (shell command). 2 | e.g. sbatch, squeue, qsub, qstat 3 | """ 4 | import logging 5 | import os 6 | import subprocess 7 | from abc import ABC, abstractmethod 8 | from pathlib import Path 9 | from tempfile import NamedTemporaryFile 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | CAPER_LEADER_JOB_NAME_PREFIX = 'CAPER_' 14 | ILLEGAL_CHARS_IN_JOB_NAME = [',', ' ', '\t'] 15 | 16 | 17 | def get_user_from_os_environ(): 18 | return os.environ['USER'] 19 | 20 | 21 | def make_bash_script_contents(contents): 22 | return f'#!/bin/bash\n{contents}\n' 23 | 24 | 25 | def make_caper_leader_job_name(job_name): 26 | """Check if job name contains Comma, TAB or whitespace. 27 | They are not allowed since they can be used as separators. 28 | """ 29 | for illegal_char in ILLEGAL_CHARS_IN_JOB_NAME: 30 | if illegal_char in job_name: 31 | raise ValueError( 32 | 'Illegal character {chr} in job name {job}'.format( 33 | chr=illegal_char, job=job_name 34 | ) 35 | ) 36 | return CAPER_LEADER_JOB_NAME_PREFIX + job_name 37 | 38 | 39 | class HpcWrapper(ABC): 40 | def __init__( 41 | self, 42 | leader_job_resource_param=[], 43 | ): 44 | """Base class for HPC job engine wrapper.""" 45 | self._leader_job_resource_param = leader_job_resource_param 46 | 47 | def submit(self, job_name, caper_run_command): 48 | """Submits a caper leader job to HPC (e.g. sbatch, qsub). 49 | Such leader job will be prefixed with CAPER_LEADER_JOB_NAME_PREFIX. 50 | 51 | Returns output STDOUT from submission command. 
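Example (an illustrative sketch; the partition, job name and files are hypothetical):

    stdout = SlurmWrapper(slurm_partition='compute').submit(
        'my_pipeline', ['caper', 'run', 'my.wdl', '-i', 'input.json']
    )
    # A temporary shell script wrapping the command is written under $HOME
    # (so cluster nodes can read it) and submitted with sbatch under the
    # leader job name CAPER_my_pipeline.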
52 | """ 53 | home_dir = f'{str(Path.home())}{os.sep}' 54 | with NamedTemporaryFile(prefix=home_dir, suffix='.sh') as shell_script: 55 | contents = make_bash_script_contents(' '.join(caper_run_command)) 56 | shell_script.write(contents.encode()) 57 | shell_script.flush() 58 | 59 | return self._submit(job_name, shell_script.name) 60 | 61 | def list(self): 62 | """Filters out non-caper jobs from the job list keeping the first line (header). 63 | And then returns output STDOUT. 64 | """ 65 | result = [] 66 | lines = self._list().split('\n') 67 | 68 | # keep header 69 | result.append(lines[0]) 70 | 71 | # filter out non-caper lines 72 | logger.info('Filtering out non-caper leader jobs...') 73 | for line in lines[1:]: 74 | if CAPER_LEADER_JOB_NAME_PREFIX in line: 75 | result.append(line) 76 | 77 | return '\n'.join(result) 78 | 79 | def abort(self, job_ids): 80 | """Returns output STDOUT from job engine's abort command (e.g. scancel, qdel).""" 81 | return self._abort(job_ids) 82 | 83 | @abstractmethod 84 | def _submit(self, job_name, shell_script): 85 | pass 86 | 87 | def _list(self): 88 | pass 89 | 90 | @abstractmethod 91 | def _abort(self, job_ids): 92 | """Sends SIGINT (or SIGTERM) to Caper for a graceful shutdown.""" 93 | pass 94 | 95 | def _run_command(self, command): 96 | """Runs a shell command line and returns STDOUT.""" 97 | logger.info(f'Running shell command: {" ".join(command)}') 98 | return ( 99 | subprocess.run( 100 | command, 101 | stdout=subprocess.PIPE, 102 | env=os.environ, 103 | ) 104 | .stdout.decode() 105 | .strip() 106 | ) 107 | 108 | 109 | class SlurmWrapper(HpcWrapper): 110 | DEFAULT_LEADER_JOB_RESOURCE_PARAM = ['-t', '48:00:00', '--mem', '4G'] 111 | 112 | def __init__( 113 | self, 114 | leader_job_resource_param=DEFAULT_LEADER_JOB_RESOURCE_PARAM, 115 | slurm_partition=None, 116 | slurm_account=None, 117 | ): 118 | super().__init__( 119 | leader_job_resource_param=leader_job_resource_param, 120 | ) 121 | slurm_partition_param = ['-p', slurm_partition] if slurm_partition else [] 122 | slurm_account_param = ['-A', slurm_account] if slurm_account else [] 123 | self._slurm_extra_param = slurm_partition_param + slurm_account_param 124 | 125 | def _submit(self, job_name, shell_script): 126 | command = ( 127 | ['sbatch'] 128 | + self._leader_job_resource_param 129 | + self._slurm_extra_param 130 | + [ 131 | '--export=ALL', 132 | '-J', 133 | make_caper_leader_job_name(job_name), 134 | shell_script, 135 | ] 136 | ) 137 | return self._run_command(command) 138 | 139 | def _list(self): 140 | return self._run_command( 141 | [ 142 | 'squeue', 143 | '-u', 144 | get_user_from_os_environ(), 145 | '--Format=JobID,Name,State,SubmitTime', 146 | ] 147 | ) 148 | 149 | def _abort(self, job_ids): 150 | """Notes: --full is necessary to correctly send SIGINT to the leader job (Cromwell process). 151 | Sending SIGTERM may result in an immediate shutdown of the leaderjob on some clusters. 152 | SIGINT is much better to trigger a graceful shutdown. 
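Example (an illustrative sketch; job IDs are hypothetical):

    SlurmWrapper().abort(['12345', '67890'])
    # runs: scancel --full --signal=SIGINT 12345 67890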
153 | """ 154 | return self._run_command(['scancel', '--full', '--signal=SIGINT'] + job_ids) 155 | 156 | 157 | class SgeWrapper(HpcWrapper): 158 | DEFAULT_LEADER_JOB_RESOURCE_PARAM = ['-l', 'h_rt=48:00:00,h_vmem=4G'] 159 | 160 | def __init__( 161 | self, 162 | leader_job_resource_param=DEFAULT_LEADER_JOB_RESOURCE_PARAM, 163 | sge_queue=None, 164 | ): 165 | super().__init__( 166 | leader_job_resource_param=leader_job_resource_param, 167 | ) 168 | self._sge_queue_param = ['-q', sge_queue] if sge_queue else [] 169 | 170 | def _submit(self, job_name, shell_script): 171 | command = ( 172 | ['qsub'] 173 | + self._leader_job_resource_param 174 | + self._sge_queue_param 175 | + ['-V', '-terse', '-N', make_caper_leader_job_name(job_name), shell_script] 176 | ) 177 | return self._run_command(command) 178 | 179 | def _list(self): 180 | return self._run_command(['qstat', '-u', get_user_from_os_environ()]) 181 | 182 | def _abort(self, job_ids): 183 | return self._run_command(['qdel'] + job_ids) 184 | 185 | 186 | class PbsWrapper(HpcWrapper): 187 | DEFAULT_LEADER_JOB_RESOURCE_PARAM = ['-l', 'walltime=48:00:00,mem=4gb'] 188 | 189 | def __init__( 190 | self, 191 | leader_job_resource_param=DEFAULT_LEADER_JOB_RESOURCE_PARAM, 192 | pbs_queue=None, 193 | ): 194 | super().__init__( 195 | leader_job_resource_param=leader_job_resource_param, 196 | ) 197 | self._pbs_queue_param = ['-q', pbs_queue] if pbs_queue else [] 198 | 199 | def _submit(self, job_name, shell_script): 200 | command = ( 201 | ['qsub'] 202 | + self._leader_job_resource_param 203 | + self._pbs_queue_param 204 | + ['-V', '-N', make_caper_leader_job_name(job_name), shell_script] 205 | ) 206 | return self._run_command(command) 207 | 208 | def _list(self): 209 | return self._run_command(['qstat', '-u', get_user_from_os_environ()]) 210 | 211 | def _abort(self, job_ids): 212 | return self._run_command(['qdel', '-W', '30'] + job_ids) 213 | 214 | 215 | class LsfWrapper(HpcWrapper): 216 | DEFAULT_LEADER_JOB_RESOURCE_PARAM = ['-W', '2880', '-M', '4g'] 217 | 218 | def __init__( 219 | self, 220 | leader_job_resource_param=DEFAULT_LEADER_JOB_RESOURCE_PARAM, 221 | lsf_queue=None, 222 | ): 223 | super().__init__( 224 | leader_job_resource_param=leader_job_resource_param, 225 | ) 226 | self._lsf_queue_param = ['-q', lsf_queue] if lsf_queue else [] 227 | 228 | def _submit(self, job_name, shell_script): 229 | command = ( 230 | ['bsub'] 231 | + self._leader_job_resource_param 232 | + self._lsf_queue_param 233 | + ['-env', 'all', '-J', make_caper_leader_job_name(job_name), shell_script] 234 | ) 235 | return self._run_command(command) 236 | 237 | def _list(self): 238 | return self._run_command(['bjobs', '-u', get_user_from_os_environ()]) 239 | 240 | def _abort(self, job_ids): 241 | return self._run_command(['bkill'] + job_ids) 242 | -------------------------------------------------------------------------------- /caper/nb_subproc_thread.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import signal 3 | import time 4 | from subprocess import PIPE, Popen 5 | from threading import Thread 6 | 7 | logger = logging.getLogger(__name__) 8 | interrupted = False 9 | terminated = False 10 | 11 | 12 | def sigterm_handler(signo, frame): 13 | global terminated 14 | logger.info('Received SIGTERM.') 15 | terminated = True 16 | 17 | 18 | def sigint_handler(signo, frame): 19 | global interrupted 20 | logger.info('Received SIGINT.') 21 | interrupted = True 22 | 23 | 24 | signal.signal(signal.SIGTERM, sigterm_handler) 25 | 
signal.signal(signal.SIGINT, sigint_handler) 26 | 27 | 28 | def is_fileobj_open(fileobj): 29 | return fileobj and not getattr(fileobj, 'closed', False) 30 | 31 | 32 | class NBSubprocThread(Thread): 33 | DEFAULT_POLL_INTERVAL_SEC = 0.01 34 | DEFAULT_SUBPROCESS_NAME = 'Subprocess' 35 | DEFAULT_STOP_SIGNAL = signal.SIGTERM 36 | 37 | def __init__( 38 | self, 39 | args, 40 | cwd=None, 41 | stdin=None, 42 | on_poll=None, 43 | on_stdout=None, 44 | on_stderr=None, 45 | on_finish=None, 46 | poll_interval=DEFAULT_POLL_INTERVAL_SEC, 47 | quiet=False, 48 | subprocess_name=DEFAULT_SUBPROCESS_NAME, 49 | ): 50 | """Non-blocking STDOUT/STDERR streaming for subprocess.Popen(). 51 | 52 | This class makes two daemonized threads for nonblocking 53 | streaming of STDOUT/STDERR. 54 | 55 | Note that return value of callback functions are updated 56 | for the following properties: 57 | - status: 58 | Updated with return value of on_poll, on_stdout, on_stderr. 59 | If return value is None then no update. 60 | - returnvalue: 61 | Updated with return value of on_finish. 62 | If return value is None then no update. 63 | 64 | This is useful to check status of the thread and 65 | get the final return value of the function that this class 66 | actually runs. 67 | 68 | Args: 69 | args: 70 | List of command line arguments. 71 | cwd: 72 | subprocess.Popen's cwd. 73 | stdin: 74 | subprocess.Popen's stdin. 75 | Note that subprocess.Popen's stdout/stderr is fixed 76 | at subprocess.PIPE/subprocess.STDOUT. 77 | on_poll: 78 | Callback on every polling. 79 | If return value is not None then it is used for updating property `status`. 80 | on_stdout: 81 | Callback on every non-empty STDOUT line. 82 | If return value is not None then it is used for updating property `status`. 83 | This callback function should take one argument: 84 | - stdout (str): 85 | New incoming STDOUT line string with trailing newline (backslash n). 86 | on_stderr: 87 | Callback on every non-empty STDERR line. 88 | If return value is not None then it is used for updating property `status`. 89 | This callback function should take one argument: 90 | - stderr (str): 91 | New incoming STDERR line string with trailing newline (backslash n). 92 | on_finish: 93 | Callback on terminating/completing a thread. 94 | If return value is not None then it is used for updating property `returnvalue`. 95 | poll_interval (float): 96 | Polling interval in seconds. 97 | quiet: 98 | No logging. 99 | subprocess_name: 100 | Subprocess name for logging. 101 | signal_handler: 102 | Signal handler for a graceful shutdown. 103 | """ 104 | super().__init__( 105 | target=self._popen, 106 | args=(args, cwd, stdin, on_poll, on_stdout, on_stderr, on_finish), 107 | ) 108 | self._poll_interval = poll_interval 109 | self._quiet = quiet 110 | self._subprocess_name = subprocess_name 111 | 112 | self._stdout_list = [] 113 | self._stderr_list = [] 114 | self._returncode = None 115 | self._stop_it = False 116 | self._stop_signal = None 117 | self._status = None 118 | self._returnvalue = None 119 | 120 | @property 121 | def stdout(self): 122 | return ''.join(self._stdout_list) 123 | 124 | @property 125 | def stderr(self): 126 | return ''.join(self._stderr_list) 127 | 128 | @property 129 | def returncode(self): 130 | """Returns subprocess.Popen.returncode. 131 | None if not completed or any general Exception occurs. 132 | """ 133 | return self._returncode 134 | 135 | @property 136 | def status(self): 137 | """Updated with return value of on_poll() for every polling. 
138 | Also updated with return value of on_stdout() or on_stderr() 139 | if their return values are not None. 140 | """ 141 | return self._status 142 | 143 | @property 144 | def returnvalue(self): 145 | """Updated with return value of on_finish() 146 | which is called when a thread is terminated. 147 | None if thread is still running so that on_finish() has not been called yet. 148 | This works like an actual return value of the function ran inside a thread. 149 | """ 150 | return self._returnvalue 151 | 152 | def stop(self, stop_signal=DEFAULT_STOP_SIGNAL, wait=False): 153 | """Subprocess will be teminated after next polling. 154 | 155 | Args: 156 | wait: 157 | Wait for a valid returncode (which is not None). 158 | """ 159 | self._stop_it = True 160 | self._stop_signal = stop_signal 161 | if wait: 162 | if self._returncode is None: 163 | logger.info( 164 | '{name}: waiting for a graceful shutdown...'.format( 165 | name=self._subprocess_name 166 | ) 167 | ) 168 | while True: 169 | if self._returncode is not None: 170 | return 171 | time.sleep(self._poll_interval) 172 | 173 | def _popen( 174 | self, 175 | args, 176 | cwd=None, 177 | stdin=None, 178 | on_poll=None, 179 | on_stdout=None, 180 | on_stderr=None, 181 | on_finish=None, 182 | ): 183 | """Wrapper for subprocess.Popen().""" 184 | global terminated 185 | global interrupted 186 | 187 | def read_stdout(stdout_bytes): 188 | text = stdout_bytes.decode() 189 | if text: 190 | self._stdout_list.append(text) 191 | if on_stdout: 192 | ret_on_stdout = on_stdout(text) 193 | if ret_on_stdout is not None: 194 | self._status = ret_on_stdout 195 | 196 | def read_stderr(stderr_bytes): 197 | text = stderr_bytes.decode() 198 | if text: 199 | self._stderr_list.append(text) 200 | if on_stderr: 201 | ret_on_stderr = on_stderr(text) 202 | if ret_on_stderr is not None: 203 | self._status = ret_on_stderr 204 | 205 | def read_from_stdout_obj(stdout): 206 | if is_fileobj_open(stdout): 207 | for line in iter(stdout.readline, b''): 208 | read_stdout(line) 209 | 210 | def read_from_stderr_obj(stderr): 211 | if is_fileobj_open(stderr): 212 | for line in iter(stderr.readline, b''): 213 | read_stderr(line) 214 | 215 | self._stop_it = False 216 | 217 | try: 218 | p = Popen(args, stdout=PIPE, stderr=PIPE, cwd=cwd, stdin=stdin) 219 | thread_stdout = Thread( 220 | target=read_from_stdout_obj, args=(p.stdout,), daemon=True 221 | ) 222 | thread_stderr = Thread( 223 | target=read_from_stderr_obj, args=(p.stderr,), daemon=True 224 | ) 225 | thread_stdout.start() 226 | thread_stderr.start() 227 | 228 | while True: 229 | if on_poll: 230 | ret_on_poll = on_poll() 231 | if ret_on_poll is not None: 232 | self._status = ret_on_poll 233 | if p.poll() is not None: 234 | self._returncode = p.poll() 235 | break 236 | 237 | if terminated or interrupted or self._stop_it and self._stop_signal: 238 | if terminated: 239 | stop_signal = signal.SIGTERM 240 | elif interrupted: 241 | stop_signal = signal.SIGINT 242 | else: 243 | stop_signal = self._stop_signal 244 | 245 | logger.info( 246 | f'Sending signal {stop_signal} to subprocess. 
' 247 | f'name: {self._subprocess_name}, pid: {p.pid}' 248 | ) 249 | p.send_signal(stop_signal) 250 | 251 | self._returncode = p.returncode 252 | break 253 | 254 | time.sleep(self._poll_interval) 255 | 256 | except Exception as e: 257 | if not self._quiet: 258 | logger.error(e, exc_info=True) 259 | self._returncode = 127 260 | 261 | else: 262 | stdout_bytes, stderr_bytes = p.communicate() 263 | read_stdout(stdout_bytes) 264 | read_stderr(stderr_bytes) 265 | self._returncode = p.returncode 266 | 267 | if on_finish: 268 | ret_on_finish = on_finish() 269 | if ret_on_finish is not None: 270 | self._returnvalue = ret_on_finish 271 | 272 | if not self._quiet: 273 | if self._returncode: 274 | logger.error( 275 | '{name} failed. returncode={rc}'.format( 276 | name=self._subprocess_name, rc=self._returncode 277 | ) 278 | ) 279 | else: 280 | logger.info( 281 | '{name} finished successfully.'.format(name=self._subprocess_name) 282 | ) 283 | -------------------------------------------------------------------------------- /caper/server_heartbeat.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import socket 3 | import time 4 | from threading import Thread 5 | 6 | from autouri import AutoURI 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class ServerHeartbeatTimeoutError(Exception): 12 | pass 13 | 14 | 15 | class ServerHeartbeat: 16 | DEFAULT_SERVER_HEARTBEAT_FILE = '~/.caper/default_server_heartbeat' 17 | DEFAULT_HEARTBEAT_TIMEOUT_MS = 120000 18 | DEFAULT_INTERVAL_UPDATE_HEARTBEAT_SEC = 60.0 19 | 20 | def __init__( 21 | self, 22 | heartbeat_file=DEFAULT_SERVER_HEARTBEAT_FILE, 23 | heartbeat_timeout=DEFAULT_HEARTBEAT_TIMEOUT_MS, 24 | interval_update_heartbeat=DEFAULT_INTERVAL_UPDATE_HEARTBEAT_SEC, 25 | ): 26 | """Server heartbeat to share store server's hostname/port with clients. 27 | 28 | Args: 29 | heartbeat_file: 30 | Server writes hostname/port on this file. 31 | Client reads hostname/port from this file. 32 | heartbeat_timeout: 33 | Expiration period for a heartbeat file (in milliseconds). 34 | Client will use a heartbeat file only if it is fresh (within timeout). 35 | interval_update_heartbeat: 36 | Period for updtaing a heartbeat file (in seconds). 37 | """ 38 | self._heartbeat_file = heartbeat_file 39 | self._heartbeat_timeout = heartbeat_timeout 40 | self._interval_update_heartbeat = interval_update_heartbeat 41 | 42 | self._stop_it = False 43 | self._thread = None 44 | 45 | def start(self, port, hostname=None): 46 | """Starts a thread that writes hostname/port of a server 47 | on a heartbeat file. 48 | 49 | Args: 50 | port: 51 | This port will be written to a heartbeat file. 52 | hostname: 53 | Optional hostname to be written to heartbeat file. 54 | socket.gethostname() will be used if not defined. 55 | """ 56 | self._thread = Thread(target=self._write_to_file, args=(port, hostname)) 57 | self._thread.start() 58 | return self._thread 59 | 60 | def is_alive(self): 61 | return self._thread.is_alive() if self._thread else False 62 | 63 | def stop(self): 64 | self._stop_it = True 65 | 66 | if self._thread: 67 | self._thread.join() 68 | 69 | def read(self, raise_timeout=False): 70 | """Read from heartbeat file. 71 | If a heartbeat file is not fresh (mtime difference < timeout) 72 | then None is returned. 
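Example (an illustrative sketch; the port number is arbitrary):

    hb = ServerHeartbeat()
    hb.start(port=8000)           # server side: periodically writes 'hostname:8000'
    hostname, port = hb.read()    # client side: returns (hostname, 8000) while fresh
    hb.stop()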
73 | 74 | Returns: 75 | Tuple of (hostname, port) 76 | """ 77 | try: 78 | u = AutoURI(self._heartbeat_file) 79 | if (time.time() - u.mtime) * 1000.0 > self._heartbeat_timeout: 80 | raise ServerHeartbeatTimeoutError 81 | else: 82 | hostname, port = u.read().strip('\n').split(':') 83 | logger.info( 84 | 'Reading hostname/port from a heartbeat file. {h}:{p}'.format( 85 | h=hostname, p=port 86 | ) 87 | ) 88 | return hostname, int(port) 89 | 90 | except ServerHeartbeatTimeoutError: 91 | logger.error( 92 | 'Found a heartbeat file but it has been expired (> timeout)' 93 | '. {f}'.format(f=self._heartbeat_file) 94 | ) 95 | if raise_timeout: 96 | raise 97 | 98 | except Exception: 99 | logger.error( 100 | 'Failed to read from a heartbeat file. {f}'.format( 101 | f=self._heartbeat_file 102 | ) 103 | ) 104 | 105 | def _write_to_file(self, port, hostname=None): 106 | if not hostname: 107 | hostname = socket.gethostname() 108 | 109 | logger.info('Server heartbeat thread started.') 110 | 111 | while True: 112 | try: 113 | logger.debug( 114 | 'Writing heartbeat: {hostname}, {port}'.format( 115 | hostname=hostname, port=port 116 | ) 117 | ) 118 | AutoURI(self._heartbeat_file).write( 119 | '{hostname}:{port}'.format(hostname=hostname, port=port) 120 | ) 121 | except Exception: 122 | logger.error( 123 | 'Failed to write to a heartbeat_file. {f}'.format( 124 | f=self._heartbeat_file 125 | ) 126 | ) 127 | cnt = 0 128 | while cnt < self._interval_update_heartbeat: 129 | cnt += 1 130 | if self._stop_it: 131 | break 132 | time.sleep(1) 133 | if self._stop_it: 134 | break 135 | 136 | logger.info('Server heartbeat thread ended.') 137 | -------------------------------------------------------------------------------- /caper/singularity.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from autouri import AbsPath, AutoURI, URIBase 5 | from autouri.loc_aux import recurse_json 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | DEFAULT_COMMON_ROOT_SEARCH_LEVEL = 5 11 | 12 | 13 | def find_bindpath(json_file, common_root_search_level=DEFAULT_COMMON_ROOT_SEARCH_LEVEL): 14 | """Recursively find paths to be bound for singularity. 15 | Find common roots for all files in an input JSON file. 16 | This function will recursively visit all values in input JSON and 17 | also JSON, TSV, CSV files in the input JSON itself. 18 | 19 | This function visit all files in input JSON. 20 | Files with some extensions (defined by Autouri's URIBase.LOC_RECURSE_EXT_AND_FNC) 21 | are recursively visited. 22 | 23 | Add all (but not too high level<4) parent directories 24 | to all_dirnames. start from original 25 | For example, we have /a/b/c/d/e/f/g/h with common_root_search_level = 5 26 | add all the followings: 27 | /a/b/c/d/e/f/g/h (org) 28 | /a/b/c/d/e/f/g 29 | /a/b/c/d/e/f 30 | /a/b/c/d/e 31 | /a/b/c/d (minimum level = COMMON_ROOT_SEARCH_LEVEL-1) 32 | 33 | Args: 34 | json_file: 35 | Input JSON file which have local paths in it. 36 | Non-path values will be just ignored. 37 | common_root_search_level: 38 | See above description. 
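Example (an illustrative sketch; all paths below are hypothetical):

    bindpaths = find_bindpath('/data/project/run1/input.json')
    # If the input JSON references files under /data/project/run1/fastq/ and /ref/hg38/,
    # the result could look like '/data/project/run1/fastq,/ref/hg38' and can be passed
    # to singularity via --bind (or the SINGULARITY_BINDPATH environment variable).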
39 | """ 40 | json_contents = AutoURI(json_file).read() 41 | all_dirnames = [] 42 | 43 | def find_dirname(s): 44 | u = AbsPath(s) 45 | if u.is_valid: 46 | for ext, recurse_fnc_for_ext in URIBase.LOC_RECURSE_EXT_AND_FNC.items(): 47 | if u.ext == ext: 48 | _, _ = recurse_fnc_for_ext(u.read(), find_dirname) 49 | # file can be a soft-link 50 | # singularity will want to have access to both soft-link and real one 51 | # so add dirnames of both soft-link and realpath 52 | all_dirnames.append(u.dirname) 53 | all_dirnames.append(os.path.dirname(os.path.realpath(u.uri))) 54 | return None, False 55 | 56 | _, _ = recurse_json(json_contents, find_dirname) 57 | 58 | all_dnames_incl_parents = set(all_dirnames) 59 | for d in all_dirnames: 60 | dir_arr = d.split(os.sep) 61 | for i, _ in enumerate(dir_arr[common_root_search_level:]): 62 | d_child = os.sep.join(dir_arr[: i + common_root_search_level]) 63 | all_dnames_incl_parents.add(d_child) 64 | 65 | bindpaths = set() 66 | # remove overlapping directories 67 | for i, d1 in enumerate(sorted(all_dnames_incl_parents, reverse=True)): 68 | overlap_found = False 69 | for j, d2 in enumerate(sorted(all_dnames_incl_parents, reverse=True)): 70 | if i >= j: 71 | continue 72 | if d1.startswith(d2): 73 | overlap_found = True 74 | break 75 | if not overlap_found: 76 | bindpaths.add(d1) 77 | 78 | return ','.join(bindpaths) 79 | -------------------------------------------------------------------------------- /caper/wdl_parser.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | import shutil 5 | from tempfile import TemporaryDirectory 6 | 7 | from autouri import HTTPURL, AbsPath, AutoURI 8 | from WDL import parse_document 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class WDLParser: 14 | RE_WDL_IMPORT = r'^\s*import\s+[\"\'](.+)[\"\']\s*' 15 | RECURSION_DEPTH_LIMIT = 20 16 | BASENAME_IMPORTS = 'imports.zip' 17 | 18 | def __init__(self, wdl): 19 | """Wraps miniwdl's parse_document().""" 20 | u = AutoURI(wdl) 21 | if not u.exists: 22 | raise FileNotFoundError('WDL does not exist: wdl={wdl}'.format(wdl=wdl)) 23 | self._wdl = wdl 24 | self._wdl_contents = AutoURI(wdl).read() 25 | try: 26 | self._wdl_doc = parse_document(self._wdl_contents) 27 | except Exception: 28 | logger.error('Failed to parse WDL with miniwdl.') 29 | self._wdl_doc = None 30 | 31 | @property 32 | def contents(self): 33 | return self._wdl_contents 34 | 35 | @property 36 | def workflow_meta(self): 37 | if self._wdl_doc: 38 | return self._wdl_doc.workflow.meta 39 | 40 | @property 41 | def workflow_parameter_meta(self): 42 | if self._wdl_doc: 43 | return self._wdl_doc.workflow.parameter_meta 44 | 45 | @property 46 | def imports(self): 47 | """Miniwdl (0.3.7) has a bug for URL imports. 48 | Keep using reg-ex to find imports until it's fixed. 49 | Returns: 50 | List of URIs of imported subworkflows. 51 | """ 52 | try: 53 | return [i.uri for i in self._wdl_doc.imports] 54 | except Exception: 55 | pass 56 | return self._find_val_of_matched_lines(WDLParser.RE_WDL_IMPORT) 57 | 58 | def zip_subworkflows(self, zip_file): 59 | """Recursively find/zip imported subworkflow WDLs 60 | This will zip sub-WDLs with relative paths only. 61 | i.e. URIs are ignored. 62 | For this (main) workflow, any URI is allowed. 63 | However, only subworkflows with relative path will be zipped 64 | since there is no way to make directory structure to zip them. 65 | Returns: 66 | Zipped imports file. 
67 | None if no subworkflows recursively found in WDL. 68 | """ 69 | with TemporaryDirectory() as tmp_d: 70 | # localize WDL first. If it's already local 71 | # then will use its original path without loc. 72 | wdl = AutoURI(self._wdl).localize_on(tmp_d) 73 | # keep directory structure as they imported 74 | num_sub_wf_packed = self.__recurse_zip_subworkflows( 75 | root_zip_dir=tmp_d, root_wdl_dir=AutoURI(wdl).dirname 76 | ) 77 | if num_sub_wf_packed: 78 | shutil.make_archive(AutoURI(zip_file).uri_wo_ext, 'zip', tmp_d) 79 | return zip_file 80 | 81 | def create_imports_file(self, directory, basename=BASENAME_IMPORTS): 82 | """Wrapper for zip_subworkflows. 83 | This creates an imports zip file with basename on directory. 84 | """ 85 | zip_file = os.path.join(directory, basename) 86 | if self.zip_subworkflows(zip_file): 87 | return zip_file 88 | 89 | def _find_val_of_matched_lines(self, regex, no_strip=False): 90 | """Find value of the first line matching regex. 91 | Args: 92 | regex: 93 | Regular expression. This should have only one (). 94 | no_strip: 95 | Do not strip result strings. 96 | Returns: 97 | Value of the first line matching regex. 98 | """ 99 | res = [] 100 | for line in self.contents.split('\n'): 101 | r = re.findall(regex, line) 102 | if len(r) > 0: 103 | res.append(r[0] if no_strip else r[0].strip()) 104 | return res 105 | 106 | def __recurse_zip_subworkflows( 107 | self, root_zip_dir, root_wdl_dir, imported_as_url=False, depth=0 108 | ): 109 | """Recurse imported sub-WDLs in main-WDL. 110 | Unlike Cromwell, Womtool does not take imports.zip while validating WDLs. 111 | All sub-WDLs should be in a correct directory structure relative to the 112 | root WDL. 113 | For Womtool, we should make a temporary directory and unpack imports.zip there and 114 | need to make a copy of root WDL on it. Then run Womtool to validate them. 115 | This function is to make such imports.zip. 116 | Sub-WDLs imported as relative path simply inherit parent's directory. 117 | Sub-WDLs imported as URL does not inherit parent's directory but root 118 | WDL's directory. 119 | Sub-WDLs imported as absolute path are not allowed. This can work with "caper run" 120 | but not with "caper submit" (or Cromwell submit). 121 | Args: 122 | depth: Recursion depth 123 | Returns: 124 | Total number of subworkflows: 125 | Sub WDL files "recursively" localized on "root_zip_dir". 126 | """ 127 | if depth > WDLParser.RECURSION_DEPTH_LIMIT: 128 | raise ValueError( 129 | 'Reached recursion depth limit while zipping subworkflows recursively. ' 130 | 'Possible cyclic import or self-refencing in WDLs? wdl={wdl}'.format( 131 | wdl=self._wdl 132 | ) 133 | ) 134 | 135 | if imported_as_url: 136 | main_wdl_dir = root_wdl_dir 137 | else: 138 | main_wdl_dir = AbsPath(self._wdl).dirname 139 | 140 | num_sub_wf_packed = 0 141 | for sub_rel_to_parent in self.imports: 142 | sub_wdl_file = AutoURI(sub_rel_to_parent) 143 | 144 | if isinstance(sub_wdl_file, HTTPURL): 145 | sub_abs = sub_wdl_file.uri 146 | imported_as_url_sub = True 147 | elif isinstance(sub_wdl_file, AbsPath): 148 | raise ValueError( 149 | 'For sub WDL zipping, absolute path is not allowed for sub WDL. ' 150 | 'main={main}, sub={sub}'.format( 151 | main=self._wdl, sub=sub_rel_to_parent 152 | ) 153 | ) 154 | else: 155 | sub_abs = os.path.realpath( 156 | os.path.join(main_wdl_dir, sub_rel_to_parent) 157 | ) 158 | if not AbsPath(sub_abs).exists: 159 | raise FileNotFoundError( 160 | 'Sub WDL does not exist. 
Did you import main WDL ' 161 | 'as a URL but sub WDL references a local file? ' 162 | 'main={main}, sub={sub}, imported_as_url={i}'.format( 163 | main=self._wdl, sub=sub_rel_to_parent, i=imported_as_url 164 | ) 165 | ) 166 | if not sub_abs.startswith(root_wdl_dir): 167 | raise ValueError( 168 | 'Sub WDL exists but it is out of root WDL directory. ' 169 | 'Too many "../" in your sub WDL? ' 170 | 'Or main WDL is imported as an URL but sub WDL ' 171 | 'has "../"? ' 172 | 'main={main}, sub={sub}, imported_as_url={i}'.format( 173 | main=self._wdl, sub=sub_rel_to_parent, i=imported_as_url 174 | ) 175 | ) 176 | 177 | # make a copy on zip_dir 178 | rel_path = os.path.relpath(sub_abs, root_wdl_dir) 179 | cp_dest = os.path.join(root_zip_dir, rel_path) 180 | 181 | AbsPath(sub_abs).cp(cp_dest) 182 | num_sub_wf_packed += 1 183 | imported_as_url_sub = False 184 | 185 | num_sub_wf_packed += WDLParser(sub_abs).__recurse_zip_subworkflows( 186 | root_zip_dir=root_zip_dir, 187 | root_wdl_dir=root_wdl_dir, 188 | imported_as_url=imported_as_url_sub, 189 | depth=depth + 1, 190 | ) 191 | return num_sub_wf_packed 192 | -------------------------------------------------------------------------------- /docs/conf_aws.md: -------------------------------------------------------------------------------- 1 | Deprecated. Please see [this](../scripts/aws_caper_server/README.md) instead. 2 | -------------------------------------------------------------------------------- /docs/conf_encode_workshop_2019.md: -------------------------------------------------------------------------------- 1 | # Welcome to the 2019 ENCODE Users' Meeting Pipeline Workshop 2 | 3 | ## Do this before the workshop 4 | 5 | 0. Register by following instructions in the email you received with the subject "Welcome to Using ENCODE in the Cloud". 6 | 7 | 1. Open a web browser (Chrome, Safari, or Edge - Firefox is not supported) and go to [our workshop server instance on Google Cloud Platform console](https://console.cloud.google.com/compute/instancesDetail/zones/us-west1-b/instances/workshop-server?project=encode-workshop). 8 | 9 | 2. Click on the `SSH` button under `Remote Access`. It may sake several seconds to open a connection to the server instance. 10 | > **WARNING**: If it takes too long (>2 minutes) to log in, then switch to a "Cloud Shell" method. Click on the inverse triangle next to "SSH" button and choose "View gcloud command". Click on "RUN IN CLOUD SHELL" button in the bottom-right corner. Push Enter to execute the copied command line. Answer "Y" to the question. Push Enter twice to pass two questions. 11 | 12 | 3. Set up your server account: Soft-link a shared configuration file. 13 | ```bash 14 | $ mkdir -p ~/.caper && cd ~/.caper && rm -f ~/.caper/default.conf 15 | $ ln -s /opt/code/default.conf default.conf 16 | ``` 17 | 18 | 4. Authenticate yourself to get access to buckets. After running each command, follow the link and copy and paste the authentication key into the console. 19 | ```bash 20 | $ gcloud auth login --no-launch-browser 21 | $ gcloud auth application-default login --no-launch-browser 22 | ``` 23 | 24 | ## To do together during workshop 25 | 26 | > **WARNING**: **USERS SHOULD NOT FOLLOW THE BELOW STEPS BEFORE THE WORKSHOP**. 27 | 28 | 5. Submit a workflow to Caper server. 29 | ```bash 30 | $ caper submit /opt/code/rna-seq-pipeline/rna-seq-pipeline.wdl -i gs://encode-workshop-samples/rna-seq-pipeline/input_workshop_example_SSD.json 31 | # you will see the following message. 
make sure to remember the workflow_id 32 | # in this example, the workflow_id is f7094621-3d38-48a6-b877-1da2b0cec931 33 | [Caper] submit: {'id': 'f7094621-3d38-48a6-b877-1da2b0cec931', 'status': 'Submitted'} 34 | ``` 35 | 36 | 6. Make sure to remember the `workflow_id` of your submitted workflow. You can monitor workflows with: 37 | ```bash 38 | $ caper list [WORKFLOW_ID] 39 | 40 | # you can also find it by your username 41 | 42 | $ caper list | grep $USER 43 | ``` 44 | 45 | 7. Once your workflow is done (marked as `Succeeded`), retrieve a `metadata.json` with the following command: 46 | ```bash 47 | $ caper metadata [WORKFLOW_ID] > metadata.json 48 | ``` 49 | 50 | 8. Run Croo with the retrieved `metadata.json` to organize outputs on `--out-dir`. 51 | ```bash 52 | $ croo metadata.json --out-dir gs://encode-workshop-croo/$USER --out-def-json /opt/code/rna-seq-pipeline/output_definition.json 53 | ``` 54 | 55 | 9. Open a web browser and go to [Google Cloud Storage console](https://console.cloud.google.com/storage/browser/encode-workshop-croo/?project=encode-workshop&folder=true&organizationId=true). 56 | 57 | 10. Navigate to your organized output directory under your username. For example, `gs://encode-workshop-croo/[YOUR_USER_NAME]/`. Click on an HTML file and you will see a file table summarizing all outputs with descriptions. Find any bigwig file in it and copy its URL. That URL will be public, so you can use it to visualize the track with your preferred genome browser (for example, you can use [this one](http://epigenomegateway.wustl.edu/legacy/)). 58 | 59 | ## To be done by admins 60 | 61 | 0. Run Croo with the retrieved `metadata.json` to organize outputs locally. 62 | ```bash 63 | $ cd /srv/scratch 64 | $ mkdir -p test_croo && cd test_croo 65 | 66 | $ caper metadata [WORKFLOW_ID] > metadata.json 67 | $ croo metadata.json --out-def-json /opt/code/rna-seq-pipeline/output_definition.json 68 | ``` 69 | 70 | 71 | 72 | ## Setting up a Caper server instance (ADMIN ONLY) 73 | 74 | This example sets up a server instance for the ENCODE workshop 2019 at Seattle, but it should also be helpful for setting up your own server instance. 75 | 76 | > **WARNING**: This section is for admins only. **USERS SHOULD NOT FOLLOW THE BELOW STEPS ON THE INSTANCE**. 77 | 78 | 1. Create an instance with Debian-based Linux (e.g. Ubuntu). Minimum requirements for the server are CPU >= 4 and memory > 16GB. 79 | 80 | 2. Install software: Caper (Cromwell wrapper) and Croo (Cromwell output organizer). 81 | ```bash 82 | $ sudo apt-get update && sudo apt-get install -y default-jdk acl python3 python3-pip git wget curl htop 83 | $ sudo pip3 install caper croo 84 | ``` 85 | 86 | 3. Clone pipeline code and share it with users. This example will install the ENCODE RNA-Seq and Demo pipelines on `/opt/code`. 87 | ```bash 88 | $ sudo mkdir /opt/code 89 | $ sudo chown $USER:$USER /opt/code 90 | $ cd /opt/code 91 | $ git clone https://github.com/ENCODE-DCC/rna-seq-pipeline 92 | $ git clone https://github.com/ENCODE-DCC/demo-pipeline 93 | ``` 94 | 95 | 4. Authenticate yourself. 96 | ```bash 97 | $ gcloud auth login --no-launch-browser 98 | $ gcloud auth application-default login --no-launch-browser 99 | ``` 100 | 101 | 5. Create a scratch directory for Caper. Any subdirectories under `/srv/scratch` will inherit permissions from their parent directory. 
102 | ```bash 103 | $ sudo mkdir /srv/scratch 104 | $ sudo chown $USER:$USER /srv/scratch 105 | $ sudo chmod 777 /srv/scratch 106 | $ sudo setfacl -d -m u::rwx /srv/scratch 107 | $ sudo setfacl -d -m g::rwx /srv/scratch 108 | $ sudo setfacl -d -m o::rwx /srv/scratch 109 | ``` 110 | 111 | 6. Create a Caper configuration file, which will be shared with all users. 112 | ```bash 113 | $ touch /opt/code/default.conf 114 | ``` 115 | 116 | 7. Edit the shared configuration file `/opt/code/default.conf`. You can comment out settings for the ENCODE workshop 2019 and uncomment/define your own `gcp-prj`, `tmp-gcs-bucket` and `out-gcs-bucket`. 117 | ```bash 118 | [defaults] 119 | cromwell=/opt/code/cromwell-42.jar 120 | java-heap-server=8G 121 | 122 | backend=gcp 123 | 124 | out-dir=/srv/scratch/caper_out 125 | tmp-dir=/srv/scratch/caper_tmp 126 | 127 | #gcp-prj=[YOUR_GOOGLE_PROJECT] 128 | gcp-prj=encode-workshop 129 | 130 | #out-gcs-bucket=[YOUR_OUTPUT_BUCKET_FOR_CAPER] 131 | #tmp-gcs-bucket=[YOUR_TMP_BUCKET_FOR_CAPER] 132 | out-gcs-bucket=gs://encode-workshop-outputs/caper_out 133 | tmp-gcs-bucket=gs://encode-workshop-outputs/caper_tmp 134 | 135 | max-concurrent-workflows=100 136 | ``` 137 | 138 | 8. Download the Cromwell 42 JAR and share it with all users. 139 | ```bash 140 | $ cd /opt/code 141 | $ wget https://github.com/broadinstitute/cromwell/releases/download/42/cromwell-42.jar 142 | ``` 143 | 144 | 9. Soft-link the shared configuration file. 145 | ```bash 146 | $ mkdir -p ~/.caper && cd ~/.caper 147 | $ ln -s /opt/code/default.conf default.conf 148 | ``` 149 | 150 | 10. Create Caper's output bucket `gs://encode-workshop-outputs`. 151 | 152 | 11. Make the bucket public by adding a `Storage Object Viewer` role for `allUsers` to the bucket. This will allow public HTTP access to all files on the bucket, which will be used to visualize some of the pipeline outputs (e.g. bigwigs) on a genome browser. 153 | 154 | 12. Give write permission to **ALL WORKSHOP PARTICIPANTS** (not to all public users). Add the `Storage Object Creator` role to all participants. This gives all participants write access to the Caper tmp directory `gs://encode-workshop-outputs/caper_tmp` so that `--deepcopy` does not make duplicate files on the shared bucket. It also gives them write access to `gs://encode-workshop-outputs/croo` so that their organized outputs generated by Croo will be written to that bucket directory. 155 | 156 | 13. Run a Caper server. 157 | ```bash 158 | $ caper server 159 | ``` 160 | 161 | 14. Make all buckets public (read access for anyone). 162 | 163 | 15. Give users the following IAM Roles: 164 | 165 | 1) For the whole project 166 | - Compute Engine > Compute Instance Admin (v1) 167 | - Compute Engine > Compute OS Login 168 | - Service Account > Service Account User 169 | 170 | 2) For the croo bucket (`gs://encode-workshop-croo`) 171 | - Storage Object Admin 172 | -------------------------------------------------------------------------------- /docs/conf_gcp.md: -------------------------------------------------------------------------------- 1 | Deprecated. Please see [this](../scripts/gcp_caper_server/README.md) instead. 2 | 3 | # DEPRECATED 4 | 5 | # Configuration for Google Cloud Platform backend (`gcp`) 6 | 7 | 1. Sign up for a Google account. 8 | 2. Go to the [Google Project](https://console.developers.google.com/project) page, click "SIGN UP FOR FREE TRIAL" on the top left and agree to the terms. 9 | 3. Set up a payment method and click "START MY FREE TRIAL". 10 | 4. 
Create a [Google Project](https://console.developers.google.com/project) `[YOUR_PROJECT_NAME]` and choose it on the top of the page. 11 | 5. Create a [Google Cloud Storage bucket](https://console.cloud.google.com/storage/browser) `gs://[YOUR_BUCKET_NAME]` by clicking on a button "CREATE BUCKET" and create it to store pipeline outputs. 12 | 6. Find and enable following APIs in your [API Manager](https://console.developers.google.com/apis/library). Click a back button on your web brower after enabling each. 13 | * Compute Engine API 14 | * Google Cloud Storage (DO NOT click on "Create credentials") 15 | * Google Cloud Storage JSON API 16 | * Genomics API 17 | * **Google Cloud Life Sciences API** (for Cromwell's new API, i.e. `--use-google-cloud-life-sciences`) 18 | 19 | 7. Install [Google Cloud Platform SDK](https://cloud.google.com/sdk/downloads) and authenticate through it. You will be asked to enter verification keys. Get keys from the URLs they provide. 20 | ```bash 21 | $ gcloud auth login --no-launch-browser 22 | $ gcloud auth application-default login --no-launch-browser 23 | ``` 24 | 25 | 8. If you see permission errors at runtime, then unset environment variable `GOOGLE_APPLICATION_CREDENTIALS` or add it to your BASH startup scripts (`$HOME/.bashrc` or `$HOME/.bash_profile`). 26 | ```bash 27 | unset GOOGLE_APPLICATION_CREDENTIALS 28 | ``` 29 | 30 | 7. Set your default Google Cloud Project. Pipeline will provision instances on this project. 31 | ```bash 32 | $ gcloud config set project [YOUR_PROJECT_NAME] 33 | ``` 34 | 35 | # Setting up a Caper server instance 36 | 37 | You will find [this](./conf_encode_workshop_2019.md) useful to set up your own Caper server on Google Cloud Platform. 38 | 39 | # How to run Caper with a service account 40 | 41 | On your Google Cloud Console, create a service account (`IAM & Admin` -> `Service Accounts`) with the following roles. You can add roles later in `IAM & Admin` -> `IAM`. 42 | * Service Account User 43 | * Compute Admin 44 | * Genomics Admin 45 | * **Cloud Life Sciences Admin** (for Cromwell's new API, i.e. `--use-google-cloud-life-sciences`) 46 | * Storage Admin (or set it up for an individual bucket) 47 | 48 | Create a secret key JSON file for your service account. Make sure that your service account has enough permission for provionsing VM instances and write permission on output/work Google Cloud Storage buckets (`--gcp-out-dir` and `--gcp-work-dir`). 49 | 50 | > **IMPORTANT**: Click on the created service account and make sure that `Enable G Suite Domain-wide Delegation` is checked to prevent the following permission error. 51 | 52 | ``` 53 | 400 Bad Request 54 | POST https://lifesciences.googleapis.com/v2beta/projects/99884963860/locations/us-central1/operations/XXXXXXXXXXXXXXXXXXXX:cancel 55 | { 56 | "code" : 400, 57 | "errors" : [ { 58 | "domain" : "global", 59 | "message" : "Precondition check failed.", 60 | "reason" : "failedPrecondition" 61 | } ], 62 | "message" : "Precondition check failed.", 63 | "status" : "FAILED_PRECONDITION" 64 | } 65 | ``` 66 | -------------------------------------------------------------------------------- /docs/resource_param.md: -------------------------------------------------------------------------------- 1 | # Resource parameters for HPC backends (slurm, sge, pbs, lsf) 2 | 3 | Note that Cromwell's implicit type conversion (`String` to `Integer`) seems to be buggy for WomLong type memory variables (`memory_mb` and `memory_gb`). 
So be careful about using the `+` operator between `WomLong` and other types (`String`, even `Int`). See https://github.com/broadinstitute/cromwell/issues/4659 4 | 5 | For example, `${"--mem=" + memory_mb}` will not work since `memory_mb` is `WomLong`. Use `${"if defined(memory_mb) then "--mem=" else ""}{memory_mb}${"if defined(memory_mb) then "mb " else " "}`. 6 | 7 | You can use Cromwell's built-in variables (attributes defined in WDL task's runtime) within Cromwell's `${}` notation. 8 | - `cpu`: Number of cores for a job (default = 1) 9 | - `memory_mb`, `memory_gb`: Total memory for a job in MB or GB. These are converted from 'memory' string attribute (including size unit) 10 | defined in WDL task's runtime 11 | - `time`: Time limit for a job in hour 12 | - `gpu`: Specified gpu name or number of gpus (it's declared as String) 13 | 14 | # How to configure resource parameters on HPCs 15 | 16 | Open `~/.caper/default.conf` with a text editor and add the following code lines according to your HPC type. Following commented instructions to customize resource parameters of HPC's submit/monitor/delete commands. 17 | 18 | ## SLURM 19 | 20 | ```ini 21 | # This parameter defines resource parameters for Caper's leader job only. 22 | slurm-leader-job-resource-param=-t 48:00:00 --mem 4G 23 | 24 | # This parameter defines resource parameters for submitting WDL task to job engine. 25 | # It is for HPC backends only (slurm, sge, pbs and lsf). 26 | # It is not recommended to change it unless your cluster has custom resource settings. 27 | # See https://github.com/ENCODE-DCC/caper/blob/master/docs/resource_param.md for details. 28 | slurm-resource-param=-n 1 --ntasks-per-node=1 --cpus-per-task=${cpu} ${if defined(memory_mb) then "--mem=" else ""}${memory_mb}${if defined(memory_mb) then "M" else ""} ${if defined(time) then "--time=" else ""}${time*60} ${if defined(gpu) then "--gres=gpu:" else ""}${gpu} 29 | 30 | ``` 31 | 32 | ## SGE 33 | 34 | ```ini 35 | # This parameter defines resource parameters for Caper's leader job only. 36 | sge-leader-job-resource-param=-l h_rt=48:00:00,h_vmem=4G 37 | 38 | # Parallel environment of SGE: 39 | # Find one with `$ qconf -spl` or ask you admin to add one if not exists. 40 | # If your cluster works without PE then edit the below sge-resource-param 41 | sge-pe= 42 | 43 | # This parameter defines resource parameters for submitting WDL task to job engine. 44 | # It is for HPC backends only (slurm, sge, pbs and lsf). 45 | # It is not recommended to change it unless your cluster has custom resource settings. 46 | # See https://github.com/ENCODE-DCC/caper/blob/master/docs/resource_param.md for details. 47 | sge-resource-param=${if cpu > 1 then "-pe " + sge_pe + " " else ""} ${if cpu > 1 then cpu else ""} ${true="-l h_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} ${true="-l s_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} ${"-l h_rt=" + time + ":00:00"} ${"-l s_rt=" + time + ":00:00"} ${"-l gpu=" + gpu} 48 | ``` 49 | 50 | ## PBS 51 | 52 | ```ini 53 | # This parameter defines resource parameters for Caper's leader job only. 54 | pbs-leader-job-resource-param=-l walltime=48:00:00,mem=4gb 55 | 56 | # This parameter defines resource parameters for submitting WDL task to job engine. 
57 | # It is for HPC backends only (slurm, sge, pbs and lsf). 58 | # It is not recommended to change it unless your cluster has custom resource settings. 59 | # See https://github.com/ENCODE-DCC/caper/blob/master/docs/resource_param.md for details. 60 | pbs-resource-param=${"-lnodes=1:ppn=" + cpu}${if defined(gpu) then ":gpus=" + gpu else ""} ${if defined(memory_mb) then "-l mem=" else ""}${memory_mb}${if defined(memory_mb) then "mb" else ""} ${"-lwalltime=" + time + ":0:0"} 61 | ``` 62 | 63 | ## LSF 64 | 65 | ```ini 66 | # This parameter defines resource parameters for Caper's leader job only. 67 | lsf-leader-job-resource-param=-W 2880 -M 4g 68 | 69 | # This parameter defines resource parameters for submitting WDL task to job engine. 70 | # It is for HPC backends only (slurm, sge, pbs and lsf). 71 | # It is not recommended to change it unless your cluster has custom resource settings. 72 | # See https://github.com/ENCODE-DCC/caper/blob/master/docs/resource_param.md for details. 73 | lsf-resource-param=${"-n " + cpu} ${if defined(gpu) then "-gpu " + gpu else ""} ${if defined(memory_mb) then "-M " else ""}${memory_mb}${if defined(memory_mb) then "m" else ""} ${"-W " + 60*time} 74 | ``` 75 | -------------------------------------------------------------------------------- /scripts/aws_caper_server/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | `create_instance.sh` will create a new Caper server instance on your AWS EC2 region and configure the instance for Cromwell with PostgreSQL database. 4 | 5 | 6 | ## AWS account 7 | 8 | 1. Sign up for an [AWS account](https://aws.amazon.com/account/). 9 | 2. Make sure that your account has full permission on two services (S3 and EC2). 10 | 3. Configure your AWS CLI. Enter key, secret (password) and region (**IMPORTANT**) obtained from your account's IAM. 11 | ```bash 12 | $ aws configure 13 | ``` 14 | 15 | ## VPC 16 | 17 | 1. Click on [this]( 18 | https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=GenomicsVPC&templateURL=https://aws-quickstart.s3.amazonaws.com/quickstart-aws-vpc/templates/aws-vpc.template.yaml) to create a new AWS VPC. Make sure that the region on top right corner of the console page matches with your region of interest. Click on `Next` and then `Next` again. Agree to `Capabililties`. Click on `Create stack`. 19 | 2. Choose available zones in `Availability Zones`. For example, if your region is `us-west-2`, then you will see `us-west-2a`, `us-west-2b` and `us-west-2c`. 20 | 21 | 22 | ## AWS Batch 23 | 24 | 1. Click on [this]( 25 | https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=gwfcore&templateURL=https://caper-aws-genomics-workflows.s3-us-west-2.amazonaws.com/templates/gwfcore/gwfcore-root.template.yaml) to create a new AWS Batch. Make sure that the region on top right corner of the console page matches with your region of interest. Click on `Next`. 26 | 2. There are several required parameters to be specified on this page 27 | - `S3 Bucket name`: S3 bucket name to store your pipeline outputs. This is not a full path for the output directory. It's just bucket's name without the scheme prefix `s3://`. Make sure that this bucket doesn't exist. If it exists then delete it or try with a different non-existing bucket name. 28 | - `VPC ID`: Choose the VPC `GenomicsVPC` that you just created. 29 | - `VPC Subnet IDs`: Choose all private subnets created with the above VPC. 
30 | - `Max vCPUs for Default Queue`: Maximum total number of CPUs for the spot instance queue. It's 4000 by default, which is huge already. But if you use more CPUs than this limit then your jobs will be stuck at `RUNNABLE` status. 31 | - `Max vCPUs for Priority Queue`: Maximum total number of CPUs for the on-demand instance queue. It's 4000 by default, which is huge already. But if you use more CPUs than this limit then your jobs will be stuck at `RUNNABLE` status. 32 | 3. Click on `Next` and then `Next` again. Agree to `Capabililties`. Click on `Create stack`. 33 | 4. Go to your [AWS Batch](https://console.aws.amazon.com/batch) and click on `Job queues` in the left sidebar. You will see two Job Queues (`priority-*` and `default-*`). There has been some issues with the default one which is based on spot instances. Spot instances are interrupted quite often and Cromwell doesn't seem to handle it properly. We recommend to use `priority-*` queue even though it costs a bit more than spot instances. Click on the chosen job queue and get ARN of it. This ARN will be used later to create Caper server instance. 34 | 35 | 36 | ## How to create a server instance 37 | 38 | Run without parameters to see detailed help. 39 | ```bash 40 | $ bash create_instance.sh 41 | ``` 42 | 43 | Try with the positional arguments only first and see if it works. 44 | ```bash 45 | $ bash create_instance.sh [INSTANCE_NAME] [AWS_REGION] [PUBLIC_SUBNET_ID] [AWS_BATCH_ARN] [KEY_PAIR_NAME] [AWS_OUT_DIR] 46 | ``` 47 | 48 | - `AWS_REGION`: Your AWS region. e.g. `us-west-2`. Make sure that it matches with `region` in your AWS credentials file `$HOME/.aws/credentials`. 49 | - `PUBLIC_SUBNET_ID`: Click on `Services` on AWS Console and Choose `VPC`. Click on `Subnets` on the left sidebar and find `Public subnet 1` under your VPC created from the above instruction. 50 | - `AWS_BATCH_ARN`: ARN of the AWS Batch created from the above instruction. Double-quote the whole ARN since it includes `:`. 51 | - `KEY_PAIR_NAME`: Click on `Services` on AWS Console and Choose `EC2`. Choose `Key Pairs` on the left sidebar and create a new key pair (in `.pem` format). Take note of the key name and keep the `.pem` key file on a secure directory where you want to SSH to the instance from. You will need it later when you SSH to the instancec. 52 | - `AWS_OUT_DIR`: Full output directory path starting with the bucket name you used in the above instruction. This directory should start with `s3://`. e.g. `s3://caper-server-out-bucket/out`. 53 | 54 | Go to the AWS Console and Click on `Services` on AWS Console and Choose `EC2`. Click on `Instances` on the left sidebar and find the created instance. Click on the instance. 55 | 56 | Click on `Security` and find `Security groups`. Click on the security group. Add an inbound rule. Choose type `SSH` and define CIDR for your IP range. Setting it to `0.0.0.0/0` will open the VPC to the world. 57 | 58 | > **IMPORTANT**: It is a default security group for the VPC so use it at your own risk. It's recommended to calculate CIDR for your computer/company and use it here. 59 | 60 | Go back to `Instances` on the console and find the server instance. Get the command line to SSH to it. Make sure that you have the `.pem` key file on your local computer. 61 | 62 | Connect to the instance and wait until `caper -v` works. Allow 20-30 minutes for Caper installation. 63 | ```bash 64 | $ caper -v 65 | ``` 66 | 67 | Authenticate yourself for AWS services. 
68 | ```bash 69 | $ sudo su 70 | $ aws configure 71 | # enter your AWS credential and region (IMPORTANT) 72 | ``` 73 | 74 | Run Caper server. 75 | ```bash 76 | # cd to caper's main directory 77 | $ sudo su 78 | $ cd /opt/caper 79 | $ screen -dmS caper_server bash -c "caper server > caper_server.log 2>&1" 80 | ``` 81 | 82 | ## How to stop Caper server 83 | 84 | On the instance, attach to the existing screen `caper_server`, stop it with Ctrl + C. 85 | ```bash 86 | $ sudo su # log-in as root 87 | $ screen -r caper_server # attach to the screen 88 | # in the screen, press Ctrl + C to send SIGINT to Caper 89 | ``` 90 | 91 | ## How to start Caper server 92 | 93 | On the instance, make a new screen `caper_server`. 94 | ```bash 95 | $ sudo su 96 | $ cd /opt/caper 97 | $ screen -dmS caper_server bash -c "caper server > caper_server.log 2>&1" 98 | ``` 99 | 100 | ## How to submit a workflow 101 | 102 | For the first log-in, authenticate yourself to get permission to read/write on the output S3 bucket. This is to localize any external URIs (defined in an input JSON) on the output S3 bucket's directory with suffix `.caper_tmp/`. Make sure that you have full permission on the output S3 bucket. 103 | ```bash 104 | $ aws configure 105 | # enter your AWS credential and correct region (IMPORTANT) 106 | ``` 107 | 108 | Check if `caper list` works without any network errors. 109 | ```bash 110 | $ caper list 111 | ``` 112 | 113 | Submit a workflow. 114 | ```bash 115 | $ caper submit [WDL] -i input.json ... 116 | ``` 117 | 118 | Caper will localize big data files on a S3 bucket directory `--aws-loc-dir` (or `aws-loc-dir` in the Caper conf file), which defaults to `[AWS_OUT_DIR]/.caper_tmp/` if not defined. e.g. your FASTQs and reference genome data defined in an input JSON. 119 | 120 | 121 | ## Using S3 URIs in input JSON 122 | 123 | **VERY IMPORTANT!** 124 | 125 | Caper localizes input files on output S3 bucket path + `./caper_tmp` if they are given as non-S3 URIs (e.g. `gs://example/ok.txt`, `http://hello,com/a.txt`, `/any/absolute/path.txt`). However if S3 URIs are given in an input JSON then Caper will not localize them and will directly pass them to Cromwell. However, Cromwell is very picky about **region** and **permission**. 126 | 127 | First of all **PLEASE DO NOT USE ANY EXTERNAL S3 FILES OUT OF YOUR REGION**. Call-caching will not work for those external files. For example, if your Caper server resides on `us-west-2` and you want to use a Broad reference file `s3://broad-references/hg38/v0/Homo_sapiens_assembly38.dict`. All broad data are on `us-east-1` so call-caching will never work. 128 | 129 | Another example is ENCODE portal's file. [This FASTQ file](`https://www.encodeproject.org/files/ENCFF641SFZ/`) has a public S3 URI in metadata, which is `s3://encode-public/2017/01/27/92e9bb3b-bc49-43f4-81d9-f51fbc5bb8d5/ENCFF641SFZ.fastq.gz`. All ENCODE portal's data are on `us-west-2`. Call-caching will not work other regions. It's recommended to directly use the URL of this file `https://www.encodeproject.org/files/ENCFF641SFZ/@@download/ENCFF641SFZ.fastq.gz` in input JSON. 130 | 131 | **DO NOT USE S3 FILES ON A PRIVATE BUCKET**. Job instances will not have access to those private files even though the server instance has one (with your credentials configured with `aws configure`). For example, ENCODE portal's unreleased files are on a private bucket `s3://encode-priavte`. Jobs will always fail if you use these private files. 
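If you are not sure which region a bucket is in, it is worth checking before putting its S3 URIs in an input JSON. Below is a quick, optional sanity check with the AWS CLI; the bucket/object shown are just the ENCODE examples from above, and note that an empty/`null` `LocationConstraint` means `us-east-1`.
```bash
# Check the bucket's region; it should match your Caper server's region (e.g. us-west-2).
$ aws s3api get-bucket-location --bucket encode-public

# Check that the object is actually readable with your credentials.
$ aws s3 ls s3://encode-public/2017/01/27/92e9bb3b-bc49-43f4-81d9-f51fbc5bb8d5/ENCFF641SFZ.fastq.gz
```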
132 | 133 | If S3 files in an input JSON are public in the same region then check if you have `s3:GetObjectAcl` permission on the file. 134 | ```bash 135 | $ aws s3api get-object-acl --bucket encode-public --key 2017/01/27/92e9bb3b-bc49-43f4-81d9-f51fbc5bb8d5/ENCFF641SFZ.fastq.gz 136 | { 137 | "Owner": { 138 | "DisplayName": "encode-data", 139 | "ID": "50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c311840523f208e7591d" 140 | }, 141 | "Grants": [ 142 | { 143 | "Grantee": { 144 | "DisplayName": "encode-aws", 145 | "ID": "a0dd0872acae5121b64b11c694371e606e28ab2e746e180ec64a2f85709eb0cd", 146 | "Type": "CanonicalUser" 147 | }, 148 | "Permission": "FULL_CONTROL" 149 | }, 150 | { 151 | "Grantee": { 152 | "Type": "Group", 153 | "URI": "http://acs.amazonaws.com/groups/global/AllUsers" 154 | }, 155 | "Permission": "READ" 156 | } 157 | ] 158 | } 159 | ``` 160 | If you get `403 Permission denied` then call-caching will not work. 161 | 162 | To avoid all permission/region problems, please use non-S3 URIs/URLs. 163 | 164 | 165 | ## References 166 | 167 | https://docs.opendata.aws/genomics-workflows/orchestration/cromwell/cromwell-overview.html 168 | 169 | 170 | ## Troubleshooting 171 | 172 | See [this] for troubleshooting. 173 | -------------------------------------------------------------------------------- /scripts/aws_caper_server/TROUBLESHOOTING.md: -------------------------------------------------------------------------------- 1 | ## Troubleshooting 2 | 3 | Run `caper debug WORKFLOW_ID` to debug/troubleshoot a workflow. 4 | 5 | 6 | 7 | 8 | 9 | 10 | ### `Could not read from s3...` 11 | 12 | 13 | If you use private S3 URIs in an input JSON then you will see this error. Please don't use any private S3 URIs. Get a presigned HTTP URL of the private bucket file or use `~/.netrc` authentication instead. 14 | 15 | ```javascript 16 | "failures": [ 17 | { 18 | "causedBy": [ 19 | { 20 | "causedBy": [ 21 | { 22 | "message": "s3://s3.amazonaws.com/encode-processing/test_without_size_call/5826859d-d07c-4749-a2fe-802c6c6964a6/call-get_b/get_b-rc.txt", 23 | "causedBy": [] 24 | } 25 | ], 26 | "message": "Could not read from s3://encode-processing/test_without_size_call/5826859d-d07c-4749-a2fe-802c6c6964a6/call-get_b/get_b-rc.txt: s3://s3.amazonaws.com/encode-processing/test_without_size_call/5826859d-d07c-4749-a2fe-802c6c6964a6/call-get_b/get_b-rc.txt" 27 | } 28 | ], 29 | "message": "[Attempted 1 time(s)] - IOException: Could not read from s3://encode-processing/test_without_size_call/5826859d-d07c-4749-a2fe-802c6c6964a6/call-get_b/get_b-rc.txt: s3://s3.amazonaws.com/encode-processing/test_without_size_call/5826859d-d07c-4749-a2fe-802c6c6964a6/call-get_b/get_b-rc.txt" 30 | } 31 | ], 32 | ``` 33 | 34 | If you still see this error, then please try with the `priority` queue instead of `default` queue. Go to AWS Batch on your AWS Console and click on Job Queues. Get ARN of the `priority-*` queue and define it for `aws-batch-arn=` in your Caper conf (`~/.caper/default.conf`). The `default` queue is based on spot instances and they seem to be interrupted quite often and Cromwell doesn't handle it properly. 35 | 36 | 37 | 38 | ### `S3Exception: null (Service: S3, Status Code: 301)` 39 | 40 | If you use S3 URIs in an input JSON which are in a different region, then you will see `301 Error`. Please don't use S3 URIs out of your region. 
It's better to 41 | 42 | ```javascript 43 | "callCaching": { 44 | "hashFailures": [ 45 | { 46 | "causedBy": [ 47 | { 48 | "message": "null (Service: S3, Status Code: 301, Request ID: null, Extended Request ID: MpqH6PrTGZwXu2x5pt8H38VWqnrpWWT7nzH/fZtbiEIKJkN9qrB2koEXlmXAYdvehvAfy5yQggE=)", 49 | "causedBy": [] 50 | } 51 | ], 52 | "message": "[Attempted 1 time(s)] - S3Exception: null (Service: S3, Status Code: 301, Request ID: null, Extended Request ID: MpqH6PrTGZwXu2x5pt8H38VWqnrpWWT7nzH/fZtbiEIKJkN9qrB2koEXlmXAYdvehvAfy5yQggE=)" 53 | } 54 | ], 55 | "allowResultReuse": false, 56 | "hit": false, 57 | "result": "Cache Miss", 58 | "effectiveCallCachingMode": "CallCachingOff" 59 | } 60 | ``` 61 | 62 | 63 | ### `S3Exception: null (Service: S3, Status Code: 400)` 64 | 65 | If you see `400` error then please use this shell script `./create_instance.sh` to create an instance instead of running Caper server on your laptop/machine. 66 | 67 | 68 | ### Tasks (jobs) are stuck at RUNNABLE status 69 | 70 | Go to `Job Queues` in `AWS Batch` on your AWS console and find your job queue (default or priority) that matches with the ARN in your Caper conf. Edit the queue and increase number of maximum vCPUs. 71 | -------------------------------------------------------------------------------- /scripts/aws_caper_server/create_instance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | 4 | if [[ $# -lt 1 ]]; then 5 | echo "Automated shell script to create Caper server instance with PostgreSQL on AWS." 6 | echo 7 | echo "Usage: ./create_instance.sh [INSTANCE_NAME] [AWS_REGION] [PUBLIC_SUBNET_ID] " 8 | echo " [AWS_BATCH_ARN] [KEY_PAIR_NAME] [AWS_OUT_DIR]" 9 | echo " " 10 | echo 11 | echo "Positional arguments:" 12 | echo " [INSTANCE_NAME]: New instance's name (tag)." 13 | echo " [AWS_REGION]: Region for AWS. us-east-1 by default." 14 | echo " [PUBLIC_SUBNET_ID]: Public subnet ID. ID of \"Public subnet 1A\" under your VPC. e.g. subnet-0d9e1116acXXXXXXX." 15 | echo " [AWS_BATCH_ARN]: AWS Batch Queue's ARN. --aws-batch-arn in Caper. Choose a default queue. e.g. arn:aws:batch:us-east-1:..." 16 | echo " [KEY_PAIR_NAME]: AWS EC2 key pair name." 17 | echo " [AWS_OUT_DIR]: s3:// bucket dir path for outputs. --aws-out-dir in Caper." 18 | echo 19 | echo "Optional arguments for Caper:" 20 | echo " -l, --aws-loc-dir: s3:// bucket dir path for localization." 21 | echo " --postgresql-db-ip: localhost by default." 22 | echo " --postgresql-db-port: 5432 by default." 23 | echo " --postgresql-db-user: cromwell by default." 24 | echo " --postgresql-db-password: cromwell by default." 25 | echo " --postgresql-db-name: cromwell by default." 26 | echo 27 | echo "Optional arguments for instance creation (gcloud compute instances create):" 28 | echo " -i, --instance-type: Instance type. t2.xlarge by default." 29 | echo " -b, --boot-disk-size: Boot disk size in GB. DO NOT USE ANY SIZE UNIT." 30 | echo " --boot-disk-device-name: Boot disk type. /dev/sda1 by default." 31 | echo " --ami-name-search-query: Operating system for the image. \"Ubuntu 18.04 LTS\" by default. Caper server requires Ubuntu/Debian based OS. Check https://docs.aws.amazon.com/opsworks/latest/userguide/workinginstances-os.html" 32 | echo 33 | 34 | if [[ $# -lt 6 ]]; then 35 | echo "Define all positional arguments." 36 | fi 37 | exit 1 38 | fi 39 | 40 | # parse opt args first. 
41 | POSITIONAL=() 42 | while [[ $# -gt 0 ]]; do 43 | key="$1" 44 | case $key in 45 | -l|--aws-loc-dir) 46 | AWS_LOC_DIR="$2" 47 | shift 48 | shift 49 | ;; 50 | --postgresql-db-ip) 51 | POSTGRESQL_DB_IP="$2" 52 | shift 53 | shift 54 | ;; 55 | --postgresql-db-port) 56 | POSTGRESQL_DB_PORT="$2" 57 | shift 58 | shift 59 | ;; 60 | --postgresql-db-user) 61 | POSTGRESQL_DB_USER="$2" 62 | shift 63 | shift 64 | ;; 65 | --postgresql-db-password) 66 | POSTGRESQL_DB_PASSWORD="$2" 67 | shift 68 | shift 69 | ;; 70 | --postgresql-db-name) 71 | POSTGRESQL_DB_NAME="$2" 72 | shift 73 | shift 74 | ;; 75 | -i|--instance-type) 76 | INSTANCE_TYPE="$2" 77 | shift 78 | shift 79 | ;; 80 | -b|--boot-disk-size) 81 | BOOT_DISK_SIZE="$2" 82 | shift 83 | shift 84 | ;; 85 | --boot-disk-device-name) 86 | BOOT_DISK_DEVICE_NAME="$2" 87 | shift 88 | shift 89 | ;; 90 | --ami-name-search-query) 91 | AMI_NAME_SEARCH_QUERY="$2" 92 | shift 93 | shift 94 | ;; 95 | -*) 96 | echo "Wrong parameter: $1." 97 | shift 98 | exit 1 99 | ;; 100 | *) 101 | POSITIONAL+=("$1") 102 | shift 103 | ;; 104 | esac 105 | done 106 | 107 | # restore pos args. 108 | set -- "${POSITIONAL[@]}" 109 | 110 | # parse pos args. 111 | INSTANCE_NAME="$1" 112 | AWS_REGION="$2" 113 | PUBLIC_SUBNET_ID="$3" 114 | AWS_BATCH_ARN="$4" 115 | KEY_PAIR_NAME="$5" 116 | AWS_OUT_DIR="$6" 117 | 118 | # set defaults for opt args. (caper) 119 | if [[ -z "$AWS_LOC_DIR" ]]; then 120 | AWS_LOC_DIR="$AWS_OUT_DIR"/.caper_tmp 121 | fi 122 | if [[ -z "$AWS_REGION" ]]; then 123 | AWS_REGION=us-east-1 124 | fi 125 | if [[ -z "$POSTGRESQL_DB_IP" ]]; then 126 | POSTGRESQL_DB_IP=localhost 127 | fi 128 | if [[ -z "$POSTGRESQL_DB_PORT" ]]; then 129 | POSTGRESQL_DB_PORT=5432 130 | fi 131 | if [[ -z "$POSTGRESQL_DB_USER" ]]; then 132 | POSTGRESQL_DB_USER=cromwell 133 | fi 134 | if [[ -z "$POSTGRESQL_DB_PASSWORD" ]]; then 135 | POSTGRESQL_DB_PASSWORD=cromwell 136 | fi 137 | if [[ -z "$POSTGRESQL_DB_NAME" ]]; then 138 | POSTGRESQL_DB_NAME=cromwell 139 | fi 140 | 141 | # set defaults for opt args. 142 | if [[ -z "$INSTANCE_TYPE" ]]; then 143 | INSTANCE_TYPE=t2.xlarge 144 | fi 145 | if [[ -z "$BOOT_DISK_SIZE" ]]; then 146 | BOOT_DISK_SIZE=150 147 | fi 148 | if [[ -z "$BOOT_DISK_DEVICE_NAME" ]]; then 149 | BOOT_DISK_DEVICE_NAME="/dev/sda1" 150 | fi 151 | if [[ -z "$AMI_NAME_SEARCH_QUERY" ]]; then 152 | AMI_NAME_SEARCH_QUERY="ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64*" 153 | fi 154 | 155 | # validate all args. 156 | if [[ "$PUBLIC_SUBNET_ID" != subnet-* ]]; then 157 | echo "[PUBLIC_SUBNET_ID] should start with subnet-." 158 | exit 1 159 | fi 160 | if [[ "$AWS_BATCH_ARN" != arn* ]]; then 161 | echo "[AWS_BATCH_ARN] is not valid." 162 | exit 1 163 | fi 164 | if [[ "$AWS_OUT_DIR" != s3://* ]]; then 165 | echo "[AWS_OUT_DIR] should be a S3 bucket path starting with s3://." 166 | exit 1 167 | fi 168 | if [[ "$AWS_LOC_DIR" != s3://* ]]; then 169 | echo "-l, --aws-loc-dir should be a S3 bucket path starting with s3://." 170 | exit 1 171 | fi 172 | if [[ -z "$KEY_PAIR_NAME" ]]; then 173 | echo "[KEY_PAIR_NAME] is not valid." 174 | exit 1 175 | fi 176 | if [[ "$POSTGRESQL_DB_IP" == localhost && "$POSTGRESQL_DB_PORT" != 5432 ]]; then 177 | echo "--postgresql-db-port should be 5432 for locally installed PostgreSQL (--postgresql-db-ip localhost)." 178 | exit 1 179 | fi 180 | 181 | # constants for files/params on instance. 
182 | AWS_AUTH_SH="/etc/profile.d/aws-auth.sh" 183 | CAPER_CONF_DIR=/opt/caper 184 | ROOT_CAPER_CONF_DIR=/root/.caper 185 | GLOBAL_CAPER_CONF_FILE="$CAPER_CONF_DIR/default.conf" 186 | 187 | # prepend more init commands to the startup-script 188 | STARTUP_SCRIPT="""#!/bin/bash 189 | ### update apt and install and packages 190 | sudo apt-get update 191 | sudo apt-get install -y screen python3 python3-pip default-jre postgresql postgresql-contrib 192 | 193 | ### install gsutil 194 | echo \"deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main\" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list 195 | sudo apt-get install -y apt-transport-https ca-certificates gnupg 196 | 197 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - 198 | sudo apt-get update 199 | sudo apt-get install -y google-cloud-sdk 200 | 201 | ### make caper's work directory 202 | sudo mkdir -p $CAPER_CONF_DIR 203 | sudo chmod 777 -R $CAPER_CONF_DIR 204 | sudo setfacl -d -m u::rwX $CAPER_CONF_DIR 205 | sudo setfacl -d -m g::rwX $CAPER_CONF_DIR 206 | sudo setfacl -d -m o::rwX $CAPER_CONF_DIR 207 | 208 | ### make caper's out/localization directory 209 | sudo mkdir -p $CAPER_CONF_DIR/local_loc_dir $CAPER_CONF_DIR/local_out_dir 210 | 211 | ### make caper conf file 212 | cat < $GLOBAL_CAPER_CONF_FILE 213 | # caper 214 | backend=aws 215 | no-server-heartbeat=True 216 | # cromwell 217 | max-concurrent-workflows=300 218 | max-concurrent-tasks=1000 219 | # local backend 220 | local-out-dir=$CAPER_CONF_DIR/local_out_dir 221 | local-loc-dir=$CAPER_CONF_DIR/local_loc_dir 222 | # aws backend 223 | aws-batch-arn=$AWS_BATCH_ARN 224 | aws-region=$AWS_REGION 225 | aws-out-dir=$AWS_OUT_DIR 226 | aws-loc-dir=$AWS_LOC_DIR 227 | cromwell=https://storage.googleapis.com/caper-data/cromwell/cromwell-65-d16af26-SNAP.jar 228 | # metadata DB 229 | db=postgresql 230 | postgresql-db-ip=$POSTGRESQL_DB_IP 231 | postgresql-db-port=$POSTGRESQL_DB_PORT 232 | postgresql-db-user=$POSTGRESQL_DB_USER 233 | postgresql-db-password=$POSTGRESQL_DB_PASSWORD 234 | postgresql-db-name=$POSTGRESQL_DB_NAME 235 | EOF 236 | sudo chmod +r $GLOBAL_CAPER_CONF_FILE 237 | 238 | ### soft-link conf file for root 239 | sudo mkdir -p $ROOT_CAPER_CONF_DIR 240 | sudo ln -s $GLOBAL_CAPER_CONF_FILE $ROOT_CAPER_CONF_DIR 241 | 242 | ### caper conf shared with all users 243 | sudo touch $AWS_AUTH_SH 244 | echo \"mkdir -p ~/.caper\" >> $AWS_AUTH_SH 245 | echo \"ln -s /opt/caper/default.conf ~/.caper/ 2> /dev/null | true\" >> $AWS_AUTH_SH 246 | """ 247 | 248 | # append more init commands to the startup-script 249 | STARTUP_SCRIPT="""$STARTUP_SCRIPT 250 | ### init PostgreSQL for Cromwell 251 | sudo -u postgres createuser root -s 252 | sudo createdb $POSTGRESQL_DB_NAME 253 | sudo psql -d $POSTGRESQL_DB_NAME -c \"create extension lo;\" 254 | sudo psql -d $POSTGRESQL_DB_NAME -c \"create role $POSTGRESQL_DB_USER with superuser login password '$POSTGRESQL_DB_PASSWORD'\" 255 | 256 | ### upgrade pip and install caper croo 257 | sudo python3 -m pip install --upgrade pip 258 | sudo -H pip3 install --ignore-installed PyYAML 259 | sudo pip install caper croo 260 | """ 261 | 262 | echo "$(date): Making a temporary startup script..." 
263 | echo "$STARTUP_SCRIPT" > tmp_startup_script.sh 264 | 265 | # find the most recent AMI matching the name search query 266 | # https://gist.github.com/vancluever/7676b4dafa97826ef0e9 267 | echo "$(date): Searching for AMI with name matching \"${AMI_NAME_SEARCH_QUERY}\" in region ${AWS_REGION}..." 268 | AMI=$(aws --region "${AWS_REGION}" ec2 describe-images --filters "Name=name,Values=${AMI_NAME_SEARCH_QUERY}" --query 'sort_by(Images,&CreationDate)[-1].ImageId') 269 | AMI="${AMI%\"}" 270 | AMI="${AMI#\"}" 271 | echo "$(date): Found AMI: ${AMI}" 272 | 273 | echo "$(date): Creating an instance..." 274 | aws ec2 --region "${AWS_REGION}" run-instances \ 275 | --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=${INSTANCE_NAME}}]" \ 276 | --image-id "${AMI}" \ 277 | --subnet-id "${PUBLIC_SUBNET_ID}" \ 278 | --key-name "${KEY_PAIR_NAME}" \ 279 | --block-device-mappings "DeviceName=${BOOT_DISK_DEVICE_NAME},Ebs={VolumeSize=${BOOT_DISK_SIZE}}" \ 280 | --instance-type "${INSTANCE_TYPE}" \ 281 | --user-data "file://tmp_startup_script.sh" 282 | echo "$(date): Created an instance successfully." 283 | 284 | echo "$(date): Deleting the temporary startup script..." 285 | rm -f tmp_startup_script.sh 286 | 287 | echo "$(date): Please allow 20-30 minutes for the startup script installing/configuring Caper." 288 | echo "$(date): Run \"caper -v\" to check it's installed." 289 | echo "$(date): Run \"aws configure\" as root so that Cromwell can use your AWS credentials to create instances and write outputs on the bucket." 290 | -------------------------------------------------------------------------------- /scripts/gcp_caper_server/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | `create_instance.sh` will create an instance on Google Cloud Compute Engine in Google your project and configure the instance for Caper with PostgreSQL database and Google Cloud Life Sciences API (`v2beta`). 4 | 5 | > **NOTE**: Google Cloud Life Sciences API is a new API replacing the old deprecating Genomics API (`v2alpha1`). It requires `--gcp-region` to be defined correctly. Check [supported regions](https://cloud.google.com/life-sciences/docs/concepts/locations) for the new API. 6 | 7 | ## Install Google Cloud SDK SLI 8 | 9 | Make sure that `gcloud` (Google Cloud SDK CLI) is installed on your system. 10 | 11 | Go to [APIs & Services](https://console.cloud.google.com/apis/dashboard) on your project and enable the following APIs on your Google Cloud console. 12 | * Compute Engine API 13 | * Cloud Storage: DO NOT click on `Create credentials`. 14 | * Cloud Storage JSON API 15 | * Google Cloud Life Sciences API 16 | 17 | Go to [Service accounts](https://console.cloud.google.com/iam-admin/serviceaccounts) on your project and create a new service account with the following roles: 18 | * Compute Admin 19 | * Storage Admin: You can skip this and individually configure permission on each bucket on the project. 20 | * Cloud Life Sciences Admin (Cromwell's PAPI v2beta) 21 | * **Service Account User** (VERY IMPORTANT). 22 | 23 | Generate a secret key JSON from the service account and keep it locally on your computer. 24 | 25 | > **WARNING**: Such secret JSON file is a master key for important resources on your project. Keep it secure at your own risk. This file will be used for Caper so that it will be trasnferred to the created instance at `/opt/caper/service_account_key.json` visible to all users on the instance. 
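If you prefer the command line to the Cloud Console, the same service-account setup can be sketched with `gcloud` roughly as below. This is only a sketch: `my-project` and `caper-server` are placeholder names, and the role IDs are the standard identifiers for the roles listed above.
```bash
# Placeholders: my-project (project ID), caper-server (service account name).
$ gcloud iam service-accounts create caper-server --project my-project

# Repeat for each role listed above: roles/compute.admin, roles/storage.admin,
# roles/lifesciences.admin, roles/iam.serviceAccountUser.
$ gcloud projects add-iam-policy-binding my-project \
    --member="serviceAccount:caper-server@my-project.iam.gserviceaccount.com" \
    --role="roles/compute.admin"

# Generate the secret key JSON and keep it locally.
$ gcloud iam service-accounts keys create service_account_key.json \
    --iam-account="caper-server@my-project.iam.gserviceaccount.com"
```
Pass the generated key JSON file to `create_instance.sh` as `[GCP_SERVICE_ACCOUNT_KEY_JSON_FILE]` below.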
26 | 27 | ## How to create an instance 28 | 29 | Run it without arguments to see detailed help. Some optional arguments are very important depending on your region/zone, e.g. `--gcp-region` (for provisioning worker instances of the Life Sciences API) and `--zone` (for server instance creation only). These regional parameters default to US central region/zones. 30 | ```bash 31 | $ bash create_instance.sh 32 | ``` 33 | 34 | However, this script is designed to work well with default arguments. Try with the positional arguments only first and see if it works. 35 | ```bash 36 | $ bash create_instance.sh [INSTANCE_NAME] [PROJECT_ID] [GCP_SERVICE_ACCOUNT_KEY_JSON_FILE] [GCP_OUT_DIR] 37 | ``` 38 | 39 | This script will run Caper server as user `root` in a `screen` session named `caper_server` at the end of the installation. 40 | 41 | 42 | ## How to stop Caper server 43 | 44 | On the instance, attach to the existing screen `caper_server` and stop it with Ctrl + C. 45 | ```bash 46 | $ sudo su # log-in as root 47 | $ screen -r caper_server # attach to the screen 48 | # in the screen, press Ctrl + C to send SIGINT to Caper 49 | ``` 50 | 51 | ## How to start Caper server 52 | 53 | On the instance, make a new screen `caper_server`. 54 | ```bash 55 | $ cd /opt/caper 56 | $ screen -dmS caper_server bash -c "caper server > caper_server.log 2>&1" 57 | ``` 58 | 59 | ## How to submit a workflow 60 | 61 | Check if `caper list` works without any network errors. 62 | ```bash 63 | $ caper list 64 | ``` 65 | 66 | Submit a workflow. 67 | ```bash 68 | $ caper submit [WDL] -i input.json ... 69 | ``` 70 | 71 | Caper will localize big data files (e.g. your FASTQs and reference genome data defined in an input JSON) on a GCS bucket directory `--gcp-loc-dir`, which defaults to `[GCP_OUT_DIR]/.caper_tmp/` if not defined. 72 | 73 | 74 | ## How to configure Caper 75 | 76 | **This section is for advanced users only**. Caper tries to find a default configuration file at `~/.caper/default.conf`, which is symlinked from `/opt/caper/default.conf`. `/opt/caper/default.conf` is a globally shared configuration file. Edit this file for both server/client. 77 | 78 | Every time a user logs in, the symlink is reset. This is controlled by `/etc/profile.d/gcp-auth.sh`. 79 | ```bash 80 | gcloud auth activate-service-account --key-file=/opt/caper/service_account_key.json 81 | mkdir -p ~/.caper 82 | ln -s /opt/caper/default.conf ~/.caper/ 2> /dev/null | true 83 | ``` 84 | 85 | If users want to have their own configuration at `~/.caper/default.conf`, simply delete this symlink and make a copy of the globally shared one. 86 | ```bash 87 | $ rm ~/.caper/default.conf 88 | $ cp /opt/caper/default.conf ~/.caper/default.conf 89 | ``` 90 | 91 | 92 | ## Troubleshooting 93 | 94 | See [this](TROUBLESHOOTING.md) for troubleshooting. 95 | -------------------------------------------------------------------------------- /scripts/gcp_caper_server/TROUBLESHOOTING.md: -------------------------------------------------------------------------------- 1 | ## Troubleshooting errors 2 | 3 | If you see permission errors, check if the above roles are correctly configured for your service account. 4 | 5 | If you see PAPI errors and Google's HTTP endpoint deprecation warning, remove the Life Sciences API role from your service account and add it back. 6 | 7 | If you see the following error then click on your service account on `Service Account` in `IAM` of your Google project and make sure that `Enable G Suite Domain-wide Delegation` is checked.
8 | ``` 9 | 400 Bad Request 10 | POST https://lifesciences.googleapis.com/v2beta/projects/99884963860/locations/us-central1/operations/XXXXXXXXXXXXXXXXXXXX:cancel 11 | { 12 | "code" : 400, 13 | "errors" : [ { 14 | "domain" : "global", 15 | "message" : "Precondition check failed.", 16 | "reason" : "failedPrecondition" 17 | } ], 18 | "message" : "Precondition check failed.", 19 | "status" : "FAILED_PRECONDITION" 20 | } 21 | ``` 22 | -------------------------------------------------------------------------------- /scripts/resource_monitor/resource_monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Resource monitoring tool for Cromwell. 3 | # This script is for GCP backend only. 4 | 5 | INTERVAL=$1 6 | if [[ -z "$INTERVAL" ]]; then 7 | INTERVAL=20 8 | fi 9 | 10 | printf 'time\tmem\tdisk\tcpu_pct\n' 11 | 12 | while true; do 13 | # Seconds since epoch. 14 | TIME=$(date +%s) 15 | # -b for size in bytes. 16 | MEM=$(free -b | awk 'NR==2{print $3}') 17 | # -b for size in bytes. 18 | DISK=$(du -s -b /cromwell_root | awk '{print $1}') 19 | # Use top to get total cpu usage: usage = 100 - idle. 20 | # Use data from the 2nd iteration (top -n2 and tail -1) for better accuracy. 21 | # https://stackoverflow.com/questions/9229333/how-to-get-overall-cpu-usage-e-g-57-on-linux#comment33209786_9229692 22 | CPU_PCT=$(top -b -n2 -p1 | grep -F '%Cpu' | tail -1 | awk -F 'id,' '{n=split($1,vs,","); v=vs[n]; sub(" ","",v); print 100.0-v}') 23 | 24 | printf '%d\t%d\t%d\t%.2f\n' "$TIME" "$MEM" "$DISK" "$CPU_PCT" 25 | sleep "$INTERVAL" 26 | done 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from pathlib import Path 4 | 5 | import setuptools 6 | 7 | META_PATH = Path('caper', '__init__.py') 8 | HERE = os.path.abspath(os.path.dirname(__file__)) 9 | 10 | 11 | def read(*parts): 12 | """ 13 | Build an absolute path from *parts* and and return the contents of the 14 | resulting file. Assume UTF-8 encoding. 15 | """ 16 | with Path(HERE, *parts).open(encoding='utf-8') as f: 17 | return f.read() 18 | 19 | 20 | META_FILE = read(META_PATH) 21 | 22 | 23 | def find_meta(meta): 24 | """ 25 | Extract __*meta*__ from META_FILE. 
26 | """ 27 | meta_match = re.search( 28 | r"^__{meta}__ = ['\"]([^'\"]*)['\"]".format(meta=meta), META_FILE, re.M 29 | ) 30 | if meta_match: 31 | return meta_match.group(1) 32 | raise 33 | 34 | 35 | with open('README.md', 'r') as fh: 36 | long_description = fh.read() 37 | 38 | setuptools.setup( 39 | name='caper', 40 | version=find_meta('version'), 41 | python_requires='>=3.6', 42 | scripts=[ 43 | 'bin/caper', 44 | 'bin/run_mysql_server_docker.sh', 45 | 'bin/run_mysql_server_singularity.sh', 46 | 'scripts/gcp_caper_server/create_instance.sh', 47 | ], 48 | author='Jin Lee', 49 | author_email='leepc12@gmail.com', 50 | description='Cromwell Assisted Pipeline ExecutoR', 51 | long_description='https://github.com/ENCODE-DCC/caper', 52 | long_description_content_type='text/markdown', 53 | url='https://github.com/ENCODE-DCC/caper', 54 | packages=setuptools.find_packages(exclude=['mysql*', 'docs']), 55 | classifiers=[ 56 | 'Programming Language :: Python :: 3', 57 | 'License :: OSI Approved :: MIT License', 58 | 'Operating System :: POSIX :: Linux', 59 | ], 60 | install_requires=[ 61 | 'pyhocon>=0.3.53', 62 | 'requests>=2.20', 63 | 'pyopenssl', 64 | 'autouri>=0.4.4', 65 | 'miniwdl>=0.7.0', 66 | 'humanfriendly', 67 | 'numpy>=1.8.2', 68 | 'pandas>=1.0', 69 | 'scikit-learn>=0.19.2', 70 | 'matplotlib>=1.5', 71 | 'six>=1.13.0', 72 | ], 73 | ) 74 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ENCODE-DCC/caper/6759671f9f18dc1220fb4b9c26f99c0216e4e368/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | """ 4 | import pytest 5 | 6 | from caper.cromwell import Cromwell 7 | 8 | 9 | def pytest_addoption(parser): 10 | parser.addoption( 11 | '--ci-prefix', default='default_ci_prefix', help='Prefix for CI test.' 12 | ) 13 | parser.addoption( 14 | '--gcs-root', 15 | default='gs://encode-test-caper', 16 | help='GCS root path for CI test. ' 17 | 'This GCS bucket must be publicly accessible ' 18 | '(read access for everyone is enough for testing).', 19 | ) 20 | parser.addoption( 21 | '--cromwell', 22 | default=Cromwell.DEFAULT_CROMWELL, 23 | help='URI for Cromwell JAR. Local path is recommended.', 24 | ) 25 | parser.addoption( 26 | '--womtool', 27 | default=Cromwell.DEFAULT_WOMTOOL, 28 | help='URI for Womtool JAR. Local path is recommended.', 29 | ) 30 | parser.addoption( 31 | '--gcp-prj', default='encode-dcc-1016', help='Project on Google Cloud Platform.' 32 | ) 33 | parser.addoption( 34 | '--gcp-service-account-key-json', help='JSON key file for GCP service account.' 35 | ) 36 | parser.addoption( 37 | '--debug-caper', action='store_true', help='Debug-level logging for CLI tests.' 
38 | ) 39 | 40 | 41 | @pytest.fixture(scope='session') 42 | def ci_prefix(request): 43 | return request.config.getoption('--ci-prefix').rstrip('/') 44 | 45 | 46 | @pytest.fixture(scope='session') 47 | def gcs_root(request): 48 | """GCS root to generate test GCS URIs on.""" 49 | return request.config.getoption('--gcs-root').rstrip('/') 50 | 51 | 52 | @pytest.fixture(scope='session') 53 | def cromwell(request): 54 | return request.config.getoption('--cromwell') 55 | 56 | 57 | @pytest.fixture(scope='session') 58 | def womtool(request): 59 | return request.config.getoption('--womtool') 60 | 61 | 62 | @pytest.fixture(scope='session') 63 | def gcp_prj(request): 64 | return request.config.getoption('--gcp-prj') 65 | 66 | 67 | @pytest.fixture(scope='session') 68 | def gcp_service_account_key_json(request): 69 | return request.config.getoption('--gcp-service-account-key-json') 70 | 71 | 72 | @pytest.fixture(scope='session') 73 | def debug_caper(request): 74 | return request.config.getoption('--debug-caper') 75 | 76 | 77 | @pytest.fixture(scope='session') 78 | def gcp_res_analysis_metadata(): 79 | return 'gs://caper-data/gcp_resource_analysis/out/atac/e5eab444-cb6c-414a-a090-2c12417be542/metadata.json' 80 | -------------------------------------------------------------------------------- /tests/example_wdl.py: -------------------------------------------------------------------------------- 1 | """WDLs used for testing. 2 | 3 | To test many functions recursively working with subworkflows 4 | these WDLs have the following structure: 5 | 6 | main.wdl (imports sub.wdl) 7 | sub/ 8 | sub.wdl (imports sub_sub.wdl) 9 | sub/ 10 | sub_sub.wdl (imports nothing) 11 | inputs.json (inputs JSON file) 12 | """ 13 | import json 14 | import os 15 | from textwrap import dedent 16 | 17 | from autouri import AutoURI 18 | 19 | WRONG_WDL = dedent( 20 | """\ 21 | version 1.0 22 | 23 | workflwwwwwwwwwowwwww main { 24 | } 25 | """ 26 | ) 27 | 28 | MAIN_WDL = dedent( 29 | """\ 30 | version 1.0 31 | import "sub/sub.wdl" as sub 32 | 33 | workflow main { 34 | meta { 35 | key1: "val1" 36 | key2: "val2" 37 | } 38 | parameter_meta { 39 | input_s: { 40 | key1: "val1" 41 | } 42 | input_i: { 43 | key1: "val1" 44 | } 45 | } 46 | input { 47 | String input_s 48 | Int input_i = 1 49 | } 50 | 51 | call t1 52 | call sub.sub 53 | 54 | output { 55 | File out1 = t1.out 56 | File out_sub = sub.out 57 | } 58 | } 59 | 60 | task t1 { 61 | command { 62 | echo 1 > out.txt 63 | sleep 10 64 | } 65 | output { 66 | File out = 'out.txt' 67 | } 68 | } 69 | """ 70 | ) 71 | 72 | MAIN_INPUTS = {'main.input_s': 'a'} 73 | 74 | MAIN_WDL_META_DICT = {'key1': 'val1', 'key2': 'val2'} 75 | 76 | 77 | MAIN_WDL_PARAMETER_META_DICT = { 78 | 'input_s': {'key1': 'val1'}, 79 | 'input_i': {'key1': 'val1'}, 80 | } 81 | 82 | SUB_WDL = dedent( 83 | """\ 84 | version 1.0 85 | import "sub/sub_sub.wdl" as sub_sub 86 | 87 | workflow sub { 88 | call t2 89 | call sub_sub.sub_sub 90 | 91 | output { 92 | File out = t2.out 93 | File out_sub = sub_sub.out 94 | } 95 | } 96 | 97 | task t2 { 98 | command { 99 | echo 2 > out2.txt 100 | } 101 | output { 102 | File out = 'out2.txt' 103 | } 104 | } 105 | """ 106 | ) 107 | 108 | SUB_WDL_TO_FAIL = dedent( 109 | """\ 110 | version 1.0 111 | import "sub/sub_sub.wdl" as sub_sub 112 | 113 | workflow sub { 114 | call t2_failing 115 | call sub_sub.sub_sub 116 | 117 | output { 118 | File out = t2_failing.out 119 | File out_sub = sub_sub.out 120 | } 121 | } 122 | 123 | task t2_failing { 124 | command { 125 | echo 2 > out2.txt 126 | INTENTED_ERROR 127 | 
} 128 | output { 129 | File out = 'out2.txt' 130 | } 131 | } 132 | """ 133 | ) 134 | 135 | SUB_SUB_WDL = dedent( 136 | """\ 137 | version 1.0 138 | 139 | workflow sub_sub { 140 | call t3 141 | output { 142 | File out = t3.out 143 | } 144 | } 145 | 146 | task t3 { 147 | command { 148 | echo 3 > out3.txt 149 | } 150 | output { 151 | File out = 'out3.txt' 152 | } 153 | } 154 | """ 155 | ) 156 | 157 | 158 | def make_directory_with_wdls(directory, no_sub_wdl=False): 159 | """ 160 | Run Cromwell with WDLs: 161 | main + 1 sub + 1 sub's sub. 162 | 163 | Returns: 164 | Created root directory 165 | """ 166 | main_inputs = os.path.join(directory, 'inputs.json') 167 | AutoURI(main_inputs).write(json.dumps(MAIN_INPUTS, indent=4)) 168 | 169 | main_wdl = os.path.join(directory, 'main.wdl') 170 | AutoURI(main_wdl).write(MAIN_WDL) 171 | 172 | if not no_sub_wdl: 173 | sub_wdl = os.path.join(directory, 'sub', 'sub.wdl') 174 | AutoURI(sub_wdl).write(SUB_WDL) 175 | 176 | sub_sub_wdl = os.path.join(directory, 'sub', 'sub', 'sub_sub.wdl') 177 | AutoURI(sub_sub_wdl).write(SUB_SUB_WDL) 178 | 179 | 180 | def make_directory_with_failing_wdls(directory, no_sub_wdl=False): 181 | """ 182 | Run Cromwell with WDLs: 183 | main + 1 sub (supposed to fail) + 1 sub's sub. 184 | 185 | Returns: 186 | Created root directory 187 | """ 188 | main_inputs = os.path.join(directory, 'inputs.json') 189 | AutoURI(main_inputs).write(json.dumps(MAIN_INPUTS, indent=4)) 190 | 191 | main_wdl = os.path.join(directory, 'main.wdl') 192 | AutoURI(main_wdl).write(MAIN_WDL) 193 | 194 | if not no_sub_wdl: 195 | sub_wdl = os.path.join(directory, 'sub', 'sub.wdl') 196 | AutoURI(sub_wdl).write(SUB_WDL_TO_FAIL) 197 | 198 | sub_sub_wdl = os.path.join(directory, 'sub', 'sub', 'sub_sub.wdl') 199 | AutoURI(sub_sub_wdl).write(SUB_SUB_WDL) 200 | -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers= 3 | integration 4 | google_cloud 5 | -------------------------------------------------------------------------------- /tests/test_arg_tool.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from configparser import DuplicateOptionError 3 | from textwrap import dedent 4 | 5 | import pytest 6 | 7 | from caper.arg_tool import read_from_conf, update_parsers_defaults_with_conf 8 | 9 | CONF_CONTENTS = dedent( 10 | """\ 11 | param-wo-default="please_remove_double_quote" 12 | param-w-type-wo-default='4.0' 13 | param-w-type-wo-default2="5.0" 14 | param_w_type_wo_default3= 15 | param-w-int-default=10 16 | param-w-int-default3= 17 | flag-w-default=True 18 | flag-w-default2='False' 19 | flag-wo-default='FALSE' 20 | flag-wo-default2="True" 21 | to_be-replaced=1 22 | """ 23 | ) 24 | 25 | 26 | CONF_CONTENTS_DUPLICATE_ENTRY = dedent( 27 | """\ 28 | shared-param=200 29 | shared-param=400 30 | uniq-param-a=2000 31 | uniq-param-c=4 32 | """ 33 | ) 34 | 35 | 36 | CONF_CONTENTS_FOR_SUBPARSER = dedent( 37 | """\ 38 | shared-param=600 39 | uniq-param-a=2000 40 | uniq-param-b=4000 41 | uniq-param-c=4 42 | """ 43 | ) 44 | 45 | 46 | @pytest.fixture 47 | def parser_wo_subparsers(): 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('--param-wo-default') 50 | parser.add_argument('--param-w-type-wo-default', type=float) 51 | parser.add_argument('--param-w-type-wo-default2', type=float) 52 | parser.add_argument('--param-w-type-wo-default3', type=float) 53 | 
parser.add_argument('--param-w-int-default', default=2) 54 | parser.add_argument('--param-w-int-default2', default=4) 55 | parser.add_argument('--param-w-int-default3', default=6) 56 | parser.add_argument('--flag-w-default', action='store_true', default=False) 57 | parser.add_argument('--flag-w-default2', action='store_true', default=False) 58 | parser.add_argument('--flag-wo-default', action='store_true') 59 | parser.add_argument('--flag-wo-default2', action='store_true') 60 | return parser 61 | 62 | 63 | @pytest.fixture 64 | def parser_with_subparsers(): 65 | parser = argparse.ArgumentParser() 66 | subparser = parser.add_subparsers(dest='action') 67 | 68 | p_sub_a = subparser.add_parser('a') 69 | p_sub_b = subparser.add_parser('b') 70 | 71 | # two subparsers will have shared and original parameters 72 | p_sub_a.add_argument('--shared-param', default=2) 73 | p_sub_a.add_argument('--uniq-param-a', default=20) 74 | 75 | p_sub_b.add_argument('--shared-param', default=4.0) 76 | p_sub_b.add_argument('--uniq-param-b', default=40.0) 77 | p_sub_b.add_argument('--uniq-param-c', type=int) 78 | 79 | return parser, [p_sub_a, p_sub_b] 80 | 81 | 82 | def test_read_from_conf(tmp_path): 83 | c = tmp_path / 'c1.conf' 84 | c.write_text(CONF_CONTENTS) 85 | 86 | d1 = read_from_conf( 87 | c, no_strip_quote=False, conf_key_map={'to_be_replaced': 'replaced_key'} 88 | ) 89 | assert d1['param_wo_default'] == 'please_remove_double_quote' 90 | assert d1['param_w_type_wo_default'] == '4.0' 91 | assert d1['param_w_type_wo_default2'] == '5.0' 92 | assert 'param_w_type_wo_default3' not in d1 93 | assert d1['param_w_int_default'] == '10' 94 | assert 'param_w_int_default3' not in d1 95 | assert d1['flag_w_default'] == 'True' 96 | assert d1['flag_w_default2'] == 'False' 97 | assert d1['flag_wo_default'] == 'FALSE' 98 | assert d1['flag_wo_default2'] == 'True' 99 | assert d1['replaced_key'] == '1' 100 | assert 'to_be-replaced' not in d1 101 | 102 | d2 = read_from_conf(c, no_strip_quote=True) 103 | assert d2['param_wo_default'] == '"please_remove_double_quote"' 104 | assert d2['param_w_type_wo_default2'] == '"5.0"' 105 | assert d2['flag_w_default2'] == '\'False\'' 106 | assert d2['flag_wo_default'] == '\'FALSE\'' 107 | assert d2['flag_wo_default2'] == '"True"' 108 | 109 | c2 = tmp_path / 'c2.conf' 110 | c2.write_text(CONF_CONTENTS_DUPLICATE_ENTRY) 111 | 112 | with pytest.raises(DuplicateOptionError): 113 | d2 = read_from_conf(c2) 114 | 115 | 116 | def test_update_parsers_defaults_with_conf(tmp_path, parser_wo_subparsers): 117 | """Check if this function correctly updates argparse parser's 118 | default values. 
119 | """ 120 | val_type = {'param_w_type_wo_default2': float} 121 | val_default = {'param_w_type_wo_default3': 'hello', 'param_w_int_default3': 50} 122 | 123 | p1 = parser_wo_subparsers 124 | c1 = tmp_path / 'c1.conf' 125 | 126 | # can mix up _ and - 127 | c1.write_text(CONF_CONTENTS) 128 | d1 = update_parsers_defaults_with_conf( 129 | parsers=[p1], conf_file=str(c1), val_type=val_type, val_default=val_default 130 | ) 131 | 132 | assert p1.get_default('param_wo_default') == 'please_remove_double_quote' 133 | assert p1.get_default('param_w_type_wo_default') == '4.0' 134 | assert p1.get_default('param_w_type_wo_default2') == 5.0 135 | assert p1.get_default('param_w_type_wo_default3') == 'hello' 136 | assert p1.get_default('param_w_int_default') == 10 137 | assert p1.get_default('param_w_int_default2') == 4 138 | assert p1.get_default('param_w_int_default3') == 50 139 | assert p1.get_default('flag_w_default') 140 | assert not p1.get_default('flag_w_default2') 141 | assert not p1.get_default('flag_wo_default') 142 | assert p1.get_default('flag_wo_default2') 143 | 144 | assert d1['param_wo_default'] == 'please_remove_double_quote' 145 | assert d1['param_w_type_wo_default'] == '4.0' 146 | assert d1['param_w_type_wo_default2'] == 5.0 147 | assert d1['param_w_type_wo_default3'] == 'hello' 148 | assert d1['param_w_int_default'] == 10 149 | assert 'param_w_int_default2' not in d1 150 | assert d1['param_w_int_default3'] == 50 151 | assert d1['flag_w_default'] 152 | assert not d1['flag_w_default2'] 153 | assert not d1['flag_wo_default'] 154 | assert d1['flag_wo_default2'] 155 | 156 | 157 | def test_update_parsers_defaults_with_conf_with_subparsers( 158 | tmp_path, parser_with_subparsers 159 | ): 160 | """Check if this function correctly updates argparse parser's 161 | default values. 
162 | """ 163 | p, subparsers = parser_with_subparsers 164 | c1 = tmp_path / 'c1.conf' 165 | 166 | # can mix up _ and - 167 | c1.write_text(CONF_CONTENTS_FOR_SUBPARSER) 168 | d = update_parsers_defaults_with_conf(parsers=subparsers, conf_file=str(c1)) 169 | args_a, _ = p.parse_known_args(['a']) 170 | args_b, _ = p.parse_known_args(['b']) 171 | 172 | assert args_a.shared_param == 600.0 173 | assert args_a.uniq_param_a == 2000 174 | 175 | assert args_b.shared_param == 600.0 176 | assert args_b.uniq_param_b == 4000.0 177 | # cannot parse "type" from argparse 178 | assert args_b.uniq_param_c == 4 179 | 180 | assert d['shared_param'] == 600.0 181 | assert d['uniq_param_a'] == 2000 182 | assert d['uniq_param_b'] == 4000.0 183 | assert d['uniq_param_c'] == '4' 184 | -------------------------------------------------------------------------------- /tests/test_caper_labels.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from caper.caper_labels import CaperLabels 5 | 6 | 7 | def test_create_file(tmp_path): 8 | cl = CaperLabels() 9 | 10 | backend = 'my_backend' 11 | 12 | custom_labels = tmp_path / 'my_custom_labels.json' 13 | custom_labels_dict = {'hello': 'world', 'good': {'bye': 'bro'}} 14 | custom_labels.write_text(json.dumps(custom_labels_dict, indent=4)) 15 | 16 | str_label = 'my:str?label*' 17 | user = 'my_user' 18 | basename = 'my_basename.json' 19 | 20 | f = cl.create_file( 21 | directory=str(tmp_path), 22 | backend=backend, 23 | custom_labels=str(custom_labels), 24 | str_label=str_label, 25 | user=user, 26 | basename=basename, 27 | ) 28 | 29 | with open(f) as fp: 30 | d = json.loads(fp.read()) 31 | 32 | assert d[CaperLabels.KEY_CAPER_BACKEND] == backend 33 | assert d['hello'] == 'world' 34 | assert d['good']['bye'] == 'bro' 35 | assert d[CaperLabels.KEY_CAPER_STR_LABEL] == 'my_str_label_' 36 | assert d[CaperLabels.KEY_CAPER_USER] == user 37 | assert os.path.basename(f) == basename 38 | assert os.path.dirname(f) == str(tmp_path) 39 | -------------------------------------------------------------------------------- /tests/test_caper_wdl_parser.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from caper.caper_wdl_parser import CaperWDLParser 4 | 5 | WDL_CONTENTS = dedent( 6 | """\ 7 | version 1.0 8 | 9 | workflow test_wdl { 10 | meta { 11 | caper_docker: "ubuntu:latest" 12 | caper_singularity: "docker://ubuntu:latest" 13 | } 14 | } 15 | """ 16 | ) 17 | 18 | 19 | OLD_WDL_CONTENTS = dedent( 20 | """\ 21 | #CAPER docker "ubuntu:latest" 22 | #CAPER singularity "docker://ubuntu:latest" 23 | 24 | workflow test_wdl { 25 | } 26 | """ 27 | ) 28 | 29 | 30 | def test_properties(tmp_path): 31 | """Test the following properties. 
32 | - caper_docker 33 | - caper_singularity 34 | """ 35 | main_wdl = tmp_path / 'main.wdl' 36 | main_wdl.write_text(WDL_CONTENTS) 37 | 38 | old_wdl = tmp_path / 'old_main.wdl' 39 | old_wdl.write_text(OLD_WDL_CONTENTS) 40 | 41 | # test reading from workflow.meta 42 | main = CaperWDLParser(str(main_wdl)) 43 | assert main.caper_docker == 'ubuntu:latest' 44 | assert main.caper_singularity == 'docker://ubuntu:latest' 45 | 46 | # test reading from comments (old-style) 47 | old = CaperWDLParser(str(old_wdl)) 48 | assert old.caper_docker == 'ubuntu:latest' 49 | assert old.caper_singularity == 'docker://ubuntu:latest' 50 | -------------------------------------------------------------------------------- /tests/test_cli_run.py: -------------------------------------------------------------------------------- 1 | """This does not cover all CLI parameters defined in caper/caper_args.py. 2 | Google Cloud Platform is tested in test_cli_server_client.py. 3 | However, other cloud (aws) and HPCs (slurm/sge/pbs) are not tested. 4 | 5 | In this testing module, 'caper run' is tested with a local backend. 6 | 7 | See test_cli_server_client.py for 'caper server/submit/...'. 8 | We will use gcp (Google Cloud Platform) backend to test server-client 9 | functions. 10 | """ 11 | import json 12 | import os 13 | 14 | import pytest 15 | from autouri import GCSURI 16 | 17 | from caper.cli import main as cli_main 18 | from caper.cromwell_metadata import CromwellMetadata 19 | from caper.wdl_parser import WDLParser 20 | 21 | from .example_wdl import make_directory_with_wdls 22 | 23 | 24 | def test_wrong_subcmd(): 25 | cmd = ['wrong_subcmd'] 26 | with pytest.raises(SystemExit): 27 | cli_main(cmd) 28 | 29 | 30 | @pytest.mark.parametrize( 31 | 'cmd', 32 | [ 33 | ['--docker', '--singularity'], 34 | ['--docker', 'ubuntu:latest', '--singularity'], 35 | ['--docker', '--singularity', 'docker://ubuntu:latest'], 36 | ['--docker', 'ubuntu:latest', '--singularity', 'docker://ubuntu:latest'], 37 | ['--docker', '--soft-glob-output'], 38 | ['--docker', 'ubuntu:latest', '--soft-glob-output'], 39 | ], 40 | ) 41 | def test_mutually_exclusive_params(tmp_path, cmd): 42 | make_directory_with_wdls(str(tmp_path)) 43 | 44 | cmd = ['run', str(tmp_path / 'main.wdl')] + cmd 45 | with pytest.raises(ValueError): 46 | cli_main(cmd) 47 | 48 | 49 | @pytest.mark.integration 50 | def test_run(tmp_path, cromwell, womtool, debug_caper): 51 | """Will test most local parameters (run only) here.""" 52 | make_directory_with_wdls(str(tmp_path)) 53 | wdl = tmp_path / 'main.wdl' 54 | inputs = tmp_path / 'inputs.json' 55 | p = WDLParser(str(wdl)) 56 | imports = p.zip_subworkflows(str(tmp_path / 'imports.zip')) 57 | 58 | cmd = ['run'] 59 | cmd += [str(wdl)] 60 | cmd += ['--tmp-dir', str(tmp_path / 'tmp_dir')] 61 | # local (instead of correct Local with capital L) should work. 
62 | cmd += ['--backend', 'local'] 63 | cmd += ['--cromwell-stdout', str(tmp_path / 'cromwell_stdout.o')] 64 | cmd += ['--db', 'file'] 65 | cmd += ['--db-timeout', '500000'] 66 | cmd += ['--file-db', str(tmp_path / 'file_db_prefix')] 67 | cmd += ['--max-concurrent-tasks', '2'] 68 | cmd += ['--max-concurrent-workflows', '2'] 69 | cmd += ['--disable-call-caching'] 70 | cmd += ['--soft-glob-output'] 71 | cmd += ['--local-hash-strat', 'path'] 72 | cmd += ['--local-out-dir', str(tmp_path / 'out_dir')] 73 | cmd += ['--inputs', str(inputs)] 74 | cmd += ['--imports', str(imports)] 75 | cmd += ['--ignore-womtool'] 76 | cmd += ['--cromwell', cromwell] 77 | cmd += ['--womtool', womtool] 78 | cmd += ['--java-heap-womtool', '2G'] 79 | cmd += ['--java-heap-run', '2G'] 80 | cmd += ['--max-retries', '1'] 81 | cmd += ['--metadata-output', str(tmp_path / 'metadata.json')] 82 | if debug_caper: 83 | cmd += ['--debug'] 84 | 85 | cli_main(cmd) 86 | 87 | assert (tmp_path / 'tmp_dir').exists() 88 | assert (tmp_path / 'file_db_prefix.lobs').exists() 89 | assert (tmp_path / 'metadata.json').exists() 90 | assert (tmp_path / 'cromwell_stdout.o').exists() 91 | 92 | # test cleanup() on local storage 93 | cm = CromwellMetadata(str(tmp_path / 'metadata.json')) 94 | # check if metadata JSON and workflowRoot dir exists 95 | root_out_dir = cm.data['workflowRoot'] 96 | assert os.path.exists(root_out_dir) and os.path.isdir(root_out_dir) 97 | 98 | # dry-run should not delete anything 99 | cm.cleanup(dry_run=True) 100 | assert os.path.exists(root_out_dir) 101 | 102 | cm.cleanup(dry_run=False) 103 | assert not os.path.exists(root_out_dir) 104 | 105 | 106 | @pytest.mark.google_cloud 107 | @pytest.mark.integration 108 | def test_run_gcp_with_life_sciences_api( 109 | tmp_path, 110 | gcs_root, 111 | ci_prefix, 112 | cromwell, 113 | womtool, 114 | gcp_prj, 115 | gcp_service_account_key_json, 116 | debug_caper, 117 | ): 118 | """Test run with Google Cloud Life Sciences API""" 119 | out_gcs_bucket = os.path.join(gcs_root, 'caper_out', ci_prefix) 120 | tmp_gcs_bucket = os.path.join(gcs_root, 'caper_tmp') 121 | 122 | # prepare WDLs and input JSON, imports to be submitted 123 | make_directory_with_wdls(str(tmp_path)) 124 | wdl = tmp_path / 'main.wdl' 125 | inputs = tmp_path / 'inputs.json' 126 | metadata = tmp_path / 'metadata.json' 127 | 128 | cmd = ['run', str(wdl)] 129 | cmd += ['--inputs', str(inputs)] 130 | cmd += ['-m', str(metadata)] 131 | if gcp_service_account_key_json: 132 | cmd += ['--gcp-service-account-key-json', gcp_service_account_key_json] 133 | cmd += ['--use-google-cloud-life-sciences'] 134 | cmd += ['--gcp-region', 'us-central1'] 135 | # --gcp-zones should be ignored 136 | cmd += ['--gcp-zones', 'us-west1-a,us-west1-b'] 137 | cmd += ['--gcp-prj', gcp_prj] 138 | cmd += ['--memory-retry-error-keys', 'Killed'] 139 | cmd += ['--memory-retry-multiplier', '1.5'] 140 | cmd += ['--tmp-dir', str(tmp_path / 'tmp_dir')] 141 | cmd += ['--backend', 'gcp'] 142 | cmd += ['--gcp-out-dir', out_gcs_bucket] 143 | cmd += ['--gcp-loc-dir', tmp_gcs_bucket] 144 | cmd += ['--cromwell-stdout', str(tmp_path / 'cromwell_stdout.o')] 145 | # test with file type DB 146 | cmd += ['--db', 'file'] 147 | cmd += ['--db-timeout', '500000'] 148 | cmd += ['--file-db', str(tmp_path / 'file_db_prefix')] 149 | cmd += ['--max-concurrent-tasks', '2'] 150 | cmd += ['--max-concurrent-workflows', '2'] 151 | cmd += ['--disable-call-caching'] 152 | cmd += ['--cromwell', cromwell] 153 | cmd += ['--womtool', womtool] 154 | cmd += ['--java-heap-run', '4G'] 155 | cmd += 
['--docker', 'ubuntu:latest'] 156 | if debug_caper: 157 | cmd += ['--debug'] 158 | print(' '.join(cmd)) 159 | 160 | cli_main(cmd) 161 | m_dict = json.loads(metadata.read_text()) 162 | 163 | assert m_dict['status'] == 'Succeeded' 164 | 165 | # test CromwellMetadata.gcp_monitor() here 166 | # since it's for gcp only and this function is one of the two 167 | # test functions ran on a gcp backend. 168 | # task main.t1 has sleep 10 so that monitoring_script has time to 169 | # write monitoring data to `monitoringLog` file 170 | cm = CromwellMetadata(m_dict) 171 | monitor_data = cm.gcp_monitor() 172 | for data in monitor_data: 173 | instance_cpu = data['instance']['cpu'] 174 | instance_mem = data['instance']['mem'] 175 | instance_disk = data['instance']['disk'] 176 | assert instance_cpu >= 1 177 | assert instance_mem >= 1024 * 1024 * 1024 178 | assert instance_disk >= 10 * 1024 * 1024 * 1024 179 | 180 | max_cpu_percent = data['stats']['max']['cpu_pct'] 181 | max_mem = data['stats']['max']['mem'] 182 | max_disk = data['stats']['max']['disk'] 183 | if max_cpu_percent or data['task_name'] == 'main.t1': 184 | assert max_cpu_percent <= 100.0 185 | if max_mem or data['task_name'] == 'main.t1': 186 | assert max_mem <= instance_mem 187 | if max_disk or data['task_name'] == 'main.t1': 188 | assert max_disk <= instance_disk 189 | 190 | # test cleanup on gcp backend (gs://) 191 | root_out_dir = cm.data['workflowRoot'] 192 | 193 | # remote metadata JSON file on workflow's root output dir. 194 | remote_metadata_json_file = os.path.join(root_out_dir, 'metadata.json') 195 | assert GCSURI(remote_metadata_json_file).exists 196 | 197 | # dry-run should not delete anything 198 | cm.cleanup(dry_run=True) 199 | assert GCSURI(remote_metadata_json_file).exists 200 | 201 | cm.cleanup(dry_run=False) 202 | assert not GCSURI(remote_metadata_json_file).exists 203 | -------------------------------------------------------------------------------- /tests/test_cli_server_client_gcp.py: -------------------------------------------------------------------------------- 1 | """This does not cover all CLI parameters defined in caper/caper_args.py. 2 | gcp (Google Cloud Platform) backend is tested here with server/client functions. 
3 | """ 4 | import os 5 | import time 6 | 7 | import pytest 8 | from autouri import AutoURI 9 | 10 | from caper.cli import main as cli_main 11 | from caper.cromwell_rest_api import CromwellRestAPI 12 | from caper.wdl_parser import WDLParser 13 | 14 | from .example_wdl import make_directory_with_wdls 15 | 16 | TIMEOUT_SERVER_SPIN_UP = 500 17 | TIMEOUT_SERVER_RUN_WORKFLOW = 960 18 | 19 | 20 | @pytest.mark.google_cloud 21 | @pytest.mark.integration 22 | def test_server_client( 23 | tmp_path, 24 | gcs_root, 25 | ci_prefix, 26 | cromwell, 27 | womtool, 28 | gcp_prj, 29 | gcp_service_account_key_json, 30 | debug_caper, 31 | ): 32 | """Test server, client stuffs""" 33 | # server command line 34 | server_port = 8015 35 | 36 | out_gcs_bucket = os.path.join(gcs_root, 'caper_out', ci_prefix) 37 | tmp_gcs_bucket = os.path.join(gcs_root, 'caper_tmp') 38 | 39 | cmd = ['server'] 40 | cmd += ['--local-loc-dir', str(tmp_path / 'tmp_dir')] 41 | cmd += ['--backend', 'gcp'] 42 | if gcp_service_account_key_json: 43 | cmd += ['--gcp-service-account-key-json', gcp_service_account_key_json] 44 | cmd += ['--gcp-prj', gcp_prj] 45 | cmd += ['--gcp-zones', 'us-west1-a,us-west1-b'] 46 | cmd += ['--gcp-out-dir', out_gcs_bucket] 47 | cmd += ['--gcp-loc-dir', tmp_gcs_bucket] 48 | cmd += ['--cromwell-stdout', str(tmp_path / 'cromwell_stdout.o')] 49 | cmd += ['--db', 'in-memory'] 50 | cmd += ['--db-timeout', '500000'] 51 | cmd += ['--file-db', str(tmp_path / 'file_db_prefix')] 52 | cmd += ['--max-concurrent-tasks', '2'] 53 | cmd += ['--max-concurrent-workflows', '2'] 54 | cmd += ['--disable-call-caching'] 55 | cmd += ['--local-hash-strat', 'path'] 56 | cmd += ['--local-out-dir', str(tmp_path / 'out_dir')] 57 | cmd += ['--cromwell', cromwell] 58 | cmd += ['--java-heap-server', '8G'] 59 | cmd += ['--port', str(server_port)] 60 | if debug_caper: 61 | cmd += ['--debug'] 62 | print(' '.join(cmd)) 63 | 64 | try: 65 | th = cli_main(cmd, nonblocking_server=True) 66 | 67 | # wait until server is ready to take submissions 68 | t_start = time.time() 69 | while th.status is None: 70 | time.sleep(1) 71 | if time.time() - t_start > TIMEOUT_SERVER_SPIN_UP: 72 | raise TimeoutError('Timed out waiting for Cromwell server spin-up.') 73 | 74 | # prepare WDLs and input JSON, imports to be submitted 75 | make_directory_with_wdls(str(tmp_path)) 76 | wdl = tmp_path / 'main.wdl' 77 | inputs = tmp_path / 'inputs.json' 78 | p = WDLParser(str(wdl)) 79 | imports = p.zip_subworkflows(str(tmp_path / 'imports.zip')) 80 | 81 | # test "submit" with on_hold 82 | cmd = ['submit', str(wdl)] 83 | if gcp_service_account_key_json: 84 | cmd += ['--gcp-service-account-key-json', gcp_service_account_key_json] 85 | cmd += ['--port', str(server_port)] 86 | cmd += ['--inputs', str(inputs)] 87 | cmd += ['--imports', str(imports)] 88 | cmd += ['--gcp-zones', 'us-west1-a,us-west1-b'] 89 | cmd += ['--gcp-loc-dir', tmp_gcs_bucket] 90 | cmd += ['--ignore-womtool'] 91 | cmd += ['--java-heap-womtool', '2G'] 92 | cmd += ['--max-retries', '1'] 93 | cmd += ['--docker', 'ubuntu:latest'] 94 | cmd += ['--backend', 'gcp'] 95 | cmd += ['--hold'] 96 | if debug_caper: 97 | cmd += ['--debug'] 98 | cli_main(cmd) 99 | 100 | time.sleep(10) 101 | 102 | # find workflow ID 103 | cra = CromwellRestAPI(hostname='localhost', port=server_port) 104 | workflow_id = cra.find(['*'])[0]['id'] 105 | 106 | m = cra.get_metadata([workflow_id])[0] 107 | assert m['status'] == 'On Hold' 108 | 109 | # unhold it 110 | cmd = ['unhold', workflow_id] 111 | cmd += ['--port', str(server_port)] 112 | cli_main(cmd) 
113 | 114 | time.sleep(5) 115 | 116 | m = cra.get_metadata([workflow_id])[0] 117 | assert m['status'] in ('Submitted', 'Running') 118 | 119 | t_start = time.time() 120 | while True: 121 | time.sleep(5) 122 | m = cra.get_metadata([workflow_id])[0] 123 | workflow_root = m.get('workflowRoot') 124 | if workflow_root: 125 | metadata_json_file = os.path.join(workflow_root, 'metadata.json') 126 | else: 127 | metadata_json_file = None 128 | print('polling: ', workflow_id, m['status'], metadata_json_file) 129 | 130 | if m['status'] in ('Failed', 'Succeeded'): 131 | if AutoURI(metadata_json_file).exists: 132 | break 133 | elif metadata_json_file: 134 | assert not AutoURI(metadata_json_file).exists 135 | 136 | if time.time() - t_start > TIMEOUT_SERVER_RUN_WORKFLOW: 137 | raise TimeoutError('Timed out waiting for workflow being done.') 138 | 139 | finally: 140 | # all done. so stop the server 141 | if th: 142 | th.stop() 143 | th.join() 144 | -------------------------------------------------------------------------------- /tests/test_cromwell.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import time 5 | 6 | import pytest 7 | 8 | from caper.cromwell import Cromwell, WomtoolValidationFailed 9 | from caper.cromwell_rest_api import CromwellRestAPI 10 | from caper.wdl_parser import WDLParser 11 | 12 | from .example_wdl import WRONG_WDL, make_directory_with_wdls 13 | 14 | BACKEND_CONF_CONTENTS = """ 15 | backend {{ 16 | providers {{ 17 | Local {{ 18 | config {{ 19 | root = {root} 20 | }} 21 | }} 22 | }} 23 | }} 24 | """ 25 | 26 | TIMEOUT_SERVER_SPIN_UP = 200 27 | TIMEOUT_SERVER_RUN_WORKFLOW = 960 28 | 29 | 30 | def test_validate(tmp_path, cromwell, womtool): 31 | c = Cromwell(cromwell=cromwell, womtool=womtool) 32 | 33 | wdl = tmp_path / 'wrong.wdl' 34 | wdl.write_text(WRONG_WDL) 35 | with pytest.raises(WomtoolValidationFailed): 36 | c.validate(str(wdl)) 37 | 38 | make_directory_with_wdls(str(tmp_path / 'successful')) 39 | wdl = tmp_path / 'successful' / 'main.wdl' 40 | inputs = tmp_path / 'successful' / 'inputs.json' 41 | c.validate(str(wdl), str(inputs)) 42 | 43 | # zip subworkflows for later use 44 | p = WDLParser(str(wdl)) 45 | imports = p.zip_subworkflows(str(tmp_path / 'imports.zip')) 46 | 47 | # test with imports.zip 48 | make_directory_with_wdls(str(tmp_path / 'wo_sub_wdls'), no_sub_wdl=True) 49 | wdl = tmp_path / 'wo_sub_wdls' / 'main.wdl' 50 | inputs = tmp_path / 'wo_sub_wdls' / 'inputs.json' 51 | c.validate(str(wdl), str(inputs), imports) 52 | 53 | 54 | def test_run(tmp_path, cromwell, womtool): 55 | fileobj_stdout = sys.stdout 56 | 57 | c = Cromwell(cromwell=cromwell, womtool=womtool) 58 | 59 | make_directory_with_wdls(str(tmp_path)) 60 | 61 | o_dir = tmp_path / 'output' 62 | o_dir.mkdir() 63 | work_dir = tmp_path / 'work_dir' 64 | work_dir.mkdir() 65 | 66 | backend_conf = tmp_path / 'backend.conf' 67 | backend_conf.write_text(BACKEND_CONF_CONTENTS.format(root=o_dir)) 68 | 69 | try: 70 | th = c.run( 71 | backend_conf=str(backend_conf), 72 | wdl=str(tmp_path / 'main.wdl'), 73 | inputs=str(tmp_path / 'inputs.json'), 74 | metadata=str(tmp_path / 'metadata.json'), 75 | fileobj_stdout=fileobj_stdout, 76 | work_dir=work_dir, 77 | cwd=str(tmp_path), 78 | ) 79 | finally: 80 | th.join() 81 | assert th.returncode == 0 82 | 83 | # check if metadata.json is written on both specified location 84 | # (tmp_path/metadata.json) and workflow's root directory 85 | metadata_dict = th.returnvalue 86 | root_dir = 
metadata_dict['workflowRoot'] 87 | 88 | with open(os.path.join(root_dir, 'metadata.json')) as fp: 89 | metadata_contents_on_root = fp.read() 90 | metadata_dict_on_root = json.loads(metadata_contents_on_root) 91 | 92 | assert metadata_dict == metadata_dict_on_root 93 | # check if backend_conf's change of root directory worked 94 | assert root_dir.startswith(str(o_dir)) 95 | 96 | # zip subworkflows for later use 97 | p = WDLParser(str(tmp_path / 'main.wdl')) 98 | imports = p.zip_subworkflows(str(tmp_path / 'imports.zip')) 99 | 100 | # test without sub WDLs but with imports.zip 101 | # test run without work_dir 102 | make_directory_with_wdls(str(tmp_path / 'wo_sub_wdls'), no_sub_wdl=True) 103 | 104 | try: 105 | th = c.run( 106 | wdl=str(tmp_path / 'wo_sub_wdls' / 'main.wdl'), 107 | inputs=str(tmp_path / 'wo_sub_wdls' / 'inputs.json'), 108 | imports=imports, 109 | fileobj_stdout=fileobj_stdout, 110 | cwd=str(tmp_path / 'wo_sub_wdls'), 111 | ) 112 | finally: 113 | th.join() 114 | assert th.returncode == 0 115 | 116 | 117 | def test_server(tmp_path, cromwell, womtool): 118 | """Test Cromwell.server() method, which returns a Thread object.""" 119 | server_port = 8005 120 | fileobj_stdout = sys.stdout 121 | 122 | c = Cromwell(cromwell=cromwell, womtool=womtool) 123 | 124 | o_dir = tmp_path / 'output' 125 | o_dir.mkdir() 126 | 127 | backend_conf = tmp_path / 'backend.conf' 128 | backend_conf.write_text(BACKEND_CONF_CONTENTS.format(root=o_dir)) 129 | 130 | is_server_started = False 131 | 132 | def on_server_start(): 133 | nonlocal is_server_started 134 | is_server_started = True 135 | 136 | workflow_id = None 137 | is_workflow_done = False 138 | 139 | def on_status_change(metadata): 140 | nonlocal workflow_id 141 | nonlocal is_workflow_done 142 | 143 | if metadata: 144 | if metadata['id'] == workflow_id: 145 | if metadata['status'] in ('Succeeded', 'Failed'): 146 | is_workflow_done = True 147 | 148 | # also tests two callback functions 149 | try: 150 | th = c.server( 151 | server_port=server_port, 152 | backend_conf=str(backend_conf), 153 | embed_subworkflow=True, 154 | fileobj_stdout=fileobj_stdout, 155 | on_server_start=on_server_start, 156 | on_status_change=on_status_change, 157 | cwd=str(tmp_path), 158 | ) 159 | assert th.status is None 160 | 161 | # wait until server is ready to take submissions 162 | t_start = time.time() 163 | while not is_server_started: 164 | time.sleep(1) 165 | if time.time() - t_start > TIMEOUT_SERVER_SPIN_UP: 166 | raise TimeoutError('Timed out waiting for Cromwell server spin-up.') 167 | 168 | # another way of checking server is started 169 | assert th.status 170 | 171 | # make WDLs and imports 172 | wdl = tmp_path / 'main.wdl' 173 | make_directory_with_wdls(str(tmp_path)) 174 | # zip subworkflows for later use 175 | p = WDLParser(str(wdl)) 176 | imports = p.zip_subworkflows(str(tmp_path / 'imports.zip')) 177 | 178 | cra = CromwellRestAPI(hostname='localhost', port=server_port) 179 | r = cra.submit( 180 | source=str(wdl), dependencies=imports, inputs=str(tmp_path / 'inputs.json') 181 | ) 182 | workflow_id = r['id'] 183 | 184 | t_start = time.time() 185 | while not is_workflow_done: 186 | time.sleep(1) 187 | print('polling: ', workflow_id, is_workflow_done) 188 | if time.time() - t_start > TIMEOUT_SERVER_RUN_WORKFLOW: 189 | raise TimeoutError('Timed out waiting for workflow being done.') 190 | 191 | metadata = cra.get_metadata([workflow_id], embed_subworkflow=True)[0] 192 | 193 | # check if metadata JSON is written on workflow's root directory. 
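# (workflowRoot is reported by Cromwell; with the root override in BACKEND_CONF_CONTENTS
#  it presumably resolves to something like <o_dir>/main/<workflow-uuid>/ -- the exact
#  directory layout is Cromwell's convention, not Caper's.)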
194 | root_dir = metadata['workflowRoot'] 195 | metadata_file = os.path.join(root_dir, 'metadata.json') 196 | assert os.path.exists(metadata_file) 197 | 198 | # check if subworkflow is embedded. 199 | with open(metadata_file) as fp: 200 | metadata_from_file = json.loads(fp.read()) 201 | assert metadata == metadata_from_file 202 | 203 | finally: 204 | th.stop() 205 | th.join() 206 | -------------------------------------------------------------------------------- /tests/test_cromwell_backend.py: -------------------------------------------------------------------------------- 1 | """There are lots of UserDict-based classes in caper/cromwell_backend.py. 2 | In this test, only the following classes with public methods 3 | will be tested. 4 | - CromwellBackendBase 5 | 6 | """ 7 | from caper.cromwell_backend import CromwellBackendBase 8 | 9 | 10 | def test_cromwell_backend_base_backend(): 11 | """Test the backend property's getter and setter.""" 12 | bb1 = CromwellBackendBase('test1') 13 | backend_dict = {'a': 1, 'b': '2'} 14 | 15 | bb1.backend = backend_dict 16 | assert bb1.backend == backend_dict 17 | 18 | 19 | def test_cromwell_backend_base_merge_backend(): 20 | bb1 = CromwellBackendBase('test1') 21 | bb1.backend = {'a': 1, 'b': '2'} 22 | backend_dict = {'c': 3.0, 'd': '4.0'} 23 | 24 | bb1.merge_backend(backend_dict) 25 | assert bb1.backend == {'a': 1, 'b': '2', 'c': 3.0, 'd': '4.0'} 26 | 27 | 28 | def test_cromwell_backend_base_backend_config(): 29 | bb1 = CromwellBackendBase('test1') 30 | bb1.backend = {'config': {'root': 'test/folder'}} 31 | assert bb1.backend_config == {'root': 'test/folder'} 32 | 33 | 34 | def test_cromwell_backend_base_backend_config_dra(): 35 | bb1 = CromwellBackendBase('test1') 36 | bb1.backend = { 37 | 'config': { 38 | 'root': 'test/folder', 39 | 'default-runtime-attributes': {'docker': 'ubuntu:latest'}, 40 | } 41 | } 42 | assert bb1.default_runtime_attributes == {'docker': 'ubuntu:latest'} 43 | -------------------------------------------------------------------------------- /tests/test_cromwell_metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from autouri import AutoURI 5 | 6 | from caper.cromwell import Cromwell 7 | from caper.cromwell_metadata import CromwellMetadata 8 | 9 | from .example_wdl import make_directory_with_failing_wdls, make_directory_with_wdls 10 | 11 | 12 | def test_on_successful_workflow(tmp_path, cromwell, womtool): 13 | fileobj_stdout = sys.stdout 14 | 15 | make_directory_with_wdls(str(tmp_path / 'successful')) 16 | 17 | # Run Cromwell to get metadata JSON 18 | c = Cromwell(cromwell=cromwell, womtool=womtool) 19 | th = c.run( 20 | wdl=str(tmp_path / 'successful' / 'main.wdl'), 21 | inputs=str(tmp_path / 'successful' / 'inputs.json'), 22 | fileobj_stdout=fileobj_stdout, 23 | cwd=str(tmp_path / 'successful'), 24 | ) 25 | th.join() 26 | metadata = th.returnvalue 27 | assert metadata 28 | 29 | cm = CromwellMetadata(metadata) 30 | # test all properties 31 | assert cm.data == metadata 32 | assert cm.metadata == metadata 33 | assert CromwellMetadata(metadata).data == metadata 34 | assert cm.workflow_id == metadata['id'] 35 | assert cm.workflow_status == metadata['status'] 36 | # no failures for a successful workflow's metadata 37 | assert cm.failures is None 38 | assert cm.calls == metadata['calls'] 39 | assert sorted([call_name for call_name, _, _ in cm.recursed_calls]) == sorted( 40 | ['main.t1', 'sub.t2', 'sub_sub.t3'] 41 | ) 42 | 43 | # test recurse_calls(): test with a simple
function 44 | def fnc(call_name, call, parent_call_names): 45 | assert call_name in ('main.t1', 'sub.t2', 'sub_sub.t3') 46 | assert call['executionStatus'] == 'Done' 47 | if call_name == 'main.t1': 48 | assert not parent_call_names 49 | elif call_name == 'sub.t2': 50 | assert parent_call_names == ('main.sub',) 51 | elif call_name == 'sub_sub.t3': 52 | assert parent_call_names == ('main.sub', 'sub.sub_sub') 53 | else: 54 | raise ValueError('Wrong call_name: {name}'.format(name=call_name)) 55 | 56 | list(cm.recurse_calls(fnc)) 57 | 58 | # test write_on_workflow_root() 59 | m_file_on_root = os.path.join(cm.metadata['workflowRoot'], 'metadata.json') 60 | u = AutoURI(m_file_on_root) 61 | u.rm() 62 | assert not u.exists 63 | 64 | cm.write_on_workflow_root() 65 | assert os.path.exists(m_file_on_root) 66 | assert CromwellMetadata(m_file_on_root).metadata == cm.metadata 67 | 68 | 69 | def test_on_failed_workflow(tmp_path, cromwell, womtool): 70 | fileobj_stdout = sys.stdout 71 | 72 | make_directory_with_failing_wdls(str(tmp_path / 'failed')) 73 | 74 | # Run Cromwell to get metadata JSON 75 | # designed to fail in a subworkflow 76 | c = Cromwell(cromwell=cromwell, womtool=womtool) 77 | th = c.run( 78 | wdl=str(tmp_path / 'failed' / 'main.wdl'), 79 | inputs=str(tmp_path / 'failed' / 'inputs.json'), 80 | fileobj_stdout=fileobj_stdout, 81 | cwd=str(tmp_path / 'failed'), 82 | ) 83 | th.join() 84 | 85 | # check failed 86 | assert th.returncode 87 | metadata = th.returnvalue 88 | assert metadata 89 | cm = CromwellMetadata(metadata) 90 | 91 | assert cm.failures == metadata['failures'] 92 | assert cm.calls == metadata['calls'] 93 | 94 | # test troubleshoot() 95 | report = cm.troubleshoot() 96 | assert '* Found failures JSON object' in report 97 | assert 'NAME=sub.t2_failing' in report 98 | assert 'INTENTED_ERROR: command not found' in report 99 | -------------------------------------------------------------------------------- /tests/test_cromwell_rest_api.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | 4 | import pytest 5 | 6 | from caper.caper_labels import CaperLabels 7 | from caper.cromwell import Cromwell 8 | from caper.cromwell_rest_api import CromwellRestAPI, has_wildcard, is_valid_uuid 9 | from caper.wdl_parser import WDLParser 10 | 11 | from .example_wdl import make_directory_with_wdls 12 | 13 | 14 | @pytest.mark.parametrize( 15 | 'test_input,expected', 16 | [ 17 | ('asldkhjlkasdf289jisdl;sladkjasdflksd', False), 18 | ('cromwell-f9c26f2e-f550-4748-a650-5d0d4cab9f3a', False), 19 | ('f9c26f2e-f550-4748-a650-5d0d4c', False), 20 | ('f9c26f2e-f550-4748-a650-5d0d4cab9f3a', True), 21 | ('F9C26f2e-F550-4748-A650-5D0D4cab9f3a', False), 22 | ('f9c26f2e', False), 23 | ([], False), 24 | (tuple(), False), 25 | (None, False), 26 | ], 27 | ) 28 | def test_is_valid_uuid(test_input, expected): 29 | assert is_valid_uuid(test_input) == expected 30 | 31 | 32 | @pytest.mark.parametrize( 33 | 'test_input,expected', 34 | [ 35 | ('?????', True), 36 | (('lskadfj', 'sdkfjaslf'), False), 37 | ('*', True), 38 | ('?', True), 39 | (':', False), 40 | (('*', '?'), True), 41 | (('_', '-', 'asdfjkljklasdfjklasdf'), False), 42 | ([], False), 43 | (tuple(), False), 44 | (None, False), 45 | ], 46 | ) 47 | def test_has_wildcard(test_input, expected): 48 | assert has_wildcard(test_input) == expected 49 | 50 | 51 | def test_all(tmp_path, cromwell, womtool): 52 | """Test Cromwell.server() method, which returns a Thread object.""" 53 | server_port = 8010 54 | 
fileobj_stdout = sys.stdout 55 | test_label = 'test_label' 56 | 57 | c = Cromwell(cromwell=cromwell, womtool=womtool) 58 | 59 | o_dir = tmp_path / 'output' 60 | o_dir.mkdir() 61 | 62 | labels_file = CaperLabels().create_file( 63 | directory=str(tmp_path), str_label=test_label 64 | ) 65 | 66 | is_server_started = False 67 | 68 | def on_server_start(): 69 | nonlocal is_server_started 70 | is_server_started = True 71 | 72 | workflow_id = None 73 | is_workflow_done = False 74 | 75 | def on_status_change(metadata): 76 | nonlocal workflow_id 77 | nonlocal is_workflow_done 78 | 79 | if metadata: 80 | if metadata['id'] == workflow_id: 81 | if metadata['status'] in ('Succeeded', 'Failed'): 82 | is_workflow_done = True 83 | 84 | # also tests two callback functions 85 | try: 86 | th = c.server( 87 | server_port=server_port, 88 | embed_subworkflow=True, 89 | fileobj_stdout=fileobj_stdout, 90 | on_server_start=on_server_start, 91 | on_status_change=on_status_change, 92 | cwd=str(tmp_path), 93 | ) 94 | assert th.status is None 95 | 96 | # wait until server is ready to take submissions 97 | t_start = time.time() 98 | while not is_server_started: 99 | time.sleep(1) 100 | if time.time() - t_start > 60: 101 | raise TimeoutError('Timed out waiting for Cromwell server spin-up.') 102 | 103 | # another way of checking server is started 104 | assert th.status 105 | 106 | # make WDLs and imports 107 | wdl = tmp_path / 'main.wdl' 108 | make_directory_with_wdls(str(tmp_path)) 109 | # zip subworkflows for later use 110 | p = WDLParser(str(wdl)) 111 | imports = p.zip_subworkflows(str(tmp_path / 'imports.zip')) 112 | 113 | cra = CromwellRestAPI(hostname='localhost', port=server_port) 114 | # no workflow 115 | assert not cra.find(workflow_ids=['*']) 116 | 117 | # put a hold on a workflow when submitting 118 | r = cra.submit( 119 | source=str(wdl), 120 | dependencies=imports, 121 | inputs=str(tmp_path / 'inputs.json'), 122 | labels=labels_file, 123 | on_hold=True, 124 | ) 125 | workflow_id = r['id'] 126 | time.sleep(10) 127 | # find by workflow ID 128 | workflow_by_id = cra.find(workflow_ids=[workflow_id])[0] 129 | # find by label 130 | workflow_by_label = cra.find(labels=[('caper-str-label', test_label)])[0] 131 | # find by workflow ID with wildcard * 132 | workflow_by_id_with_wildcard = cra.find(workflow_ids=[workflow_id[:-10] + '*'])[ 133 | 0 134 | ] 135 | # find by label with wildcard ? 
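# (find() presumably matches shell-style wildcards -- '*' and '?' -- in both workflow IDs
#  and label values; has_wildcard(), parametrized above, is what detects such patterns.)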
136 | workflow_by_label_with_wildcard = cra.find( 137 | labels=[('caper-str-label', test_label[:-1] + '?')] 138 | )[0] 139 | 140 | assert workflow_by_label['id'] == workflow_id 141 | assert workflow_by_id['id'] == workflow_id 142 | assert workflow_by_id_with_wildcard['id'] == workflow_id 143 | assert workflow_by_label_with_wildcard['id'] == workflow_id 144 | assert workflow_by_id['status'] == 'On Hold' 145 | 146 | cra.release_hold([workflow_id]) 147 | time.sleep(3) 148 | 149 | assert cra.get_label(workflow_id, 'caper-str-label') == test_label 150 | assert cra.get_labels(workflow_id)['caper-str-label'] == test_label 151 | 152 | # abort it 153 | assert cra.find([workflow_id])[0]['status'] in ('Submitted', 'On Hold') 154 | cra.abort([workflow_id]) 155 | time.sleep(5) 156 | assert cra.find([workflow_id])[0]['status'] == 'Aborted' 157 | 158 | # submit another workflow 159 | r = cra.submit( 160 | source=str(wdl), 161 | dependencies=imports, 162 | inputs=str(tmp_path / 'inputs.json'), 163 | on_hold=False, 164 | ) 165 | is_workflow_done = False 166 | workflow_id = r['id'] 167 | time.sleep(5) 168 | 169 | t_start = time.time() 170 | while not is_workflow_done: 171 | time.sleep(1) 172 | print('polling: ', workflow_id, is_workflow_done) 173 | if time.time() - t_start > 120: 174 | raise TimeoutError('Timed out waiting for workflow being done.') 175 | 176 | metadata = cra.get_metadata([workflow_id], embed_subworkflow=True)[0] 177 | metadata_wo_sub = cra.get_metadata([workflow_id], embed_subworkflow=False)[0] 178 | 179 | assert 'subWorkflowMetadata' not in metadata_wo_sub['calls']['main.sub'][0] 180 | subworkflow = metadata['calls']['main.sub'][0] 181 | assert 'subWorkflowMetadata' in subworkflow 182 | assert ( 183 | 'subWorkflowMetadata' 184 | in subworkflow['subWorkflowMetadata']['calls']['sub.sub_sub'][0] 185 | ) 186 | 187 | # check server's properties before closing it 188 | assert cra.get_default_backend() == 'Local' 189 | assert cra.get_backends()['supportedBackends'] == ['Local'] 190 | 191 | finally: 192 | th.stop() 193 | th.join() 194 | -------------------------------------------------------------------------------- /tests/test_dict_tool.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | from caper.dict_tool import ( 4 | dict_to_dot_str, 5 | flatten_dict, 6 | merge_dict, 7 | split_dict, 8 | unflatten_dict, 9 | ) 10 | 11 | 12 | def test_merge_dict(): 13 | d1 = { 14 | 'flagstat_qc': {'rep1': {'read1': 100}, 'rep2': {'read2': 400}}, 15 | 'etc': {'samstat_qc': {'rep1': {'unmapped': 500, 'mapped': 600}}}, 16 | } 17 | d2 = { 18 | 'flagstat_qc': {'rep1': {'read2': 200}, 'rep2': {'read1': 300}}, 19 | 'etc': {'samstat_qc': {'rep2': {'unmapped': 700, 'mapped': 800}}}, 20 | 'idr_qc': {'qc_test1': 900}, 21 | } 22 | 23 | assert merge_dict(d1, d2) == { 24 | 'flagstat_qc': { 25 | 'rep1': {'read1': 100, 'read2': 200}, 26 | 'rep2': {'read1': 300, 'read2': 400}, 27 | }, 28 | 'etc': { 29 | 'samstat_qc': { 30 | 'rep1': {'unmapped': 500, 'mapped': 600}, 31 | 'rep2': {'unmapped': 700, 'mapped': 800}, 32 | } 33 | }, 34 | 'idr_qc': {'qc_test1': 900}, 35 | } 36 | 37 | 38 | def test_flatten_dict(): 39 | d = { 40 | 'flagstat_qc': { 41 | 'rep1': {'read1': 100, 'read2': 200}, 42 | 'rep2': {'read1': 300, 'read2': 400}, 43 | }, 44 | 'rep': 1, 45 | } 46 | assert flatten_dict(d) == { 47 | ('flagstat_qc', 'rep1', 'read1'): 100, 48 | ('flagstat_qc', 'rep1', 'read2'): 200, 49 | ('flagstat_qc', 'rep2', 'read1'): 300, 50 | ('flagstat_qc', 'rep2', 'read2'): 400, 
51 | ('rep',): 1, 52 | } 53 | 54 | 55 | def test_unflatten_dict(): 56 | d_f = { 57 | ('flagstat_qc', 'rep1', 'read1'): 100, 58 | ('flagstat_qc', 'rep1', 'read2'): 200, 59 | ('flagstat_qc', 'rep2', 'read1'): 300, 60 | ('flagstat_qc', 'rep2', 'read2'): 400, 61 | ('rep',): 1, 62 | } 63 | assert unflatten_dict(d_f) == { 64 | 'flagstat_qc': { 65 | 'rep1': {'read1': 100, 'read2': 200}, 66 | 'rep2': {'read1': 300, 'read2': 400}, 67 | }, 68 | 'rep': 1, 69 | } 70 | 71 | 72 | def test_split_dict(): 73 | d = { 74 | 'flagstat_qc': { 75 | 'rep1': {'read1': 100, 'read2': 200}, 76 | 'rep2': {'read1': 300, 'read2': 400}, 77 | }, 78 | 'etc': { 79 | 'samstat_qc': { 80 | 'rep1': {'unmapped': 500, 'mapped': 600}, 81 | 'rep2': {'unmapped': 700, 'mapped': 800}, 82 | } 83 | }, 84 | 'idr_qc': {'qc_test1': 900}, 85 | } 86 | splits = split_dict(d, ('replicate', r'^rep\d+$')) 87 | splits_ref = [ 88 | {'idr_qc': {'qc_test1': 900}}, 89 | { 90 | 'flagstat_qc': {'read1': 100, 'read2': 200}, 91 | 'etc': {'samstat_qc': {'unmapped': 500, 'mapped': 600}}, 92 | 'replicate': 'rep1', 93 | }, 94 | { 95 | 'flagstat_qc': {'read1': 300, 'read2': 400}, 96 | 'etc': {'samstat_qc': {'unmapped': 700, 'mapped': 800}}, 97 | 'replicate': 'rep2', 98 | }, 99 | ] 100 | assert splits == splits_ref 101 | 102 | 103 | def test_dict_to_dot_str(): 104 | d = { 105 | 'rankDir': 'TD', 106 | 'start': '[shape=Mdiamond]', 107 | 'end': '[shape=Msquare]', 108 | 'subgraph cluster_rep1': { 109 | 'style': 'filled', 110 | 'color': 'mistyrose', 111 | 'label': '"Replicate 1"', 112 | }, 113 | 'subgraph cluster_rep2': { 114 | 'style': 'filled', 115 | 'color': 'azure', 116 | 'label': '"Replicate 2"', 117 | }, 118 | 'a0 -> b0': None, 119 | 'c0 -> d0': None, 120 | } 121 | dot = dict_to_dot_str(d, parent_key='digraph D', indent=' ' * 4) 122 | ref = dedent( 123 | """\ 124 | digraph D { 125 | rankDir = TD; 126 | start = [shape=Mdiamond]; 127 | end = [shape=Msquare]; 128 | subgraph cluster_rep1 { 129 | style = filled; 130 | color = mistyrose; 131 | label = "Replicate 1"; 132 | } 133 | subgraph cluster_rep2 { 134 | style = filled; 135 | color = azure; 136 | label = "Replicate 2"; 137 | } 138 | a0 -> b0; 139 | c0 -> d0; 140 | } 141 | """ 142 | ) 143 | assert dot == ref 144 | -------------------------------------------------------------------------------- /tests/test_hocon_string.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | from textwrap import dedent 3 | 4 | from caper.dict_tool import merge_dict 5 | from caper.hocon_string import HOCONString 6 | 7 | INCLUDE_CROMWELL = 'include required(classpath("application"))' 8 | 9 | 10 | def get_test_hocon_str(): 11 | hocon_str = dedent( 12 | """\ 13 | include required(classpath("application")) 14 | backend { 15 | default = "gcp" 16 | providers { 17 | Local { 18 | actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" 19 | config { 20 | default-runtime-attributes { 21 | docker = "ubuntu:latest" 22 | } 23 | root = "/mnt/data/scratch/leepc12/caper_out" 24 | } 25 | } 26 | } 27 | }""" 28 | ) 29 | return hocon_str 30 | 31 | 32 | def get_test_hocon_str2(): 33 | hocon_str2 = dedent( 34 | """\ 35 | include required(classpath("application")) 36 | backend { 37 | providers { 38 | gcp { 39 | actor-factory = "GOOGLE" 40 | } 41 | } 42 | }""" 43 | ) 44 | return hocon_str2 45 | 46 | 47 | def get_test_hocon_str_multiple_includes(): 48 | return dedent( 49 | """\ 50 | include required(classpath("application")) 51 | include 
required(file("application")) 52 | include required(url("application")) 53 | include required("application.conf") 54 | level1 { 55 | include file("/srv/test.conf") 56 | level2 { 57 | include url("http://ok.com/test.conf") 58 | level3 { 59 | include classpath("test") 60 | level4 { 61 | include "test.conf" 62 | level5 { 63 | include "test.hocon" 64 | } 65 | } 66 | } 67 | } 68 | }""" 69 | ) 70 | 71 | 72 | def get_test_dict(with_include=False): 73 | base_dict = { 74 | 'backend': { 75 | 'default': 'gcp', 76 | 'providers': { 77 | 'Local': { 78 | 'actor-factory': 'cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory', 79 | 'config': { 80 | 'default-runtime-attributes': {'docker': 'ubuntu:latest'}, 81 | 'root': '/mnt/data/scratch/leepc12/caper_out', 82 | }, 83 | } 84 | }, 85 | } 86 | } 87 | if with_include: 88 | base_dict[ 89 | 'HOCONSTRING_INCLUDE_ad5c3c187d5107c099f66681f1896c70' 90 | ] = 'include required(classpath("application"))' 91 | 92 | return base_dict 93 | 94 | 95 | def get_test_dict2(): 96 | """Without "include" lines.""" 97 | return {'backend': {'providers': {'gcp': {'actor-factory': 'GOOGLE'}}}} 98 | 99 | 100 | def get_test_multiple_includes(with_include=False): 101 | if with_include: 102 | return { 103 | "HOCONSTRING_INCLUDE_ad5c3c187d5107c099f66681f1896c70": "include required(classpath(\"application\"))", 104 | "HOCONSTRING_INCLUDE_61b86ce2e19939719a2e043b923774e4": "include required(file(\"application\"))", 105 | "HOCONSTRING_INCLUDE_543d042c69d8a730bc2b5785ac2f13c9": "include required(url(\"application\"))", 106 | "HOCONSTRING_INCLUDE_9456b859a44adad9a3d00ff3fcbbc5ae": "include required(\"application.conf\")", 107 | "level1": { 108 | "HOCONSTRING_INCLUDE_0714deb341d3d6291199d4738656c32b": "include file(\"/srv/test.conf\")", 109 | "level2": { 110 | "HOCONSTRING_INCLUDE_91f31b362d72089d09f6245e912efb30": "include url(\"http://ok.com/test.conf\")", 111 | "level3": { 112 | "HOCONSTRING_INCLUDE_906d6e6eff885e840b705c2e7be3ba2d": "include classpath(\"test\")", 113 | "level4": { 114 | "HOCONSTRING_INCLUDE_c971be2dbb00ef0b44b9e4bf3c57f5cb": "include \"test.conf\"", 115 | "level5": { 116 | "HOCONSTRING_INCLUDE_44cb98470497b76dde0ab244c70870f0": "include \"test.hocon\"" 117 | }, 118 | }, 119 | }, 120 | }, 121 | }, 122 | } 123 | else: 124 | return {'level1': {'level2': {'level3': {'level4': {'level5': {}}}}}} 125 | 126 | 127 | def test_from_dict(): 128 | ref_d = get_test_dict() 129 | hs = HOCONString.from_dict(ref_d, include=INCLUDE_CROMWELL) 130 | print(str(hs)) 131 | print(get_test_hocon_str()) 132 | assert str(hs) == get_test_hocon_str() 133 | 134 | 135 | def test_to_dict(): 136 | hs = HOCONString(get_test_hocon_str()) 137 | assert hs.to_dict(with_include=False) == get_test_dict(with_include=False) 138 | assert hs.to_dict(with_include=True) == get_test_dict(with_include=True) 139 | 140 | hs = HOCONString(get_test_hocon_str_multiple_includes()) 141 | assert hs.to_dict(with_include=False) == get_test_multiple_includes( 142 | with_include=False 143 | ) 144 | assert hs.to_dict(with_include=True) == get_test_multiple_includes( 145 | with_include=True 146 | ) 147 | 148 | 149 | def test_merge(): 150 | s1 = get_test_hocon_str() 151 | s2 = get_test_hocon_str2() 152 | s3 = get_test_hocon_str_multiple_includes() 153 | 154 | d1 = get_test_dict() 155 | d2 = get_test_dict2() 156 | d3 = get_test_multiple_includes(True) 157 | 158 | dm12 = deepcopy(d1) 159 | merge_dict(dm12, d2) 160 | dm32 = deepcopy(d3) 161 | merge_dict(dm32, d2) 162 | 163 | hs1 = HOCONString(s1) 164 | hs2 = 
HOCONString(s2) 165 | hs3 = HOCONString(s3) 166 | hsm12 = HOCONString.from_dict(dm12, include=INCLUDE_CROMWELL) 167 | hsm32 = HOCONString.from_dict(dm32) 168 | 169 | assert str(hsm12) == hs1.merge(hs2) 170 | assert str(hsm12) == hs1.merge(d2) 171 | assert str(hsm12) == hs1.merge(s2) 172 | 173 | assert str(hsm32) == hs3.merge(hs2) 174 | assert str(hsm32) == hs3.merge(d2) 175 | assert str(hsm32) == hs3.merge(s2) 176 | 177 | # merge with update 178 | # item 1 (hs1) should be updated in place with the merged result 179 | hs1_original_str = str(hs1) 180 | assert str(hsm12) == hs1.merge(hs2, update=True) 181 | assert str(hs1) == str(hsm12) 182 | assert hs1_original_str != str(hs1) 183 | 184 | 185 | def test_get_contents(): 186 | s2 = get_test_hocon_str2() 187 | hs2 = HOCONString(s2) 188 | 189 | assert hs2.get_contents(with_include=True).strip() == s2 190 | assert ( 191 | hs2.get_contents(with_include=False).strip() 192 | == s2.replace(INCLUDE_CROMWELL, '').strip() 193 | ) 194 | -------------------------------------------------------------------------------- /tests/test_nb_subproc_thread.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import pytest 5 | 6 | from caper.nb_subproc_thread import NBSubprocThread 7 | 8 | SH_CONTENTS = """#!/bin/bash 9 | 10 | echoerr() { echo "$@" 1>&2; } 11 | 12 | NUM=$1 13 | if [ -z "$NUM" ] 14 | then 15 | NUM=10 16 | fi 17 | 18 | echo $NUM 19 | 20 | # NUM kitties (1 kitty per sec) 21 | for i in $(seq 1 $NUM) 22 | do 23 | echo "hello kitty $i-1. (STDOUT)" 24 | sleep 0.25 25 | echoerr "hello kitty $i-1. (STDERR)" 26 | sleep 0.25 27 | echoerr "hello kitty $i-2. (STDERR)" 28 | sleep 0.25 29 | echo "hello kitty $i-2. (STDOUT)" 30 | sleep 0.25 31 | done 32 | 33 | exit 10 34 | """ 35 | 36 | 37 | def on_stdout(stdout): 38 | print('captured stdout:', stdout) 39 | assert stdout.endswith('\n') 40 | 41 | 42 | def on_stderr(stderr): 43 | print('captured stderr:', stderr) 44 | assert stderr.endswith('\n') 45 | 46 | 47 | def on_poll(): 48 | print('polling') 49 | 50 | 51 | def on_finish(): 52 | return 'done' 53 | 54 | 55 | def test_nb_subproc_thread(tmp_path): 56 | sh = tmp_path / 'test.sh' 57 | sh.write_text(SH_CONTENTS) 58 | 59 | th = NBSubprocThread( 60 | args=['bash', str(sh)], 61 | on_poll=on_poll, 62 | on_stdout=on_stdout, 63 | on_stderr=on_stderr, 64 | on_finish=on_finish, 65 | poll_interval=0.1, 66 | ) 67 | assert th.returnvalue is None 68 | assert not th.is_alive() 69 | th.start() 70 | assert th.is_alive() 71 | # rc is None while running 72 | assert th.returncode is None 73 | th.join() 74 | assert th.returncode == 10 75 | assert th.returnvalue == 'done' 76 | 77 | 78 | def test_nb_subproc_thread_stopped(tmp_path): 79 | sh = tmp_path / 'test.sh' 80 | sh.write_text(SH_CONTENTS) 81 | 82 | th = NBSubprocThread(args=['bash', str(sh)], on_stdout=on_stdout) 83 | th.start() 84 | time.sleep(2) 85 | assert th.is_alive() 86 | th.stop(wait=True) 87 | assert not th.is_alive() 88 | # rc is no longer None once the subprocess has been terminated 89 | assert th.returncode is not None 90 | # subprocess is terminated before it reaches kitty 4 (we stop at ~2 sec).
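# (timing: each loop iteration in SH_CONTENTS sleeps 4 x 0.25 s = 1 s, so stopping after
#  ~2 s leaves only kitties 1-2 fully printed; kitty 4's stderr lines would not appear
#  until roughly 3.5 s in.)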
91 | assert 'hello kitty 4' not in th.stderr 92 | 93 | 94 | def test_nb_subproc_thread_nonzero_rc(): 95 | for rc in range(10): 96 | th = NBSubprocThread( 97 | args=['bash', '-c', 'exit {rc}'.format(rc=rc)], on_stderr=on_stderr 98 | ) 99 | th.start() 100 | th.join() 101 | assert th.returncode == rc 102 | 103 | 104 | @pytest.mark.parametrize('test_app,expected_rc', [('cat', 1), ('ls', 2), ('java', 1)]) 105 | def test_nb_subproc_thread_nonzero_rc_for_real_apps(test_app, expected_rc): 106 | test_str = 'asdfasf-10190212-zxcv' 107 | if os.path.exists(test_str): 108 | raise ValueError('Test string should not be an existing file.') 109 | 110 | th = NBSubprocThread( 111 | args=[test_app, test_str], on_stdout=on_stdout, on_stderr=on_stderr 112 | ) 113 | th.start() 114 | th.join() 115 | assert th.returncode == expected_rc 116 | assert test_str in th.stderr 117 | assert th.stdout == '' 118 | -------------------------------------------------------------------------------- /tests/test_resource_analysis.py: -------------------------------------------------------------------------------- 1 | """Test is based on a metadata JSON file generated from 2 | running atac-seq-pipeline v1.8.0 with the following input JSON. 3 | gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ_subsampled_caper.json 4 | """ 5 | 6 | import pytest 7 | 8 | from caper.resource_analysis import LinearResourceAnalysis, ResourceAnalysis 9 | 10 | 11 | def test_resource_analysis_abstract_class(gcp_res_analysis_metadata): 12 | with pytest.raises(TypeError): 13 | # abstract base-class 14 | ResourceAnalysis() 15 | 16 | 17 | def test_resource_analysis_analyze_task(gcp_res_analysis_metadata): 18 | analysis = LinearResourceAnalysis() 19 | analysis.collect_resource_data([gcp_res_analysis_metadata]) 20 | 21 | result_align1 = analysis.analyze_task( 22 | 'atac.align', 23 | in_file_vars=['fastqs_R1'], 24 | reduce_in_file_vars=None, 25 | target_resources=['stats.max.mem', 'stats.mean.cpu_pct'], 26 | ) 27 | assert result_align1['x'] == {'fastqs_R1': [15643136, 18963919]} 28 | assert 'stats.mean.cpu_pct' in result_align1['y'] 29 | assert 'stats.max.mem' in result_align1['y'] 30 | assert 'stats.max.disk' not in result_align1['y'] 31 | assert list(result_align1['y'].keys()) == list(result_align1['coeffs'].keys()) 32 | assert result_align1['coeffs']['stats.mean.cpu_pct'][0][0] == pytest.approx( 33 | 1.6844513715565233e-06 34 | ) 35 | assert result_align1['coeffs']['stats.mean.cpu_pct'][1] == pytest.approx( 36 | 42.28561239506905 37 | ) 38 | assert result_align1['coeffs']['stats.max.mem'][0][0] == pytest.approx( 39 | 48.91222341236991 40 | ) 41 | assert result_align1['coeffs']['stats.max.mem'][1] == pytest.approx( 42 | 124314029.09791338 43 | ) 44 | 45 | result_align2 = analysis.analyze_task( 46 | 'atac.align', in_file_vars=['fastqs_R2'], reduce_in_file_vars=sum 47 | ) 48 | assert result_align2['x'] == {'sum(fastqs_R2)': [16495088, 20184668]} 49 | assert 'stats.mean.cpu_pct' not in result_align2['y'] 50 | assert 'stats.max.mem' in result_align2['y'] 51 | assert 'stats.max.disk' in result_align2['y'] 52 | assert list(result_align2['y'].keys()) == list(result_align2['coeffs'].keys()) 53 | 54 | result_align_star = analysis.analyze_task('atac.align*', reduce_in_file_vars=max) 55 | assert result_align_star['x'] == { 56 | 'max(chrsz,fastqs_R1,fastqs_R2,idx_tar,tmp_fastqs)': [ 57 | 32138224, 58 | 39148587, 59 | 3749246230, 60 | 3749246230, 61 | ] 62 | } 63 | 64 | 65 | def test_resource_analysis_analyze(gcp_res_analysis_metadata): 66 | """Test 
method analyze() which analyzes all tasks defined in in_file_vars.""" 67 | analysis = LinearResourceAnalysis() 68 | analysis.collect_resource_data([gcp_res_analysis_metadata]) 69 | 70 | result = analysis.analyze( 71 | in_file_vars={ 72 | 'atac.align*': ['fastqs_R1', 'fastqs_R2'], 73 | 'atac.filter*': ['bam'], 74 | } 75 | ) 76 | assert len(result) == 2 77 | assert result['atac.align*']['x'] == { 78 | 'sum(fastqs_R1,fastqs_R2)': [32138224, 39148587, 32138224, 39148587] 79 | } 80 | assert result['atac.filter*']['x'] == { 81 | 'sum(bam)': [61315022, 76789196, 61315022, 76789196] 82 | } 83 | 84 | result_all = analysis.analyze() 85 | # 38 tasks in total 86 | assert len(result_all) == 38 87 | -------------------------------------------------------------------------------- /tests/test_server_heartbeat.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import time 3 | 4 | import pytest 5 | 6 | from caper.server_heartbeat import ServerHeartbeat, ServerHeartbeatTimeoutError 7 | 8 | 9 | def test_server_heartbeat(tmp_path): 10 | """All methods will be tested here. 11 | This will test 3 things: 12 | - can read from file 13 | - can get hostname of this machine 14 | - can ignore an old file (older than heartbeat_timeout of 5 sec) 15 | """ 16 | hb_file = tmp_path / 'hb_file' 17 | 18 | hb = ServerHeartbeat(heartbeat_file=str(hb_file), heartbeat_timeout=5000) 19 | 20 | # before starting the write thread, 21 | # read() should return None 22 | assert hb.read() is None 23 | 24 | try: 25 | hb.start(port=9999) 26 | 27 | time.sleep(1) 28 | assert hb.read() == (socket.gethostname(), 9999) 29 | finally: 30 | hb.stop() 31 | 32 | # after timeout 33 | time.sleep(5) 34 | assert hb.read() is None 35 | 36 | with pytest.raises(ServerHeartbeatTimeoutError): 37 | hb.read(raise_timeout=True) 38 | -------------------------------------------------------------------------------- /tests/test_singularity.py: -------------------------------------------------------------------------------- 1 | import json 2 | from textwrap import dedent 3 | 4 | from caper.singularity import find_bindpath 5 | 6 | UBUNTU_18_04_3 = ( 7 | 'ubuntu@sha256:d1d454df0f579c6be4d8161d227462d69e163a8ff9d20a847533989cf0c94d90' 8 | ) 9 | UBUNTU_18_04_3_LAST_HASH_TAR_GZ = ( 10 | 'sha256:6001e1789921cf851f6fb2e5fe05be70f482fe9c2286f66892fe5a3bc404569c.tar.gz' 11 | ) 12 | 13 | 14 | def test_find_bindpath(tmp_path): 15 | """Parse the input JSON file to recursively get all the files defined in it. 16 | For the local abspaths found, find their common root directories. 17 | 18 | This is necessary for singularity to bind paths (similar to mounting directories 19 | in docker). 20 | 21 | The input JSON file has one TSV file and this file will be recursively visited by 22 | find_bindpath().
23 | """ 24 | tsv = tmp_path / 'test.tsv' 25 | tsv_contents = dedent( 26 | """\ 27 | file1\t/1/2/3/4.txt 28 | file2\t/1/5/6/7.txt 29 | file3\t/a/t/c.txt 30 | """ 31 | ) 32 | tsv.write_text(tsv_contents) 33 | 34 | inputs = tmp_path / 'inputs.json' 35 | inputs_dict = { 36 | 'test.input_tsv': str(tsv), 37 | 'test.input': '/a/b/c/d.txt', 38 | 'test.input2': '/a/b/e.txt', 39 | 'test.input3': '/f/g/h.txt', 40 | 'test.input4': '/s/x/y/s/d/e/s/.txt', 41 | } 42 | inputs.write_text(json.dumps(inputs_dict, indent=4)) 43 | 44 | # test with two different levels 45 | bindpaths_5 = find_bindpath(str(inputs), 5).split(',') 46 | assert sorted(bindpaths_5) == sorted( 47 | [ 48 | '/1/2/3', 49 | '/1/5/6', 50 | '/a/b', 51 | '/a/t', 52 | '/f/g', 53 | '/s/x/y/s', 54 | '/'.join(str(tmp_path).split('/')[:5]), 55 | ] 56 | ) 57 | 58 | bindpaths_2 = find_bindpath(str(inputs), 2).split(',') 59 | assert sorted(bindpaths_2) == sorted( 60 | ['/1', '/a', '/f', '/s', '/'.join(str(tmp_path).split('/')[:2])] 61 | ) 62 | -------------------------------------------------------------------------------- /tests/test_wdl_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | There are three WDL 1.0 files to test subworkflow zipping. 3 | main.wdl (imports sub/sub.wdl) 4 | sub/ 5 | sub.wdl (imports sub/sub_sub.wdl) 6 | sub/ 7 | sub_sub.wdl 8 | There is another trivial WDL 1.0 file with empty workflow. 9 | """ 10 | 11 | import os 12 | import shutil 13 | 14 | from caper.wdl_parser import WDLParser 15 | 16 | from .example_wdl import ( 17 | MAIN_WDL, 18 | MAIN_WDL_META_DICT, 19 | MAIN_WDL_PARAMETER_META_DICT, 20 | make_directory_with_wdls, 21 | ) 22 | 23 | 24 | def test_properties(tmp_path): 25 | """Test the following properties. 26 | - contents 27 | - workflow_meta 28 | - workflow_parameter_meta 29 | - imports 30 | """ 31 | wdl = tmp_path / 'main.wdl' 32 | wdl.write_text(MAIN_WDL) 33 | 34 | wp = WDLParser(str(wdl)) 35 | assert wp.contents == MAIN_WDL 36 | assert wp.workflow_meta == MAIN_WDL_META_DICT 37 | assert wp.workflow_parameter_meta == MAIN_WDL_PARAMETER_META_DICT 38 | assert wp.imports == ['sub/sub.wdl'] 39 | 40 | 41 | def test_zip_subworkflows(tmp_path): 42 | """This actually tests create_imports_file since 43 | create_imports_file's merely a wrapper for zip_subworkflows. 44 | """ 45 | # make tmp directory to store WDLs 46 | make_directory_with_wdls(str(tmp_path)) 47 | 48 | # we now have all WDL files 49 | # main.wdl, sub/sub.wdl, sub/sub/sub_sub.wdl 50 | 51 | main_wdl = tmp_path / 'main.wdl' 52 | sub_sub_wdl = tmp_path / 'sub' / 'sub' / 'sub_sub.wdl' 53 | 54 | main = WDLParser(str(main_wdl)) 55 | 56 | # simple WDL without any imports 57 | simple = WDLParser(str(sub_sub_wdl)) 58 | 59 | # make working directory 60 | d = tmp_path / 'imports' 61 | d.mkdir(parents=True) 62 | 63 | simple_zip_file = simple.create_imports_file(str(d), 'test_trivial_imports.zip') 64 | assert simple_zip_file is None 65 | 66 | main_zip_file = main.create_imports_file(str(d), 'test_imports.zip') 67 | assert os.path.basename(main_zip_file) == 'test_imports.zip' 68 | 69 | shutil.unpack_archive(main_zip_file, extract_dir=str(d)) 70 | assert os.path.exists(str(d / 'sub' / 'sub.wdl')) 71 | assert os.path.exists(str(d / 'sub' / 'sub' / 'sub_sub.wdl')) 72 | --------------------------------------------------------------------------------