├── .circleci └── config.yml ├── .flake8 ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── docs │ ├── CNAME │ ├── feature-reference │ │ ├── adding-and-updating-custom-databases.md │ │ ├── async-runs.md │ │ ├── authentication.md │ │ ├── output-objects.md │ │ ├── output-streaming.md │ │ └── using-aws-with-toolchest.md │ ├── getting-started │ │ ├── installation.md │ │ ├── python-functions-and-containers.md │ │ ├── running-bioinformatics-on-toolchest.md │ │ └── using-files.md │ ├── images │ │ └── toolchest_t.png │ ├── index.md │ ├── tool-reference │ │ ├── about.md │ │ ├── aligners.md │ │ ├── aligners │ │ │ ├── bowtie-2.md │ │ │ ├── clustal-omega.md │ │ │ ├── diamond.md │ │ │ ├── diamond │ │ │ │ ├── diamond-blastp.md │ │ │ │ └── diamond-blastx.md │ │ │ ├── kallisto.md │ │ │ ├── rapsearch2.md │ │ │ ├── salmon.md │ │ │ └── star.md │ │ ├── all-other-tools.md │ │ ├── assemblers.md │ │ ├── assemblers │ │ │ ├── megahit.md │ │ │ └── unicycler.md │ │ ├── demultiplexers.md │ │ ├── demultiplexers │ │ │ └── demucs.md │ │ ├── post-processing.md │ │ ├── post-processing │ │ │ └── bracken.md │ │ ├── pre-processing.md │ │ ├── pre-processing │ │ │ └── fastqc.md │ │ ├── python3.md │ │ ├── structure-prediction.md │ │ ├── structure-prediction │ │ │ └── alphafold.md │ │ ├── taxonomic-classifiers.md │ │ ├── taxonomic-classifiers │ │ │ ├── centrifuge.md │ │ │ ├── kraken-2.md │ │ │ └── metaphlan.md │ │ ├── test-runs.md │ │ ├── transfer.md │ │ ├── workflows-meta-tools.md │ │ └── workflows-meta-tools │ │ │ └── humann3.md │ └── toolchest-hosted-cloud │ │ ├── instance-types.md │ │ ├── pricing.md │ │ └── running-toolchest-in-your-aws-account.md └── mkdocs.yaml ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── tests ├── __init__.py ├── conftest.py ├── test_async.py ├── test_blastn.py ├── test_bowtie2.py ├── test_cellranger.py ├── test_centrifuge.py ├── test_chaining.py ├── test_clustalo.py ├── test_database_update.py ├── test_diamond.py ├── test_download.py ├── test_fastqc.py ├── test_filepath.py ├── test_humann3.py ├── test_kallisto.py ├── test_kraken2.py ├── test_last.py ├── test_megahit.py ├── test_metaphlan.py ├── test_output.py ├── test_public_uri.py ├── test_python3.py ├── test_rapsearch2.py ├── test_salmon.py ├── test_sanity.py ├── test_shi7.py ├── test_shogun.py ├── test_star.py ├── test_transfer.py ├── test_unicycler.py └── util │ ├── __init__.py │ ├── filter_output.py │ ├── hash.py │ ├── numpy_test.Dockerfile │ ├── s3.py │ └── streaming_script.py └── toolchest_client ├── __init__.py ├── api ├── __init__.py ├── auth.py ├── download.py ├── exceptions.py ├── instance_type.py ├── output.py ├── query.py ├── status.py ├── streaming.py └── urls.py ├── cli ├── __init__.py ├── cli.py ├── kraken2.py └── test.py ├── files ├── __init__.py ├── general.py ├── merge.py ├── public_uris.py ├── s3.py ├── split.py ├── tests │ ├── __init__.py │ ├── data │ │ ├── eight_line.fastq │ │ ├── eight_line_split_one.fastq │ │ ├── eight_line_split_two.fastq │ │ ├── paired_end │ │ │ ├── eight_line_R1.fastq │ │ │ └── eight_line_R2.fastq │ │ └── very_small_file.txt │ ├── test_general.py │ ├── test_merge.py │ ├── test_s3.py │ └── test_split.py └── unpack.py ├── logging.py └── tools ├── __init__.py ├── alphafold.py ├── api.py ├── blastn.py ├── bowtie2.py ├── bracken.py ├── cellranger.py ├── centrifuge.py ├── clustalo.py ├── demucs.py ├── diamond.py ├── fastqc.py ├── humann.py ├── jupyter.py ├── kallisto.py ├── kraken2.py ├── last.py ├── lug.py ├── megahit.py ├── metaphlan.py ├── python3.py ├── 
rapsearch2.py ├── salmon.py ├── shi7.py ├── shogun.py ├── star.py ├── test.py ├── tests ├── __init__.py ├── test_generic.py ├── test_kraken2.py ├── test_sanity.py └── test_star.py ├── tool.py ├── tool_args.py ├── transfer.py └── unicycler.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | orbs: 3 | python: circleci/python@1.4.0 4 | jobs: 5 | deploy-to-pypi: 6 | executor: python/default 7 | working_directory: ~/repo 8 | steps: 9 | - checkout 10 | - python/install-packages: 11 | pkg-manager: poetry 12 | - run: 13 | name: Build and publish to PyPI 14 | command: | 15 | cd ~/repo 16 | PYPI_ENVIRONMENT=$([[ $CIRCLE_BRANCH = main ]] && echo prod-pypi || echo test-pypi) 17 | PYPI_ACCESS_TOKEN=$([[ $PYPI_ENVIRONMENT = prod-pypi ]] && echo $PYPI_PROD_TOKEN || echo $PYPI_TEST_TOKEN) 18 | poetry config pypi-token.$PYPI_ENVIRONMENT $PYPI_ACCESS_TOKEN 19 | poetry publish --build -r $PYPI_ENVIRONMENT 20 | unit-tests: 21 | executor: python/default 22 | working_directory: ~/repo 23 | steps: 24 | - checkout 25 | - python/install-packages: 26 | pkg-manager: poetry 27 | - setup_remote_docker: 28 | version: 20.10.14 29 | - run: 30 | name: Run unit tests 31 | command: | 32 | cd ~/repo 33 | poetry install 34 | poetry run pytest -v -m "not (integration or integration_full)" 35 | integration-tests: 36 | executor: python/default 37 | working_directory: ~/repo 38 | parallelism: 8 39 | steps: 40 | - checkout 41 | - run: 42 | name: Skip tests if last commit is a chore commit 43 | command: | 44 | cd ~/repo 45 | last_commit="$(git log -1 --pretty=%s | grep chore: || true)" 46 | if [ ${#last_commit} -gt 0 ]; then circleci-agent step halt; fi 47 | - run: 48 | name: Create AWS credentials manually 49 | command: | 50 | mkdir ~/.aws 51 | touch ~/.aws/config 52 | chmod 600 ~/.aws/config 53 | echo "[profile circleci]" > ~/.aws/config 54 | echo "aws_access_key_id=$AWS_ACCESS_KEY_ID" >> ~/.aws/config 55 | echo "aws_secret_access_key=$AWS_SECRET_ACCESS_KEY" >> ~/.aws/config 56 | - python/install-packages: 57 | pkg-manager: poetry 58 | - setup_remote_docker: 59 | version: 20.10.14 60 | - run: 61 | name: Run integration tests 62 | parallel: true 63 | command: | 64 | cd ~/repo 65 | export DEPLOY_ENVIRONMENT=$([[ $CIRCLE_BRANCH = main ]] && echo production || echo staging) 66 | shopt -s globstar 67 | TESTFILES=$(circleci tests glob tests/**/test*.py | circleci tests split --split-by=timings) 68 | shopt -u globstar 69 | poetry install 70 | mkdir -p test-results 71 | poetry run pytest -v -m integration --durations=0 --junitxml=test-results/junit.xml $TESTFILES 72 | no_output_timeout: 1h 73 | - store_test_results: 74 | path: test-results 75 | - store_artifacts: 76 | path: test-results 77 | lint: 78 | executor: python/default 79 | working_directory: ~/repo 80 | steps: 81 | - checkout 82 | - python/install-packages: 83 | pkg-manager: poetry 84 | - run: 85 | name: Run flake8 linter 86 | parallel: true 87 | command: | 88 | cd ~/repo 89 | pip install flake8 90 | flake8 ./ --output-file test-reports 91 | no_output_timeout: 5m 92 | - store_artifacts: 93 | path: test-reports 94 | destination: test-reports 95 | workflows: 96 | test: 97 | jobs: 98 | - unit-tests 99 | - lint 100 | - integration-tests: 101 | filters: 102 | branches: 103 | only: 104 | - main 105 | - staging 106 | deploy: 107 | jobs: 108 | - deploy-to-pypi: 109 | filters: 110 | branches: 111 | only: 112 | - main 113 | - staging 114 | 
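For contributors, the unit-test and lint jobs above map directly to local commands — a sketch assuming Poetry is installed (the integration jobs additionally require Toolchest and AWS credentials):

```shell
# Mirror the CI unit-test job locally (same pytest markers as in config.yml)
poetry install
poetry run pytest -v -m "not (integration or integration_full)"

# Mirror the lint job
pip install flake8
flake8 ./
```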
-------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | exclude= 4 | demo.py 5 | docs/conf.py 6 | ignore = 7 | E731 8 | per-file-ignores = 9 | # ignores unused imports and imports not at the top of file in init files 10 | */__init__.py:F401,E402 11 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - feat/update-docs 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | defaults: 12 | run: 13 | working-directory: ./docs 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.x 19 | - run: pip install mkdocs-material 20 | - run: mkdocs gh-deploy --force 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Custom files 132 | demo.py 133 | demo-import.py 134 | src/demo.py 135 | toolchest_client/demo.py 136 | 137 | # JetBrains 138 | .idea/ 139 | 140 | # Emacs 141 | *~ 142 | 143 | # Default temporary directory for splitting input files 144 | temp_toolchest* 145 | 146 | # Integration test directories 147 | temp_test_* 148 | 149 | # macOS 150 | .DS_Store 151 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Toolchest Python Client 2 | 3 | **Toolchest** runs computational biology software in the cloud with just a few lines of code. 4 | You can call Toolchest from anywhere Python or R runs, using input files located on your computer or S3. 5 | 6 | This package contains the **Python** client for using Toolchest. 7 | For the **R** client, [see here](https://github.com/trytoolchest/toolchest-client-r). 8 | 9 | ## [Documentation & User Guide](https://docs.trytoolchest.com/) 10 | 11 | ## Installation 12 | 13 | The Toolchest client is available [on PyPI](https://pypi.org/project/toolchest-client): 14 | ``` shell 15 | pip install toolchest-client 16 | ``` 17 | 18 | ## Usage 19 | 20 | Using a tool in Toolchest is as simple as: 21 | 22 | ``` python 23 | import toolchest_client as toolchest 24 | toolchest.set_key("YOUR_TOOLCHEST_KEY") 25 | toolchest.kraken2( 26 | tool_args="", 27 | inputs="path/to/input.fastq", 28 | output_path="path/to/output.fastq", 29 | ) 30 | ``` 31 | 32 | For a list of available tools, see the [documentation](https://docs.trytoolchest.com/tool-reference/about/). 33 | 34 | ## Configuration 35 | 36 | To use Toolchest, you must have an authentication key stored 37 | in the `TOOLCHEST_KEY` environment variable. 38 | 39 | ``` python 40 | import toolchest_client as toolchest 41 | toolchest.set_key("YOUR_TOOLCHEST_KEY") # or a file path containing the key 42 | ``` 43 | 44 | Contact Toolchest if: 45 | 46 | - you need a key 47 | - you’ve forgotten your key 48 | - the key is producing authentication errors. 49 |
-------------------------------------------------------------------------------- /docs/docs/CNAME: -------------------------------------------------------------------------------- 1 | docs.trytoolchest.com-------------------------------------------------------------------------------- /docs/docs/feature-reference/async-runs.md: -------------------------------------------------------------------------------- 1 | # Asynchronous Runs 2 | 3 | Toolchest supports async execution for every tool. Async runs are useful for long-running commands, because you do not need to keep an open terminal or connection while Toolchest is executing. 4 | 5 | We've seen people use async runs from AWS Lambda functions, custom automated pipelines, and manual calls from IDEs. 
6 | 7 | ## Launching an Async Run 8 | 9 | To launch an async run, add the **`is_async`** parameter with the value **True** in your function call. For example, 10 | using the `test` function: 11 | 12 | ```python 13 | my_run = tc.test( 14 | inputs="./", 15 | output_path="./output", 16 | is_async=True, 17 | ) 18 | ``` 19 | 20 | After the Toolchest run is initialized and all file transfers are complete, the Toolchest call returns an 21 | [output object](output-objects.md) containing a run ID. 22 | 23 | You can check your run status using the returned run ID (e.g. `my_run.run_id`). 24 | 25 | 26 | Once the run ID is returned, Toolchest is executing your run in the background, and you're safe to close your terminal. (Be sure 27 | to record the run ID!) 28 | 29 | ## Checking Run Status 30 | 31 | To check the status of your async run, call the **`get_status`** function with your run ID. 32 | 33 | ```python 34 | print(tc.get_status(run_id="YOUR_RUN_ID")) 35 | 'executing' 36 | ``` 37 | 38 | **`get_status`** returns a string. Once the status is `ready_to_transfer_to_client`, the run has finished execution and 39 | is ready to download. 40 | 41 | ### Statuses enum 42 | 43 | There's an enum – `Status` – that contains all statuses returned from `get_status()`. You can check statuses against 44 | this enum for custom error handling, progress tracking, or whatever you're building. 45 | 46 | ```python 47 | status = tc.get_status(run_id="YOUR_RUN_ID") 48 | if status == tc.Status.COMPLETE: 49 | print("AlphaFold run finished! Sending email to researcher...") 50 | ``` 51 | 52 | To check all possible enum values, you can print the enum as a list: 53 | 54 | ```python 55 | print(list(tc.Status)) 56 | [<Status.COMPLETE: ...>, ... 57 | ``` 58 | 59 | ## Downloading Output 60 | 61 | To download the output manually, call the **`download`** function with your run ID and output directory. 62 | 63 | ```python 64 | tc.download( 65 | run_id="YOUR_RUN_ID", 66 | output_path="./output/", 67 | ) 68 | ``` 69 | 70 | 71 | This downloads the run's output file(s) into the output directory. You can run `download` for 7 days after starting the 72 | run. 73 |
-------------------------------------------------------------------------------- /docs/docs/feature-reference/authentication.md: -------------------------------------------------------------------------------- 1 | # Authentication 2 | 3 | To run Toolchest jobs, you'll need a Toolchest key. If you don't have one yet, you can get a key 4 | [here](https://airtable.com/shrKzQNuDHrGkEAI2). 5 | 6 | ## Setting a Key 7 | 8 | Use the **`set_key`** function to authenticate your Toolchest calls: 9 | 10 | ```python 11 | import toolchest_client as tc 12 | tc.set_key("YOUR_TOOLCHEST_KEY") 13 | ``` 14 | 15 | `YOUR_TOOLCHEST_KEY` should be a string containing either the key value or a path to a file containing the key. 16 | 17 | You can also set your key through the `TOOLCHEST_KEY` environment variable. 18 | 19 | ## Getting a Stored Key 20 | 21 | To check the value of the key in use, use the **`get_key`** function, which returns a string containing your key value. 22 | 23 | ```python 24 | import toolchest_client as tc 25 | tc.get_key() 26 | ``` 27 | 28 | ## Private Tools and Databases 29 | 30 | If you'd like to use a private tool or database with Toolchest without exposing it to the public, Toolchest supports 31 | restricting some databases and tools to your account. 
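For example, since `set_key` accepts either the key itself or a file path, you can keep the key out of your source code — a minimal sketch (the key-file path below is hypothetical):

```python
import toolchest_client as tc

# set_key accepts the key value itself or a path to a file containing the key
tc.set_key("~/.toolchest_key")  # hypothetical path to a file holding your key
```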
-------------------------------------------------------------------------------- /docs/docs/feature-reference/output-objects.md: -------------------------------------------------------------------------------- 1 | # Output Objects 2 | 3 | Every Toolchest run returns an object containing the run ID (`run_id`), local paths to downloaded output files 4 | (`output_path`), and more. 5 | 6 | As an example, we'll use the output from this `test` function call, but you can do this for any Toolchest tool: 7 | 8 | ```python 9 | import toolchest_client as tc 10 | 11 | toolchest_output = tc.test( 12 | inputs="./", 13 | output_path="./output/", 14 | tool_args="", 15 | ) 16 | ``` 17 | 18 | ## Run Metadata 19 | 20 | The **`run_id`** instance variable contains the ID of the Toolchest run, stored as a string. 21 | 22 | Likewise, the **`output_path`** instance variable contains local paths to downloaded output files. 23 | 24 | ```python 25 | >>> toolchest_output.run_id 26 | '00000000-0000-0000-0000-000000000000' # this will be your custom run ID 27 | >>> toolchest_output.output_path 28 | 'OUTPUT_DIR/test_output.txt' 29 | ``` 30 | 31 | You can store and use the `run_id` to check the run's status with [Async Runs](async-runs.md). 32 | 33 | `output_path` will be a string (for 1 output file), a list of strings (for multiple output files), or a null value (if 34 | download was skipped). 35 | 36 | ## Download 37 | 38 | You can also directly call the **`download`** function from the output object to download (or re-download) the outputs. 39 | 40 | ```python 41 | toolchest_output.download( 42 | output_path="./", 43 | ) 44 | ``` 45 | 46 | However, keep in mind that Toolchest only retains your job's output for 7 days after job execution.
-------------------------------------------------------------------------------- /docs/docs/feature-reference/output-streaming.md: -------------------------------------------------------------------------------- 1 | # Live-Streaming Tool Output 2 | 3 | For synchronous Python and Lug runs, Toolchest supports streaming remote output live to wherever you're running 4 | Toolchest. 5 | 6 | 7 | For example, here's a `python3` Toolchest call with streaming enabled and an example script: 8 | ```python 9 | import toolchest_client as tc 10 | tc.set_key("YOUR_KEY") 11 | tc.python3( 12 | script="script.py", 13 | streaming_enabled=True, 14 | ) 15 | ``` 16 | 17 | ```python 18 | # script.py 19 | import time 20 | for letter in ["A", "B", "C"]: 21 | print(f"Hello world {letter}") 22 | time.sleep(1) 23 | ``` 24 | 25 | You'll see the following lines printed as they are generated by the remotely-running Python script, one line per second: 26 | ```text 27 | Hello world A 28 | Hello world B 29 | Hello world C 30 | ``` 31 | 32 | 33 | !!! warning "Streaming and cancelling runs" 34 | 35 | With streaming enabled, tool execution terminates if the streaming connection is broken. This includes cancelling 36 | your job by entering Ctrl-C. 37 | 38 | If a job is cancelled before encountering a bug in your script, the error may not be visible in Toolchest logs. 39 | 40 | ## Supported Tools 41 | Output streaming is supported for `python3` and `lug`. For both, streaming is enabled by default.
-------------------------------------------------------------------------------- /docs/docs/feature-reference/using-aws-with-toolchest.md: -------------------------------------------------------------------------------- 1 | # Using AWS with Toolchest 2 | 3 | Toolchest supports reading and writing from your S3 buckets. 
You can also run Toolchest within your own AWS account, so the files you pass to `inputs` and `output_path` aren't transferred outside your account. 4 | 5 | ## Input Files 6 | 7 | Files stored on S3 can be passed in as inputs, using the file's S3 URI. For example: 8 | 9 | ```python 10 | tc.kraken2( 11 | inputs="s3://toolchest-demo-data/SRR16201572_R1.fastq", 12 | output_path="./", 13 | ) 14 | ``` 15 | 16 | ## Output to S3 17 | 18 | Some tools support uploading outputs directly to your custom S3 bucket. For these runs, put the S3 bucket + prefix in 19 | **`output_path`**. For example: 20 | 21 | ```python 22 | tc.kraken2( 23 | inputs="./example.fastq", 24 | output_path="s3://your-output/your-intended-subfolder", 25 | ) 26 | ``` 27 | 28 | ## Custom Databases 29 | 30 | For some tools, you can use a custom database stored on S3 using **`custom_database_path`**: 31 | 32 | ```python 33 | tc.kraken2( 34 | inputs="./example.fastq", 35 | output_path="./example_output_dir", 36 | custom_database_path="s3://your-databases/your-kraken2-database", 37 | ) 38 | ``` 39 | 40 | Toolchest needs permission to list and copy all of the files in the S3 prefix you use. 41 | 42 | ## Granting Permissions to Toolchest to Access Your S3 Bucket 43 | 44 | To grant Toolchest access to your S3 bucket, use this policy: 45 | 46 | ```json 47 | { 48 | "Version": "2012-10-17", 49 | "Statement": [ 50 | { 51 | "Sid": "Toolchest", 52 | "Effect": "Allow", 53 | "Principal": { 54 | "AWS": "arn:aws:iam::172533437917:role/toolchest-worker-node-role" 55 | }, 56 | "Action": [ 57 | "s3:GetObject", 58 | "s3:ListBucket" 59 | ], 60 | "Resource": [ 61 | "arn:aws:s3:::YOUR_BUCKET_NAME", 62 | "arn:aws:s3:::YOUR_BUCKET_NAME/*" 63 | ] 64 | } 65 | ] 66 | } 67 | ``` 68 | 69 | (Make sure to replace `YOUR_BUCKET_NAME` with your bucket name.) 70 | 71 | You can restrict this to specific files or prefixes with whatever IAM policy you'd like; just make sure that Toolchest 72 | has `s3:GetObject` for any file you'll use with Toolchest and `s3:ListBucket` permissions for any prefix. 73 | 74 | After you add this policy, let us know and we'll complete the setup process! 75 | 76 | ## Running Toolchest in Your Own AWS Account 77 | 78 | You can run Toolchest in your own AWS account, and the data that you pass to `inputs` and `output_path` doesn't leave 79 | your own AWS environment. [Get in touch with us](mailto:hello@trytoolchest.com) if you'd like to know more!
-------------------------------------------------------------------------------- /docs/docs/getting-started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Note: if you haven't already, make sure you [have an API key](https://trytoolchest.com)! 4 | 5 | ## With `pip` 6 | 7 | ```shell 8 | pip install toolchest-client 9 | ``` 10 | 11 | ## With Poetry 12 | ```shell 13 | poetry add toolchest-client 14 | ``` 15 | 16 | ## Supported Python versions 17 | 18 | We support Python 3.7 through the latest Python 3.11 release candidate. 19 | 20 | ## Supported operating systems 21 | 22 | You can run Toolchest on most recent versions of macOS, Linux, and Windows. 
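To sanity-check an install, importing the package and confirming your key round-trips is usually enough — a minimal sketch using the `set_key` and `get_key` functions from the Authentication reference:

```python
import toolchest_client as tc

tc.set_key("YOUR_TOOLCHEST_KEY")  # or set the TOOLCHEST_KEY environment variable
print(tc.get_key())               # should print the key value now in use
```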
23 | 24 | 25 |
-------------------------------------------------------------------------------- /docs/docs/getting-started/python-functions-and-containers.md: -------------------------------------------------------------------------------- 1 | # Deploying Python Functions and Docker Images 2 | 3 | If you have custom Python functions that need more power, you can deploy those to the cloud using Toolchest as well! 4 | 5 | This tends to work well with: 6 | 7 | - custom command-line software that's packaged in a Docker image 8 | - packages that aren't on Toolchest yet 9 | - or cases where you just have a Python function that needs more power 10 | 11 | To do this, we recommend using [Lug](https://lug.dev), a fully open-source project that builds on top of Toolchest. 12 |
-------------------------------------------------------------------------------- /docs/docs/getting-started/running-bioinformatics-on-toolchest.md: -------------------------------------------------------------------------------- 1 | # Toolchest-wrapped Command-line Software 2 | 3 | Note: if you haven't already, make sure you [have an API key](https://trytoolchest.com) and Toolchest is installed! 4 | 5 | The most popular bioinformatics software is run through the command line. Toolchest wraps this software in Python and 6 | runs it on the cloud. 7 | 8 | ## A quick start 9 | 10 | To get started, we'll use STAR, but you can use any of the [packages supported by Toolchest](../tool-reference/about.md). 11 | On the command-line, running STAR looks like: 12 | 13 | ```shell 14 | STAR --outFileNamePrefix ./output_path --genomeDir ./database_GRCh38 --readFilesIn ./inputs/ 15 | ``` 16 | 17 | With Toolchest, it's: 18 | 19 | ```python 20 | import toolchest_client as tc 21 | 22 | tc.set_key("YOUR_KEY") 23 | 24 | tc.STAR( 25 | read_one="s3://toolchest-demo-data/SRR2557119_small.fastq", 26 | output_path="./output_path/", 27 | database_name="GRCh38", 28 | ) 29 | ``` 30 | 31 | and it runs in the cloud! Breaking down the arguments: 32 | 33 | - `read_one` is for input files. They can be on your computer, or somewhere else like S3. 34 | - `output_path` is where your output files are written. This can also be your computer, or somewhere else like S3. 35 | - `database_name` is the name of the Toolchest-hosted database. 36 | 37 | ## Adding more options 38 | 39 | ```python 40 | import toolchest_client as tc 41 | 42 | tc.set_key("YOUR_KEY") 43 | 44 | tc.STAR( 45 | read_one="s3://toolchest-demo-data/SRR2557119_small.fastq", 46 | output_path="./output/", 47 | database_name="GRCh38", 48 | database_version="1", 49 | tool_args="--outSAMtype BAM Unsorted" 50 | ) 51 | ``` 52 | 53 | We added two new arguments: 54 | - `database_version` is the version number of the Toolchest-hosted database. 55 | - `tool_args` are the arguments that you would normally set on the command-line to customize execution. 56 | 57 | Next, let's learn more about what kinds of files you can use with Toolchest. 58 |
-------------------------------------------------------------------------------- /docs/docs/getting-started/using-files.md: -------------------------------------------------------------------------------- 1 | # Using Files 2 | 3 | Toolchest works with files on your computer (local files) or files on something like S3 (remote files). We recommend 4 | using local or S3 files for data integrity and speed of execution, but HTTP or FTP URLs are supported too. 5 | 6 | For all tools and file types, `inputs` takes a string path or a list of paths. `output_path` always takes a directory 7 | path. 
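To make that concrete, here's a hedged sketch with the `test` tool showing a single-path `inputs`, a list of paths, and a directory `output_path` (file names are hypothetical):

```python
import toolchest_client as tc

# One input file; outputs land in the ./output/ directory
tc.test(inputs="./sample.fastq", output_path="./output/")

# Several input files, passed as a list of paths
tc.test(
    inputs=["./sample_R1.fastq", "./sample_R2.fastq"],
    output_path="./output/",
)
```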
8 | 9 | Let's take a look at what it looks like to use different types of local and remote paths! 10 | 11 | !!! note "You can mix and match file sources" 12 | You can mix and match local and remote files in the same call. Every file is handled independently, so you can use S3, 13 | FTP, and local files together. 14 | 15 | ## Local files and directories 16 | 17 | Local files are the most intuitive: you just pass normal paths directly to Toolchest. In the background, the files are 18 | transferred to and from the cloud. 19 | 20 | `inputs` takes paths to files and directories. 21 | 22 | `output_path` takes a path to a directory. Output files are written in this directory. 23 | 24 | ### Local directory inputs 25 | If a directory is passed, all files within the directory are used as input. Directory structure will be destroyed unless 26 | `compress_inputs=True` is provided as an argument. 27 | 28 | For example, if you have the following directory structure: 29 | ```text 30 | /path/to/base/directory/ 31 | subdirectory_one/ 32 | input.fastq 33 | subdirectory_two/ 34 | input.fastq 35 | info.txt 36 | ``` 37 | and you used the following Toolchest call: 38 | ```python 39 | tc.test( 40 | inputs="/path/to/base/directory/", 41 | compress_inputs=True 42 | ) 43 | ``` 44 | Then the input files will retain the directory structure without name conflicts. If `compress_inputs` is set to `False` 45 | or not provided, the two `input.fastq` files would collide, and whichever was transferred second would overwrite the other. 46 | 47 | ## Remote files 48 | 49 | ### AWS S3 50 | 51 | S3 files are the fastest and most reliable input source. Toolchest pulls directly from the path you pass. 52 | 53 | - `inputs` takes an S3 URI for a file. If you have multiple files in an S3 directory, make sure to list the directory first 54 | and pass each file as an input. 55 | - `output_path` accepts an S3 URI for an S3 prefix. 56 | 57 | Here's an example using the `test` package with an S3 input: 58 | ```python 59 | tc.test( 60 | inputs="s3://toolchest-public-examples/example.fastq", 61 | output_path="s3://toolchest-public-output/remote-output/" 62 | ) 63 | ``` 64 | 65 | !!! note "Make sure Toolchest has access to your S3 bucket" 66 | 67 | To grant Toolchest access, see [AWS Integration](../feature-reference/using-aws-with-toolchest.md). 68 | 69 | ### HTTP/HTTPS 70 | 71 | !!! warning "HTTP and HTTPS files are dangerous!" 72 | We can't guarantee data integrity on transfer, because different servers behave differently. Make sure that the HTTP 73 | server supports `GET` requests with the `range` header. Always use a local or S3 file path if possible. Ye be warned! 74 | 75 | - `inputs` takes an HTTP URL for a file. If you have multiple files in an HTTP directory, make sure to list the directory 76 | first, and pass each file as an input. 77 | - `output_path` does not accept HTTP outputs at this time. 78 | 79 | Here's an example using the `test` package with an HTTP input: 80 | ```python 81 | tc.test( 82 | inputs="https://rest.uniprot.org/uniprotkb/P48754.fasta", 83 | output_path="./" 84 | ) 85 | ``` 86 | 87 | ### FTP 88 | 89 | !!! warning "FTP files are dangerous!" 90 | We can't guarantee data integrity on transfer, because different servers behave differently. Always use a local or S3 91 | file path if possible. Ye be warned! 92 | 93 | - `inputs` accepts an FTP URL for a file. If you have multiple files in an FTP directory, make sure to list the 94 | directory first, and pass each file as an input. 95 | - `output_path` does not accept FTP outputs at this time. 
96 | 97 | Here's an example using the `test` package with an FTP input: 98 | ```python 99 | tc.test( 100 | inputs="ftp://ftp.sra.ebi.ac.uk/vol1/fastq//SRR999/000/SRR9990000/SRR9990000.fastq.gz", 101 | output_path="./" 102 | ) 103 | ``` -------------------------------------------------------------------------------- /docs/docs/images/toolchest_t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/docs/docs/images/toolchest_t.png -------------------------------------------------------------------------------- /docs/docs/index.md: -------------------------------------------------------------------------------- 1 | # Toolchest 2 | 3 | If you're ready to start building, head straight to [Installation](getting-started/installation.md), 4 | [Running Bioinformatics Packages with Toolchest](./getting-started/running-bioinformatics-on-toolchest.md), or 5 | [Custom Python Functions and Containers](./getting-started/python-functions-and-containers.md). 6 | 7 | ## What does Toolchest do? 8 | 9 | Toolchest is an open source library for running computational biology software in the cloud. For software that has 10 | reference databases, Toolchest comes with pre-built reference DBs on our high-speed cloud database store – or you can 11 | add your own. 12 | 13 | Toolchest handles input and output file transfer as well as cloud resource provisioning. That means you can use the 14 | Toolchest library from anywhere you write Python, including Jupyter notebooks or a Python function – on your computer or 15 | in the cloud. 16 | 17 | ## Who should use Toolchest? 18 | 19 | If you: 20 | 21 | - use bioinformatics software that runs on the command line, but you write code in Python 22 | - have functions that need more resources than your laptop, but you don't want to manage your own cloud infrastructure 23 | - handle a lot of data 24 | 25 | then you should try Toolchest! 26 | 27 | ## What doesn't Toolchest solve? 28 | 29 | - Pipelining (see Prefect, Dagster, Nextflow, or Snakemake) 30 | - Data versioning or management 31 | 32 | ## Why Toolchest? 33 | 34 | - You can scale instantly with Toolchest; Toolchest is built on top of AWS 35 | - You don't need an AWS account! Toolchest jobs run in our own AWS account by default 36 | - Cloud resources are spun up and down immediately, maximizing efficiency and reducing idling resources 37 | -------------------------------------------------------------------------------- /docs/docs/tool-reference/about.md: -------------------------------------------------------------------------------- 1 | # About the Tool Reference 2 | 3 | This section contains documentation for the core "tools" that make up Toolchest: the aligners, assemblers, classifiers, 4 | and other software that Toolchest wraps. -------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners.md: -------------------------------------------------------------------------------- 1 | # Aligners 2 | 3 | Aligners find the similarity between two or more sequences. Sometimes, query sequences are compared against a reference 4 | database in a sort of fuzzy search (e.g. [Bowtie 2](aligners/bowtie-2.md)). In other contexts, several query sequences 5 | are compared against one another (e.g. [Clustal Omega](aligners/clustal-omega.md)). 
6 | 7 | Most aligners are tailored for specific types of data: [STAR](aligners/star.md) for single-cell RNA-Seq, 8 | [DIAMOND BLASTP](aligners/diamond/diamond-blastp.md) for protein sequences against a protein database, and 9 | [DIAMOND BLASTX](aligners/diamond/diamond-blastx.md) for translated nucleotide sequences against a protein database. 10 | 11 | Toolchest hosts both the aligner and the reference databases, and you can also 12 | [use your own custom database](../feature-reference/adding-and-updating-custom-databases.md). 13 | 14 | If you don't need the extra information that aligners return – e.g. for some microbiome taxonomic classification – 15 | you can also use a more efficient [classifier](taxonomic-classifiers.md). 16 | 17 | If you want to use an aligner that's not listed here, [let us know](https://airtable.com/shrNBkD0bG2wB15jQ)! It might 18 | already be available on our infrastructure but not documented.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/clustal-omega.md: -------------------------------------------------------------------------------- 1 | **Clustal Omega** is a fast and scalable tool that makes multiple sequence alignments of protein sequences. For more 2 | information, see the tool's [homepage](http://www.clustal.org/omega/). 3 | 4 | Function Call 5 | ============= 6 | 7 | ```python 8 | tc.clustalo( 9 | inputs, 10 | output_path=None, 11 | tool_args="", 12 | is_async=False, 13 | ) 14 | ``` 15 | 16 | Function Arguments 17 | ------------------ 18 | 19 | See the Notes section below for more details. 20 | 21 | | Argument | Use in place of: | Description | 22 | | :------------ | :------------------ | :------------------ | 23 | | `inputs` | `-i` | Path to one or more files to use as input. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 24 | | `output_path` | `-o` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 25 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to Clustal Omega. This should be a string of arguments like the command line. See [Supported Additional Arguments](#supported-additional-arguments) for more details. | 26 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. | 27 | 28 | Tool Versions 29 | ============= 30 | 31 | Toolchest currently supports version **1.2.4** of Clustal Omega. 32 | 33 | Supported Additional Arguments 34 | ============================== 35 | 36 | - `--auto` 37 | - `--dealign` 38 | - `--infmt` 39 | - `--is-profile` 40 | - `--iter` 41 | - `--iterations` 42 | - `--max-guidetree-iterations` 43 | - `--max-hmm-iterations` 44 | - `--maxnumseq` 45 | - `--maxseqlen` 46 | - `--outfmt` 47 | - `--output-order` 48 | - `--residuenumber` 49 | - `--resno` 50 | - `--seqtype` 51 | - `-t` 52 | - `--wrap` 53 | 54 | Additional arguments can be specified under the `tool_args` argument. 
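As a worked example, here's a hedged sketch of a Clustal Omega call (the input path is hypothetical; `--outfmt` comes from the supported-arguments list above):

```python
import toolchest_client as tc

tc.clustalo(
    inputs="./protein_sequences.fasta",  # hypothetical input FASTA
    output_path="./clustalo_output/",
    tool_args="--outfmt clustal",        # any supported argument from the list above
)
```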
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/diamond.md: -------------------------------------------------------------------------------- 1 | DIAMOND is an aligner for protein and translated DNA sequences. For more information, see the tool's 2 | [GitHub repo and wiki](https://github.com/bbuchfink/diamond). 3 | 4 | DIAMOND has two modes available with Toolchest: **BLASTP** (`diamond blastp`) and **BLASTX** (`diamond blastx`). 5 | Each mode has its own function call (`diamond_blastp` and `diamond_blastx`, respectively). See the relevant subpage for 6 | in-depth documentation: 7 | 8 | - [DIAMOND BLASTP](diamond/diamond-blastp.md) 9 | - [DIAMOND BLASTX](diamond/diamond-blastx.md)
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/diamond/diamond-blastp.md: -------------------------------------------------------------------------------- 1 | **DIAMOND BLASTP** is [DIAMOND](../diamond.md)'s mode for protein sequence searches. For more information, see the tool's [GitHub repo and wiki](https://github.com/bbuchfink/diamond). 2 | 3 | # Function Call 4 | 5 | ```python 6 | tc.diamond_blastp( 7 | inputs, 8 | output_path=None, 9 | database_name="diamond_blastp_standard", 10 | database_version="1", 11 | remote_database_path=None, 12 | remote_database_primary_name=None, 13 | tool_args="", 14 | is_async=False, 15 | ) 16 | ``` 17 | 18 | ## Function Arguments 19 | 20 | See the Notes section below for more details. 21 | 22 | | Argument | Use in place of: | Description | 23 | | :----------------------------- | :------------------ | :------------------ | 24 | | `inputs` | `-q`, `--query` | Path to one or more files to use as input. FASTA or FASTQ formats are supported, as well as gzip-compressed FASTA/FASTQ files. The files can be local or remote; see [Using Files](../../../getting-started/using-files.md). | 25 | | `output_path` | `-o`, `--out` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../../getting-started/using-files.md). | 26 | | `database_name` | `-d` | (optional) Name of database to use for DIAMOND BLASTP. Defaults to `"diamond_blastp_standard"`, the SeqScreen database. | 27 | | `database_version` | database version | (optional) Version of database to use for DIAMOND BLASTP. Defaults to `"1"`. | 28 | | `remote_database_path` | `-d` (path) | (optional) AWS S3 URI to the directory that contains your custom database. | 29 | | `remote_database_primary_name` | `-d` (name) | (optional) The primary name (e.g. UNIREF100.mini) of your custom database. | 30 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to DIAMOND BLASTP. This should be a string of arguments like the command line. | 31 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../../feature-reference/async-runs.md) for more. | 32 | 33 | DIAMOND BLASTP runs are aligned against the SeqScreen database by default. See the [Databases](#databases) section for more details. 34 | 35 | # Tool Versions 36 | 37 | Toolchest currently supports version **2.0.14** of DIAMOND. 
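For reference, a minimal DIAMOND BLASTP sketch against the default database (the input path is hypothetical; `--sensitive` is one of the supported arguments listed below):

```python
import toolchest_client as tc

tc.diamond_blastp(
    inputs="./query_proteins.fasta",  # hypothetical protein FASTA
    output_path="./blastp_output/",
    tool_args="--sensitive",
)
```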
38 | 39 | # Databases 40 | 41 | Toolchest currently supports the following databases for DIAMOND BLASTP: 42 | 43 | | `database_name` | `database_version` | Description | 44 | | :------------------------ | :----------------- | :----------------- | 45 | | `diamond_blastp_standard` | `1` | SeqScreen DIAMOND BLASTP Database. See [the SeqScreen wiki](https://gitlab.com/treangenlab/seqscreen/-/wikis/02.-SeqScreen-Dependencies) for more details. | 46 | 47 | # Supported Additional Arguments 48 | 49 | - `-f` 50 | - `--fast` 51 | - `-l` 52 | - `--mid-sensitive` 53 | - `--min-orf` 54 | - `--more-sensitive` 55 | - `--no-self-hits` 56 | - `--outfmt` 57 | - `--sallseqid` 58 | - `--salltitles` 59 | - `--sensitive` 60 | - `--strand` 61 | - `--ultra-sensitive` 62 | - `--unal` 63 | - `--very-sensitive` 64 | 65 | Additional arguments can be specified under the `tool_args` argument.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/rapsearch2.md: -------------------------------------------------------------------------------- 1 | **RAPSearch2** is an aligner for protein similarity searches. It aligns DNA reads or protein sequences against a 2 | protein database. For more information, see the tool's [homepage](https://omics.informatics.indiana.edu/mg/RAPSearch2/), 3 | [GitHub repo](https://github.com/zhaoyanswill/RAPSearch2), and [Sourceforge page](http://rapsearch2.sourceforge.net/). 4 | 5 | Function Call 6 | ============= 7 | 8 | ```python 9 | tc.rapsearch2( 10 | inputs, 11 | output_path=None, 12 | database_name="rapsearch2_seqscreen", 13 | database_version="1", 14 | tool_args="", 15 | is_async=False, 16 | ) 17 | ``` 18 | 19 | Function Arguments 20 | ------------------ 21 | 22 | See the Notes section below for more details. 23 | 24 | | Argument | Use in place of: | Description | 25 | | :----------------- | :------------------ | :------------------ | 26 | | `inputs` | `-q` | Path to one or more files to use as input. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 27 | | `output_path` | `-o` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 28 | | `database_name` | `-d`\* | (optional) Name of database to use for RAPSearch2 alignment. Defaults to `"rapsearch2_seqscreen"`. | 29 | | `database_version` | `-d`\* | (optional) Version of database to use for RAPSearch2 alignment. Defaults to `"1"`. | 30 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to RAPSearch2. This should be a string of arguments like the command line. See [Supported Additional Arguments](#supported-additional-arguments) for more details. | 31 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. | 32 | 33 | \*See the [Databases](#databases) section for more details. 34 | 35 | Tool Versions 36 | ============= 37 | 38 | Toolchest currently supports version **2.24** of RAPSearch2. 
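Likewise, a minimal RAPSearch2 sketch (the input path is hypothetical; `-e` appears in the supported arguments listed below, and is assumed here to take a log10 E-value threshold per the RAPSearch2 docs):

```python
import toolchest_client as tc

tc.rapsearch2(
    inputs="./reads.fasta",              # hypothetical query reads
    output_path="./rapsearch2_output/",
    tool_args="-e -6",                   # assumed log10(E-value) cutoff flag
)
```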
39 | 40 | Databases 41 | ========= 42 | 43 | Toolchest currently supports the following databases for RAPSearch2: 44 | 45 | | `database_name` | `database_version` | Description | 46 | | :--------------------- | :----------------- | :----------------- | 47 | | `rapsearch2_seqscreen` | `1` | SeqScreen RAPSearch2 Database. See [the SeqScreen wiki](https://gitlab.com/treangenlab/seqscreen/-/wikis/02.-SeqScreen-Dependencies) for more details. | 48 | 49 | Supported Additional Arguments 50 | ============================== 51 | 52 | - `-a` 53 | - `-b` 54 | - `-e` 55 | - `-g` 56 | - `-i` 57 | - `-l` 58 | - `-p` 59 | - `-s` 60 | - `-t` 61 | - `-v` 62 | - `-w` 63 | - `-x` 64 | 65 | Additional arguments can be specified under the `tool_args` argument.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/salmon.md: -------------------------------------------------------------------------------- 1 | **Salmon** is a computational genomics tool for transcriptomic analysis. For more information, see the tool's 2 | [GitHub repo](https://github.com/COMBINE-lab/salmon). Toolchest only supports running `salmon quant` with pre-built 3 | indexes in mapping mode at this time. 4 | 5 | # Function Call 6 | 7 | ```python 8 | tc.salmon( 9 | read_one=None, 10 | read_two=None, 11 | single_end=None, 12 | output_path=None, 13 | tool_args="", 14 | database_name="salmon_hg38", 15 | database_version="1", 16 | library_type="A", 17 | is_async=False, 18 | ) 19 | ``` 20 | 21 | ## Function Arguments 22 | 23 | | Argument | Use in place of: | Description | 24 | | :----------------- | :------------------- | :------------------- | 25 | | `read_one` | `-1` | (optional) Path or list of paths to R1 of paired-end read input files. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 26 | | `read_two` | `-2` | (optional) Path or list of paths to R2 of paired-end read input files. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 27 | | `single_end` | `-r` | (optional) Path or list of paths to single-end (or just R1 or R2) read input files. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 28 | | `output_path` | output file location | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 29 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to Salmon. This should be a string of arguments like the command line. | 30 | | `database_name` | `-i` | (optional) Name of database to use for Salmon alignment. Defaults to `"salmon_hg38"`. | 31 | | `database_version` | `-i` | (optional) Version of database to use for Salmon alignment. Defaults to `"1"`. | 32 | | `library_type` | `-l`, `--libType` | (optional) The library type used. Defaults to "A" for automatic classification. See [the Salmon docs on library types](https://salmon.readthedocs.io/en/latest/salmon.html#what-s-this-libtype) for more. | 33 | | `is_async` | | Whether to run a job asynchronously. 
See [Async Runs](../../feature-reference/async-runs.md) for more. | 34 | 35 | See the [Databases](#databases) section for more details. 36 | 37 | ## Notes 38 | 39 | ### Paired-end inputs 40 | 41 | Paired-end read inputs can be set with either `inputs` or through `read_one` and `read_two`. 42 | 43 | Make sure that the first item in `read_one` corresponds to the first item in `read_two` – and so on. 44 | 45 | If you only have one end of a paired-end run, use the `single_end` argument. 46 | 47 | # Tool Versions 48 | 49 | Toolchest currently supports version **1.9.0** of Salmon. 50 | 51 | # Databases 52 | 53 | Toolchest currently supports the following databases for Salmon: 54 | 55 | | `database_name` | `database_version` | Description | 56 | | :-------------- | :----------------- | :----------------- | 57 | | `hg38` | `1` | hg38 precomputed index for Salmon. | 58 | 59 | # Other modes 60 | 61 | Only `quant` mode is supported at this time.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/all-other-tools.md: -------------------------------------------------------------------------------- 1 | Before any tool lands in our documentation, we add [publicly exposed integration tests](https://github.com/trytoolchest/toolchest-client-python/tree/main/tests) to ensure data quality. 2 | 3 | If you want to use a tool not listed here, please [let us know](https://airtable.com/shrNBkD0bG2wB15jQ)! Some are in private beta as we test and verify their functionality, and we're rapidly adding more.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/assemblers.md: -------------------------------------------------------------------------------- 1 | Genome assemblers take partial sequences of a genome and assemble them to form a larger contiguous sequence – ideally, the whole genome. 2 | 3 | The best assemblers work with both short reads (e.g. Illumina) and long reads (e.g. Oxford Nanopore) to quickly assemble a genome (e.g. [Unicycler](assemblers/unicycler.md)). 4 | 5 | If you want to use an assembler that's not listed here, [let us know](https://airtable.com/shrNBkD0bG2wB15jQ)! It might even already be available on our infrastructure but not listed.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/assemblers/megahit.md: -------------------------------------------------------------------------------- 1 | **MEGAHIT** is an assembler that's optimized for metagenomes. For more information, see the tool's 2 | [GitHub repo and wiki](https://github.com/voutcn/megahit). 3 | 4 | Function Call 5 | ============= 6 | 7 | ```python 8 | tc.megahit( 9 | read_one=None, 10 | read_two=None, 11 | interleaved=None, 12 | single_end=None, 13 | output_path=None, 14 | tool_args="", 15 | is_async=False, 16 | ) 17 | ``` 18 | 19 | Function Arguments 20 | ------------------ 21 | 22 | See the Notes section below for more details. 23 | 24 | | Argument | Use in place of: | Description | 25 | | :------------ | :------------------ | :------------------ | 26 | | `read_one` | `-1` | (optional) Path to R1 of paired-end short read input files. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). 
| 27 | | `read_two` | `-2` | (optional) Path to R2 of paired-end short read input files. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 28 | | `interleaved` | `--12` | (optional) Path to the file containing interleaved reads. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 29 | | `single_end` | `-r` | (optional) Path to the file containing single-end reads. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 30 | | `output_path` | `-o` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 31 | | `tool_args` | all other arguments | (optional) A string containing additional arguments to be passed to MEGAHIT, formatted as if using the command line. | 32 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. | 33 | 34 | Notes 35 | ----- 36 | 37 | ### Paired-end reads 38 | 39 | For each paired-end input, make sure the corresponding read is in the same position in the input list. For example, two 40 | pairs of paired-end files – `one_R1.fastq`, `one_R2.fastq`, `two_R1.fastq`, `two_R2.fastq` – should be passed to 41 | Toolchest as: 42 | 43 | ```python 44 | tc.megahit( 45 | read_one=["one_R1.fastq", "two_R1.fastq"], 46 | read_two=["one_R2.fastq", "two_R2.fastq"], 47 | ... 48 | ) 49 | ``` 50 | 51 | Tool Versions 52 | ============= 53 | 54 | Toolchest currently supports version **1.2.9** of MEGAHIT. 55 | 56 | Supported Additional Arguments 57 | ============================== 58 | 59 | - \--min-count 60 | - \--k-list 61 | - \--k-min 62 | - \--k-max 63 | - \--k-step 64 | - \--no-mercy 65 | - \--bubble-level 66 | - \--merge-level 67 | - \--prune-level 68 | - \--prune-depth 69 | - \--disconnect-ratio 70 | - \--low-local-ratio 71 | - \--max-tip-len 72 | - \--cleaning-rounds 73 | - \--no-local 74 | - \--kmin-1pass 75 | - \--presets 76 | - \--min-contig-len 77 | 78 | Set additional arguments with `tool_args`. For example: `tool_args="--no-local --no-mercy"`
-------------------------------------------------------------------------------- /docs/docs/tool-reference/assemblers/unicycler.md: -------------------------------------------------------------------------------- 1 | **Unicycler** is an assembly pipeline for bacterial genomes. For more information, see the tool's 2 | [GitHub repo and wiki](https://github.com/rrwick/Unicycler). 3 | 4 | Function Call 5 | ============= 6 | 7 | ```python 8 | tc.unicycler( 9 | read_one=None, 10 | read_two=None, 11 | long_reads=None, 12 | output_path=None, 13 | tool_args="", 14 | is_async=False, 15 | ) 16 | ``` 17 | 18 | Function Arguments 19 | ------------------ 20 | 21 | | Argument | Use in place of: | Description | 22 | | :------------ | :------------------ | :------------------ | 23 | | `read_one` | `-1` | (optional) Path to R1 of paired-end short read input files. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 24 | | `read_two` | `-2` | (optional) Path to R2 of paired-end short read input files. 
The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 25 | | `long_reads` | `-l` | (optional) Path to the file containing long reads. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 26 | | `output_path` | `-o` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 27 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to Unicycler. This should be a string of arguments like the command line. See [Supported Additional Arguments](#supported-additional-arguments) for more details. | 28 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. | 29 | 30 | Notes 31 | ----- 32 | 33 | ### Paired-end reads 34 | 35 | Paired-end short read inputs should be specified with both `read_one` and `read_two`. 36 | 37 | Tool Versions 38 | ============= 39 | 40 | Toolchest currently supports version **0.4.9** of Unicycler. 41 | 42 | Supported Additional Arguments 43 | ============================== 44 | 45 | - `--depth_filter` 46 | - `--kmer_count` 47 | - `--kmers` 48 | - `--largest_component` 49 | - `--linear_seqs` 50 | - `--low_score` 51 | - `--max_kmer_frac` 52 | - `--min_component_size` 53 | - `--min_dead_end_size` 54 | - `--min_fasta_length` 55 | - `--min_kmer_frac` 56 | - `--min_polish_size` 57 | - `--mode` 58 | - `--no_correct` 59 | - `--no_miniasm` 60 | - `--no_pilon` 61 | - `--no_rotate` 62 | - `--scores` 63 | - `--start_gene_cov` 64 | - `--start_gene_id` 65 | - `--verbosity` 66 | 67 | Additional arguments can be specified under the `tool_args` argument.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/demultiplexers.md: -------------------------------------------------------------------------------- 1 | Demultiplexing is extracting components from something that's all mixed together. It's like taking a rope and unwinding 2 | each individual thread. 3 | 4 | Sometimes, this means using a straightforward tool like `bcl2fastq` to generate FASTQs from raw Illumina NGS reads. 5 | Just for fun, we've also included an ML-based tool called [Demucs](demultiplexers/demucs.md) to separate song tracks.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/demultiplexers/demucs.md: -------------------------------------------------------------------------------- 1 | **demucs** is a demultiplexing tool for audio source separation. To learn more about the tool, check out its 2 | [GitHub repo](https://github.com/facebookresearch/demucs). 3 | 4 | Function Call 5 | ============= 6 | 7 | ```python 8 | tc.demucs( 9 | inputs, 10 | output_path=None, 11 | tool_args="", 12 | is_async=False, 13 | ) 14 | ``` 15 | 16 | Function Arguments 17 | ------------------ 18 | 19 | See the Notes section below for more details. 20 | 21 | | Argument | Use in place of: | Description | 22 | | :------------ | :------------------ | :------------------ | 23 | | `inputs` | `--input` | Path to a file that will be passed in as input. All formats supported by `ffmpeg` are allowed. 
-------------------------------------------------------------------------------- /docs/docs/tool-reference/demultiplexers.md: --------------------------------------------------------------------------------
1 | Demultiplexing is extracting components from something that's all mixed together. It's like taking a rope and unwinding
2 | each individual thread.
3 | 
4 | Sometimes, this means using a straightforward tool like `bcl2fastq` to generate FASTQs from raw Illumina NGS reads.
5 | Just for fun, we've also included an ML-based tool called [Demucs](demultiplexers/demucs.md) to separate song tracks.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/demultiplexers/demucs.md: --------------------------------------------------------------------------------
1 | **demucs** is a demultiplexing tool for audio source separation. To learn more about the tool, check out its
2 | [GitHub repo](https://github.com/facebookresearch/demucs).
3 | 
4 | Function Call
5 | =============
6 | 
7 | ```python
8 | tc.demucs(
9 |     inputs,
10 |     output_path=None,
11 |     tool_args="",
12 |     is_async=False,
13 | )
14 | ```
15 | 
16 | Function Arguments
17 | ------------------
18 | 
19 | See the table below for details on each argument.
20 | 
21 | | Argument | Use in place of: | Description |
22 | | :------------ | :------------------ | :---------- |
23 | | `inputs` | `--input` | Path to a file that will be passed in as input. All formats supported by `ffmpeg` are allowed. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). |
24 | | `output_path` | `--output` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The path can be local or remote; see [Using Files](../../getting-started/using-files.md). |
25 | | `tool_args` | all other arguments | (optional) A string containing additional arguments to be passed to Demucs, formatted as if using the command line. |
26 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. |
27 | 
28 | Tool Versions
29 | =============
30 | 
31 | Toolchest supports version **3.0.4** of Demucs.
32 | 
33 | Supported Additional Arguments
34 | ==============================
35 | 
36 | - \-v
37 | - \--verbose
38 | - \--shifts
39 | - \--overlap
40 | - \--no-split
41 | - \--two-stems
42 | - \--int24
43 | - \--float32
44 | - \--clip-mode
45 | - \--mp3
46 | - \--mp3-bitrate
47 | - \-n
48 | 
49 | Set additional arguments with `tool_args`. For example: `tool_args="-n mdx_extra --shifts=5"`
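50 | 
51 | As a minimal sketch, separating a track into vocal and instrumental stems might look like this (the input and output paths are placeholders):
52 | 
53 | ```python
54 | import toolchest_client as tc
55 | 
56 | tc.demucs(
57 |     inputs="./my_song.mp3",
58 |     output_path="./demucs_output",
59 |     tool_args="--two-stems vocals --mp3",
60 | )
61 | ```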
-------------------------------------------------------------------------------- /docs/docs/tool-reference/post-processing.md: --------------------------------------------------------------------------------
1 | Some tools modify your raw data, but you still need another tool to bring the data to a usable state. We call that a
2 | post-processing tool.
3 | 
4 | One example is [Kraken 2](taxonomic-classifiers/kraken-2.md), often used with [Bracken](post-processing/bracken.md) for
5 | post-processing.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/pre-processing.md: --------------------------------------------------------------------------------
1 | Some tools can check data integrity or transform data before use in another tool. We call that a pre-processing tool.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/pre-processing/fastqc.md: --------------------------------------------------------------------------------
1 | **FastQC** is a quality control tool for genomic sequence data. [See their website for more details](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
2 | 
3 | # Function Call
4 | 
5 | ```python
6 | tc.fastqc(
7 |     inputs,
8 |     output_path=None,
9 |     contaminants=None,
10 |     adapters=None,
11 |     limits=None,
12 |     tool_args="",
13 |     is_async=False,
14 | )
15 | ```
16 | 
17 | ## Function Arguments
18 | 
19 | 
20 | | Argument | Use in place of: | Description |
21 | | :------- | :--------------- | :---------- |
22 | | `inputs` | input file location | Path to one or more files to use as input. SAM, BAM, or FASTQ formats are supported, as well as gzip-compressed variants. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). |
23 | | `output_path` | `-o` (directory name) | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The path can be local or remote; see [Using Files](../../getting-started/using-files.md). |
24 | | `contaminants` | `-c` or `--contaminants` file path | (optional) Path to a custom contaminants file. |
25 | | `adapters` | `-a` or `--adapters` file path | (optional) Path to a custom adapters file. |
26 | | `limits` | `-l` or `--limits` file path | (optional) Path to a custom limits file. |
27 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to FastQC, as a string formatted like the command line. |
28 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. |
29 | 
30 | 
31 | 
32 | ## Output Files
33 | 
34 | A FastQC run will output the HTML report and a zip archive into `output_path`:
35 | 
36 | - `{input file name}_fastqc.html`: FastQC HTML report for checking data quality
37 | - `{input file name}_fastqc.zip`: Zip archive containing the HTML report and some supporting files
38 | 
39 | # Tool Versions
40 | 
41 | Toolchest currently supports version **0.11.9** of FastQC.
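42 | 
43 | As a minimal sketch, a basic quality-control run might look like this (the input and output paths are placeholders):
44 | 
45 | ```python
46 | import toolchest_client as tc
47 | 
48 | tc.fastqc(
49 |     inputs="s3://your-bucket/sample.fastq.gz",
50 |     output_path="./fastqc_output",
51 | )
52 | ```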
-------------------------------------------------------------------------------- /docs/docs/tool-reference/structure-prediction.md: --------------------------------------------------------------------------------
1 | Structure prediction takes an input protein sequence and predicts its three-dimensional structure.
2 | 
3 | Recent advances – including [AlphaFold](structure-prediction/alphafold.md) – rely on deep learning, using
4 | GPUs and terabytes of pre-built reference databases to predict 3D structure.
5 | 
6 | AlphaFold 2 is one of the few structure prediction tools that's hosted on Toolchest, but
7 | [let us know](https://airtable.com/shrNBkD0bG2wB15jQ) if there's another tool that you'd like to use!
-------------------------------------------------------------------------------- /docs/docs/tool-reference/structure-prediction/alphafold.md: --------------------------------------------------------------------------------
1 | AlphaFold is a deep learning tool for predicting a protein’s 3D structure from its amino acid sequence. It was
2 | developed by DeepMind and utilizes GPU compute. For more information, see the tool's
3 | [homepage](https://alphafold.ebi.ac.uk/) and [GitHub repo](https://github.com/deepmind/alphafold).
4 | 
5 | Function Call
6 | =============
7 | 
8 | ```python
9 | tc.alphafold(
10 |     inputs,
11 |     output_path=None,
12 |     model_preset=None,
13 |     max_template_date=None,
14 |     use_reduced_dbs=False,
15 |     is_prokaryote_list=None,
16 |     is_async=False,
17 | )
18 | ```
19 | 
20 | Function Arguments
21 | ------------------
22 | 
23 | See the table below for details on each argument.
24 | 
25 | | Argument | Use in place of: | Description |
26 | | :------- | :--------------- | :---------- |
27 | | `inputs` | `--fasta_paths` | Path to one or more files to use as input. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). |
28 | | `output_path` | `--output_dir` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The path can be local or remote; see [Using Files](../../getting-started/using-files.md). |
29 | | `model_preset` | `--model_preset` | (optional) Specific AlphaFold model to use. Options are [`monomer`, `monomer_casp14`, `monomer_ptm`, `multimer`]. Defaults to `monomer`. |
30 | | `max_template_date` | `--max_template_date` | (optional) String of a date in YYYY-MM-DD format. Restricts protein structure prediction to templates in the database from before this date. Defaults to today's date. |
31 | | `use_reduced_dbs` | `--db_preset=reduced_dbs` | (optional) Whether to use a smaller version of the BFD database. If true, reduces run time at the cost of result quality. |
32 | | `is_prokaryote_list` | `--is_prokaryote_list` | (optional) List of booleans that determines whether all input sequences in the given FASTA file are prokaryotic. Expects the string normally used as input to AlphaFold (e.g. "true,true" if there are two prokaryote inputs). |
33 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. |
34 | 
35 | Tool Versions
36 | =============
37 | 
38 | Toolchest currently supports version **2.1.2** of AlphaFold.
39 | 
40 | Database
41 | ========
42 | 
43 | Toolchest's implementation of AlphaFold uses AlphaFold's required genetic sequence databases. For a complete list of databases used, see the tool's [GitHub page](https://github.com/deepmind/alphafold).
44 | 
45 | Supported Additional Arguments
46 | ==============================
47 | 
48 | Toolchest supports the following arguments for AlphaFold:
49 | 
50 | - `--db_preset`
51 | - `--is_prokaryote_list`
52 | - `--max_template_date`
53 | - `--model_preset`
54 | 
55 | However, these should be specified via the dedicated arguments in the function call, rather than through a generic `tool_args` argument as with other Toolchest tools. See [Function Arguments](#function-arguments) for more details.
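56 | 
57 | As a minimal sketch, predicting a single monomer structure with the reduced databases might look like this (the input and output paths are placeholders):
58 | 
59 | ```python
60 | import toolchest_client as tc
61 | 
62 | tc.alphafold(
63 |     inputs="./protein.fasta",
64 |     output_path="./alphafold_output",
65 |     model_preset="monomer",
66 |     max_template_date="2022-01-01",
67 |     use_reduced_dbs=True,
68 | )
69 | ```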
-------------------------------------------------------------------------------- /docs/docs/tool-reference/taxonomic-classifiers.md: --------------------------------------------------------------------------------
1 | Taxonomic classifiers perform a fuzzy search between input sequences and reference databases. A classic use-case is
2 | determining the relative abundance of a microbiome sample.
3 | 
4 | If you only need relative abundance, you can use a taxonomic profiler, which returns just that. For a
5 | more detailed view, you can use a classifier like [Kraken 2](taxonomic-classifiers/kraken-2.md).
6 | 
7 | Toolchest hosts both the taxonomic classifier and the corresponding reference databases, and you can also
8 | [use your own custom database](../feature-reference/adding-and-updating-custom-databases.md).
9 | 
10 | Typically, taxonomic classifiers are more efficient than aligners for taxonomic classification, but most are based on
11 | heuristic methods rather than optimal alignment scores. If you're looking for something more analogous to BLAST, check
12 | out [aligners](aligners.md).
13 | 
14 | If you want to use a taxonomic classifier that's not listed here, let us know! It might already be available on
15 | our infrastructure but not listed.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/taxonomic-classifiers/metaphlan.md: --------------------------------------------------------------------------------
1 | **MetaPhlAn** is a tool for profiling the composition of microbial communities. For more information, see the tool's
2 | [website](https://huttenhower.sph.harvard.edu/metaphlan/) or
3 | [GitHub wiki](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.0).
4 | 
5 | # Function Call
6 | 
7 | ```python
8 | tc.metaphlan(
9 |     inputs,
10 |     output_path=None,
11 |     output_primary_name="out.txt",
12 |     tool_args="",
13 |     is_async=False,
14 | )
15 | ```
16 | 
17 | ## Function Arguments
18 | 
19 | | Argument | Use in place of: | Description |
20 | | :------- | :--------------- | :---------- |
21 | | `inputs` | input file location | Path to one or more files to use as input. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). |
22 | | `output_path` | output file location | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The path can be local or remote; see [Using Files](../../getting-started/using-files.md). |
23 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to MetaPhlAn, as a string formatted like the command line. |
24 | | `output_primary_name` | | (optional) Sets the name of the main output file. Defaults to "out.txt". |
25 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. |
26 | 
27 | See the [Databases](#databases) section for more details.
28 | 
29 | ## Output Files
30 | 
31 | A MetaPhlAn run will output 2 files into `output_path`:
32 | 
33 | - `out.txt`: Results of the MetaPhlAn run.
34 | - `{input_file_name}.bowtie2out.txt`: The intermediate Bowtie 2 output file generated by MetaPhlAn. This can be passed back in as input to quickly rerun with the same data. It is not generated if `--no-map` is passed via `tool_args`.
35 | 
36 | # Tool Versions
37 | 
38 | Toolchest currently supports version **3.0.14** of MetaPhlAn.
39 | 
40 | # Databases
41 | 
42 | Toolchest currently supports the latest version of the `mpa_v30_CHOCOPhlAn_201901_marker_info` database. You can read more about the database on the [GitHub wiki](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.0).
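43 | 
44 | As a minimal sketch, profiling a single metagenomic sample might look like this (the input and output paths are placeholders):
45 | 
46 | ```python
47 | import toolchest_client as tc
48 | 
49 | tc.metaphlan(
50 |     inputs="s3://your-bucket/sample.fastq.gz",
51 |     output_path="./metaphlan_output",
52 |     output_primary_name="profile.txt",
53 | )
54 | ```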
It runs in the background, 6 | meaning you don't need to keep your laptop or server running during transfer. 7 | 8 | # Function Call 9 | 10 | ```python 11 | tc.transfer( 12 | inputs, 13 | output_path=None, 14 | is_async=True, 15 | ) 16 | ``` 17 | 18 | ## Function Arguments 19 | 20 | 21 | | Argument | Description | 22 | | :------------ |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 23 | | `inputs` | Path to a file that will be passed in as input. All formats supported by `ffmpeg` are allowed. The files can be a local or remote, see [Using Files](../getting-started/using-files.md). | 24 | | `output_path` | (optional) Path (directory) to where the output files will be downloaded. The path can be a local or remote, see [Using Files](../getting-started/using-files.md). | 25 | | `is_async` | Whether to run the job asynchronously. By default, this is true. If you set this to false, the Toolchest command will wait to exit until the transfer is complete. See [Async Runs](../feature-reference/async-runs.md) for more. | 26 | -------------------------------------------------------------------------------- /docs/docs/tool-reference/workflows-meta-tools.md: -------------------------------------------------------------------------------- 1 | Workflow tools – or meta-tools – wrap several tools to form a unified pipeline. These workflow tools are usually 2 | focused specific area like microbiome or single-cell analysis, but they wrap more generic tools under the hood. 3 | 4 | They're popular, because they're easy to use; there's no need to make your own choices on aligners, assemblers, or 5 | classifiers, and the tools the workflow creator chooses are often pre-tuned for one specific use-case. 6 | 7 | [HUMAnN](workflows-meta-tools/humann3.md) is a perfect example of these types of meta-tools. Under the hood, it uses: 8 | 9 | - [Bowtie 2](aligners/bowtie-2.md) 10 | - [Diamond](aligners/diamond.md) 11 | - [Python 3](python3.md) 12 | - [RAPSearch2](aligners/rapsearch2.md) 13 | - [MetaPhlAn 3](taxonomic-classifiers/metaphlan.md) 14 | 15 | and several other tools, all wrapped under the `humann` command. 16 | 17 | When using workflow tools via Toolchest, you'll notice a new argument: `mode`. This lets you run sub-tools directly. 
-------------------------------------------------------------------------------- /docs/docs/toolchest-hosted-cloud/instance-types.md: --------------------------------------------------------------------------------
1 | # Instance Types
2 | 
3 | | Instance type | vCPUs | Memory (GB) | GPU type |
4 | | :------------ | :---- | :---------- | :---------------------------------------- |
5 | | compute-2 | 2 | 4 | |
6 | | compute-4 | 4 | 8 | |
7 | | compute-8 | 8 | 16 | |
8 | | compute-16 | 16 | 32 | |
9 | | compute-32 | 32 | 64 | |
10 | | compute-48 | 48 | 96 | |
11 | | compute-64 | 64 | 128 | |
12 | | compute-96 | 96 | 192 | |
13 | | general-2 | 2 | 8 | |
14 | | general-4 | 4 | 16 | |
15 | | general-8 | 8 | 32 | |
16 | | general-16 | 16 | 64 | |
17 | | general-32 | 32 | 128 | |
18 | | general-48 | 48 | 192 | |
19 | | general-64 | 64 | 256 | |
20 | | general-96 | 96 | 384 | |
21 | | gpu-V100 | 8 | 61 | 1 NVIDIA Tesla V100 with 16 GB of memory |
22 | | memory-16 | 2 | 16 | |
23 | | memory-32 | 4 | 32 | |
24 | | memory-64 | 8 | 64 | |
25 | | memory-128 | 16 | 128 | |
26 | | memory-256 | 32 | 256 | |
27 | | memory-384 | 48 | 384 | |
28 | | memory-512 | 64 | 512 | |
29 | 
-------------------------------------------------------------------------------- /docs/docs/toolchest-hosted-cloud/pricing.md: --------------------------------------------------------------------------------
1 | # Toolchest-hosted pricing and instance types
2 | 
3 | By default, Toolchest jobs run in Toolchest's managed AWS account. The prices below are for resources that you spawn by
4 | running Toolchest jobs. For information on running Toolchest in your own AWS account, see
5 | [Running Toolchest in your AWS account](./running-toolchest-in-your-aws-account.md).
6 | 
7 | Toolchest Hosted Cloud pricing starts with a free allowance and moves to incremental billing, scaling as your usage
8 | grows.
9 | 
10 | Per-minute billing starts when the Toolchest instance begins executing, and stops immediately when a run finishes. You
11 | can say goodbye to paying for idling cloud instances.
12 | 
13 | ## Free tier
14 | 
15 | ### Compute
16 | 
17 | | Service | Free tier | What can you run? |
18 | | :---------- | :------------- | :------------------------------------------------ |
19 | | vCPU | 50 vCPU-hours | A run that lasts 2 hours with 25 vCPUs. |
20 | | RAM | 100 GB-hours | A run that lasts 2 hours with 50 GB of RAM. |
21 | | Disk | 2 TB-hours | A run that lasts 2 hours with 1 TB of disk space. |
22 | | Invocations | 50 invocations | 50 runs |
23 | 
24 | ### Files
25 | 
26 | | Service | Free tier | What can you run? |
27 | | :------------------------------ | :-------- | :--------------------------------------------------------------------------------- |
28 | | Input and output files | 100 GB | A run with 40 GB of transferred input files and 60 GB of transferred output files. |
29 | | High speed reference DB storage | 50 GB/mo | A custom reference database for Kraken 2 that's 50 GB.
|
30 | 
31 | ## Growth pricing
32 | 
33 | ### Compute
34 | 
35 | | Service | Cost | Billing increment |
36 | | :--------- | :------------------- | :------------------------------------ |
37 | | vCPU | $0.084 per vCPU-hour | Per minute, with a one minute minimum |
38 | | RAM | $0.016 per GB-hour | Per minute, with a one minute minimum |
39 | | Disk | $0.009 per TB-hour | Per minute, with a one minute minimum |
40 | | Invocation | $0.10 per invocation | Per run |
41 | 
42 | ### Files
43 | 
44 | | Service | Cost | Billing increment |
45 | | :------------------------------------ | :-------------- | :--------------------------------- |
46 | | Input and output files | $0.10 per GB | Per GB |
47 | | High speed reference database storage | $2.40 per GB-mo | Per month, with at least one month |
48 | 
49 | !!! note "Input and output file pricing includes network data transfer and temporary storage"
50 |     Every input and output file includes free transfer to and from Toolchest infrastructure. The files are cached for one week (7 days) after the run is initialized.
51 | 
52 | ### Example pricing with a Toolchest-hosted bioinformatics tool, Kraken 2
53 | 
54 | A Kraken 2 run with 2 GB of input files, 16 vCPUs, and 128 GB of RAM with 128 GB of disk space runs for 5 minutes (about 0.08 hours). It produces 1 GB of output files, for a total of 3 GB of input and output files. This costs:
55 | 
56 | - 3 GB of input and output files \* $0.10 per GB = $0.30
57 | - 16 vCPUs \* 0.08 hours \* $0.084 per vCPU-hour = $0.10752
58 | - 128 GB of RAM \* 0.08 hours \* $0.016 per RAM GB-hour = $0.16384
59 | - 0.125 TB of disk \* 0.08 hours \* $0.009 per TB-hour = $0.00009
60 | - 1 invocation = $0.10
61 | 
62 | For a total of **$0.67**.
63 | 
64 | ### Example pricing with a custom Python script
65 | 
66 | A custom Python 3 script with 40 GB of input files, 32 vCPUs, and 64 GB of RAM with 256 GB of disk space runs for 30 minutes (0.5 hours). It produces 10 GB of output files, for a total of 50 GB of input and output files. This costs:
67 | 
68 | - 50 GB of input and output files \* $0.10 per GB = $5
69 | - 32 vCPUs \* 0.5 hours \* $0.084 per vCPU-hour = $1.344
70 | - 64 GB of RAM \* 0.5 hours \* $0.016 per RAM GB-hour = $0.512
71 | - 0.25 TB of disk \* 0.5 hours \* $0.009 per TB-hour = $0.001125
72 | - 1 invocation = $0.10
73 | 
74 | For a total of **$6.96**.
75 | 
76 | 
77 | ## Support
78 | 
79 | Every customer gets access to text-based support – including a shared Slack channel, email, and any other async way
80 | of talking to us that you can think of.
81 | 
82 | We also offer synchronous support, plus SLAs for support and infrastructure availability.
83 | 
84 | ## Custom plans
85 | 
86 | If you're a business with unique needs (e.g. high volume, a non-standard business model, or very large files), we can
87 | build a custom plan for you.
-------------------------------------------------------------------------------- /docs/docs/toolchest-hosted-cloud/running-toolchest-in-your-aws-account.md: --------------------------------------------------------------------------------
1 | # Running Toolchest in your AWS account
2 | 
3 | If you just want to enable Toolchest to pull from your S3 buckets, check out
4 | [Using AWS with Toolchest](../feature-reference/using-aws-with-toolchest.md).
5 | 
6 | You can also set up Toolchest to run instances in your own AWS account. We're gating this feature for now, so let us
7 | know if you'd like access.
-------------------------------------------------------------------------------- /pyproject.toml: --------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "toolchest-client"
3 | version = "0.11.14"
4 | description = "Python client for Toolchest"
5 | authors = [
6 |     "Justin Herr ",
7 |     "Noah Lebovic ",
58 | [build-system]
59 | requires = ["poetry-core>=1.0.0"]
60 | build-backend = "poetry.core.masonry.api"
-------------------------------------------------------------------------------- /pytest.ini: --------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 |     integration: mark a test as an integration test.
4 |     integration_full: mark a test as a full-suite integration test (only run on pre-deploy).
5 | junit_family=xunit1
6 | 
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/tests/__init__.py
-------------------------------------------------------------------------------- /tests/conftest.py: --------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
4 | def pytest_configure(config):
5 |     """
6 |     Allows plugins and conftest files to perform initial configuration.
7 |     This hook is called for every plugin and initial conftest
8 |     file after command line options have been parsed.
9 |     """
10 |     if os.environ.get("DEPLOY_ENVIRONMENT") == "staging":
11 |         os.environ["TOOLCHEST_API_URL"] = os.environ["TOOLCHEST_STAGING_URL"]
12 | 
-------------------------------------------------------------------------------- /tests/test_async.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import time
4 | 
5 | from tests.util import hash
6 | import toolchest_client as toolchest
7 | 
8 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
9 | if toolchest_api_key:
10 |     toolchest.set_key(toolchest_api_key)
11 | 
12 | 
13 | @pytest.mark.integration
14 | def test_async_execution():
15 |     """
16 |     Tests Kraken 2 running async using a small reference database
17 |     """
18 | 
19 |     test_dir = "temp_test_async_execution"
20 |     os.makedirs(f"./{test_dir}", exist_ok=True)
21 |     output_dir_path = f"./{test_dir}"
22 |     output_file_path = f"{output_dir_path}/kraken2_output.txt"
23 | 
24 |     custom_db = "s3://toolchest-fsx-databases/kraken2/k2_viral_20210517/"
25 |     toolchest_run = toolchest.kraken2(
26 |         read_one="s3://toolchest-integration-tests/synthetic_bacteroides_reads.fasta",
27 |         remote_database_path=custom_db,
28 |         output_path=output_dir_path,
29 |         is_async=True,
30 |     )
31 | 
32 |     run_status = ''
33 |     while run_status != toolchest.Status.READY_TO_TRANSFER_TO_CLIENT:
34 |         time.sleep(5)
35 | 
36 |         run_status = toolchest.get_status(run_id=toolchest_run.run_id)
37 |         if run_status == toolchest.Status.FAILED:
38 |             pytest.fail("Toolchest run failed.")  # fail fast instead of polling forever
39 | 
40 |     toolchest.download(
41 |         output_path=output_dir_path,
42 |         run_id=toolchest_run.run_id,
43 |     )
44 | 
45 |     assert hash.unordered(output_file_path) == 1003212151
46 | 
-------------------------------------------------------------------------------- /tests/test_blastn.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import hash
5 | import toolchest_client as toolchest
6 | 
7 | 
8 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
9 | if toolchest_api_key:
10 |     toolchest.set_key(toolchest_api_key)
11 | 
12 | 
13 | @pytest.mark.integration
14 | def test_blastn_nt():
15 |     """
16 |     Tests BLASTN against the default nt (v1) DB
17 |     """
18 |     test_dir = "temp_test_blastn_nt"
19 |     os.makedirs(f"./{test_dir}", exist_ok=True)
20 |     output_dir_path = f"./{test_dir}"
21 |     output_file_path = f"{output_dir_path}/blastn_results.out"
22 | 
23 |     toolchest.blastn(
24 |         inputs="s3://toolchest-integration-tests/small_synthetic_bacteroides_reads.fasta",
25 |         output_path=output_dir_path,
26 |         tool_args="-mt_mode 1"
27 |     )
28 | 
29 |     assert hash.unordered(output_file_path) == 1290536116
30 | 
31 | 
32 | @pytest.mark.integration
33 | def test_blastn_nt_task_blastn():
34 |     """
35 |     Tests BLASTN against the default nt (v1) DB, using `-task blastn`
36 |     """
37 |     test_dir = "temp_test_blastn_nt_task_blastn"  # separate dir to avoid clashing with test_blastn_nt
38 |     os.makedirs(f"./{test_dir}", exist_ok=True)
39 |     output_dir_path = f"./{test_dir}"
40 |     output_file_path = f"{output_dir_path}/blastn_results.out"
41 | 
42 |     toolchest.blastn(
43 |         inputs="s3://toolchest-integration-tests/small_synthetic_bacteroides_reads.fasta",
44 |         output_path=output_dir_path,
45 |         tool_args="-mt_mode 1 -task blastn"
46 |     )
47 | 
48 |     assert hash.unordered(output_file_path) == 1657058660
49 | 
-------------------------------------------------------------------------------- /tests/test_bowtie2.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import hash, filter_output
5 | import toolchest_client as toolchest
6 | 
7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
8 | if toolchest_api_key:
9 |     toolchest.set_key(toolchest_api_key)
10 | 
11 | 
12 | @pytest.mark.integration
13 | def test_bowtie2():
14 |     """
15 |     Tests bowtie2
16 |     """
17 | 
18 |     test_dir = "temp_test_bowtie2_standard"
19 |     os.makedirs(f"./{test_dir}", exist_ok=True)
20 |     output_dir_path = f"./{test_dir}"
21 |     output_file_path = f"{output_dir_path}/bowtie2_output.sam"
22 |     filtered_output_file_path = f"{output_dir_path}/bowtie2_output.filtered.sam"
23 | 
24 |     toolchest.bowtie2(
25 |         inputs="s3://toolchest-integration-tests/DRR000006.fastq.gz",
26 |         output_path=output_dir_path,
27 |     )
28 | 
29 |     # Filter non-deterministic metadata lines
30 |     filter_output.filter_sam(output_file_path, filtered_output_file_path)
31 |     assert hash.unordered(filtered_output_file_path) == 1444969892
32 | 
-------------------------------------------------------------------------------- /tests/test_cellranger.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import shutil
4 | 
5 | from tests.util import s3, hash
6 | import toolchest_client as toolchest
7 | 
8 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
9 | if toolchest_api_key:
10 |     toolchest.set_key(toolchest_api_key)
11 | 
12 | 
13 | @pytest.mark.integration
14 | def test_cellranger_count_s3_inputs():
15 |     test_dir = "temp_test_cellranger_count_s3_inputs"
16 |     output_dir_path = f"./{test_dir}/output/"
17 | 
18 |     # Test using an S3 prefix containing 3 FASTQs
19 |     output = toolchest.cellranger_count(
20 |         inputs="s3://toolchest-integration-tests/cellranger/count/pbmc_1k_v3_fastqs_trimmed",
21 |         database_name="GRCh38",
22 |         output_path=output_dir_path,
23 |         skip_decompression=True,
24 |     )
25 |     verify_cellranger_count_outputs(output.output_file_paths, output_dir_path)
26 | 
27 | 
28 | @pytest.mark.integration
29 | def test_cellranger_count_local_inputs():
= "temp_test_cellranger_count_local_inputs" 31 | input_dir_path = f"./{test_dir}/inputs/" 32 | output_dir_path = f"./{test_dir}/output/" 33 | os.makedirs(input_dir_path, exist_ok=True) 34 | 35 | # Test from a directory of local inputs 36 | input_file_names = [ 37 | "pbmc_1k_v3_trimmed_S1_L001_I1_001.fastq", 38 | "pbmc_1k_v3_trimmed_S1_L001_R1_001.fastq", 39 | "pbmc_1k_v3_trimmed_S1_L001_R2_001.fastq", 40 | ] 41 | for input_file_name in input_file_names: 42 | s3.download_integration_test_input( 43 | s3_file_key=f"cellranger/count/pbmc_1k_v3_fastqs_trimmed/{input_file_name}", 44 | output_file_path=f"{input_dir_path}/{input_file_name}", 45 | ) 46 | output = toolchest.cellranger_count( 47 | inputs=input_dir_path, 48 | database_name="GRCh38", 49 | output_path=output_dir_path, 50 | skip_decompression=True, 51 | ) 52 | verify_cellranger_count_outputs(output.output_file_paths, output_dir_path) 53 | 54 | 55 | def verify_cellranger_count_outputs(archive_path, output_dir_path): 56 | # Expected properties of outputs 57 | MIN_EXPECTED_ARCHIVE_SIZE = 34000000 58 | MAX_EXPECTED_ARCHIVE_SIZE = 38000000 59 | EXPECTED_SUMMARY_SIZE = 2744825 60 | EXPECTED_RAW_MATRIX_SIZE = 868393 61 | EXPECTED_RAW_MATRIX_HASH = "d00cca1d2b4344b03946eeaeedc17ed5" 62 | EXPECTED_FILTERED_MATRIX_SIZE = 503956 63 | 64 | # Verify properties of packed archive 65 | archive_size = os.path.getsize(archive_path) 66 | assert MIN_EXPECTED_ARCHIVE_SIZE <= archive_size <= MAX_EXPECTED_ARCHIVE_SIZE 67 | 68 | shutil.unpack_archive( 69 | filename=archive_path, 70 | extract_dir=output_dir_path, 71 | format="gztar", 72 | ) 73 | 74 | # Verify properties of unpacked files 75 | summary_path = f"{output_dir_path}outs/web_summary.html" 76 | raw_matrix_path = f"{output_dir_path}outs/raw_feature_bc_matrix.h5" 77 | filtered_matrix_path = f"{output_dir_path}outs/filtered_feature_bc_matrix.h5" 78 | assert os.path.getsize(summary_path) == EXPECTED_SUMMARY_SIZE 79 | assert os.path.getsize(raw_matrix_path) == EXPECTED_RAW_MATRIX_SIZE 80 | assert os.path.getsize(filtered_matrix_path) == EXPECTED_FILTERED_MATRIX_SIZE 81 | assert hash.binary_hash(raw_matrix_path) == EXPECTED_RAW_MATRIX_HASH 82 | -------------------------------------------------------------------------------- /tests/test_centrifuge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | def test_centrifuge_many_types(): 14 | """ 15 | Tests Centrifuge with one pair of paired-end inputs and two single-end inputs. 
16 | """ 17 | test_dir = "temp_test_centrifuge/many_types" 18 | os.makedirs(f"./{test_dir}", exist_ok=True) 19 | output_dir_path = f"./{test_dir}" 20 | output_file_path = f"{output_dir_path}/centrifuge_output.txt" 21 | output_report_path = f"{output_dir_path}/centrifuge_report.tsv" 22 | 23 | toolchest.centrifuge( 24 | read_one="s3://toolchest-integration-tests/megahit/r3_1.fa", 25 | read_two="s3://toolchest-integration-tests/megahit/r3_2.fa", 26 | unpaired="s3://toolchest-integration-tests/megahit/r4.fa", 27 | tool_args="-f", 28 | output_path=output_dir_path, 29 | ) 30 | 31 | assert hash.unordered(output_file_path) == 1779279198 32 | assert hash.unordered(output_report_path) == 1100843098 33 | 34 | 35 | @pytest.mark.integration 36 | def test_centrifuge_multiple_pairs(): 37 | """ 38 | Tests Centrifuge with two pairs of paired-end inputs. 39 | """ 40 | test_dir = "temp_test_centrifuge/multiple_pairs" 41 | os.makedirs(f"./{test_dir}", exist_ok=True) 42 | output_dir_path = f"./{test_dir}" 43 | output_file_path = f"{output_dir_path}/centrifuge_output.txt" 44 | output_report_path = f"{output_dir_path}/centrifuge_report.tsv" 45 | 46 | toolchest.centrifuge( 47 | read_one=[ 48 | "s3://toolchest-integration-tests/sample_r1.fastq.gz", 49 | "s3://toolchest-integration-tests/r1.fastq.gz", 50 | ], 51 | read_two=[ 52 | "s3://toolchest-integration-tests/sample_r2.fastq.gz", 53 | "s3://toolchest-integration-tests/r2.fastq.gz", 54 | ], 55 | output_path=output_dir_path, 56 | volume_size=32, 57 | ) 58 | 59 | assert hash.unordered(output_report_path) == 1895979303 60 | assert hash.unordered(output_file_path) == 1059786093 61 | -------------------------------------------------------------------------------- /tests/test_chaining.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | SHI7_SINGLE_END_HASH = 1570879637 12 | SHOGUN_CHAINED_HASH = 1708070294 13 | 14 | 15 | @pytest.mark.integration 16 | @pytest.mark.skip(reason="Load reduction for integration tests") 17 | def test_shi7_shogun_chaining(): 18 | """ 19 | Tests S3-based chaining with shi7 and shogun. Passes the S3 URI of the 20 | shi7 output to shogun as input, skipping the (intermediate) shi7 output download. 21 | Downloads the (final) shogun output to hash for testing. 22 | 23 | To enforce shi7 determinism, a single R1 input is used. 24 | 25 | Note: This test also tests the Output object generated by the shi7() tool call, 26 | and chaining the shi7 output files depends on how the Output is structured. 27 | If the Output class is modified, this test should be modified as well. 28 | """ 29 | 30 | test_dir = "temp_test_shi7_shogun_chaining" 31 | os.makedirs(f"./{test_dir}", exist_ok=True) 32 | output_dir_path = f"./{test_dir}" 33 | output_file_path_shogun = f"{output_dir_path}/alignment.bowtie2.sam" 34 | 35 | output_shi7 = toolchest.shi7( 36 | tool_args="-SE", 37 | inputs="s3://toolchest-integration-tests/sample_r1.fastq.gz", 38 | ) 39 | 40 | # Note: since output_path was omitted from the shi7 function call, 41 | # local download is skipped, and the local output_file_paths of output_shi7 42 | # should be None. 
43 | assert output_shi7.output_file_paths is None 44 | 45 | output_shogun = toolchest.shogun_align( 46 | inputs=output_shi7.s3_uri, 47 | output_path=output_dir_path, 48 | ) 49 | 50 | assert hash.unordered(output_file_path_shogun) == SHOGUN_CHAINED_HASH 51 | assert hash.unordered(output_shogun.output_file_paths) == SHOGUN_CHAINED_HASH 52 | -------------------------------------------------------------------------------- /tests/test_clustalo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 14 | def test_clustalo_standard(provider): 15 | """ 16 | Tests Clustal Omega 17 | """ 18 | test_dir = "temp_test_clustalo_standard" 19 | os.makedirs(f"./{test_dir}", exist_ok=True) 20 | output_dir_path = f"./{test_dir}" 21 | output_file_name = "sample_output.fasta" 22 | output_file_path = f"{output_dir_path}/{output_file_name}" 23 | 24 | toolchest.clustalo( 25 | inputs="s3://toolchest-integration-tests/clustalo_input.fasta", 26 | output_path=output_dir_path, 27 | output_primary_name=output_file_name, 28 | provider=provider, 29 | ) 30 | 31 | assert hash.unordered(output_file_path) == 1217555147 32 | -------------------------------------------------------------------------------- /tests/test_diamond.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash, s3 5 | import toolchest_client as toolchest 6 | from toolchest_client.api.instance_type import InstanceType 7 | 8 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 9 | if toolchest_api_key: 10 | toolchest.set_key(toolchest_api_key) 11 | 12 | 13 | @pytest.mark.integration 14 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 15 | def test_diamond_blastp_standard(provider): 16 | """ 17 | Tests Diamond blastp mode 18 | """ 19 | test_dir = "temp_test_diamond_blastp_standard" 20 | os.makedirs(f"./{test_dir}", exist_ok=True) 21 | output_dir_path = f"./{test_dir}" 22 | output_file_name = "sample_output.tsv" 23 | output_file_path = f"{output_dir_path}/{output_file_name}" 24 | 25 | toolchest.diamond_blastp( 26 | inputs="s3://toolchest-integration-tests/diamond_blastp_input.fa", 27 | output_path=output_dir_path, 28 | output_primary_name=output_file_name, 29 | provider=provider, 30 | ) 31 | 32 | assert hash.unordered(output_file_path) == 952562472 33 | 34 | 35 | @pytest.mark.integration 36 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 37 | def test_diamond_blastp_remote_database(provider): 38 | """ 39 | Tests DIAMOND BLASTP with a remote database, including a primary name 40 | """ 41 | test_dir = "test_diamond_blastp_remote_database" 42 | os.makedirs(f"./{test_dir}", exist_ok=True) 43 | output_dir_path = f"./{test_dir}" 44 | output_file_name = "sample_output.tsv" 45 | output_file_path = f"{output_dir_path}/{output_file_name}" 46 | 47 | toolchest.diamond_blastp( 48 | inputs="s3://toolchest-integration-tests/short_diamond_blastp_input.fa", 49 | remote_database_path="s3://toolchest-fsx-databases/tests/", 50 | remote_database_primary_name="custom_diamond_blastp_db", 51 | output_path=output_dir_path, 52 | output_primary_name=output_file_name, 53 | provider=provider, 54 | ) 55 | 56 | assert 
hash.unordered(output_file_path) == 563371739
57 | 
58 | 
59 | @pytest.mark.integration
60 | @pytest.mark.parametrize("provider", ["aws", "tce"])
61 | def test_diamond_blastx_standard(provider):
62 |     """
63 |     Tests Diamond blastx mode
64 |     """
65 |     test_dir = "temp_test_diamond_blastx_standard"
66 |     os.makedirs(f"./{test_dir}", exist_ok=True)
67 |     output_dir_path = f"./{test_dir}"
68 |     output_file_name = "sample_output.tsv"
69 |     output_file_path = f"{output_dir_path}/{output_file_name}"
70 | 
71 |     toolchest.diamond_blastx(
72 |         inputs="s3://toolchest-integration-tests/sample_r1_shortened.fastq",
73 |         output_path=output_dir_path,
74 |         output_primary_name=output_file_name,
75 |         provider=provider,
76 |         instance_type=InstanceType.COMPUTE_48
77 |     )
78 | 
79 |     assert hash.unordered(output_file_path) == 883070112
80 | 
81 | 
82 | @pytest.mark.integration
83 | def test_diamond_blastx_distributed():
84 |     """
85 |     Tests DIAMOND BLASTX distributed mode
86 |     """
87 |     test_dir = "./temp_test_diamond_blastx_distributed"
88 |     os.makedirs(f"{test_dir}", exist_ok=True)
89 |     input_file_path = f"{test_dir}/combined_seqs_unfiltered.fna"
90 |     output_dir_path = f"./{test_dir}"
91 |     output_file_name = "sample_output.tsv"
92 |     output_file_path = f"{output_dir_path}/{output_file_name}"
93 | 
94 |     s3.download_integration_test_input(
95 |         s3_file_key="combined_seqs_unfiltered.fna",
96 |         output_file_path=input_file_path,
97 |         is_private=True,
98 |     )
99 | 
100 |     print(input_file_path)  # log the staged input path for debugging
101 | 
102 |     toolchest.diamond_blastx(
103 |         inputs=input_file_path,
104 |         output_path=output_dir_path,
105 |         output_primary_name=output_file_name,
106 |         distributed=True,
107 |     )
108 | 
109 |     assert 1390254000 < os.path.getsize(output_file_path) <= 1390256000
110 | 
-------------------------------------------------------------------------------- /tests/test_download.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import hash
5 | import toolchest_client as toolchest
6 | 
7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
8 | if toolchest_api_key:
9 |     toolchest.set_key(toolchest_api_key)
10 | 
11 | KRAKEN2_SINGLE_END_HASH = 886254946
12 | 
13 | 
14 | @pytest.mark.integration
15 | def test_kraken2_output_manual_download():
16 |     """
17 |     Tests Kraken 2 against the standard (v1) database, with
18 |     output manually downloaded after job completion
19 |     """
20 |     test_dir = "temp_test_kraken2_output_manual_download"
21 |     input_file_s3_uri = "s3://toolchest-integration-tests/synthetic_bacteroides_reads.fasta"
22 |     manual_output_dir_path = f"./{test_dir}/manual"
23 |     manual_output_file_path = f"{manual_output_dir_path}/kraken2_output.txt"
24 |     toolchest_s3_dir_path = f"./{test_dir}/toolchest/s3"
25 |     toolchest_s3_file_path = f"{toolchest_s3_dir_path}/kraken2_output.txt"
26 |     toolchest_pipeline_dir_path = f"./{test_dir}/toolchest/id"
27 |     toolchest_pipeline_file_path = f"{toolchest_pipeline_dir_path}/kraken2_output.txt"
28 | 
29 |     # Run job without downloading
30 |     output = toolchest.kraken2(
31 |         inputs=input_file_s3_uri,
32 |     )
33 | 
34 |     # Manually invoke download from output
35 |     path_from_manual_download = output.download(manual_output_dir_path)
36 | 
37 |     # If multiple files are returned, path_from_manual_download will be a list,
38 |     # so we simply check if kraken2_output.txt is contained in it
39 |     if isinstance(path_from_manual_download, list):
40 |         path_from_manual_download = [os.path.abspath(path) for path in path_from_manual_download]
41 |     else:
42 |
path_from_manual_download = [path_from_manual_download] 43 | assert os.path.abspath(manual_output_file_path) in path_from_manual_download 44 | assert hash.unordered(manual_output_file_path) == KRAKEN2_SINGLE_END_HASH 45 | 46 | # Test again with toolchest.download(), using S3 URI 47 | path_from_toolchest_download = toolchest.download(toolchest_s3_dir_path, s3_uri=output.s3_uri) 48 | if isinstance(path_from_toolchest_download, list): 49 | path_from_toolchest_download = [os.path.abspath(path) for path in path_from_toolchest_download] 50 | else: 51 | path_from_toolchest_download = [path_from_toolchest_download] 52 | assert os.path.abspath(toolchest_s3_file_path) in path_from_toolchest_download 53 | assert hash.unordered(toolchest_s3_file_path) == KRAKEN2_SINGLE_END_HASH 54 | 55 | # Test again with toolchest.download(), using pipeline segment instance ID 56 | PIPELINE_INDEX_IN_S3_URI = 3 57 | pipeline_segment_instance_id = output.s3_uri.split("/")[PIPELINE_INDEX_IN_S3_URI] 58 | path_from_toolchest_download = toolchest.download( 59 | toolchest_pipeline_dir_path, 60 | pipeline_segment_instance_id=pipeline_segment_instance_id, 61 | ) 62 | if isinstance(path_from_toolchest_download, list): 63 | path_from_toolchest_download = [os.path.abspath(path) for path in path_from_toolchest_download] 64 | else: 65 | path_from_toolchest_download = [path_from_toolchest_download] 66 | assert os.path.abspath(toolchest_pipeline_file_path) in path_from_toolchest_download 67 | assert hash.unordered(toolchest_pipeline_file_path) == KRAKEN2_SINGLE_END_HASH 68 | -------------------------------------------------------------------------------- /tests/test_fastqc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash, filter_output 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | def test_fastqc(): 14 | """ 15 | Tests FastQC with a fastq file 16 | 17 | """ 18 | 19 | test_dir = "temp_test_fastqc" 20 | os.makedirs(f"./{test_dir}", exist_ok=True) 21 | output_dir_path = f"./{test_dir}" 22 | filtered_html_output_path = f"{output_dir_path}/sample_r1_shortened_fastqc_filtered.html" 23 | toolchest.fastqc( 24 | inputs="s3://toolchest-integration-tests/sample_r1_shortened.fastq", 25 | output_path=output_dir_path 26 | ) 27 | 28 | filter_output.filter_regex( 29 | os.path.join(test_dir, "sample_r1_shortened_fastqc.html"), 30 | filtered_html_output_path, 31 | search_regex='id="header_filename">([\\w\\s]+)= EXPECTED_MIN_OUTPUT_SIZE_MANY_TYPES 45 | 46 | 47 | @pytest.mark.integration 48 | def test_megahit_multiple_pairs(): 49 | """ 50 | Tests Megahit with two pairs of paired-end inputs. 51 | 52 | Note: Multithreaded megahit is not deterministic, so 53 | we check the size of the file instead. 54 | See https://github.com/voutcn/megahit/issues/48. 
55 |     """
56 |     test_dir = "temp_test_megahit_two_pairs"
57 |     os.makedirs(f"./{test_dir}", exist_ok=True)
58 |     output_dir_path = f"./{test_dir}"
59 |     output_file_path = f"{output_dir_path}/final.contigs.fa"
60 | 
61 |     toolchest.megahit(
62 |         read_one=[
63 |             "s3://toolchest-integration-tests/megahit/r3_1.fa",
64 |             "s3://toolchest-integration-tests/r1.fastq.gz",
65 |         ],
66 |         read_two=[
67 |             "s3://toolchest-integration-tests/megahit/r3_2.fa",
68 |             "s3://toolchest-integration-tests/r2.fastq.gz",
69 |         ],
70 |         tool_args="--presets meta-large",
71 |         output_path=output_dir_path,
72 |     )
73 | 
74 |     assert os.path.getsize(output_file_path) >= EXPECTED_MIN_OUTPUT_SIZE_TWO_PAIRS
75 | 
-------------------------------------------------------------------------------- /tests/test_metaphlan.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import hash, filter_output
5 | import toolchest_client as toolchest
6 | 
7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
8 | if toolchest_api_key:
9 |     toolchest.set_key(toolchest_api_key)
10 | 
11 | 
12 | @pytest.mark.integration
13 | def test_metaphlan():
14 |     """
15 |     Tests MetaPhlAn with a FASTA file
16 | 
17 |     """
18 | 
19 |     test_dir = "temp_test_metaphlan"
20 |     os.makedirs(f"./{test_dir}", exist_ok=True)
21 |     output_dir_path = f"./{test_dir}"
22 |     toolchest.metaphlan(
23 |         inputs="s3://toolchest-integration-tests/metaphlan/SRS014464-Anterior_nares.fasta.gz",
24 |         output_path=output_dir_path
25 |     )
26 | 
27 |     # MetaPhlAn includes the command in the output, but it contains a non-deterministic UUID, so that line is removed.
28 |     filtered_output_path = os.path.join(output_dir_path, "out_filtered.txt")
29 |     filter_output.filter_regex(
30 |         unfiltered_path=f"{output_dir_path}/out.txt",
31 |         filtered_path=filtered_output_path,
32 |         search_regex="#/usr/local/bin/metaphlan.*\n",
33 |         replacement_str="",
34 |     )
35 |     assert hash.unordered(filtered_output_path) == 1401032462
36 | 
37 |     bowtie2outfile_path = os.path.join(output_dir_path, "SRS014464-Anterior_nares.fasta.gz.bowtie2out.txt")
38 |     assert hash.unordered(bowtie2outfile_path) == 1308716263
39 | 
-------------------------------------------------------------------------------- /tests/test_output.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | import toolchest_client as toolchest
5 | 
6 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
7 | if toolchest_api_key:
8 |     toolchest.set_key(toolchest_api_key)
9 | 
10 | 
11 | @pytest.mark.integration
12 | def test_output_object():
13 |     """
14 |     Verifies that the output object has all the desired parameters.
15 |     """
16 |     test_dir = "temp_test_output_object"
17 |     input_file_s3_uri = "s3://toolchest-integration-tests/small_synthetic_bacteroides_reads.fasta"
18 |     output_dir_path = f"./{test_dir}"
19 |     output_file_path = f"{output_dir_path}/test_output.txt"
20 |     os.makedirs(output_dir_path, exist_ok=True)
21 | 
22 |     toolchest_output = toolchest.test(
23 |         inputs=input_file_s3_uri,
24 |         output_path=output_dir_path
25 |     )
26 | 
27 |     print(toolchest_output)
28 |     assert toolchest_output.tool_name == "test"
29 |     assert toolchest_output.tool_version == "0.1.0"
30 |     assert toolchest_output.database_name is None
31 |     assert toolchest_output.database_version is None
32 |     assert toolchest_output.run_id is not None
33 |     assert toolchest_output.output_path == os.path.abspath(output_dir_path)
34 |     assert toolchest_output.output_file_paths == os.path.abspath(output_file_path)
35 | 
-------------------------------------------------------------------------------- /tests/test_public_uri.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | import toolchest_client as toolchest
5 | 
6 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
7 | if toolchest_api_key:
8 |     toolchest.set_key(toolchest_api_key)
9 | 
10 | 
11 | @pytest.mark.integration
12 | def test_s3_http_input():
13 |     """
14 |     Tests the `test` function with an HTTP input
15 |     """
16 |     test_dir = "temp_test_http"
17 |     os.makedirs(f"./{test_dir}", exist_ok=True)
18 |     input_file_path = "https://toolchest-public-examples-no-encryption.s3.amazonaws.com/example.fastq"
19 |     output_dir_path = f"./{test_dir}"
20 |     output_file_path = f"{output_dir_path}/test_output.txt"
21 | 
22 |     toolchest.test(
23 |         inputs=input_file_path,
24 |         output_path=output_dir_path
25 |     )
26 | 
27 |     with open(output_file_path, "r") as f:
28 |         assert f.read().strip() == "success"
29 | 
30 | 
31 | @pytest.mark.integration
32 | def test_ftp_input():
33 |     """
34 |     Tests the `transfer` function with an FTP input
35 |     """
36 |     test_dir = "temp_test_ftp"
37 |     os.makedirs(f"./{test_dir}", exist_ok=True)
38 |     output_dir_path = f"./{test_dir}"
39 |     output_file_path = f"{output_dir_path}/SRR9990000.fastq.gz"
40 | 
41 |     toolchest.transfer(
42 |         inputs="ftp://ftp.sra.ebi.ac.uk/vol1/fastq//SRR999/000/SRR9990000/SRR9990000.fastq.gz",
43 |         output_path=output_dir_path
44 |     )
45 | 
46 |     assert os.path.getsize(output_file_path) == 11632985
47 | 
-------------------------------------------------------------------------------- /tests/test_python3.py: --------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import pathlib
4 | import sys
5 | 
6 | import docker
7 | import pytest
8 | 
9 | import toolchest_client as toolchest
10 | from toolchest_client.api.instance_type import InstanceType
11 | 
12 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
13 | if toolchest_api_key:
14 |     toolchest.set_key(toolchest_api_key)
15 | 
16 | THIS_FILE_PATH = pathlib.Path(__file__).parent.resolve()
17 | 
18 | 
19 | @pytest.mark.integration
20 | def test_python3():
21 |     """
22 |     Tests the python3 tool's inputs, output, and tool_args
23 | 
24 |     NOTE: streaming is disabled for this test
25 |     """
26 | 
27 |     test_dir = "./temp_test_python3"
28 |     os.makedirs(f"{test_dir}", exist_ok=True)
29 |     toolchest.python3(
30 |         tool_args="./input/example.fastq",
31 |         script="s3://toolchest-integration-tests/write_test.py",
32 |         inputs="s3://toolchest-integration-tests/example.fastq",
33 |         output_path=f"{test_dir}/",
34 |
instance_type=InstanceType.COMPUTE_2, 35 | streaming_enabled=False, 36 | ) 37 | 38 | output_file = open(f"{test_dir}/output.txt", "r") 39 | assert output_file.readline() == "Success" 40 | output_file.close() 41 | 42 | 43 | @pytest.mark.integration 44 | def test_python3_with_docker(): 45 | """ 46 | Tests adding dependencies to python3 via a custom docker image 47 | 48 | Specifically tests matrix multiplication via numpy 49 | """ 50 | client = docker.from_env() 51 | client.images.build( 52 | path=f"{THIS_FILE_PATH}/util/", 53 | dockerfile="numpy_test.Dockerfile", 54 | tag="python3-numpy:3.9", 55 | platform="linux/amd64" 56 | ) 57 | 58 | test_dir = "./temp_test_python3/with_docker" 59 | os.makedirs(f"{test_dir}", exist_ok=True) 60 | toolchest.python3( 61 | script="s3://toolchest-integration-tests/numpy_test.py", 62 | output_path=f"{test_dir}/", 63 | custom_docker_image_id="python3-numpy:3.9", 64 | instance_type="compute-2", 65 | ) 66 | 67 | output_file = open(f"{test_dir}/output.txt", "r") 68 | assert output_file.readline() == "[[ 58 64]\n" 69 | assert output_file.readline() == " [139 154]]" 70 | output_file.close() 71 | 72 | 73 | @pytest.mark.integration 74 | def test_python3_with_public_docker(): 75 | """ 76 | Tests using a public docker image with the write test script 77 | """ 78 | 79 | test_dir = "./temp_test_python3/with_public_docker" 80 | os.makedirs(f"{test_dir}", exist_ok=True) 81 | toolchest.python3( 82 | script="s3://toolchest-integration-tests/write_path.py", 83 | output_path=f"{test_dir}/", 84 | custom_docker_image_id="python:alpine3.16", 85 | ) 86 | 87 | output_file = open(f"{test_dir}/output.txt", "r") 88 | assert output_file.readline() == "['/data/home/ec2-user/input', '/usr/local/lib/python311.zip', " \ 89 | "'/usr/local/lib/python3.11', '/usr/local/lib/python3.11/lib-dynload', " \ 90 | "'/usr/local/lib/python3.11/site-packages']" 91 | output_file.close() 92 | 93 | 94 | @pytest.mark.integration 95 | def test_python3_streaming(): 96 | """ 97 | Tests python3 with output streaming enabled 98 | """ 99 | test_dir = "./temp_test_python3_streaming" 100 | os.makedirs(f"{test_dir}", exist_ok=True) 101 | test_script_path = "tests/util/streaming_script.py" 102 | 103 | # Run with captured stdout 104 | captured_stdout = io.StringIO() 105 | sys.stdout = captured_stdout 106 | toolchest.python3( 107 | script=test_script_path, 108 | output_path=f"{test_dir}/", 109 | instance_type=InstanceType.COMPUTE_2, 110 | streaming_enabled=True, 111 | ) 112 | # Reset stdout capture 113 | sys.stdout = sys.__stdout__ 114 | 115 | # Verify toolchest.python3() output files 116 | with open(f"{test_dir}/output.txt", "r") as output_file: 117 | assert output_file.readline() == "Success" 118 | 119 | # Check printed stdout 120 | stdout_lines = captured_stdout.getvalue().splitlines() 121 | stream_start = stdout_lines.index("==> Begin streamed lines <==") 122 | stream_end = stdout_lines.index("==> End streamed lines <==") 123 | streamed_lines = stdout_lines[stream_start:stream_end + 1] 124 | assert streamed_lines == ["==> Begin streamed lines <==", "0", "1", "2", "3", "4", "==> End streamed lines <=="] 125 | -------------------------------------------------------------------------------- /tests/test_rapsearch2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | 
toolchest.set_key(toolchest_api_key)
10 | 
11 | 
12 | @pytest.mark.integration
13 | def test_rapsearch2():
14 |     """
15 |     Tests rapsearch2 on SeqScreen DB
16 |     """
17 | 
18 |     test_dir = "./temp_test_rapsearch2"
19 |     os.makedirs(f"{test_dir}", exist_ok=True)
20 |     output_file_path_aln = f"./{test_dir}/rapsearch2.aln"
21 |     output_file_path_m8 = f"./{test_dir}/rapsearch2.m8"
22 | 
23 |     toolchest.rapsearch2(
24 |         tool_args="-e 1e-9",
25 |         inputs="s3://toolchest-integration-tests/example.fastq",
26 |         output_path=f"{test_dir}/",
27 |         output_primary_name="rapsearch2",
28 |     )
29 | 
30 |     # m8 output is nondeterministic, so we check file size
31 |     assert 71362000 <= os.path.getsize(output_file_path_m8) <= 71362200
32 | 
33 |     assert 321661100 <= os.path.getsize(output_file_path_aln) <= 321661300
34 |     assert hash.unordered(output_file_path_aln) == 2129168459
35 | 
-------------------------------------------------------------------------------- /tests/test_salmon.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | import toolchest_client as toolchest
5 | 
6 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
7 | if toolchest_api_key:
8 |     toolchest.set_key(toolchest_api_key)
9 | 
10 | 
11 | @pytest.mark.integration
12 | def test_salmon_hg38():
13 |     """
14 |     Tests salmon with a scRNA-Seq FASTQ file
15 | 
16 |     """
17 | 
18 |     test_dir = "temp_test_salmon_hg38"
19 |     os.makedirs(f"./{test_dir}", exist_ok=True)
20 |     output_dir_path = f"./{test_dir}"
21 | 
22 |     toolchest.salmon(
23 |         single_end="s3://toolchest-integration-tests/salmon/SRR2557119_500k.fastq",
24 |         output_path=output_dir_path,
25 |         database_name="salmon_hg38",
26 |         database_version=1,
27 |     )
28 | 
29 |     # Non-deterministic output, so check that the file size is in the expected range
30 |     assert 8143860 <= os.path.getsize(os.path.join(output_dir_path, "quant.sf")) <= 8143880
31 | 
-------------------------------------------------------------------------------- /tests/test_sanity.py: --------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | 
4 | @pytest.mark.integration
5 | def test_sanity():
6 |     assert 1 + 1 == 2
7 | 
-------------------------------------------------------------------------------- /tests/test_shi7.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import s3, hash
5 | import toolchest_client as toolchest
6 | 
7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
8 | if toolchest_api_key:
9 |     toolchest.set_key(toolchest_api_key)
10 | 
11 | # Because shi7 paired-end is non-deterministic, we just make sure it's not equal to the single-end version
12 | SHI7_SINGLE_END_HASH = 1570879637
13 | 
14 | 
15 | @pytest.mark.integration
16 | @pytest.mark.skip(reason="Load reduction for integration tests")
17 | def test_shi7_single_end():
18 |     """
19 |     Tests shi7 with a single R1 input
20 | 
21 |     Note: This test also tests the Output object generated by the shi7() tool call,
22 |     and chaining the shi7 output files depends on how the Output is structured.
23 |     If the Output class is modified, this test should be modified as well.
24 | """ 25 | 26 | test_dir = "./temp_test_shi7_single_end" 27 | os.makedirs(test_dir, exist_ok=True) 28 | input_one_file_path = f"{test_dir}/shi7_input_R1.fastq.gz" 29 | output_file_path = f"{test_dir}/combined_seqs.fna" 30 | 31 | s3.download_integration_test_input( 32 | s3_file_key="sample_r1.fastq.gz", 33 | output_file_path=input_one_file_path, 34 | ) 35 | 36 | output_shi7 = toolchest.shi7( 37 | tool_args="-SE", 38 | inputs=test_dir, 39 | output_path=test_dir, 40 | ) 41 | 42 | # Note: since shi7 produces multiple files, output_shi7.output_path 43 | # should be a list of paths to each unpacked output file. 44 | assert hash.unordered(output_file_path) == SHI7_SINGLE_END_HASH 45 | assert isinstance(output_shi7.output_file_paths, list) 46 | 47 | 48 | @pytest.mark.integration 49 | @pytest.mark.skip(reason="Load reduction for integration tests") 50 | def test_shi7_paired_end(): 51 | """ 52 | Tests shi7 with paired-end inputs 53 | 54 | Unfortunately, shi7 is non-deterministic. This means we can't check a hash. 55 | As a means of having some level of guarantee, we check the output file size instead. 56 | 57 | Because of this, we should not recommend shi7 for use. 58 | """ 59 | 60 | test_dir = "./temp_test_shi7_paired_end" 61 | os.makedirs(test_dir, exist_ok=True) 62 | input_one_file_path = f"{test_dir}/shi7_input_R1.fastq.gz" 63 | input_two_file_path = f"{test_dir}/shi7_input_R2.fastq.gz" 64 | output_file_path = f"{test_dir}/combined_seqs.fna" 65 | 66 | s3.download_integration_test_input( 67 | s3_file_key="sample_r1.fastq.gz", 68 | output_file_path=input_one_file_path, 69 | ) 70 | s3.download_integration_test_input( 71 | s3_file_key="sample_r2.fastq.gz", 72 | output_file_path=input_two_file_path, 73 | ) 74 | 75 | toolchest.shi7( 76 | inputs=test_dir, 77 | output_path=test_dir, 78 | ) 79 | 80 | # Because shi7 paired-end is non-deterministic, we just make sure it's not equal to the single-end version 81 | assert hash.unordered(output_file_path) != SHI7_SINGLE_END_HASH 82 | -------------------------------------------------------------------------------- /tests/test_shogun.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import s3, hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | @pytest.mark.skip(reason="Load reduction for integration tests") 14 | def test_shogun_filter_and_align(): 15 | """ 16 | Tests shogun (filter and align for simplicity) with a single R1 input 17 | """ 18 | 19 | test_dir = "./temp_test_shogun_filter_and_align" 20 | os.makedirs(f"{test_dir}", exist_ok=True) 21 | input_file_path = f"{test_dir}/combined_seqs_unfiltered.fna" 22 | output_file_path_filter = f"{test_dir}/combined_seqs.filtered.fna" 23 | output_file_path_align = f"{test_dir}/alignment.bowtie2.sam" 24 | 25 | s3.download_integration_test_input( 26 | s3_file_key="combined_seqs_unfiltered.fna", 27 | output_file_path=input_file_path, 28 | is_private=True, 29 | ) 30 | 31 | toolchest.shogun_filter( 32 | tool_args="--alignment True", 33 | inputs=input_file_path, 34 | output_path=test_dir, 35 | ) 36 | 37 | assert hash.unordered(output_file_path_filter) == 510167908 38 | 39 | toolchest.shogun_align( 40 | tool_args="", 41 | inputs=output_file_path_filter, 42 | output_path=test_dir, 43 | ) 44 | assert hash.unordered(output_file_path_align) == 1952162202 45 | 
-------------------------------------------------------------------------------- /tests/test_star.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import s3, hash, filter_output 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 14 | def test_star_grch38(provider): 15 | """ 16 | Tests STAR against the grch38 database 17 | """ 18 | test_dir = "temp_test_star_grch38" 19 | os.makedirs(f"./{test_dir}", exist_ok=True) 20 | input_file_path = "./small_star.fastq" 21 | output_dir_path = f"./{test_dir}" 22 | output_file_path = f"{output_dir_path}/Aligned.out.sam" 23 | filtered_output_file_path = f"{output_dir_path}/Aligned.filtered.out.sam" 24 | 25 | s3.download_integration_test_input( 26 | s3_file_key="small_star_500k.fastq", 27 | output_file_path=input_file_path, 28 | is_private=True, 29 | ) 30 | 31 | toolchest.STAR( 32 | read_one=input_file_path, 33 | output_path=output_dir_path, 34 | database_name="GRCh38", 35 | provider=provider 36 | ) 37 | 38 | # Because STAR output contains run ID (non-deterministic), verify that the number of bytes is in range 39 | assert 185952700 <= os.path.getsize(output_file_path) <= 185952900 # expected size 185952796 40 | 41 | # Filter non-deterministic metadata lines 42 | filter_output.filter_sam(output_file_path, filtered_output_file_path) 43 | assert hash.unordered(filtered_output_file_path) == 2099424598 44 | 45 | 46 | @pytest.mark.integration 47 | @pytest.mark.skip(reason="Pysam removed so parallelization is disabled until a new sam file merger is written or found") 48 | def test_star_grch38_parallel(): 49 | """ 50 | Tests STAR against the grch38 database, using parallel mode 51 | """ 52 | test_dir = "temp_test_star_grch38_parallel" 53 | os.makedirs(f"./{test_dir}", exist_ok=True) 54 | input_file_path = "./large_star.fastq" 55 | output_dir_path = f"./{test_dir}" 56 | output_file_path = f"{output_dir_path}/Aligned.out.sam" 57 | 58 | s3.download_integration_test_input( 59 | s3_file_key="large_star_15GB.fastq", 60 | output_file_path=input_file_path, 61 | is_private=True, 62 | ) 63 | 64 | toolchest.STAR( 65 | read_one=input_file_path, 66 | output_path=output_file_path, 67 | database_name="GRCh38", 68 | parallelize=True, 69 | ) 70 | 71 | # Because STAR output contains run ID (non-deterministic), verify that the number of bytes is in range 72 | # TODO: verify new file size with dockerized STAR after re-enabling parallelization 73 | # TODO: add a hash test of output file without @PG and @CO lines 74 | assert 33292990718 <= os.path.getsize(output_file_path) <= 33292994718 75 | 76 | 77 | @pytest.mark.integration 78 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 79 | def test_star_grch38_dangerous_arg(provider): 80 | """ 81 | Tests STAR against the grch38 database, with a dangerous arg (changing functionality) 82 | """ 83 | test_dir = "temp_test_star_grch38" 84 | os.makedirs(f"./{test_dir}", exist_ok=True) 85 | input_file_path = "./small_star.fastq" 86 | output_dir_path = f"./{test_dir}" 87 | output_file_path = f"{output_dir_path}/Aligned.out.bam" 88 | 89 | s3.download_integration_test_input( 90 | s3_file_key="small_star_500k.fastq", 91 | output_file_path=input_file_path, 92 | is_private=True, 93 | ) 94 | 95 | toolchest.STAR( 96 | read_one=input_file_path, 97 
| output_path=output_dir_path, 98 | database_name="GRCh38", 99 | tool_args="--outSAMtype BAM Unsorted", 100 | parallelize=True, # this should be deliberately ignored 101 | provider=provider, 102 | ) 103 | 104 | # Because STAR output contains run ID (non-deterministic) and BAMs are compressed, 105 | # verify that the number of bytes is in range 106 | assert 38236000 <= os.path.getsize(output_file_path) <= 38236100 # expected size 38236044 107 | 108 | # Make sure all non-parallel files exist as well 109 | assert os.path.isfile(f"{output_dir_path}/Log.final.out") 110 | assert os.path.isfile(f"{output_dir_path}/Log.out") 111 | assert os.path.isfile(f"{output_dir_path}/Log.progress.out") 112 | assert os.path.isfile(f"{output_dir_path}/SJ.out.tab") 113 | -------------------------------------------------------------------------------- /tests/test_transfer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | import toolchest_client as toolchest 5 | 6 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 7 | if toolchest_api_key: 8 | toolchest.set_key(toolchest_api_key) 9 | 10 | 11 | @pytest.mark.integration 12 | def test_transfer_http(): 13 | """ 14 | Tests transfer function with an http input 15 | """ 16 | test_dir = "temp_test_transfer_http" 17 | os.makedirs(f"./{test_dir}", exist_ok=True) 18 | output_dir_path = f"./{test_dir}" 19 | output_file_path = f"{output_dir_path}/P48754.fasta" 20 | 21 | toolchest.transfer( 22 | inputs="https://rest.uniprot.org/uniprotkb/P48754.fasta", 23 | output_path=output_dir_path 24 | ) 25 | 26 | with open(output_file_path, "r") as f: 27 | assert f.read().startswith(">sp|P48754|BRCA1_MOUSE") 28 | -------------------------------------------------------------------------------- /tests/test_unicycler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | def test_unicycler(): 14 | """ 15 | Tests Unicycler 16 | """ 17 | 18 | test_dir = "temp_test_unicycler" 19 | os.makedirs(f"./{test_dir}", exist_ok=True) 20 | output_dir_path = f"./{test_dir}/" 21 | 22 | toolchest.unicycler( 23 | output_path=output_dir_path, 24 | read_one="s3://toolchest-integration-tests/r1.fastq.gz", 25 | read_two="s3://toolchest-integration-tests/r2.fastq.gz", 26 | long_reads="s3://toolchest-integration-tests/long_reads.fasta.gz" 27 | ) 28 | 29 | assert hash.unordered(f"{output_dir_path}assembly.fasta") == 882369120 30 | -------------------------------------------------------------------------------- /tests/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/tests/util/__init__.py -------------------------------------------------------------------------------- /tests/util/filter_output.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def filter_sam(unfiltered_path, filtered_path): 5 | """ 6 | Filters out non-deterministic metadata lines from a SAM output file. 
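Aligners like STAR write run-specific @PG and @CO header lines, which this drops;
all other lines are copied through unchanged. Example (paths are illustrative):

    filter_sam("Aligned.out.sam", "Aligned.filtered.out.sam")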
7 | """ 8 | with open(filtered_path, "w") as outfile: 9 | with open(unfiltered_path, "r") as infile: 10 | outfile.writelines([line for line in infile if not line.startswith("@PG") and not line.startswith("@CO")]) 11 | 12 | 13 | def filter_regex(unfiltered_path, filtered_path, search_regex, replacement_str): 14 | """ 15 | Filters out non-deterministic metadata lines from a SAM output file. 16 | """ 17 | with open(filtered_path, "w") as outfile: 18 | with open(unfiltered_path, "r") as infile: 19 | for line in infile: 20 | outfile.write(re.sub(search_regex, replacement_str, line)) 21 | -------------------------------------------------------------------------------- /tests/util/hash.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import zlib 3 | 4 | 5 | def unordered(file_path): 6 | """ 7 | Generates a hash of an ASCII-encoded file, not impacted by line order. 8 | Not anywhere near cryptographically secure. 9 | """ 10 | file_hash = 1 11 | eighth_mersenne_prime = 2147483647 12 | with open(file_path) as file: 13 | print("Hashing", file_path) 14 | for line in file: 15 | file_hash = (zlib.adler32(line.encode()) * file_hash) % eighth_mersenne_prime 16 | print("Hash is", file_hash) 17 | return file_hash 18 | 19 | 20 | def binary_hash(file_path): 21 | """ 22 | Generates an MD5 hash of a binary file. 23 | """ 24 | with open(file_path, "rb") as file: 25 | print("Hashing", file_path) 26 | file_hash = hashlib.md5(file.read()).hexdigest() 27 | print("Hash is", file_hash) 28 | return file_hash 29 | -------------------------------------------------------------------------------- /tests/util/numpy_test.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | RUN pip install numpy 3 | -------------------------------------------------------------------------------- /tests/util/s3.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | 4 | # Downloads input from S3 to local file path. 5 | # Used for tests that upload local inputs. 
6 | def download_integration_test_input(s3_file_key, output_file_path, is_private=False): 7 | s3_client = boto3.client('s3') 8 | bucket_name = 'toolchest-integration-tests-private' if is_private else 'toolchest-integration-tests' 9 | s3_client.download_file(bucket_name, s3_file_key, output_file_path) 10 | -------------------------------------------------------------------------------- /tests/util/streaming_script.py: -------------------------------------------------------------------------------- 1 | # For use with tests in test_python3.py 2 | import time 3 | print("==> Begin streamed lines <==") 4 | for number in range(5): 5 | print(number) 6 | time.sleep(1) 7 | print("==> End streamed lines <==") 8 | with open("./output/output.txt", "w") as f: 9 | f.write("Success") 10 | -------------------------------------------------------------------------------- /toolchest_client/__init__.py: -------------------------------------------------------------------------------- 1 | import builtins 2 | from dotenv import load_dotenv, find_dotenv 3 | import functools 4 | 5 | from .logging import setup_logging 6 | 7 | # set __version__ module 8 | try: 9 | # importlib.metadata is present in Python 3.8 and later 10 | import importlib.metadata as importlib_metadata 11 | except ImportError: 12 | import importlib_metadata as importlib_metadata 13 | try: 14 | __version__ = importlib_metadata.version(__package__ or __name__) 15 | except importlib_metadata.PackageNotFoundError: 16 | __version__ = None 17 | 18 | # .env load must be before imports that use environment variables 19 | load_dotenv(find_dotenv(".env")) 20 | 21 | # specifying print flushing is necessary to support loading from R 22 | builtins.print = functools.partial(print, flush=True) 23 | 24 | # configure logger 25 | setup_logging() 26 | 27 | from toolchest_client.api.auth import get_key, set_key 28 | from toolchest_client.api.download import download 29 | from toolchest_client.api.exceptions import ToolchestException, DataLimitError, ToolchestJobError, \ 30 | ToolchestDownloadError 31 | from toolchest_client.api.query import Query 32 | from toolchest_client.api.status import Status, get_status 33 | from toolchest_client.api.urls import get_api_url, set_api_url 34 | from .tools.api import add_database, alphafold, blastn, bowtie2, bracken, cellranger_count, centrifuge, clustalo, \ 35 | demucs, diamond_blastp, diamond_blastx, fastqc, humann3, jupyter, kallisto, kraken2, lastal5, lug, megahit, \ 36 | metaphlan, python3, rapsearch, rapsearch2, salmon, shi7, shogun_align, shogun_filter, STAR, test, transfer, \ 37 | unicycler, update_database 38 | -------------------------------------------------------------------------------- /toolchest_client/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/toolchest_client/api/__init__.py -------------------------------------------------------------------------------- /toolchest_client/api/auth.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.auth 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module contains functions for configuring the Toolchest API key. 
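The key lives in the TOOLCHEST_KEY environment variable: set_key() writes it there
(from a raw string or from a file containing only the key), and get_key() reads it
back. A minimal sketch (the key value is illustrative):

    >>> import toolchest_client as toolchest
    >>> toolchest.set_key("YOUR_KEY_HERE")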
6 | 7 | """ 8 | from loguru import logger 9 | import os 10 | import sys 11 | 12 | import requests 13 | from requests.exceptions import HTTPError 14 | 15 | from toolchest_client.api.exceptions import ToolchestKeyError 16 | from toolchest_client.api.urls import get_api_url 17 | 18 | 19 | def get_key(): 20 | """Retrieves the Toolchest API key, if it is set.""" 21 | 22 | try: 23 | key = os.environ["TOOLCHEST_KEY"] 24 | except KeyError as e: 25 | logger.error("Key not found. Please set environment variable TOOLCHEST_KEY to your Toolchest API key.") 26 | logger.error("Function call:") 27 | logger.error(" toolchest_client.set_key(YOUR_KEY_HERE)") 28 | return e 29 | return key 30 | 31 | 32 | def set_key(key): 33 | """Sets the Toolchest auth key (env var TOOLCHEST_KEY) to the given value. 34 | 35 | :param key: key value (str) or path to file containing key. If given a filename, 36 | the file must consist of only the key itself. 37 | 38 | Usage:: 39 | 40 | >>> import toolchest_client as toolchest 41 | >>> toolchest.set_key(YOUR_KEY_HERE) 42 | 43 | """ 44 | 45 | if os.path.isfile(key): 46 | with open(key, "r") as f: 47 | os.environ["TOOLCHEST_KEY"] = f.read().strip() 48 | else: 49 | os.environ["TOOLCHEST_KEY"] = key 50 | 51 | 52 | def validate_key(): 53 | """Validates Toolchest API key, retrieved from get_key().""" 54 | 55 | validation_response = requests.get( 56 | get_api_url(), 57 | headers=get_headers(), 58 | ) 59 | try: 60 | validation_response.raise_for_status() 61 | except HTTPError: 62 | error_message = "Invalid Toolchest auth key. Please check the key value or contact Toolchest." 63 | logger.error(error_message, file=sys.stderr) 64 | raise ToolchestKeyError(error_message) from None 65 | 66 | 67 | def get_headers(): 68 | """Returns headers for Toolchest API calls.""" 69 | return {"Authorization": f"Key {get_key()}"} 70 | -------------------------------------------------------------------------------- /toolchest_client/api/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.exceptions 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module contains custom exceptions used for the Toolchest client. 6 | """ 7 | 8 | 9 | class ToolchestException(OSError): 10 | """There was an unknown exception that occurred during your 11 | Toolchest job. 
12 | """ 13 | 14 | 15 | class ToolchestKeyError(ToolchestException): 16 | """Invalid Toolchest auth key.""" 17 | 18 | 19 | class ToolchestS3AccessError(ToolchestException): 20 | """S3 input cannot be accessed by Toolchest.""" 21 | 22 | 23 | class ToolchestDownloadError(ToolchestException): 24 | """An error occurred when downloading files from Toolchest.""" 25 | 26 | 27 | class DataLimitError(ToolchestException): 28 | """Data limit for Toolchest exceeded.""" 29 | 30 | 31 | class ToolchestJobError(ToolchestException): 32 | """An error occurred when running the job instance.""" 33 | -------------------------------------------------------------------------------- /toolchest_client/api/instance_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class InstanceType(Enum): 5 | # Compute optimized, lists vCPUs, 1:2 vCPU to RAM ratio 6 | COMPUTE_2 = "compute-2" 7 | COMPUTE_4 = "compute-4" 8 | COMPUTE_8 = "compute-8" 9 | COMPUTE_16 = "compute-16" 10 | COMPUTE_32 = "compute-32" 11 | COMPUTE_48 = "compute-48" 12 | COMPUTE_64 = "compute-64" 13 | COMPUTE_96 = "compute-96" 14 | # General optimized, lists vCPUs, 1:4 vCPU to RAM ratio 15 | GENERAL_2 = "general-2" 16 | GENERAL_4 = "general-4" 17 | GENERAL_8 = "general-8" 18 | GENERAL_16 = "general-16" 19 | GENERAL_32 = "general-32" 20 | GENERAL_48 = "general-48" 21 | GENERAL_64 = "general-64" 22 | GENERAL_96 = "general-96" 23 | # Memory optimized, lists memory, 1:8 vCPU to RAM ratio 24 | MEMORY_16 = "memory-16" 25 | MEMORY_32 = "memory-32" 26 | MEMORY_64 = "memory-64" 27 | MEMORY_128 = "memory-128" 28 | MEMORY_256 = "memory-256" 29 | MEMORY_384 = "memory-384" 30 | MEMORY_512 = "memory-512" 31 | # GPU instances, lists core GPU type 32 | GPU_V100 = "gpu-V100" 33 | -------------------------------------------------------------------------------- /toolchest_client/api/output.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.output 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module provides an Output object returned by any completed queries 6 | made by Toolchest tools. The output object contains information about 7 | where the output file can be located. 8 | 9 | Note: The Output object does NOT represent the contents of any of the 10 | tool output files themselves. 11 | """ 12 | 13 | from toolchest_client.api.download import download 14 | from toolchest_client.api.status import get_status as get_api_status 15 | 16 | 17 | class Output: 18 | """A Toolchest query output. 19 | 20 | Provides information about location of output file(s), both locally 21 | (if downloaded) and in the cloud. 
22 | 23 | """ 24 | 25 | def __init__(self, s3_uri=None, output_path=None, run_id=None): 26 | self.tool_name = None 27 | self.tool_version = None 28 | self.database_name = None 29 | self.database_version = None 30 | self.s3_uri = s3_uri 31 | self.output_path = output_path 32 | self.output_file_paths = None 33 | self.run_id = run_id 34 | self.last_status = None 35 | 36 | def __repr__(self): 37 | return str(self.__dict__) 38 | 39 | def __str__(self): 40 | return str(self.__dict__) 41 | 42 | def set_run_id(self, run_id): 43 | self.run_id = run_id 44 | 45 | def set_s3_uri(self, s3_uri): 46 | self.s3_uri = s3_uri 47 | 48 | def set_output_path(self, output_path, output_file_paths=None): 49 | self.output_path = output_path 50 | self.output_file_paths = output_file_paths 51 | 52 | def set_tool(self, tool_name=None, tool_version=None): 53 | """Sets the tool name and tool version for ensuring versioning and reproducibility.""" 54 | self.tool_name = tool_name 55 | self.tool_version = tool_version 56 | 57 | def set_database(self, database_name=None, database_version=None): 58 | """Sets the database name and database version for ensuring versioning and reproducibility. 59 | 60 | `database_version` increments when updating a database through the API. 61 | """ 62 | self.database_name = database_name 63 | self.database_version = database_version 64 | 65 | def download(self, output_path=None, output_dir=None, skip_decompression=False): 66 | if not output_path: 67 | if not output_dir: 68 | raise ValueError("Output destination directory (output_path) must be specified.") 69 | output_path = output_dir # backwards compatibility for old calls 70 | 71 | self.output_file_paths = download( 72 | output_path=output_path, 73 | s3_uri=self.s3_uri, 74 | run_id=self.run_id, 75 | skip_decompression=skip_decompression, 76 | ) 77 | return self.output_file_paths 78 | 79 | def refresh_status(self, **kwargs): 80 | self.last_status = get_api_status(self.run_id, **kwargs) 81 | 82 | def get_status(self, **kwargs): 83 | """ 84 | Returns the status of a run 85 | """ 86 | if not self.run_id: 87 | raise ValueError("Cannot get status on an output that has no run_id") 88 | self.refresh_status(**kwargs) 89 | return self.last_status 90 | -------------------------------------------------------------------------------- /toolchest_client/api/status.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.status 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module contains a function to check pipeline_segment_instance statuses and status enums. 6 | """ 7 | 8 | from enum import Enum 9 | 10 | 11 | def get_status(run_id, **kwargs): 12 | """Returns the status of the Toolchest run. 13 | 14 | Call this less than once a second to avoid being rate-limited. 15 | 16 | :param run_id: the ID returned by a tool. Internally, this ID is the pipeline_segment_instance_id. 17 | """ 18 | from toolchest_client.api.query import Query # local import to avoid circular dependency 19 | 20 | query = Query( 21 | is_async=True, 22 | pipeline_segment_instance_id=run_id, 23 | ) 24 | 25 | return query.get_job_status(**kwargs) 26 | 27 | 28 | class Status(str, Enum): 29 | """Status values for the Toolchest API.""" 30 | 31 | # NOTE: These statuses aren't currently being used with threading. 32 | # All status updates are encapsulated in the statuses of the threads. 
33 | INITIALIZED = "initialized" 34 | TRANSFERRING_FROM_CLIENT = "transferring_from_client" 35 | TRANSFERRED_FROM_CLIENT = "transferred_from_client" 36 | AWAITING_EXECUTION = "awaiting_execution" 37 | BEGINNING_EXECUTION = "beginning_execution" 38 | EXECUTING = "executing" 39 | READY_TO_TRANSFER_TO_CLIENT = "ready_to_transfer_to_client" 40 | TRANSFERRING_TO_CLIENT = "transferring_to_client" 41 | TRANSFERRED_TO_CLIENT = "transferred_to_client" 42 | TERMINATED = "terminated" 43 | COMPLETE = "complete" 44 | FAILED = "failed" 45 | 46 | 47 | class PrettyStatus(str, Enum): 48 | """Status values for local threads""" 49 | 50 | INITIALIZING = "initializing" 51 | INITIALIZED = "initialized" 52 | UPLOADING = "uploading" 53 | EXECUTING = "executing" 54 | DOWNLOADING = "downloading" 55 | COMPLETE = "complete" 56 | INTERRUPTING = "interrupting" 57 | TERMINATED = "terminated" 58 | FAILED = "failed" 59 | -------------------------------------------------------------------------------- /toolchest_client/api/streaming.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.streaming 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module provides a StreamingClient object, used by Toolchest queries to 6 | receive and print output lines streamed from the Toolchest server. 7 | """ 8 | import asyncio 9 | from loguru import logger 10 | import ssl 11 | import sys 12 | 13 | import websockets 14 | from websockets.exceptions import ConnectionClosed 15 | 16 | 17 | class StreamingClient: 18 | """A Toolchest output stream client. 19 | 20 | Provides an interface to output lines streamed from the server. 21 | 22 | """ 23 | 24 | def __init__(self): 25 | self.ssl_context = None 26 | self.streaming_token = None 27 | self.streaming_ip_address = None 28 | self.streaming_tls_cert = None 29 | self.initialized = False 30 | self.ready_to_start = False 31 | self.stream_is_open = False 32 | 33 | def initialize_params(self, streaming_token, streaming_ip_address, streaming_tls_cert): 34 | self.streaming_token = streaming_token 35 | self.streaming_ip_address = streaming_ip_address 36 | self.streaming_tls_cert = streaming_tls_cert 37 | 38 | ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) 39 | ssl_context.load_verify_locations(cadata=self.streaming_tls_cert) 40 | self.ssl_context = ssl_context 41 | self.ready_to_start = True 42 | self.initialized = True 43 | 44 | async def receive_stream(self): 45 | streaming_username = "toolchest" 46 | streaming_port = "8765" 47 | uri = f"wss://{streaming_username}:{self.streaming_token}@{self.streaming_ip_address}:{streaming_port}" 48 | logger.info("Connecting to remote server for streaming...") 49 | sys.stdout.flush() 50 | retry_count = 0 51 | while True: 52 | try: 53 | async for websocket in websockets.connect(uri, ssl=self.ssl_context): 54 | logger.debug("Connected!") 55 | try: 56 | self.stream_is_open = True 57 | while self.stream_is_open: 58 | stream_lines = await websocket.recv() 59 | # Not using logger here, because I couldn't get formatting right 60 | print(stream_lines, end="") 61 | except ConnectionClosed: 62 | self.stream_is_open = False 63 | logger.debug("\nConnection closed by server.") 64 | return 65 | except ConnectionRefusedError: 66 | retry_count += 1 67 | if retry_count > 3: 68 | raise RuntimeError("Can't connect to server. 
Try disabling output streaming and re-running.") 69 | else: 70 | continue 71 | 72 | def stream(self): 73 | self.ready_to_start = False 74 | try: 75 | loop = asyncio.get_running_loop() 76 | except RuntimeError: 77 | loop = None 78 | 79 | if loop and loop.is_running(): 80 | raise ValueError("Output streaming cannot be enabled within a running asyncio event loop.") 81 | else: 82 | asyncio.run(self.receive_stream()) 83 | -------------------------------------------------------------------------------- /toolchest_client/api/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.urls 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module serves as a single source for URLs used in 6 | Toolchest queries and API calls. 7 | """ 8 | 9 | import os 10 | 11 | 12 | def get_api_url(): 13 | """Retrieves the base URL for the Toolchest server API. Defaults to "https://api.toolche.st" 14 | if a custom API URL is not set. 15 | """ 16 | # Note: BASE_URL is checked for backwards compatibility. 17 | return os.environ.get("TOOLCHEST_API_URL", os.environ.get("BASE_URL", "https://api.toolche.st")) 18 | 19 | 20 | def get_pipeline_segment_instances_url(): 21 | """Retrieves the Toolchest API Route for pipeline segment instances. Used internally.""" 22 | PIPELINE_SEGMENT_INSTANCES_ROUTE = "/pipeline-segment-instances" 23 | return get_api_url() + PIPELINE_SEGMENT_INSTANCES_ROUTE 24 | 25 | 26 | def get_s3_metadata_url(): 27 | """Retrieves the Toolchest API Route for S3 metadata. Used internally.""" 28 | S3_ROUTE = "/s3" 29 | S3_URL = get_api_url() + S3_ROUTE 30 | return S3_URL + "/metadata" 31 | 32 | 33 | def set_api_url(custom_api_url=None): 34 | """Sets the Toolchest API URL (env var TOOLCHEST_API_URL) to the given value. 35 | If a URL is not provided, resets to the default Toolchest API URL. 36 | 37 | :param custom_api_url: Custom API URL. Any trailing slashes should be removed. 
38 | 39 | Usage:: 40 | 41 | >>> import toolchest_client as toolchest 42 | >>> toolchest.set_api_url("http://your.custom.api.url.here") 43 | 44 | """ 45 | if custom_api_url: 46 | os.environ["TOOLCHEST_API_URL"] = custom_api_url 47 | else: 48 | os.environ.pop("TOOLCHEST_API_URL", None) 49 | -------------------------------------------------------------------------------- /toolchest_client/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/toolchest_client/cli/__init__.py -------------------------------------------------------------------------------- /toolchest_client/cli/cli.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | import toolchest_client.cli.kraken2 as kraken2 4 | import toolchest_client.cli.test as test 5 | 6 | 7 | app = typer.Typer() 8 | 9 | # Apparently this is not recommended, but it allows each tool to have its own file and maintains readability 10 | app.command()(kraken2.kraken2) 11 | app.command()(test.test) 12 | 13 | if __name__ == "__main__": 14 | app() 15 | -------------------------------------------------------------------------------- /toolchest_client/cli/test.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import typer 4 | 5 | from toolchest_client.tools import Test 6 | 7 | app = typer.Typer() 8 | 9 | 10 | @app.command() 11 | def test( 12 | inputs: List[str], 13 | output_path: str = typer.Option(None, help='Sets the directory where the success file will be downloaded'), 14 | is_async: bool = typer.Option(False, '--is_async', help='Executes the Toolchest job as an async job') 15 | ): 16 | """ 17 | Confirms that you are able to run toolchest 18 | """ 19 | test_instance = Test( 20 | tool_args='', 21 | output_name='output.tar.gz', 22 | inputs=inputs, 23 | output_path=output_path, 24 | is_async=is_async, 25 | ) 26 | test_instance.run() 27 | 28 | 29 | if __name__ == "__main__": 30 | app() 31 | -------------------------------------------------------------------------------- /toolchest_client/files/__init__.py: -------------------------------------------------------------------------------- 1 | from .general import assert_exists, check_file_size, files_in_path, sanity_check, compress_files_in_path, \ 2 | convert_input_params_to_prefix_mapping 3 | from .merge import concatenate_files, merge_sam_files 4 | from .s3 import assert_accessible_s3, get_s3_file_size, get_params_from_s3_uri, path_is_s3_uri 5 | from .split import open_new_output_file, split_file_by_lines, split_paired_files_by_lines 6 | from .unpack import OutputType, unpack_files 7 | from .public_uris import get_url_with_protocol, path_is_http_url, path_is_accessible_ftp_url 8 | -------------------------------------------------------------------------------- /toolchest_client/files/merge.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.files.merge 3 | ~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | Functions for merging files 6 | """ 7 | 8 | import multiprocessing 9 | import shutil 10 | 11 | 12 | def concatenate_files(input_file_paths, output_file_path): 13 | """Concatenates a list of files using shutil. 14 | 15 | :param input_file_paths: Paths to the files which are to be concatenated. 16 | :param output_file_path: Path to the merged output file.
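Files are concatenated in list order. Example (file names are illustrative):

    concatenate_files(["reads_part1.fastq", "reads_part2.fastq"], "reads.fastq")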
17 | """ 18 | with open(output_file_path, "wb") as output_file: 19 | for input_file_path in input_file_paths: 20 | input_file = open(input_file_path, "rb") 21 | shutil.copyfileobj(input_file, output_file) 22 | input_file.close() 23 | 24 | 25 | def merge_sam_files(input_file_paths, output_file_path): 26 | """Merges SAM files – the output for tools like STAR – using samtools. 27 | 28 | :param input_file_paths: Paths to the files which are to be merged with samtools. 29 | :param output_file_path: Path to the merged output file. 30 | """ 31 | # Only import pysam – an optional dependency – if absolutely needed 32 | import pysam 33 | 34 | # This cause problems if run on a shared machine with non-available cores 35 | num_cores = multiprocessing.cpu_count() 36 | 37 | # Options for merging SAM files: 38 | # -f: force overwrite output file 39 | # -o: specify output manually 40 | # -u: write output as an uncompressed SAM 41 | # -c: combine headers when they exist in both files 42 | # -p: merge @PG IDs 43 | # --threads: number of threads 44 | pysam.merge( 45 | "-f", 46 | "-u", 47 | "-c", 48 | "-p", 49 | "--threads", 50 | f"{num_cores}", 51 | output_file_path, 52 | *input_file_paths 53 | ) 54 | -------------------------------------------------------------------------------- /toolchest_client/files/public_uris.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.files.public_uris 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | Functions for handling files given by HTTP / HTTPS / FTP URIs. 6 | """ 7 | from ftplib import FTP 8 | from urllib.parse import urlparse 9 | from urllib3.exceptions import LocationParseError 10 | 11 | import requests 12 | from requests.exceptions import HTTPError, InvalidURL, InvalidSchema 13 | 14 | 15 | def get_url_with_protocol(url): 16 | """Returns URL with `http://` prepended, if a protocol is not specified. 17 | 18 | :param url: An input URL. 19 | """ 20 | parsed_url = urlparse(url) 21 | if not parsed_url.scheme: 22 | url = "http://" + url 23 | return url 24 | 25 | 26 | def path_is_http_url(path): 27 | """Returns whether the given path is an accessible URL by sending a GET request for the first byte. 28 | 29 | :param path: An input path. 30 | """ 31 | try: 32 | response = requests.get(path, headers={"Range": "bytes=0-0"}) 33 | response.raise_for_status() 34 | return len(response.content) == 1 35 | except (InvalidURL, HTTPError, InvalidSchema, LocationParseError, UnicodeError, Exception): 36 | return False 37 | 38 | 39 | def path_is_accessible_ftp_url(path): 40 | """Returns whether the given path is an accessible URL by sending a HEAD request. 41 | 42 | :param path: An input path. 43 | """ 44 | if path.startswith("ftp://"): 45 | file_size = get_ftp_url_file_size(path) 46 | return file_size > 0 47 | return False 48 | 49 | 50 | def get_ftp_url_file_size(url): 51 | """Returns file size of an accessible FTP URL, via SIZE command. 52 | 53 | :param url: An input URL. 
54 | """ 55 | parsed_url = urlparse(url) 56 | with FTP(parsed_url.netloc) as ftp: 57 | ftp.login() 58 | return ftp.size(parsed_url.path) 59 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/toolchest_client/files/tests/__init__.py -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/eight_line.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.1 BOGUS length=121 2 | ACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGN 3 | +test.fastq.1 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | @test.fastq.2 BOGUS length=121 6 | TGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGAT 7 | +test.fastq.2 BOGUS length=121 8 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 9 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/eight_line_split_one.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.1 BOGUS length=121 2 | ACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGN 3 | +test.fastq.1 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/eight_line_split_two.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.2 BOGUS length=121 2 | TGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGAT 3 | +test.fastq.2 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/paired_end/eight_line_R1.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.1 BOGUS length=121 2 | ACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGN 3 | +test.fastq.1 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | @test.fastq.2 BOGUS length=121 6 | TGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGAT 7 | +test.fastq.2 BOGUS length=121 8 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 9 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/paired_end/eight_line_R2.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.1 BOGUS length=121 2 | 
ACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGN 3 | +test.fastq.1 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | @test.fastq.2 BOGUS length=121 6 | TGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGAT 7 | +test.fastq.2 BOGUS length=121 8 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 9 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/very_small_file.txt: -------------------------------------------------------------------------------- 1 | A 2 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/test_general.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | import pytest 5 | 6 | from .. import assert_exists, check_file_size, files_in_path, sanity_check, convert_input_params_to_prefix_mapping 7 | 8 | THIS_FILE_PATH = os.path.normpath(pathlib.Path(__file__).parent.resolve()) 9 | 10 | 11 | def test_small_file(): 12 | small_file_path = f"{THIS_FILE_PATH}/data/very_small_file.txt" 13 | with pytest.raises(ValueError): 14 | sanity_check(small_file_path) 15 | 16 | 17 | def test_files_in_path(): 18 | tmp_dir = f"{THIS_FILE_PATH}/tmp" 19 | tmp1 = f"{tmp_dir}/tmp1" 20 | tmp2 = f"{tmp_dir}/tmp2" 21 | sub_dir = f"{tmp_dir}/sub_dir" 22 | tmp3 = f"{sub_dir}/tmp3" 23 | file_paths = [tmp1, tmp2, tmp3] 24 | os.makedirs(sub_dir, exist_ok=True) 25 | for file in file_paths: 26 | open(file, "w").close() 27 | file_paths = sorted([os.path.normpath(x) for x in file_paths]) 28 | 29 | assert sorted([os.path.normpath(x) for x in files_in_path(tmp_dir)]) == file_paths 30 | 31 | for file in file_paths: 32 | os.remove(file) 33 | os.removedirs(sub_dir) 34 | 35 | 36 | def test_file_too_large(): 37 | with pytest.raises(ValueError): 38 | check_file_size(f"{THIS_FILE_PATH}/data/eight_line.fastq", max_size_bytes=100) 39 | 40 | 41 | def test_nonexistent_file(): 42 | bogus_file_path = f"{THIS_FILE_PATH}/data/bogus_file_path" 43 | with pytest.raises(FileNotFoundError): 44 | assert_exists(bogus_file_path) 45 | 46 | 47 | def test_exists_but_not_file(): 48 | dir_file_path = f"{THIS_FILE_PATH}/data" 49 | with pytest.raises(ValueError): 50 | assert_exists(dir_file_path, must_be_file=True) 51 | 52 | 53 | def test_generate_prefix_mapping(): 54 | tag_to_param_map = { 55 | "-1": ["example1_R1.fastq", "example2_R1.fastq"], 56 | "-2": ["example1_R2.fastq", "example2_R2.fastq"], 57 | "-U": ["example1_U.fastq", "example2_U.fastq"], 58 | } 59 | input_list, prefix_mapping = convert_input_params_to_prefix_mapping(tag_to_param_map) 60 | assert sorted(input_list) == sorted([ 61 | "example1_R1.fastq", 62 | "example2_R1.fastq", 63 | "example1_R2.fastq", 64 | "example2_R2.fastq", 65 | "example1_U.fastq", 66 | "example2_U.fastq", 67 | ]) 68 | assert prefix_mapping == { 69 | "example1_R1.fastq": { 70 | "prefix": "-1", 71 | "order": 0, 72 | }, 73 | "example1_R2.fastq": { 74 | "prefix": "-2", 75 | "order": 0, 76 | }, 77 | "example1_U.fastq": { 78 | "prefix": "-U", 79 | "order": 0, 80 | }, 81 | "example2_R1.fastq": { 82 | "prefix": "-1", 83 | "order": 1, 84 | }, 85 | "example2_R2.fastq": { 86 | "prefix": "-2", 87 | "order": 1, 88 | }, 89 | "example2_U.fastq": { 90 | "prefix": "-U", 91 | 
"order": 1, 92 | }, 93 | } 94 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/test_merge.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | import os 3 | import pathlib 4 | 5 | from .. import concatenate_files 6 | 7 | THIS_FILE_PATH = pathlib.Path(__file__).parent.resolve() 8 | 9 | 10 | def test_concatenate_files(): 11 | input_file_path = f"{THIS_FILE_PATH}/data/eight_line.fastq" 12 | split_one_path = f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq" 13 | split_two_path = f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq" 14 | temp_output_file_path = f"{THIS_FILE_PATH}/data/temp_output.fastq" 15 | 16 | concatenate_files([split_one_path, split_two_path], temp_output_file_path) 17 | 18 | assert filecmp.cmp(input_file_path, temp_output_file_path) 19 | 20 | os.remove(temp_output_file_path) 21 | 22 | # TODO: test merge_sam_files() in a way that's reproducible on different OS choices 23 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/test_s3.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from .. import assert_accessible_s3, get_s3_file_size, get_params_from_s3_uri 4 | from ...api.exceptions import ToolchestS3AccessError 5 | 6 | EXAMPLE_FASTQ_SIZE = 48468258 7 | EXAMPLE_FASTQ_URI = "s3://toolchest-public-examples/example.fastq" 8 | 9 | 10 | def test_s3_params(): 11 | example_s3_uri = "s3://toolchest-public-examples/dummy-id/example.fastq" 12 | params = get_params_from_s3_uri(example_s3_uri) 13 | target_params = { 14 | "arn": "arn:aws:s3:::toolchest-public-examples/dummy-id/example.fastq", 15 | "bucket": "toolchest-public-examples", 16 | "key": "dummy-id/example.fastq", 17 | "key_initial": "dummy-id", 18 | "key_final": "example.fastq" 19 | } 20 | 21 | assert params == target_params 22 | 23 | 24 | @pytest.mark.integration 25 | def test_public_s3_file(): 26 | assert_accessible_s3(EXAMPLE_FASTQ_URI) 27 | 28 | 29 | @pytest.mark.integration 30 | def test_fake_s3_file(): 31 | fake_s3_uri = "s3://toolchest-this-is-a-bad-bucket/bogus.fastq" 32 | with pytest.raises(ToolchestS3AccessError): 33 | assert_accessible_s3(fake_s3_uri) 34 | 35 | 36 | @pytest.mark.integration 37 | def test_s3_file_size(): 38 | assert get_s3_file_size(EXAMPLE_FASTQ_URI) == EXAMPLE_FASTQ_SIZE 39 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/test_split.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | import os 3 | import pathlib 4 | 5 | from .. import split_file_by_lines, split_paired_files_by_lines 6 | 7 | THIS_FILE_PATH = pathlib.Path(__file__).parent.resolve() 8 | 9 | 10 | def delete_temp_files(file_paths): 11 | """ 12 | Deletes temporary files. Only use for testing. 
13 | """ 14 | for file_path in file_paths: 15 | os.remove(file_path) 16 | 17 | 18 | def assert_files_eq(file_path_one, file_path_two): 19 | assert filecmp.cmp(file_path_one, file_path_two) 20 | 21 | 22 | def test_split_small_fastq(): 23 | new_file_paths = [] 24 | split_file_paths = split_file_by_lines( 25 | input_file_path=f"{THIS_FILE_PATH}/data/eight_line.fastq", 26 | num_lines_in_group=4, 27 | max_bytes=100 28 | ) 29 | 30 | for _, file_path in split_file_paths: 31 | new_file_paths.append(file_path) 32 | 33 | assert len(new_file_paths) == 2 34 | 35 | assert_files_eq(new_file_paths[0], f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq") 36 | assert_files_eq(new_file_paths[1], f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq") 37 | 38 | delete_temp_files(new_file_paths) 39 | 40 | 41 | def test_split_small_fastq_small_bytes(): 42 | new_file_paths = [] 43 | split_file_paths = split_file_by_lines( 44 | input_file_path=f"{THIS_FILE_PATH}/data/eight_line.fastq", 45 | num_lines_in_group=4, 46 | max_bytes=1 47 | ) 48 | 49 | for _, file_path in split_file_paths: 50 | new_file_paths.append(file_path) 51 | 52 | assert len(new_file_paths) == 2 53 | 54 | assert_files_eq(new_file_paths[0], f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq") 55 | assert_files_eq(new_file_paths[1], f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq") 56 | 57 | delete_temp_files(new_file_paths) 58 | 59 | 60 | def test_split_paired_fastqs(): 61 | new_file_paths = [] 62 | files_to_delete = [] 63 | split_file_paths = split_paired_files_by_lines( 64 | input_file_paths=[ 65 | f"{THIS_FILE_PATH}/data/paired_end/eight_line_R1.fastq", 66 | f"{THIS_FILE_PATH}/data/paired_end/eight_line_R2.fastq", 67 | ], 68 | num_lines_in_group=4, 69 | max_bytes=100 70 | ) 71 | 72 | for split_paired_input_files in split_file_paths: 73 | new_file_paths.append(split_paired_input_files) 74 | files_to_delete.append(split_paired_input_files[0]) 75 | files_to_delete.append(split_paired_input_files[1]) 76 | 77 | assert len(new_file_paths) == 2 78 | assert len(new_file_paths[0]) == 2 79 | assert len(new_file_paths[1]) == 2 80 | 81 | assert_files_eq(new_file_paths[0][0], f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq") 82 | assert_files_eq(new_file_paths[1][0], f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq") 83 | assert_files_eq(new_file_paths[0][1], f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq") 84 | assert_files_eq(new_file_paths[1][1], f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq") 85 | 86 | delete_temp_files(files_to_delete) 87 | -------------------------------------------------------------------------------- /toolchest_client/files/unpack.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import os 3 | import shutil 4 | import tarfile 5 | 6 | 7 | class OutputType(Enum): 8 | GZ_TAR = ".tar.gz" 9 | FLAT_TEXT = ".txt" 10 | SAM_FILE = ".sam" 11 | S3 = "" 12 | 13 | 14 | def unpack_files(file_path_to_unpack, is_compressed): 15 | """Unpack output file, if needed. Returns the path(s) to the (optionally) unpacked output. 16 | If only 1 file is unpacked, returns a string containing that file's path. 17 | If there are multiple unpacked files, returns a list of paths. 18 | Returns a list of file paths to unpacked files. 
19 | """ 20 | if is_compressed: 21 | # Get names of files in archive 22 | with tarfile.open(file_path_to_unpack) as tar: 23 | unpacked_file_names = tar.getnames() 24 | 25 | unpacked_outputs_dir = os.path.dirname(file_path_to_unpack) 26 | shutil.unpack_archive( 27 | filename=file_path_to_unpack, 28 | extract_dir=unpacked_outputs_dir, 29 | format="gztar", 30 | ) 31 | 32 | # Remove the unpacked .tar.gz file and empty unpacked output folder 33 | os.remove(file_path_to_unpack) 34 | 35 | unpacked_paths = ["/".join([unpacked_outputs_dir, file_name]) for file_name in unpacked_file_names] 36 | unpacked_file_paths = [os.path.normpath(path) for path in unpacked_paths if os.path.isfile(path)] 37 | 38 | # If only 1 file is unpacked, just return path instead of [path]. 39 | # This is to be consistent with the return value from the other output types. 40 | if len(unpacked_file_paths) == 1: 41 | return unpacked_file_paths[0] 42 | return unpacked_file_paths 43 | else: 44 | return file_path_to_unpack 45 | -------------------------------------------------------------------------------- /toolchest_client/logging.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | import os 3 | import sys 4 | 5 | LOG_LEVEL = os.environ.get("TOOLCHEST_LOG_LEVEL", "INFO") 6 | 7 | 8 | def get_log_level(): 9 | return LOG_LEVEL 10 | 11 | 12 | def setup_logging(log_level=None): 13 | global LOG_LEVEL 14 | if log_level and log_level != LOG_LEVEL: 15 | LOG_LEVEL = log_level 16 | logger.remove() 17 | 18 | valid_log_levels = ["DEBUG", "INFO", "WARNING", "ERROR"] 19 | if LOG_LEVEL not in valid_log_levels: 20 | raise ValueError(f"Invalid log level: {LOG_LEVEL}. Valid levels are: {valid_log_levels}") 21 | 22 | if LOG_LEVEL in ["DEBUG", "INFO", "WARNING"]: 23 | stdout_filter = lambda record: record["level"].no < 40 24 | logger.add( 25 | sys.stdout, 26 | filter=stdout_filter, 27 | level=LOG_LEVEL, 28 | format="{time} | {level} | {message}", 29 | ) 30 | # Including if log_level == "ERROR" 31 | logger.add(sys.stderr, level="ERROR") 32 | -------------------------------------------------------------------------------- /toolchest_client/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .tool import Tool 2 | from .alphafold import AlphaFold 3 | from .blastn import BLASTN 4 | from .bowtie2 import Bowtie2 5 | from .bracken import Bracken 6 | from .cellranger import CellRangerCount 7 | from .centrifuge import Centrifuge 8 | from .clustalo import ClustalO 9 | from .demucs import Demucs 10 | from .diamond import DiamondBlastp, DiamondBlastx 11 | from .fastqc import FastQC 12 | from .humann import HUMAnN3 13 | from .jupyter import Jupyter 14 | from .kallisto import Kallisto 15 | from .kraken2 import Kraken2 16 | from .last import Lastal5 17 | from .lug import Lug 18 | from .megahit import Megahit 19 | from .metaphlan import MetaPhlAn 20 | from .python3 import Python3 21 | from .rapsearch2 import Rapsearch2 22 | from .salmon import Salmon 23 | from .shi7 import Shi7 24 | from .shogun import ShogunAlign, ShogunFilter 25 | from .star import STARInstance 26 | from .test import Test 27 | from .transfer import Transfer 28 | from .unicycler import Unicycler 29 | -------------------------------------------------------------------------------- /toolchest_client/tools/alphafold.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.AlphaFold 3 | 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the AlphaFold implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from . import Tool 9 | 10 | 11 | class AlphaFold(Tool): 12 | """ 13 | The AlphaFold implementation of the Tool class. 14 | """ 15 | def __init__(self, inputs, output_path, tool_args, **kwargs): 16 | super().__init__( 17 | tool_name="alphafold", 18 | tool_version="2.1.2", 19 | tool_args=tool_args, 20 | inputs=inputs, 21 | database_name="alphafold_standard", 22 | database_version="2.1.2", 23 | parallel_enabled=False, 24 | output_type=OutputType.GZ_TAR, 25 | output_path=output_path, 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/blastn.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.BLASTN 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the BLASTN implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class BLASTN(Tool): 12 | """ 13 | The BLASTN implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, output_primary_name, database_name, database_version, **kwargs): 16 | super().__init__( 17 | tool_name="blastn", 18 | tool_version="2.13.0", 19 | tool_args=tool_args, 20 | output_path=output_path, 21 | output_primary_name=output_primary_name, 22 | inputs=inputs, 23 | database_name=database_name, 24 | database_version=database_version, 25 | max_input_bytes_per_file=10 * 1024 * 1024 * 1024, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=["blastn_results.out"], 28 | **kwargs, 29 | ) 30 | -------------------------------------------------------------------------------- /toolchest_client/tools/bowtie2.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.bowtie2 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the bowtie2 implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | 9 | from . import Tool 10 | 11 | 12 | class Bowtie2(Tool): 13 | """ 14 | The bowtie2 implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, database_name, 17 | database_version, **kwargs): 18 | super().__init__( 19 | tool_name="bowtie2", 20 | tool_version="2.4.4", # todo: allow bowtie2 version to be set by the user 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | parallel_enabled=False, 27 | output_type=OutputType.GZ_TAR, 28 | expected_output_file_names=["bowtie2.log", "bowtie2_output.sam"], 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/bracken.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.bracken 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Bracken implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | from toolchest_client.files.s3 import path_is_s3_uri 10 | 11 | 12 | class Bracken(Tool): 13 | """ 14 | The Bracken implementation of the Tool class. 
15 | """ 16 | def __init__(self, tool_args, inputs, output_path, 17 | database_name, database_version, remote_database_path, **kwargs): 18 | super().__init__( 19 | tool_name="bracken", 20 | tool_version="2.7", # todo: allow bracken version to be set by the user 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | remote_database_path=remote_database_path, 27 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 28 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/cellranger.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.cellranger 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This contains the cellranger implementations of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class CellRangerCount(Tool): 12 | """ 13 | The cellranger_count implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, database_name, 16 | database_version, **kwargs): 17 | super().__init__( 18 | tool_name="cellranger_count", 19 | tool_version="6.1.2", # todo: allow cellranger version to be set by the user 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | database_name=database_name, 24 | database_version=database_version, 25 | compress_inputs=True, 26 | max_input_bytes_per_file=128 * 1024 * 1024 * 1024, 27 | output_type=OutputType.GZ_TAR, 28 | **kwargs, 29 | ) 30 | -------------------------------------------------------------------------------- /toolchest_client/tools/centrifuge.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.centrifuge 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the centrifuge implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType, sanity_check 9 | 10 | 11 | class Centrifuge(Tool): 12 | """ 13 | The centrifuge implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, input_prefix_mapping, database_name, database_version, 16 | output_path, **kwargs): 17 | super().__init__( 18 | tool_name="centrifuge", 19 | tool_version="1.0.4", # todo: allow version to be set by the user 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | input_prefix_mapping=input_prefix_mapping, 24 | database_name=database_name, 25 | database_version=database_version, 26 | max_inputs=None, 27 | parallel_enabled=False, 28 | output_type=OutputType.GZ_TAR, 29 | expected_output_file_names=[ 30 | "centrifuge_output.txt", 31 | "centrifuge_report.tsv", 32 | ], 33 | **kwargs, 34 | ) 35 | 36 | def _postflight(self, output): 37 | if self.output_validation_enabled: 38 | for output_file_name in self.expected_output_file_names: 39 | # Skip validation for the "done" file, which should be empty. 
40 | if output_file_name != "done": 41 | output_file_path = f"{self.output_path}/{output_file_name}" 42 | sanity_check(output_file_path) 43 | -------------------------------------------------------------------------------- /toolchest_client/tools/clustalo.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.clustalo 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Clustal Omega implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class ClustalO(Tool): 12 | """ 13 | The Clustal Omega implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, output_primary_name, **kwargs): 16 | super().__init__( 17 | tool_name="clustalo", 18 | tool_version='1.2.4', 19 | tool_args=tool_args, 20 | output_primary_name=output_primary_name, 21 | output_path=output_path, 22 | inputs=inputs, 23 | parallel_enabled=False, 24 | output_type=OutputType.GZ_TAR, 25 | expected_output_file_names=[output_primary_name, f"{output_primary_name}.log"], 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/demucs.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.demucs 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Demucs implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class Demucs(Tool): 12 | """ 13 | The Demucs implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, **kwargs): 16 | super().__init__( 17 | tool_name="demucs", 18 | tool_version='3.0.4', 19 | tool_args=tool_args, 20 | output_path=output_path, 21 | inputs=inputs, 22 | parallel_enabled=False, 23 | output_type=OutputType.GZ_TAR, 24 | expected_output_file_names=["error.log", "output.log"], 25 | **kwargs, 26 | ) 27 | -------------------------------------------------------------------------------- /toolchest_client/tools/diamond.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.Diamond 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Diamond implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from . import Tool 9 | 10 | 11 | class DiamondBlastp(Tool): 12 | """ 13 | The DIAMOND BLASTP implementation of the Tool class. 14 | """ 15 | def __init__(self, inputs, database_name, database_version, output_path, output_primary_name, tool_args, 16 | remote_database_path, remote_database_primary_name, **kwargs): 17 | super().__init__( 18 | tool_name="diamond_blastp", 19 | tool_version="2.0.14", 20 | tool_args=tool_args, 21 | output_primary_name=output_primary_name, 22 | inputs=inputs, 23 | remote_database_path=remote_database_path, 24 | remote_database_primary_name=remote_database_primary_name, 25 | database_name=database_name, 26 | database_version=database_version, 27 | parallel_enabled=False, 28 | output_type=OutputType.GZ_TAR, 29 | output_path=output_path, 30 | expected_output_file_names=[output_primary_name, "diamond.log"], 31 | **kwargs, 32 | ) 33 | 34 | 35 | class DiamondBlastx(Tool): 36 | """ 37 | The DIAMOND BLASTX implementation of the Tool class. 
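    Example -- an illustrative sketch with placeholder paths and database
    names. Setting distributed=True registers the run as
    diamond_blastx_parallel instead of diamond_blastx; the run() entry
    point on the base Tool class is assumed:

        from toolchest_client.tools.diamond import DiamondBlastx

        blastx = DiamondBlastx(
            inputs="./reads.fasta",
            database_name="diamond_blastx_standard",  # placeholder
            database_version="1",
            output_path="./blastx_output",
            output_primary_name="blastx_results.tsv",
            tool_args="",
            remote_database_path=None,
            distributed=False,
        )
        blastx.run()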
38 | """ 39 | def __init__(self, inputs, database_name, database_version, output_path, output_primary_name, tool_args, 40 | remote_database_path, distributed=False, **kwargs): 41 | super().__init__( 42 | tool_name="diamond_blastx" if not distributed else "diamond_blastx_parallel", 43 | tool_version="2.0.13", 44 | tool_args=tool_args, 45 | output_primary_name=output_primary_name, 46 | inputs=inputs, 47 | remote_database_path=remote_database_path, 48 | database_name=database_name, 49 | database_version=database_version, 50 | output_type=OutputType.GZ_TAR, 51 | output_path=output_path, 52 | expected_output_file_names=[output_primary_name], 53 | **kwargs, 54 | ) 55 | -------------------------------------------------------------------------------- /toolchest_client/tools/fastqc.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.fastqc 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the FastQC implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from toolchest_client.files.s3 import path_is_s3_uri 9 | 10 | from . import Tool 11 | 12 | 13 | class FastQC(Tool): 14 | """ 15 | The FastQC implementation of the Tool class. 16 | """ 17 | def __init__(self, tool_args, inputs, output_path, **kwargs): 18 | super().__init__( 19 | tool_name="fastqc", 20 | tool_version="0.11.9", 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 25 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/humann.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.humann 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the HUMAnN implementation of the Tool class. 6 | """ 7 | from enum import Enum 8 | 9 | from toolchest_client.files import OutputType 10 | 11 | from . import Tool 12 | 13 | 14 | class HUMAnN3(Tool): 15 | """ 16 | The HUMAnN implementation of the Tool class. 
17 | """ 18 | def __init__(self, tool_args, inputs, output_primary_name, input_prefix_mapping, output_path, **kwargs): 19 | super().__init__( 20 | tool_name="humann3", 21 | tool_version="3.1.1", # todo: allow version to be set by the user 22 | database_name="humann3_protein_uniref90_diamond", 23 | database_version="1", 24 | tool_args=tool_args, 25 | output_path=output_path, 26 | output_primary_name=output_primary_name, 27 | inputs=inputs, 28 | input_prefix_mapping=input_prefix_mapping, 29 | parallel_enabled=False, 30 | output_type=OutputType.GZ_TAR, 31 | **kwargs, 32 | ) 33 | 34 | 35 | class HUMAnN3Mode(Enum): 36 | HUMANN = ("humann", False) 37 | HUMANN_BARPLOT = ("humann_barplot", True) 38 | HUMANN_GENE_FAMILIES_GENUS_LEVEL = ("humann_genefamilies_genus_level", True) 39 | HUMANN_JOIN_TABLES = ("humann_join_tables", True) 40 | HUMANN_REDUCE_TABLE = ("humann_reduce_table", True) 41 | HUMANN_REGROUP_TABLE = ("humann_regroup_table", True) 42 | HUMANN_RENORM_TABLE = ("humann_renorm_table", True) 43 | HUMANN_RENAME_TABLE = ("humann_rename_table", True) 44 | HUMANN_SPLIT_STRATIFIED_TABLE = ("humann_split_stratified_table", False) 45 | HUMANN_UNPACK_PATHWAYS = ("humann_unpack_pathways", True) 46 | -------------------------------------------------------------------------------- /toolchest_client/tools/jupyter.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.jupyter 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Jupyter implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType, path_is_s3_uri 8 | 9 | from . import Tool 10 | 11 | 12 | class Jupyter(Tool): 13 | """ 14 | The Jupyter implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, input_prefix_mapping, **kwargs): 17 | super().__init__( 18 | tool_name="jupyter", 19 | tool_version="1", 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | input_prefix_mapping=input_prefix_mapping, 24 | output_primary_name="token.txt", 25 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.FLAT_TEXT, 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/kallisto.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.kallisto 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Kallisto implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from toolchest_client.files.s3 import path_is_s3_uri 9 | 10 | from . import Tool 11 | 12 | 13 | class Kallisto(Tool): 14 | """ 15 | The Kallisto implementation of the Tool class. 
16 | """ 17 | def __init__(self, tool_args, inputs, output_path, database_name, 18 | database_version, input_prefix_mapping, **kwargs): 19 | super().__init__( 20 | tool_name="kallisto", 21 | tool_version="0.48.0", 22 | tool_args=tool_args, 23 | output_path=output_path, 24 | inputs=inputs, 25 | input_prefix_mapping=input_prefix_mapping, 26 | database_name=database_name, 27 | database_version=database_version, 28 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 29 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 30 | **kwargs, 31 | ) 32 | -------------------------------------------------------------------------------- /toolchest_client/tools/kraken2.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.kraken2 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Kraken2 implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | from toolchest_client.files.s3 import path_is_s3_uri 10 | 11 | 12 | class Kraken2(Tool): 13 | """ 14 | The Kraken2 implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, 17 | database_name, database_version, remote_database_path, tool_version='2.1.1', **kwargs): 18 | super().__init__( 19 | tool_name="kraken2", 20 | tool_version=tool_version, 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | remote_database_path=remote_database_path, 27 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 28 | parallel_enabled=False, 29 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 30 | expected_output_file_names=["kraken2_output.txt", "kraken2_report.txt"], 31 | **kwargs, 32 | ) 33 | -------------------------------------------------------------------------------- /toolchest_client/tools/last.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.last 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Last implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | from toolchest_client.files.s3 import path_is_s3_uri 10 | 11 | 12 | class Lastal5(Tool): 13 | """ 14 | The lastal5 implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, output_primary_name, database_name, database_version, **kwargs): 17 | super().__init__( 18 | tool_name="lastal5", 19 | tool_version="1411", 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | output_primary_name=output_primary_name, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 27 | parallel_enabled=False, 28 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/lug.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.lug 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Lug implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType, path_is_s3_uri 8 | 9 | from . import Tool 10 | 11 | 12 | class Lug(Tool): 13 | """ 14 | The Lug implementation of the Tool class. 
15 | """ 16 | def __init__(self, tool_args, inputs, output_path, tool_version, 17 | custom_docker_image_id=None, **kwargs): 18 | super().__init__( 19 | tool_name="lug", 20 | tool_version=tool_version, 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | max_input_bytes_per_file=4 * 1024 * 1024 * 1024 * 1024, 25 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 26 | custom_docker_image_id=custom_docker_image_id, 27 | **kwargs, 28 | ) 29 | -------------------------------------------------------------------------------- /toolchest_client/tools/megahit.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.megahit 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the megahit implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType, sanity_check 9 | 10 | 11 | class Megahit(Tool): 12 | """ 13 | The megahit implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, input_prefix_mapping, 16 | output_path, **kwargs): 17 | super().__init__( 18 | tool_name="megahit", 19 | tool_version="1.2.9", # todo: allow version to be set by the user 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | input_prefix_mapping=input_prefix_mapping, 24 | max_inputs=None, 25 | parallel_enabled=False, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=[ 28 | "checkpoints.txt", 29 | "done", 30 | "final.contigs.fa", 31 | "log", 32 | "options.json", 33 | ], 34 | **kwargs, 35 | ) 36 | 37 | def _postflight(self, output): 38 | if self.output_validation_enabled: 39 | for output_file_name in self.expected_output_file_names: 40 | # Skip validation for the "done" file, which should be empty. 41 | if output_file_name != "done": 42 | output_file_path = f"{self.output_path}/{output_file_name}" 43 | sanity_check(output_file_path) 44 | -------------------------------------------------------------------------------- /toolchest_client/tools/metaphlan.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.metaphlan 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the MetaPhlAn implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from toolchest_client.files.s3 import path_is_s3_uri 9 | 10 | from . import Tool 11 | 12 | 13 | class MetaPhlAn(Tool): 14 | """ 15 | The MetaPhlAn implementation of the Tool class. 16 | """ 17 | def __init__(self, tool_args, inputs, output_path, output_primary_name, **kwargs): 18 | super().__init__( 19 | tool_name="metaphlan", 20 | tool_version="3.0.14", 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | database_name="metaphlan_mpa_v30_CHOCOPhlAn_201901", 24 | database_version="1", 25 | output_primary_name=output_primary_name, 26 | inputs=inputs, 27 | max_inputs=1, 28 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 29 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 30 | **kwargs, 31 | ) 32 | -------------------------------------------------------------------------------- /toolchest_client/tools/python3.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.python3 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Python3 implementation of the Tool class. 
6 | """ 7 | from toolchest_client.files import OutputType, path_is_s3_uri 8 | 9 | from . import Tool 10 | 11 | 12 | class Python3(Tool): 13 | """ 14 | The Python3 implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, custom_docker_image_id=None, **kwargs): 17 | super().__init__( 18 | tool_name="python3", 19 | tool_version="3.9.1", 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 24 | custom_docker_image_id=custom_docker_image_id, 25 | **kwargs, 26 | ) 27 | -------------------------------------------------------------------------------- /toolchest_client/tools/rapsearch2.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.rapsearch 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Rapsearch implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class Rapsearch2(Tool): 12 | """ 13 | The Rapsearch implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, output_primary_name, 16 | database_name, database_version, **kwargs): 17 | super().__init__( 18 | tool_name="rapsearch2", 19 | tool_version="2.24", 20 | tool_args=tool_args, 21 | output_primary_name=output_primary_name, 22 | output_path=output_path, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | parallel_enabled=False, 27 | output_type=OutputType.GZ_TAR, 28 | expected_output_file_names=[f"{output_primary_name}.m8"], # .aln output may be omitted with certain args 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/salmon.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.salmon 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Salmon implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from toolchest_client.files.s3 import path_is_s3_uri 9 | 10 | from . import Tool 11 | 12 | 13 | class Salmon(Tool): 14 | """ 15 | The Salmon implementation of the Tool class. 16 | """ 17 | def __init__(self, tool_args, inputs, output_path, database_name, 18 | database_version, **kwargs): 19 | super().__init__( 20 | tool_name="salmon", 21 | tool_version="1.9.0", # todo: allow salmon version to be set by the user 22 | tool_args=tool_args, 23 | output_path=output_path, 24 | inputs=inputs, 25 | database_name=database_name, 26 | database_version=database_version, 27 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 28 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/shi7.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.shi7 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the shi7 implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class Shi7(Tool): 12 | """ 13 | The shi7 implementation of the Tool class. 
14 | """ 15 | def __init__(self, tool_args, inputs, output_path, **kwargs): 16 | super().__init__( 17 | tool_name="shi7", 18 | tool_version="1.0.3", # todo: allow shi7 version to be set by the user 19 | tool_args=tool_args, 20 | output_path=output_path, 21 | inputs=inputs, 22 | max_inputs=None, # note: no limit is set on the # of inputs 23 | parallel_enabled=False, 24 | group_paired_ends=True, 25 | max_input_bytes_per_file=16 * 1024 * 1024 * 1024, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=["combined_seqs.fna", "shi7.log"], 28 | **kwargs, 29 | ) 30 | -------------------------------------------------------------------------------- /toolchest_client/tools/shogun.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.shogun 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This contains the shogun_align and shogun_filter implementations of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class ShogunAlign(Tool): 12 | """ 13 | The shogun_align implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, database_name, 16 | database_version, **kwargs): 17 | super().__init__( 18 | tool_name="shogun_align", 19 | tool_version="1.0.8", # todo: allow shogun version to be set by the user 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | database_name=database_name, 24 | database_version=database_version, 25 | parallel_enabled=False, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=["alignment.bowtie2.sam"], 28 | **kwargs, 29 | ) 30 | 31 | 32 | class ShogunFilter(Tool): 33 | """ 34 | The shogun_filter implementation of the Tool class. 35 | """ 36 | def __init__(self, tool_args, inputs, output_path, database_name, 37 | database_version, **kwargs): 38 | super().__init__( 39 | tool_name="shogun_filter", 40 | tool_version="1.0.8", # todo: allow shogun version to be set by the user 41 | tool_args=tool_args, 42 | output_path=output_path, 43 | inputs=inputs, 44 | min_inputs=1, 45 | max_inputs=1, 46 | database_name=database_name, 47 | database_version=database_version, 48 | parallel_enabled=False, 49 | output_type=OutputType.GZ_TAR, 50 | expected_output_file_names=["combined_seqs.filtered.fna", "alignment.burst.best.b6"], 51 | **kwargs, 52 | ) 53 | -------------------------------------------------------------------------------- /toolchest_client/tools/star.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.STAR 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the STAR implementation of the Tool class. 6 | 7 | Note: This tool is named STARInstance to differentiate it from 8 | the STAR function called by the user, which is given in all caps 9 | to be in line with the command-line argument. 10 | """ 11 | from . import Tool 12 | from toolchest_client.files import OutputType 13 | 14 | 15 | class STARInstance(Tool): 16 | """ 17 | The STAR implementation of the Tool class. 
18 | """ 19 | def __init__(self, tool_args, inputs, input_prefix_mapping, output_path, 20 | database_name, database_version, parallelize, output_primary_name=None, **kwargs): 21 | super().__init__( 22 | tool_name="STAR", 23 | tool_version="2.7.9a", 24 | tool_args=tool_args, 25 | output_path=output_path, 26 | output_primary_name=output_primary_name, 27 | inputs=inputs, 28 | input_prefix_mapping=input_prefix_mapping, 29 | database_name=database_name, 30 | database_version=database_version, 31 | parallel_enabled=False, 32 | max_input_bytes_per_file=128 * 1024 * 1024 * 1024, 33 | max_input_bytes_per_file_parallel=4.5 * 1024 * 1024 * 1024, 34 | output_type=OutputType.SAM_FILE if parallelize else OutputType.GZ_TAR, 35 | expected_output_file_names=[ 36 | "Aligned.out.sam", 37 | "Log.final.out", 38 | "Log.out", 39 | "Log.progress.out", 40 | "SJ.out.tab", 41 | ], 42 | **kwargs, 43 | ) 44 | -------------------------------------------------------------------------------- /toolchest_client/tools/test.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.test 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the test implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | 9 | from . import Tool 10 | 11 | 12 | class Test(Tool): 13 | """ 14 | The test implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, **kwargs): 17 | super().__init__( 18 | tool_name="test", 19 | tool_version="0.1.0", 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | max_input_bytes_per_file=256 * 1024 * 1024 * 1024, 24 | parallel_enabled=False, 25 | output_type=OutputType.GZ_TAR, 26 | expected_output_file_names=["test_output.txt"], 27 | **kwargs, 28 | ) 29 | -------------------------------------------------------------------------------- /toolchest_client/tools/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/toolchest_client/tools/tests/__init__.py -------------------------------------------------------------------------------- /toolchest_client/tools/tests/test_generic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from ..test import Test 5 | 6 | THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 7 | 8 | 9 | def test_unknown_arg_handling(): 10 | tool_args = "--unknown arg" 11 | test_instance = Test( 12 | tool_args=tool_args, 13 | inputs=f"{THIS_DIRECTORY}/test_generic.py", 14 | output_path="./output.tar.gz", 15 | ) 16 | 17 | with pytest.raises(ValueError): 18 | test_instance._validate_args() 19 | 20 | 21 | def test_blacklisted_arg_handling(): 22 | tool_args = "--bad arg" 23 | test_instance = Test( 24 | tool_args=tool_args, 25 | inputs=f"{THIS_DIRECTORY}/test_generic.py", 26 | output_path="./output.tar.gz", 27 | ) 28 | 29 | with pytest.raises(ValueError): 30 | test_instance._validate_args() 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/tests/test_kraken2.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from ..kraken2 import Kraken2 4 | 5 | THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def test_kraken2_preflight(): 9 | output_path = f"{THIS_DIRECTORY}/output" 10 | 
kraken_instance = Kraken2( 11 | tool_args="", 12 | inputs=f"{THIS_DIRECTORY}/test_kraken2.py", 13 | output_path=output_path, 14 | database_name="standard", 15 | database_version=1, 16 | remote_database_path=None, 17 | ) 18 | kraken_instance._preflight() 19 | 20 | assert os.path.isdir(output_path) 21 | os.rmdir(output_path) 22 | -------------------------------------------------------------------------------- /toolchest_client/tools/tests/test_sanity.py: -------------------------------------------------------------------------------- 1 | def test_sanity(): 2 | assert 1 + 1 == 2 3 | -------------------------------------------------------------------------------- /toolchest_client/tools/tests/test_star.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from ..star import STARInstance 4 | 5 | THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def test_star_variable_arg_parsing_single(): 9 | star_instance = STARInstance( 10 | tool_args="--quantMode GeneCounts --scoreGap 1", 11 | input_prefix_mapping={ 12 | "r1_path": None, 13 | "r2_path": None, 14 | }, 15 | inputs=f"{THIS_DIRECTORY}/test_star.py", 16 | output_path="./", 17 | database_name="test", 18 | database_version="0.1.0", 19 | parallelize=False, 20 | ) 21 | star_instance._validate_args() 22 | 23 | assert star_instance.tool_args == "--quantMode GeneCounts --scoreGap 1" 24 | 25 | 26 | def test_star_variable_arg_parsing_multiple(): 27 | star_instance = STARInstance( 28 | tool_args="--quantMode TranscriptomeSAM GeneCounts --scoreGap 1", 29 | input_prefix_mapping={ 30 | "r1_path": None, 31 | "r2_path": None, 32 | }, 33 | inputs=f"{THIS_DIRECTORY}/test_star.py", 34 | output_path="./", 35 | database_name="test", 36 | database_version="0.1.0", 37 | parallelize=False, 38 | ) 39 | star_instance._validate_args() 40 | 41 | assert star_instance.tool_args == "--quantMode TranscriptomeSAM GeneCounts --scoreGap 1" 42 | -------------------------------------------------------------------------------- /toolchest_client/tools/transfer.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.transfer 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the arbitrary file transfer implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType, path_is_s3_uri 8 | 9 | from . import Tool 10 | 11 | 12 | class Transfer(Tool): 13 | """ 14 | The arbitrary file transfer implementation of the Tool class. 15 | """ 16 | def __init__(self, inputs, output_path, **kwargs): 17 | super().__init__( 18 | tool_name="transfer", 19 | tool_version="1.0.0", 20 | tool_args="", 21 | output_path=output_path, 22 | inputs=inputs, 23 | max_input_bytes_per_file=1024 * 1024 * 1024 * 1024, 24 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 25 | expected_output_file_names=[], 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/unicycler.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.unicycler 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Unicycler implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | 9 | from . import Tool 10 | 11 | 12 | class Unicycler(Tool): 13 | """ 14 | The unicycler implementation of the Tool class. 
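    Example -- an illustrative sketch with placeholder paths. The
    input_prefix_mapping shape here mirrors the placeholder usage in
    tools/tests/test_star.py; the real mapping of reads to Unicycler's
    -1/-2/-l flags is built by the top-level wrapper in tools/api.py,
    and the run() entry point on the base Tool class is assumed:

        from toolchest_client.tools.unicycler import Unicycler

        unicycler = Unicycler(
            tool_args="",
            inputs=["./short_R1.fastq", "./short_R2.fastq"],
            input_prefix_mapping={
                "read1_path": None,   # assumed key names
                "read2_path": None,
            },
            output_path="./unicycler_output",
        )
        unicycler.run()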
15 | """ 16 | def __init__(self, tool_args, inputs, input_prefix_mapping, 17 | output_path, **kwargs): 18 | super().__init__( 19 | tool_name="unicycler", 20 | tool_version="0.4.9", # todo: allow unicycler version to be set by the user 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | input_prefix_mapping=input_prefix_mapping, 25 | parallel_enabled=False, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=["assembly.fasta", "unicycler.log"], 28 | **kwargs, 29 | ) 30 | --------------------------------------------------------------------------------