├── .circleci └── config.yml ├── .flake8 ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── docs │ ├── CNAME │ ├── feature-reference │ │ ├── adding-and-updating-custom-databases.md │ │ ├── async-runs.md │ │ ├── authentication.md │ │ ├── output-objects.md │ │ ├── output-streaming.md │ │ └── using-aws-with-toolchest.md │ ├── getting-started │ │ ├── installation.md │ │ ├── python-functions-and-containers.md │ │ ├── running-bioinformatics-on-toolchest.md │ │ └── using-files.md │ ├── images │ │ └── toolchest_t.png │ ├── index.md │ ├── tool-reference │ │ ├── about.md │ │ ├── aligners.md │ │ ├── aligners │ │ │ ├── bowtie-2.md │ │ │ ├── clustal-omega.md │ │ │ ├── diamond.md │ │ │ ├── diamond │ │ │ │ ├── diamond-blastp.md │ │ │ │ └── diamond-blastx.md │ │ │ ├── kallisto.md │ │ │ ├── rapsearch2.md │ │ │ ├── salmon.md │ │ │ └── star.md │ │ ├── all-other-tools.md │ │ ├── assemblers.md │ │ ├── assemblers │ │ │ ├── megahit.md │ │ │ └── unicycler.md │ │ ├── demultiplexers.md │ │ ├── demultiplexers │ │ │ └── demucs.md │ │ ├── post-processing.md │ │ ├── post-processing │ │ │ └── bracken.md │ │ ├── pre-processing.md │ │ ├── pre-processing │ │ │ └── fastqc.md │ │ ├── python3.md │ │ ├── structure-prediction.md │ │ ├── structure-prediction │ │ │ └── alphafold.md │ │ ├── taxonomic-classifiers.md │ │ ├── taxonomic-classifiers │ │ │ ├── centrifuge.md │ │ │ ├── kraken-2.md │ │ │ └── metaphlan.md │ │ ├── test-runs.md │ │ ├── transfer.md │ │ ├── workflows-meta-tools.md │ │ └── workflows-meta-tools │ │ │ └── humann3.md │ └── toolchest-hosted-cloud │ │ ├── instance-types.md │ │ ├── pricing.md │ │ └── running-toolchest-in-your-aws-account.md └── mkdocs.yaml ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── tests ├── __init__.py ├── conftest.py ├── test_async.py ├── test_blastn.py ├── test_bowtie2.py ├── test_cellranger.py ├── test_centrifuge.py ├── test_chaining.py ├── test_clustalo.py ├── test_database_update.py ├── test_diamond.py ├── test_download.py ├── test_fastqc.py ├── test_filepath.py ├── test_humann3.py ├── test_kallisto.py ├── test_kraken2.py ├── test_last.py ├── test_megahit.py ├── test_metaphlan.py ├── test_output.py ├── test_public_uri.py ├── test_python3.py ├── test_rapsearch2.py ├── test_salmon.py ├── test_sanity.py ├── test_shi7.py ├── test_shogun.py ├── test_star.py ├── test_transfer.py ├── test_unicycler.py └── util │ ├── __init__.py │ ├── filter_output.py │ ├── hash.py │ ├── numpy_test.Dockerfile │ ├── s3.py │ └── streaming_script.py └── toolchest_client ├── __init__.py ├── api ├── __init__.py ├── auth.py ├── download.py ├── exceptions.py ├── instance_type.py ├── output.py ├── query.py ├── status.py ├── streaming.py └── urls.py ├── cli ├── __init__.py ├── cli.py ├── kraken2.py └── test.py ├── files ├── __init__.py ├── general.py ├── merge.py ├── public_uris.py ├── s3.py ├── split.py ├── tests │ ├── __init__.py │ ├── data │ │ ├── eight_line.fastq │ │ ├── eight_line_split_one.fastq │ │ ├── eight_line_split_two.fastq │ │ ├── paired_end │ │ │ ├── eight_line_R1.fastq │ │ │ └── eight_line_R2.fastq │ │ └── very_small_file.txt │ ├── test_general.py │ ├── test_merge.py │ ├── test_s3.py │ └── test_split.py └── unpack.py ├── logging.py └── tools ├── __init__.py ├── alphafold.py ├── api.py ├── blastn.py ├── bowtie2.py ├── bracken.py ├── cellranger.py ├── centrifuge.py ├── clustalo.py ├── demucs.py ├── diamond.py ├── fastqc.py ├── humann.py ├── jupyter.py ├── kallisto.py ├── kraken2.py ├── last.py ├── lug.py ├── megahit.py ├── metaphlan.py ├── python3.py ├── 
rapsearch2.py ├── salmon.py ├── shi7.py ├── shogun.py ├── star.py ├── test.py ├── tests ├── __init__.py ├── test_generic.py ├── test_kraken2.py ├── test_sanity.py └── test_star.py ├── tool.py ├── tool_args.py ├── transfer.py └── unicycler.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | orbs: 3 | python: circleci/python@1.4.0 4 | jobs: 5 | deploy-to-pypi: 6 | executor: python/default 7 | working_directory: ~/repo 8 | steps: 9 | - checkout 10 | - python/install-packages: 11 | pkg-manager: poetry 12 | - run: 13 | name: Build and publish to PyPI 14 | command: | 15 | cd ~/repo 16 | PYPI_ENVIRONMENT=$([[ $CIRCLE_BRANCH = main ]] && echo prod-pypi || echo test-pypi) 17 | PYPI_ACCESS_TOKEN=$([[ $PYPI_ENVIRONMENT = prod-pypi ]] && echo $PYPI_PROD_TOKEN || echo $PYPI_TEST_TOKEN) 18 | poetry config pypi-token.$PYPI_ENVIRONMENT $PYPI_ACCESS_TOKEN 19 | poetry publish --build -r $PYPI_ENVIRONMENT 20 | unit-tests: 21 | executor: python/default 22 | working_directory: ~/repo 23 | steps: 24 | - checkout 25 | - python/install-packages: 26 | pkg-manager: poetry 27 | - setup_remote_docker: 28 | version: 20.10.14 29 | - run: 30 | name: Run unit tests 31 | command: | 32 | cd ~/repo 33 | poetry install 34 | poetry run pytest -v -m "not (integration or integration_full)" 35 | integration-tests: 36 | executor: python/default 37 | working_directory: ~/repo 38 | parallelism: 8 39 | steps: 40 | - checkout 41 | - run: 42 | name: Skip tests if last commit is a chore commit 43 | command: | 44 | cd ~/repo 45 | last_commit="$(git log -1 --pretty=%s | grep chore: || true)" 46 | if [ ${#last_commit} -gt 0 ]; then circleci-agent step halt; fi 47 | - run: 48 | name: Create AWS credentials manually 49 | command: | 50 | mkdir ~/.aws 51 | touch ~/.aws/config 52 | chmod 600 ~/.aws/config 53 | echo "[profile circleci]" > ~/.aws/config 54 | echo "aws_access_key_id=$AWS_ACCESS_KEY_ID" >> ~/.aws/config 55 | echo "aws_secret_access_key=$AWS_SECRET_ACCESS_KEY" >> ~/.aws/config 56 | - python/install-packages: 57 | pkg-manager: poetry 58 | - setup_remote_docker: 59 | version: 20.10.14 60 | - run: 61 | name: Run integration tests 62 | parallel: true 63 | command: | 64 | cd ~/repo 65 | export DEPLOY_ENVIRONMENT=$([[ $CIRCLE_BRANCH = main ]] && echo production || echo staging) 66 | shopt -s globstar 67 | TESTFILES=$(circleci tests glob tests/**/test*.py | circleci tests split --split-by=timings) 68 | shopt -u globstar 69 | poetry install 70 | mkdir -p test-results 71 | poetry run pytest -v -m integration --durations=0 --junitxml=test-results/junit.xml $TESTFILES 72 | no_output_timeout: 1h 73 | - store_test_results: 74 | path: test-results 75 | - store_artifacts: 76 | path: test-results 77 | lint: 78 | executor: python/default 79 | working_directory: ~/repo 80 | steps: 81 | - checkout 82 | - python/install-packages: 83 | pkg-manager: poetry 84 | - run: 85 | name: Run flake8 linter 86 | parallel: true 87 | command: | 88 | cd ~/repo 89 | pip install flake8 90 | flake8 ./ --output-file test-reports 91 | no_output_timeout: 5m 92 | - store_artifacts: 93 | path: test-reports 94 | destination: test-reports 95 | workflows: 96 | test: 97 | jobs: 98 | - unit-tests 99 | - lint 100 | - integration-tests: 101 | filters: 102 | branches: 103 | only: 104 | - main 105 | - staging 106 | deploy: 107 | jobs: 108 | - deploy-to-pypi: 109 | filters: 110 | branches: 111 | only: 112 | - main 113 | - staging 114 | 
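For contributors, the unit-test and lint jobs above map directly to local commands — a sketch assuming Poetry is installed (the integration jobs additionally require Toolchest and AWS credentials):

```shell
# Mirror the CI unit-test job locally (same pytest markers as in config.yml)
poetry install
poetry run pytest -v -m "not (integration or integration_full)"

# Mirror the lint job
pip install flake8
flake8 ./
```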
-------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | exclude= 4 | demo.py 5 | docs/conf.py 6 | ignore = 7 | E731 8 | per-file-ignores = 9 | # ignores unused imports and imports not at the top of file in init files 10 | */__init__.py:F401,E402 11 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs 2 | on: 3 | push: 4 | branches: 5 | - main 6 | - feat/update-docs 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | defaults: 12 | run: 13 | working-directory: ./docs 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.x 19 | - run: pip install mkdocs-material 20 | - run: mkdocs gh-deploy --force 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Custom files 132 | demo.py 133 | demo-import.py 134 | src/demo.py 135 | toolchest_client/demo.py 136 | 137 | # JetBrains 138 | .idea/ 139 | 140 | # Emacs 141 | *~ 142 | 143 | # Default temporary directory for splitting input files 144 | temp_toolchest* 145 | 146 | # Integration test directories 147 | temp_test_* 148 | 149 | # macOS 150 | .DS_Store 151 |
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Toolchest Python Client 2 | 3 | **Toolchest** runs computational biology software in the cloud with just a few lines of code. 4 | You can call Toolchest from anywhere Python or R runs, using input files located on your computer or S3. 5 | 6 | This package contains the **Python** client for using Toolchest. 7 | For the **R** client, [see here](https://github.com/trytoolchest/toolchest-client-r). 8 | 9 | ## [Documentation & User Guide](https://docs.trytoolchest.com/) 10 | 11 | ## Installation 12 | 13 | The Toolchest client is available [on PyPI](https://pypi.org/project/toolchest-client): 14 | ``` shell 15 | pip install toolchest-client 16 | ``` 17 | 18 | ## Usage 19 | 20 | Using a tool in Toolchest is as simple as: 21 | 22 | ``` python 23 | import toolchest_client as toolchest 24 | toolchest.set_key("YOUR_TOOLCHEST_KEY") 25 | toolchest.kraken2( 26 | tool_args="", 27 | inputs="path/to/input.fastq", 28 | output_path="path/to/output.fastq", 29 | ) 30 | ``` 31 | 32 | For a list of available tools, see the [documentation](https://docs.trytoolchest.com/tool-reference/about/). 33 | 34 | ## Configuration 35 | 36 | To use Toolchest, you must have an authentication key stored 37 | in the `TOOLCHEST_KEY` environment variable. 38 | 39 | ``` python 40 | import toolchest_client as toolchest 41 | toolchest.set_key("YOUR_TOOLCHEST_KEY") # or a file path containing the key 42 | ``` 43 | 44 | Contact Toolchest if: 45 | 46 | - you need a key 47 | - you’ve forgotten your key 48 | - the key is producing authentication errors. 49 |
-------------------------------------------------------------------------------- /docs/docs/CNAME: -------------------------------------------------------------------------------- 1 | docs.trytoolchest.com-------------------------------------------------------------------------------- /docs/docs/feature-reference/async-runs.md: -------------------------------------------------------------------------------- 1 | # Asynchronous Runs 2 | 3 | Toolchest supports async execution for every tool. Async runs are useful for long-running commands, because you do not need to keep an open terminal or connection while Toolchest is executing. 4 | 5 | We've seen people use async runs from AWS Lambda functions, custom automated pipelines, and manual calls from IDEs. 
6 | 7 | ## Launching an Async Run 8 | 9 | To launch an async run, add the **`is_async`** parameter with the value **True** in your function call. For example, 10 | using the `test` function: 11 | 12 | ```python 13 | my_run = tc.test( 14 | inputs="./", 15 | output_path="./output", 16 | is_async=True, 17 | ) 18 | ``` 19 | 20 | After the Toolchest run is initialized and all file transfers are complete, the Toolchest call returns an 21 | [output object](output-objects.md) containing a run ID. 22 | 23 | You can check your run status using the returned run ID (e.g. `my_run.run_id`). 24 | 25 | 26 | Once the run ID is returned, Toolchest is executing your run in the background, and you're safe to close your terminal. (Be sure 27 | to record the run ID!) 28 | 29 | ## Checking Run Status 30 | 31 | To check the status of your async run, call the **`get_status`** function with your run ID. 32 | 33 | ```python 34 | print(tc.get_status(run_id="YOUR_RUN_ID")) 35 | 'executing' 36 | ``` 37 | 38 | **`get_status`** returns a string. Once the status is `ready_to_transfer_to_client`, the run has finished execution and 39 | is ready to download. 40 | 41 | ### Statuses enum 42 | 43 | There's an enum – `Status` – that contains all statuses returned from `get_status()`. You can check statuses against 44 | this enum for custom error handling, progress tracking, or whatever you're building. 45 | 46 | ```python 47 | status = tc.get_status(run_id="YOUR_RUN_ID") 48 | if status == tc.Status.COMPLETE: 49 | print("AlphaFold run finished! Sending email to researcher...") 50 | ``` 51 | 52 | To check all possible enum values, you can print the enum as a list: 53 | 54 | ```python 55 | print(list(tc.Status)) 56 | [<Status.COMPLETE: ...>, ... 57 | ``` 58 | 59 | ## Downloading Output 60 | 61 | To download the output manually, call the **`download`** function with your run ID and output directory. 62 | 63 | ```python 64 | tc.download( 65 | run_id="YOUR_RUN_ID", 66 | output_path="./output/", 67 | ) 68 | ``` 69 | 70 | 71 | This downloads the run's output file(s) into the output directory. You can run `download` for 7 days after starting the 72 | run. 73 |
-------------------------------------------------------------------------------- /docs/docs/feature-reference/authentication.md: -------------------------------------------------------------------------------- 1 | # Authentication 2 | 3 | To run Toolchest jobs, you'll need a Toolchest key. If you don't have one yet, you can get a key 4 | [here](https://airtable.com/shrKzQNuDHrGkEAI2). 5 | 6 | ## Setting a Key 7 | 8 | Use the **`set_key`** function to authenticate your Toolchest calls: 9 | 10 | ```python 11 | import toolchest_client as tc 12 | tc.set_key("YOUR_TOOLCHEST_KEY") 13 | ``` 14 | 15 | `YOUR_TOOLCHEST_KEY` should be a string containing either the key value or a path to a file containing the key. 16 | 17 | You can also set your key through the `TOOLCHEST_KEY` environment variable. 18 | 19 | ## Getting a Stored Key 20 | 21 | To check the value of the key in use, use the **`get_key`** function, which returns a string containing your key value. 22 | 23 | ```python 24 | import toolchest_client as tc 25 | tc.get_key() 26 | ``` 27 | 28 | ## Private Tools and Databases 29 | 30 | If you'd like to use a private tool or database with Toolchest without exposing it to the public, Toolchest supports 31 | restricting some databases and tools to your account. 
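For example, since `set_key` accepts either the key itself or a file path, you can keep the key out of your source code — a minimal sketch (the key-file path below is hypothetical):

```python
import toolchest_client as tc

# set_key accepts the key value itself or a path to a file containing the key
tc.set_key("~/.toolchest_key")  # hypothetical path to a file holding your key
```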
-------------------------------------------------------------------------------- /docs/docs/feature-reference/output-objects.md: -------------------------------------------------------------------------------- 1 | # Output Objects 2 | 3 | Every Toolchest run returns an object containing the run ID (`run_id`), local paths to downloaded output files 4 | (`output_path`), and more. 5 | 6 | As an example, we'll use the output from this `test` function call, but you can do this for any Toolchest tool: 7 | 8 | ```python 9 | import toolchest_client as tc 10 | 11 | toolchest_output = tc.test( 12 | inputs="./", 13 | output_path="./output/", 14 | tool_args="", 15 | ) 16 | ``` 17 | 18 | ## Run Metadata 19 | 20 | The **`run_id`** instance variable contains the ID of the Toolchest run, stored as a string. 21 | 22 | Likewise, the **`output_path`** instance variable contains local paths to downloaded output files. 23 | 24 | ```python 25 | >>> toolchest_output.run_id 26 | '00000000-0000-0000-0000-000000000000' # this will be your custom run ID 27 | >>> toolchest_output.output_path 28 | 'OUTPUT_DIR/test_output.txt' 29 | ``` 30 | 31 | You can store and use the `run_id` to check the run's status with [Async Runs](async-runs.md). 32 | 33 | `output_path` will be a string (for 1 output file), a list of strings (for multiple output files), or a null value (if 34 | download was skipped). 35 | 36 | ## Download 37 | 38 | You can also directly call the **`download`** function from the output object to download (or re-download) the outputs. 39 | 40 | ```python 41 | toolchest_output.download( 42 | output_path="./", 43 | ) 44 | ``` 45 | 46 | However, keep in mind that Toolchest only retains your job's output for 7 days after job execution.
-------------------------------------------------------------------------------- /docs/docs/feature-reference/output-streaming.md: -------------------------------------------------------------------------------- 1 | # Live-Streaming Tool Output 2 | 3 | For synchronous Python and Lug runs, Toolchest supports streaming remote output live to wherever you're running 4 | Toolchest. 5 | 6 | 7 | For example, here's a `python3` Toolchest call with streaming enabled and an example script: 8 | ```python 9 | import toolchest_client as tc 10 | tc.set_key("YOUR_KEY") 11 | tc.python3( 12 | script="script.py", 13 | streaming_enabled=True, 14 | ) 15 | ``` 16 | 17 | ```python 18 | # script.py 19 | import time 20 | for letter in ["A", "B", "C"]: 21 | print(f"Hello world {letter}") 22 | time.sleep(1) 23 | ``` 24 | 25 | You'll see the following lines printed as they are generated by the remotely-running Python script, one line per second: 26 | ```text 27 | Hello world A 28 | Hello world B 29 | Hello world C 30 | ``` 31 | 32 | 33 | !!! warning "Streaming and cancelling runs" 34 | 35 | With streaming enabled, tool execution terminates if the streaming connection is broken. This includes cancelling 36 | your job by entering Ctrl-C. 37 | 38 | If a job is cancelled before encountering a bug in your script, the error may not be visible in Toolchest logs. 39 | 40 | ## Supported Tools 41 | Output streaming is supported for `python3` and `lug`. For both, streaming is enabled by default.
-------------------------------------------------------------------------------- /docs/docs/feature-reference/using-aws-with-toolchest.md: -------------------------------------------------------------------------------- 1 | # Using AWS with Toolchest 2 | 3 | Toolchest supports reading and writing from your S3 buckets. 
You can also run Toolchest within your own AWS account, so the files you pass to `inputs` and `output_path` aren't transferred outside your account. 4 | 5 | ## Input Files 6 | 7 | Files stored on S3 can be passed in as inputs, using the file's S3 URI. For example: 8 | 9 | ```python 10 | tc.kraken2( 11 | inputs="s3://toolchest-demo-data/SRR16201572_R1.fastq", 12 | output_path="./", 13 | ) 14 | ``` 15 | 16 | ## Output to S3 17 | 18 | Some tools support uploading outputs directly to your custom S3 bucket. For these runs, put the S3 bucket + prefix in 19 | **`output_path`**. For example: 20 | 21 | ```python 22 | tc.kraken2( 23 | inputs="./example.fastq", 24 | output_path="s3://your-output/your-intended-subfolder", 25 | ) 26 | ``` 27 | 28 | ## Custom Databases 29 | 30 | For some tools, you can use a custom database stored on S3 using **`custom_database_path`**: 31 | 32 | ```python 33 | tc.kraken2( 34 | inputs="./example.fastq", 35 | output_path="./example_output_dir", 36 | custom_database_path="s3://your-databases/your-kraken2-database", 37 | ) 38 | ``` 39 | 40 | Toolchest needs permission to list and copy all of the files in the S3 prefix you use. 41 | 42 | ## Granting Permissions to Toolchest to Access Your S3 Bucket 43 | 44 | To grant Toolchest access to your S3 bucket, use this policy: 45 | 46 | ```json 47 | { 48 | "Version": "2012-10-17", 49 | "Statement": [ 50 | { 51 | "Sid": "Toolchest", 52 | "Effect": "Allow", 53 | "Principal": { 54 | "AWS": "arn:aws:iam::172533437917:role/toolchest-worker-node-role" 55 | }, 56 | "Action": [ 57 | "s3:GetObject", 58 | "s3:ListBucket" 59 | ], 60 | "Resource": [ 61 | "arn:aws:s3:::YOUR_BUCKET_NAME", 62 | "arn:aws:s3:::YOUR_BUCKET_NAME/*" 63 | ] 64 | } 65 | ] 66 | } 67 | ``` 68 | 69 | (Make sure to replace `YOUR_BUCKET_NAME` with your bucket name.) 70 | 71 | You can restrict this to specific files or prefixes with whatever IAM policy you'd like; just make sure that Toolchest 72 | has `s3:GetObject` for any file you'll use with Toolchest and `s3:ListBucket` permissions for any prefix. 73 | 74 | After you add this policy, let us know and we'll complete the setup process! 75 | 76 | ## Running Toolchest in Your Own AWS Account 77 | 78 | You can run Toolchest in your own AWS account, and the data that you pass to `inputs` and `output_path` doesn't leave 79 | your own AWS environment. [Get in touch with us](mailto:hello@trytoolchest.com) if you'd like to know more!
-------------------------------------------------------------------------------- /docs/docs/getting-started/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Note: if you haven't already, make sure you [have an API key](https://trytoolchest.com)! 4 | 5 | ## With `pip` 6 | 7 | ```shell 8 | pip install toolchest-client 9 | ``` 10 | 11 | ## With Poetry 12 | ```shell 13 | poetry add toolchest-client 14 | ``` 15 | 16 | ## Supported Python versions 17 | 18 | We support Python 3.7 through the latest Python 3.11 release candidate. 19 | 20 | ## Supported operating systems 21 | 22 | You can run Toolchest on most recent versions of macOS, Linux, and Windows. 
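To sanity-check an install, importing the package and confirming your key round-trips is usually enough — a minimal sketch using the `set_key` and `get_key` functions from the Authentication reference:

```python
import toolchest_client as tc

tc.set_key("YOUR_TOOLCHEST_KEY")  # or set the TOOLCHEST_KEY environment variable
print(tc.get_key())               # should print the key value now in use
```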
23 | 24 | 25 |
-------------------------------------------------------------------------------- /docs/docs/getting-started/python-functions-and-containers.md: -------------------------------------------------------------------------------- 1 | # Deploying Python Functions and Docker Images 2 | 3 | If you have custom Python functions that need more power, you can deploy those to the cloud using Toolchest as well! 4 | 5 | This tends to work well with: 6 | 7 | - custom command-line software that's packaged in a Docker image 8 | - packages that aren't on Toolchest yet 9 | - or cases where you just have a Python function that needs more power 10 | 11 | To do this, we recommend using [Lug](https://lug.dev), a fully open-source project that builds on top of Toolchest. 12 |
-------------------------------------------------------------------------------- /docs/docs/getting-started/running-bioinformatics-on-toolchest.md: -------------------------------------------------------------------------------- 1 | # Toolchest-wrapped Command-line Software 2 | 3 | Note: if you haven't already, make sure you [have an API key](https://trytoolchest.com) and Toolchest is installed! 4 | 5 | The most popular bioinformatics software is run through the command line. Toolchest wraps this software in Python and 6 | runs it on the cloud. 7 | 8 | ## A quick start 9 | 10 | To get started, we'll use STAR, but you can use any of the [packages supported by Toolchest](../tool-reference/about.md). 11 | On the command-line, running STAR looks like: 12 | 13 | ```shell 14 | STAR --outFileNamePrefix ./output_path --genomeDir ./database_GRCh38 --readFilesIn ./inputs/ 15 | ``` 16 | 17 | With Toolchest, it's: 18 | 19 | ```python 20 | import toolchest_client as tc 21 | 22 | tc.set_key("YOUR_KEY") 23 | 24 | tc.STAR( 25 | read_one="s3://toolchest-demo-data/SRR2557119_small.fastq", 26 | output_path="./output_path/", 27 | database_name="GRCh38", 28 | ) 29 | ``` 30 | 31 | and it runs in the cloud! Breaking down the arguments: 32 | 33 | - `read_one` is for input files. They can be on your computer, or somewhere else like S3. 34 | - `output_path` is where your output files are written. This can also be your computer, or somewhere else like S3. 35 | - `database_name` is the name of the Toolchest-hosted database. 36 | 37 | ## Adding more options 38 | 39 | ```python 40 | import toolchest_client as tc 41 | 42 | tc.set_key("YOUR_KEY") 43 | 44 | tc.STAR( 45 | read_one="s3://toolchest-demo-data/SRR2557119_small.fastq", 46 | output_path="./output/", 47 | database_name="GRCh38", 48 | database_version="1", 49 | tool_args="--outSAMtype BAM Unsorted" 50 | ) 51 | ``` 52 | 53 | We added two new arguments: 54 | - `database_version` is the version number of the Toolchest-hosted database. 55 | - `tool_args` are the arguments that you would normally set on the command-line to customize execution. 56 | 57 | Next, let's learn more about what kinds of files you can use with Toolchest. 58 |
-------------------------------------------------------------------------------- /docs/docs/getting-started/using-files.md: -------------------------------------------------------------------------------- 1 | # Using Files 2 | 3 | Toolchest works with files on your computer (local files) or files on something like S3 (remote files). We recommend 4 | using local or S3 files for data integrity and speed of execution, but HTTP or FTP URLs are supported too. 5 | 6 | For all tools and file types, `inputs` takes a string path or a list of paths. `output_path` always takes a directory 7 | path. 
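To make that concrete, here's a hedged sketch with the `test` tool showing a single-path `inputs`, a list of paths, and a directory `output_path` (file names are hypothetical):

```python
import toolchest_client as tc

# One input file; outputs land in the ./output/ directory
tc.test(inputs="./sample.fastq", output_path="./output/")

# Several input files, passed as a list of paths
tc.test(
    inputs=["./sample_R1.fastq", "./sample_R2.fastq"],
    output_path="./output/",
)
```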
8 | 9 | Let's take a look at what it looks like to use different types of local and remote paths! 10 | 11 | !!! note "You can mix and match file sources" 12 | You can mix and match local and remote files in the same call. Every file is handled independently, so you can use S3, 13 | FTP, and local files together. 14 | 15 | ## Local files and directories 16 | 17 | Local files are the most intuitive: you just pass normal paths directly to Toolchest. In the background, the files are 18 | transferred to and from the cloud. 19 | 20 | `inputs` takes paths to files and directories. 21 | 22 | `output_path` takes a path to a directory. Output files are written in this directory. 23 | 24 | ### Local directory inputs 25 | If a directory is passed, all files within the directory are used as input. Directory structure will be destroyed unless 26 | `compress_inputs=True` is provided as an argument. 27 | 28 | For example, if you have the following directory structure: 29 | ```text 30 | /path/to/base/directory/ 31 | subdirectory_one/ 32 | input.fastq 33 | subdirectory_two/ 34 | input.fastq 35 | info.txt 36 | ``` 37 | and you used the following Toolchest call: 38 | ```python 39 | tc.test( 40 | inputs="/path/to/base/directory/", 41 | compress_inputs=True 42 | ) 43 | ``` 44 | Then the input files will retain the directory structure without name conflicts. If `compress_inputs` is set to `False` 45 | or not provided, the two `input.fastq` files would collide, and whichever was transferred second would overwrite the other. 46 | 47 | ## Remote files 48 | 49 | ### AWS S3 50 | 51 | S3 files are the fastest and most reliable input source. Toolchest pulls directly from the path you pass. 52 | 53 | - `inputs` takes an S3 URI for a file. If you have multiple files in an S3 directory, make sure to list the directory first 54 | and pass each file as an input. 55 | - `output_path` accepts an S3 URI for an S3 prefix. 56 | 57 | Here's an example using the `test` package with an S3 input: 58 | ```python 59 | tc.test( 60 | inputs="s3://toolchest-public-examples/example.fastq", 61 | output_path="s3://toolchest-public-output/remote-output/" 62 | ) 63 | ``` 64 | 65 | !!! note "Make sure Toolchest has access to your S3 bucket" 66 | 67 | To grant Toolchest access, see [AWS Integration](../feature-reference/using-aws-with-toolchest.md). 68 | 69 | ### HTTP/HTTPS 70 | 71 | !!! warning "HTTP and HTTPS files are dangerous!" 72 | We can't guarantee data integrity on transfer, because different servers behave differently. Make sure that the HTTP 73 | server supports `GET` requests with the `range` header. Always use a local or S3 file path if possible. Ye be warned! 74 | 75 | - `inputs` takes an HTTP URL for a file. If you have multiple files in an HTTP directory, make sure to list the directory 76 | first, and pass each file as an input. 77 | - `output_path` does not accept HTTP outputs at this time. 78 | 79 | Here's an example using the `test` package with an HTTP input: 80 | ```python 81 | tc.test( 82 | inputs="https://rest.uniprot.org/uniprotkb/P48754.fasta", 83 | output_path="./" 84 | ) 85 | ``` 86 | 87 | ### FTP 88 | 89 | !!! warning "FTP files are dangerous!" 90 | We can't guarantee data integrity on transfer, because different servers behave differently. Always use a local or S3 91 | file path if possible. Ye be warned! 92 | 93 | - `inputs` accepts an FTP URL for a file. If you have multiple files in an FTP directory, make sure to list the 94 | directory first, and pass each file as an input. 95 | - `output_path` does not accept FTP outputs at this time. 
96 | 97 | Here's an example using the `test` package with an FTP input: 98 | ```python 99 | tc.test( 100 | inputs="ftp://ftp.sra.ebi.ac.uk/vol1/fastq//SRR999/000/SRR9990000/SRR9990000.fastq.gz", 101 | output_path="./" 102 | ) 103 | ``` -------------------------------------------------------------------------------- /docs/docs/images/toolchest_t.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/docs/docs/images/toolchest_t.png -------------------------------------------------------------------------------- /docs/docs/index.md: -------------------------------------------------------------------------------- 1 | # Toolchest 2 | 3 | If you're ready to start building, head straight to [Installation](getting-started/installation.md), 4 | [Running Bioinformatics Packages with Toolchest](./getting-started/running-bioinformatics-on-toolchest.md), or 5 | [Custom Python Functions and Containers](./getting-started/python-functions-and-containers.md). 6 | 7 | ## What does Toolchest do? 8 | 9 | Toolchest is an open source library for running computational biology software in the cloud. For software that has 10 | reference databases, Toolchest comes with pre-built reference DBs on our high-speed cloud database store – or you can 11 | add your own. 12 | 13 | Toolchest handles input and output file transfer as well as cloud resource provisioning. That means you can use the 14 | Toolchest library from anywhere you write Python, including Jupyter notebooks or a Python function – on your computer or 15 | in the cloud. 16 | 17 | ## Who should use Toolchest? 18 | 19 | If you: 20 | 21 | - use bioinformatics software that runs on the command line, but you write code in Python 22 | - have functions that need more resources than your laptop, but you don't want to manage your own cloud infrastructure 23 | - handle a lot of data 24 | 25 | then you should try Toolchest! 26 | 27 | ## What doesn't Toolchest solve? 28 | 29 | - Pipelining (see Prefect, Dagster, Nextflow, or Snakemake) 30 | - Data versioning or management 31 | 32 | ## Why Toolchest? 33 | 34 | - You can scale instantly with Toolchest; Toolchest is built on top of AWS 35 | - You don't need an AWS account! Toolchest jobs run in our own AWS account by default 36 | - Cloud resources are spun up and down immediately, maximizing efficiency and reducing idling resources 37 | -------------------------------------------------------------------------------- /docs/docs/tool-reference/about.md: -------------------------------------------------------------------------------- 1 | # About the Tool Reference 2 | 3 | This section contains documentation for the core "tools" that make up Toolchest: the aligners, assemblers, classifiers, 4 | and other software that Toolchest wraps. -------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners.md: -------------------------------------------------------------------------------- 1 | # Aligners 2 | 3 | Aligners find the similarity between two or more sequences. Sometimes, query sequences are compared against a reference 4 | database in a sort of fuzzy search (e.g. [Bowtie 2](aligners/bowtie-2.md)). In other contexts, several query sequences 5 | are compared against one another (e.g. [Clustal Omega](aligners/clustal-omega.md)). 
6 | 7 | Most aligners are tailored for specific types of data: [STAR](aligners/star.md) for single-cell RNA-Seq, 8 | [DIAMOND BLASTP](aligners/diamond/diamond-blastp.md) for protein sequences against a protein database, and 9 | [DIAMOND BLASTX](aligners/diamond/diamond-blastx.md) for translated nucleotide sequences against a protein database. 10 | 11 | Toolchest hosts both the aligner and the reference databases, and you can also 12 | [use your own custom database](../feature-reference/adding-and-updating-custom-databases.md). 13 | 14 | If you don't need the extra information that aligners return – e.g. for some microbiome taxonomic classification – 15 | you can also use a more efficient [classifier](taxonomic-classifiers.md). 16 | 17 | If you want to use an aligner that's not listed here, [let us know](https://airtable.com/shrNBkD0bG2wB15jQ)! It might 18 | already be available on our infrastructure but not documented.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/clustal-omega.md: -------------------------------------------------------------------------------- 1 | **Clustal Omega** is a fast and scalable tool that makes multiple sequence alignments of protein sequences. For more 2 | information, see the tool's [homepage](http://www.clustal.org/omega/). 3 | 4 | Function Call 5 | ============= 6 | 7 | ```python 8 | tc.clustalo( 9 | inputs, 10 | output_path=None, 11 | tool_args="", 12 | is_async=False, 13 | ) 14 | ``` 15 | 16 | Function Arguments 17 | ------------------ 18 | 19 | See the Notes section below for more details. 20 | 21 | | Argument | Use in place of: | Description | 22 | | :------------ | :------------------ | :------------------ | 23 | | `inputs` | `-i` | Path to one or more files to use as input. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 24 | | `output_path` | `-o` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 25 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to Clustal Omega. This should be a string of arguments like the command line. See [Supported Additional Arguments](#supported-additional-arguments) for more details. | 26 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. | 27 | 28 | Tool Versions 29 | ============= 30 | 31 | Toolchest currently supports version **1.2.4** of Clustal Omega. 32 | 33 | Supported Additional Arguments 34 | ============================== 35 | 36 | - `--auto` 37 | - `--dealign` 38 | - `--infmt` 39 | - `--is-profile` 40 | - `--iter` 41 | - `--iterations` 42 | - `--max-guidetree-iterations` 43 | - `--max-hmm-iterations` 44 | - `--maxnumseq` 45 | - `--maxseqlen` 46 | - `--outfmt` 47 | - `--output-order` 48 | - `--residuenumber` 49 | - `--resno` 50 | - `--seqtype` 51 | - `-t` 52 | - `--wrap` 53 | 54 | Additional arguments can be specified under the `tool_args` argument. 
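As a worked example, here's a hedged sketch of a Clustal Omega call (the input path is hypothetical; `--outfmt` comes from the supported-arguments list above):

```python
import toolchest_client as tc

tc.clustalo(
    inputs="./protein_sequences.fasta",  # hypothetical input FASTA
    output_path="./clustalo_output/",
    tool_args="--outfmt clustal",        # any supported argument from the list above
)
```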
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/diamond.md: -------------------------------------------------------------------------------- 1 | DIAMOND is an aligner for protein and translated DNA sequences. For more information, see the tool's 2 | [GitHub repo and wiki](https://github.com/bbuchfink/diamond). 3 | 4 | DIAMOND has two modes available with Toolchest: **BLASTP** (`diamond blastp`) and **BLASTX** (`diamond blastx`). 5 | Each mode has its own function call (`diamond_blastp` and `diamond_blastx`, respectively). See the relevant subpage for 6 | in-depth documentation: 7 | 8 | - [DIAMOND BLASTP](diamond/diamond-blastp.md) 9 | - [DIAMOND BLASTX](diamond/diamond-blastx.md)
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/diamond/diamond-blastp.md: -------------------------------------------------------------------------------- 1 | **DIAMOND BLASTP** is [DIAMOND](../diamond.md)'s mode for protein sequence searches. For more information, see the tool's [GitHub repo and wiki](https://github.com/bbuchfink/diamond). 2 | 3 | # Function Call 4 | 5 | ```python 6 | tc.diamond_blastp( 7 | inputs, 8 | output_path=None, 9 | database_name="diamond_blastp_standard", 10 | database_version="1", 11 | remote_database_path=None, 12 | remote_database_primary_name=None, 13 | tool_args="", 14 | is_async=False, 15 | ) 16 | ``` 17 | 18 | ## Function Arguments 19 | 20 | See the Notes section below for more details. 21 | 22 | | Argument | Use in place of: | Description | 23 | | :----------------------------- | :------------------ | :------------------ | 24 | | `inputs` | `-q`, `--query` | Path to one or more files to use as input. FASTA or FASTQ formats are supported, as well as gzip-compressed FASTA/FASTQ files. The files can be local or remote; see [Using Files](../../../getting-started/using-files.md). | 25 | | `output_path` | `-o`, `--out` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../../getting-started/using-files.md). | 26 | | `database_name` | `-d` | (optional) Name of database to use for DIAMOND BLASTP. Defaults to `"diamond_blastp_standard"`, the SeqScreen database. | 27 | | `database_version` | database version | (optional) Version of database to use for DIAMOND BLASTP. Defaults to `"1"`. | 28 | | `remote_database_path` | `-d` (path) | (optional) AWS S3 URI to the directory that contains your custom database. | 29 | | `remote_database_primary_name` | `-d` (name) | (optional) The primary name (e.g. UNIREF100.mini) of your custom database. | 30 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to DIAMOND BLASTP. This should be a string of arguments like the command line. | 31 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../../feature-reference/async-runs.md) for more. | 32 | 33 | DIAMOND BLASTP runs are aligned against the SeqScreen database by default. See the [Databases](#databases) section for more details. 34 | 35 | # Tool Versions 36 | 37 | Toolchest currently supports version **2.0.14** of DIAMOND. 
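For reference, a minimal DIAMOND BLASTP sketch against the default database (the input path is hypothetical; `--sensitive` is one of the supported arguments listed below):

```python
import toolchest_client as tc

tc.diamond_blastp(
    inputs="./query_proteins.fasta",  # hypothetical protein FASTA
    output_path="./blastp_output/",
    tool_args="--sensitive",
)
```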
38 | 39 | # Databases 40 | 41 | Toolchest currently supports the following databases for DIAMOND BLASTP: 42 | 43 | | `database_name` | `database_version` | Description | 44 | | :------------------------ | :----------------- | :----------------- | 45 | | `diamond_blastp_standard` | `1` | SeqScreen DIAMOND BLASTP Database. See [the SeqScreen wiki](https://gitlab.com/treangenlab/seqscreen/-/wikis/02.-SeqScreen-Dependencies) for more details. | 46 | 47 | # Supported Additional Arguments 48 | 49 | - `-f` 50 | - `--fast` 51 | - `-l` 52 | - `--mid-sensitive` 53 | - `--min-orf` 54 | - `--more-sensitive` 55 | - `--no-self-hits` 56 | - `--outfmt` 57 | - `--sallseqid` 58 | - `--salltitles` 59 | - `--sensitive` 60 | - `--strand` 61 | - `--ultra-sensitive` 62 | - `--unal` 63 | - `--very-sensitive` 64 | 65 | Additional arguments can be specified under the `tool_args` argument.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/rapsearch2.md: -------------------------------------------------------------------------------- 1 | **RAPSearch2** is an aligner for protein similarity searches. It aligns DNA reads or protein sequences against a 2 | protein database. For more information, see the tool's [homepage](https://omics.informatics.indiana.edu/mg/RAPSearch2/), 3 | [GitHub repo](https://github.com/zhaoyanswill/RAPSearch2), and [Sourceforge page](http://rapsearch2.sourceforge.net/). 4 | 5 | Function Call 6 | ============= 7 | 8 | ```python 9 | tc.rapsearch2( 10 | inputs, 11 | output_path=None, 12 | database_name="rapsearch2_seqscreen", 13 | database_version="1", 14 | tool_args="", 15 | is_async=False, 16 | ) 17 | ``` 18 | 19 | Function Arguments 20 | ------------------ 21 | 22 | See the Notes section below for more details. 23 | 24 | | Argument | Use in place of: | Description | 25 | | :----------------- | :------------------ | :------------------ | 26 | | `inputs` | `-q` | Path to one or more files to use as input. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 27 | | `output_path` | `-o` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 28 | | `database_name` | `-d`\* | (optional) Name of database to use for RAPSearch2 alignment. Defaults to `"rapsearch2_seqscreen"`. | 29 | | `database_version` | `-d`\* | (optional) Version of database to use for RAPSearch2 alignment. Defaults to `"1"`. | 30 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to RAPSearch2. This should be a string of arguments like the command line. See [Supported Additional Arguments](#supported-additional-arguments) for more details. | 31 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. | 32 | 33 | \*See the [Databases](#databases) section for more details. 34 | 35 | Tool Versions 36 | ============= 37 | 38 | Toolchest currently supports version **2.24** of RAPSearch2. 
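Likewise, a minimal RAPSearch2 sketch (the input path is hypothetical; `-e` appears in the supported arguments listed below, and is assumed here to take a log10 E-value threshold per the RAPSearch2 docs):

```python
import toolchest_client as tc

tc.rapsearch2(
    inputs="./reads.fasta",              # hypothetical query reads
    output_path="./rapsearch2_output/",
    tool_args="-e -6",                   # assumed log10(E-value) cutoff flag
)
```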
39 | 40 | Databases 41 | ========= 42 | 43 | Toolchest currently supports the following databases for RAPSearch2: 44 | 45 | | `database_name` | `database_version` | Description | 46 | | :--------------------- | :----------------- | :----------------- | 47 | | `rapsearch2_seqscreen` | `1` | SeqScreen RAPSearch2 Database. See [the SeqScreen wiki](https://gitlab.com/treangenlab/seqscreen/-/wikis/02.-SeqScreen-Dependencies) for more details. | 48 | 49 | Supported Additional Arguments 50 | ============================== 51 | 52 | - `-a` 53 | - `-b` 54 | - `-e` 55 | - `-g` 56 | - `-i` 57 | - `-l` 58 | - `-p` 59 | - `-s` 60 | - `-t` 61 | - `-v` 62 | - `-w` 63 | - `-x` 64 | 65 | Additional arguments can be specified under the `tool_args` argument.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/aligners/salmon.md: -------------------------------------------------------------------------------- 1 | **Salmon** is a computational genomics tool for transcriptomic analysis. For more information, see the tool's 2 | [GitHub repo](https://github.com/COMBINE-lab/salmon). Toolchest only supports running `salmon quant` with pre-built 3 | indexes in mapping mode at this time. 4 | 5 | # Function Call 6 | 7 | ```python 8 | tc.salmon( 9 | read_one=None, 10 | read_two=None, 11 | single_end=None, 12 | output_path=None, 13 | tool_args="", 14 | database_name="salmon_hg38", 15 | database_version="1", 16 | library_type="A", 17 | is_async=False, 18 | ) 19 | ``` 20 | 21 | ## Function Arguments 22 | 23 | | Argument | Use in place of: | Description | 24 | | :----------------- | :------------------- | :------------------- | 25 | | `read_one` | `-1` | (optional) Path or list of paths to R1 of paired-end read input files. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 26 | | `read_two` | `-2` | (optional) Path or list of paths to R2 of paired-end read input files. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 27 | | `single_end` | `-r` | (optional) Path or list of paths to single-end (or just R1 or R2) read input files. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 28 | | `output_path` | output file location | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 29 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to Salmon. This should be a string of arguments like the command line. | 30 | | `database_name` | `-i` | (optional) Name of database to use for Salmon alignment. Defaults to `"salmon_hg38"`. | 31 | | `database_version` | `-i` | (optional) Version of database to use for Salmon alignment. Defaults to `"1"`. | 32 | | `library_type` | `-l`, `--libType` | (optional) The library type used. Defaults to "A" for automatic classification. See [the Salmon docs on library types](https://salmon.readthedocs.io/en/latest/salmon.html#what-s-this-libtype) for more. | 33 | | `is_async` | | Whether to run a job asynchronously. 
See [Async Runs](../../feature-reference/async-runs.md) for more. | 34 | 35 | See the [Databases](#databases) section for more details. 36 | 37 | ## Notes 38 | 39 | ### Paired-end inputs 40 | 41 | Paired-end read inputs can be set with either `inputs` or through `read_one` and `read_two`. 42 | 43 | Make sure that the first item in `read_one` corresponds to the first item in `read_two` – and so on. 44 | 45 | If you only have one end of a paired-end run, use the `single_end` argument. 46 | 47 | # Tool Versions 48 | 49 | Toolchest currently supports version **1.9.0** of Salmon. 50 | 51 | # Databases 52 | 53 | Toolchest currently supports the following databases for Salmon: 54 | 55 | | `database_name` | `database_version` | Description | 56 | | :-------------- | :----------------- | :----------------- | 57 | | `hg38` | `1` | hg38 precomputed index for Salmon. | 58 | 59 | # Other modes 60 | 61 | Only `quant` mode is supported at this time.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/all-other-tools.md: -------------------------------------------------------------------------------- 1 | Before any tool lands in our documentation, we add [publicly exposed integration tests](https://github.com/trytoolchest/toolchest-client-python/tree/main/tests) to ensure data quality. 2 | 3 | If you want to use a tool not listed here, please [let us know](https://airtable.com/shrNBkD0bG2wB15jQ)! Some are in private beta as we test and verify their functionality, and we're rapidly adding more.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/assemblers.md: -------------------------------------------------------------------------------- 1 | Genome assemblers take partial sequences of a genome and assemble them to form a larger contiguous sequence – ideally, the whole genome. 2 | 3 | The best assemblers work with both short reads (e.g. Illumina) and long reads (e.g. Oxford Nanopore) to quickly assemble a genome (e.g. [Unicycler](assemblers/unicycler.md)). 4 | 5 | If you want to use an assembler that's not listed here, [let us know](https://airtable.com/shrNBkD0bG2wB15jQ)! It might even already be available on our infrastructure but not listed.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/assemblers/megahit.md: -------------------------------------------------------------------------------- 1 | **MEGAHIT** is an assembler that's optimized for metagenomes. For more information, see the tool's 2 | [GitHub repo and wiki](https://github.com/voutcn/megahit). 3 | 4 | Function Call 5 | ============= 6 | 7 | ```python 8 | tc.megahit( 9 | read_one=None, 10 | read_two=None, 11 | interleaved=None, 12 | single_end=None, 13 | output_path=None, 14 | tool_args="", 15 | is_async=False, 16 | ) 17 | ``` 18 | 19 | Function Arguments 20 | ------------------ 21 | 22 | See the Notes section below for more details. 23 | 24 | | Argument | Use in place of: | Description | 25 | | :------------ | :------------------ | :------------------ | 26 | | `read_one` | `-1` | (optional) Path to R1 of paired-end short read input files. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). 
| 27 | | `read_two` | `-2` | (optional) Path to R2 of paired-end short read input files. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 28 | | `interleaved` | `--12` | (optional) Path to the file containing interleaved reads. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 29 | | `single_end` | `-r` | (optional) Path to the file containing single-end reads. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 30 | | `output_path` | `-o` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 31 | | `tool_args` | all other arguments | (optional) A string containing additional arguments to be passed to MEGAHIT, formatted as if using the command line. | 32 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. | 33 | 34 | Notes 35 | ----- 36 | 37 | ### Paired-end reads 38 | 39 | For each paired-end input, make sure the corresponding read is in the same position in the input list. For example, two 40 | pairs of paired-end files – `one_R1.fastq`, `one_R2.fastq`, `two_R1.fastq`, `two_R2.fastq` – should be passed to 41 | Toolchest as: 42 | 43 | ```python 44 | tc.megahit( 45 | read_one=["one_R1.fastq", "two_R1.fastq"], 46 | read_two=["one_R2.fastq", "two_R2.fastq"], 47 | ... 48 | ) 49 | ``` 50 | 51 | Tool Versions 52 | ============= 53 | 54 | Toolchest currently supports version **1.2.9** of MEGAHIT. 55 | 56 | Supported Additional Arguments 57 | ============================== 58 | 59 | - \--min-count 60 | - \--k-list 61 | - \--k-min 62 | - \--k-max 63 | - \--k-step 64 | - \--no-mercy 65 | - \--bubble-level 66 | - \--merge-level 67 | - \--prune-level 68 | - \--prune-depth 69 | - \--disconnect-ratio 70 | - \--low-local-ratio 71 | - \--max-tip-len 72 | - \--cleaning-rounds 73 | - \--no-local 74 | - \--kmin-1pass 75 | - \--presets 76 | - \--min-contig-len 77 | 78 | Set additional arguments with `tool_args`. For example: `tool_args="--no-local --no-mercy"`
-------------------------------------------------------------------------------- /docs/docs/tool-reference/assemblers/unicycler.md: -------------------------------------------------------------------------------- 1 | **Unicycler** is an assembly pipeline for bacterial genomes. For more information, see the tool's 2 | [GitHub repo and wiki](https://github.com/rrwick/Unicycler). 3 | 4 | Function Call 5 | ============= 6 | 7 | ```python 8 | tc.unicycler( 9 | read_one=None, 10 | read_two=None, 11 | long_reads=None, 12 | output_path=None, 13 | tool_args="", 14 | is_async=False, 15 | ) 16 | ``` 17 | 18 | Function Arguments 19 | ------------------ 20 | 21 | | Argument | Use in place of: | Description | 22 | | :------------ | :------------------ | :------------------ | 23 | | `read_one` | `-1` | (optional) Path to R1 of paired-end short read input files. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 24 | | `read_two` | `-2` | (optional) Path to R2 of paired-end short read input files. 
The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 25 | | `long_reads` | `-l` | (optional) Path to the file containing long reads. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). | 26 | | `output_path` | `-o` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). | 27 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to Unicycler. This should be a string of arguments like the command line. See [Supported Additional Arguments](#supported-additional-arguments) for more details. | 28 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. | 29 | 30 | Notes 31 | ----- 32 | 33 | ### Paired-end reads 34 | 35 | Paired-end short read inputs should be specified with both `read_one` and `read_two`. 36 | 37 | Tool Versions 38 | ============= 39 | 40 | Toolchest currently supports version **0.4.9** of Unicycler. 41 | 42 | Supported Additional Arguments 43 | ============================== 44 | 45 | - `--depth_filter` 46 | - `--kmer_count` 47 | - `--kmers` 48 | - `--largest_component` 49 | - `--linear_seqs` 50 | - `--low_score` 51 | - `--max_kmer_frac` 52 | - `--min_component_size` 53 | - `--min_dead_end_size` 54 | - `--min_fasta_length` 55 | - `--min_kmer_frac` 56 | - `--min_polish_size` 57 | - `--mode` 58 | - `--no_correct` 59 | - `--no_miniasm` 60 | - `--no_pilon` 61 | - `--no_rotate` 62 | - `--scores` 63 | - `--start_gene_cov` 64 | - `--start_gene_id` 65 | - `--verbosity` 66 | 67 | Additional arguments can be specified under the `tool_args` argument.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/demultiplexers.md: -------------------------------------------------------------------------------- 1 | Demultiplexing is extracting components from something that's all mixed together. It's like taking a rope and unwinding 2 | each individual thread. 3 | 4 | Sometimes, this means using a straightforward tool like `bcl2fastq` to generate FASTQs from raw Illumina NGS reads. 5 | Just for fun, we've also included an ML-based tool called [Demucs](demultiplexers/demucs.md) to separate song tracks.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/demultiplexers/demucs.md: -------------------------------------------------------------------------------- 1 | **demucs** is a demultiplexing tool for audio source separation. To learn more about the tool, check out its 2 | [GitHub repo](https://github.com/facebookresearch/demucs). 3 | 4 | Function Call 5 | ============= 6 | 7 | ```python 8 | tc.demucs( 9 | inputs, 10 | output_path=None, 11 | tool_args="", 12 | is_async=False, 13 | ) 14 | ``` 15 | 16 | Function Arguments 17 | ------------------ 18 | 19 | See the Notes section below for more details. 20 | 21 | | Argument | Use in place of: | Description | 22 | | :------------ | :------------------ | :------------------ | 23 | | `inputs` | `--input` | Path to a file that will be passed in as input. All formats supported by `ffmpeg` are allowed. 
-------------------------------------------------------------------------------- /docs/docs/tool-reference/demultiplexers.md: --------------------------------------------------------------------------------
1 | Demultiplexing is extracting components from something that's all mixed together. It's like taking a rope and unwinding
2 | each individual thread.
3 | 
4 | Sometimes, this means using a straightforward tool like `bcl2fastq` to generate FASTQs from raw Illumina NGS reads.
5 | Just for fun, we've also included an ML-based tool called [Demucs](demultiplexers/demucs.md) to separate song tracks.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/demultiplexers/demucs.md: --------------------------------------------------------------------------------
1 | **demucs** is a demultiplexing tool for audio source separation. To learn more about the tool, check out its
2 | [GitHub repo](https://github.com/facebookresearch/demucs).
3 | 
4 | Function Call
5 | =============
6 | 
7 | ```python
8 | tc.demucs(
9 |     inputs,
10 |     output_path=None,
11 |     tool_args="",
12 |     is_async=False,
13 | )
14 | ```
15 | 
16 | Function Arguments
17 | ------------------
18 | 
19 | See the table below for details on each argument.
20 | 
21 | | Argument | Use in place of: | Description |
22 | | :------------ | :------------------ | :---------- |
23 | | `inputs` | `--input` | Path to a file that will be passed in as input. All formats supported by `ffmpeg` are allowed. The file can be local or remote; see [Using Files](../../getting-started/using-files.md). |
24 | | `output_path` | `--output` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The path can be local or remote; see [Using Files](../../getting-started/using-files.md). |
25 | | `tool_args` | all other arguments | (optional) A string containing additional arguments to be passed to Demucs, formatted as if using the command line. |
26 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. |
27 | 
28 | Tool Versions
29 | =============
30 | 
31 | Toolchest supports version **3.0.4** of Demucs.
32 | 
33 | Supported Additional Arguments
34 | ==============================
35 | 
36 | - \-v
37 | - \--verbose
38 | - \--shifts
39 | - \--overlap
40 | - \--no-split
41 | - \--two-stems
42 | - \--int24
43 | - \--float32
44 | - \--clip-mode
45 | - \--mp3
46 | - \--mp3-bitrate
47 | - \-n
48 | 
49 | Set additional arguments with `tool_args`. For example: `tool_args="-n mdx_extra --shifts=5"`
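50 | 
51 | As a minimal sketch, separating a track into vocal and instrumental stems might look like this (the input and output paths are placeholders):
52 | 
53 | ```python
54 | import toolchest_client as tc
55 | 
56 | tc.demucs(
57 |     inputs="./my_song.mp3",
58 |     output_path="./demucs_output",
59 |     tool_args="--two-stems vocals --mp3",
60 | )
61 | ```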
-------------------------------------------------------------------------------- /docs/docs/tool-reference/post-processing.md: --------------------------------------------------------------------------------
1 | Some tools modify your raw data, but you still need another tool to bring the data to a usable state. We call that a
2 | post-processing tool.
3 | 
4 | One example is [Kraken 2](taxonomic-classifiers/kraken-2.md), often used with [Bracken](post-processing/bracken.md) for
5 | post-processing.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/pre-processing.md: --------------------------------------------------------------------------------
1 | Some tools can check data integrity or transform data before use in another tool. We call that a pre-processing tool.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/pre-processing/fastqc.md: --------------------------------------------------------------------------------
1 | **FastQC** is a quality control tool for genomic sequence data. [See their website for more details](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
2 | 
3 | # Function Call
4 | 
5 | ```python
6 | tc.fastqc(
7 |     inputs,
8 |     output_path=None,
9 |     contaminants=None,
10 |     adapters=None,
11 |     limits=None,
12 |     tool_args="",
13 |     is_async=False,
14 | )
15 | ```
16 | 
17 | ## Function Arguments
18 | 
19 | 
20 | | Argument | Use in place of: | Description |
21 | | :------- | :--------------- | :---------- |
22 | | `inputs` | input file location | Path to one or more files to use as input. SAM, BAM, or FASTQ formats are supported, as well as gzip-compressed variants. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). |
23 | | `output_path` | `-o` (directory name) | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The path can be local or remote; see [Using Files](../../getting-started/using-files.md). |
24 | | `contaminants` | `-c` or `--contaminants` file path | (optional) Path to a custom contaminants file. |
25 | | `adapters` | `-a` or `--adapters` file path | (optional) Path to a custom adapters file. |
26 | | `limits` | `-l` or `--limits` file path | (optional) Path to a custom limits file. |
27 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to FastQC, as a string formatted like the command line. |
28 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. |
29 | 
30 | 
31 | 
32 | ## Output Files
33 | 
34 | A FastQC run will output the HTML report and a zip archive into `output_path`:
35 | 
36 | - `{input file name}_fastqc.html`: FastQC HTML report for checking data quality
37 | - `{input file name}_fastqc.zip`: Zip archive containing the HTML report and some supporting files
38 | 
39 | # Tool Versions
40 | 
41 | Toolchest currently supports version **0.11.9** of FastQC.
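42 | 
43 | As a minimal sketch, a basic quality-control run might look like this (the input and output paths are placeholders):
44 | 
45 | ```python
46 | import toolchest_client as tc
47 | 
48 | tc.fastqc(
49 |     inputs="s3://your-bucket/sample.fastq.gz",
50 |     output_path="./fastqc_output",
51 | )
52 | ```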
-------------------------------------------------------------------------------- /docs/docs/tool-reference/structure-prediction.md: --------------------------------------------------------------------------------
1 | Structure prediction takes an input protein sequence and predicts its three-dimensional structure.
2 | 
3 | Recent advances – including [AlphaFold](structure-prediction/alphafold.md) – rely on deep learning, using
4 | GPUs and terabytes of pre-built reference databases to predict 3D structure.
5 | 
6 | AlphaFold 2 is one of the few structure prediction tools that's hosted on Toolchest, but
7 | [let us know](https://airtable.com/shrNBkD0bG2wB15jQ) if there's another tool that you'd like to use!
-------------------------------------------------------------------------------- /docs/docs/tool-reference/structure-prediction/alphafold.md: --------------------------------------------------------------------------------
1 | AlphaFold is a deep learning tool for predicting a protein’s 3D structure from its amino acid sequence. It was
2 | developed by DeepMind and utilizes GPU compute. For more information, see the tool's
3 | [homepage](https://alphafold.ebi.ac.uk/) and [GitHub repo](https://github.com/deepmind/alphafold).
4 | 
5 | Function Call
6 | =============
7 | 
8 | ```python
9 | tc.alphafold(
10 |     inputs,
11 |     output_path=None,
12 |     model_preset=None,
13 |     max_template_date=None,
14 |     use_reduced_dbs=False,
15 |     is_prokaryote_list=None,
16 |     is_async=False,
17 | )
18 | ```
19 | 
20 | Function Arguments
21 | ------------------
22 | 
23 | See the table below for details on each argument.
24 | 
25 | | Argument | Use in place of: | Description |
26 | | :------- | :--------------- | :---------- |
27 | | `inputs` | `--fasta_paths` | Path to one or more files to use as input. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). |
28 | | `output_path` | `--output_dir` | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The path can be local or remote; see [Using Files](../../getting-started/using-files.md). |
29 | | `model_preset` | `--model_preset` | (optional) Specific AlphaFold model to use. Options are [`monomer`, `monomer_casp14`, `monomer_ptm`, `multimer`]. Defaults to `monomer`. |
30 | | `max_template_date` | `--max_template_date` | (optional) String of a date in YYYY-MM-DD format. Restricts protein structure prediction to templates in the database from before this date. Defaults to today's date. |
31 | | `use_reduced_dbs` | `--db_preset=reduced_dbs` | (optional) Whether to use a smaller version of the BFD database. If true, reduces run time at the cost of result quality. |
32 | | `is_prokaryote_list` | `--is_prokaryote_list` | (optional) List of booleans that determines whether all input sequences in the given FASTA file are prokaryotic. Expects the string normally used as input to AlphaFold (e.g. "true,true" if there are two prokaryote inputs). |
33 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. |
34 | 
35 | Tool Versions
36 | =============
37 | 
38 | Toolchest currently supports version **2.1.2** of AlphaFold.
39 | 
40 | Database
41 | ========
42 | 
43 | Toolchest's implementation of AlphaFold uses AlphaFold's required genetic sequence databases. For a complete list of databases used, see the tool's [GitHub page](https://github.com/deepmind/alphafold).
44 | 
45 | Supported Additional Arguments
46 | ==============================
47 | 
48 | Toolchest supports the following arguments for AlphaFold:
49 | 
50 | - `--db_preset`
51 | - `--is_prokaryote_list`
52 | - `--max_template_date`
53 | - `--model_preset`
54 | 
55 | However, these should be specified via the dedicated arguments in the function call, rather than through a generic `tool_args` argument as with other Toolchest tools. See [Function Arguments](#function-arguments) for more details.
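56 | 
57 | As a minimal sketch, predicting a single monomer structure with the reduced databases might look like this (the input and output paths are placeholders):
58 | 
59 | ```python
60 | import toolchest_client as tc
61 | 
62 | tc.alphafold(
63 |     inputs="./protein.fasta",
64 |     output_path="./alphafold_output",
65 |     model_preset="monomer",
66 |     max_template_date="2022-01-01",
67 |     use_reduced_dbs=True,
68 | )
69 | ```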
-------------------------------------------------------------------------------- /docs/docs/tool-reference/taxonomic-classifiers.md: --------------------------------------------------------------------------------
1 | Taxonomic classifiers perform a fuzzy search between input sequences and reference databases. A classic use-case is
2 | determining the relative abundance of a microbiome sample.
3 | 
4 | If you only need relative abundance, you can use a taxonomic profiler, which returns just that. For a
5 | more detailed view, you can use a classifier like [Kraken 2](taxonomic-classifiers/kraken-2.md).
6 | 
7 | Toolchest hosts both the taxonomic classifier and the corresponding reference databases, and you can also
8 | [use your own custom database](../feature-reference/adding-and-updating-custom-databases.md).
9 | 
10 | Typically, taxonomic classifiers are more efficient than aligners for taxonomic classification, but most are based on
11 | heuristic methods rather than optimal alignment scores. If you're looking for something more analogous to BLAST, check
12 | out [aligners](aligners.md).
13 | 
14 | If you want to use a taxonomic classifier that's not listed here, let us know! It might already be available on
15 | our infrastructure but not listed.
-------------------------------------------------------------------------------- /docs/docs/tool-reference/taxonomic-classifiers/metaphlan.md: --------------------------------------------------------------------------------
1 | **MetaPhlAn** is a tool for profiling the composition of microbial communities. For more information, see the tool's
2 | [website](https://huttenhower.sph.harvard.edu/metaphlan/) or
3 | [GitHub wiki](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.0).
4 | 
5 | # Function Call
6 | 
7 | ```python
8 | tc.metaphlan(
9 |     inputs,
10 |     output_path=None,
11 |     output_primary_name="out.txt",
12 |     tool_args="",
13 |     is_async=False,
14 | )
15 | ```
16 | 
17 | ## Function Arguments
18 | 
19 | | Argument | Use in place of: | Description |
20 | | :------- | :--------------- | :---------- |
21 | | `inputs` | input file location | Path to one or more files to use as input. The files can be local or remote; see [Using Files](../../getting-started/using-files.md). |
22 | | `output_path` | output file location | (optional) Path (directory) to where the output files will be downloaded. If omitted, skips download. The path can be local or remote; see [Using Files](../../getting-started/using-files.md). |
23 | | `tool_args` | all other arguments | (optional) Additional arguments to be passed to MetaPhlAn, as a string formatted like the command line. |
24 | | `output_primary_name` | | (optional) Sets the name of the main output file. Defaults to "out.txt". |
25 | | `is_async` | | Whether to run a job asynchronously. See [Async Runs](../../feature-reference/async-runs.md) for more. |
26 | 
27 | See the [Databases](#databases) section for more details.
28 | 
29 | ## Output Files
30 | 
31 | A MetaPhlAn run will output 2 files into `output_path`:
32 | 
33 | - `out.txt`: Results of the MetaPhlAn run.
34 | - `{input_file_name}.bowtie2out.txt`: The intermediate Bowtie 2 output file generated by MetaPhlAn. This can be passed back in as input to quickly rerun with the same data. It is not generated if `--no-map` is passed via `tool_args`.
35 | 
36 | # Tool Versions
37 | 
38 | Toolchest currently supports version **3.0.14** of MetaPhlAn.
39 | 
40 | # Databases
41 | 
42 | Toolchest currently supports the latest version of the `mpa_v30_CHOCOPhlAn_201901_marker_info` database. You can read more about the database on the [GitHub wiki](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-3.0).
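43 | 
44 | As a minimal sketch, profiling a single metagenomic sample might look like this (the input and output paths are placeholders):
45 | 
46 | ```python
47 | import toolchest_client as tc
48 | 
49 | tc.metaphlan(
50 |     inputs="s3://your-bucket/sample.fastq.gz",
51 |     output_path="./metaphlan_output",
52 |     output_primary_name="profile.txt",
53 | )
54 | ```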
It runs in the background, 6 | meaning you don't need to keep your laptop or server running during transfer. 7 | 8 | # Function Call 9 | 10 | ```python 11 | tc.transfer( 12 | inputs, 13 | output_path=None, 14 | is_async=True, 15 | ) 16 | ``` 17 | 18 | ## Function Arguments 19 | 20 | 21 | | Argument | Description | 22 | | :------------ |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| 23 | | `inputs` | Path to a file that will be passed in as input. All formats supported by `ffmpeg` are allowed. The files can be a local or remote, see [Using Files](../getting-started/using-files.md). | 24 | | `output_path` | (optional) Path (directory) to where the output files will be downloaded. The path can be a local or remote, see [Using Files](../getting-started/using-files.md). | 25 | | `is_async` | Whether to run the job asynchronously. By default, this is true. If you set this to false, the Toolchest command will wait to exit until the transfer is complete. See [Async Runs](../feature-reference/async-runs.md) for more. | 26 | -------------------------------------------------------------------------------- /docs/docs/tool-reference/workflows-meta-tools.md: -------------------------------------------------------------------------------- 1 | Workflow tools – or meta-tools – wrap several tools to form a unified pipeline. These workflow tools are usually 2 | focused specific area like microbiome or single-cell analysis, but they wrap more generic tools under the hood. 3 | 4 | They're popular, because they're easy to use; there's no need to make your own choices on aligners, assemblers, or 5 | classifiers, and the tools the workflow creator chooses are often pre-tuned for one specific use-case. 6 | 7 | [HUMAnN](workflows-meta-tools/humann3.md) is a perfect example of these types of meta-tools. Under the hood, it uses: 8 | 9 | - [Bowtie 2](aligners/bowtie-2.md) 10 | - [Diamond](aligners/diamond.md) 11 | - [Python 3](python3.md) 12 | - [RAPSearch2](aligners/rapsearch2.md) 13 | - [MetaPhlAn 3](taxonomic-classifiers/metaphlan.md) 14 | 15 | and several other tools, all wrapped under the `humann` command. 16 | 17 | When using workflow tools via Toolchest, you'll notice a new argument: `mode`. This lets you run sub-tools directly. 
-------------------------------------------------------------------------------- /docs/docs/toolchest-hosted-cloud/instance-types.md: --------------------------------------------------------------------------------
1 | # Instance Types
2 | 
3 | | Instance type | vCPUs | Memory (GB) | GPU type |
4 | | :------------ | :---- | :---------- | :---------------------------------------- |
5 | | compute-2 | 2 | 4 | |
6 | | compute-4 | 4 | 8 | |
7 | | compute-8 | 8 | 16 | |
8 | | compute-16 | 16 | 32 | |
9 | | compute-32 | 32 | 64 | |
10 | | compute-48 | 48 | 96 | |
11 | | compute-64 | 64 | 128 | |
12 | | compute-96 | 96 | 192 | |
13 | | general-2 | 2 | 8 | |
14 | | general-4 | 4 | 16 | |
15 | | general-8 | 8 | 32 | |
16 | | general-16 | 16 | 64 | |
17 | | general-32 | 32 | 128 | |
18 | | general-48 | 48 | 192 | |
19 | | general-64 | 64 | 256 | |
20 | | general-96 | 96 | 384 | |
21 | | gpu-V100 | 8 | 61 | 1 NVIDIA Tesla V100 with 16 GB of memory |
22 | | memory-16 | 2 | 16 | |
23 | | memory-32 | 4 | 32 | |
24 | | memory-64 | 8 | 64 | |
25 | | memory-128 | 16 | 128 | |
26 | | memory-256 | 32 | 256 | |
27 | | memory-384 | 48 | 384 | |
28 | | memory-512 | 64 | 512 | |
29 | 
-------------------------------------------------------------------------------- /docs/docs/toolchest-hosted-cloud/pricing.md: --------------------------------------------------------------------------------
1 | # Toolchest-hosted pricing and instance types
2 | 
3 | By default, Toolchest jobs run in Toolchest's managed AWS account. The prices below are for resources that you spawn by
4 | running Toolchest jobs. For information on running Toolchest in your own AWS account, see
5 | [Running Toolchest in your AWS account](./running-toolchest-in-your-aws-account.md).
6 | 
7 | Toolchest Hosted Cloud pricing starts with a free allowance and moves to incremental billing, scaling as your usage
8 | grows.
9 | 
10 | Per-minute billing starts when the Toolchest instance begins executing, and stops immediately when a run finishes. You
11 | can say goodbye to paying for idling cloud instances.
12 | 
13 | ## Free tier
14 | 
15 | ### Compute
16 | 
17 | | Service | Free tier | What can you run? |
18 | | :---------- | :------------- | :------------------------------------------------ |
19 | | vCPU | 50 vCPU-hours | A run that lasts 2 hours with 25 vCPUs. |
20 | | RAM | 100 GB-hours | A run that lasts 2 hours with 50 GB of RAM. |
21 | | Disk | 2 TB-hours | A run that lasts 2 hours with 1 TB of disk space. |
22 | | Invocations | 50 invocations | 50 runs |
23 | 
24 | ### Files
25 | 
26 | | Service | Free tier | What can you run? |
27 | | :------------------------------ | :-------- | :--------------------------------------------------------------------------------- |
28 | | Input and output files | 100 GB | A run with 40 GB of transferred input files and 60 GB of transferred output files. |
29 | | High speed reference DB storage | 50 GB/mo | A custom reference database for Kraken 2 that's 50 GB.
|
30 | 
31 | ## Growth pricing
32 | 
33 | ### Compute
34 | 
35 | | Service | Cost | Billing increment |
36 | | :--------- | :------------------- | :------------------------------------ |
37 | | vCPU | $0.084 per vCPU-hour | Per minute, with a one minute minimum |
38 | | RAM | $0.016 per GB-hour | Per minute, with a one minute minimum |
39 | | Disk | $0.009 per TB-hour | Per minute, with a one minute minimum |
40 | | Invocation | $0.10 per invocation | Per run |
41 | 
42 | ### Files
43 | 
44 | | Service | Cost | Billing increment |
45 | | :------------------------------------ | :-------------- | :--------------------------------- |
46 | | Input and output files | $0.10 per GB | Per GB |
47 | | High speed reference database storage | $2.40 per GB-mo | Per month, with at least one month |
48 | 
49 | !!! note "Input and output file pricing includes network data transfer and temporary storage"
50 |     Every input and output file includes free transfer to and from Toolchest infrastructure. The files are cached for one week (7 days) after the run is initialized.
51 | 
52 | ### Example pricing with a Toolchest-hosted bioinformatics tool, Kraken 2
53 | 
54 | A Kraken 2 run with 2 GB of input files, 16 vCPUs, and 128 GB of RAM with 128 GB of disk space runs for 5 minutes (about 0.08 hours). It produces 1 GB of output files, for a total of 3 GB of input and output files. This costs:
55 | 
56 | - 3 GB of input and output files \* $0.10 per GB = $0.30
57 | - 16 vCPUs \* 0.08 hours \* $0.084 per vCPU-hour = $0.10752
58 | - 128 GB of RAM \* 0.08 hours \* $0.016 per RAM GB-hour = $0.16384
59 | - 0.125 TB of disk \* 0.08 hours \* $0.009 per TB-hour = $0.00009
60 | - 1 invocation = $0.10
61 | 
62 | For a total of **$0.67**.
63 | 
64 | ### Example pricing with a custom Python script
65 | 
66 | A custom Python 3 script with 40 GB of input files, 32 vCPUs, and 64 GB of RAM with 256 GB of disk space runs for 30 minutes (0.5 hours). It produces 10 GB of output files, for a total of 50 GB of input and output files. This costs:
67 | 
68 | - 50 GB of input and output files \* $0.10 per GB = $5
69 | - 32 vCPUs \* 0.5 hours \* $0.084 per vCPU-hour = $1.344
70 | - 64 GB of RAM \* 0.5 hours \* $0.016 per RAM GB-hour = $0.512
71 | - 0.25 TB of disk \* 0.5 hours \* $0.009 per TB-hour = $0.001125
72 | - 1 invocation = $0.10
73 | 
74 | For a total of **$6.96**.
75 | 
76 | 
77 | ## Support
78 | 
79 | Every customer gets access to text-based support – including a shared Slack channel, email, and any other async way
80 | of talking to us that you can think of.
81 | 
82 | We also offer synchronous support, plus SLAs for support and infrastructure availability.
83 | 
84 | ## Custom plans
85 | 
86 | If you're a business with unique needs (e.g. high volume, a non-standard business model, or very large files), we can
87 | build a custom plan for you.
-------------------------------------------------------------------------------- /docs/docs/toolchest-hosted-cloud/running-toolchest-in-your-aws-account.md: --------------------------------------------------------------------------------
1 | # Running Toolchest in your AWS account
2 | 
3 | If you just want to enable Toolchest to pull from your S3 buckets, check out
4 | [Using AWS with Toolchest](../feature-reference/using-aws-with-toolchest.md).
5 | 
6 | You can also set up Toolchest to run instances in your own AWS account. We're gating this feature for now, so let us
7 | know if you'd like access.
-------------------------------------------------------------------------------- /pyproject.toml: --------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "toolchest-client"
3 | version = "0.11.14"
4 | description = "Python client for Toolchest"
5 | authors = [
6 |     "Justin Herr ",
7 |     "Noah Lebovic ",
58 | [build-system]
59 | requires = ["poetry-core>=1.0.0"]
60 | build-backend = "poetry.core.masonry.api"
-------------------------------------------------------------------------------- /pytest.ini: --------------------------------------------------------------------------------
1 | [pytest]
2 | markers =
3 |     integration: mark a test as an integration test.
4 |     integration_full: mark a test as a full-suite integration test (only run on pre-deploy).
5 | junit_family=xunit1
6 | 
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/tests/__init__.py
-------------------------------------------------------------------------------- /tests/conftest.py: --------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
4 | def pytest_configure(config):
5 |     """
6 |     Allows plugins and conftest files to perform initial configuration.
7 |     This hook is called for every plugin and initial conftest
8 |     file after command line options have been parsed.
9 |     """
10 |     if os.environ.get("DEPLOY_ENVIRONMENT") == "staging":
11 |         os.environ["TOOLCHEST_API_URL"] = os.environ["TOOLCHEST_STAGING_URL"]
12 | 
-------------------------------------------------------------------------------- /tests/test_async.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import time
4 | 
5 | from tests.util import hash
6 | import toolchest_client as toolchest
7 | 
8 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
9 | if toolchest_api_key:
10 |     toolchest.set_key(toolchest_api_key)
11 | 
12 | 
13 | @pytest.mark.integration
14 | def test_async_execution():
15 |     """
16 |     Tests Kraken 2 running async using a small reference database
17 |     """
18 | 
19 |     test_dir = "temp_test_async_execution"
20 |     os.makedirs(f"./{test_dir}", exist_ok=True)
21 |     output_dir_path = f"./{test_dir}"
22 |     output_file_path = f"{output_dir_path}/kraken2_output.txt"
23 | 
24 |     custom_db = "s3://toolchest-fsx-databases/kraken2/k2_viral_20210517/"
25 |     toolchest_run = toolchest.kraken2(
26 |         read_one="s3://toolchest-integration-tests/synthetic_bacteroides_reads.fasta",
27 |         remote_database_path=custom_db,
28 |         output_path=output_dir_path,
29 |         is_async=True,
30 |     )
31 | 
32 |     run_status = ''
33 |     while run_status != toolchest.Status.READY_TO_TRANSFER_TO_CLIENT:
34 |         time.sleep(5)
35 | 
36 |         run_status = toolchest.get_status(run_id=toolchest_run.run_id)
37 |         if run_status == toolchest.Status.FAILED:
38 |             pytest.fail("Toolchest run failed.")  # fail fast instead of polling forever
39 | 
40 |     toolchest.download(
41 |         output_path=output_dir_path,
42 |         run_id=toolchest_run.run_id,
43 |     )
44 | 
45 |     assert hash.unordered(output_file_path) == 1003212151
46 | 
-------------------------------------------------------------------------------- /tests/test_blastn.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import hash
5 | import toolchest_client as toolchest
6 | 
7 | 
8 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
9 | if toolchest_api_key:
10 |     toolchest.set_key(toolchest_api_key)
11 | 
12 | 
13 | @pytest.mark.integration
14 | def test_blastn_nt():
15 |     """
16 |     Tests BLASTN against the default nt (v1) DB
17 |     """
18 |     test_dir = "temp_test_blastn_nt"
19 |     os.makedirs(f"./{test_dir}", exist_ok=True)
20 |     output_dir_path = f"./{test_dir}"
21 |     output_file_path = f"{output_dir_path}/blastn_results.out"
22 | 
23 |     toolchest.blastn(
24 |         inputs="s3://toolchest-integration-tests/small_synthetic_bacteroides_reads.fasta",
25 |         output_path=output_dir_path,
26 |         tool_args="-mt_mode 1"
27 |     )
28 | 
29 |     assert hash.unordered(output_file_path) == 1290536116
30 | 
31 | 
32 | @pytest.mark.integration
33 | def test_blastn_nt_task_blastn():
34 |     """
35 |     Tests BLASTN against the default nt (v1) DB, using `-task blastn`
36 |     """
37 |     test_dir = "temp_test_blastn_nt_task_blastn"  # separate dir to avoid clashing with test_blastn_nt
38 |     os.makedirs(f"./{test_dir}", exist_ok=True)
39 |     output_dir_path = f"./{test_dir}"
40 |     output_file_path = f"{output_dir_path}/blastn_results.out"
41 | 
42 |     toolchest.blastn(
43 |         inputs="s3://toolchest-integration-tests/small_synthetic_bacteroides_reads.fasta",
44 |         output_path=output_dir_path,
45 |         tool_args="-mt_mode 1 -task blastn"
46 |     )
47 | 
48 |     assert hash.unordered(output_file_path) == 1657058660
49 | 
-------------------------------------------------------------------------------- /tests/test_bowtie2.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import hash, filter_output
5 | import toolchest_client as toolchest
6 | 
7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
8 | if toolchest_api_key:
9 |     toolchest.set_key(toolchest_api_key)
10 | 
11 | 
12 | @pytest.mark.integration
13 | def test_bowtie2():
14 |     """
15 |     Tests bowtie2
16 |     """
17 | 
18 |     test_dir = "temp_test_bowtie2_standard"
19 |     os.makedirs(f"./{test_dir}", exist_ok=True)
20 |     output_dir_path = f"./{test_dir}"
21 |     output_file_path = f"{output_dir_path}/bowtie2_output.sam"
22 |     filtered_output_file_path = f"{output_dir_path}/bowtie2_output.filtered.sam"
23 | 
24 |     toolchest.bowtie2(
25 |         inputs="s3://toolchest-integration-tests/DRR000006.fastq.gz",
26 |         output_path=output_dir_path,
27 |     )
28 | 
29 |     # Filter non-deterministic metadata lines
30 |     filter_output.filter_sam(output_file_path, filtered_output_file_path)
31 |     assert hash.unordered(filtered_output_file_path) == 1444969892
32 | 
-------------------------------------------------------------------------------- /tests/test_cellranger.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import shutil
4 | 
5 | from tests.util import s3, hash
6 | import toolchest_client as toolchest
7 | 
8 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
9 | if toolchest_api_key:
10 |     toolchest.set_key(toolchest_api_key)
11 | 
12 | 
13 | @pytest.mark.integration
14 | def test_cellranger_count_s3_inputs():
15 |     test_dir = "temp_test_cellranger_count_s3_inputs"
16 |     output_dir_path = f"./{test_dir}/output/"
17 | 
18 |     # Test using an S3 prefix containing 3 FASTQs
19 |     output = toolchest.cellranger_count(
20 |         inputs="s3://toolchest-integration-tests/cellranger/count/pbmc_1k_v3_fastqs_trimmed",
21 |         database_name="GRCh38",
22 |         output_path=output_dir_path,
23 |         skip_decompression=True,
24 |     )
25 |     verify_cellranger_count_outputs(output.output_file_paths, output_dir_path)
26 | 
27 | 
28 | @pytest.mark.integration
29 | def test_cellranger_count_local_inputs():
= "temp_test_cellranger_count_local_inputs" 31 | input_dir_path = f"./{test_dir}/inputs/" 32 | output_dir_path = f"./{test_dir}/output/" 33 | os.makedirs(input_dir_path, exist_ok=True) 34 | 35 | # Test from a directory of local inputs 36 | input_file_names = [ 37 | "pbmc_1k_v3_trimmed_S1_L001_I1_001.fastq", 38 | "pbmc_1k_v3_trimmed_S1_L001_R1_001.fastq", 39 | "pbmc_1k_v3_trimmed_S1_L001_R2_001.fastq", 40 | ] 41 | for input_file_name in input_file_names: 42 | s3.download_integration_test_input( 43 | s3_file_key=f"cellranger/count/pbmc_1k_v3_fastqs_trimmed/{input_file_name}", 44 | output_file_path=f"{input_dir_path}/{input_file_name}", 45 | ) 46 | output = toolchest.cellranger_count( 47 | inputs=input_dir_path, 48 | database_name="GRCh38", 49 | output_path=output_dir_path, 50 | skip_decompression=True, 51 | ) 52 | verify_cellranger_count_outputs(output.output_file_paths, output_dir_path) 53 | 54 | 55 | def verify_cellranger_count_outputs(archive_path, output_dir_path): 56 | # Expected properties of outputs 57 | MIN_EXPECTED_ARCHIVE_SIZE = 34000000 58 | MAX_EXPECTED_ARCHIVE_SIZE = 38000000 59 | EXPECTED_SUMMARY_SIZE = 2744825 60 | EXPECTED_RAW_MATRIX_SIZE = 868393 61 | EXPECTED_RAW_MATRIX_HASH = "d00cca1d2b4344b03946eeaeedc17ed5" 62 | EXPECTED_FILTERED_MATRIX_SIZE = 503956 63 | 64 | # Verify properties of packed archive 65 | archive_size = os.path.getsize(archive_path) 66 | assert MIN_EXPECTED_ARCHIVE_SIZE <= archive_size <= MAX_EXPECTED_ARCHIVE_SIZE 67 | 68 | shutil.unpack_archive( 69 | filename=archive_path, 70 | extract_dir=output_dir_path, 71 | format="gztar", 72 | ) 73 | 74 | # Verify properties of unpacked files 75 | summary_path = f"{output_dir_path}outs/web_summary.html" 76 | raw_matrix_path = f"{output_dir_path}outs/raw_feature_bc_matrix.h5" 77 | filtered_matrix_path = f"{output_dir_path}outs/filtered_feature_bc_matrix.h5" 78 | assert os.path.getsize(summary_path) == EXPECTED_SUMMARY_SIZE 79 | assert os.path.getsize(raw_matrix_path) == EXPECTED_RAW_MATRIX_SIZE 80 | assert os.path.getsize(filtered_matrix_path) == EXPECTED_FILTERED_MATRIX_SIZE 81 | assert hash.binary_hash(raw_matrix_path) == EXPECTED_RAW_MATRIX_HASH 82 | -------------------------------------------------------------------------------- /tests/test_centrifuge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | def test_centrifuge_many_types(): 14 | """ 15 | Tests Centrifuge with one pair of paired-end inputs and two single-end inputs. 
16 | """ 17 | test_dir = "temp_test_centrifuge/many_types" 18 | os.makedirs(f"./{test_dir}", exist_ok=True) 19 | output_dir_path = f"./{test_dir}" 20 | output_file_path = f"{output_dir_path}/centrifuge_output.txt" 21 | output_report_path = f"{output_dir_path}/centrifuge_report.tsv" 22 | 23 | toolchest.centrifuge( 24 | read_one="s3://toolchest-integration-tests/megahit/r3_1.fa", 25 | read_two="s3://toolchest-integration-tests/megahit/r3_2.fa", 26 | unpaired="s3://toolchest-integration-tests/megahit/r4.fa", 27 | tool_args="-f", 28 | output_path=output_dir_path, 29 | ) 30 | 31 | assert hash.unordered(output_file_path) == 1779279198 32 | assert hash.unordered(output_report_path) == 1100843098 33 | 34 | 35 | @pytest.mark.integration 36 | def test_centrifuge_multiple_pairs(): 37 | """ 38 | Tests Centrifuge with two pairs of paired-end inputs. 39 | """ 40 | test_dir = "temp_test_centrifuge/multiple_pairs" 41 | os.makedirs(f"./{test_dir}", exist_ok=True) 42 | output_dir_path = f"./{test_dir}" 43 | output_file_path = f"{output_dir_path}/centrifuge_output.txt" 44 | output_report_path = f"{output_dir_path}/centrifuge_report.tsv" 45 | 46 | toolchest.centrifuge( 47 | read_one=[ 48 | "s3://toolchest-integration-tests/sample_r1.fastq.gz", 49 | "s3://toolchest-integration-tests/r1.fastq.gz", 50 | ], 51 | read_two=[ 52 | "s3://toolchest-integration-tests/sample_r2.fastq.gz", 53 | "s3://toolchest-integration-tests/r2.fastq.gz", 54 | ], 55 | output_path=output_dir_path, 56 | volume_size=32, 57 | ) 58 | 59 | assert hash.unordered(output_report_path) == 1895979303 60 | assert hash.unordered(output_file_path) == 1059786093 61 | -------------------------------------------------------------------------------- /tests/test_chaining.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | SHI7_SINGLE_END_HASH = 1570879637 12 | SHOGUN_CHAINED_HASH = 1708070294 13 | 14 | 15 | @pytest.mark.integration 16 | @pytest.mark.skip(reason="Load reduction for integration tests") 17 | def test_shi7_shogun_chaining(): 18 | """ 19 | Tests S3-based chaining with shi7 and shogun. Passes the S3 URI of the 20 | shi7 output to shogun as input, skipping the (intermediate) shi7 output download. 21 | Downloads the (final) shogun output to hash for testing. 22 | 23 | To enforce shi7 determinism, a single R1 input is used. 24 | 25 | Note: This test also tests the Output object generated by the shi7() tool call, 26 | and chaining the shi7 output files depends on how the Output is structured. 27 | If the Output class is modified, this test should be modified as well. 28 | """ 29 | 30 | test_dir = "temp_test_shi7_shogun_chaining" 31 | os.makedirs(f"./{test_dir}", exist_ok=True) 32 | output_dir_path = f"./{test_dir}" 33 | output_file_path_shogun = f"{output_dir_path}/alignment.bowtie2.sam" 34 | 35 | output_shi7 = toolchest.shi7( 36 | tool_args="-SE", 37 | inputs="s3://toolchest-integration-tests/sample_r1.fastq.gz", 38 | ) 39 | 40 | # Note: since output_path was omitted from the shi7 function call, 41 | # local download is skipped, and the local output_file_paths of output_shi7 42 | # should be None. 
43 | assert output_shi7.output_file_paths is None 44 | 45 | output_shogun = toolchest.shogun_align( 46 | inputs=output_shi7.s3_uri, 47 | output_path=output_dir_path, 48 | ) 49 | 50 | assert hash.unordered(output_file_path_shogun) == SHOGUN_CHAINED_HASH 51 | assert hash.unordered(output_shogun.output_file_paths) == SHOGUN_CHAINED_HASH 52 | -------------------------------------------------------------------------------- /tests/test_clustalo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 14 | def test_clustalo_standard(provider): 15 | """ 16 | Tests Clustal Omega 17 | """ 18 | test_dir = "temp_test_clustalo_standard" 19 | os.makedirs(f"./{test_dir}", exist_ok=True) 20 | output_dir_path = f"./{test_dir}" 21 | output_file_name = "sample_output.fasta" 22 | output_file_path = f"{output_dir_path}/{output_file_name}" 23 | 24 | toolchest.clustalo( 25 | inputs="s3://toolchest-integration-tests/clustalo_input.fasta", 26 | output_path=output_dir_path, 27 | output_primary_name=output_file_name, 28 | provider=provider, 29 | ) 30 | 31 | assert hash.unordered(output_file_path) == 1217555147 32 | -------------------------------------------------------------------------------- /tests/test_diamond.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash, s3 5 | import toolchest_client as toolchest 6 | from toolchest_client.api.instance_type import InstanceType 7 | 8 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 9 | if toolchest_api_key: 10 | toolchest.set_key(toolchest_api_key) 11 | 12 | 13 | @pytest.mark.integration 14 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 15 | def test_diamond_blastp_standard(provider): 16 | """ 17 | Tests Diamond blastp mode 18 | """ 19 | test_dir = "temp_test_diamond_blastp_standard" 20 | os.makedirs(f"./{test_dir}", exist_ok=True) 21 | output_dir_path = f"./{test_dir}" 22 | output_file_name = "sample_output.tsv" 23 | output_file_path = f"{output_dir_path}/{output_file_name}" 24 | 25 | toolchest.diamond_blastp( 26 | inputs="s3://toolchest-integration-tests/diamond_blastp_input.fa", 27 | output_path=output_dir_path, 28 | output_primary_name=output_file_name, 29 | provider=provider, 30 | ) 31 | 32 | assert hash.unordered(output_file_path) == 952562472 33 | 34 | 35 | @pytest.mark.integration 36 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 37 | def test_diamond_blastp_remote_database(provider): 38 | """ 39 | Tests DIAMOND BLASTP with a remote database, including a primary name 40 | """ 41 | test_dir = "test_diamond_blastp_remote_database" 42 | os.makedirs(f"./{test_dir}", exist_ok=True) 43 | output_dir_path = f"./{test_dir}" 44 | output_file_name = "sample_output.tsv" 45 | output_file_path = f"{output_dir_path}/{output_file_name}" 46 | 47 | toolchest.diamond_blastp( 48 | inputs="s3://toolchest-integration-tests/short_diamond_blastp_input.fa", 49 | remote_database_path="s3://toolchest-fsx-databases/tests/", 50 | remote_database_primary_name="custom_diamond_blastp_db", 51 | output_path=output_dir_path, 52 | output_primary_name=output_file_name, 53 | provider=provider, 54 | ) 55 | 56 | assert 
hash.unordered(output_file_path) == 563371739
57 | 
58 | 
59 | @pytest.mark.integration
60 | @pytest.mark.parametrize("provider", ["aws", "tce"])
61 | def test_diamond_blastx_standard(provider):
62 |     """
63 |     Tests Diamond blastx mode
64 |     """
65 |     test_dir = "temp_test_diamond_blastx_standard"
66 |     os.makedirs(f"./{test_dir}", exist_ok=True)
67 |     output_dir_path = f"./{test_dir}"
68 |     output_file_name = "sample_output.tsv"
69 |     output_file_path = f"{output_dir_path}/{output_file_name}"
70 | 
71 |     toolchest.diamond_blastx(
72 |         inputs="s3://toolchest-integration-tests/sample_r1_shortened.fastq",
73 |         output_path=output_dir_path,
74 |         output_primary_name=output_file_name,
75 |         provider=provider,
76 |         instance_type=InstanceType.COMPUTE_48
77 |     )
78 | 
79 |     assert hash.unordered(output_file_path) == 883070112
80 | 
81 | 
82 | @pytest.mark.integration
83 | def test_diamond_blastx_distributed():
84 |     """
85 |     Tests DIAMOND BLASTX distributed mode
86 |     """
87 |     test_dir = "./temp_test_diamond_blastx_distributed"
88 |     os.makedirs(f"{test_dir}", exist_ok=True)
89 |     input_file_path = f"{test_dir}/combined_seqs_unfiltered.fna"
90 |     output_dir_path = f"./{test_dir}"
91 |     output_file_name = "sample_output.tsv"
92 |     output_file_path = f"{output_dir_path}/{output_file_name}"
93 | 
94 |     s3.download_integration_test_input(
95 |         s3_file_key="combined_seqs_unfiltered.fna",
96 |         output_file_path=input_file_path,
97 |         is_private=True,
98 |     )
99 | 
100 |     print(input_file_path)  # log the staged input path for debugging
101 | 
102 |     toolchest.diamond_blastx(
103 |         inputs=input_file_path,
104 |         output_path=output_dir_path,
105 |         output_primary_name=output_file_name,
106 |         distributed=True,
107 |     )
108 | 
109 |     assert 1390254000 < os.path.getsize(output_file_path) <= 1390256000
110 | 
-------------------------------------------------------------------------------- /tests/test_download.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import hash
5 | import toolchest_client as toolchest
6 | 
7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
8 | if toolchest_api_key:
9 |     toolchest.set_key(toolchest_api_key)
10 | 
11 | KRAKEN2_SINGLE_END_HASH = 886254946
12 | 
13 | 
14 | @pytest.mark.integration
15 | def test_kraken2_output_manual_download():
16 |     """
17 |     Tests Kraken 2 against the standard (v1) database, with
18 |     output manually downloaded after job completion
19 |     """
20 |     test_dir = "temp_test_kraken2_output_manual_download"
21 |     input_file_s3_uri = "s3://toolchest-integration-tests/synthetic_bacteroides_reads.fasta"
22 |     manual_output_dir_path = f"./{test_dir}/manual"
23 |     manual_output_file_path = f"{manual_output_dir_path}/kraken2_output.txt"
24 |     toolchest_s3_dir_path = f"./{test_dir}/toolchest/s3"
25 |     toolchest_s3_file_path = f"{toolchest_s3_dir_path}/kraken2_output.txt"
26 |     toolchest_pipeline_dir_path = f"./{test_dir}/toolchest/id"
27 |     toolchest_pipeline_file_path = f"{toolchest_pipeline_dir_path}/kraken2_output.txt"
28 | 
29 |     # Run job without downloading
30 |     output = toolchest.kraken2(
31 |         inputs=input_file_s3_uri,
32 |     )
33 | 
34 |     # Manually invoke download from output
35 |     path_from_manual_download = output.download(manual_output_dir_path)
36 | 
37 |     # If multiple files are returned, path_from_manual_download will be a list,
38 |     # so we simply check if kraken2_output.txt is contained in it
39 |     if isinstance(path_from_manual_download, list):
40 |         path_from_manual_download = [os.path.abspath(path) for path in path_from_manual_download]
41 |     else:
42 |
path_from_manual_download = [path_from_manual_download] 43 | assert os.path.abspath(manual_output_file_path) in path_from_manual_download 44 | assert hash.unordered(manual_output_file_path) == KRAKEN2_SINGLE_END_HASH 45 | 46 | # Test again with toolchest.download(), using S3 URI 47 | path_from_toolchest_download = toolchest.download(toolchest_s3_dir_path, s3_uri=output.s3_uri) 48 | if isinstance(path_from_toolchest_download, list): 49 | path_from_toolchest_download = [os.path.abspath(path) for path in path_from_toolchest_download] 50 | else: 51 | path_from_toolchest_download = [path_from_toolchest_download] 52 | assert os.path.abspath(toolchest_s3_file_path) in path_from_toolchest_download 53 | assert hash.unordered(toolchest_s3_file_path) == KRAKEN2_SINGLE_END_HASH 54 | 55 | # Test again with toolchest.download(), using pipeline segment instance ID 56 | PIPELINE_INDEX_IN_S3_URI = 3 57 | pipeline_segment_instance_id = output.s3_uri.split("/")[PIPELINE_INDEX_IN_S3_URI] 58 | path_from_toolchest_download = toolchest.download( 59 | toolchest_pipeline_dir_path, 60 | pipeline_segment_instance_id=pipeline_segment_instance_id, 61 | ) 62 | if isinstance(path_from_toolchest_download, list): 63 | path_from_toolchest_download = [os.path.abspath(path) for path in path_from_toolchest_download] 64 | else: 65 | path_from_toolchest_download = [path_from_toolchest_download] 66 | assert os.path.abspath(toolchest_pipeline_file_path) in path_from_toolchest_download 67 | assert hash.unordered(toolchest_pipeline_file_path) == KRAKEN2_SINGLE_END_HASH 68 | -------------------------------------------------------------------------------- /tests/test_fastqc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash, filter_output 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | def test_fastqc(): 14 | """ 15 | Tests FastQC with a fastq file 16 | 17 | """ 18 | 19 | test_dir = "temp_test_fastqc" 20 | os.makedirs(f"./{test_dir}", exist_ok=True) 21 | output_dir_path = f"./{test_dir}" 22 | filtered_html_output_path = f"{output_dir_path}/sample_r1_shortened_fastqc_filtered.html" 23 | toolchest.fastqc( 24 | inputs="s3://toolchest-integration-tests/sample_r1_shortened.fastq", 25 | output_path=output_dir_path 26 | ) 27 | 28 | filter_output.filter_regex( 29 | os.path.join(test_dir, "sample_r1_shortened_fastqc.html"), 30 | filtered_html_output_path, 31 | search_regex='id="header_filename">([\\w\\s]+)= EXPECTED_MIN_OUTPUT_SIZE_MANY_TYPES 45 | 46 | 47 | @pytest.mark.integration 48 | def test_megahit_multiple_pairs(): 49 | """ 50 | Tests Megahit with two pairs of paired-end inputs. 51 | 52 | Note: Multithreaded megahit is not deterministic, so 53 | we check the size of the file instead. 54 | See https://github.com/voutcn/megahit/issues/48. 
55 |     """
56 |     test_dir = "temp_test_megahit_two_pairs"
57 |     os.makedirs(f"./{test_dir}", exist_ok=True)
58 |     output_dir_path = f"./{test_dir}"
59 |     output_file_path = f"{output_dir_path}/final.contigs.fa"
60 | 
61 |     toolchest.megahit(
62 |         read_one=[
63 |             "s3://toolchest-integration-tests/megahit/r3_1.fa",
64 |             "s3://toolchest-integration-tests/r1.fastq.gz",
65 |         ],
66 |         read_two=[
67 |             "s3://toolchest-integration-tests/megahit/r3_2.fa",
68 |             "s3://toolchest-integration-tests/r2.fastq.gz",
69 |         ],
70 |         tool_args="--presets meta-large",
71 |         output_path=output_dir_path,
72 |     )
73 | 
74 |     assert os.path.getsize(output_file_path) >= EXPECTED_MIN_OUTPUT_SIZE_TWO_PAIRS
75 | 
-------------------------------------------------------------------------------- /tests/test_metaphlan.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import hash, filter_output
5 | import toolchest_client as toolchest
6 | 
7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
8 | if toolchest_api_key:
9 |     toolchest.set_key(toolchest_api_key)
10 | 
11 | 
12 | @pytest.mark.integration
13 | def test_metaphlan():
14 |     """
15 |     Tests MetaPhlAn with a FASTA file
16 | 
17 |     """
18 | 
19 |     test_dir = "temp_test_metaphlan"
20 |     os.makedirs(f"./{test_dir}", exist_ok=True)
21 |     output_dir_path = f"./{test_dir}"
22 |     toolchest.metaphlan(
23 |         inputs="s3://toolchest-integration-tests/metaphlan/SRS014464-Anterior_nares.fasta.gz",
24 |         output_path=output_dir_path
25 |     )
26 | 
27 |     # MetaPhlAn includes the command in the output, but it contains a non-deterministic UUID, so that line is removed.
28 |     filtered_output_path = os.path.join(output_dir_path, "out_filtered.txt")
29 |     filter_output.filter_regex(
30 |         unfiltered_path=f"{output_dir_path}/out.txt",
31 |         filtered_path=filtered_output_path,
32 |         search_regex="#/usr/local/bin/metaphlan.*\n",
33 |         replacement_str="",
34 |     )
35 |     assert hash.unordered(filtered_output_path) == 1401032462
36 | 
37 |     bowtie2outfile_path = os.path.join(output_dir_path, "SRS014464-Anterior_nares.fasta.gz.bowtie2out.txt")
38 |     assert hash.unordered(bowtie2outfile_path) == 1308716263
39 | 
-------------------------------------------------------------------------------- /tests/test_output.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | import toolchest_client as toolchest
5 | 
6 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
7 | if toolchest_api_key:
8 |     toolchest.set_key(toolchest_api_key)
9 | 
10 | 
11 | @pytest.mark.integration
12 | def test_output_object():
13 |     """
14 |     Verifies that the output object has all the desired parameters.
15 |     """
16 |     test_dir = "temp_test_output_object"
17 |     input_file_s3_uri = "s3://toolchest-integration-tests/small_synthetic_bacteroides_reads.fasta"
18 |     output_dir_path = f"./{test_dir}"
19 |     output_file_path = f"{output_dir_path}/test_output.txt"
20 |     os.makedirs(output_dir_path, exist_ok=True)
21 | 
22 |     toolchest_output = toolchest.test(
23 |         inputs=input_file_s3_uri,
24 |         output_path=output_dir_path
25 |     )
26 | 
27 |     print(toolchest_output)
28 |     assert toolchest_output.tool_name == "test"
29 |     assert toolchest_output.tool_version == "0.1.0"
30 |     assert toolchest_output.database_name is None
31 |     assert toolchest_output.database_version is None
32 |     assert toolchest_output.run_id is not None
33 |     assert toolchest_output.output_path == os.path.abspath(output_dir_path)
34 |     assert toolchest_output.output_file_paths == os.path.abspath(output_file_path)
35 | 
-------------------------------------------------------------------------------- /tests/test_public_uri.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | import toolchest_client as toolchest
5 | 
6 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
7 | if toolchest_api_key:
8 |     toolchest.set_key(toolchest_api_key)
9 | 
10 | 
11 | @pytest.mark.integration
12 | def test_s3_http_input():
13 |     """
14 |     Tests the `test` function with an HTTP input
15 |     """
16 |     test_dir = "temp_test_http"
17 |     os.makedirs(f"./{test_dir}", exist_ok=True)
18 |     input_file_path = "https://toolchest-public-examples-no-encryption.s3.amazonaws.com/example.fastq"
19 |     output_dir_path = f"./{test_dir}"
20 |     output_file_path = f"{output_dir_path}/test_output.txt"
21 | 
22 |     toolchest.test(
23 |         inputs=input_file_path,
24 |         output_path=output_dir_path
25 |     )
26 | 
27 |     with open(output_file_path, "r") as f:
28 |         assert f.read().strip() == "success"
29 | 
30 | 
31 | @pytest.mark.integration
32 | def test_ftp_input():
33 |     """
34 |     Tests the `transfer` function with an FTP input
35 |     """
36 |     test_dir = "temp_test_ftp"
37 |     os.makedirs(f"./{test_dir}", exist_ok=True)
38 |     output_dir_path = f"./{test_dir}"
39 |     output_file_path = f"{output_dir_path}/SRR9990000.fastq.gz"
40 | 
41 |     toolchest.transfer(
42 |         inputs="ftp://ftp.sra.ebi.ac.uk/vol1/fastq//SRR999/000/SRR9990000/SRR9990000.fastq.gz",
43 |         output_path=output_dir_path
44 |     )
45 | 
46 |     assert os.path.getsize(output_file_path) == 11632985
47 | 
-------------------------------------------------------------------------------- /tests/test_python3.py: --------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import pathlib
4 | import sys
5 | 
6 | import docker
7 | import pytest
8 | 
9 | import toolchest_client as toolchest
10 | from toolchest_client.api.instance_type import InstanceType
11 | 
12 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
13 | if toolchest_api_key:
14 |     toolchest.set_key(toolchest_api_key)
15 | 
16 | THIS_FILE_PATH = pathlib.Path(__file__).parent.resolve()
17 | 
18 | 
19 | @pytest.mark.integration
20 | def test_python3():
21 |     """
22 |     Tests the python3 tool's inputs, output, and tool_args
23 | 
24 |     NOTE: streaming is disabled for this test
25 |     """
26 | 
27 |     test_dir = "./temp_test_python3"
28 |     os.makedirs(f"{test_dir}", exist_ok=True)
29 |     toolchest.python3(
30 |         tool_args="./input/example.fastq",
31 |         script="s3://toolchest-integration-tests/write_test.py",
32 |         inputs="s3://toolchest-integration-tests/example.fastq",
33 |         output_path=f"{test_dir}/",
34 |
instance_type=InstanceType.COMPUTE_2, 35 | streaming_enabled=False, 36 | ) 37 | 38 | output_file = open(f"{test_dir}/output.txt", "r") 39 | assert output_file.readline() == "Success" 40 | output_file.close() 41 | 42 | 43 | @pytest.mark.integration 44 | def test_python3_with_docker(): 45 | """ 46 | Tests adding dependencies to python3 via a custom docker image 47 | 48 | Specifically tests matrix multiplication via numpy 49 | """ 50 | client = docker.from_env() 51 | client.images.build( 52 | path=f"{THIS_FILE_PATH}/util/", 53 | dockerfile="numpy_test.Dockerfile", 54 | tag="python3-numpy:3.9", 55 | platform="linux/amd64" 56 | ) 57 | 58 | test_dir = "./temp_test_python3/with_docker" 59 | os.makedirs(f"{test_dir}", exist_ok=True) 60 | toolchest.python3( 61 | script="s3://toolchest-integration-tests/numpy_test.py", 62 | output_path=f"{test_dir}/", 63 | custom_docker_image_id="python3-numpy:3.9", 64 | instance_type="compute-2", 65 | ) 66 | 67 | output_file = open(f"{test_dir}/output.txt", "r") 68 | assert output_file.readline() == "[[ 58 64]\n" 69 | assert output_file.readline() == " [139 154]]" 70 | output_file.close() 71 | 72 | 73 | @pytest.mark.integration 74 | def test_python3_with_public_docker(): 75 | """ 76 | Tests using a public docker image with the write test script 77 | """ 78 | 79 | test_dir = "./temp_test_python3/with_public_docker" 80 | os.makedirs(f"{test_dir}", exist_ok=True) 81 | toolchest.python3( 82 | script="s3://toolchest-integration-tests/write_path.py", 83 | output_path=f"{test_dir}/", 84 | custom_docker_image_id="python:alpine3.16", 85 | ) 86 | 87 | output_file = open(f"{test_dir}/output.txt", "r") 88 | assert output_file.readline() == "['/data/home/ec2-user/input', '/usr/local/lib/python311.zip', " \ 89 | "'/usr/local/lib/python3.11', '/usr/local/lib/python3.11/lib-dynload', " \ 90 | "'/usr/local/lib/python3.11/site-packages']" 91 | output_file.close() 92 | 93 | 94 | @pytest.mark.integration 95 | def test_python3_streaming(): 96 | """ 97 | Tests python3 with output streaming enabled 98 | """ 99 | test_dir = "./temp_test_python3_streaming" 100 | os.makedirs(f"{test_dir}", exist_ok=True) 101 | test_script_path = "tests/util/streaming_script.py" 102 | 103 | # Run with captured stdout 104 | captured_stdout = io.StringIO() 105 | sys.stdout = captured_stdout 106 | toolchest.python3( 107 | script=test_script_path, 108 | output_path=f"{test_dir}/", 109 | instance_type=InstanceType.COMPUTE_2, 110 | streaming_enabled=True, 111 | ) 112 | # Reset stdout capture 113 | sys.stdout = sys.__stdout__ 114 | 115 | # Verify toolchest.python3() output files 116 | with open(f"{test_dir}/output.txt", "r") as output_file: 117 | assert output_file.readline() == "Success" 118 | 119 | # Check printed stdout 120 | stdout_lines = captured_stdout.getvalue().splitlines() 121 | stream_start = stdout_lines.index("==> Begin streamed lines <==") 122 | stream_end = stdout_lines.index("==> End streamed lines <==") 123 | streamed_lines = stdout_lines[stream_start:stream_end + 1] 124 | assert streamed_lines == ["==> Begin streamed lines <==", "0", "1", "2", "3", "4", "==> End streamed lines <=="] 125 | -------------------------------------------------------------------------------- /tests/test_rapsearch2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | 
toolchest.set_key(toolchest_api_key)
10 | 
11 | 
12 | @pytest.mark.integration
13 | def test_rapsearch2():
14 |     """
15 |     Tests rapsearch2 on SeqScreen DB
16 |     """
17 | 
18 |     test_dir = "./temp_test_rapsearch2"
19 |     os.makedirs(f"{test_dir}", exist_ok=True)
20 |     output_file_path_aln = f"./{test_dir}/rapsearch2.aln"
21 |     output_file_path_m8 = f"./{test_dir}/rapsearch2.m8"
22 | 
23 |     toolchest.rapsearch2(
24 |         tool_args="-e 1e-9",
25 |         inputs="s3://toolchest-integration-tests/example.fastq",
26 |         output_path=f"{test_dir}/",
27 |         output_primary_name="rapsearch2",
28 |     )
29 | 
30 |     # m8 output is nondeterministic, so we check file size
31 |     assert 71362000 <= os.path.getsize(output_file_path_m8) <= 71362200
32 | 
33 |     assert 321661100 <= os.path.getsize(output_file_path_aln) <= 321661300
34 |     assert hash.unordered(output_file_path_aln) == 2129168459
35 | 
-------------------------------------------------------------------------------- /tests/test_salmon.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | import toolchest_client as toolchest
5 | 
6 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
7 | if toolchest_api_key:
8 |     toolchest.set_key(toolchest_api_key)
9 | 
10 | 
11 | @pytest.mark.integration
12 | def test_salmon_hg38():
13 |     """
14 |     Tests salmon with a scRNA-Seq FASTQ file
15 | 
16 |     """
17 | 
18 |     test_dir = "temp_test_salmon_hg38"
19 |     os.makedirs(f"./{test_dir}", exist_ok=True)
20 |     output_dir_path = f"./{test_dir}"
21 | 
22 |     toolchest.salmon(
23 |         single_end="s3://toolchest-integration-tests/salmon/SRR2557119_500k.fastq",
24 |         output_path=output_dir_path,
25 |         database_name="salmon_hg38",
26 |         database_version=1,
27 |     )
28 | 
29 |     # Non-deterministic output, so check that the file size is in the expected range
30 |     assert 8143860 <= os.path.getsize(os.path.join(output_dir_path, "quant.sf")) <= 8143880
31 | 
-------------------------------------------------------------------------------- /tests/test_sanity.py: --------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | 
4 | @pytest.mark.integration
5 | def test_sanity():
6 |     assert 1 + 1 == 2
7 | 
-------------------------------------------------------------------------------- /tests/test_shi7.py: --------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | from tests.util import s3, hash
5 | import toolchest_client as toolchest
6 | 
7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY")
8 | if toolchest_api_key:
9 |     toolchest.set_key(toolchest_api_key)
10 | 
11 | # Because shi7 paired-end is non-deterministic, we just make sure it's not equal to the single-end version
12 | SHI7_SINGLE_END_HASH = 1570879637
13 | 
14 | 
15 | @pytest.mark.integration
16 | @pytest.mark.skip(reason="Load reduction for integration tests")
17 | def test_shi7_single_end():
18 |     """
19 |     Tests shi7 with a single R1 input
20 | 
21 |     Note: This test also tests the Output object generated by the shi7() tool call,
22 |     and chaining the shi7 output files depends on how the Output is structured.
23 |     If the Output class is modified, this test should be modified as well.
24 | """ 25 | 26 | test_dir = "./temp_test_shi7_single_end" 27 | os.makedirs(test_dir, exist_ok=True) 28 | input_one_file_path = f"{test_dir}/shi7_input_R1.fastq.gz" 29 | output_file_path = f"{test_dir}/combined_seqs.fna" 30 | 31 | s3.download_integration_test_input( 32 | s3_file_key="sample_r1.fastq.gz", 33 | output_file_path=input_one_file_path, 34 | ) 35 | 36 | output_shi7 = toolchest.shi7( 37 | tool_args="-SE", 38 | inputs=test_dir, 39 | output_path=test_dir, 40 | ) 41 | 42 | # Note: since shi7 produces multiple files, output_shi7.output_path 43 | # should be a list of paths to each unpacked output file. 44 | assert hash.unordered(output_file_path) == SHI7_SINGLE_END_HASH 45 | assert isinstance(output_shi7.output_file_paths, list) 46 | 47 | 48 | @pytest.mark.integration 49 | @pytest.mark.skip(reason="Load reduction for integration tests") 50 | def test_shi7_paired_end(): 51 | """ 52 | Tests shi7 with paired-end inputs 53 | 54 | Unfortunately, shi7 is non-deterministic. This means we can't check a hash. 55 | As a means of having some level of guarantee, we check the output file size instead. 56 | 57 | Because of this, we should not recommend shi7 for use. 58 | """ 59 | 60 | test_dir = "./temp_test_shi7_paired_end" 61 | os.makedirs(test_dir, exist_ok=True) 62 | input_one_file_path = f"{test_dir}/shi7_input_R1.fastq.gz" 63 | input_two_file_path = f"{test_dir}/shi7_input_R2.fastq.gz" 64 | output_file_path = f"{test_dir}/combined_seqs.fna" 65 | 66 | s3.download_integration_test_input( 67 | s3_file_key="sample_r1.fastq.gz", 68 | output_file_path=input_one_file_path, 69 | ) 70 | s3.download_integration_test_input( 71 | s3_file_key="sample_r2.fastq.gz", 72 | output_file_path=input_two_file_path, 73 | ) 74 | 75 | toolchest.shi7( 76 | inputs=test_dir, 77 | output_path=test_dir, 78 | ) 79 | 80 | # Because shi7 paired-end is non-deterministic, we just make sure it's not equal to the single-end version 81 | assert hash.unordered(output_file_path) != SHI7_SINGLE_END_HASH 82 | -------------------------------------------------------------------------------- /tests/test_shogun.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import s3, hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | @pytest.mark.skip(reason="Load reduction for integration tests") 14 | def test_shogun_filter_and_align(): 15 | """ 16 | Tests shogun (filter and align for simplicity) with a single R1 input 17 | """ 18 | 19 | test_dir = "./temp_test_shogun_filter_and_align" 20 | os.makedirs(f"{test_dir}", exist_ok=True) 21 | input_file_path = f"{test_dir}/combined_seqs_unfiltered.fna" 22 | output_file_path_filter = f"{test_dir}/combined_seqs.filtered.fna" 23 | output_file_path_align = f"{test_dir}/alignment.bowtie2.sam" 24 | 25 | s3.download_integration_test_input( 26 | s3_file_key="combined_seqs_unfiltered.fna", 27 | output_file_path=input_file_path, 28 | is_private=True, 29 | ) 30 | 31 | toolchest.shogun_filter( 32 | tool_args="--alignment True", 33 | inputs=input_file_path, 34 | output_path=test_dir, 35 | ) 36 | 37 | assert hash.unordered(output_file_path_filter) == 510167908 38 | 39 | toolchest.shogun_align( 40 | tool_args="", 41 | inputs=output_file_path_filter, 42 | output_path=test_dir, 43 | ) 44 | assert hash.unordered(output_file_path_align) == 1952162202 45 | 
-------------------------------------------------------------------------------- /tests/test_star.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import s3, hash, filter_output 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 14 | def test_star_grch38(provider): 15 | """ 16 | Tests STAR against the grch38 database 17 | """ 18 | test_dir = "temp_test_star_grch38" 19 | os.makedirs(f"./{test_dir}", exist_ok=True) 20 | input_file_path = "./small_star.fastq" 21 | output_dir_path = f"./{test_dir}" 22 | output_file_path = f"{output_dir_path}/Aligned.out.sam" 23 | filtered_output_file_path = f"{output_dir_path}/Aligned.filtered.out.sam" 24 | 25 | s3.download_integration_test_input( 26 | s3_file_key="small_star_500k.fastq", 27 | output_file_path=input_file_path, 28 | is_private=True, 29 | ) 30 | 31 | toolchest.STAR( 32 | read_one=input_file_path, 33 | output_path=output_dir_path, 34 | database_name="GRCh38", 35 | provider=provider 36 | ) 37 | 38 | # Because STAR output contains run ID (non-deterministic), verify that the number of bytes is in range 39 | assert 185952700 <= os.path.getsize(output_file_path) <= 185952900 # expected size 185952796 40 | 41 | # Filter non-deterministic metadata lines 42 | filter_output.filter_sam(output_file_path, filtered_output_file_path) 43 | assert hash.unordered(filtered_output_file_path) == 2099424598 44 | 45 | 46 | @pytest.mark.integration 47 | @pytest.mark.skip(reason="Pysam removed so parallelization is disabled until a new sam file merger is written or found") 48 | def test_star_grch38_parallel(): 49 | """ 50 | Tests STAR against the grch38 database, using parallel mode 51 | """ 52 | test_dir = "temp_test_star_grch38_parallel" 53 | os.makedirs(f"./{test_dir}", exist_ok=True) 54 | input_file_path = "./large_star.fastq" 55 | output_dir_path = f"./{test_dir}" 56 | output_file_path = f"{output_dir_path}/Aligned.out.sam" 57 | 58 | s3.download_integration_test_input( 59 | s3_file_key="large_star_15GB.fastq", 60 | output_file_path=input_file_path, 61 | is_private=True, 62 | ) 63 | 64 | toolchest.STAR( 65 | read_one=input_file_path, 66 | output_path=output_file_path, 67 | database_name="GRCh38", 68 | parallelize=True, 69 | ) 70 | 71 | # Because STAR output contains run ID (non-deterministic), verify that the number of bytes is in range 72 | # TODO: verify new file size with dockerized STAR after re-enabling parallelization 73 | # TODO: add a hash test of output file without @PG and @CO lines 74 | assert 33292990718 <= os.path.getsize(output_file_path) <= 33292994718 75 | 76 | 77 | @pytest.mark.integration 78 | @pytest.mark.parametrize("provider", ["aws", "tce"]) 79 | def test_star_grch38_dangerous_arg(provider): 80 | """ 81 | Tests STAR against the grch38 database, with a dangerous arg (changing functionality) 82 | """ 83 | test_dir = "temp_test_star_grch38" 84 | os.makedirs(f"./{test_dir}", exist_ok=True) 85 | input_file_path = "./small_star.fastq" 86 | output_dir_path = f"./{test_dir}" 87 | output_file_path = f"{output_dir_path}/Aligned.out.bam" 88 | 89 | s3.download_integration_test_input( 90 | s3_file_key="small_star_500k.fastq", 91 | output_file_path=input_file_path, 92 | is_private=True, 93 | ) 94 | 95 | toolchest.STAR( 96 | read_one=input_file_path, 97 
| output_path=output_dir_path, 98 | database_name="GRCh38", 99 | tool_args="--outSAMtype BAM Unsorted", 100 | parallelize=True, # this should be deliberately ignored 101 | provider=provider, 102 | ) 103 | 104 | # Because STAR output contains run ID (non-deterministic) and BAMs are compressed, 105 | # verify that the number of bytes is in range 106 | assert 38236000 <= os.path.getsize(output_file_path) <= 38236100 # expected size 38236044 107 | 108 | # Make sure all non-parallel files exist as well 109 | assert os.path.isfile(f"{output_dir_path}/Log.final.out") 110 | assert os.path.isfile(f"{output_dir_path}/Log.out") 111 | assert os.path.isfile(f"{output_dir_path}/Log.progress.out") 112 | assert os.path.isfile(f"{output_dir_path}/SJ.out.tab") 113 | -------------------------------------------------------------------------------- /tests/test_transfer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | import toolchest_client as toolchest 5 | 6 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 7 | if toolchest_api_key: 8 | toolchest.set_key(toolchest_api_key) 9 | 10 | 11 | @pytest.mark.integration 12 | def test_transfer_http(): 13 | """ 14 | Tests transfer function with an http input 15 | """ 16 | test_dir = "temp_test_transfer_http" 17 | os.makedirs(f"./{test_dir}", exist_ok=True) 18 | output_dir_path = f"./{test_dir}" 19 | output_file_path = f"{output_dir_path}/P48754.fasta" 20 | 21 | toolchest.transfer( 22 | inputs="https://rest.uniprot.org/uniprotkb/P48754.fasta", 23 | output_path=output_dir_path 24 | ) 25 | 26 | with open(output_file_path, "r") as f: 27 | assert f.read().startswith(">sp|P48754|BRCA1_MOUSE") 28 | -------------------------------------------------------------------------------- /tests/test_unicycler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from tests.util import hash 5 | import toolchest_client as toolchest 6 | 7 | toolchest_api_key = os.environ.get("TOOLCHEST_API_KEY") 8 | if toolchest_api_key: 9 | toolchest.set_key(toolchest_api_key) 10 | 11 | 12 | @pytest.mark.integration 13 | def test_unicycler(): 14 | """ 15 | Tests Unicycler 16 | """ 17 | 18 | test_dir = "temp_test_unicycler" 19 | os.makedirs(f"./{test_dir}", exist_ok=True) 20 | output_dir_path = f"./{test_dir}/" 21 | 22 | toolchest.unicycler( 23 | output_path=output_dir_path, 24 | read_one="s3://toolchest-integration-tests/r1.fastq.gz", 25 | read_two="s3://toolchest-integration-tests/r2.fastq.gz", 26 | long_reads="s3://toolchest-integration-tests/long_reads.fasta.gz" 27 | ) 28 | 29 | assert hash.unordered(f"{output_dir_path}assembly.fasta") == 882369120 30 | -------------------------------------------------------------------------------- /tests/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/tests/util/__init__.py -------------------------------------------------------------------------------- /tests/util/filter_output.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def filter_sam(unfiltered_path, filtered_path): 5 | """ 6 | Filters out non-deterministic metadata lines from a SAM output file. 
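Aligners like STAR write run-specific @PG and @CO header lines, which this drops;
all other lines are copied through unchanged. Example (paths are illustrative):

    filter_sam("Aligned.out.sam", "Aligned.filtered.out.sam")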
7 | """ 8 | with open(filtered_path, "w") as outfile: 9 | with open(unfiltered_path, "r") as infile: 10 | outfile.writelines([line for line in infile if not line.startswith("@PG") and not line.startswith("@CO")]) 11 | 12 | 13 | def filter_regex(unfiltered_path, filtered_path, search_regex, replacement_str): 14 | """ 15 | Filters out non-deterministic metadata lines from a SAM output file. 16 | """ 17 | with open(filtered_path, "w") as outfile: 18 | with open(unfiltered_path, "r") as infile: 19 | for line in infile: 20 | outfile.write(re.sub(search_regex, replacement_str, line)) 21 | -------------------------------------------------------------------------------- /tests/util/hash.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import zlib 3 | 4 | 5 | def unordered(file_path): 6 | """ 7 | Generates a hash of an ASCII-encoded file, not impacted by line order. 8 | Not anywhere near cryptographically secure. 9 | """ 10 | file_hash = 1 11 | eighth_mersenne_prime = 2147483647 12 | with open(file_path) as file: 13 | print("Hashing", file_path) 14 | for line in file: 15 | file_hash = (zlib.adler32(line.encode()) * file_hash) % eighth_mersenne_prime 16 | print("Hash is", file_hash) 17 | return file_hash 18 | 19 | 20 | def binary_hash(file_path): 21 | """ 22 | Generates an MD5 hash of a binary file. 23 | """ 24 | with open(file_path, "rb") as file: 25 | print("Hashing", file_path) 26 | file_hash = hashlib.md5(file.read()).hexdigest() 27 | print("Hash is", file_hash) 28 | return file_hash 29 | -------------------------------------------------------------------------------- /tests/util/numpy_test.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | RUN pip install numpy 3 | -------------------------------------------------------------------------------- /tests/util/s3.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | 4 | # Downloads input from S3 to local file path. 5 | # Used for tests that upload local inputs. 
6 | def download_integration_test_input(s3_file_key, output_file_path, is_private=False): 7 | s3_client = boto3.client('s3') 8 | bucket_name = 'toolchest-integration-tests-private' if is_private else 'toolchest-integration-tests' 9 | s3_client.download_file(bucket_name, s3_file_key, output_file_path) 10 | -------------------------------------------------------------------------------- /tests/util/streaming_script.py: -------------------------------------------------------------------------------- 1 | # For use with tests in test_python3.py 2 | import time 3 | print("==> Begin streamed lines <==") 4 | for number in range(5): 5 | print(number) 6 | time.sleep(1) 7 | print("==> End streamed lines <==") 8 | with open("./output/output.txt", "w") as f: 9 | f.write("Success") 10 | -------------------------------------------------------------------------------- /toolchest_client/__init__.py: -------------------------------------------------------------------------------- 1 | import builtins 2 | from dotenv import load_dotenv, find_dotenv 3 | import functools 4 | 5 | from .logging import setup_logging 6 | 7 | # set __version__ module 8 | try: 9 | # importlib.metadata is present in Python 3.8 and later 10 | import importlib.metadata as importlib_metadata 11 | except ImportError: 12 | import importlib_metadata as importlib_metadata 13 | try: 14 | __version__ = importlib_metadata.version(__package__ or __name__) 15 | except importlib_metadata.PackageNotFoundError: 16 | __version__ = None 17 | 18 | # .env load must be before imports that use environment variables 19 | load_dotenv(find_dotenv(".env")) 20 | 21 | # specifying print flushing is necessary to support loading from R 22 | builtins.print = functools.partial(print, flush=True) 23 | 24 | # configure logger 25 | setup_logging() 26 | 27 | from toolchest_client.api.auth import get_key, set_key 28 | from toolchest_client.api.download import download 29 | from toolchest_client.api.exceptions import ToolchestException, DataLimitError, ToolchestJobError, \ 30 | ToolchestDownloadError 31 | from toolchest_client.api.query import Query 32 | from toolchest_client.api.status import Status, get_status 33 | from toolchest_client.api.urls import get_api_url, set_api_url 34 | from .tools.api import add_database, alphafold, blastn, bowtie2, bracken, cellranger_count, centrifuge, clustalo, \ 35 | demucs, diamond_blastp, diamond_blastx, fastqc, humann3, jupyter, kallisto, kraken2, lastal5, lug, megahit, \ 36 | metaphlan, python3, rapsearch, rapsearch2, salmon, shi7, shogun_align, shogun_filter, STAR, test, transfer, \ 37 | unicycler, update_database 38 | -------------------------------------------------------------------------------- /toolchest_client/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/toolchest_client/api/__init__.py -------------------------------------------------------------------------------- /toolchest_client/api/auth.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.auth 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module contains functions for configuring the Toolchest API key. 
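The key lives in the TOOLCHEST_KEY environment variable: set_key() writes it there
(from a raw string or from a file containing only the key), and get_key() reads it
back. A minimal sketch (the key value is illustrative):

    >>> import toolchest_client as toolchest
    >>> toolchest.set_key("YOUR_KEY_HERE")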
6 | 7 | """ 8 | from loguru import logger 9 | import os 10 | import sys 11 | 12 | import requests 13 | from requests.exceptions import HTTPError 14 | 15 | from toolchest_client.api.exceptions import ToolchestKeyError 16 | from toolchest_client.api.urls import get_api_url 17 | 18 | 19 | def get_key(): 20 | """Retrieves the Toolchest API key, if it is set.""" 21 | 22 | try: 23 | key = os.environ["TOOLCHEST_KEY"] 24 | except KeyError as e: 25 | logger.error("Key not found. Please set environment variable TOOLCHEST_KEY to your Toolchest API key.") 26 | logger.error("Function call:") 27 | logger.error(" toolchest_client.set_key(YOUR_KEY_HERE)") 28 | return e 29 | return key 30 | 31 | 32 | def set_key(key): 33 | """Sets the Toolchest auth key (env var TOOLCHEST_KEY) to the given value. 34 | 35 | :param key: key value (str) or path to file containing key. If given a filename, 36 | the file must consist of only the key itself. 37 | 38 | Usage:: 39 | 40 | >>> import toolchest_client as toolchest 41 | >>> toolchest.set_key(YOUR_KEY_HERE) 42 | 43 | """ 44 | 45 | if os.path.isfile(key): 46 | with open(key, "r") as f: 47 | os.environ["TOOLCHEST_KEY"] = f.read().strip() 48 | else: 49 | os.environ["TOOLCHEST_KEY"] = key 50 | 51 | 52 | def validate_key(): 53 | """Validates Toolchest API key, retrieved from get_key().""" 54 | 55 | validation_response = requests.get( 56 | get_api_url(), 57 | headers=get_headers(), 58 | ) 59 | try: 60 | validation_response.raise_for_status() 61 | except HTTPError: 62 | error_message = "Invalid Toolchest auth key. Please check the key value or contact Toolchest." 63 | logger.error(error_message, file=sys.stderr) 64 | raise ToolchestKeyError(error_message) from None 65 | 66 | 67 | def get_headers(): 68 | """Returns headers for Toolchest API calls.""" 69 | return {"Authorization": f"Key {get_key()}"} 70 | -------------------------------------------------------------------------------- /toolchest_client/api/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.exceptions 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module contains custom exceptions used for the Toolchest client. 6 | """ 7 | 8 | 9 | class ToolchestException(OSError): 10 | """There was an unknown exception that occurred during your 11 | Toolchest job. 
12 | """ 13 | 14 | 15 | class ToolchestKeyError(ToolchestException): 16 | """Invalid Toolchest auth key.""" 17 | 18 | 19 | class ToolchestS3AccessError(ToolchestException): 20 | """S3 input cannot be accessed by Toolchest.""" 21 | 22 | 23 | class ToolchestDownloadError(ToolchestException): 24 | """An error occurred when downloading files from Toolchest.""" 25 | 26 | 27 | class DataLimitError(ToolchestException): 28 | """Data limit for Toolchest exceeded.""" 29 | 30 | 31 | class ToolchestJobError(ToolchestException): 32 | """An error occurred when running the job instance.""" 33 | -------------------------------------------------------------------------------- /toolchest_client/api/instance_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class InstanceType(Enum): 5 | # Compute optimized, lists vCPUs, 1:2 vCPU to RAM ratio 6 | COMPUTE_2 = "compute-2" 7 | COMPUTE_4 = "compute-4" 8 | COMPUTE_8 = "compute-8" 9 | COMPUTE_16 = "compute-16" 10 | COMPUTE_32 = "compute-32" 11 | COMPUTE_48 = "compute-48" 12 | COMPUTE_64 = "compute-64" 13 | COMPUTE_96 = "compute-96" 14 | # General optimized, lists vCPUs, 1:4 vCPU to RAM ratio 15 | GENERAL_2 = "general-2" 16 | GENERAL_4 = "general-4" 17 | GENERAL_8 = "general-8" 18 | GENERAL_16 = "general-16" 19 | GENERAL_32 = "general-32" 20 | GENERAL_48 = "general-48" 21 | GENERAL_64 = "general-64" 22 | GENERAL_96 = "general-96" 23 | # Memory optimized, lists memory, 1:8 vCPU to RAM ratio 24 | MEMORY_16 = "memory-16" 25 | MEMORY_32 = "memory-32" 26 | MEMORY_64 = "memory-64" 27 | MEMORY_128 = "memory-128" 28 | MEMORY_256 = "memory-256" 29 | MEMORY_384 = "memory-384" 30 | MEMORY_512 = "memory-512" 31 | # GPU instances, lists core GPU type 32 | GPU_V100 = "gpu-V100" 33 | -------------------------------------------------------------------------------- /toolchest_client/api/output.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.output 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module provides an Output object returned by any completed queries 6 | made by Toolchest tools. The output object contains information about 7 | where the output file can be located. 8 | 9 | Note: The Output object does NOT represent the contents of any of the 10 | tool output files themselves. 11 | """ 12 | 13 | from toolchest_client.api.download import download 14 | from toolchest_client.api.status import get_status as get_api_status 15 | 16 | 17 | class Output: 18 | """A Toolchest query output. 19 | 20 | Provides information about location of output file(s), both locally 21 | (if downloaded) and in the cloud. 
22 | 23 | """ 24 | 25 | def __init__(self, s3_uri=None, output_path=None, run_id=None): 26 | self.tool_name = None 27 | self.tool_version = None 28 | self.database_name = None 29 | self.database_version = None 30 | self.s3_uri = s3_uri 31 | self.output_path = output_path 32 | self.output_file_paths = None 33 | self.run_id = run_id 34 | self.last_status = None 35 | 36 | def __repr__(self): 37 | return str(self.__dict__) 38 | 39 | def __str__(self): 40 | return str(self.__dict__) 41 | 42 | def set_run_id(self, run_id): 43 | self.run_id = run_id 44 | 45 | def set_s3_uri(self, s3_uri): 46 | self.s3_uri = s3_uri 47 | 48 | def set_output_path(self, output_path, output_file_paths=None): 49 | self.output_path = output_path 50 | self.output_file_paths = output_file_paths 51 | 52 | def set_tool(self, tool_name=None, tool_version=None): 53 | """Sets the tool name and tool version for ensuring versioning and reproducibility.""" 54 | self.tool_name = tool_name 55 | self.tool_version = tool_version 56 | 57 | def set_database(self, database_name=None, database_version=None): 58 | """Sets the database name and database version for ensuring versioning and reproducibility. 59 | 60 | `database_version` increments when updating a database through the API. 61 | """ 62 | self.database_name = database_name 63 | self.database_version = database_version 64 | 65 | def download(self, output_path=None, output_dir=None, skip_decompression=False): 66 | if not output_path: 67 | if not output_dir: 68 | raise ValueError("Output destination directory (output_path) must be specified.") 69 | output_path = output_dir # backwards compatibility for old calls 70 | 71 | self.output_file_paths = download( 72 | output_path=output_path, 73 | s3_uri=self.s3_uri, 74 | run_id=self.run_id, 75 | skip_decompression=skip_decompression, 76 | ) 77 | return self.output_file_paths 78 | 79 | def refresh_status(self, **kwargs): 80 | self.last_status = get_api_status(self.run_id, **kwargs) 81 | 82 | def get_status(self, **kwargs): 83 | """ 84 | Returns the status of a run 85 | """ 86 | if not self.run_id: 87 | raise ValueError("Cannot get status on an output that has no run_id") 88 | self.refresh_status(**kwargs) 89 | return self.last_status 90 | -------------------------------------------------------------------------------- /toolchest_client/api/status.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.status 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module contains a function to check pipeline_segment_instance statuses and status enums. 6 | """ 7 | 8 | from enum import Enum 9 | 10 | 11 | def get_status(run_id, **kwargs): 12 | """Returns the status of the Toolchest run. 13 | 14 | Call this less than once a second to avoid being rate-limited. 15 | 16 | :param run_id: the ID returned by a tool. Internally, this ID is the pipeline_segment_instance_id. 17 | """ 18 | from toolchest_client.api.query import Query # local import to avoid circular dependency 19 | 20 | query = Query( 21 | is_async=True, 22 | pipeline_segment_instance_id=run_id, 23 | ) 24 | 25 | return query.get_job_status(**kwargs) 26 | 27 | 28 | class Status(str, Enum): 29 | """Status values for the Toolchest API.""" 30 | 31 | # NOTE: These statuses aren't currently being used with threading. 32 | # All status updates are encapsulated in the statuses of the threads. 
33 | INITIALIZED = "initialized" 34 | TRANSFERRING_FROM_CLIENT = "transferring_from_client" 35 | TRANSFERRED_FROM_CLIENT = "transferred_from_client" 36 | AWAITING_EXECUTION = "awaiting_execution" 37 | BEGINNING_EXECUTION = "beginning_execution" 38 | EXECUTING = "executing" 39 | READY_TO_TRANSFER_TO_CLIENT = "ready_to_transfer_to_client" 40 | TRANSFERRING_TO_CLIENT = "transferring_to_client" 41 | TRANSFERRED_TO_CLIENT = "transferred_to_client" 42 | TERMINATED = "terminated" 43 | COMPLETE = "complete" 44 | FAILED = "failed" 45 | 46 | 47 | class PrettyStatus(str, Enum): 48 | """Status values for local threads""" 49 | 50 | INITIALIZING = "initializing" 51 | INITIALIZED = "initialized" 52 | UPLOADING = "uploading" 53 | EXECUTING = "executing" 54 | DOWNLOADING = "downloading" 55 | COMPLETE = "complete" 56 | INTERRUPTING = "interrupting" 57 | TERMINATED = "terminated" 58 | FAILED = "failed" 59 | -------------------------------------------------------------------------------- /toolchest_client/api/streaming.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.streaming 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module provides a StreamingClient object, used by Toolchest queries to 6 | receive and print output lines streamed from the Toolchest server. 7 | """ 8 | import asyncio 9 | from loguru import logger 10 | import ssl 11 | import sys 12 | 13 | import websockets 14 | from websockets.exceptions import ConnectionClosed 15 | 16 | 17 | class StreamingClient: 18 | """A Toolchest output stream client. 19 | 20 | Provides an interface to output lines streamed from the server. 21 | 22 | """ 23 | 24 | def __init__(self): 25 | self.ssl_context = None 26 | self.streaming_token = None 27 | self.streaming_ip_address = None 28 | self.streaming_tls_cert = None 29 | self.initialized = False 30 | self.ready_to_start = False 31 | self.stream_is_open = False 32 | 33 | def initialize_params(self, streaming_token, streaming_ip_address, streaming_tls_cert): 34 | self.streaming_token = streaming_token 35 | self.streaming_ip_address = streaming_ip_address 36 | self.streaming_tls_cert = streaming_tls_cert 37 | 38 | ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) 39 | ssl_context.load_verify_locations(cadata=self.streaming_tls_cert) 40 | self.ssl_context = ssl_context 41 | self.ready_to_start = True 42 | self.initialized = True 43 | 44 | async def receive_stream(self): 45 | streaming_username = "toolchest" 46 | streaming_port = "8765" 47 | uri = f"wss://{streaming_username}:{self.streaming_token}@{self.streaming_ip_address}:{streaming_port}" 48 | logger.info("Connecting to remote server for streaming...") 49 | sys.stdout.flush() 50 | retry_count = 0 51 | while True: 52 | try: 53 | async for websocket in websockets.connect(uri, ssl=self.ssl_context): 54 | logger.debug("Connected!") 55 | try: 56 | self.stream_is_open = True 57 | while self.stream_is_open: 58 | stream_lines = await websocket.recv() 59 | # Not using logger here, because I couldn't get formatting right 60 | print(stream_lines, end="") 61 | except ConnectionClosed: 62 | self.stream_is_open = False 63 | logger.debug("\nConnection closed by server.") 64 | return 65 | except ConnectionRefusedError: 66 | retry_count += 1 67 | if retry_count > 3: 68 | raise RuntimeError("Can't connect to server. 
Try disabling output streaming and re-running.") 69 | else: 70 | continue 71 | 72 | def stream(self): 73 | self.ready_to_start = False 74 | try: 75 | loop = asyncio.get_running_loop() 76 | except RuntimeError: 77 | loop = None 78 | 79 | if loop and loop.is_running(): 80 | raise ValueError("Output streaming cannot be enabled within a running asyncio event loop.") 81 | else: 82 | asyncio.run(self.receive_stream()) 83 | -------------------------------------------------------------------------------- /toolchest_client/api/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.api.urls 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This module serves as a single source for URLs used in 6 | Toolchest queries and API calls. 7 | """ 8 | 9 | import os 10 | 11 | 12 | def get_api_url(): 13 | """Retrieves the base URL for the Toolchest server API. Defaults to "https://api.toolche.st" 14 | if a custom API URL is not set. 15 | """ 16 | # Note: BASE_URL is checked for backwards compatibility. 17 | return os.environ.get("TOOLCHEST_API_URL", os.environ.get("BASE_URL", "https://api.toolche.st")) 18 | 19 | 20 | def get_pipeline_segment_instances_url(): 21 | """Retrieves the Toolchest API Route for pipeline segment instances. Used internally.""" 22 | PIPELINE_SEGMENT_INSTANCES_ROUTE = "/pipeline-segment-instances" 23 | return get_api_url() + PIPELINE_SEGMENT_INSTANCES_ROUTE 24 | 25 | 26 | def get_s3_metadata_url(): 27 | """Retrieves the Toolchest API Route for S3 metadata. Used internally.""" 28 | S3_ROUTE = "/s3" 29 | S3_URL = get_api_url() + S3_ROUTE 30 | return S3_URL + "/metadata" 31 | 32 | 33 | def set_api_url(custom_api_url=None): 34 | """Sets the Toolchest API URL (env var TOOLCHEST_API_URL) to the given value. 35 | If a URL is not provided, resets to the default Toolchest API URL. 36 | 37 | :param custom_api_url: Custom API URL. Any trailing slashes should be removed. 
38 | 39 | Usage:: 40 | 41 | >>> import toolchest_client as toolchest 42 | >>> toolchest.set_api_url("http://your.custom.api.url.here") 43 | 44 | """ 45 | if custom_api_url: 46 | os.environ["TOOLCHEST_API_URL"] = custom_api_url 47 | else: 48 | os.environ.pop("TOOLCHEST_API_URL", None) 49 | -------------------------------------------------------------------------------- /toolchest_client/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/toolchest_client/cli/__init__.py -------------------------------------------------------------------------------- /toolchest_client/cli/cli.py: -------------------------------------------------------------------------------- 1 | import typer 2 | 3 | import toolchest_client.cli.kraken2 as kraken2 4 | import toolchest_client.cli.test as test 5 | 6 | 7 | app = typer.Typer() 8 | 9 | # Apparently this is not recommended, but it allows each tool to have its own file and maintains readability 10 | app.command()(kraken2.kraken2) 11 | app.command()(test.test) 12 | 13 | if __name__ == "__main__": 14 | app() 15 | -------------------------------------------------------------------------------- /toolchest_client/cli/test.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import typer 4 | 5 | from toolchest_client.tools import Test 6 | 7 | app = typer.Typer() 8 | 9 | 10 | @app.command() 11 | def test( 12 | inputs: List[str], 13 | output_path: str = typer.Option(None, help='Sets the directory where the success file will be downloaded'), 14 | is_async: bool = typer.Option(False, '--is_async', help='Executes the Toolchest job as an async job') 15 | ): 16 | """ 17 | Confirms that you are able to run toolchest 18 | """ 19 | test_instance = Test( 20 | tool_args='', 21 | output_name='output.tar.gz', 22 | inputs=inputs, 23 | output_path=output_path, 24 | is_async=is_async, 25 | ) 26 | test_instance.run() 27 | 28 | 29 | if __name__ == "__main__": 30 | app() 31 | -------------------------------------------------------------------------------- /toolchest_client/files/__init__.py: -------------------------------------------------------------------------------- 1 | from .general import assert_exists, check_file_size, files_in_path, sanity_check, compress_files_in_path, \ 2 | convert_input_params_to_prefix_mapping 3 | from .merge import concatenate_files, merge_sam_files 4 | from .s3 import assert_accessible_s3, get_s3_file_size, get_params_from_s3_uri, path_is_s3_uri 5 | from .split import open_new_output_file, split_file_by_lines, split_paired_files_by_lines 6 | from .unpack import OutputType, unpack_files 7 | from .public_uris import get_url_with_protocol, path_is_http_url, path_is_accessible_ftp_url 8 | -------------------------------------------------------------------------------- /toolchest_client/files/merge.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.files.merge 3 | ~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | Functions for merging files 6 | """ 7 | 8 | import multiprocessing 9 | import shutil 10 | 11 | 12 | def concatenate_files(input_file_paths, output_file_path): 13 | """Concatenates a list of files using shutil. 14 | 15 | :param input_file_paths: Paths to the files which are to be concatenated. 16 | :param output_file_path: Path to the merged output file.
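Files are concatenated in list order. Example (file names are illustrative):

    concatenate_files(["reads_part1.fastq", "reads_part2.fastq"], "reads.fastq")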
17 | """ 18 | with open(output_file_path, "wb") as output_file: 19 | for input_file_path in input_file_paths: 20 | input_file = open(input_file_path, "rb") 21 | shutil.copyfileobj(input_file, output_file) 22 | input_file.close() 23 | 24 | 25 | def merge_sam_files(input_file_paths, output_file_path): 26 | """Merges SAM files – the output for tools like STAR – using samtools. 27 | 28 | :param input_file_paths: Paths to the files which are to be merged with samtools. 29 | :param output_file_path: Path to the merged output file. 30 | """ 31 | # Only import pysam – an optional dependency – if absolutely needed 32 | import pysam 33 | 34 | # This cause problems if run on a shared machine with non-available cores 35 | num_cores = multiprocessing.cpu_count() 36 | 37 | # Options for merging SAM files: 38 | # -f: force overwrite output file 39 | # -o: specify output manually 40 | # -u: write output as an uncompressed SAM 41 | # -c: combine headers when they exist in both files 42 | # -p: merge @PG IDs 43 | # --threads: number of threads 44 | pysam.merge( 45 | "-f", 46 | "-u", 47 | "-c", 48 | "-p", 49 | "--threads", 50 | f"{num_cores}", 51 | output_file_path, 52 | *input_file_paths 53 | ) 54 | -------------------------------------------------------------------------------- /toolchest_client/files/public_uris.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.files.public_uris 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | Functions for handling files given by HTTP / HTTPS / FTP URIs. 6 | """ 7 | from ftplib import FTP 8 | from urllib.parse import urlparse 9 | from urllib3.exceptions import LocationParseError 10 | 11 | import requests 12 | from requests.exceptions import HTTPError, InvalidURL, InvalidSchema 13 | 14 | 15 | def get_url_with_protocol(url): 16 | """Returns URL with `http://` prepended, if a protocol is not specified. 17 | 18 | :param url: An input URL. 19 | """ 20 | parsed_url = urlparse(url) 21 | if not parsed_url.scheme: 22 | url = "http://" + url 23 | return url 24 | 25 | 26 | def path_is_http_url(path): 27 | """Returns whether the given path is an accessible URL by sending a GET request for the first byte. 28 | 29 | :param path: An input path. 30 | """ 31 | try: 32 | response = requests.get(path, headers={"Range": "bytes=0-0"}) 33 | response.raise_for_status() 34 | return len(response.content) == 1 35 | except (InvalidURL, HTTPError, InvalidSchema, LocationParseError, UnicodeError, Exception): 36 | return False 37 | 38 | 39 | def path_is_accessible_ftp_url(path): 40 | """Returns whether the given path is an accessible URL by sending a HEAD request. 41 | 42 | :param path: An input path. 43 | """ 44 | if path.startswith("ftp://"): 45 | file_size = get_ftp_url_file_size(path) 46 | return file_size > 0 47 | return False 48 | 49 | 50 | def get_ftp_url_file_size(url): 51 | """Returns file size of an accessible FTP URL, via SIZE command. 52 | 53 | :param url: An input URL. 
54 | """ 55 | parsed_url = urlparse(url) 56 | with FTP(parsed_url.netloc) as ftp: 57 | ftp.login() 58 | return ftp.size(parsed_url.path) 59 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/toolchest_client/files/tests/__init__.py -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/eight_line.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.1 BOGUS length=121 2 | ACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGN 3 | +test.fastq.1 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | @test.fastq.2 BOGUS length=121 6 | TGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGAT 7 | +test.fastq.2 BOGUS length=121 8 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 9 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/eight_line_split_one.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.1 BOGUS length=121 2 | ACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGN 3 | +test.fastq.1 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/eight_line_split_two.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.2 BOGUS length=121 2 | TGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGAT 3 | +test.fastq.2 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/paired_end/eight_line_R1.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.1 BOGUS length=121 2 | ACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGN 3 | +test.fastq.1 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | @test.fastq.2 BOGUS length=121 6 | TGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGAT 7 | +test.fastq.2 BOGUS length=121 8 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 9 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/paired_end/eight_line_R2.fastq: -------------------------------------------------------------------------------- 1 | @test.fastq.1 BOGUS length=121 2 | 
ACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGNACTGTTCGATGN 3 | +test.fastq.1 BOGUS length=121 4 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 5 | @test.fastq.2 BOGUS length=121 6 | TGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGATTGTTCGATGNACTGTTCGAT 7 | +test.fastq.2 BOGUS length=121 8 | =<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<=<<=====<=<==<< 9 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/data/very_small_file.txt: -------------------------------------------------------------------------------- 1 | A 2 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/test_general.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | import pytest 5 | 6 | from .. import assert_exists, check_file_size, files_in_path, sanity_check, convert_input_params_to_prefix_mapping 7 | 8 | THIS_FILE_PATH = os.path.normpath(pathlib.Path(__file__).parent.resolve()) 9 | 10 | 11 | def test_small_file(): 12 | small_file_path = f"{THIS_FILE_PATH}/data/very_small_file.txt" 13 | with pytest.raises(ValueError): 14 | sanity_check(small_file_path) 15 | 16 | 17 | def test_files_in_path(): 18 | tmp_dir = f"{THIS_FILE_PATH}/tmp" 19 | tmp1 = f"{tmp_dir}/tmp1" 20 | tmp2 = f"{tmp_dir}/tmp2" 21 | sub_dir = f"{tmp_dir}/sub_dir" 22 | tmp3 = f"{sub_dir}/tmp3" 23 | file_paths = [tmp1, tmp2, tmp3] 24 | os.makedirs(sub_dir, exist_ok=True) 25 | for file in file_paths: 26 | open(file, "w").close() 27 | file_paths = sorted([os.path.normpath(x) for x in file_paths]) 28 | 29 | assert sorted([os.path.normpath(x) for x in files_in_path(tmp_dir)]) == file_paths 30 | 31 | for file in file_paths: 32 | os.remove(file) 33 | os.removedirs(sub_dir) 34 | 35 | 36 | def test_file_too_large(): 37 | with pytest.raises(ValueError): 38 | check_file_size(f"{THIS_FILE_PATH}/data/eight_line.fastq", max_size_bytes=100) 39 | 40 | 41 | def test_nonexistent_file(): 42 | bogus_file_path = f"{THIS_FILE_PATH}/data/bogus_file_path" 43 | with pytest.raises(FileNotFoundError): 44 | assert_exists(bogus_file_path) 45 | 46 | 47 | def test_exists_but_not_file(): 48 | dir_file_path = f"{THIS_FILE_PATH}/data" 49 | with pytest.raises(ValueError): 50 | assert_exists(dir_file_path, must_be_file=True) 51 | 52 | 53 | def test_generate_prefix_mapping(): 54 | tag_to_param_map = { 55 | "-1": ["example1_R1.fastq", "example2_R1.fastq"], 56 | "-2": ["example1_R2.fastq", "example2_R2.fastq"], 57 | "-U": ["example1_U.fastq", "example2_U.fastq"], 58 | } 59 | input_list, prefix_mapping = convert_input_params_to_prefix_mapping(tag_to_param_map) 60 | assert sorted(input_list) == sorted([ 61 | "example1_R1.fastq", 62 | "example2_R1.fastq", 63 | "example1_R2.fastq", 64 | "example2_R2.fastq", 65 | "example1_U.fastq", 66 | "example2_U.fastq", 67 | ]) 68 | assert prefix_mapping == { 69 | "example1_R1.fastq": { 70 | "prefix": "-1", 71 | "order": 0, 72 | }, 73 | "example1_R2.fastq": { 74 | "prefix": "-2", 75 | "order": 0, 76 | }, 77 | "example1_U.fastq": { 78 | "prefix": "-U", 79 | "order": 0, 80 | }, 81 | "example2_R1.fastq": { 82 | "prefix": "-1", 83 | "order": 1, 84 | }, 85 | "example2_R2.fastq": { 86 | "prefix": "-2", 87 | "order": 1, 88 | }, 89 | "example2_U.fastq": { 90 | "prefix": "-U", 91 | 
"order": 1, 92 | }, 93 | } 94 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/test_merge.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | import os 3 | import pathlib 4 | 5 | from .. import concatenate_files 6 | 7 | THIS_FILE_PATH = pathlib.Path(__file__).parent.resolve() 8 | 9 | 10 | def test_concatenate_files(): 11 | input_file_path = f"{THIS_FILE_PATH}/data/eight_line.fastq" 12 | split_one_path = f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq" 13 | split_two_path = f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq" 14 | temp_output_file_path = f"{THIS_FILE_PATH}/data/temp_output.fastq" 15 | 16 | concatenate_files([split_one_path, split_two_path], temp_output_file_path) 17 | 18 | assert filecmp.cmp(input_file_path, temp_output_file_path) 19 | 20 | os.remove(temp_output_file_path) 21 | 22 | # TODO: test merge_sam_files() in a way that's reproducible on different OS choices 23 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/test_s3.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from .. import assert_accessible_s3, get_s3_file_size, get_params_from_s3_uri 4 | from ...api.exceptions import ToolchestS3AccessError 5 | 6 | EXAMPLE_FASTQ_SIZE = 48468258 7 | EXAMPLE_FASTQ_URI = "s3://toolchest-public-examples/example.fastq" 8 | 9 | 10 | def test_s3_params(): 11 | example_s3_uri = "s3://toolchest-public-examples/dummy-id/example.fastq" 12 | params = get_params_from_s3_uri(example_s3_uri) 13 | target_params = { 14 | "arn": "arn:aws:s3:::toolchest-public-examples/dummy-id/example.fastq", 15 | "bucket": "toolchest-public-examples", 16 | "key": "dummy-id/example.fastq", 17 | "key_initial": "dummy-id", 18 | "key_final": "example.fastq" 19 | } 20 | 21 | assert params == target_params 22 | 23 | 24 | @pytest.mark.integration 25 | def test_public_s3_file(): 26 | assert_accessible_s3(EXAMPLE_FASTQ_URI) 27 | 28 | 29 | @pytest.mark.integration 30 | def test_fake_s3_file(): 31 | fake_s3_uri = "s3://toolchest-this-is-a-bad-bucket/bogus.fastq" 32 | with pytest.raises(ToolchestS3AccessError): 33 | assert_accessible_s3(fake_s3_uri) 34 | 35 | 36 | @pytest.mark.integration 37 | def test_s3_file_size(): 38 | assert get_s3_file_size(EXAMPLE_FASTQ_URI) == EXAMPLE_FASTQ_SIZE 39 | -------------------------------------------------------------------------------- /toolchest_client/files/tests/test_split.py: -------------------------------------------------------------------------------- 1 | import filecmp 2 | import os 3 | import pathlib 4 | 5 | from .. import split_file_by_lines, split_paired_files_by_lines 6 | 7 | THIS_FILE_PATH = pathlib.Path(__file__).parent.resolve() 8 | 9 | 10 | def delete_temp_files(file_paths): 11 | """ 12 | Deletes temporary files. Only use for testing. 
13 | """ 14 | for file_path in file_paths: 15 | os.remove(file_path) 16 | 17 | 18 | def assert_files_eq(file_path_one, file_path_two): 19 | assert filecmp.cmp(file_path_one, file_path_two) 20 | 21 | 22 | def test_split_small_fastq(): 23 | new_file_paths = [] 24 | split_file_paths = split_file_by_lines( 25 | input_file_path=f"{THIS_FILE_PATH}/data/eight_line.fastq", 26 | num_lines_in_group=4, 27 | max_bytes=100 28 | ) 29 | 30 | for _, file_path in split_file_paths: 31 | new_file_paths.append(file_path) 32 | 33 | assert len(new_file_paths) == 2 34 | 35 | assert_files_eq(new_file_paths[0], f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq") 36 | assert_files_eq(new_file_paths[1], f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq") 37 | 38 | delete_temp_files(new_file_paths) 39 | 40 | 41 | def test_split_small_fastq_small_bytes(): 42 | new_file_paths = [] 43 | split_file_paths = split_file_by_lines( 44 | input_file_path=f"{THIS_FILE_PATH}/data/eight_line.fastq", 45 | num_lines_in_group=4, 46 | max_bytes=1 47 | ) 48 | 49 | for _, file_path in split_file_paths: 50 | new_file_paths.append(file_path) 51 | 52 | assert len(new_file_paths) == 2 53 | 54 | assert_files_eq(new_file_paths[0], f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq") 55 | assert_files_eq(new_file_paths[1], f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq") 56 | 57 | delete_temp_files(new_file_paths) 58 | 59 | 60 | def test_split_paired_fastqs(): 61 | new_file_paths = [] 62 | files_to_delete = [] 63 | split_file_paths = split_paired_files_by_lines( 64 | input_file_paths=[ 65 | f"{THIS_FILE_PATH}/data/paired_end/eight_line_R1.fastq", 66 | f"{THIS_FILE_PATH}/data/paired_end/eight_line_R2.fastq", 67 | ], 68 | num_lines_in_group=4, 69 | max_bytes=100 70 | ) 71 | 72 | for split_paired_input_files in split_file_paths: 73 | new_file_paths.append(split_paired_input_files) 74 | files_to_delete.append(split_paired_input_files[0]) 75 | files_to_delete.append(split_paired_input_files[1]) 76 | 77 | assert len(new_file_paths) == 2 78 | assert len(new_file_paths[0]) == 2 79 | assert len(new_file_paths[1]) == 2 80 | 81 | assert_files_eq(new_file_paths[0][0], f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq") 82 | assert_files_eq(new_file_paths[1][0], f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq") 83 | assert_files_eq(new_file_paths[0][1], f"{THIS_FILE_PATH}/data/eight_line_split_one.fastq") 84 | assert_files_eq(new_file_paths[1][1], f"{THIS_FILE_PATH}/data/eight_line_split_two.fastq") 85 | 86 | delete_temp_files(files_to_delete) 87 | -------------------------------------------------------------------------------- /toolchest_client/files/unpack.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import os 3 | import shutil 4 | import tarfile 5 | 6 | 7 | class OutputType(Enum): 8 | GZ_TAR = ".tar.gz" 9 | FLAT_TEXT = ".txt" 10 | SAM_FILE = ".sam" 11 | S3 = "" 12 | 13 | 14 | def unpack_files(file_path_to_unpack, is_compressed): 15 | """Unpack output file, if needed. Returns the path(s) to the (optionally) unpacked output. 16 | If only 1 file is unpacked, returns a string containing that file's path. 17 | If there are multiple unpacked files, returns a list of paths. 18 | Returns a list of file paths to unpacked files. 
19 | """ 20 | if is_compressed: 21 | # Get names of files in archive 22 | with tarfile.open(file_path_to_unpack) as tar: 23 | unpacked_file_names = tar.getnames() 24 | 25 | unpacked_outputs_dir = os.path.dirname(file_path_to_unpack) 26 | shutil.unpack_archive( 27 | filename=file_path_to_unpack, 28 | extract_dir=unpacked_outputs_dir, 29 | format="gztar", 30 | ) 31 | 32 | # Remove the unpacked .tar.gz file and empty unpacked output folder 33 | os.remove(file_path_to_unpack) 34 | 35 | unpacked_paths = ["/".join([unpacked_outputs_dir, file_name]) for file_name in unpacked_file_names] 36 | unpacked_file_paths = [os.path.normpath(path) for path in unpacked_paths if os.path.isfile(path)] 37 | 38 | # If only 1 file is unpacked, just return path instead of [path]. 39 | # This is to be consistent with the return value from the other output types. 40 | if len(unpacked_file_paths) == 1: 41 | return unpacked_file_paths[0] 42 | return unpacked_file_paths 43 | else: 44 | return file_path_to_unpack 45 | -------------------------------------------------------------------------------- /toolchest_client/logging.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | import os 3 | import sys 4 | 5 | LOG_LEVEL = os.environ.get("TOOLCHEST_LOG_LEVEL", "INFO") 6 | 7 | 8 | def get_log_level(): 9 | return LOG_LEVEL 10 | 11 | 12 | def setup_logging(log_level=None): 13 | global LOG_LEVEL 14 | if log_level and log_level != LOG_LEVEL: 15 | LOG_LEVEL = log_level 16 | logger.remove() 17 | 18 | valid_log_levels = ["DEBUG", "INFO", "WARNING", "ERROR"] 19 | if LOG_LEVEL not in valid_log_levels: 20 | raise ValueError(f"Invalid log level: {LOG_LEVEL}. Valid levels are: {valid_log_levels}") 21 | 22 | if LOG_LEVEL in ["DEBUG", "INFO", "WARNING"]: 23 | stdout_filter = lambda record: record["level"].no < 40 24 | logger.add( 25 | sys.stdout, 26 | filter=stdout_filter, 27 | level=LOG_LEVEL, 28 | format="{time} | {level} | {message}", 29 | ) 30 | # Including if log_level == "ERROR" 31 | logger.add(sys.stderr, level="ERROR") 32 | -------------------------------------------------------------------------------- /toolchest_client/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .tool import Tool 2 | from .alphafold import AlphaFold 3 | from .blastn import BLASTN 4 | from .bowtie2 import Bowtie2 5 | from .bracken import Bracken 6 | from .cellranger import CellRangerCount 7 | from .centrifuge import Centrifuge 8 | from .clustalo import ClustalO 9 | from .demucs import Demucs 10 | from .diamond import DiamondBlastp, DiamondBlastx 11 | from .fastqc import FastQC 12 | from .humann import HUMAnN3 13 | from .jupyter import Jupyter 14 | from .kallisto import Kallisto 15 | from .kraken2 import Kraken2 16 | from .last import Lastal5 17 | from .lug import Lug 18 | from .megahit import Megahit 19 | from .metaphlan import MetaPhlAn 20 | from .python3 import Python3 21 | from .rapsearch2 import Rapsearch2 22 | from .salmon import Salmon 23 | from .shi7 import Shi7 24 | from .shogun import ShogunAlign, ShogunFilter 25 | from .star import STARInstance 26 | from .test import Test 27 | from .transfer import Transfer 28 | from .unicycler import Unicycler 29 | -------------------------------------------------------------------------------- /toolchest_client/tools/alphafold.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.AlphaFold 3 | 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the AlphaFold implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from . import Tool 9 | 10 | 11 | class AlphaFold(Tool): 12 | """ 13 | The AlphaFold implementation of the Tool class. 14 | """ 15 | def __init__(self, inputs, output_path, tool_args, **kwargs): 16 | super().__init__( 17 | tool_name="alphafold", 18 | tool_version="2.1.2", 19 | tool_args=tool_args, 20 | inputs=inputs, 21 | database_name="alphafold_standard", 22 | database_version="2.1.2", 23 | parallel_enabled=False, 24 | output_type=OutputType.GZ_TAR, 25 | output_path=output_path, 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/blastn.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.BLASTN 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the BLASTN implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class BLASTN(Tool): 12 | """ 13 | The BLASTN implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, output_primary_name, database_name, database_version, **kwargs): 16 | super().__init__( 17 | tool_name="blastn", 18 | tool_version="2.13.0", 19 | tool_args=tool_args, 20 | output_path=output_path, 21 | output_primary_name=output_primary_name, 22 | inputs=inputs, 23 | database_name=database_name, 24 | database_version=database_version, 25 | max_input_bytes_per_file=10 * 1024 * 1024 * 1024, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=["blastn_results.out"], 28 | **kwargs, 29 | ) 30 | -------------------------------------------------------------------------------- /toolchest_client/tools/bowtie2.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.bowtie2 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the bowtie2 implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | 9 | from . import Tool 10 | 11 | 12 | class Bowtie2(Tool): 13 | """ 14 | The bowtie2 implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, database_name, 17 | database_version, **kwargs): 18 | super().__init__( 19 | tool_name="bowtie2", 20 | tool_version="2.4.4", # todo: allow bowtie2 version to be set by the user 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | parallel_enabled=False, 27 | output_type=OutputType.GZ_TAR, 28 | expected_output_file_names=["bowtie2.log", "bowtie2_output.sam"], 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/bracken.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.bracken 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Bracken implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | from toolchest_client.files.s3 import path_is_s3_uri 10 | 11 | 12 | class Bracken(Tool): 13 | """ 14 | The Bracken implementation of the Tool class. 
15 | """ 16 | def __init__(self, tool_args, inputs, output_path, 17 | database_name, database_version, remote_database_path, **kwargs): 18 | super().__init__( 19 | tool_name="bracken", 20 | tool_version="2.7", # todo: allow bracken version to be set by the user 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | remote_database_path=remote_database_path, 27 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 28 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/cellranger.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.cellranger 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This contains the cellranger implementations of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class CellRangerCount(Tool): 12 | """ 13 | The cellranger_count implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, database_name, 16 | database_version, **kwargs): 17 | super().__init__( 18 | tool_name="cellranger_count", 19 | tool_version="6.1.2", # todo: allow cellranger version to be set by the user 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | database_name=database_name, 24 | database_version=database_version, 25 | compress_inputs=True, 26 | max_input_bytes_per_file=128 * 1024 * 1024 * 1024, 27 | output_type=OutputType.GZ_TAR, 28 | **kwargs, 29 | ) 30 | -------------------------------------------------------------------------------- /toolchest_client/tools/centrifuge.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.centrifuge 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the centrifuge implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType, sanity_check 9 | 10 | 11 | class Centrifuge(Tool): 12 | """ 13 | The centrifuge implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, input_prefix_mapping, database_name, database_version, 16 | output_path, **kwargs): 17 | super().__init__( 18 | tool_name="centrifuge", 19 | tool_version="1.0.4", # todo: allow version to be set by the user 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | input_prefix_mapping=input_prefix_mapping, 24 | database_name=database_name, 25 | database_version=database_version, 26 | max_inputs=None, 27 | parallel_enabled=False, 28 | output_type=OutputType.GZ_TAR, 29 | expected_output_file_names=[ 30 | "centrifuge_output.txt", 31 | "centrifuge_report.tsv", 32 | ], 33 | **kwargs, 34 | ) 35 | 36 | def _postflight(self, output): 37 | if self.output_validation_enabled: 38 | for output_file_name in self.expected_output_file_names: 39 | # Skip validation for the "done" file, which should be empty. 
40 | if output_file_name != "done": 41 | output_file_path = f"{self.output_path}/{output_file_name}" 42 | sanity_check(output_file_path) 43 | -------------------------------------------------------------------------------- /toolchest_client/tools/clustalo.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.clustalo 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Clustal Omega implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class ClustalO(Tool): 12 | """ 13 | The Clustal Omega implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, output_primary_name, **kwargs): 16 | super().__init__( 17 | tool_name="clustalo", 18 | tool_version='1.2.4', 19 | tool_args=tool_args, 20 | output_primary_name=output_primary_name, 21 | output_path=output_path, 22 | inputs=inputs, 23 | parallel_enabled=False, 24 | output_type=OutputType.GZ_TAR, 25 | expected_output_file_names=[output_primary_name, f"{output_primary_name}.log"], 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/demucs.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.demucs 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Demucs implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class Demucs(Tool): 12 | """ 13 | The Demucs implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, **kwargs): 16 | super().__init__( 17 | tool_name="demucs", 18 | tool_version='3.0.4', 19 | tool_args=tool_args, 20 | output_path=output_path, 21 | inputs=inputs, 22 | parallel_enabled=False, 23 | output_type=OutputType.GZ_TAR, 24 | expected_output_file_names=["error.log", "output.log"], 25 | **kwargs, 26 | ) 27 | -------------------------------------------------------------------------------- /toolchest_client/tools/diamond.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.Diamond 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Diamond implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from . import Tool 9 | 10 | 11 | class DiamondBlastp(Tool): 12 | """ 13 | The DIAMOND BLASTP implementation of the Tool class. 14 | """ 15 | def __init__(self, inputs, database_name, database_version, output_path, output_primary_name, tool_args, 16 | remote_database_path, remote_database_primary_name, **kwargs): 17 | super().__init__( 18 | tool_name="diamond_blastp", 19 | tool_version="2.0.14", 20 | tool_args=tool_args, 21 | output_primary_name=output_primary_name, 22 | inputs=inputs, 23 | remote_database_path=remote_database_path, 24 | remote_database_primary_name=remote_database_primary_name, 25 | database_name=database_name, 26 | database_version=database_version, 27 | parallel_enabled=False, 28 | output_type=OutputType.GZ_TAR, 29 | output_path=output_path, 30 | expected_output_file_names=[output_primary_name, "diamond.log"], 31 | **kwargs, 32 | ) 33 | 34 | 35 | class DiamondBlastx(Tool): 36 | """ 37 | The DIAMOND BLASTX implementation of the Tool class. 
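    Example -- an illustrative sketch with placeholder paths and database
    names. Setting distributed=True registers the run as
    diamond_blastx_parallel instead of diamond_blastx; the run() entry
    point on the base Tool class is assumed:

        from toolchest_client.tools.diamond import DiamondBlastx

        blastx = DiamondBlastx(
            inputs="./reads.fasta",
            database_name="diamond_blastx_standard",  # placeholder
            database_version="1",
            output_path="./blastx_output",
            output_primary_name="blastx_results.tsv",
            tool_args="",
            remote_database_path=None,
            distributed=False,
        )
        blastx.run()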
38 | """ 39 | def __init__(self, inputs, database_name, database_version, output_path, output_primary_name, tool_args, 40 | remote_database_path, distributed=False, **kwargs): 41 | super().__init__( 42 | tool_name="diamond_blastx" if not distributed else "diamond_blastx_parallel", 43 | tool_version="2.0.13", 44 | tool_args=tool_args, 45 | output_primary_name=output_primary_name, 46 | inputs=inputs, 47 | remote_database_path=remote_database_path, 48 | database_name=database_name, 49 | database_version=database_version, 50 | output_type=OutputType.GZ_TAR, 51 | output_path=output_path, 52 | expected_output_file_names=[output_primary_name], 53 | **kwargs, 54 | ) 55 | -------------------------------------------------------------------------------- /toolchest_client/tools/fastqc.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.fastqc 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the FastQC implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from toolchest_client.files.s3 import path_is_s3_uri 9 | 10 | from . import Tool 11 | 12 | 13 | class FastQC(Tool): 14 | """ 15 | The FastQC implementation of the Tool class. 16 | """ 17 | def __init__(self, tool_args, inputs, output_path, **kwargs): 18 | super().__init__( 19 | tool_name="fastqc", 20 | tool_version="0.11.9", 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 25 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/humann.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.humann 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the HUMAnN implementation of the Tool class. 6 | """ 7 | from enum import Enum 8 | 9 | from toolchest_client.files import OutputType 10 | 11 | from . import Tool 12 | 13 | 14 | class HUMAnN3(Tool): 15 | """ 16 | The HUMAnN implementation of the Tool class. 
17 | """ 18 | def __init__(self, tool_args, inputs, output_primary_name, input_prefix_mapping, output_path, **kwargs): 19 | super().__init__( 20 | tool_name="humann3", 21 | tool_version="3.1.1", # todo: allow version to be set by the user 22 | database_name="humann3_protein_uniref90_diamond", 23 | database_version="1", 24 | tool_args=tool_args, 25 | output_path=output_path, 26 | output_primary_name=output_primary_name, 27 | inputs=inputs, 28 | input_prefix_mapping=input_prefix_mapping, 29 | parallel_enabled=False, 30 | output_type=OutputType.GZ_TAR, 31 | **kwargs, 32 | ) 33 | 34 | 35 | class HUMAnN3Mode(Enum): 36 | HUMANN = ("humann", False) 37 | HUMANN_BARPLOT = ("humann_barplot", True) 38 | HUMANN_GENE_FAMILIES_GENUS_LEVEL = ("humann_genefamilies_genus_level", True) 39 | HUMANN_JOIN_TABLES = ("humann_join_tables", True) 40 | HUMANN_REDUCE_TABLE = ("humann_reduce_table", True) 41 | HUMANN_REGROUP_TABLE = ("humann_regroup_table", True) 42 | HUMANN_RENORM_TABLE = ("humann_renorm_table", True) 43 | HUMANN_RENAME_TABLE = ("humann_rename_table", True) 44 | HUMANN_SPLIT_STRATIFIED_TABLE = ("humann_split_stratified_table", False) 45 | HUMANN_UNPACK_PATHWAYS = ("humann_unpack_pathways", True) 46 | -------------------------------------------------------------------------------- /toolchest_client/tools/jupyter.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.jupyter 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Jupyter implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType, path_is_s3_uri 8 | 9 | from . import Tool 10 | 11 | 12 | class Jupyter(Tool): 13 | """ 14 | The Jupyter implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, input_prefix_mapping, **kwargs): 17 | super().__init__( 18 | tool_name="jupyter", 19 | tool_version="1", 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | input_prefix_mapping=input_prefix_mapping, 24 | output_primary_name="token.txt", 25 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.FLAT_TEXT, 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/kallisto.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.kallisto 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Kallisto implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from toolchest_client.files.s3 import path_is_s3_uri 9 | 10 | from . import Tool 11 | 12 | 13 | class Kallisto(Tool): 14 | """ 15 | The Kallisto implementation of the Tool class. 
16 | """ 17 | def __init__(self, tool_args, inputs, output_path, database_name, 18 | database_version, input_prefix_mapping, **kwargs): 19 | super().__init__( 20 | tool_name="kallisto", 21 | tool_version="0.48.0", 22 | tool_args=tool_args, 23 | output_path=output_path, 24 | inputs=inputs, 25 | input_prefix_mapping=input_prefix_mapping, 26 | database_name=database_name, 27 | database_version=database_version, 28 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 29 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 30 | **kwargs, 31 | ) 32 | -------------------------------------------------------------------------------- /toolchest_client/tools/kraken2.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.kraken2 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Kraken2 implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | from toolchest_client.files.s3 import path_is_s3_uri 10 | 11 | 12 | class Kraken2(Tool): 13 | """ 14 | The Kraken2 implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, 17 | database_name, database_version, remote_database_path, tool_version='2.1.1', **kwargs): 18 | super().__init__( 19 | tool_name="kraken2", 20 | tool_version=tool_version, 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | remote_database_path=remote_database_path, 27 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 28 | parallel_enabled=False, 29 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 30 | expected_output_file_names=["kraken2_output.txt", "kraken2_report.txt"], 31 | **kwargs, 32 | ) 33 | -------------------------------------------------------------------------------- /toolchest_client/tools/last.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.last 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Last implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | from toolchest_client.files.s3 import path_is_s3_uri 10 | 11 | 12 | class Lastal5(Tool): 13 | """ 14 | The lastal5 implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, output_primary_name, database_name, database_version, **kwargs): 17 | super().__init__( 18 | tool_name="lastal5", 19 | tool_version="1411", 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | output_primary_name=output_primary_name, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 27 | parallel_enabled=False, 28 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/lug.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.lug 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Lug implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType, path_is_s3_uri 8 | 9 | from . import Tool 10 | 11 | 12 | class Lug(Tool): 13 | """ 14 | The Lug implementation of the Tool class. 
15 | """ 16 | def __init__(self, tool_args, inputs, output_path, tool_version, 17 | custom_docker_image_id=None, **kwargs): 18 | super().__init__( 19 | tool_name="lug", 20 | tool_version=tool_version, 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | max_input_bytes_per_file=4 * 1024 * 1024 * 1024 * 1024, 25 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 26 | custom_docker_image_id=custom_docker_image_id, 27 | **kwargs, 28 | ) 29 | -------------------------------------------------------------------------------- /toolchest_client/tools/megahit.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.megahit 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the megahit implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType, sanity_check 9 | 10 | 11 | class Megahit(Tool): 12 | """ 13 | The megahit implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, input_prefix_mapping, 16 | output_path, **kwargs): 17 | super().__init__( 18 | tool_name="megahit", 19 | tool_version="1.2.9", # todo: allow version to be set by the user 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | input_prefix_mapping=input_prefix_mapping, 24 | max_inputs=None, 25 | parallel_enabled=False, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=[ 28 | "checkpoints.txt", 29 | "done", 30 | "final.contigs.fa", 31 | "log", 32 | "options.json", 33 | ], 34 | **kwargs, 35 | ) 36 | 37 | def _postflight(self, output): 38 | if self.output_validation_enabled: 39 | for output_file_name in self.expected_output_file_names: 40 | # Skip validation for the "done" file, which should be empty. 41 | if output_file_name != "done": 42 | output_file_path = f"{self.output_path}/{output_file_name}" 43 | sanity_check(output_file_path) 44 | -------------------------------------------------------------------------------- /toolchest_client/tools/metaphlan.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.metaphlan 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the MetaPhlAn implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from toolchest_client.files.s3 import path_is_s3_uri 9 | 10 | from . import Tool 11 | 12 | 13 | class MetaPhlAn(Tool): 14 | """ 15 | The MetaPhlAn implementation of the Tool class. 16 | """ 17 | def __init__(self, tool_args, inputs, output_path, output_primary_name, **kwargs): 18 | super().__init__( 19 | tool_name="metaphlan", 20 | tool_version="3.0.14", 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | database_name="metaphlan_mpa_v30_CHOCOPhlAn_201901", 24 | database_version="1", 25 | output_primary_name=output_primary_name, 26 | inputs=inputs, 27 | max_inputs=1, 28 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 29 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 30 | **kwargs, 31 | ) 32 | -------------------------------------------------------------------------------- /toolchest_client/tools/python3.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.python3 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Python3 implementation of the Tool class. 
6 | """ 7 | from toolchest_client.files import OutputType, path_is_s3_uri 8 | 9 | from . import Tool 10 | 11 | 12 | class Python3(Tool): 13 | """ 14 | The Python3 implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, custom_docker_image_id=None, **kwargs): 17 | super().__init__( 18 | tool_name="python3", 19 | tool_version="3.9.1", 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 24 | custom_docker_image_id=custom_docker_image_id, 25 | **kwargs, 26 | ) 27 | -------------------------------------------------------------------------------- /toolchest_client/tools/rapsearch2.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.rapsearch 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Rapsearch implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class Rapsearch2(Tool): 12 | """ 13 | The Rapsearch implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, output_primary_name, 16 | database_name, database_version, **kwargs): 17 | super().__init__( 18 | tool_name="rapsearch2", 19 | tool_version="2.24", 20 | tool_args=tool_args, 21 | output_primary_name=output_primary_name, 22 | output_path=output_path, 23 | inputs=inputs, 24 | database_name=database_name, 25 | database_version=database_version, 26 | parallel_enabled=False, 27 | output_type=OutputType.GZ_TAR, 28 | expected_output_file_names=[f"{output_primary_name}.m8"], # .aln output may be omitted with certain args 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/salmon.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.salmon 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Salmon implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | from toolchest_client.files.s3 import path_is_s3_uri 9 | 10 | from . import Tool 11 | 12 | 13 | class Salmon(Tool): 14 | """ 15 | The Salmon implementation of the Tool class. 16 | """ 17 | def __init__(self, tool_args, inputs, output_path, database_name, 18 | database_version, **kwargs): 19 | super().__init__( 20 | tool_name="salmon", 21 | tool_version="1.9.0", # todo: allow salmon version to be set by the user 22 | tool_args=tool_args, 23 | output_path=output_path, 24 | inputs=inputs, 25 | database_name=database_name, 26 | database_version=database_version, 27 | max_input_bytes_per_file=64 * 1024 * 1024 * 1024, 28 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 29 | **kwargs, 30 | ) 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/shi7.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.shi7 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the shi7 implementation of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class Shi7(Tool): 12 | """ 13 | The shi7 implementation of the Tool class. 
14 | """ 15 | def __init__(self, tool_args, inputs, output_path, **kwargs): 16 | super().__init__( 17 | tool_name="shi7", 18 | tool_version="1.0.3", # todo: allow shi7 version to be set by the user 19 | tool_args=tool_args, 20 | output_path=output_path, 21 | inputs=inputs, 22 | max_inputs=None, # note: no limit is set on the # of inputs 23 | parallel_enabled=False, 24 | group_paired_ends=True, 25 | max_input_bytes_per_file=16 * 1024 * 1024 * 1024, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=["combined_seqs.fna", "shi7.log"], 28 | **kwargs, 29 | ) 30 | -------------------------------------------------------------------------------- /toolchest_client/tools/shogun.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.shogun 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This contains the shogun_align and shogun_filter implementations of the Tool class. 6 | """ 7 | from . import Tool 8 | from toolchest_client.files import OutputType 9 | 10 | 11 | class ShogunAlign(Tool): 12 | """ 13 | The shogun_align implementation of the Tool class. 14 | """ 15 | def __init__(self, tool_args, inputs, output_path, database_name, 16 | database_version, **kwargs): 17 | super().__init__( 18 | tool_name="shogun_align", 19 | tool_version="1.0.8", # todo: allow shogun version to be set by the user 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | database_name=database_name, 24 | database_version=database_version, 25 | parallel_enabled=False, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=["alignment.bowtie2.sam"], 28 | **kwargs, 29 | ) 30 | 31 | 32 | class ShogunFilter(Tool): 33 | """ 34 | The shogun_filter implementation of the Tool class. 35 | """ 36 | def __init__(self, tool_args, inputs, output_path, database_name, 37 | database_version, **kwargs): 38 | super().__init__( 39 | tool_name="shogun_filter", 40 | tool_version="1.0.8", # todo: allow shogun version to be set by the user 41 | tool_args=tool_args, 42 | output_path=output_path, 43 | inputs=inputs, 44 | min_inputs=1, 45 | max_inputs=1, 46 | database_name=database_name, 47 | database_version=database_version, 48 | parallel_enabled=False, 49 | output_type=OutputType.GZ_TAR, 50 | expected_output_file_names=["combined_seqs.filtered.fna", "alignment.burst.best.b6"], 51 | **kwargs, 52 | ) 53 | -------------------------------------------------------------------------------- /toolchest_client/tools/star.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.STAR 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the STAR implementation of the Tool class. 6 | 7 | Note: This tool is named STARInstance to differentiate it from 8 | the STAR function called by the user, which is given in all caps 9 | to be in line with the command-line argument. 10 | """ 11 | from . import Tool 12 | from toolchest_client.files import OutputType 13 | 14 | 15 | class STARInstance(Tool): 16 | """ 17 | The STAR implementation of the Tool class. 
18 | """ 19 | def __init__(self, tool_args, inputs, input_prefix_mapping, output_path, 20 | database_name, database_version, parallelize, output_primary_name=None, **kwargs): 21 | super().__init__( 22 | tool_name="STAR", 23 | tool_version="2.7.9a", 24 | tool_args=tool_args, 25 | output_path=output_path, 26 | output_primary_name=output_primary_name, 27 | inputs=inputs, 28 | input_prefix_mapping=input_prefix_mapping, 29 | database_name=database_name, 30 | database_version=database_version, 31 | parallel_enabled=False, 32 | max_input_bytes_per_file=128 * 1024 * 1024 * 1024, 33 | max_input_bytes_per_file_parallel=4.5 * 1024 * 1024 * 1024, 34 | output_type=OutputType.SAM_FILE if parallelize else OutputType.GZ_TAR, 35 | expected_output_file_names=[ 36 | "Aligned.out.sam", 37 | "Log.final.out", 38 | "Log.out", 39 | "Log.progress.out", 40 | "SJ.out.tab", 41 | ], 42 | **kwargs, 43 | ) 44 | -------------------------------------------------------------------------------- /toolchest_client/tools/test.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.test 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the test implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | 9 | from . import Tool 10 | 11 | 12 | class Test(Tool): 13 | """ 14 | The test implementation of the Tool class. 15 | """ 16 | def __init__(self, tool_args, inputs, output_path, **kwargs): 17 | super().__init__( 18 | tool_name="test", 19 | tool_version="0.1.0", 20 | tool_args=tool_args, 21 | output_path=output_path, 22 | inputs=inputs, 23 | max_input_bytes_per_file=256 * 1024 * 1024 * 1024, 24 | parallel_enabled=False, 25 | output_type=OutputType.GZ_TAR, 26 | expected_output_file_names=["test_output.txt"], 27 | **kwargs, 28 | ) 29 | -------------------------------------------------------------------------------- /toolchest_client/tools/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trytoolchest/toolchest-client-python/063774e104e4842773aeeb2dccc226c1c60f0a2a/toolchest_client/tools/tests/__init__.py -------------------------------------------------------------------------------- /toolchest_client/tools/tests/test_generic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from ..test import Test 5 | 6 | THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 7 | 8 | 9 | def test_unknown_arg_handling(): 10 | tool_args = "--unknown arg" 11 | test_instance = Test( 12 | tool_args=tool_args, 13 | inputs=f"{THIS_DIRECTORY}/test_generic.py", 14 | output_path="./output.tar.gz", 15 | ) 16 | 17 | with pytest.raises(ValueError): 18 | test_instance._validate_args() 19 | 20 | 21 | def test_blacklisted_arg_handling(): 22 | tool_args = "--bad arg" 23 | test_instance = Test( 24 | tool_args=tool_args, 25 | inputs=f"{THIS_DIRECTORY}/test_generic.py", 26 | output_path="./output.tar.gz", 27 | ) 28 | 29 | with pytest.raises(ValueError): 30 | test_instance._validate_args() 31 | -------------------------------------------------------------------------------- /toolchest_client/tools/tests/test_kraken2.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from ..kraken2 import Kraken2 4 | 5 | THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def test_kraken2_preflight(): 9 | output_path = f"{THIS_DIRECTORY}/output" 10 | 
kraken_instance = Kraken2( 11 | tool_args="", 12 | inputs=f"{THIS_DIRECTORY}/test_kraken2.py", 13 | output_path=output_path, 14 | database_name="standard", 15 | database_version=1, 16 | remote_database_path=None, 17 | ) 18 | kraken_instance._preflight() 19 | 20 | assert os.path.isdir(output_path) 21 | os.rmdir(output_path) 22 | -------------------------------------------------------------------------------- /toolchest_client/tools/tests/test_sanity.py: -------------------------------------------------------------------------------- 1 | def test_sanity(): 2 | assert 1 + 1 == 2 3 | -------------------------------------------------------------------------------- /toolchest_client/tools/tests/test_star.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from ..star import STARInstance 4 | 5 | THIS_DIRECTORY = os.path.dirname(os.path.realpath(__file__)) 6 | 7 | 8 | def test_star_variable_arg_parsing_single(): 9 | star_instance = STARInstance( 10 | tool_args="--quantMode GeneCounts --scoreGap 1", 11 | input_prefix_mapping={ 12 | "r1_path": None, 13 | "r2_path": None, 14 | }, 15 | inputs=f"{THIS_DIRECTORY}/test_star.py", 16 | output_path="./", 17 | database_name="test", 18 | database_version="0.1.0", 19 | parallelize=False, 20 | ) 21 | star_instance._validate_args() 22 | 23 | assert star_instance.tool_args == "--quantMode GeneCounts --scoreGap 1" 24 | 25 | 26 | def test_star_variable_arg_parsing_multiple(): 27 | star_instance = STARInstance( 28 | tool_args="--quantMode TranscriptomeSAM GeneCounts --scoreGap 1", 29 | input_prefix_mapping={ 30 | "r1_path": None, 31 | "r2_path": None, 32 | }, 33 | inputs=f"{THIS_DIRECTORY}/test_star.py", 34 | output_path="./", 35 | database_name="test", 36 | database_version="0.1.0", 37 | parallelize=False, 38 | ) 39 | star_instance._validate_args() 40 | 41 | assert star_instance.tool_args == "--quantMode TranscriptomeSAM GeneCounts --scoreGap 1" 42 | -------------------------------------------------------------------------------- /toolchest_client/tools/transfer.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.transfer 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the arbitrary file transfer implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType, path_is_s3_uri 8 | 9 | from . import Tool 10 | 11 | 12 | class Transfer(Tool): 13 | """ 14 | The arbitrary file transfer implementation of the Tool class. 15 | """ 16 | def __init__(self, inputs, output_path, **kwargs): 17 | super().__init__( 18 | tool_name="transfer", 19 | tool_version="1.0.0", 20 | tool_args="", 21 | output_path=output_path, 22 | inputs=inputs, 23 | max_input_bytes_per_file=1024 * 1024 * 1024 * 1024, 24 | output_type=OutputType.S3 if path_is_s3_uri(output_path) else OutputType.GZ_TAR, 25 | expected_output_file_names=[], 26 | **kwargs, 27 | ) 28 | -------------------------------------------------------------------------------- /toolchest_client/tools/unicycler.py: -------------------------------------------------------------------------------- 1 | """ 2 | toolchest_client.tools.unicycler 3 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 4 | 5 | This is the Unicycler implementation of the Tool class. 6 | """ 7 | from toolchest_client.files import OutputType 8 | 9 | from . import Tool 10 | 11 | 12 | class Unicycler(Tool): 13 | """ 14 | The unicycler implementation of the Tool class. 
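    Example -- an illustrative sketch with placeholder paths. The
    input_prefix_mapping shape here mirrors the placeholder usage in
    tools/tests/test_star.py; the real mapping of reads to Unicycler's
    -1/-2/-l flags is built by the top-level wrapper in tools/api.py,
    and the run() entry point on the base Tool class is assumed:

        from toolchest_client.tools.unicycler import Unicycler

        unicycler = Unicycler(
            tool_args="",
            inputs=["./short_R1.fastq", "./short_R2.fastq"],
            input_prefix_mapping={
                "read1_path": None,   # assumed key names
                "read2_path": None,
            },
            output_path="./unicycler_output",
        )
        unicycler.run()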
15 | """ 16 | def __init__(self, tool_args, inputs, input_prefix_mapping, 17 | output_path, **kwargs): 18 | super().__init__( 19 | tool_name="unicycler", 20 | tool_version="0.4.9", # todo: allow unicycler version to be set by the user 21 | tool_args=tool_args, 22 | output_path=output_path, 23 | inputs=inputs, 24 | input_prefix_mapping=input_prefix_mapping, 25 | parallel_enabled=False, 26 | output_type=OutputType.GZ_TAR, 27 | expected_output_file_names=["assembly.fasta", "unicycler.log"], 28 | **kwargs, 29 | ) 30 | --------------------------------------------------------------------------------